diff --git a/build_docker.sh b/build_docker.sh index a5aea45e6ff5024b71818dea6f4e7cfb0d0ae6c0..50d857a6f9deadefb85cab7b12442920d1734290 100644 --- a/build_docker.sh +++ b/build_docker.sh @@ -1,3 +1,4 @@ docker build -t image-matching-webui:latest . --no-cache docker tag image-matching-webui:latest vincentqin/image-matching-webui:latest docker push vincentqin/image-matching-webui:latest + \ No newline at end of file diff --git a/hloc/matchers/roma.py b/hloc/matchers/roma.py index 1f9bcb0edff59453680f9309f9ead6b364d8c8ad..d91fbb8dcc35354c75ad30d2753c8ed85fb82da5 100644 --- a/hloc/matchers/roma.py +++ b/hloc/matchers/roma.py @@ -6,7 +6,7 @@ from PIL import Image from ..utils.base_model import BaseModel from .. import logger -roma_path = Path(__file__).parent / "../../third_party/Roma" +roma_path = Path(__file__).parent / "../../third_party/RoMa" sys.path.append(str(roma_path)) from roma.models.model_zoo.roma_models import roma_model @@ -63,6 +63,8 @@ class Roma(BaseModel): weights=weights, dinov2_weights=dinov2_weights, device=device, + #temp fix issue: https://github.com/Parskatt/RoMa/issues/26 + amp_dtype=torch.float32, ) logger.info(f"Load Roma model done.") diff --git a/third_party/Roma/.gitignore b/third_party/RoMa/.gitignore similarity index 100% rename from third_party/Roma/.gitignore rename to third_party/RoMa/.gitignore diff --git a/third_party/RoMa/LICENSE b/third_party/RoMa/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..ca95157052a76debc473afb395bffae0c1329e63 --- /dev/null +++ b/third_party/RoMa/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Johan Edstedt + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/third_party/RoMa/README.md b/third_party/RoMa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a3b6484b2a6c19af426b731396c5c91331f99ada --- /dev/null +++ b/third_party/RoMa/README.md @@ -0,0 +1,92 @@ +# +

+

RoMa đŸ›ïž:
Robust Dense Feature Matching
⭐CVPR 2024⭐

+

+ Johan Edstedt + · + Qiyu Sun + · + Georg Bökman + · + MÄrten WadenbÀck + · + Michael Felsberg +

+

+ Paper | + Project Page +

+
+

+
+

+ example +
+ RoMa is the robust dense feature matcher capable of estimating pixel-dense warps and reliable certainties for almost any image pair. +

+ +## Setup/Install +In your python environment (tested on Linux python 3.10), run: +```bash +pip install -e . +``` +## Demo / How to Use +We provide two demos in the [demos folder](demo). +Here's the gist of it: +```python +from roma import roma_outdoor +roma_model = roma_outdoor(device=device) +# Match +warp, certainty = roma_model.match(imA_path, imB_path, device=device) +# Sample matches for estimation +matches, certainty = roma_model.sample(warp, certainty) +# Convert to pixel coordinates (RoMa produces matches in [-1,1]x[-1,1]) +kptsA, kptsB = roma_model.to_pixel_coordinates(matches, H_A, W_A, H_B, W_B) +# Find a fundamental matrix (or anything else of interest) +F, mask = cv2.findFundamentalMat( + kptsA.cpu().numpy(), kptsB.cpu().numpy(), ransacReprojThreshold=0.2, method=cv2.USAC_MAGSAC, confidence=0.999999, maxIters=10000 +) +``` + +**New**: You can also match arbitrary keypoints with RoMa. A demo for this will be added soon. +## Settings + +### Resolution +By default RoMa uses an initial resolution of (560,560) which is then upsampled to (864,864). +You can change this at construction (see roma_outdoor kwargs). +You can also change this later, by changing the roma_model.w_resized, roma_model.h_resized, and roma_model.upsample_res. + +### Sampling +roma_model.sample_thresh controls the thresholding used when sampling matches for estimation. In certain cases a lower or higher threshold may improve results. + + +## Reproducing Results +The experiments in the paper are provided in the [experiments folder](experiments). + +### Training +1. First follow the instructions provided here: https://github.com/Parskatt/DKM for downloading and preprocessing datasets. +2. Run the relevant experiment, e.g., +```bash +torchrun --nproc_per_node=4 --nnodes=1 --rdzv_backend=c10d experiments/roma_outdoor.py +``` +### Testing +```bash +python experiments/roma_outdoor.py --only_test --benchmark mega-1500 +``` +## License +All our code except DINOv2 is MIT license. +DINOv2 has an Apache 2 license [DINOv2](https://github.com/facebookresearch/dinov2/blob/main/LICENSE). + +## Acknowledgement +Our codebase builds on the code in [DKM](https://github.com/Parskatt/DKM). + +## BibTeX +If you find our models useful, please consider citing our paper! +``` +@article{edstedt2024roma, +title={{RoMa: Robust Dense Feature Matching}}, +author={Edstedt, Johan and Sun, Qiyu and Bökman, Georg and WadenbĂ€ck, MĂ„rten and Felsberg, Michael}, +journal={IEEE Conference on Computer Vision and Pattern Recognition}, +year={2024} +} +``` diff --git a/third_party/Roma/assets/sacre_coeur_A.jpg b/third_party/RoMa/assets/sacre_coeur_A.jpg similarity index 100% rename from third_party/Roma/assets/sacre_coeur_A.jpg rename to third_party/RoMa/assets/sacre_coeur_A.jpg diff --git a/third_party/Roma/assets/sacre_coeur_B.jpg b/third_party/RoMa/assets/sacre_coeur_B.jpg similarity index 100% rename from third_party/Roma/assets/sacre_coeur_B.jpg rename to third_party/RoMa/assets/sacre_coeur_B.jpg diff --git a/third_party/RoMa/assets/toronto_A.jpg b/third_party/RoMa/assets/toronto_A.jpg new file mode 100644 index 0000000000000000000000000000000000000000..450622c06c06b5bdcb4b20150ec4b5e8e34f9787 --- /dev/null +++ b/third_party/RoMa/assets/toronto_A.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40270c227df93f0f31b55e0f2ff38eb24f47940c4800c83758a74a5dfd7346ec +size 525339 diff --git a/third_party/RoMa/assets/toronto_B.jpg b/third_party/RoMa/assets/toronto_B.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6a8c7907bfc9bcd88f9d9deaa6e148e18a764d12 --- /dev/null +++ b/third_party/RoMa/assets/toronto_B.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c07550ed87e40fca8c38076eb3a81395d760a88bf0b8615167704107deff2f +size 286466 diff --git a/third_party/Roma/data/.gitignore b/third_party/RoMa/data/.gitignore similarity index 100% rename from third_party/Roma/data/.gitignore rename to third_party/RoMa/data/.gitignore diff --git a/third_party/RoMa/demo/demo_3D_effect.py b/third_party/RoMa/demo/demo_3D_effect.py new file mode 100644 index 0000000000000000000000000000000000000000..5afd6e5ce0fdd32788160e8c24df0b26a27f34dd --- /dev/null +++ b/third_party/RoMa/demo/demo_3D_effect.py @@ -0,0 +1,46 @@ +from PIL import Image +import torch +import torch.nn.functional as F +import numpy as np +from roma.utils.utils import tensor_to_pil + +from roma import roma_outdoor + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +if __name__ == "__main__": + from argparse import ArgumentParser + parser = ArgumentParser() + parser.add_argument("--im_A_path", default="assets/toronto_A.jpg", type=str) + parser.add_argument("--im_B_path", default="assets/toronto_B.jpg", type=str) + parser.add_argument("--save_path", default="demo/gif/roma_warp_toronto", type=str) + + args, _ = parser.parse_known_args() + im1_path = args.im_A_path + im2_path = args.im_B_path + save_path = args.save_path + + # Create model + roma_model = roma_outdoor(device=device, coarse_res=560, upsample_res=(864, 1152)) + roma_model.symmetric = False + + H, W = roma_model.get_output_resolution() + + im1 = Image.open(im1_path).resize((W, H)) + im2 = Image.open(im2_path).resize((W, H)) + + # Match + warp, certainty = roma_model.match(im1_path, im2_path, device=device) + # Sampling not needed, but can be done with model.sample(warp, certainty) + x1 = (torch.tensor(np.array(im1)) / 255).to(device).permute(2, 0, 1) + x2 = (torch.tensor(np.array(im2)) / 255).to(device).permute(2, 0, 1) + + coords_A, coords_B = warp[...,:2], warp[...,2:] + for i, x in enumerate(np.linspace(0,2*np.pi,200)): + t = (1 + np.cos(x))/2 + interp_warp = (1-t)*coords_A + t*coords_B + im2_transfer_rgb = F.grid_sample( + x2[None], interp_warp[None], mode="bilinear", align_corners=False + )[0] + tensor_to_pil(im2_transfer_rgb, unnormalize=False).save(f"{save_path}_{i:03d}.jpg") \ No newline at end of file diff --git a/third_party/Roma/demo/demo_fundamental.py b/third_party/RoMa/demo/demo_fundamental.py similarity index 76% rename from third_party/Roma/demo/demo_fundamental.py rename to third_party/RoMa/demo/demo_fundamental.py index a71fd5532412fb4c65eb109e8e9f83813c11fd85..31618d4b06cd56fdd4be9065fb00b826a19e10f9 100644 --- a/third_party/Roma/demo/demo_fundamental.py +++ b/third_party/RoMa/demo/demo_fundamental.py @@ -3,12 +3,11 @@ import torch import cv2 from roma import roma_outdoor -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if __name__ == "__main__": from argparse import ArgumentParser - parser = ArgumentParser() parser.add_argument("--im_A_path", default="assets/sacre_coeur_A.jpg", type=str) parser.add_argument("--im_B_path", default="assets/sacre_coeur_B.jpg", type=str) @@ -20,6 +19,7 @@ if __name__ == "__main__": # Create model roma_model = roma_outdoor(device=device) + W_A, H_A = Image.open(im1_path).size W_B, H_B = Image.open(im2_path).size @@ -27,12 +27,7 @@ if __name__ == "__main__": warp, certainty = roma_model.match(im1_path, im2_path, device=device) # Sample matches for estimation matches, certainty = roma_model.sample(warp, certainty) - kpts1, kpts2 = roma_model.to_pixel_coordinates(matches, H_A, W_A, H_B, W_B) + kpts1, kpts2 = roma_model.to_pixel_coordinates(matches, H_A, W_A, H_B, W_B) F, mask = cv2.findFundamentalMat( - kpts1.cpu().numpy(), - kpts2.cpu().numpy(), - ransacReprojThreshold=0.2, - method=cv2.USAC_MAGSAC, - confidence=0.999999, - maxIters=10000, - ) + kpts1.cpu().numpy(), kpts2.cpu().numpy(), ransacReprojThreshold=0.2, method=cv2.USAC_MAGSAC, confidence=0.999999, maxIters=10000 + ) \ No newline at end of file diff --git a/third_party/Roma/demo/demo_match.py b/third_party/RoMa/demo/demo_match.py similarity index 56% rename from third_party/Roma/demo/demo_match.py rename to third_party/RoMa/demo/demo_match.py index 69eb07ffb0b480db99252bbb03a9858964e8d5f0..80dfcd252e6665246a1b21cca7c8c64a183fa0e2 100644 --- a/third_party/Roma/demo/demo_match.py +++ b/third_party/RoMa/demo/demo_match.py @@ -4,20 +4,17 @@ import torch.nn.functional as F import numpy as np from roma.utils.utils import tensor_to_pil -from roma import roma_indoor +from roma import roma_outdoor -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if __name__ == "__main__": from argparse import ArgumentParser - parser = ArgumentParser() - parser.add_argument("--im_A_path", default="assets/sacre_coeur_A.jpg", type=str) - parser.add_argument("--im_B_path", default="assets/sacre_coeur_B.jpg", type=str) - parser.add_argument( - "--save_path", default="demo/dkmv3_warp_sacre_coeur.jpg", type=str - ) + parser.add_argument("--im_A_path", default="assets/toronto_A.jpg", type=str) + parser.add_argument("--im_B_path", default="assets/toronto_B.jpg", type=str) + parser.add_argument("--save_path", default="demo/roma_warp_toronto.jpg", type=str) args, _ = parser.parse_known_args() im1_path = args.im_A_path @@ -25,7 +22,7 @@ if __name__ == "__main__": save_path = args.save_path # Create model - roma_model = roma_indoor(device=device) + roma_model = roma_outdoor(device=device, coarse_res=560, upsample_res=(864, 1152)) H, W = roma_model.get_output_resolution() @@ -39,12 +36,12 @@ if __name__ == "__main__": x2 = (torch.tensor(np.array(im2)) / 255).to(device).permute(2, 0, 1) im2_transfer_rgb = F.grid_sample( - x2[None], warp[:, :W, 2:][None], mode="bilinear", align_corners=False + x2[None], warp[:,:W, 2:][None], mode="bilinear", align_corners=False )[0] im1_transfer_rgb = F.grid_sample( - x1[None], warp[:, W:, :2][None], mode="bilinear", align_corners=False + x1[None], warp[:, W:, :2][None], mode="bilinear", align_corners=False )[0] - warp_im = torch.cat((im2_transfer_rgb, im1_transfer_rgb), dim=2) - white_im = torch.ones((H, 2 * W), device=device) + warp_im = torch.cat((im2_transfer_rgb,im1_transfer_rgb),dim=2) + white_im = torch.ones((H,2*W),device=device) vis_im = certainty * warp_im + (1 - certainty) * white_im - tensor_to_pil(vis_im, unnormalize=False).save(save_path) + tensor_to_pil(vis_im, unnormalize=False).save(save_path) \ No newline at end of file diff --git a/third_party/RoMa/demo/demo_match_opencv_sift.py b/third_party/RoMa/demo/demo_match_opencv_sift.py new file mode 100644 index 0000000000000000000000000000000000000000..3196fcfaab248f6c4c6247a0afb4db745206aee8 --- /dev/null +++ b/third_party/RoMa/demo/demo_match_opencv_sift.py @@ -0,0 +1,43 @@ +from PIL import Image +import numpy as np + +import numpy as np +import cv2 as cv +import matplotlib.pyplot as plt + + + +if __name__ == "__main__": + from argparse import ArgumentParser + parser = ArgumentParser() + parser.add_argument("--im_A_path", default="assets/toronto_A.jpg", type=str) + parser.add_argument("--im_B_path", default="assets/toronto_B.jpg", type=str) + parser.add_argument("--save_path", default="demo/roma_warp_toronto.jpg", type=str) + + args, _ = parser.parse_known_args() + im1_path = args.im_A_path + im2_path = args.im_B_path + save_path = args.save_path + + img1 = cv.imread(im1_path,cv.IMREAD_GRAYSCALE) # queryImage + img2 = cv.imread(im2_path,cv.IMREAD_GRAYSCALE) # trainImage + # Initiate SIFT detector + sift = cv.SIFT_create() + # find the keypoints and descriptors with SIFT + kp1, des1 = sift.detectAndCompute(img1,None) + kp2, des2 = sift.detectAndCompute(img2,None) + # BFMatcher with default params + bf = cv.BFMatcher() + matches = bf.knnMatch(des1,des2,k=2) + # Apply ratio test + good = [] + for m,n in matches: + if m.distance < 0.75*n.distance: + good.append([m]) + # cv.drawMatchesKnn expects list of lists as matches. + draw_params = dict(matchColor = (255,0,0), # draw matches in red color + singlePointColor = None, + flags = 2) + + img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,good,None,**draw_params) + Image.fromarray(img3).save("demo/sift_matches.png") diff --git a/third_party/RoMa/demo/gif/.gitignore b/third_party/RoMa/demo/gif/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c96a04f008ee21e260b28f7701595ed59e2839e3 --- /dev/null +++ b/third_party/RoMa/demo/gif/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file diff --git a/third_party/Roma/requirements.txt b/third_party/RoMa/requirements.txt similarity index 65% rename from third_party/Roma/requirements.txt rename to third_party/RoMa/requirements.txt index 12addf0d0eb74e6cac0da6bca704eac0b28990d7..f0dbab3d4cb35a5f00e3dbc8e3f8b00a3e578428 100644 --- a/third_party/Roma/requirements.txt +++ b/third_party/RoMa/requirements.txt @@ -10,4 +10,4 @@ matplotlib h5py wandb timm -xformers # Optional, used for memefficient attention \ No newline at end of file +#xformers # Optional, used for memefficient attention \ No newline at end of file diff --git a/third_party/Roma/roma/__init__.py b/third_party/RoMa/roma/__init__.py similarity index 62% rename from third_party/Roma/roma/__init__.py rename to third_party/RoMa/roma/__init__.py index a3c12d5247b93a83882edfb45bd127db794e791f..a7c96481e0a808b68c7b3054a3e34fa0b5c45ab9 100644 --- a/third_party/Roma/roma/__init__.py +++ b/third_party/RoMa/roma/__init__.py @@ -2,7 +2,7 @@ import os from .models import roma_outdoor, roma_indoor DEBUG_MODE = False -RANK = int(os.environ.get("RANK", default=0)) +RANK = int(os.environ.get('RANK', default = 0)) GLOBAL_STEP = 0 STEP_SIZE = 1 -LOCAL_RANK = -1 +LOCAL_RANK = -1 \ No newline at end of file diff --git a/third_party/Roma/roma/benchmarks/__init__.py b/third_party/RoMa/roma/benchmarks/__init__.py similarity index 100% rename from third_party/Roma/roma/benchmarks/__init__.py rename to third_party/RoMa/roma/benchmarks/__init__.py diff --git a/third_party/Roma/roma/benchmarks/hpatches_sequences_homog_benchmark.py b/third_party/RoMa/roma/benchmarks/hpatches_sequences_homog_benchmark.py similarity index 91% rename from third_party/Roma/roma/benchmarks/hpatches_sequences_homog_benchmark.py rename to third_party/RoMa/roma/benchmarks/hpatches_sequences_homog_benchmark.py index 6417d4d54798360a027a0d11d50fc65cdfae015a..2154a471c73d9e883c3ba8ed1b90d708f4950a63 100644 --- a/third_party/Roma/roma/benchmarks/hpatches_sequences_homog_benchmark.py +++ b/third_party/RoMa/roma/benchmarks/hpatches_sequences_homog_benchmark.py @@ -53,7 +53,7 @@ class HpatchesHomogBenchmark: ) return im_A_coords, im_A_to_im_B - def benchmark(self, model, model_name=None): + def benchmark(self, model, model_name = None): n_matches = [] homog_dists = [] for seq_idx, seq_name in tqdm( @@ -69,7 +69,9 @@ class HpatchesHomogBenchmark: H = np.loadtxt( os.path.join(self.seqs_path, seq_name, "H_1_" + str(im_idx)) ) - dense_matches, dense_certainty = model.match(im_A_path, im_B_path) + dense_matches, dense_certainty = model.match( + im_A_path, im_B_path + ) good_matches, _ = model.sample(dense_matches, dense_certainty, 5000) pos_a, pos_b = self.convert_coordinates( good_matches[:, :2], good_matches[:, 2:], w1, h1, w2, h2 @@ -78,9 +80,9 @@ class HpatchesHomogBenchmark: H_pred, inliers = cv2.findHomography( pos_a, pos_b, - method=cv2.RANSAC, - confidence=0.99999, - ransacReprojThreshold=3 * min(w2, h2) / 480, + method = cv2.RANSAC, + confidence = 0.99999, + ransacReprojThreshold = 3 * min(w2, h2) / 480, ) except: H_pred = None diff --git a/third_party/Roma/roma/benchmarks/megadepth_dense_benchmark.py b/third_party/RoMa/roma/benchmarks/megadepth_dense_benchmark.py similarity index 81% rename from third_party/Roma/roma/benchmarks/megadepth_dense_benchmark.py rename to third_party/RoMa/roma/benchmarks/megadepth_dense_benchmark.py index f51a77e15510572b8f594dbc7713a0f348a33fd8..0600d354b1d0dfa7f8e2b0f8882a4cc08fafeed9 100644 --- a/third_party/Roma/roma/benchmarks/megadepth_dense_benchmark.py +++ b/third_party/RoMa/roma/benchmarks/megadepth_dense_benchmark.py @@ -6,11 +6,8 @@ from roma.utils import warp_kpts from torch.utils.data import ConcatDataset import roma - class MegadepthDenseBenchmark: - def __init__( - self, data_root="data/megadepth", h=384, w=512, num_samples=2000 - ) -> None: + def __init__(self, data_root="data/megadepth", h = 384, w = 512, num_samples = 2000) -> None: mega = MegadepthBuilder(data_root=data_root) self.dataset = ConcatDataset( mega.build_scenes(split="test_loftr", ht=h, wt=w) @@ -52,15 +49,13 @@ class MegadepthDenseBenchmark: pck_3_tot = 0.0 pck_5_tot = 0.0 sampler = torch.utils.data.WeightedRandomSampler( - torch.ones(len(self.dataset)), - replacement=False, - num_samples=self.num_samples, + torch.ones(len(self.dataset)), replacement=False, num_samples=self.num_samples ) B = batch_size dataloader = torch.utils.data.DataLoader( self.dataset, batch_size=B, num_workers=batch_size, sampler=sampler ) - for idx, data in tqdm.tqdm(enumerate(dataloader), disable=roma.RANK > 0): + for idx, data in tqdm.tqdm(enumerate(dataloader), disable = roma.RANK > 0): im_A, im_B, depth1, depth2, T_1to2, K1, K2 = ( data["im_A"], data["im_B"], @@ -77,36 +72,25 @@ class MegadepthDenseBenchmark: if roma.DEBUG_MODE: from roma.utils.utils import tensor_to_pil import torch.nn.functional as F - path = "vis" H, W = model.get_output_resolution() - white_im = torch.ones((B, 1, H, W), device="cuda") + white_im = torch.ones((B,1,H,W),device="cuda") im_B_transfer_rgb = F.grid_sample( - im_B.cuda(), - matches[:, :, :W, 2:], - mode="bilinear", - align_corners=False, + im_B.cuda(), matches[:,:,:W, 2:], mode="bilinear", align_corners=False ) warp_im = im_B_transfer_rgb - c_b = certainty[ - :, None - ] # (certainty*0.9 + 0.1*torch.ones_like(certainty))[:,None] + c_b = certainty[:,None]#(certainty*0.9 + 0.1*torch.ones_like(certainty))[:,None] vis_im = c_b * warp_im + (1 - c_b) * white_im for b in range(B): import os - - os.makedirs( - f"{path}/{model.name}/{idx}_{b}_{H}_{W}", exist_ok=True - ) + os.makedirs(f"{path}/{model.name}/{idx}_{b}_{H}_{W}",exist_ok=True) tensor_to_pil(vis_im[b], unnormalize=True).save( - f"{path}/{model.name}/{idx}_{b}_{H}_{W}/warp.jpg" - ) + f"{path}/{model.name}/{idx}_{b}_{H}_{W}/warp.jpg") tensor_to_pil(im_A[b].cuda(), unnormalize=True).save( - f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_A.jpg" - ) + f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_A.jpg") tensor_to_pil(im_B[b].cuda(), unnormalize=True).save( - f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_B.jpg" - ) + f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_B.jpg") + gd_tot, pck_1_tot, pck_3_tot, pck_5_tot = ( gd_tot + gd.mean(), diff --git a/third_party/Roma/roma/benchmarks/megadepth_pose_estimation_benchmark.py b/third_party/RoMa/roma/benchmarks/megadepth_pose_estimation_benchmark.py similarity index 69% rename from third_party/Roma/roma/benchmarks/megadepth_pose_estimation_benchmark.py rename to third_party/RoMa/roma/benchmarks/megadepth_pose_estimation_benchmark.py index 5d936a07d550763d0378a23ea83c79cec5d373fe..217aebab4cb73471cc156de9e8d3d882a1b2af95 100644 --- a/third_party/Roma/roma/benchmarks/megadepth_pose_estimation_benchmark.py +++ b/third_party/RoMa/roma/benchmarks/megadepth_pose_estimation_benchmark.py @@ -7,9 +7,8 @@ import torch.nn.functional as F import roma import kornia.geometry.epipolar as kepi - class MegaDepthPoseEstimationBenchmark: - def __init__(self, data_root="data/megadepth", scene_names=None) -> None: + def __init__(self, data_root="data/megadepth", scene_names = None) -> None: if scene_names is None: self.scene_names = [ "0015_0.1_0.3.npz", @@ -26,22 +25,13 @@ class MegaDepthPoseEstimationBenchmark: ] self.data_root = data_root - def benchmark( - self, - model, - model_name=None, - resolution=None, - scale_intrinsics=True, - calibrated=True, - ): - H, W = model.get_output_resolution() + def benchmark(self, model, model_name = None): with torch.no_grad(): data_root = self.data_root tot_e_t, tot_e_R, tot_e_pose = [], [], [] thresholds = [5, 10, 20] for scene_ind in range(len(self.scenes)): import os - scene_name = os.path.splitext(self.scene_names[scene_ind])[0] scene = self.scenes[scene_ind] pairs = scene["pair_infos"] @@ -58,22 +48,21 @@ class MegaDepthPoseEstimationBenchmark: T2 = poses[idx2].copy() R2, t2 = T2[:3, :3], T2[:3, 3] R, t = compute_relative_pose(R1, t1, R2, t2) - T1_to_2 = np.concatenate((R, t[:, None]), axis=-1) + T1_to_2 = np.concatenate((R,t[:,None]), axis=-1) im_A_path = f"{data_root}/{im_paths[idx1]}" im_B_path = f"{data_root}/{im_paths[idx2]}" dense_matches, dense_certainty = model.match( im_A_path, im_B_path, K1.copy(), K2.copy(), T1_to_2.copy() ) - sparse_matches, _ = model.sample( - dense_matches, dense_certainty, 5000 + sparse_matches,_ = model.sample( + dense_matches, dense_certainty, 5_000 ) - + im_A = Image.open(im_A_path) w1, h1 = im_A.size im_B = Image.open(im_B_path) w2, h2 = im_B.size - - if scale_intrinsics: + if True: # Note: we keep this true as it was used in DKM/RoMa papers. There is very little difference compared to setting to False. scale1 = 1200 / max(w1, h1) scale2 = 1200 / max(w2, h2) w1, h1 = scale1 * w1, scale1 * h1 @@ -82,42 +71,23 @@ class MegaDepthPoseEstimationBenchmark: K1[:2] = K1[:2] * scale1 K2[:2] = K2[:2] * scale2 - kpts1 = sparse_matches[:, :2] - kpts1 = np.stack( - ( - w1 * (kpts1[:, 0] + 1) / 2, - h1 * (kpts1[:, 1] + 1) / 2, - ), - axis=-1, - ) - kpts2 = sparse_matches[:, 2:] - kpts2 = np.stack( - ( - w2 * (kpts2[:, 0] + 1) / 2, - h2 * (kpts2[:, 1] + 1) / 2, - ), - axis=-1, - ) - + kpts1, kpts2 = model.to_pixel_coordinates(sparse_matches, h1, w1, h2, w2) + kpts1, kpts2 = kpts1.cpu().numpy(), kpts2.cpu().numpy() for _ in range(5): shuffling = np.random.permutation(np.arange(len(kpts1))) kpts1 = kpts1[shuffling] kpts2 = kpts2[shuffling] try: - threshold = 0.5 - if calibrated: - norm_threshold = threshold / ( - np.mean(np.abs(K1[:2, :2])) - + np.mean(np.abs(K2[:2, :2])) - ) - R_est, t_est, mask = estimate_pose( - kpts1, - kpts2, - K1, - K2, - norm_threshold, - conf=0.99999, - ) + threshold = 0.5 + norm_threshold = threshold / (np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2]))) + R_est, t_est, mask = estimate_pose( + kpts1, + kpts2, + K1, + K2, + norm_threshold, + conf=0.99999, + ) T1_to_2_est = np.concatenate((R_est, t_est), axis=-1) # e_t, e_R = compute_pose_error(T1_to_2_est, R, t) e_pose = max(e_t, e_R) diff --git a/third_party/Roma/roma/benchmarks/scannet_benchmark.py b/third_party/RoMa/roma/benchmarks/scannet_benchmark.py similarity index 79% rename from third_party/Roma/roma/benchmarks/scannet_benchmark.py rename to third_party/RoMa/roma/benchmarks/scannet_benchmark.py index 3187c2acf79f5af8f64397f55f6df40af327945b..853af0d0ebef4dfefe2632eb49e4156ea791ee76 100644 --- a/third_party/Roma/roma/benchmarks/scannet_benchmark.py +++ b/third_party/RoMa/roma/benchmarks/scannet_benchmark.py @@ -10,7 +10,7 @@ class ScanNetBenchmark: def __init__(self, data_root="data/scannet") -> None: self.data_root = data_root - def benchmark(self, model, model_name=None): + def benchmark(self, model, model_name = None): model.train(False) with torch.no_grad(): data_root = self.data_root @@ -24,20 +24,20 @@ class ScanNetBenchmark: scene = pairs[pairind] scene_name = f"scene0{scene[0]}_00" im_A_path = osp.join( - self.data_root, - "scans_test", - scene_name, - "color", - f"{scene[2]}.jpg", - ) + self.data_root, + "scans_test", + scene_name, + "color", + f"{scene[2]}.jpg", + ) im_A = Image.open(im_A_path) im_B_path = osp.join( - self.data_root, - "scans_test", - scene_name, - "color", - f"{scene[3]}.jpg", - ) + self.data_root, + "scans_test", + scene_name, + "color", + f"{scene[3]}.jpg", + ) im_B = Image.open(im_B_path) T_gt = rel_pose[pairind].reshape(3, 4) R, t = T_gt[:3, :3], T_gt[:3, 3] @@ -76,20 +76,24 @@ class ScanNetBenchmark: offset = 0.5 kpts1 = sparse_matches[:, :2] - kpts1 = np.stack( - ( - w1 * (kpts1[:, 0] + 1) / 2 - offset, - h1 * (kpts1[:, 1] + 1) / 2 - offset, - ), - axis=-1, + kpts1 = ( + np.stack( + ( + w1 * (kpts1[:, 0] + 1) / 2 - offset, + h1 * (kpts1[:, 1] + 1) / 2 - offset, + ), + axis=-1, + ) ) kpts2 = sparse_matches[:, 2:] - kpts2 = np.stack( - ( - w2 * (kpts2[:, 0] + 1) / 2 - offset, - h2 * (kpts2[:, 1] + 1) / 2 - offset, - ), - axis=-1, + kpts2 = ( + np.stack( + ( + w2 * (kpts2[:, 0] + 1) / 2 - offset, + h2 * (kpts2[:, 1] + 1) / 2 - offset, + ), + axis=-1, + ) ) for _ in range(5): shuffling = np.random.permutation(np.arange(len(kpts1))) @@ -97,8 +101,7 @@ class ScanNetBenchmark: kpts2 = kpts2[shuffling] try: norm_threshold = 0.5 / ( - np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2])) - ) + np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2]))) R_est, t_est, mask = estimate_pose( kpts1, kpts2, diff --git a/third_party/Roma/roma/checkpointing/__init__.py b/third_party/RoMa/roma/checkpointing/__init__.py similarity index 100% rename from third_party/Roma/roma/checkpointing/__init__.py rename to third_party/RoMa/roma/checkpointing/__init__.py diff --git a/third_party/Roma/roma/checkpointing/checkpoint.py b/third_party/RoMa/roma/checkpointing/checkpoint.py similarity index 96% rename from third_party/Roma/roma/checkpointing/checkpoint.py rename to third_party/RoMa/roma/checkpointing/checkpoint.py index 6372d89fe86c00c7acedf015886717bfeca7bb1f..8995efeb54f4d558127ea63423fa958c64e9088f 100644 --- a/third_party/Roma/roma/checkpointing/checkpoint.py +++ b/third_party/RoMa/roma/checkpointing/checkpoint.py @@ -7,7 +7,6 @@ import gc import roma - class CheckPoint: def __init__(self, dir=None, name="tmp"): self.name = name @@ -20,7 +19,7 @@ class CheckPoint: optimizer, lr_scheduler, n, - ): + ): if roma.RANK == 0: assert model is not None if isinstance(model, (DataParallel, DistributedDataParallel)): @@ -33,14 +32,14 @@ class CheckPoint: } torch.save(states, self.dir + self.name + f"_latest.pth") logger.info(f"Saved states {list(states.keys())}, at step {n}") - + def load( self, model, optimizer, lr_scheduler, n, - ): + ): if os.path.exists(self.dir + self.name + f"_latest.pth") and roma.RANK == 0: states = torch.load(self.dir + self.name + f"_latest.pth") if "model" in states: @@ -58,4 +57,4 @@ class CheckPoint: del states gc.collect() torch.cuda.empty_cache() - return model, optimizer, lr_scheduler, n + return model, optimizer, lr_scheduler, n \ No newline at end of file diff --git a/third_party/Roma/roma/datasets/__init__.py b/third_party/RoMa/roma/datasets/__init__.py similarity index 52% rename from third_party/Roma/roma/datasets/__init__.py rename to third_party/RoMa/roma/datasets/__init__.py index 6a11f122e222f0a9eded4afd3dd0b900826063e8..b60c709926a4a7bd019b73eac10879063a996c90 100644 --- a/third_party/Roma/roma/datasets/__init__.py +++ b/third_party/RoMa/roma/datasets/__init__.py @@ -1,2 +1,2 @@ from .megadepth import MegadepthBuilder -from .scannet import ScanNetBuilder +from .scannet import ScanNetBuilder \ No newline at end of file diff --git a/third_party/Roma/roma/datasets/megadepth.py b/third_party/RoMa/roma/datasets/megadepth.py similarity index 75% rename from third_party/Roma/roma/datasets/megadepth.py rename to third_party/RoMa/roma/datasets/megadepth.py index 75cb72ded02c80d1ad6bce0d0269626ee49a9275..5deee5ac30c439a9f300c0ad2271f141931020c0 100644 --- a/third_party/Roma/roma/datasets/megadepth.py +++ b/third_party/RoMa/roma/datasets/megadepth.py @@ -10,7 +10,6 @@ import roma from roma.utils import * import math - class MegadepthScene: def __init__( self, @@ -23,20 +22,18 @@ class MegadepthScene: shake_t=0, rot_prob=0.0, normalize=True, - max_num_pairs=100_000, - scene_name=None, - use_horizontal_flip_aug=False, - use_single_horizontal_flip_aug=False, - colorjiggle_params=None, - random_eraser=None, - use_randaug=False, - randaug_params=None, - randomize_size=False, + max_num_pairs = 100_000, + scene_name = None, + use_horizontal_flip_aug = False, + use_single_horizontal_flip_aug = False, + colorjiggle_params = None, + random_eraser = None, + use_randaug = False, + randaug_params = None, + randomize_size = False, ) -> None: self.data_root = data_root - self.scene_name = ( - os.path.splitext(scene_name)[0] + f"_{min_overlap}_{max_overlap}" - ) + self.scene_name = os.path.splitext(scene_name)[0]+f"_{min_overlap}_{max_overlap}" self.image_paths = scene_info["image_paths"] self.depth_paths = scene_info["depth_paths"] self.intrinsics = scene_info["intrinsics"] @@ -54,18 +51,18 @@ class MegadepthScene: self.overlaps = self.overlaps[pairinds] if randomize_size: area = ht * wt - s = int(16 * (math.sqrt(area) // 16)) - sizes = ((ht, wt), (s, s), (wt, ht)) + s = int(16 * (math.sqrt(area)//16)) + sizes = ((ht,wt), (s,s), (wt,ht)) choice = roma.RANK % 3 - ht, wt = sizes[choice] + ht, wt = sizes[choice] # counts, bins = np.histogram(self.overlaps,20) # print(counts) self.im_transform_ops = get_tuple_transform_ops( - resize=(ht, wt), - normalize=normalize, - colorjiggle_params=colorjiggle_params, + resize=(ht, wt), normalize=normalize, colorjiggle_params = colorjiggle_params, ) - self.depth_transform_ops = get_depth_tuple_transform_ops(resize=(ht, wt)) + self.depth_transform_ops = get_depth_tuple_transform_ops( + resize=(ht, wt) + ) self.wt, self.ht = wt, ht self.shake_t = shake_t self.random_eraser = random_eraser @@ -78,19 +75,17 @@ class MegadepthScene: def load_im(self, im_path): im = Image.open(im_path) return im - - def horizontal_flip(self, im_A, im_B, depth_A, depth_B, K_A, K_B): + + def horizontal_flip(self, im_A, im_B, depth_A, depth_B, K_A, K_B): im_A = im_A.flip(-1) im_B = im_B.flip(-1) - depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1) - flip_mat = torch.tensor([[-1, 0, self.wt], [0, 1, 0], [0, 0, 1.0]]).to( - K_A.device - ) - K_A = flip_mat @ K_A - K_B = flip_mat @ K_B - + depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1) + flip_mat = torch.tensor([[-1, 0, self.wt],[0,1,0],[0,0,1.]]).to(K_A.device) + K_A = flip_mat@K_A + K_B = flip_mat@K_B + return im_A, im_B, depth_A, depth_B, K_A, K_B - + def load_depth(self, depth_ref, crop=None): depth = np.array(h5py.File(depth_ref, "r")["depth"]) return torch.from_numpy(depth) @@ -145,31 +140,29 @@ class MegadepthScene: depth_A, depth_B = self.depth_transform_ops( (depth_A[None, None], depth_B[None, None]) ) - - [im_A, im_B, depth_A, depth_B], t = self.rand_shake( - im_A, im_B, depth_A, depth_B - ) + + [im_A, im_B, depth_A, depth_B], t = self.rand_shake(im_A, im_B, depth_A, depth_B) K1[:2, 2] += t K2[:2, 2] += t - + im_A, im_B = im_A[None], im_B[None] if self.random_eraser is not None: im_A, depth_A = self.random_eraser(im_A, depth_A) im_B, depth_B = self.random_eraser(im_B, depth_B) - + if self.use_horizontal_flip_aug: if np.random.rand() > 0.5: - im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip( - im_A, im_B, depth_A, depth_B, K1, K2 - ) + im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(im_A, im_B, depth_A, depth_B, K1, K2) if self.use_single_horizontal_flip_aug: if np.random.rand() > 0.5: im_B, depth_B, K2 = self.single_horizontal_flip(im_B, depth_B, K2) - + if roma.DEBUG_MODE: - tensor_to_pil(im_A[0], unnormalize=True).save(f"vis/im_A.jpg") - tensor_to_pil(im_B[0], unnormalize=True).save(f"vis/im_B.jpg") - + tensor_to_pil(im_A[0], unnormalize=True).save( + f"vis/im_A.jpg") + tensor_to_pil(im_B[0], unnormalize=True).save( + f"vis/im_B.jpg") + data_dict = { "im_A": im_A[0], "im_A_identifier": self.image_paths[idx1].split("/")[-1].split(".jpg")[0], @@ -182,53 +175,25 @@ class MegadepthScene: "T_1to2": T_1to2, "im_A_path": im_A_ref, "im_B_path": im_B_ref, + } return data_dict class MegadepthBuilder: - def __init__( - self, data_root="data/megadepth", loftr_ignore=True, imc21_ignore=True - ) -> None: + def __init__(self, data_root="data/megadepth", loftr_ignore=True, imc21_ignore = True) -> None: self.data_root = data_root self.scene_info_root = os.path.join(data_root, "prep_scene_info") self.all_scenes = os.listdir(self.scene_info_root) self.test_scenes = ["0017.npy", "0004.npy", "0048.npy", "0013.npy"] # LoFTR did the D2-net preprocessing differently than we did and got more ignore scenes, can optionially ignore those - self.loftr_ignore_scenes = set( - [ - "0121.npy", - "0133.npy", - "0168.npy", - "0178.npy", - "0229.npy", - "0349.npy", - "0412.npy", - "0430.npy", - "0443.npy", - "1001.npy", - "5014.npy", - "5015.npy", - "5016.npy", - ] - ) - self.imc21_scenes = set( - [ - "0008.npy", - "0019.npy", - "0021.npy", - "0024.npy", - "0025.npy", - "0032.npy", - "0063.npy", - "1589.npy", - ] - ) + self.loftr_ignore_scenes = set(['0121.npy', '0133.npy', '0168.npy', '0178.npy', '0229.npy', '0349.npy', '0412.npy', '0430.npy', '0443.npy', '1001.npy', '5014.npy', '5015.npy', '5016.npy']) + self.imc21_scenes = set(['0008.npy', '0019.npy', '0021.npy', '0024.npy', '0025.npy', '0032.npy', '0063.npy', '1589.npy']) self.test_scenes_loftr = ["0015.npy", "0022.npy"] self.loftr_ignore = loftr_ignore self.imc21_ignore = imc21_ignore - def build_scenes(self, split="train", min_overlap=0.0, scene_names=None, **kwargs): + def build_scenes(self, split="train", min_overlap=0.0, scene_names = None, **kwargs): if split == "train": scene_names = set(self.all_scenes) - set(self.test_scenes) elif split == "train_loftr": @@ -252,11 +217,7 @@ class MegadepthBuilder: ).item() scenes.append( MegadepthScene( - self.data_root, - scene_info, - min_overlap=min_overlap, - scene_name=scene_name, - **kwargs, + self.data_root, scene_info, min_overlap=min_overlap,scene_name = scene_name, **kwargs ) ) return scenes diff --git a/third_party/RoMa/roma/datasets/scannet.py b/third_party/RoMa/roma/datasets/scannet.py new file mode 100644 index 0000000000000000000000000000000000000000..704ea57259afdfbbca627ad143bee97a0a79d41c --- /dev/null +++ b/third_party/RoMa/roma/datasets/scannet.py @@ -0,0 +1,160 @@ +import os +import random +from PIL import Image +import cv2 +import h5py +import numpy as np +import torch +from torch.utils.data import ( + Dataset, + DataLoader, + ConcatDataset) + +import torchvision.transforms.functional as tvf +import kornia.augmentation as K +import os.path as osp +import matplotlib.pyplot as plt +import roma +from roma.utils import get_depth_tuple_transform_ops, get_tuple_transform_ops +from roma.utils.transforms import GeometricSequential +from tqdm import tqdm + +class ScanNetScene: + def __init__(self, data_root, scene_info, ht = 384, wt = 512, min_overlap=0., shake_t = 0, rot_prob=0.,use_horizontal_flip_aug = False, +) -> None: + self.scene_root = osp.join(data_root,"scans","scans_train") + self.data_names = scene_info['name'] + self.overlaps = scene_info['score'] + # Only sample 10s + valid = (self.data_names[:,-2:] % 10).sum(axis=-1) == 0 + self.overlaps = self.overlaps[valid] + self.data_names = self.data_names[valid] + if len(self.data_names) > 10000: + pairinds = np.random.choice(np.arange(0,len(self.data_names)),10000,replace=False) + self.data_names = self.data_names[pairinds] + self.overlaps = self.overlaps[pairinds] + self.im_transform_ops = get_tuple_transform_ops(resize=(ht, wt), normalize=True) + self.depth_transform_ops = get_depth_tuple_transform_ops(resize=(ht, wt), normalize=False) + self.wt, self.ht = wt, ht + self.shake_t = shake_t + self.H_generator = GeometricSequential(K.RandomAffine(degrees=90, p=rot_prob)) + self.use_horizontal_flip_aug = use_horizontal_flip_aug + + def load_im(self, im_B, crop=None): + im = Image.open(im_B) + return im + + def load_depth(self, depth_ref, crop=None): + depth = cv2.imread(str(depth_ref), cv2.IMREAD_UNCHANGED) + depth = depth / 1000 + depth = torch.from_numpy(depth).float() # (h, w) + return depth + + def __len__(self): + return len(self.data_names) + + def scale_intrinsic(self, K, wi, hi): + sx, sy = self.wt / wi, self.ht / hi + sK = torch.tensor([[sx, 0, 0], + [0, sy, 0], + [0, 0, 1]]) + return sK@K + + def horizontal_flip(self, im_A, im_B, depth_A, depth_B, K_A, K_B): + im_A = im_A.flip(-1) + im_B = im_B.flip(-1) + depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1) + flip_mat = torch.tensor([[-1, 0, self.wt],[0,1,0],[0,0,1.]]).to(K_A.device) + K_A = flip_mat@K_A + K_B = flip_mat@K_B + + return im_A, im_B, depth_A, depth_B, K_A, K_B + def read_scannet_pose(self,path): + """ Read ScanNet's Camera2World pose and transform it to World2Camera. + + Returns: + pose_w2c (np.ndarray): (4, 4) + """ + cam2world = np.loadtxt(path, delimiter=' ') + world2cam = np.linalg.inv(cam2world) + return world2cam + + + def read_scannet_intrinsic(self,path): + """ Read ScanNet's intrinsic matrix and return the 3x3 matrix. + """ + intrinsic = np.loadtxt(path, delimiter=' ') + return torch.tensor(intrinsic[:-1, :-1], dtype = torch.float) + + def __getitem__(self, pair_idx): + # read intrinsics of original size + data_name = self.data_names[pair_idx] + scene_name, scene_sub_name, stem_name_1, stem_name_2 = data_name + scene_name = f'scene{scene_name:04d}_{scene_sub_name:02d}' + + # read the intrinsic of depthmap + K1 = K2 = self.read_scannet_intrinsic(osp.join(self.scene_root, + scene_name, + 'intrinsic', 'intrinsic_color.txt'))#the depth K is not the same, but doesnt really matter + # read and compute relative poses + T1 = self.read_scannet_pose(osp.join(self.scene_root, + scene_name, + 'pose', f'{stem_name_1}.txt')) + T2 = self.read_scannet_pose(osp.join(self.scene_root, + scene_name, + 'pose', f'{stem_name_2}.txt')) + T_1to2 = torch.tensor(np.matmul(T2, np.linalg.inv(T1)), dtype=torch.float)[:4, :4] # (4, 4) + + # Load positive pair data + im_A_ref = os.path.join(self.scene_root, scene_name, 'color', f'{stem_name_1}.jpg') + im_B_ref = os.path.join(self.scene_root, scene_name, 'color', f'{stem_name_2}.jpg') + depth_A_ref = os.path.join(self.scene_root, scene_name, 'depth', f'{stem_name_1}.png') + depth_B_ref = os.path.join(self.scene_root, scene_name, 'depth', f'{stem_name_2}.png') + + im_A = self.load_im(im_A_ref) + im_B = self.load_im(im_B_ref) + depth_A = self.load_depth(depth_A_ref) + depth_B = self.load_depth(depth_B_ref) + + # Recompute camera intrinsic matrix due to the resize + K1 = self.scale_intrinsic(K1, im_A.width, im_A.height) + K2 = self.scale_intrinsic(K2, im_B.width, im_B.height) + # Process images + im_A, im_B = self.im_transform_ops((im_A, im_B)) + depth_A, depth_B = self.depth_transform_ops((depth_A[None,None], depth_B[None,None])) + if self.use_horizontal_flip_aug: + if np.random.rand() > 0.5: + im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(im_A, im_B, depth_A, depth_B, K1, K2) + + data_dict = {'im_A': im_A, + 'im_B': im_B, + 'im_A_depth': depth_A[0,0], + 'im_B_depth': depth_B[0,0], + 'K1': K1, + 'K2': K2, + 'T_1to2':T_1to2, + } + return data_dict + + +class ScanNetBuilder: + def __init__(self, data_root = 'data/scannet') -> None: + self.data_root = data_root + self.scene_info_root = os.path.join(data_root,'scannet_indices') + self.all_scenes = os.listdir(self.scene_info_root) + + def build_scenes(self, split = 'train', min_overlap=0., **kwargs): + # Note: split doesn't matter here as we always use same scannet_train scenes + scene_names = self.all_scenes + scenes = [] + for scene_name in tqdm(scene_names, disable = roma.RANK > 0): + scene_info = np.load(os.path.join(self.scene_info_root,scene_name), allow_pickle=True) + scenes.append(ScanNetScene(self.data_root, scene_info, min_overlap=min_overlap, **kwargs)) + return scenes + + def weight_scenes(self, concat_dataset, alpha=.5): + ns = [] + for d in concat_dataset.datasets: + ns.append(len(d)) + ws = torch.cat([torch.ones(n)/n**alpha for n in ns]) + return ws diff --git a/third_party/RoMa/roma/losses/__init__.py b/third_party/RoMa/roma/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2e08abacfc0f83d7de0f2ddc0583766a80bf53cf --- /dev/null +++ b/third_party/RoMa/roma/losses/__init__.py @@ -0,0 +1 @@ +from .robust_loss import RobustLosses \ No newline at end of file diff --git a/third_party/RoMa/roma/losses/robust_loss.py b/third_party/RoMa/roma/losses/robust_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b932b2706f619c083485e1be0d86eec44ead83ef --- /dev/null +++ b/third_party/RoMa/roma/losses/robust_loss.py @@ -0,0 +1,157 @@ +from einops.einops import rearrange +import torch +import torch.nn as nn +import torch.nn.functional as F +from roma.utils.utils import get_gt_warp +import wandb +import roma +import math + +class RobustLosses(nn.Module): + def __init__( + self, + robust=False, + center_coords=False, + scale_normalize=False, + ce_weight=0.01, + local_loss=True, + local_dist=4.0, + local_largest_scale=8, + smooth_mask = False, + depth_interpolation_mode = "bilinear", + mask_depth_loss = False, + relative_depth_error_threshold = 0.05, + alpha = 1., + c = 1e-3, + ): + super().__init__() + self.robust = robust # measured in pixels + self.center_coords = center_coords + self.scale_normalize = scale_normalize + self.ce_weight = ce_weight + self.local_loss = local_loss + self.local_dist = local_dist + self.local_largest_scale = local_largest_scale + self.smooth_mask = smooth_mask + self.depth_interpolation_mode = depth_interpolation_mode + self.mask_depth_loss = mask_depth_loss + self.relative_depth_error_threshold = relative_depth_error_threshold + self.avg_overlap = dict() + self.alpha = alpha + self.c = c + + def gm_cls_loss(self, x2, prob, scale_gm_cls, gm_certainty, scale): + with torch.no_grad(): + B, C, H, W = scale_gm_cls.shape + device = x2.device + cls_res = round(math.sqrt(C)) + G = torch.meshgrid(*[torch.linspace(-1+1/cls_res, 1 - 1/cls_res, steps = cls_res,device = device) for _ in range(2)]) + G = torch.stack((G[1], G[0]), dim = -1).reshape(C,2) + GT = (G[None,:,None,None,:]-x2[:,None]).norm(dim=-1).min(dim=1).indices + cls_loss = F.cross_entropy(scale_gm_cls, GT, reduction = 'none')[prob > 0.99] + if not torch.any(cls_loss): + cls_loss = (certainty_loss * 0.0) # Prevent issues where prob is 0 everywhere + + certainty_loss = F.binary_cross_entropy_with_logits(gm_certainty[:,0], prob) + losses = { + f"gm_certainty_loss_{scale}": certainty_loss.mean(), + f"gm_cls_loss_{scale}": cls_loss.mean(), + } + wandb.log(losses, step = roma.GLOBAL_STEP) + return losses + + def delta_cls_loss(self, x2, prob, flow_pre_delta, delta_cls, certainty, scale, offset_scale): + with torch.no_grad(): + B, C, H, W = delta_cls.shape + device = x2.device + cls_res = round(math.sqrt(C)) + G = torch.meshgrid(*[torch.linspace(-1+1/cls_res, 1 - 1/cls_res, steps = cls_res,device = device) for _ in range(2)]) + G = torch.stack((G[1], G[0]), dim = -1).reshape(C,2) * offset_scale + GT = (G[None,:,None,None,:] + flow_pre_delta[:,None] - x2[:,None]).norm(dim=-1).min(dim=1).indices + cls_loss = F.cross_entropy(delta_cls, GT, reduction = 'none')[prob > 0.99] + if not torch.any(cls_loss): + cls_loss = (certainty_loss * 0.0) # Prevent issues where prob is 0 everywhere + certainty_loss = F.binary_cross_entropy_with_logits(certainty[:,0], prob) + losses = { + f"delta_certainty_loss_{scale}": certainty_loss.mean(), + f"delta_cls_loss_{scale}": cls_loss.mean(), + } + wandb.log(losses, step = roma.GLOBAL_STEP) + return losses + + def regression_loss(self, x2, prob, flow, certainty, scale, eps=1e-8, mode = "delta"): + epe = (flow.permute(0,2,3,1) - x2).norm(dim=-1) + if scale == 1: + pck_05 = (epe[prob > 0.99] < 0.5 * (2/512)).float().mean() + wandb.log({"train_pck_05": pck_05}, step = roma.GLOBAL_STEP) + + ce_loss = F.binary_cross_entropy_with_logits(certainty[:, 0], prob) + a = self.alpha + cs = self.c * scale + x = epe[prob > 0.99] + reg_loss = cs**a * ((x/(cs))**2 + 1**2)**(a/2) + if not torch.any(reg_loss): + reg_loss = (ce_loss * 0.0) # Prevent issues where prob is 0 everywhere + losses = { + f"{mode}_certainty_loss_{scale}": ce_loss.mean(), + f"{mode}_regression_loss_{scale}": reg_loss.mean(), + } + wandb.log(losses, step = roma.GLOBAL_STEP) + return losses + + def forward(self, corresps, batch): + scales = list(corresps.keys()) + tot_loss = 0.0 + # scale_weights due to differences in scale for regression gradients and classification gradients + scale_weights = {1:1, 2:1, 4:1, 8:1, 16:1} + for scale in scales: + scale_corresps = corresps[scale] + scale_certainty, flow_pre_delta, delta_cls, offset_scale, scale_gm_cls, scale_gm_certainty, flow, scale_gm_flow = ( + scale_corresps["certainty"], + scale_corresps["flow_pre_delta"], + scale_corresps.get("delta_cls"), + scale_corresps.get("offset_scale"), + scale_corresps.get("gm_cls"), + scale_corresps.get("gm_certainty"), + scale_corresps["flow"], + scale_corresps.get("gm_flow"), + + ) + flow_pre_delta = rearrange(flow_pre_delta, "b d h w -> b h w d") + b, h, w, d = flow_pre_delta.shape + gt_warp, gt_prob = get_gt_warp( + batch["im_A_depth"], + batch["im_B_depth"], + batch["T_1to2"], + batch["K1"], + batch["K2"], + H=h, + W=w, + ) + x2 = gt_warp.float() + prob = gt_prob + + if self.local_largest_scale >= scale: + prob = prob * ( + F.interpolate(prev_epe[:, None], size=(h, w), mode="nearest-exact")[:, 0] + < (2 / 512) * (self.local_dist[scale] * scale)) + + if scale_gm_cls is not None: + gm_cls_losses = self.gm_cls_loss(x2, prob, scale_gm_cls, scale_gm_certainty, scale) + gm_loss = self.ce_weight * gm_cls_losses[f"gm_certainty_loss_{scale}"] + gm_cls_losses[f"gm_cls_loss_{scale}"] + tot_loss = tot_loss + scale_weights[scale] * gm_loss + elif scale_gm_flow is not None: + gm_flow_losses = self.regression_loss(x2, prob, scale_gm_flow, scale_gm_certainty, scale, mode = "gm") + gm_loss = self.ce_weight * gm_flow_losses[f"gm_certainty_loss_{scale}"] + gm_flow_losses[f"gm_regression_loss_{scale}"] + tot_loss = tot_loss + scale_weights[scale] * gm_loss + + if delta_cls is not None: + delta_cls_losses = self.delta_cls_loss(x2, prob, flow_pre_delta, delta_cls, scale_certainty, scale, offset_scale) + delta_cls_loss = self.ce_weight * delta_cls_losses[f"delta_certainty_loss_{scale}"] + delta_cls_losses[f"delta_cls_loss_{scale}"] + tot_loss = tot_loss + scale_weights[scale] * delta_cls_loss + else: + delta_regression_losses = self.regression_loss(x2, prob, flow, scale_certainty, scale) + reg_loss = self.ce_weight * delta_regression_losses[f"delta_certainty_loss_{scale}"] + delta_regression_losses[f"delta_regression_loss_{scale}"] + tot_loss = tot_loss + scale_weights[scale] * reg_loss + prev_epe = (flow.permute(0,2,3,1) - x2).norm(dim=-1).detach() + return tot_loss diff --git a/third_party/RoMa/roma/models/__init__.py b/third_party/RoMa/roma/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f20461e2f3a1722e558cefab94c5164be8842c3 --- /dev/null +++ b/third_party/RoMa/roma/models/__init__.py @@ -0,0 +1 @@ +from .model_zoo import roma_outdoor, roma_indoor \ No newline at end of file diff --git a/third_party/Roma/roma/models/encoders.py b/third_party/RoMa/roma/models/encoders.py similarity index 83% rename from third_party/Roma/roma/models/encoders.py rename to third_party/RoMa/roma/models/encoders.py index 3b9a1a1791ec7b2f1352be1984d5232911366c0e..643360c9d61766f9f411a74bdf3a6f1114326bcb 100644 --- a/third_party/Roma/roma/models/encoders.py +++ b/third_party/RoMa/roma/models/encoders.py @@ -8,7 +8,8 @@ import gc class ResNet50(nn.Module): - def __init__(self, pretrained=False, high_res = False, weights = None, dilation = None, freeze_bn = True, anti_aliased = False, early_exit = False, amp = False) -> None: + def __init__(self, pretrained=False, high_res = False, weights = None, + dilation = None, freeze_bn = True, anti_aliased = False, early_exit = False, amp = False, amp_dtype = torch.float16) -> None: super().__init__() if dilation is None: dilation = [False,False,False] @@ -24,10 +25,7 @@ class ResNet50(nn.Module): self.freeze_bn = freeze_bn self.early_exit = early_exit self.amp = amp - if not torch.cuda.is_available(): - self.amp_dtype = torch.float32 - else: - self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + self.amp_dtype = amp_dtype def forward(self, x, **kwargs): with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype): @@ -59,14 +57,11 @@ class ResNet50(nn.Module): pass class VGG19(nn.Module): - def __init__(self, pretrained=False, amp = False) -> None: + def __init__(self, pretrained=False, amp = False, amp_dtype = torch.float16) -> None: super().__init__() self.layers = nn.ModuleList(tvm.vgg19_bn(pretrained=pretrained).features[:40]) self.amp = amp - if not torch.cuda.is_available(): - self.amp_dtype = torch.float32 - else: - self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + self.amp_dtype = amp_dtype def forward(self, x, **kwargs): with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype): @@ -80,7 +75,7 @@ class VGG19(nn.Module): return feats class CNNandDinov2(nn.Module): - def __init__(self, cnn_kwargs = None, amp = False, use_vgg = False, dinov2_weights = None): + def __init__(self, cnn_kwargs = None, amp = False, use_vgg = False, dinov2_weights = None, amp_dtype = torch.float16): super().__init__() if dinov2_weights is None: dinov2_weights = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth", map_location="cpu") @@ -100,10 +95,7 @@ class CNNandDinov2(nn.Module): else: self.cnn = VGG19(**cnn_kwargs) self.amp = amp - if not torch.cuda.is_available(): - self.amp_dtype = torch.float32 - else: - self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + self.amp_dtype = amp_dtype if self.amp: dinov2_vitl14 = dinov2_vitl14.to(self.amp_dtype) self.dinov2_vitl14 = [dinov2_vitl14] # ugly hack to not show parameters to DDP diff --git a/third_party/Roma/roma/models/matcher.py b/third_party/RoMa/roma/models/matcher.py similarity index 83% rename from third_party/Roma/roma/models/matcher.py rename to third_party/RoMa/roma/models/matcher.py index b68f2984e2d4515c2cf0a864213de27e714383fa..25a89c8dd99bc1eca8c591dbbc3b5ddbd987829c 100644 --- a/third_party/Roma/roma/models/matcher.py +++ b/third_party/RoMa/roma/models/matcher.py @@ -7,6 +7,7 @@ import torch.nn.functional as F from einops import rearrange import warnings from warnings import warn +from PIL import Image import roma from roma.utils import get_tuple_transform_ops @@ -37,6 +38,7 @@ class ConvRefiner(nn.Module): sample_mode = "bilinear", norm_type = nn.BatchNorm2d, bn_momentum = 0.1, + amp_dtype = torch.float16, ): super().__init__() self.bn_momentum = bn_momentum @@ -71,12 +73,8 @@ class ConvRefiner(nn.Module): self.disable_local_corr_grad = disable_local_corr_grad self.is_classifier = is_classifier self.sample_mode = sample_mode - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - if not torch.cuda.is_available(): - self.amp_dtype = torch.float32 - else: - self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 - + self.amp_dtype = amp_dtype + def create_block( self, in_dim, @@ -113,8 +111,8 @@ class ConvRefiner(nn.Module): if self.has_displacement_emb: im_A_coords = torch.meshgrid( ( - torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=self.device), - torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=self.device), + torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=x.device), + torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=x.device), ) ) im_A_coords = torch.stack((im_A_coords[1], im_A_coords[0])) @@ -278,7 +276,7 @@ class Decoder(nn.Module): def __init__( self, embedding_decoder, gps, proj, conv_refiner, detach=False, scales="all", pos_embeddings = None, num_refinement_steps_per_scale = 1, warp_noise_std = 0.0, displacement_dropout_p = 0.0, gm_warp_dropout_p = 0.0, - flow_upsample_mode = "bilinear" + flow_upsample_mode = "bilinear", amp_dtype = torch.float16, ): super().__init__() self.embedding_decoder = embedding_decoder @@ -300,11 +298,8 @@ class Decoder(nn.Module): self.displacement_dropout_p = displacement_dropout_p self.gm_warp_dropout_p = gm_warp_dropout_p self.flow_upsample_mode = flow_upsample_mode - if not torch.cuda.is_available(): - self.amp_dtype = torch.float32 - else: - self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 - + self.amp_dtype = amp_dtype + def get_placeholder_flow(self, b, h, w, device): coarse_coords = torch.meshgrid( ( @@ -367,7 +362,7 @@ class Decoder(nn.Module): corresps[ins] = {} f1_s, f2_s = f1[ins], f2[ins] if new_scale in self.proj: - with torch.autocast("cuda", self.amp_dtype): + with torch.autocast("cuda", dtype = self.amp_dtype): f1_s, f2_s = self.proj[new_scale](f1_s), self.proj[new_scale](f2_s) if ins in coarse_scales: @@ -429,11 +424,12 @@ class RegressionMatcher(nn.Module): decoder, h=448, w=448, - sample_mode = "threshold", + sample_mode = "threshold_balanced", upsample_preds = False, symmetric = False, name = None, attenuate_cert = None, + recrop_upsample = False, ): super().__init__() self.attenuate_cert = attenuate_cert @@ -448,6 +444,7 @@ class RegressionMatcher(nn.Module): self.upsample_res = (14*16*6, 14*16*6) self.symmetric = symmetric self.sample_thresh = 0.05 + self.recrop_upsample = recrop_upsample def get_output_resolution(self): if not self.upsample_preds: @@ -527,12 +524,62 @@ class RegressionMatcher(nn.Module): scale_factor=scale_factor) return corresps - def to_pixel_coordinates(self, matches, H_A, W_A, H_B, W_B): - kpts_A, kpts_B = matches[...,:2], matches[...,2:] + def to_pixel_coordinates(self, coords, H_A, W_A, H_B, W_B): + if isinstance(coords, (list, tuple)): + kpts_A, kpts_B = coords[0], coords[1] + else: + kpts_A, kpts_B = coords[...,:2], coords[...,2:] kpts_A = torch.stack((W_A/2 * (kpts_A[...,0]+1), H_A/2 * (kpts_A[...,1]+1)),axis=-1) kpts_B = torch.stack((W_B/2 * (kpts_B[...,0]+1), H_B/2 * (kpts_B[...,1]+1)),axis=-1) return kpts_A, kpts_B + + def to_normalized_coordinates(self, coords, H_A, W_A, H_B, W_B): + if isinstance(coords, (list, tuple)): + kpts_A, kpts_B = coords[0], coords[1] + else: + kpts_A, kpts_B = coords[...,:2], coords[...,2:] + kpts_A = torch.stack((2/W_A * kpts_A[...,0] - 1, 2/H_A * kpts_A[...,1] - 1),axis=-1) + kpts_B = torch.stack((2/W_B * kpts_B[...,0] - 1, 2/H_B * kpts_B[...,1] - 1),axis=-1) + return kpts_A, kpts_B + def match_keypoints(self, x_A, x_B, warp, certainty, return_tuple = True, return_inds = False): + x_A_to_B = F.grid_sample(warp[...,-2:].permute(2,0,1)[None], x_A[None,None], align_corners = False, mode = "bilinear")[0,:,0].mT + cert_A_to_B = F.grid_sample(certainty[None,None,...], x_A[None,None], align_corners = False, mode = "bilinear")[0,0,0] + D = torch.cdist(x_A_to_B, x_B) + inds_A, inds_B = torch.nonzero((D == D.min(dim=-1, keepdim = True).values) * (D == D.min(dim=-2, keepdim = True).values) * (cert_A_to_B[:,None] > self.sample_thresh), as_tuple = True) + + if return_tuple: + if return_inds: + return inds_A, inds_B + else: + return x_A[inds_A], x_B[inds_B] + else: + if return_inds: + return torch.cat((inds_A, inds_B),dim=-1) + else: + return torch.cat((x_A[inds_A], x_B[inds_B]),dim=-1) + + def get_roi(self, certainty, W, H, thr = 0.025): + raise NotImplementedError("WIP, disable for now") + hs,ws = certainty.shape + certainty = certainty/certainty.sum(dim=(-1,-2)) + cum_certainty_w = certainty.cumsum(dim=-1).sum(dim=-2) + cum_certainty_h = certainty.cumsum(dim=-2).sum(dim=-1) + print(cum_certainty_w) + print(torch.min(torch.nonzero(cum_certainty_w > thr))) + print(torch.min(torch.nonzero(cum_certainty_w < thr))) + left = int(W/ws * torch.min(torch.nonzero(cum_certainty_w > thr))) + right = int(W/ws * torch.max(torch.nonzero(cum_certainty_w < 1 - thr))) + top = int(H/hs * torch.min(torch.nonzero(cum_certainty_h > thr))) + bottom = int(H/hs * torch.max(torch.nonzero(cum_certainty_h < 1 - thr))) + print(left, right, top, bottom) + return left, top, right, bottom + + def recrop(self, certainty, image_path): + roi = self.get_roi(certainty, *Image.open(image_path).size) + return Image.open(image_path).convert("RGB").crop(roi) + + @torch.inference_mode() def match( self, im_A_path, @@ -543,9 +590,8 @@ class RegressionMatcher(nn.Module): ): if device is None: device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - from PIL import Image if isinstance(im_A_path, (str, os.PathLike)): - im_A, im_B = Image.open(im_A_path), Image.open(im_B_path) + im_A, im_B = Image.open(im_A_path).convert("RGB"), Image.open(im_B_path).convert("RGB") else: # Assume its not a path im_A, im_B = im_A_path, im_B_path @@ -597,7 +643,14 @@ class RegressionMatcher(nn.Module): test_transform = get_tuple_transform_ops( resize=(hs, ws), normalize=True ) - im_A, im_B = Image.open(im_A_path), Image.open(im_B_path) + if self.recrop_upsample: + certainty = corresps[finest_scale]["certainty"] + print(certainty.shape) + im_A = self.recrop(certainty[0,0], im_A_path) + im_B = self.recrop(certainty[1,0], im_B_path) + #TODO: need to adjust corresps when doing this + else: + im_A, im_B = Image.open(im_A_path).convert("RGB"), Image.open(im_B_path).convert("RGB") im_A, im_B = test_transform((im_A, im_B)) im_A, im_B = im_A[None].to(device), im_B[None].to(device) scale_factor = math.sqrt(self.upsample_res[0] * self.upsample_res[1] / (self.w_resized * self.h_resized)) @@ -653,4 +706,30 @@ class RegressionMatcher(nn.Module): warp[0], certainty[0, 0], ) + + def visualize_warp(self, warp, certainty, im_A = None, im_B = None, im_A_path = None, im_B_path = None, device = "cuda", symmetric = True, save_path = None): + assert symmetric == True, "Currently assuming bidirectional warp, might update this if someone complains ;)" + H,W2,_ = warp.shape + W = W2//2 if symmetric else W2 + if im_A is None: + from PIL import Image + im_A, im_B = Image.open(im_A_path).convert("RGB"), Image.open(im_B_path).convert("RGB") + im_A = im_A.resize((W,H)) + im_B = im_B.resize((W,H)) + + x_A = (torch.tensor(np.array(im_A)) / 255).to(device).permute(2, 0, 1) + x_B = (torch.tensor(np.array(im_B)) / 255).to(device).permute(2, 0, 1) + im_A_transfer_rgb = F.grid_sample( + x_B[None], warp[:,:W, 2:][None], mode="bilinear", align_corners=False + )[0] + im_B_transfer_rgb = F.grid_sample( + x_A[None], warp[:, W:, :2][None], mode="bilinear", align_corners=False + )[0] + warp_im = torch.cat((im_A_transfer_rgb,im_B_transfer_rgb),dim=2) + white_im = torch.ones((H,2*W),device=device) + vis_im = certainty * warp_im + (1 - certainty) * white_im + if save_path is not None: + from roma.utils import tensor_to_pil + tensor_to_pil(vis_im, unnormalize=False).save(save_path) + return vis_im diff --git a/third_party/RoMa/roma/models/model_zoo/__init__.py b/third_party/RoMa/roma/models/model_zoo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..49ca7b8557cb8f6948bca28c631e39d899e49177 --- /dev/null +++ b/third_party/RoMa/roma/models/model_zoo/__init__.py @@ -0,0 +1,53 @@ +from typing import Union +import torch +from .roma_models import roma_model + +weight_urls = { + "roma": { + "outdoor": "https://github.com/Parskatt/storage/releases/download/roma/roma_outdoor.pth", + "indoor": "https://github.com/Parskatt/storage/releases/download/roma/roma_indoor.pth", + }, + "dinov2": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth", #hopefully this doesnt change :D +} + +def roma_outdoor(device, weights=None, dinov2_weights=None, coarse_res: Union[int,tuple[int,int]] = 560, upsample_res: Union[int,tuple[int,int]] = 864, amp_dtype: torch.dtype = torch.float16): + if isinstance(coarse_res, int): + coarse_res = (coarse_res, coarse_res) + if isinstance(upsample_res, int): + upsample_res = (upsample_res, upsample_res) + + assert coarse_res[0] % 14 == 0, "Needs to be multiple of 14 for backbone" + assert coarse_res[1] % 14 == 0, "Needs to be multiple of 14 for backbone" + + if weights is None: + weights = torch.hub.load_state_dict_from_url(weight_urls["roma"]["outdoor"], + map_location=device) + if dinov2_weights is None: + dinov2_weights = torch.hub.load_state_dict_from_url(weight_urls["dinov2"], + map_location=device) + model = roma_model(resolution=coarse_res, upsample_preds=True, + weights=weights,dinov2_weights = dinov2_weights,device=device, amp_dtype=amp_dtype) + model.upsample_res = upsample_res + print(f"Using coarse resolution {coarse_res}, and upsample res {model.upsample_res}") + return model + +def roma_indoor(device, weights=None, dinov2_weights=None, coarse_res: Union[int,tuple[int,int]] = 560, upsample_res: Union[int,tuple[int,int]] = 864, amp_dtype: torch.dtype = torch.float16): + if isinstance(coarse_res, int): + coarse_res = (coarse_res, coarse_res) + if isinstance(upsample_res, int): + upsample_res = (upsample_res, upsample_res) + + assert coarse_res[0] % 14 == 0, "Needs to be multiple of 14 for backbone" + assert coarse_res[1] % 14 == 0, "Needs to be multiple of 14 for backbone" + + if weights is None: + weights = torch.hub.load_state_dict_from_url(weight_urls["roma"]["indoor"], + map_location=device) + if dinov2_weights is None: + dinov2_weights = torch.hub.load_state_dict_from_url(weight_urls["dinov2"], + map_location=device) + model = roma_model(resolution=coarse_res, upsample_preds=True, + weights=weights,dinov2_weights = dinov2_weights,device=device, amp_dtype=amp_dtype) + model.upsample_res = upsample_res + print(f"Using coarse resolution {coarse_res}, and upsample res {model.upsample_res}") + return model diff --git a/third_party/RoMa/roma/models/model_zoo/roma_models.py b/third_party/RoMa/roma/models/model_zoo/roma_models.py new file mode 100644 index 0000000000000000000000000000000000000000..13f8d872f3aad6ef42b090b123f77a96ff1ce68f --- /dev/null +++ b/third_party/RoMa/roma/models/model_zoo/roma_models.py @@ -0,0 +1,160 @@ +import warnings +import torch.nn as nn +import torch +from roma.models.matcher import * +from roma.models.transformer import Block, TransformerDecoder, MemEffAttention +from roma.models.encoders import * + +def roma_model(resolution, upsample_preds, device = None, weights=None, dinov2_weights=None, amp_dtype: torch.dtype=torch.float16, **kwargs): + # roma weights and dinov2 weights are loaded seperately, as dinov2 weights are not parameters + #torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul TODO: these probably ruin stuff, should be careful + #torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn + warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated') + gp_dim = 512 + feat_dim = 512 + decoder_dim = gp_dim + feat_dim + cls_to_coord_res = 64 + coordinate_decoder = TransformerDecoder( + nn.Sequential(*[Block(decoder_dim, 8, attn_class=MemEffAttention) for _ in range(5)]), + decoder_dim, + cls_to_coord_res**2 + 1, + is_classifier=True, + amp = True, + pos_enc = False,) + dw = True + hidden_blocks = 8 + kernel_size = 5 + displacement_emb = "linear" + disable_local_corr_grad = True + + conv_refiner = nn.ModuleDict( + { + "16": ConvRefiner( + 2 * 512+128+(2*7+1)**2, + 2 * 512+128+(2*7+1)**2, + 2 + 1, + kernel_size=kernel_size, + dw=dw, + hidden_blocks=hidden_blocks, + displacement_emb=displacement_emb, + displacement_emb_dim=128, + local_corr_radius = 7, + corr_in_other = True, + amp = True, + disable_local_corr_grad = disable_local_corr_grad, + bn_momentum = 0.01, + ), + "8": ConvRefiner( + 2 * 512+64+(2*3+1)**2, + 2 * 512+64+(2*3+1)**2, + 2 + 1, + kernel_size=kernel_size, + dw=dw, + hidden_blocks=hidden_blocks, + displacement_emb=displacement_emb, + displacement_emb_dim=64, + local_corr_radius = 3, + corr_in_other = True, + amp = True, + disable_local_corr_grad = disable_local_corr_grad, + bn_momentum = 0.01, + ), + "4": ConvRefiner( + 2 * 256+32+(2*2+1)**2, + 2 * 256+32+(2*2+1)**2, + 2 + 1, + kernel_size=kernel_size, + dw=dw, + hidden_blocks=hidden_blocks, + displacement_emb=displacement_emb, + displacement_emb_dim=32, + local_corr_radius = 2, + corr_in_other = True, + amp = True, + disable_local_corr_grad = disable_local_corr_grad, + bn_momentum = 0.01, + ), + "2": ConvRefiner( + 2 * 64+16, + 128+16, + 2 + 1, + kernel_size=kernel_size, + dw=dw, + hidden_blocks=hidden_blocks, + displacement_emb=displacement_emb, + displacement_emb_dim=16, + amp = True, + disable_local_corr_grad = disable_local_corr_grad, + bn_momentum = 0.01, + ), + "1": ConvRefiner( + 2 * 9 + 6, + 24, + 2 + 1, + kernel_size=kernel_size, + dw=dw, + hidden_blocks = hidden_blocks, + displacement_emb = displacement_emb, + displacement_emb_dim = 6, + amp = True, + disable_local_corr_grad = disable_local_corr_grad, + bn_momentum = 0.01, + ), + } + ) + kernel_temperature = 0.2 + learn_temperature = False + no_cov = True + kernel = CosKernel + only_attention = False + basis = "fourier" + gp16 = GP( + kernel, + T=kernel_temperature, + learn_temperature=learn_temperature, + only_attention=only_attention, + gp_dim=gp_dim, + basis=basis, + no_cov=no_cov, + ) + gps = nn.ModuleDict({"16": gp16}) + proj16 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1), nn.BatchNorm2d(512)) + proj8 = nn.Sequential(nn.Conv2d(512, 512, 1, 1), nn.BatchNorm2d(512)) + proj4 = nn.Sequential(nn.Conv2d(256, 256, 1, 1), nn.BatchNorm2d(256)) + proj2 = nn.Sequential(nn.Conv2d(128, 64, 1, 1), nn.BatchNorm2d(64)) + proj1 = nn.Sequential(nn.Conv2d(64, 9, 1, 1), nn.BatchNorm2d(9)) + proj = nn.ModuleDict({ + "16": proj16, + "8": proj8, + "4": proj4, + "2": proj2, + "1": proj1, + }) + displacement_dropout_p = 0.0 + gm_warp_dropout_p = 0.0 + decoder = Decoder(coordinate_decoder, + gps, + proj, + conv_refiner, + detach=True, + scales=["16", "8", "4", "2", "1"], + displacement_dropout_p = displacement_dropout_p, + gm_warp_dropout_p = gm_warp_dropout_p) + + encoder = CNNandDinov2( + cnn_kwargs = dict( + pretrained=False, + amp = True), + amp = True, + use_vgg = True, + dinov2_weights = dinov2_weights, + amp_dtype=amp_dtype, + ) + h,w = resolution + symmetric = True + attenuate_cert = True + sample_mode = "threshold_balanced" + matcher = RegressionMatcher(encoder, decoder, h=h, w=w, upsample_preds=upsample_preds, + symmetric = symmetric, attenuate_cert = attenuate_cert, sample_mode = sample_mode, **kwargs).to(device) + matcher.load_state_dict(weights) + return matcher diff --git a/third_party/RoMa/roma/models/transformer/__init__.py b/third_party/RoMa/roma/models/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c93008ecdaab3fa19d7166b213f8d4f664bf65d5 --- /dev/null +++ b/third_party/RoMa/roma/models/transformer/__init__.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from roma.utils.utils import get_grid +from .layers.block import Block +from .layers.attention import MemEffAttention +from .dinov2 import vit_large + +class TransformerDecoder(nn.Module): + def __init__(self, blocks, hidden_dim, out_dim, is_classifier = False, *args, + amp = False, pos_enc = True, learned_embeddings = False, embedding_dim = None, amp_dtype = torch.float16, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.blocks = blocks + self.to_out = nn.Linear(hidden_dim, out_dim) + self.hidden_dim = hidden_dim + self.out_dim = out_dim + self._scales = [16] + self.is_classifier = is_classifier + self.amp = amp + self.amp_dtype = amp_dtype + self.pos_enc = pos_enc + self.learned_embeddings = learned_embeddings + if self.learned_embeddings: + self.learned_pos_embeddings = nn.Parameter(nn.init.kaiming_normal_(torch.empty((1, hidden_dim, embedding_dim, embedding_dim)))) + + def scales(self): + return self._scales.copy() + + def forward(self, gp_posterior, features, old_stuff, new_scale): + with torch.autocast("cuda", dtype=self.amp_dtype, enabled=self.amp): + B,C,H,W = gp_posterior.shape + x = torch.cat((gp_posterior, features), dim = 1) + B,C,H,W = x.shape + grid = get_grid(B, H, W, x.device).reshape(B,H*W,2) + if self.learned_embeddings: + pos_enc = F.interpolate(self.learned_pos_embeddings, size = (H,W), mode = 'bilinear', align_corners = False).permute(0,2,3,1).reshape(1,H*W,C) + else: + pos_enc = 0 + tokens = x.reshape(B,C,H*W).permute(0,2,1) + pos_enc + z = self.blocks(tokens) + out = self.to_out(z) + out = out.permute(0,2,1).reshape(B, self.out_dim, H, W) + warp, certainty = out[:, :-1], out[:, -1:] + return warp, certainty, None + + diff --git a/third_party/Roma/roma/models/transformer/dinov2.py b/third_party/RoMa/roma/models/transformer/dinov2.py similarity index 82% rename from third_party/Roma/roma/models/transformer/dinov2.py rename to third_party/RoMa/roma/models/transformer/dinov2.py index 1c27c65b5061cc0113792e40b96eaf7f4266ce18..b556c63096d17239c8603d5fe626c331963099fd 100644 --- a/third_party/Roma/roma/models/transformer/dinov2.py +++ b/third_party/RoMa/roma/models/transformer/dinov2.py @@ -18,29 +18,16 @@ import torch.nn as nn import torch.utils.checkpoint from torch.nn.init import trunc_normal_ -from .layers import ( - Mlp, - PatchEmbed, - SwiGLUFFNFused, - MemEffAttention, - NestedTensorBlock as Block, -) - - -def named_apply( - fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False -) -> nn.Module: +from .layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: if not depth_first and include_root: fn(module=module, name=name) for child_name, child_module in module.named_children(): child_name = ".".join((name, child_name)) if name else child_name - named_apply( - fn=fn, - module=child_module, - name=child_name, - depth_first=depth_first, - include_root=True, - ) + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) if depth_first and include_root: fn(module=module, name=name) return module @@ -100,33 +87,22 @@ class DinoVisionTransformer(nn.Module): super().__init__() norm_layer = partial(nn.LayerNorm, eps=1e-6) - self.num_features = ( - self.embed_dim - ) = embed_dim # num_features for consistency with other models + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models self.num_tokens = 1 self.n_blocks = depth self.num_heads = num_heads self.patch_size = patch_size - self.patch_embed = embed_layer( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - ) + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) num_patches = self.patch_embed.num_patches self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - self.pos_embed = nn.Parameter( - torch.zeros(1, num_patches + self.num_tokens, embed_dim) - ) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) if drop_path_uniform is True: dpr = [drop_path_rate] * depth else: - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, depth) - ] # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule if ffn_layer == "mlp": ffn_layer = Mlp @@ -163,9 +139,7 @@ class DinoVisionTransformer(nn.Module): chunksize = depth // block_chunks for i in range(0, depth, chunksize): # this is to keep the block index consistent if we chunk the block list - chunked_blocks.append( - [nn.Identity()] * i + blocks_list[i : i + chunksize] - ) + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) else: self.chunked_blocks = False @@ -179,7 +153,7 @@ class DinoVisionTransformer(nn.Module): self.init_weights() for param in self.parameters(): param.requires_grad = False - + @property def device(self): return self.cls_token.device @@ -206,29 +180,20 @@ class DinoVisionTransformer(nn.Module): w0, h0 = w0 + 0.1, h0 + 0.1 patch_pos_embed = nn.functional.interpolate( - patch_pos_embed.reshape( - 1, int(math.sqrt(N)), int(math.sqrt(N)), dim - ).permute(0, 3, 1, 2), + patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2), scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), mode="bicubic", ) - assert ( - int(w0) == patch_pos_embed.shape[-2] - and int(h0) == patch_pos_embed.shape[-1] - ) + assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1] patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to( - previous_dtype - ) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) def prepare_tokens_with_masks(self, x, masks=None): B, nc, w, h = x.shape x = self.patch_embed(x) if masks is not None: - x = torch.where( - masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x - ) + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) x = x + self.interpolate_pos_encoding(x, w, h) @@ -236,10 +201,7 @@ class DinoVisionTransformer(nn.Module): return x def forward_features_list(self, x_list, masks_list): - x = [ - self.prepare_tokens_with_masks(x, masks) - for x, masks in zip(x_list, masks_list) - ] + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] for blk in self.blocks: x = blk(x) @@ -278,34 +240,26 @@ class DinoVisionTransformer(nn.Module): x = self.prepare_tokens_with_masks(x) # If n is an int, take the n last blocks. If it's a list, take them output, total_block_len = [], len(self.blocks) - blocks_to_take = ( - range(total_block_len - n, total_block_len) if isinstance(n, int) else n - ) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n for i, blk in enumerate(self.blocks): x = blk(x) if i in blocks_to_take: output.append(x) - assert len(output) == len( - blocks_to_take - ), f"only {len(output)} / {len(blocks_to_take)} blocks found" + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" return output def _get_intermediate_layers_chunked(self, x, n=1): x = self.prepare_tokens_with_masks(x) output, i, total_block_len = [], 0, len(self.blocks[-1]) # If n is an int, take the n last blocks. If it's a list, take them - blocks_to_take = ( - range(total_block_len - n, total_block_len) if isinstance(n, int) else n - ) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n for block_chunk in self.blocks: for blk in block_chunk[i:]: # Passing the nn.Identity() x = blk(x) if i in blocks_to_take: output.append(x) i += 1 - assert len(output) == len( - blocks_to_take - ), f"only {len(output)} / {len(blocks_to_take)} blocks found" + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" return output def get_intermediate_layers( @@ -327,9 +281,7 @@ class DinoVisionTransformer(nn.Module): if reshape: B, _, w, h = x.shape outputs = [ - out.reshape(B, w // self.patch_size, h // self.patch_size, -1) - .permute(0, 3, 1, 2) - .contiguous() + out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous() for out in outputs ] if return_class_token: @@ -404,4 +356,4 @@ def vit_giant2(patch_size=16, **kwargs): block_fn=partial(Block, attn_class=MemEffAttention), **kwargs, ) - return model + return model \ No newline at end of file diff --git a/third_party/Roma/roma/models/transformer/layers/__init__.py b/third_party/RoMa/roma/models/transformer/layers/__init__.py similarity index 100% rename from third_party/Roma/roma/models/transformer/layers/__init__.py rename to third_party/RoMa/roma/models/transformer/layers/__init__.py diff --git a/third_party/Roma/roma/models/transformer/layers/attention.py b/third_party/RoMa/roma/models/transformer/layers/attention.py similarity index 93% rename from third_party/Roma/roma/models/transformer/layers/attention.py rename to third_party/RoMa/roma/models/transformer/layers/attention.py index 12f388719bf5f171d59aee238d902bb7915f864b..1f9b0c94b40967dfdff4f261c127cbd21328c905 100644 --- a/third_party/Roma/roma/models/transformer/layers/attention.py +++ b/third_party/RoMa/roma/models/transformer/layers/attention.py @@ -48,11 +48,7 @@ class Attention(nn.Module): def forward(self, x: Tensor) -> Tensor: B, N, C = x.shape - qkv = ( - self.qkv(x) - .reshape(B, N, 3, self.num_heads, C // self.num_heads) - .permute(2, 0, 3, 1, 4) - ) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] attn = q @ k.transpose(-2, -1) diff --git a/third_party/Roma/roma/models/transformer/layers/block.py b/third_party/RoMa/roma/models/transformer/layers/block.py similarity index 83% rename from third_party/Roma/roma/models/transformer/layers/block.py rename to third_party/RoMa/roma/models/transformer/layers/block.py index 1b5f5158f073788d3d5fe3e09742d4485ef26441..25488f57cc0ad3c692f86b62555f6668e2a66db1 100644 --- a/third_party/Roma/roma/models/transformer/layers/block.py +++ b/third_party/RoMa/roma/models/transformer/layers/block.py @@ -62,9 +62,7 @@ class Block(nn.Module): attn_drop=attn_drop, proj_drop=drop, ) - self.ls1 = ( - LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - ) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) @@ -76,9 +74,7 @@ class Block(nn.Module): drop=drop, bias=ffn_bias, ) - self.ls2 = ( - LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - ) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.sample_drop_ratio = drop_path @@ -131,9 +127,7 @@ def drop_add_residual_stochastic_depth( residual_scale_factor = b / sample_subset_size # 3) add the residual - x_plus_residual = torch.index_add( - x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor - ) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) return x_plus_residual.view_as(x) @@ -149,16 +143,10 @@ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None if scaling_vector is None: x_flat = x.flatten(1) residual = residual.flatten(1) - x_plus_residual = torch.index_add( - x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor - ) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) else: x_plus_residual = scaled_index_add( - x, - brange, - residual.to(dtype=x.dtype), - scaling=scaling_vector, - alpha=residual_scale_factor, + x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor ) return x_plus_residual @@ -170,11 +158,7 @@ def get_attn_bias_and_cat(x_list, branges=None): """ this will perform the index select, cat the tensors, and provide the attn_bias from cache """ - batch_sizes = ( - [b.shape[0] for b in branges] - if branges is not None - else [x.shape[0] for x in x_list] - ) + batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list] all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) if all_shapes not in attn_bias_cache.keys(): seqlens = [] @@ -186,9 +170,7 @@ def get_attn_bias_and_cat(x_list, branges=None): attn_bias_cache[all_shapes] = attn_bias if branges is not None: - cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view( - 1, -1, x_list[0].shape[-1] - ) + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1]) else: tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) cat_tensors = torch.cat(tensors_bs1, dim=1) @@ -203,9 +185,7 @@ def drop_add_residual_stochastic_depth_list( scaling_vector=None, ) -> Tensor: # 1) generate random set of indices for dropping samples in the batch - branges_scales = [ - get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list - ] + branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list] branges = [s[0] for s in branges_scales] residual_scale_factors = [s[1] for s in branges_scales] @@ -216,14 +196,8 @@ def drop_add_residual_stochastic_depth_list( residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore outputs = [] - for x, brange, residual, residual_scale_factor in zip( - x_list, branges, residual_list, residual_scale_factors - ): - outputs.append( - add_residual( - x, brange, residual, residual_scale_factor, scaling_vector - ).view_as(x) - ) + for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors): + outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)) return outputs @@ -246,17 +220,13 @@ class NestedTensorBlock(Block): x_list, residual_func=attn_residual_func, sample_drop_ratio=self.sample_drop_ratio, - scaling_vector=self.ls1.gamma - if isinstance(self.ls1, LayerScale) - else None, + scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None, ) x_list = drop_add_residual_stochastic_depth_list( x_list, residual_func=ffn_residual_func, sample_drop_ratio=self.sample_drop_ratio, - scaling_vector=self.ls2.gamma - if isinstance(self.ls1, LayerScale) - else None, + scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None, ) return x_list else: @@ -276,9 +246,7 @@ class NestedTensorBlock(Block): if isinstance(x_or_x_list, Tensor): return super().forward(x_or_x_list) elif isinstance(x_or_x_list, list): - assert ( - XFORMERS_AVAILABLE - ), "Please install xFormers for nested tensors usage" + assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage" return self.forward_nested(x_or_x_list) else: raise AssertionError diff --git a/third_party/Roma/roma/models/transformer/layers/dino_head.py b/third_party/RoMa/roma/models/transformer/layers/dino_head.py similarity index 85% rename from third_party/Roma/roma/models/transformer/layers/dino_head.py rename to third_party/RoMa/roma/models/transformer/layers/dino_head.py index 1147dd3a3c046aee8d427b42b1055f38a218275b..7212db92a4fd8d4c7230e284e551a0234e9d8623 100644 --- a/third_party/Roma/roma/models/transformer/layers/dino_head.py +++ b/third_party/RoMa/roma/models/transformer/layers/dino_head.py @@ -23,14 +23,7 @@ class DINOHead(nn.Module): ): super().__init__() nlayers = max(nlayers, 1) - self.mlp = _build_mlp( - nlayers, - in_dim, - bottleneck_dim, - hidden_dim=hidden_dim, - use_bn=use_bn, - bias=mlp_bias, - ) + self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) self.apply(self._init_weights) self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) self.last_layer.weight_g.data.fill_(1) @@ -49,9 +42,7 @@ class DINOHead(nn.Module): return x -def _build_mlp( - nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True -): +def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): if nlayers == 1: return nn.Linear(in_dim, bottleneck_dim, bias=bias) else: diff --git a/third_party/Roma/roma/models/transformer/layers/drop_path.py b/third_party/RoMa/roma/models/transformer/layers/drop_path.py similarity index 90% rename from third_party/Roma/roma/models/transformer/layers/drop_path.py rename to third_party/RoMa/roma/models/transformer/layers/drop_path.py index a23ba7325d0fd154d5885573770956042ce2311d..af05625984dd14682cc96a63bf0c97bab1f123b1 100644 --- a/third_party/Roma/roma/models/transformer/layers/drop_path.py +++ b/third_party/RoMa/roma/models/transformer/layers/drop_path.py @@ -16,9 +16,7 @@ def drop_path(x, drop_prob: float = 0.0, training: bool = False): if drop_prob == 0.0 or not training: return x keep_prob = 1 - drop_prob - shape = (x.shape[0],) + (1,) * ( - x.ndim - 1 - ) # work with diff dim tensors, not just 2D ConvNets + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets random_tensor = x.new_empty(shape).bernoulli_(keep_prob) if keep_prob > 0.0: random_tensor.div_(keep_prob) diff --git a/third_party/Roma/roma/models/transformer/layers/layer_scale.py b/third_party/RoMa/roma/models/transformer/layers/layer_scale.py similarity index 100% rename from third_party/Roma/roma/models/transformer/layers/layer_scale.py rename to third_party/RoMa/roma/models/transformer/layers/layer_scale.py diff --git a/third_party/Roma/roma/models/transformer/layers/mlp.py b/third_party/RoMa/roma/models/transformer/layers/mlp.py similarity index 100% rename from third_party/Roma/roma/models/transformer/layers/mlp.py rename to third_party/RoMa/roma/models/transformer/layers/mlp.py diff --git a/third_party/Roma/roma/models/transformer/layers/patch_embed.py b/third_party/RoMa/roma/models/transformer/layers/patch_embed.py similarity index 81% rename from third_party/Roma/roma/models/transformer/layers/patch_embed.py rename to third_party/RoMa/roma/models/transformer/layers/patch_embed.py index 837f952cf9a463444feeb146e0d5b539102ee26c..574abe41175568d700a389b8b96d1ba554914779 100644 --- a/third_party/Roma/roma/models/transformer/layers/patch_embed.py +++ b/third_party/RoMa/roma/models/transformer/layers/patch_embed.py @@ -63,21 +63,15 @@ class PatchEmbed(nn.Module): self.flatten_embedding = flatten_embedding - self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW - ) + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() def forward(self, x: Tensor) -> Tensor: _, _, H, W = x.shape patch_H, patch_W = self.patch_size - assert ( - H % patch_H == 0 - ), f"Input image height {H} is not a multiple of patch height {patch_H}" - assert ( - W % patch_W == 0 - ), f"Input image width {W} is not a multiple of patch width: {patch_W}" + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" x = self.proj(x) # B C H W H, W = x.size(2), x.size(3) @@ -89,13 +83,7 @@ class PatchEmbed(nn.Module): def flops(self) -> float: Ho, Wo = self.patches_resolution - flops = ( - Ho - * Wo - * self.embed_dim - * self.in_chans - * (self.patch_size[0] * self.patch_size[1]) - ) + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) if self.norm is not None: flops += Ho * Wo * self.embed_dim return flops diff --git a/third_party/Roma/roma/models/transformer/layers/swiglu_ffn.py b/third_party/RoMa/roma/models/transformer/layers/swiglu_ffn.py similarity index 100% rename from third_party/Roma/roma/models/transformer/layers/swiglu_ffn.py rename to third_party/RoMa/roma/models/transformer/layers/swiglu_ffn.py diff --git a/third_party/Roma/roma/train/__init__.py b/third_party/RoMa/roma/train/__init__.py similarity index 100% rename from third_party/Roma/roma/train/__init__.py rename to third_party/RoMa/roma/train/__init__.py diff --git a/third_party/Roma/roma/train/train.py b/third_party/RoMa/roma/train/train.py similarity index 65% rename from third_party/Roma/roma/train/train.py rename to third_party/RoMa/roma/train/train.py index eb3deaf1792a315d1cce77a2ee0fd50ae9e98ac1..5556f7ebf9b6378e1395c125dde093f5e55e7141 100644 --- a/third_party/Roma/roma/train/train.py +++ b/third_party/RoMa/roma/train/train.py @@ -4,62 +4,41 @@ import roma import torch import wandb - -def log_param_statistics(named_parameters, norm_type=2): +def log_param_statistics(named_parameters, norm_type = 2): named_parameters = list(named_parameters) grads = [p.grad for n, p in named_parameters if p.grad is not None] - weight_norms = [ - p.norm(p=norm_type) for n, p in named_parameters if p.grad is not None - ] - names = [n for n, p in named_parameters if p.grad is not None] + weight_norms = [p.norm(p=norm_type) for n, p in named_parameters if p.grad is not None] + names = [n for n,p in named_parameters if p.grad is not None] param_norm = torch.stack(weight_norms).norm(p=norm_type) device = grads[0].device - grad_norms = torch.stack( - [torch.norm(g.detach(), norm_type).to(device) for g in grads] - ) + grad_norms = torch.stack([torch.norm(g.detach(), norm_type).to(device) for g in grads]) nans_or_infs = torch.isinf(grad_norms) | torch.isnan(grad_norms) nan_inf_names = [name for name, naninf in zip(names, nans_or_infs) if naninf] total_grad_norm = torch.norm(grad_norms, norm_type) if torch.any(nans_or_infs): print(f"These params have nan or inf grads: {nan_inf_names}") - wandb.log({"grad_norm": total_grad_norm.item()}, step=roma.GLOBAL_STEP) - wandb.log({"param_norm": param_norm.item()}, step=roma.GLOBAL_STEP) - + wandb.log({"grad_norm": total_grad_norm.item()}, step = roma.GLOBAL_STEP) + wandb.log({"param_norm": param_norm.item()}, step = roma.GLOBAL_STEP) -def train_step( - train_batch, model, objective, optimizer, grad_scaler, grad_clip_norm=1.0, **kwargs -): +def train_step(train_batch, model, objective, optimizer, grad_scaler, grad_clip_norm = 1.,**kwargs): optimizer.zero_grad() out = model(train_batch) l = objective(out, train_batch) grad_scaler.scale(l).backward() grad_scaler.unscale_(optimizer) log_param_statistics(model.named_parameters()) - torch.nn.utils.clip_grad_norm_( - model.parameters(), grad_clip_norm - ) # what should max norm be? + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_norm) # what should max norm be? grad_scaler.step(optimizer) grad_scaler.update() - wandb.log({"grad_scale": grad_scaler._scale.item()}, step=roma.GLOBAL_STEP) - if grad_scaler._scale < 1.0: - grad_scaler._scale = torch.tensor(1.0).to(grad_scaler._scale) - roma.GLOBAL_STEP = roma.GLOBAL_STEP + roma.STEP_SIZE # increment global step + wandb.log({"grad_scale": grad_scaler._scale.item()}, step = roma.GLOBAL_STEP) + if grad_scaler._scale < 1.: + grad_scaler._scale = torch.tensor(1.).to(grad_scaler._scale) + roma.GLOBAL_STEP = roma.GLOBAL_STEP + roma.STEP_SIZE # increment global step return {"train_out": out, "train_loss": l.item()} def train_k_steps( - n_0, - k, - dataloader, - model, - objective, - optimizer, - lr_scheduler, - grad_scaler, - progress_bar=True, - grad_clip_norm=1.0, - warmup=None, - ema_model=None, + n_0, k, dataloader, model, objective, optimizer, lr_scheduler, grad_scaler, progress_bar=True, grad_clip_norm = 1., warmup = None, ema_model = None, ): for n in tqdm(range(n_0, n_0 + k), disable=(not progress_bar) or roma.RANK > 0): batch = next(dataloader) @@ -73,7 +52,7 @@ def train_k_steps( lr_scheduler=lr_scheduler, grad_scaler=grad_scaler, n=n, - grad_clip_norm=grad_clip_norm, + grad_clip_norm = grad_clip_norm, ) if ema_model is not None: ema_model.update() @@ -82,10 +61,7 @@ def train_k_steps( lr_scheduler.step() else: lr_scheduler.step() - [ - wandb.log({f"lr_group_{grp}": lr}) - for grp, lr in enumerate(lr_scheduler.get_last_lr()) - ] + [wandb.log({f"lr_group_{grp}": lr}) for grp, lr in enumerate(lr_scheduler.get_last_lr())] def train_epoch( diff --git a/third_party/Roma/roma/utils/__init__.py b/third_party/RoMa/roma/utils/__init__.py similarity index 100% rename from third_party/Roma/roma/utils/__init__.py rename to third_party/RoMa/roma/utils/__init__.py diff --git a/third_party/RoMa/roma/utils/kde.py b/third_party/RoMa/roma/utils/kde.py new file mode 100644 index 0000000000000000000000000000000000000000..6ee1378282965ab091b77c2a97f0e80bd13d4637 --- /dev/null +++ b/third_party/RoMa/roma/utils/kde.py @@ -0,0 +1,8 @@ +import torch + +def kde(x, std = 0.1): + # use a gaussian kernel to estimate density + x = x.half() # Do it in half precision TODO: remove hardcoding + scores = (-torch.cdist(x,x)**2/(2*std**2)).exp() + density = scores.sum(dim=-1) + return density \ No newline at end of file diff --git a/third_party/RoMa/roma/utils/local_correlation.py b/third_party/RoMa/roma/utils/local_correlation.py new file mode 100644 index 0000000000000000000000000000000000000000..2919595b93aef10c6f95938e5bf104705ee0cbb6 --- /dev/null +++ b/third_party/RoMa/roma/utils/local_correlation.py @@ -0,0 +1,44 @@ +import torch +import torch.nn.functional as F + +def local_correlation( + feature0, + feature1, + local_radius, + padding_mode="zeros", + flow = None, + sample_mode = "bilinear", +): + r = local_radius + K = (2*r+1)**2 + B, c, h, w = feature0.size() + corr = torch.empty((B,K,h,w), device = feature0.device, dtype=feature0.dtype) + if flow is None: + # If flow is None, assume feature0 and feature1 are aligned + coords = torch.meshgrid( + ( + torch.linspace(-1 + 1 / h, 1 - 1 / h, h, device=feature0.device), + torch.linspace(-1 + 1 / w, 1 - 1 / w, w, device=feature0.device), + )) + coords = torch.stack((coords[1], coords[0]), dim=-1)[ + None + ].expand(B, h, w, 2) + else: + coords = flow.permute(0,2,3,1) # If using flow, sample around flow target. + local_window = torch.meshgrid( + ( + torch.linspace(-2*local_radius/h, 2*local_radius/h, 2*r+1, device=feature0.device), + torch.linspace(-2*local_radius/w, 2*local_radius/w, 2*r+1, device=feature0.device), + )) + local_window = torch.stack((local_window[1], local_window[0]), dim=-1)[ + None + ].expand(1, 2*r+1, 2*r+1, 2).reshape(1, (2*r+1)**2, 2) + for _ in range(B): + with torch.no_grad(): + local_window_coords = (coords[_,:,:,None]+local_window[:,None,None]).reshape(1,h,w*(2*r+1)**2,2) + window_feature = F.grid_sample( + feature1[_:_+1], local_window_coords, padding_mode=padding_mode, align_corners=False, mode = sample_mode, # + ) + window_feature = window_feature.reshape(c,h,w,(2*r+1)**2) + corr[_] = (feature0[_,...,None]/(c**.5)*window_feature).sum(dim=0).permute(2,0,1) + return corr diff --git a/third_party/Roma/roma/utils/transforms.py b/third_party/RoMa/roma/utils/transforms.py similarity index 94% rename from third_party/Roma/roma/utils/transforms.py rename to third_party/RoMa/roma/utils/transforms.py index b33c3f30f422bca6a81aa201952b7bb2d3d906bf..ea6476bd816a31df36f7d1b5417853637b65474b 100644 --- a/third_party/Roma/roma/utils/transforms.py +++ b/third_party/RoMa/roma/utils/transforms.py @@ -16,9 +16,7 @@ class GeometricSequential: for t in self.transforms: if np.random.rand() < t.p: M = M.matmul( - t.compute_transformation( - x, t.generate_parameters((b, c, h, w)), None - ) + t.compute_transformation(x, t.generate_parameters((b, c, h, w)), None) ) return ( warp_perspective( @@ -106,14 +104,15 @@ class RandomPerspective(K.RandomPerspective): return dict(start_points=start_points, end_points=end_points) + class RandomErasing: - def __init__(self, p=0.0, scale=0.0) -> None: + def __init__(self, p = 0., scale = 0.) -> None: self.p = p self.scale = scale - self.random_eraser = K.RandomErasing(scale=(0.02, scale), p=p) - + self.random_eraser = K.RandomErasing(scale = (0.02, scale), p = p) def __call__(self, image, depth): if self.p > 0: image = self.random_eraser(image) depth = self.random_eraser(depth, params=self.random_eraser._params) return image, depth + \ No newline at end of file diff --git a/third_party/Roma/roma/utils/utils.py b/third_party/RoMa/roma/utils/utils.py similarity index 73% rename from third_party/Roma/roma/utils/utils.py rename to third_party/RoMa/roma/utils/utils.py index 969e1003419f3b7f05874830b79de73363017f01..d7717b2ee37417c4082706ad58143b7ebfc34624 100644 --- a/third_party/Roma/roma/utils/utils.py +++ b/third_party/RoMa/roma/utils/utils.py @@ -9,14 +9,13 @@ import torch.nn.functional as F from PIL import Image import kornia - def recover_pose(E, kpts0, kpts1, K0, K1, mask): best_num_inliers = 0 - K0inv = np.linalg.inv(K0[:2, :2]) - K1inv = np.linalg.inv(K1[:2, :2]) + K0inv = np.linalg.inv(K0[:2,:2]) + K1inv = np.linalg.inv(K1[:2,:2]) - kpts0_n = (K0inv @ (kpts0 - K0[None, :2, 2]).T).T - kpts1_n = (K1inv @ (kpts1 - K1[None, :2, 2]).T).T + kpts0_n = (K0inv @ (kpts0-K0[None,:2,2]).T).T + kpts1_n = (K1inv @ (kpts1-K1[None,:2,2]).T).T for _E in np.split(E, len(E) / 3): n, R, t, _ = cv2.recoverPose(_E, kpts0_n, kpts1_n, np.eye(3), 1e9, mask=mask) @@ -26,16 +25,17 @@ def recover_pose(E, kpts0, kpts1, K0, K1, mask): return ret + # Code taken from https://github.com/PruneTruong/DenseMatching/blob/40c29a6b5c35e86b9509e65ab0cd12553d998e5f/validation/utils_pose_estimation.py # --- GEOMETRY --- def estimate_pose(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999): if len(kpts0) < 5: return None - K0inv = np.linalg.inv(K0[:2, :2]) - K1inv = np.linalg.inv(K1[:2, :2]) + K0inv = np.linalg.inv(K0[:2,:2]) + K1inv = np.linalg.inv(K1[:2,:2]) - kpts0 = (K0inv @ (kpts0 - K0[None, :2, 2]).T).T - kpts1 = (K1inv @ (kpts1 - K1[None, :2, 2]).T).T + kpts0 = (K0inv @ (kpts0-K0[None,:2,2]).T).T + kpts1 = (K1inv @ (kpts1-K1[None,:2,2]).T).T E, mask = cv2.findEssentialMat( kpts0, kpts1, np.eye(3), threshold=norm_thresh, prob=conf ) @@ -51,40 +51,31 @@ def estimate_pose(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999): ret = (R, t, mask.ravel() > 0) return ret - def estimate_pose_uncalibrated(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999): if len(kpts0) < 5: return None method = cv2.USAC_ACCURATE F, mask = cv2.findFundamentalMat( - kpts0, - kpts1, - ransacReprojThreshold=norm_thresh, - confidence=conf, - method=method, - maxIters=10000, + kpts0, kpts1, ransacReprojThreshold=norm_thresh, confidence=conf, method=method, maxIters=10000 ) - E = K1.T @ F @ K0 + E = K1.T@F@K0 ret = None if E is not None: best_num_inliers = 0 - K0inv = np.linalg.inv(K0[:2, :2]) - K1inv = np.linalg.inv(K1[:2, :2]) - - kpts0_n = (K0inv @ (kpts0 - K0[None, :2, 2]).T).T - kpts1_n = (K1inv @ (kpts1 - K1[None, :2, 2]).T).T + K0inv = np.linalg.inv(K0[:2,:2]) + K1inv = np.linalg.inv(K1[:2,:2]) + kpts0_n = (K0inv @ (kpts0-K0[None,:2,2]).T).T + kpts1_n = (K1inv @ (kpts1-K1[None,:2,2]).T).T + for _E in np.split(E, len(E) / 3): - n, R, t, _ = cv2.recoverPose( - _E, kpts0_n, kpts1_n, np.eye(3), 1e9, mask=mask - ) + n, R, t, _ = cv2.recoverPose(_E, kpts0_n, kpts1_n, np.eye(3), 1e9, mask=mask) if n > best_num_inliers: best_num_inliers = n ret = (R, t, mask.ravel() > 0) return ret - -def unnormalize_coords(x_n, h, w): +def unnormalize_coords(x_n,h,w): x = torch.stack( (w * (x_n[..., 0] + 1) / 2, h * (x_n[..., 1] + 1) / 2), dim=-1 ) # [-1+1/h, 1-1/h] -> [0.5, h-0.5] @@ -164,7 +155,6 @@ def get_depth_tuple_transform_ops_nearest_exact(resize=None): ops.append(TupleResizeNearestExact(resize)) return TupleCompose(ops) - def get_depth_tuple_transform_ops(resize=None, normalize=True, unscale=False): ops = [] if resize: @@ -172,9 +162,7 @@ def get_depth_tuple_transform_ops(resize=None, normalize=True, unscale=False): return TupleCompose(ops) -def get_tuple_transform_ops( - resize=None, normalize=True, unscale=False, clahe=False, colorjiggle_params=None -): +def get_tuple_transform_ops(resize=None, normalize=True, unscale=False, clahe = False, colorjiggle_params = None): ops = [] if resize: ops.append(TupleResize(resize)) @@ -185,7 +173,6 @@ def get_tuple_transform_ops( ) # Imagenet mean/std return TupleCompose(ops) - class ToTensorScaled(object): """Convert a RGB PIL Image to a CHW ordered Tensor, scale the range to [0, 1]""" @@ -234,15 +221,11 @@ class TupleToTensorUnscaled(object): def __repr__(self): return "TupleToTensorUnscaled()" - class TupleResizeNearestExact: def __init__(self, size): self.size = size - def __call__(self, im_tuple): - return [ - F.interpolate(im, size=self.size, mode="nearest-exact") for im in im_tuple - ] + return [F.interpolate(im, size = self.size, mode = 'nearest-exact') for im in im_tuple] def __repr__(self): return "TupleResizeNearestExact(size={})".format(self.size) @@ -252,19 +235,17 @@ class TupleResize(object): def __init__(self, size, mode=InterpolationMode.BICUBIC): self.size = size self.resize = transforms.Resize(size, mode) - def __call__(self, im_tuple): return [self.resize(im) for im in im_tuple] def __repr__(self): return "TupleResize(size={})".format(self.size) - - + class Normalize: - def __call__(self, im): - mean = im.mean(dim=(1, 2), keepdims=True) - std = im.std(dim=(1, 2), keepdims=True) - return (im - mean) / std + def __call__(self,im): + mean = im.mean(dim=(1,2), keepdims=True) + std = im.std(dim=(1,2), keepdims=True) + return (im-mean)/std class TupleNormalize(object): @@ -274,7 +255,7 @@ class TupleNormalize(object): self.normalize = transforms.Normalize(mean=mean, std=std) def __call__(self, im_tuple): - c, h, w = im_tuple[0].shape + c,h,w = im_tuple[0].shape if c > 3: warnings.warn(f"Number of channels c={c} > 3, assuming first 3 are rgb") return [self.normalize(im[:3]) for im in im_tuple] @@ -300,82 +281,50 @@ class TupleCompose(object): format_string += "\n)" return format_string - @torch.no_grad() -def cls_to_flow(cls, deterministic_sampling=True): - B, C, H, W = cls.shape +def cls_to_flow(cls, deterministic_sampling = True): + B,C,H,W = cls.shape device = cls.device res = round(math.sqrt(C)) - G = torch.meshgrid( - *[ - torch.linspace(-1 + 1 / res, 1 - 1 / res, steps=res, device=device) - for _ in range(2) - ] - ) - G = torch.stack([G[1], G[0]], dim=-1).reshape(C, 2) + G = torch.meshgrid(*[torch.linspace(-1+1/res, 1-1/res, steps = res, device = device) for _ in range(2)]) + G = torch.stack([G[1],G[0]],dim=-1).reshape(C,2) if deterministic_sampling: sampled_cls = cls.max(dim=1).indices else: - sampled_cls = torch.multinomial( - cls.permute(0, 2, 3, 1).reshape(B * H * W, C).softmax(dim=-1), 1 - ).reshape(B, H, W) + sampled_cls = torch.multinomial(cls.permute(0,2,3,1).reshape(B*H*W,C).softmax(dim=-1), 1).reshape(B,H,W) flow = G[sampled_cls] return flow - @torch.no_grad() def cls_to_flow_refine(cls): - B, C, H, W = cls.shape + B,C,H,W = cls.shape device = cls.device res = round(math.sqrt(C)) - G = torch.meshgrid( - *[ - torch.linspace(-1 + 1 / res, 1 - 1 / res, steps=res, device=device) - for _ in range(2) - ] - ) - G = torch.stack([G[1], G[0]], dim=-1).reshape(C, 2) + G = torch.meshgrid(*[torch.linspace(-1+1/res, 1-1/res, steps = res, device = device) for _ in range(2)]) + G = torch.stack([G[1],G[0]],dim=-1).reshape(C,2) cls = cls.softmax(dim=1) mode = cls.max(dim=1).indices - - index = ( - torch.stack((mode - 1, mode, mode + 1, mode - res, mode + res), dim=1) - .clamp(0, C - 1) - .long() - ) - neighbours = torch.gather(cls, dim=1, index=index)[..., None] - flow = ( - neighbours[:, 0] * G[index[:, 0]] - + neighbours[:, 1] * G[index[:, 1]] - + neighbours[:, 2] * G[index[:, 2]] - + neighbours[:, 3] * G[index[:, 3]] - + neighbours[:, 4] * G[index[:, 4]] - ) - tot_prob = neighbours.sum(dim=1) + + index = torch.stack((mode-1, mode, mode+1, mode - res, mode + res), dim = 1).clamp(0,C - 1).long() + neighbours = torch.gather(cls, dim = 1, index = index)[...,None] + flow = neighbours[:,0] * G[index[:,0]] + neighbours[:,1] * G[index[:,1]] + neighbours[:,2] * G[index[:,2]] + neighbours[:,3] * G[index[:,3]] + neighbours[:,4] * G[index[:,4]] + tot_prob = neighbours.sum(dim=1) flow = flow / tot_prob return flow -def get_gt_warp( - depth1, - depth2, - T_1to2, - K1, - K2, - depth_interpolation_mode="bilinear", - relative_depth_error_threshold=0.05, - H=None, - W=None, -): - +def get_gt_warp(depth1, depth2, T_1to2, K1, K2, depth_interpolation_mode = 'bilinear', relative_depth_error_threshold = 0.05, H = None, W = None): + if H is None: - B, H, W = depth1.shape + B,H,W = depth1.shape else: B = depth1.shape[0] with torch.no_grad(): x1_n = torch.meshgrid( *[ - torch.linspace(-1 + 1 / n, 1 - 1 / n, n, device=depth1.device) + torch.linspace( + -1 + 1 / n, 1 - 1 / n, n, device=depth1.device + ) for n in (B, H, W) ] ) @@ -387,27 +336,15 @@ def get_gt_warp( T_1to2.double(), K1.double(), K2.double(), - depth_interpolation_mode=depth_interpolation_mode, - relative_depth_error_threshold=relative_depth_error_threshold, + depth_interpolation_mode = depth_interpolation_mode, + relative_depth_error_threshold = relative_depth_error_threshold, ) prob = mask.float().reshape(B, H, W) x2 = x2.reshape(B, H, W, 2) return x2, prob - @torch.no_grad() -def warp_kpts( - kpts0, - depth0, - depth1, - T_0to1, - K0, - K1, - smooth_mask=False, - return_relative_depth_error=False, - depth_interpolation_mode="bilinear", - relative_depth_error_threshold=0.05, -): +def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1, smooth_mask = False, return_relative_depth_error = False, depth_interpolation_mode = "bilinear", relative_depth_error_threshold = 0.05): """Warp kpts0 from I0 to I1 with depth, K and Rt Also check covisibility and depth consistency. Depth is consistent if relative error < 0.2 (hard-coded). @@ -432,44 +369,26 @@ def warp_kpts( # Inspired by approach in inloc, try to fill holes from bilinear interpolation by nearest neighbour interpolation if smooth_mask: raise NotImplementedError("Combined bilinear and NN warp not implemented") - valid_bilinear, warp_bilinear = warp_kpts( - kpts0, - depth0, - depth1, - T_0to1, - K0, - K1, - smooth_mask=smooth_mask, - return_relative_depth_error=return_relative_depth_error, - depth_interpolation_mode="bilinear", - relative_depth_error_threshold=relative_depth_error_threshold, - ) - valid_nearest, warp_nearest = warp_kpts( - kpts0, - depth0, - depth1, - T_0to1, - K0, - K1, - smooth_mask=smooth_mask, - return_relative_depth_error=return_relative_depth_error, - depth_interpolation_mode="nearest-exact", - relative_depth_error_threshold=relative_depth_error_threshold, - ) - nearest_valid_bilinear_invalid = (~valid_bilinear).logical_and(valid_nearest) + valid_bilinear, warp_bilinear = warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1, + smooth_mask = smooth_mask, + return_relative_depth_error = return_relative_depth_error, + depth_interpolation_mode = "bilinear", + relative_depth_error_threshold = relative_depth_error_threshold) + valid_nearest, warp_nearest = warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1, + smooth_mask = smooth_mask, + return_relative_depth_error = return_relative_depth_error, + depth_interpolation_mode = "nearest-exact", + relative_depth_error_threshold = relative_depth_error_threshold) + nearest_valid_bilinear_invalid = (~valid_bilinear).logical_and(valid_nearest) warp = warp_bilinear.clone() - warp[nearest_valid_bilinear_invalid] = warp_nearest[ - nearest_valid_bilinear_invalid - ] + warp[nearest_valid_bilinear_invalid] = warp_nearest[nearest_valid_bilinear_invalid] valid = valid_bilinear | valid_nearest return valid, warp - - kpts0_depth = F.grid_sample( - depth0[:, None], - kpts0[:, :, None], - mode=depth_interpolation_mode, - align_corners=False, - )[:, 0, :, 0] + + + kpts0_depth = F.grid_sample(depth0[:, None], kpts0[:, :, None], mode = depth_interpolation_mode, align_corners=False)[ + :, 0, :, 0 + ] kpts0 = torch.stack( (w * (kpts0[..., 0] + 1) / 2, h * (kpts0[..., 1] + 1) / 2), dim=-1 ) # [-1+1/h, 1-1/h] -> [0.5, h-0.5] @@ -508,26 +427,22 @@ def warp_kpts( # w_kpts0[~covisible_mask, :] = -5 # xd w_kpts0_depth = F.grid_sample( - depth1[:, None], - w_kpts0[:, :, None], - mode=depth_interpolation_mode, - align_corners=False, + depth1[:, None], w_kpts0[:, :, None], mode=depth_interpolation_mode, align_corners=False )[:, 0, :, 0] - + relative_depth_error = ( (w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth ).abs() if not smooth_mask: consistent_mask = relative_depth_error < relative_depth_error_threshold else: - consistent_mask = (-relative_depth_error / smooth_mask).exp() + consistent_mask = (-relative_depth_error/smooth_mask).exp() valid_mask = nonzero_mask * covisible_mask * consistent_mask if return_relative_depth_error: return relative_depth_error, w_kpts0 else: return valid_mask, w_kpts0 - imagenet_mean = torch.tensor([0.485, 0.456, 0.406]) imagenet_std = torch.tensor([0.229, 0.224, 0.225]) @@ -547,9 +462,7 @@ def numpy_to_pil(x: np.ndarray): def tensor_to_pil(x, unnormalize=False): if unnormalize: - x = x * (imagenet_std[:, None, None].to(x.device)) + ( - imagenet_mean[:, None, None].to(x.device) - ) + x = x * (imagenet_std[:, None, None].to(x.device)) + (imagenet_mean[:, None, None].to(x.device)) x = x.detach().permute(1, 2, 0).cpu().numpy() x = np.clip(x, 0.0, 1.0) return numpy_to_pil(x) @@ -579,63 +492,73 @@ def compute_relative_pose(R1, t1, R2, t2): trans = -rots @ t1 + t2 return rots, trans - @torch.no_grad() def reset_opt(opt): for group in opt.param_groups: - for p in group["params"]: + for p in group['params']: if p.requires_grad: state = opt.state[p] # State initialization # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like(p) + state['exp_avg'] = torch.zeros_like(p) # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) # Exponential moving average of gradient difference - state["exp_avg_diff"] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) def flow_to_pixel_coords(flow, h1, w1): - flow = torch.stack( - ( - w1 * (flow[..., 0] + 1) / 2, - h1 * (flow[..., 1] + 1) / 2, - ), - axis=-1, + flow = ( + torch.stack( + ( + w1 * (flow[..., 0] + 1) / 2, + h1 * (flow[..., 1] + 1) / 2, + ), + axis=-1, + ) ) return flow +to_pixel_coords = flow_to_pixel_coords # just an alias def flow_to_normalized_coords(flow, h1, w1): - flow = torch.stack( - ( - 2 * (flow[..., 0]) / w1 - 1, - 2 * (flow[..., 1]) / h1 - 1, - ), - axis=-1, + flow = ( + torch.stack( + ( + 2 * (flow[..., 0]) / w1 - 1, + 2 * (flow[..., 1]) / h1 - 1, + ), + axis=-1, + ) ) return flow +to_normalized_coords = flow_to_normalized_coords # just an alias def warp_to_pixel_coords(warp, h1, w1, h2, w2): warp1 = warp[..., :2] - warp1 = torch.stack( - ( - w1 * (warp1[..., 0] + 1) / 2, - h1 * (warp1[..., 1] + 1) / 2, - ), - axis=-1, + warp1 = ( + torch.stack( + ( + w1 * (warp1[..., 0] + 1) / 2, + h1 * (warp1[..., 1] + 1) / 2, + ), + axis=-1, + ) ) warp2 = warp[..., 2:] - warp2 = torch.stack( - ( - w2 * (warp2[..., 0] + 1) / 2, - h2 * (warp2[..., 1] + 1) / 2, - ), - axis=-1, + warp2 = ( + torch.stack( + ( + w2 * (warp2[..., 0] + 1) / 2, + h2 * (warp2[..., 1] + 1) / 2, + ), + axis=-1, + ) ) - return torch.cat((warp1, warp2), dim=-1) + return torch.cat((warp1,warp2), dim=-1) + def signed_point_line_distance(point, line, eps: float = 1e-9): @@ -656,9 +579,7 @@ def signed_point_line_distance(point, line, eps: float = 1e-9): if not line.shape[-1] == 3: raise ValueError(f"lines must be a (*, 3) tensor. Got {line.shape}") - numerator = ( - line[..., 0] * point[..., 0] + line[..., 1] * point[..., 1] + line[..., 2] - ) + numerator = (line[..., 0] * point[..., 0] + line[..., 1] * point[..., 1] + line[..., 2]) denominator = line[..., :2].norm(dim=-1) return numerator / (denominator + eps) @@ -682,7 +603,6 @@ def signed_left_to_right_epipolar_distance(pts1, pts2, Fm): the computed Symmetrical distance with shape :math:`(*, N)`. """ import kornia - if (len(Fm.shape) < 3) or not Fm.shape[-2:] == (3, 3): raise ValueError(f"Fm must be a (*, 3, 3) tensor. Got {Fm.shape}") @@ -694,10 +614,12 @@ def signed_left_to_right_epipolar_distance(pts1, pts2, Fm): return signed_point_line_distance(pts2, line1_in_2) - def get_grid(b, h, w, device): grid = torch.meshgrid( - *[torch.linspace(-1 + 1 / n, 1 - 1 / n, n, device=device) for n in (b, h, w)] + *[ + torch.linspace(-1 + 1 / n, 1 - 1 / n, n, device=device) + for n in (b, h, w) + ] ) grid = torch.stack((grid[2], grid[1]), dim=-1).reshape(b, h, w, 2) return grid diff --git a/third_party/Roma/setup.py b/third_party/RoMa/setup.py similarity index 61% rename from third_party/Roma/setup.py rename to third_party/RoMa/setup.py index ae777c0e5a41f0e4b03a838d19bc9a2bb04d4617..fe2e6dc4be62254f702e34422e07468b00195dd2 100644 --- a/third_party/Roma/setup.py +++ b/third_party/RoMa/setup.py @@ -1,8 +1,8 @@ -from setuptools import setup +from setuptools import setup, find_packages setup( name="roma", - packages=["roma"], + packages=find_packages(include=("roma*",)), version="0.0.1", author="Johan Edstedt", install_requires=open("requirements.txt", "r").read().split("\n"), diff --git a/third_party/Roma/LICENSE b/third_party/Roma/LICENSE deleted file mode 100644 index a115f899f8d09ef3b1def4a16c7bae1a0bd50fbe..0000000000000000000000000000000000000000 --- a/third_party/Roma/LICENSE +++ /dev/null @@ -1,400 +0,0 @@ - -Attribution-NonCommercial 4.0 International - -======================================================================= - -Creative Commons Corporation ("Creative Commons") is not a law firm and -does not provide legal services or legal advice. Distribution of -Creative Commons public licenses does not create a lawyer-client or -other relationship. Creative Commons makes its licenses and related -information available on an "as-is" basis. Creative Commons gives no -warranties regarding its licenses, any material licensed under their -terms and conditions, or any related information. Creative Commons -disclaims all liability for damages resulting from their use to the -fullest extent possible. - -Using Creative Commons Public Licenses - -Creative Commons public licenses provide a standard set of terms and -conditions that creators and other rights holders may use to share -original works of authorship and other material subject to copyright -and certain other rights specified in the public license below. The -following considerations are for informational purposes only, are not -exhaustive, and do not form part of our licenses. - - Considerations for licensors: Our public licenses are - intended for use by those authorized to give the public - permission to use material in ways otherwise restricted by - copyright and certain other rights. Our licenses are - irrevocable. Licensors should read and understand the terms - and conditions of the license they choose before applying it. - Licensors should also secure all rights necessary before - applying our licenses so that the public can reuse the - material as expected. Licensors should clearly mark any - material not subject to the license. This includes other CC- - licensed material, or material used under an exception or - limitation to copyright. More considerations for licensors: - wiki.creativecommons.org/Considerations_for_licensors - - Considerations for the public: By using one of our public - licenses, a licensor grants the public permission to use the - licensed material under specified terms and conditions. If - the licensor's permission is not necessary for any reason--for - example, because of any applicable exception or limitation to - copyright--then that use is not regulated by the license. Our - licenses grant only permissions under copyright and certain - other rights that a licensor has authority to grant. Use of - the licensed material may still be restricted for other - reasons, including because others have copyright or other - rights in the material. A licensor may make special requests, - such as asking that all changes be marked or described. - Although not required by our licenses, you are encouraged to - respect those requests where reasonable. More_considerations - for the public: - wiki.creativecommons.org/Considerations_for_licensees - -======================================================================= - -Creative Commons Attribution-NonCommercial 4.0 International Public -License - -By exercising the Licensed Rights (defined below), You accept and agree -to be bound by the terms and conditions of this Creative Commons -Attribution-NonCommercial 4.0 International Public License ("Public -License"). To the extent this Public License may be interpreted as a -contract, You are granted the Licensed Rights in consideration of Your -acceptance of these terms and conditions, and the Licensor grants You -such rights in consideration of benefits the Licensor receives from -making the Licensed Material available under these terms and -conditions. - -Section 1 -- Definitions. - - a. Adapted Material means material subject to Copyright and Similar - Rights that is derived from or based upon the Licensed Material - and in which the Licensed Material is translated, altered, - arranged, transformed, or otherwise modified in a manner requiring - permission under the Copyright and Similar Rights held by the - Licensor. For purposes of this Public License, where the Licensed - Material is a musical work, performance, or sound recording, - Adapted Material is always produced where the Licensed Material is - synched in timed relation with a moving image. - - b. Adapter's License means the license You apply to Your Copyright - and Similar Rights in Your contributions to Adapted Material in - accordance with the terms and conditions of this Public License. - - c. Copyright and Similar Rights means copyright and/or similar rights - closely related to copyright including, without limitation, - performance, broadcast, sound recording, and Sui Generis Database - Rights, without regard to how the rights are labeled or - categorized. For purposes of this Public License, the rights - specified in Section 2(b)(1)-(2) are not Copyright and Similar - Rights. - d. Effective Technological Measures means those measures that, in the - absence of proper authority, may not be circumvented under laws - fulfilling obligations under Article 11 of the WIPO Copyright - Treaty adopted on December 20, 1996, and/or similar international - agreements. - - e. Exceptions and Limitations means fair use, fair dealing, and/or - any other exception or limitation to Copyright and Similar Rights - that applies to Your use of the Licensed Material. - - f. Licensed Material means the artistic or literary work, database, - or other material to which the Licensor applied this Public - License. - - g. Licensed Rights means the rights granted to You subject to the - terms and conditions of this Public License, which are limited to - all Copyright and Similar Rights that apply to Your use of the - Licensed Material and that the Licensor has authority to license. - - h. Licensor means the individual(s) or entity(ies) granting rights - under this Public License. - - i. NonCommercial means not primarily intended for or directed towards - commercial advantage or monetary compensation. For purposes of - this Public License, the exchange of the Licensed Material for - other material subject to Copyright and Similar Rights by digital - file-sharing or similar means is NonCommercial provided there is - no payment of monetary compensation in connection with the - exchange. - - j. Share means to provide material to the public by any means or - process that requires permission under the Licensed Rights, such - as reproduction, public display, public performance, distribution, - dissemination, communication, or importation, and to make material - available to the public including in ways that members of the - public may access the material from a place and at a time - individually chosen by them. - - k. Sui Generis Database Rights means rights other than copyright - resulting from Directive 96/9/EC of the European Parliament and of - the Council of 11 March 1996 on the legal protection of databases, - as amended and/or succeeded, as well as other essentially - equivalent rights anywhere in the world. - - l. You means the individual or entity exercising the Licensed Rights - under this Public License. Your has a corresponding meaning. - -Section 2 -- Scope. - - a. License grant. - - 1. Subject to the terms and conditions of this Public License, - the Licensor hereby grants You a worldwide, royalty-free, - non-sublicensable, non-exclusive, irrevocable license to - exercise the Licensed Rights in the Licensed Material to: - - a. reproduce and Share the Licensed Material, in whole or - in part, for NonCommercial purposes only; and - - b. produce, reproduce, and Share Adapted Material for - NonCommercial purposes only. - - 2. Exceptions and Limitations. For the avoidance of doubt, where - Exceptions and Limitations apply to Your use, this Public - License does not apply, and You do not need to comply with - its terms and conditions. - - 3. Term. The term of this Public License is specified in Section - 6(a). - - 4. Media and formats; technical modifications allowed. The - Licensor authorizes You to exercise the Licensed Rights in - all media and formats whether now known or hereafter created, - and to make technical modifications necessary to do so. The - Licensor waives and/or agrees not to assert any right or - authority to forbid You from making technical modifications - necessary to exercise the Licensed Rights, including - technical modifications necessary to circumvent Effective - Technological Measures. For purposes of this Public License, - simply making modifications authorized by this Section 2(a) - (4) never produces Adapted Material. - - 5. Downstream recipients. - - a. Offer from the Licensor -- Licensed Material. Every - recipient of the Licensed Material automatically - receives an offer from the Licensor to exercise the - Licensed Rights under the terms and conditions of this - Public License. - - b. No downstream restrictions. You may not offer or impose - any additional or different terms or conditions on, or - apply any Effective Technological Measures to, the - Licensed Material if doing so restricts exercise of the - Licensed Rights by any recipient of the Licensed - Material. - - 6. No endorsement. Nothing in this Public License constitutes or - may be construed as permission to assert or imply that You - are, or that Your use of the Licensed Material is, connected - with, or sponsored, endorsed, or granted official status by, - the Licensor or others designated to receive attribution as - provided in Section 3(a)(1)(A)(i). - - b. Other rights. - - 1. Moral rights, such as the right of integrity, are not - licensed under this Public License, nor are publicity, - privacy, and/or other similar personality rights; however, to - the extent possible, the Licensor waives and/or agrees not to - assert any such rights held by the Licensor to the limited - extent necessary to allow You to exercise the Licensed - Rights, but not otherwise. - - 2. Patent and trademark rights are not licensed under this - Public License. - - 3. To the extent possible, the Licensor waives any right to - collect royalties from You for the exercise of the Licensed - Rights, whether directly or through a collecting society - under any voluntary or waivable statutory or compulsory - licensing scheme. In all other cases the Licensor expressly - reserves any right to collect such royalties, including when - the Licensed Material is used other than for NonCommercial - purposes. - -Section 3 -- License Conditions. - -Your exercise of the Licensed Rights is expressly made subject to the -following conditions. - - a. Attribution. - - 1. If You Share the Licensed Material (including in modified - form), You must: - - a. retain the following if it is supplied by the Licensor - with the Licensed Material: - - i. identification of the creator(s) of the Licensed - Material and any others designated to receive - attribution, in any reasonable manner requested by - the Licensor (including by pseudonym if - designated); - - ii. a copyright notice; - - iii. a notice that refers to this Public License; - - iv. a notice that refers to the disclaimer of - warranties; - - v. a URI or hyperlink to the Licensed Material to the - extent reasonably practicable; - - b. indicate if You modified the Licensed Material and - retain an indication of any previous modifications; and - - c. indicate the Licensed Material is licensed under this - Public License, and include the text of, or the URI or - hyperlink to, this Public License. - - 2. You may satisfy the conditions in Section 3(a)(1) in any - reasonable manner based on the medium, means, and context in - which You Share the Licensed Material. For example, it may be - reasonable to satisfy the conditions by providing a URI or - hyperlink to a resource that includes the required - information. - - 3. If requested by the Licensor, You must remove any of the - information required by Section 3(a)(1)(A) to the extent - reasonably practicable. - - 4. If You Share Adapted Material You produce, the Adapter's - License You apply must not prevent recipients of the Adapted - Material from complying with this Public License. - -Section 4 -- Sui Generis Database Rights. - -Where the Licensed Rights include Sui Generis Database Rights that -apply to Your use of the Licensed Material: - - a. for the avoidance of doubt, Section 2(a)(1) grants You the right - to extract, reuse, reproduce, and Share all or a substantial - portion of the contents of the database for NonCommercial purposes - only; - - b. if You include all or a substantial portion of the database - contents in a database in which You have Sui Generis Database - Rights, then the database in which You have Sui Generis Database - Rights (but not its individual contents) is Adapted Material; and - - c. You must comply with the conditions in Section 3(a) if You Share - all or a substantial portion of the contents of the database. - -For the avoidance of doubt, this Section 4 supplements and does not -replace Your obligations under this Public License where the Licensed -Rights include other Copyright and Similar Rights. - -Section 5 -- Disclaimer of Warranties and Limitation of Liability. - - a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE - EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS - AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF - ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, - IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, - WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR - PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, - ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT - KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT - ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. - - b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE - TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, - NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, - INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, - COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR - USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN - ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR - DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR - IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. - - c. The disclaimer of warranties and limitation of liability provided - above shall be interpreted in a manner that, to the extent - possible, most closely approximates an absolute disclaimer and - waiver of all liability. - -Section 6 -- Term and Termination. - - a. This Public License applies for the term of the Copyright and - Similar Rights licensed here. However, if You fail to comply with - this Public License, then Your rights under this Public License - terminate automatically. - - b. Where Your right to use the Licensed Material has terminated under - Section 6(a), it reinstates: - - 1. automatically as of the date the violation is cured, provided - it is cured within 30 days of Your discovery of the - violation; or - - 2. upon express reinstatement by the Licensor. - - For the avoidance of doubt, this Section 6(b) does not affect any - right the Licensor may have to seek remedies for Your violations - of this Public License. - - c. For the avoidance of doubt, the Licensor may also offer the - Licensed Material under separate terms or conditions or stop - distributing the Licensed Material at any time; however, doing so - will not terminate this Public License. - - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public - License. - -Section 7 -- Other Terms and Conditions. - - a. The Licensor shall not be bound by any additional or different - terms or conditions communicated by You unless expressly agreed. - - b. Any arrangements, understandings, or agreements regarding the - Licensed Material not stated herein are separate from and - independent of the terms and conditions of this Public License. - -Section 8 -- Interpretation. - - a. For the avoidance of doubt, this Public License does not, and - shall not be interpreted to, reduce, limit, restrict, or impose - conditions on any use of the Licensed Material that could lawfully - be made without permission under this Public License. - - b. To the extent possible, if any provision of this Public License is - deemed unenforceable, it shall be automatically reformed to the - minimum extent necessary to make it enforceable. If the provision - cannot be reformed, it shall be severed from this Public License - without affecting the enforceability of the remaining terms and - conditions. - - c. No term or condition of this Public License will be waived and no - failure to comply consented to unless expressly agreed to by the - Licensor. - - d. Nothing in this Public License constitutes or may be interpreted - as a limitation upon, or waiver of, any privileges and immunities - that apply to the Licensor or You, including from the legal - processes of any jurisdiction or authority. - -======================================================================= - -Creative Commons is not a party to its public -licenses. Notwithstanding, Creative Commons may elect to apply one of -its public licenses to material it publishes and in those instances -will be considered the “Licensor.” The text of the Creative Commons -public licenses is dedicated to the public domain under the CC0 Public -Domain Dedication. Except for the limited purpose of indicating that -material is shared under a Creative Commons public license or as -otherwise permitted by the Creative Commons policies published at -creativecommons.org/policies, Creative Commons does not authorize the -use of the trademark "Creative Commons" or any other trademark or logo -of Creative Commons without its prior written consent including, -without limitation, in connection with any unauthorized modifications -to any of its public licenses or any other arrangements, -understandings, or agreements concerning use of licensed material. For -the avoidance of doubt, this paragraph does not form part of the -public licenses. - -Creative Commons may be contacted at creativecommons.org. diff --git a/third_party/Roma/README.md b/third_party/Roma/README.md deleted file mode 100644 index 5e984366c8f7af37615d7666f34cd82a90073fee..0000000000000000000000000000000000000000 --- a/third_party/Roma/README.md +++ /dev/null @@ -1,63 +0,0 @@ -# RoMa: Revisiting Robust Losses for Dense Feature Matching -### [Project Page (TODO)](https://parskatt.github.io/RoMa) | [Paper](https://arxiv.org/abs/2305.15404) -
- -> RoMa: Revisiting Robust Lossses for Dense Feature Matching -> [Johan Edstedt](https://scholar.google.com/citations?user=Ul-vMR0AAAAJ), [Qiyu Sun](https://scholar.google.com/citations?user=HS2WuHkAAAAJ), [Georg Bökman](https://scholar.google.com/citations?user=FUE3Wd0AAAAJ), [MÄrten WadenbÀck](https://scholar.google.com/citations?user=6WRQpCQAAAAJ), [Michael Felsberg](https://scholar.google.com/citations?&user=lkWfR08AAAAJ) -> Arxiv 2023 - -**NOTE!!! Very early code, there might be bugs** - -The codebase is in the [roma folder](roma). - -## Setup/Install -In your python environment (tested on Linux python 3.10), run: -```bash -pip install -e . -``` -## Demo / How to Use -We provide two demos in the [demos folder](demo). -Here's the gist of it: -```python -from roma import roma_outdoor -roma_model = roma_outdoor(device=device) -# Match -warp, certainty = roma_model.match(imA_path, imB_path, device=device) -# Sample matches for estimation -matches, certainty = roma_model.sample(warp, certainty) -# Convert to pixel coordinates (RoMa produces matches in [-1,1]x[-1,1]) -kptsA, kptsB = roma_model.to_pixel_coordinates(matches, H_A, W_A, H_B, W_B) -# Find a fundamental matrix (or anything else of interest) -F, mask = cv2.findFundamentalMat( - kptsA.cpu().numpy(), kptsB.cpu().numpy(), ransacReprojThreshold=0.2, method=cv2.USAC_MAGSAC, confidence=0.999999, maxIters=10000 -) -``` -## Reproducing Results -The experiments in the paper are provided in the [experiments folder](experiments). - -### Training -1. First follow the instructions provided here: https://github.com/Parskatt/DKM for downloading and preprocessing datasets. -2. Run the relevant experiment, e.g., -```bash -torchrun --nproc_per_node=4 --nnodes=1 --rdzv_backend=c10d experiments/roma_outdoor.py -``` -### Testing -```bash -python experiments/roma_outdoor.py --only_test --benchmark mega-1500 -``` -## License -Due to our dependency on [DINOv2](https://github.com/facebookresearch/dinov2/blob/main/LICENSE), the license is sadly non-commercial only for the moment. - -## Acknowledgement -Our codebase builds on the code in [DKM](https://github.com/Parskatt/DKM). - -## BibTeX -If you find our models useful, please consider citing our paper! -``` -@article{edstedt2023roma, -title={{RoMa}: Revisiting Robust Lossses for Dense Feature Matching}, -author={Edstedt, Johan and Sun, Qiyu and Bökman, Georg and WadenbÀck, MÄrten and Felsberg, Michael}, -journal={arXiv preprint arXiv:2305.15404}, -year={2023} -} -``` diff --git a/third_party/Roma/roma/datasets/scannet.py b/third_party/Roma/roma/datasets/scannet.py deleted file mode 100644 index 91bea57c9d1ae2773c11a9c8d47f31026a2c227b..0000000000000000000000000000000000000000 --- a/third_party/Roma/roma/datasets/scannet.py +++ /dev/null @@ -1,191 +0,0 @@ -import os -import random -from PIL import Image -import cv2 -import h5py -import numpy as np -import torch -from torch.utils.data import Dataset, DataLoader, ConcatDataset - -import torchvision.transforms.functional as tvf -import kornia.augmentation as K -import os.path as osp -import matplotlib.pyplot as plt -import roma -from roma.utils import get_depth_tuple_transform_ops, get_tuple_transform_ops -from roma.utils.transforms import GeometricSequential -from tqdm import tqdm - - -class ScanNetScene: - def __init__( - self, - data_root, - scene_info, - ht=384, - wt=512, - min_overlap=0.0, - shake_t=0, - rot_prob=0.0, - use_horizontal_flip_aug=False, - ) -> None: - self.scene_root = osp.join(data_root, "scans", "scans_train") - self.data_names = scene_info["name"] - self.overlaps = scene_info["score"] - # Only sample 10s - valid = (self.data_names[:, -2:] % 10).sum(axis=-1) == 0 - self.overlaps = self.overlaps[valid] - self.data_names = self.data_names[valid] - if len(self.data_names) > 10000: - pairinds = np.random.choice( - np.arange(0, len(self.data_names)), 10000, replace=False - ) - self.data_names = self.data_names[pairinds] - self.overlaps = self.overlaps[pairinds] - self.im_transform_ops = get_tuple_transform_ops(resize=(ht, wt), normalize=True) - self.depth_transform_ops = get_depth_tuple_transform_ops( - resize=(ht, wt), normalize=False - ) - self.wt, self.ht = wt, ht - self.shake_t = shake_t - self.H_generator = GeometricSequential(K.RandomAffine(degrees=90, p=rot_prob)) - self.use_horizontal_flip_aug = use_horizontal_flip_aug - - def load_im(self, im_B, crop=None): - im = Image.open(im_B) - return im - - def load_depth(self, depth_ref, crop=None): - depth = cv2.imread(str(depth_ref), cv2.IMREAD_UNCHANGED) - depth = depth / 1000 - depth = torch.from_numpy(depth).float() # (h, w) - return depth - - def __len__(self): - return len(self.data_names) - - def scale_intrinsic(self, K, wi, hi): - sx, sy = self.wt / wi, self.ht / hi - sK = torch.tensor([[sx, 0, 0], [0, sy, 0], [0, 0, 1]]) - return sK @ K - - def horizontal_flip(self, im_A, im_B, depth_A, depth_B, K_A, K_B): - im_A = im_A.flip(-1) - im_B = im_B.flip(-1) - depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1) - flip_mat = torch.tensor([[-1, 0, self.wt], [0, 1, 0], [0, 0, 1.0]]).to( - K_A.device - ) - K_A = flip_mat @ K_A - K_B = flip_mat @ K_B - - return im_A, im_B, depth_A, depth_B, K_A, K_B - - def read_scannet_pose(self, path): - """Read ScanNet's Camera2World pose and transform it to World2Camera. - - Returns: - pose_w2c (np.ndarray): (4, 4) - """ - cam2world = np.loadtxt(path, delimiter=" ") - world2cam = np.linalg.inv(cam2world) - return world2cam - - def read_scannet_intrinsic(self, path): - """Read ScanNet's intrinsic matrix and return the 3x3 matrix.""" - intrinsic = np.loadtxt(path, delimiter=" ") - return torch.tensor(intrinsic[:-1, :-1], dtype=torch.float) - - def __getitem__(self, pair_idx): - # read intrinsics of original size - data_name = self.data_names[pair_idx] - scene_name, scene_sub_name, stem_name_1, stem_name_2 = data_name - scene_name = f"scene{scene_name:04d}_{scene_sub_name:02d}" - - # read the intrinsic of depthmap - K1 = K2 = self.read_scannet_intrinsic( - osp.join(self.scene_root, scene_name, "intrinsic", "intrinsic_color.txt") - ) # the depth K is not the same, but doesnt really matter - # read and compute relative poses - T1 = self.read_scannet_pose( - osp.join(self.scene_root, scene_name, "pose", f"{stem_name_1}.txt") - ) - T2 = self.read_scannet_pose( - osp.join(self.scene_root, scene_name, "pose", f"{stem_name_2}.txt") - ) - T_1to2 = torch.tensor(np.matmul(T2, np.linalg.inv(T1)), dtype=torch.float)[ - :4, :4 - ] # (4, 4) - - # Load positive pair data - im_A_ref = os.path.join( - self.scene_root, scene_name, "color", f"{stem_name_1}.jpg" - ) - im_B_ref = os.path.join( - self.scene_root, scene_name, "color", f"{stem_name_2}.jpg" - ) - depth_A_ref = os.path.join( - self.scene_root, scene_name, "depth", f"{stem_name_1}.png" - ) - depth_B_ref = os.path.join( - self.scene_root, scene_name, "depth", f"{stem_name_2}.png" - ) - - im_A = self.load_im(im_A_ref) - im_B = self.load_im(im_B_ref) - depth_A = self.load_depth(depth_A_ref) - depth_B = self.load_depth(depth_B_ref) - - # Recompute camera intrinsic matrix due to the resize - K1 = self.scale_intrinsic(K1, im_A.width, im_A.height) - K2 = self.scale_intrinsic(K2, im_B.width, im_B.height) - # Process images - im_A, im_B = self.im_transform_ops((im_A, im_B)) - depth_A, depth_B = self.depth_transform_ops( - (depth_A[None, None], depth_B[None, None]) - ) - if self.use_horizontal_flip_aug: - if np.random.rand() > 0.5: - im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip( - im_A, im_B, depth_A, depth_B, K1, K2 - ) - - data_dict = { - "im_A": im_A, - "im_B": im_B, - "im_A_depth": depth_A[0, 0], - "im_B_depth": depth_B[0, 0], - "K1": K1, - "K2": K2, - "T_1to2": T_1to2, - } - return data_dict - - -class ScanNetBuilder: - def __init__(self, data_root="data/scannet") -> None: - self.data_root = data_root - self.scene_info_root = os.path.join(data_root, "scannet_indices") - self.all_scenes = os.listdir(self.scene_info_root) - - def build_scenes(self, split="train", min_overlap=0.0, **kwargs): - # Note: split doesn't matter here as we always use same scannet_train scenes - scene_names = self.all_scenes - scenes = [] - for scene_name in tqdm(scene_names, disable=roma.RANK > 0): - scene_info = np.load( - os.path.join(self.scene_info_root, scene_name), allow_pickle=True - ) - scenes.append( - ScanNetScene( - self.data_root, scene_info, min_overlap=min_overlap, **kwargs - ) - ) - return scenes - - def weight_scenes(self, concat_dataset, alpha=0.5): - ns = [] - for d in concat_dataset.datasets: - ns.append(len(d)) - ws = torch.cat([torch.ones(n) / n**alpha for n in ns]) - return ws diff --git a/third_party/Roma/roma/losses/__init__.py b/third_party/Roma/roma/losses/__init__.py deleted file mode 100644 index 12cb6d40b90ca3ccf712321f78c033401db865fb..0000000000000000000000000000000000000000 --- a/third_party/Roma/roma/losses/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .robust_loss import RobustLosses diff --git a/third_party/Roma/roma/losses/robust_loss.py b/third_party/Roma/roma/losses/robust_loss.py deleted file mode 100644 index cd9fd5bbc9c2d01bb6dd40823e350b588bd598b3..0000000000000000000000000000000000000000 --- a/third_party/Roma/roma/losses/robust_loss.py +++ /dev/null @@ -1,222 +0,0 @@ -from einops.einops import rearrange -import torch -import torch.nn as nn -import torch.nn.functional as F -from roma.utils.utils import get_gt_warp -import wandb -import roma -import math - - -class RobustLosses(nn.Module): - def __init__( - self, - robust=False, - center_coords=False, - scale_normalize=False, - ce_weight=0.01, - local_loss=True, - local_dist=4.0, - local_largest_scale=8, - smooth_mask=False, - depth_interpolation_mode="bilinear", - mask_depth_loss=False, - relative_depth_error_threshold=0.05, - alpha=1.0, - c=1e-3, - ): - super().__init__() - self.robust = robust # measured in pixels - self.center_coords = center_coords - self.scale_normalize = scale_normalize - self.ce_weight = ce_weight - self.local_loss = local_loss - self.local_dist = local_dist - self.local_largest_scale = local_largest_scale - self.smooth_mask = smooth_mask - self.depth_interpolation_mode = depth_interpolation_mode - self.mask_depth_loss = mask_depth_loss - self.relative_depth_error_threshold = relative_depth_error_threshold - self.avg_overlap = dict() - self.alpha = alpha - self.c = c - - def gm_cls_loss(self, x2, prob, scale_gm_cls, gm_certainty, scale): - with torch.no_grad(): - B, C, H, W = scale_gm_cls.shape - device = x2.device - cls_res = round(math.sqrt(C)) - G = torch.meshgrid( - *[ - torch.linspace( - -1 + 1 / cls_res, 1 - 1 / cls_res, steps=cls_res, device=device - ) - for _ in range(2) - ] - ) - G = torch.stack((G[1], G[0]), dim=-1).reshape(C, 2) - GT = ( - (G[None, :, None, None, :] - x2[:, None]) - .norm(dim=-1) - .min(dim=1) - .indices - ) - cls_loss = F.cross_entropy(scale_gm_cls, GT, reduction="none")[prob > 0.99] - if not torch.any(cls_loss): - cls_loss = certainty_loss * 0.0 # Prevent issues where prob is 0 everywhere - - certainty_loss = F.binary_cross_entropy_with_logits(gm_certainty[:, 0], prob) - losses = { - f"gm_certainty_loss_{scale}": certainty_loss.mean(), - f"gm_cls_loss_{scale}": cls_loss.mean(), - } - wandb.log(losses, step=roma.GLOBAL_STEP) - return losses - - def delta_cls_loss( - self, x2, prob, flow_pre_delta, delta_cls, certainty, scale, offset_scale - ): - with torch.no_grad(): - B, C, H, W = delta_cls.shape - device = x2.device - cls_res = round(math.sqrt(C)) - G = torch.meshgrid( - *[ - torch.linspace( - -1 + 1 / cls_res, 1 - 1 / cls_res, steps=cls_res, device=device - ) - for _ in range(2) - ] - ) - G = torch.stack((G[1], G[0]), dim=-1).reshape(C, 2) * offset_scale - GT = ( - (G[None, :, None, None, :] + flow_pre_delta[:, None] - x2[:, None]) - .norm(dim=-1) - .min(dim=1) - .indices - ) - cls_loss = F.cross_entropy(delta_cls, GT, reduction="none")[prob > 0.99] - if not torch.any(cls_loss): - cls_loss = certainty_loss * 0.0 # Prevent issues where prob is 0 everywhere - certainty_loss = F.binary_cross_entropy_with_logits(certainty[:, 0], prob) - losses = { - f"delta_certainty_loss_{scale}": certainty_loss.mean(), - f"delta_cls_loss_{scale}": cls_loss.mean(), - } - wandb.log(losses, step=roma.GLOBAL_STEP) - return losses - - def regression_loss(self, x2, prob, flow, certainty, scale, eps=1e-8, mode="delta"): - epe = (flow.permute(0, 2, 3, 1) - x2).norm(dim=-1) - if scale == 1: - pck_05 = (epe[prob > 0.99] < 0.5 * (2 / 512)).float().mean() - wandb.log({"train_pck_05": pck_05}, step=roma.GLOBAL_STEP) - - ce_loss = F.binary_cross_entropy_with_logits(certainty[:, 0], prob) - a = self.alpha - cs = self.c * scale - x = epe[prob > 0.99] - reg_loss = cs**a * ((x / (cs)) ** 2 + 1**2) ** (a / 2) - if not torch.any(reg_loss): - reg_loss = ce_loss * 0.0 # Prevent issues where prob is 0 everywhere - losses = { - f"{mode}_certainty_loss_{scale}": ce_loss.mean(), - f"{mode}_regression_loss_{scale}": reg_loss.mean(), - } - wandb.log(losses, step=roma.GLOBAL_STEP) - return losses - - def forward(self, corresps, batch): - scales = list(corresps.keys()) - tot_loss = 0.0 - # scale_weights due to differences in scale for regression gradients and classification gradients - scale_weights = {1: 1, 2: 1, 4: 1, 8: 1, 16: 1} - for scale in scales: - scale_corresps = corresps[scale] - ( - scale_certainty, - flow_pre_delta, - delta_cls, - offset_scale, - scale_gm_cls, - scale_gm_certainty, - flow, - scale_gm_flow, - ) = ( - scale_corresps["certainty"], - scale_corresps["flow_pre_delta"], - scale_corresps.get("delta_cls"), - scale_corresps.get("offset_scale"), - scale_corresps.get("gm_cls"), - scale_corresps.get("gm_certainty"), - scale_corresps["flow"], - scale_corresps.get("gm_flow"), - ) - flow_pre_delta = rearrange(flow_pre_delta, "b d h w -> b h w d") - b, h, w, d = flow_pre_delta.shape - gt_warp, gt_prob = get_gt_warp( - batch["im_A_depth"], - batch["im_B_depth"], - batch["T_1to2"], - batch["K1"], - batch["K2"], - H=h, - W=w, - ) - x2 = gt_warp.float() - prob = gt_prob - - if self.local_largest_scale >= scale: - prob = prob * ( - F.interpolate(prev_epe[:, None], size=(h, w), mode="nearest-exact")[ - :, 0 - ] - < (2 / 512) * (self.local_dist[scale] * scale) - ) - - if scale_gm_cls is not None: - gm_cls_losses = self.gm_cls_loss( - x2, prob, scale_gm_cls, scale_gm_certainty, scale - ) - gm_loss = ( - self.ce_weight * gm_cls_losses[f"gm_certainty_loss_{scale}"] - + gm_cls_losses[f"gm_cls_loss_{scale}"] - ) - tot_loss = tot_loss + scale_weights[scale] * gm_loss - elif scale_gm_flow is not None: - gm_flow_losses = self.regression_loss( - x2, prob, scale_gm_flow, scale_gm_certainty, scale, mode="gm" - ) - gm_loss = ( - self.ce_weight * gm_flow_losses[f"gm_certainty_loss_{scale}"] - + gm_flow_losses[f"gm_regression_loss_{scale}"] - ) - tot_loss = tot_loss + scale_weights[scale] * gm_loss - - if delta_cls is not None: - delta_cls_losses = self.delta_cls_loss( - x2, - prob, - flow_pre_delta, - delta_cls, - scale_certainty, - scale, - offset_scale, - ) - delta_cls_loss = ( - self.ce_weight * delta_cls_losses[f"delta_certainty_loss_{scale}"] - + delta_cls_losses[f"delta_cls_loss_{scale}"] - ) - tot_loss = tot_loss + scale_weights[scale] * delta_cls_loss - else: - delta_regression_losses = self.regression_loss( - x2, prob, flow, scale_certainty, scale - ) - reg_loss = ( - self.ce_weight - * delta_regression_losses[f"delta_certainty_loss_{scale}"] - + delta_regression_losses[f"delta_regression_loss_{scale}"] - ) - tot_loss = tot_loss + scale_weights[scale] * reg_loss - prev_epe = (flow.permute(0, 2, 3, 1) - x2).norm(dim=-1).detach() - return tot_loss diff --git a/third_party/Roma/roma/models/__init__.py b/third_party/Roma/roma/models/__init__.py deleted file mode 100644 index 3918d67063b9ab7a8ced80c22a5e74f95ff7fd4a..0000000000000000000000000000000000000000 --- a/third_party/Roma/roma/models/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .model_zoo import roma_outdoor, roma_indoor diff --git a/third_party/Roma/roma/models/model_zoo/__init__.py b/third_party/Roma/roma/models/model_zoo/__init__.py deleted file mode 100644 index 2ef0b6cf03473500d4198521764cd6dc9ccba784..0000000000000000000000000000000000000000 --- a/third_party/Roma/roma/models/model_zoo/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -import torch -from .roma_models import roma_model - -weight_urls = { - "roma": { - "outdoor": "https://github.com/Parskatt/storage/releases/download/roma/roma_outdoor.pth", - "indoor": "https://github.com/Parskatt/storage/releases/download/roma/roma_indoor.pth", - }, - "dinov2": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth", # hopefully this doesnt change :D -} - - -def roma_outdoor(device, weights=None, dinov2_weights=None): - if weights is None: - weights = torch.hub.load_state_dict_from_url( - weight_urls["roma"]["outdoor"], map_location=device - ) - if dinov2_weights is None: - dinov2_weights = torch.hub.load_state_dict_from_url( - weight_urls["dinov2"], map_location=device - ) - return roma_model( - resolution=(14 * 8 * 6, 14 * 8 * 6), - upsample_preds=True, - weights=weights, - dinov2_weights=dinov2_weights, - device=device, - ) - - -def roma_indoor(device, weights=None, dinov2_weights=None): - if weights is None: - weights = torch.hub.load_state_dict_from_url( - weight_urls["roma"]["indoor"], map_location=device - ) - if dinov2_weights is None: - dinov2_weights = torch.hub.load_state_dict_from_url( - weight_urls["dinov2"], map_location=device - ) - return roma_model( - resolution=(14 * 8 * 5, 14 * 8 * 5), - upsample_preds=False, - weights=weights, - dinov2_weights=dinov2_weights, - device=device, - ) diff --git a/third_party/Roma/roma/models/model_zoo/roma_models.py b/third_party/Roma/roma/models/model_zoo/roma_models.py deleted file mode 100644 index f98ee44f5e2ebd7e43a8e4b17f99b6ed0e85c93a..0000000000000000000000000000000000000000 --- a/third_party/Roma/roma/models/model_zoo/roma_models.py +++ /dev/null @@ -1,175 +0,0 @@ -import warnings -import torch.nn as nn -from roma.models.matcher import * -from roma.models.transformer import Block, TransformerDecoder, MemEffAttention -from roma.models.encoders import * - - -def roma_model( - resolution, upsample_preds, device=None, weights=None, dinov2_weights=None, **kwargs -): - # roma weights and dinov2 weights are loaded seperately, as dinov2 weights are not parameters - torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul - torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn - warnings.filterwarnings( - "ignore", category=UserWarning, message="TypedStorage is deprecated" - ) - gp_dim = 512 - feat_dim = 512 - decoder_dim = gp_dim + feat_dim - cls_to_coord_res = 64 - coordinate_decoder = TransformerDecoder( - nn.Sequential( - *[Block(decoder_dim, 8, attn_class=MemEffAttention) for _ in range(5)] - ), - decoder_dim, - cls_to_coord_res**2 + 1, - is_classifier=True, - amp=True, - pos_enc=False, - ) - dw = True - hidden_blocks = 8 - kernel_size = 5 - displacement_emb = "linear" - disable_local_corr_grad = True - - conv_refiner = nn.ModuleDict( - { - "16": ConvRefiner( - 2 * 512 + 128 + (2 * 7 + 1) ** 2, - 2 * 512 + 128 + (2 * 7 + 1) ** 2, - 2 + 1, - kernel_size=kernel_size, - dw=dw, - hidden_blocks=hidden_blocks, - displacement_emb=displacement_emb, - displacement_emb_dim=128, - local_corr_radius=7, - corr_in_other=True, - amp=True, - disable_local_corr_grad=disable_local_corr_grad, - bn_momentum=0.01, - ), - "8": ConvRefiner( - 2 * 512 + 64 + (2 * 3 + 1) ** 2, - 2 * 512 + 64 + (2 * 3 + 1) ** 2, - 2 + 1, - kernel_size=kernel_size, - dw=dw, - hidden_blocks=hidden_blocks, - displacement_emb=displacement_emb, - displacement_emb_dim=64, - local_corr_radius=3, - corr_in_other=True, - amp=True, - disable_local_corr_grad=disable_local_corr_grad, - bn_momentum=0.01, - ), - "4": ConvRefiner( - 2 * 256 + 32 + (2 * 2 + 1) ** 2, - 2 * 256 + 32 + (2 * 2 + 1) ** 2, - 2 + 1, - kernel_size=kernel_size, - dw=dw, - hidden_blocks=hidden_blocks, - displacement_emb=displacement_emb, - displacement_emb_dim=32, - local_corr_radius=2, - corr_in_other=True, - amp=True, - disable_local_corr_grad=disable_local_corr_grad, - bn_momentum=0.01, - ), - "2": ConvRefiner( - 2 * 64 + 16, - 128 + 16, - 2 + 1, - kernel_size=kernel_size, - dw=dw, - hidden_blocks=hidden_blocks, - displacement_emb=displacement_emb, - displacement_emb_dim=16, - amp=True, - disable_local_corr_grad=disable_local_corr_grad, - bn_momentum=0.01, - ), - "1": ConvRefiner( - 2 * 9 + 6, - 24, - 2 + 1, - kernel_size=kernel_size, - dw=dw, - hidden_blocks=hidden_blocks, - displacement_emb=displacement_emb, - displacement_emb_dim=6, - amp=True, - disable_local_corr_grad=disable_local_corr_grad, - bn_momentum=0.01, - ), - } - ) - kernel_temperature = 0.2 - learn_temperature = False - no_cov = True - kernel = CosKernel - only_attention = False - basis = "fourier" - gp16 = GP( - kernel, - T=kernel_temperature, - learn_temperature=learn_temperature, - only_attention=only_attention, - gp_dim=gp_dim, - basis=basis, - no_cov=no_cov, - ) - gps = nn.ModuleDict({"16": gp16}) - proj16 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1), nn.BatchNorm2d(512)) - proj8 = nn.Sequential(nn.Conv2d(512, 512, 1, 1), nn.BatchNorm2d(512)) - proj4 = nn.Sequential(nn.Conv2d(256, 256, 1, 1), nn.BatchNorm2d(256)) - proj2 = nn.Sequential(nn.Conv2d(128, 64, 1, 1), nn.BatchNorm2d(64)) - proj1 = nn.Sequential(nn.Conv2d(64, 9, 1, 1), nn.BatchNorm2d(9)) - proj = nn.ModuleDict( - { - "16": proj16, - "8": proj8, - "4": proj4, - "2": proj2, - "1": proj1, - } - ) - displacement_dropout_p = 0.0 - gm_warp_dropout_p = 0.0 - decoder = Decoder( - coordinate_decoder, - gps, - proj, - conv_refiner, - detach=True, - scales=["16", "8", "4", "2", "1"], - displacement_dropout_p=displacement_dropout_p, - gm_warp_dropout_p=gm_warp_dropout_p, - ) - - encoder = CNNandDinov2( - cnn_kwargs=dict(pretrained=False, amp=True), - amp=True, - use_vgg=True, - dinov2_weights=dinov2_weights, - ) - h, w = resolution - symmetric = True - attenuate_cert = True - matcher = RegressionMatcher( - encoder, - decoder, - h=h, - w=w, - upsample_preds=upsample_preds, - symmetric=symmetric, - attenuate_cert=attenuate_cert, - **kwargs - ).to(device) - matcher.load_state_dict(weights) - return matcher diff --git a/third_party/Roma/roma/models/transformer/__init__.py b/third_party/Roma/roma/models/transformer/__init__.py deleted file mode 100644 index b1409045ef9c5dddef88484762137b9a2ab79cd5..0000000000000000000000000000000000000000 --- a/third_party/Roma/roma/models/transformer/__init__.py +++ /dev/null @@ -1,79 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from roma.utils.utils import get_grid -from .layers.block import Block -from .layers.attention import MemEffAttention -from .dinov2 import vit_large - -device = "cuda" if torch.cuda.is_available() else "cpu" - - -class TransformerDecoder(nn.Module): - def __init__( - self, - blocks, - hidden_dim, - out_dim, - is_classifier=False, - *args, - amp=False, - pos_enc=True, - learned_embeddings=False, - embedding_dim=None, - **kwargs - ) -> None: - super().__init__(*args, **kwargs) - self.blocks = blocks - self.to_out = nn.Linear(hidden_dim, out_dim) - self.hidden_dim = hidden_dim - self.out_dim = out_dim - self._scales = [16] - self.is_classifier = is_classifier - self.amp = amp - if torch.cuda.is_available(): - if torch.cuda.is_bf16_supported(): - self.amp_dtype = torch.bfloat16 - else: - self.amp_dtype = torch.float16 - else: - self.amp_dtype = torch.float32 - - self.pos_enc = pos_enc - self.learned_embeddings = learned_embeddings - if self.learned_embeddings: - self.learned_pos_embeddings = nn.Parameter( - nn.init.kaiming_normal_( - torch.empty((1, hidden_dim, embedding_dim, embedding_dim)) - ) - ) - - def scales(self): - return self._scales.copy() - - def forward(self, gp_posterior, features, old_stuff, new_scale): - with torch.autocast(device, dtype=self.amp_dtype, enabled=self.amp): - B, C, H, W = gp_posterior.shape - x = torch.cat((gp_posterior, features), dim=1) - B, C, H, W = x.shape - grid = get_grid(B, H, W, x.device).reshape(B, H * W, 2) - if self.learned_embeddings: - pos_enc = ( - F.interpolate( - self.learned_pos_embeddings, - size=(H, W), - mode="bilinear", - align_corners=False, - ) - .permute(0, 2, 3, 1) - .reshape(1, H * W, C) - ) - else: - pos_enc = 0 - tokens = x.reshape(B, C, H * W).permute(0, 2, 1) + pos_enc - z = self.blocks(tokens) - out = self.to_out(z) - out = out.permute(0, 2, 1).reshape(B, self.out_dim, H, W) - warp, certainty = out[:, :-1], out[:, -1:] - return warp, certainty, None diff --git a/third_party/Roma/roma/utils/kde.py b/third_party/Roma/roma/utils/kde.py deleted file mode 100644 index eff7c72dad4a3f90f5ff79d2630427de89838fc5..0000000000000000000000000000000000000000 --- a/third_party/Roma/roma/utils/kde.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch - - -def kde(x, std=0.1): - # use a gaussian kernel to estimate density - x = x.half() # Do it in half precision - scores = (-torch.cdist(x, x) ** 2 / (2 * std**2)).exp() - density = scores.sum(dim=-1) - return density diff --git a/third_party/Roma/roma/utils/local_correlation.py b/third_party/Roma/roma/utils/local_correlation.py deleted file mode 100644 index 603ab524333c29fbc284a73065847645f3100847..0000000000000000000000000000000000000000 --- a/third_party/Roma/roma/utils/local_correlation.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch -import torch.nn.functional as F - -device = "cuda" if torch.cuda.is_available() else "cpu" - - -def local_correlation( - feature0, - feature1, - local_radius, - padding_mode="zeros", - flow=None, - sample_mode="bilinear", -): - r = local_radius - K = (2 * r + 1) ** 2 - B, c, h, w = feature0.size() - feature0 = feature0.half() - feature1 = feature1.half() - corr = torch.empty((B, K, h, w), device=feature0.device, dtype=feature0.dtype) - if flow is None: - # If flow is None, assume feature0 and feature1 are aligned - coords = torch.meshgrid( - ( - torch.linspace(-1 + 1 / h, 1 - 1 / h, h, device=device), - torch.linspace(-1 + 1 / w, 1 - 1 / w, w, device=device), - ) - ) - coords = torch.stack((coords[1], coords[0]), dim=-1)[None].expand(B, h, w, 2) - else: - coords = flow.permute(0, 2, 3, 1) # If using flow, sample around flow target. - local_window = torch.meshgrid( - ( - torch.linspace( - -2 * local_radius / h, 2 * local_radius / h, 2 * r + 1, device=device - ), - torch.linspace( - -2 * local_radius / w, 2 * local_radius / w, 2 * r + 1, device=device - ), - ) - ) - local_window = ( - torch.stack((local_window[1], local_window[0]), dim=-1)[None] - .expand(1, 2 * r + 1, 2 * r + 1, 2) - .reshape(1, (2 * r + 1) ** 2, 2) - ) - for _ in range(B): - with torch.no_grad(): - local_window_coords = ( - (coords[_, :, :, None] + local_window[:, None, None]) - .reshape(1, h, w * (2 * r + 1) ** 2, 2) - .float() - ) - window_feature = F.grid_sample( - feature1[_ : _ + 1].float(), - local_window_coords, - padding_mode=padding_mode, - align_corners=False, - mode=sample_mode, # - ) - window_feature = window_feature.reshape(c, h, w, (2 * r + 1) ** 2) - corr[_] = ( - (feature0[_, ..., None] / (c**0.5) * window_feature) - .sum(dim=0) - .permute(2, 0, 1) - ) - torch.cuda.empty_cache() - return corr