zach committed on
Commit • 3ef1661
Parent(s): 474ea9a
initial commit based on github repo

(This view is limited to 50 files because the commit contains too many changes.)
- .gitattributes +2 -0
- LICENSE +121 -0
- README.md +319 -3
- data/gene_annos_kitti_demo.py +32 -0
- data/gene_annos_nyu_demo.py +31 -0
- data/kitti_demo/depth/0000000005.png +3 -0
- data/kitti_demo/depth/0000000050.png +3 -0
- data/kitti_demo/depth/0000000100.png +3 -0
- data/kitti_demo/rgb/0000000005.png +3 -0
- data/kitti_demo/rgb/0000000050.png +3 -0
- data/kitti_demo/rgb/0000000100.png +3 -0
- data/kitti_demo/test_annotations.json +1 -0
- data/nyu_demo/depth/sync_depth_00000.png +3 -0
- data/nyu_demo/depth/sync_depth_00050.png +3 -0
- data/nyu_demo/depth/sync_depth_00100.png +3 -0
- data/nyu_demo/rgb/rgb_00000.jpg +0 -0
- data/nyu_demo/rgb/rgb_00050.jpg +0 -0
- data/nyu_demo/rgb/rgb_00100.jpg +0 -0
- data/nyu_demo/test_annotations.json +1 -0
- data/wild_demo/david-kohler-VFRTXGw1VjU-unsplash.jpg +0 -0
- data/wild_demo/jonathan-borba-CnthDZXCdoY-unsplash.jpg +0 -0
- data/wild_demo/randy-fath-G1yhU1Ej-9A-unsplash.jpg +0 -0
- data_info/__init__.py +2 -0
- data_info/pretrained_weight.py +16 -0
- data_info/public_datasets.py +7 -0
- media/gifs/demo_1.gif +3 -0
- media/gifs/demo_12.gif +3 -0
- media/gifs/demo_2.gif +3 -0
- media/gifs/demo_22.gif +3 -0
- media/screenshots/challenge.PNG +0 -0
- media/screenshots/page2.png +3 -0
- media/screenshots/pipeline.png +3 -0
- mono/configs/HourglassDecoder/convlarge.0.3_150.py +25 -0
- mono/configs/HourglassDecoder/test_kitti_convlarge.0.3_150.py +25 -0
- mono/configs/HourglassDecoder/test_nyu_convlarge.0.3_150.py +25 -0
- mono/configs/HourglassDecoder/vit.raft5.large.py +33 -0
- mono/configs/HourglassDecoder/vit.raft5.small.py +33 -0
- mono/configs/__init__.py +1 -0
- mono/configs/_base_/_data_base_.py +13 -0
- mono/configs/_base_/datasets/_data_base_.py +12 -0
- mono/configs/_base_/default_runtime.py +4 -0
- mono/configs/_base_/models/backbones/convnext_large.py +16 -0
- mono/configs/_base_/models/backbones/dino_vit_large.py +7 -0
- mono/configs/_base_/models/backbones/dino_vit_large_reg.py +7 -0
- mono/configs/_base_/models/backbones/dino_vit_small_reg.py +7 -0
- mono/configs/_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py +10 -0
- mono/configs/_base_/models/encoder_decoder/dino_vit_large.dpt_raft.py +20 -0
- mono/configs/_base_/models/encoder_decoder/dino_vit_large_reg.dpt_raft.py +19 -0
- mono/configs/_base_/models/encoder_decoder/dino_vit_small_reg.dpt_raft.py +19 -0
- mono/model/__init__.py +5 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1,121 @@
Creative Commons Legal Code

CC0 1.0 Universal

CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.

Statement of Purpose

The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").

Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.

For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.

1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:

  i. the right to reproduce, adapt, distribute, perform, display,
     communicate, and translate a Work;
 ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
     likeness depicted in a Work;
 iv. rights protecting against unfair competition in regards to a Work,
     subject to the limitations in paragraph 4(a), below;
  v. rights protecting the extraction, dissemination, use and reuse of data
     in a Work;
 vi. database rights (such as those arising under Directive 96/9/EC of the
     European Parliament and of the Council of 11 March 1996 on the legal
     protection of databases, and under any national implementation
     thereof, including any amended or successor version of such
     directive); and
vii. other similar, equivalent or corresponding rights throughout the
     world based on applicable law or treaty, and any national
     implementations thereof.

2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.

3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.

4. Limitations and Disclaimers.

 a. No trademark or patent rights held by Affirmer are waived, abandoned,
    surrendered, licensed or otherwise affected by this document.
 b. Affirmer offers the Work as-is and makes no representations or
    warranties of any kind concerning the Work, express, implied,
    statutory or otherwise, including without limitation warranties of
    title, merchantability, fitness for a particular purpose, non
    infringement, or the absence of latent or other defects, accuracy, or
    the presence or absence of errors, whether or not discoverable, all to
    the greatest extent permissible under applicable law.
 c. Affirmer disclaims responsibility for clearing rights of other persons
    that may apply to the Work or any use thereof, including without
    limitation any person's Copyright and Related Rights in the Work.
    Further, Affirmer disclaims responsibility for obtaining any necessary
    consents, permissions or other rights required for any use of the
    Work.
 d. Affirmer understands and acknowledges that Creative Commons is not a
    party to this document and has no duty or obligation with respect to
    this CC0 or use of the Work.
README.md
CHANGED
@@ -1,3 +1,319 @@
# 🚀 Metric3D Project 🚀

**Official PyTorch implementation of Metric3Dv1 and Metric3Dv2:**

[1] [Metric3D: Towards Zero-shot Metric 3D Prediction from A Single Image](https://arxiv.org/abs/2307.10984)

[2] Metric3Dv2: A Versatile Monocular Geometric Foundation Model for Zero-shot Metric Depth and Surface Normal Estimation

<a href='https://jugghm.github.io/Metric3Dv2'><img src='https://img.shields.io/badge/project%20page-@Metric3D-yellow.svg'></a>
<a href='https://arxiv.org/abs/2307.10984'><img src='https://img.shields.io/badge/arxiv-@Metric3Dv1-green'></a>
<a href='https:'><img src='https://img.shields.io/badge/arxiv (on hold)-@Metric3Dv2-red'></a>
<a href='https://huggingface.co/spaces/JUGGHM/Metric3D'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>

[//]: # (### [Project Page](https://arxiv.org/abs/2307.08695) | [v2 Paper](https://arxiv.org/abs/2307.10984) | [v1 Arxiv](https://arxiv.org/abs/2307.10984) | [Video](https://www.youtube.com/playlist?list=PLEuyXJsWqUNd04nwfm9gFBw5FVbcaQPl3) | [Hugging Face 🤗](https://huggingface.co/spaces/JUGGHM/Metric3D) )

## News and To-Do List

- [ ] DROID-SLAM codes
- [ ] Release the ViT-giant2 model
- [ ] Focal-length-free mode
- [ ] Floating-noise-removal mode
- [ ] Improving the Hugging Face demo and visualization
- [x] Release training codes

- `[2024/3/18]` Hugging Face GPU version updated!
- `[2024/3/18]` [Project page](https://jugghm.github.io/Metric3Dv2/) released!
- `[2024/3/18]` Metric3D V2 models released, now supporting metric depth and surface normal!
- `[2023/8/10]` Inference codes, pretrained weights, and demo released.
- `[2023/7]` Metric3D accepted by ICCV 2023!
- `[2023/4]` Champion of the [2nd Monocular Depth Estimation Challenge](https://jspenmar.github.io/MDEC) at CVPR 2023

## 🌼 Abstract
Metric3D is a versatile geometric foundation model for high-quality, zero-shot **metric depth** and **surface normal** estimation from a single image. It excels at in-the-wild scene reconstruction.

![page2](media/screenshots/page2.png)

## 📝 Benchmarks

### Metric Depth

[//]: # (#### Zero-shot Testing)

[//]: # (Our models work well on both indoor and outdoor scenarios, compared with other zero-shot metric depth estimation methods.)

[//]: # ()
[//]: # (| | Backbone | KITTI $\delta 1$ ↑ | KITTI $\delta 2$ ↑ | KITTI $\delta 3$ ↑ | KITTI AbsRel ↓ | KITTI RMSE ↓ | KITTI RMS_log ↓ | NYU $\delta 1$ ↑ | NYU $\delta 2$ ↑ | NYU $\delta 3$ ↑ | NYU AbsRel ↓ | NYU RMSE ↓ | NYU log10 ↓ |)

[//]: # (|-----------------|------------|--------------------|---------------------|--------------------|-----------------|---------------|------------------|------------------|------------------|------------------|---------------|-------------|--------------|)

[//]: # (| ZeroDepth | ResNet-18 | 0.910 | 0.980 | 0.996 | 0.057 | 4.044 | 0.083 | 0.901 | 0.961 | - | 0.100 | 0.380 | - |)

[//]: # (| PolyMax | ConvNeXt-L | - | - | - | - | - | - | 0.969 | 0.996 | 0.999 | 0.067 | 0.250 | 0.033 |)

[//]: # (| Ours | ViT-L | 0.985 | 0.995 | 0.999 | 0.052 | 2.511 | 0.074 | 0.975 | 0.994 | 0.998 | 0.063 | 0.251 | 0.028 |)

[//]: # (| Ours | ViT-g2 | 0.989 | 0.996 | 0.999 | 0.051 | 2.403 | 0.080 | 0.980 | 0.997 | 0.999 | 0.067 | 0.260 | 0.030 |)

[//]: # ()
[//]: # ([//]: # (| Adabins | Efficient-B5 | 0.964 | 0.995 | 0.999 | 0.058 | 2.360 | 0.088 | 0.903 | 0.984 | 0.997 | 0.103 | 0.0444 | 0.364 |))
[//]: # ([//]: # (| NewCRFs | SwinT-L | 0.974 | 0.997 | 0.999 | 0.052 | 2.129 | 0.079 | 0.922 | 0.983 | 0.994 | 0.095 | 0.041 | 0.334 |))
[//]: # ([//]: # (| Ours (CSTM_label) | ConvNeXt-L | 0.964 | 0.993 | 0.998 | 0.058 | 2.770 | 0.092 | 0.944 | 0.986 | 0.995 | 0.083 | 0.035 | 0.310 |))

[//]: # (#### Finetuned)
Our models rank 1st on the KITTI and NYU benchmarks.

| | Backbone | KITTI δ1 ↑ | KITTI δ2 ↑ | KITTI AbsRel ↓ | KITTI RMSE ↓ | KITTI RMS_log ↓ | NYU δ1 ↑ | NYU δ2 ↑ | NYU AbsRel ↓ | NYU RMSE ↓ | NYU log10 ↓ |
|---------------|-------------|------------|------------|----------------|--------------|-----------------|----------|----------|--------------|------------|-------------|
| ZoeDepth | ViT-Large | 0.971 | 0.995 | 0.053 | 2.281 | 0.082 | 0.953 | 0.995 | 0.077 | 0.277 | 0.033 |
| ZeroDepth | ResNet-18 | 0.968 | 0.996 | 0.057 | 2.087 | 0.083 | 0.954 | 0.995 | 0.074 | 0.269 | 0.103 |
| IEBins | SwinT-Large | 0.978 | 0.998 | 0.050 | 2.011 | 0.075 | 0.936 | 0.992 | 0.087 | 0.314 | 0.031 |
| DepthAnything | ViT-Large | 0.982 | 0.998 | 0.046 | 1.985 | 0.069 | 0.984 | 0.998 | 0.056 | 0.206 | 0.024 |
| Ours | ViT-Large | 0.985 | 0.998 | 0.999 | 1.985 | 0.064 | 0.989 | 0.998 | 0.047 | 0.183 | 0.020 |
| Ours | ViT-giant2 | 0.989 | 0.998 | 1.000 | 1.766 | 0.060 | 0.987 | 0.997 | 0.045 | 0.187 | 0.015 |

### Affine-invariant Depth
Even compared with recent affine-invariant depth methods (Marigold and Depth Anything), our metric-depth (and normal) models still show superior performance.

| | #Data for Pretrain and Train | KITTI AbsRel ↓ | KITTI δ1 ↑ | NYUv2 AbsRel ↓ | NYUv2 δ1 ↑ | DIODE-Full AbsRel ↓ | DIODE-Full δ1 ↑ | ETH3D AbsRel ↓ | ETH3D δ1 ↑ |
|-----------------------|------------------------------|----------------|------------|----------------|------------|---------------------|-----------------|----------------|------------|
| OmniData (v2, ViT-L) | 1.3M + 12.2M | 0.069 | 0.948 | 0.074 | 0.945 | 0.149 | 0.835 | 0.166 | 0.778 |
| Marigold (LDMv2) | 5B + 74K | 0.099 | 0.916 | 0.055 | 0.961 | 0.308 | 0.773 | 0.127 | 0.960 |
| DepthAnything (ViT-L) | 142M + 63M | 0.076 | 0.947 | 0.043 | 0.981 | 0.277 | 0.759 | 0.065 | 0.882 |
| Ours (ViT-L) | 142M + 16M | 0.042 | 0.979 | 0.042 | 0.980 | 0.141 | 0.882 | 0.042 | 0.987 |
| Ours (ViT-g) | 142M + 16M | 0.043 | 0.982 | 0.043 | 0.981 | 0.136 | 0.895 | 0.042 | 0.983 |

### Surface Normal
Our models also show strong performance on surface-normal benchmarks.

| | NYU 11.25° ↑ | NYU Mean ↓ | NYU RMS ↓ | ScanNet 11.25° ↑ | ScanNet Mean ↓ | ScanNet RMS ↓ | iBims 11.25° ↑ | iBims Mean ↓ | iBims RMS ↓ |
|--------------|--------------|------------|-----------|------------------|----------------|---------------|----------------|--------------|-------------|
| EESNU | 0.597 | 16.0 | 24.7 | 0.711 | 11.8 | 20.3 | 0.585 | 20.0 | - |
| IronDepth | - | - | - | - | - | - | 0.431 | 25.3 | 37.4 |
| PolyMax | 0.656 | 13.1 | 20.4 | - | - | - | - | - | - |
| Ours (ViT-L) | 0.688 | 12.0 | 19.2 | 0.760 | 9.9 | 16.4 | 0.694 | 19.4 | 34.9 |
| Ours (ViT-g) | 0.662 | 13.2 | 20.2 | 0.778 | 9.2 | 15.3 | 0.697 | 19.6 | 35.2 |

## 🌈 DEMOs

### Zero-shot monocular metric depth & surface normal
<img src="media/gifs/demo_1.gif" width="600" height="337">
<img src="media/gifs/demo_12.gif" width="600" height="337">

### Zero-shot metric 3D recovery
<img src="media/gifs/demo_2.gif" width="600" height="337">

### Improving monocular SLAM
<img src="media/gifs/demo_22.gif" width="600" height="337">

[//]: # (https://github.com/YvanYin/Metric3D/assets/35299633/f95815ef-2506-4193-a6d9-1163ea821268)

[//]: # (https://github.com/YvanYin/Metric3D/assets/35299633/ed00706c-41cc-49ea-accb-ad0532633cc2)

[//]: # (### Zero-shot metric 3D recovery)

[//]: # (https://github.com/YvanYin/Metric3D/assets/35299633/26cd7ae1-dd5a-4446-b275-54c5ca7ef945)

[//]: # (https://github.com/YvanYin/Metric3D/assets/35299633/21e5484b-c304-4fe3-b1d3-8eebc4e26e42)
[//]: # (### Monocular reconstruction for a Sequence)

[//]: # ()
[//]: # (### In-the-wild 3D reconstruction)

[//]: # ()
[//]: # (| | Image | Reconstruction | Pointcloud File |)

[//]: # (|:---------:|:------------------:|:------------------:|:--------:|)

[//]: # (| room | <img src="data/wild_demo/jonathan-borba-CnthDZXCdoY-unsplash.jpg" width="300" height="335"> | <img src="media/gifs/room.gif" width="300" height="335"> | [Download](https://drive.google.com/file/d/1P1izSegH2c4LUrXGiUksw037PVb0hjZr/view?usp=drive_link) |)

[//]: # (| Colosseum | <img src="data/wild_demo/david-kohler-VFRTXGw1VjU-unsplash.jpg" width="300" height="169"> | <img src="media/gifs/colo.gif" width="300" height="169"> | [Download](https://drive.google.com/file/d/1jJCXe5IpxBhHDr0TZtNZhjxKTRUz56Hg/view?usp=drive_link) |)

[//]: # (| chess | <img src="data/wild_demo/randy-fath-G1yhU1Ej-9A-unsplash.jpg" width="300" height="169" align=center> | <img src="media/gifs/chess.gif" width="300" height="169"> | [Download](https://drive.google.com/file/d/1oV_Foq25_p-tTDRTcyO2AzXEdFJQz-Wm/view?usp=drive_link) |)

[//]: # ()
[//]: # (All three images are downloaded from [unsplash](https://unsplash.com/) and put in the data/wild_demo directory.)

[//]: # ()
[//]: # (### 3D metric reconstruction, Metric3D × DroidSLAM)

[//]: # (Metric3D can also provide scale information for DroidSLAM, helping to solve the scale-drift problem for better trajectories. )

[//]: # ()
[//]: # (#### Bird's-Eye View (Left: Droid-SLAM (mono). Right: Droid-SLAM with Metric-3D))

[//]: # ()
[//]: # (<div align=center>)

[//]: # (<img src="media/gifs/0028.gif"> )

[//]: # (</div>)

[//]: # ()
[//]: # (### Front View)

[//]: # ()
[//]: # (<div align=center>)

[//]: # (<img src="media/gifs/0028_fv.gif"> )

[//]: # (</div>)

[//]: # ()
[//]: # (#### KITTI odometry evaluation (Translational RMS drift (t_rel, ↓) / Rotational RMS drift (r_rel, ↓)))

[//]: # (| | Modality | seq 00 | seq 02 | seq 05 | seq 06 | seq 08 | seq 09 | seq 10 |)

[//]: # (|:----------:|:--------:|:----------:|:----------:|:---------:|:----------:|:----------:|:---------:|:---------:|)

[//]: # (| ORB-SLAM2 | Mono | 11.43/0.58 | 10.34/0.26 | 9.04/0.26 | 14.56/0.26 | 11.46/0.28 | 9.3/0.26 | 2.57/0.32 |)

[//]: # (| Droid-SLAM | Mono | 33.9/0.29 | 34.88/0.27 | 23.4/0.27 | 17.2/0.26 | 39.6/0.31 | 21.7/0.23 | 7/0.25 |)

[//]: # (| Droid+Ours | Mono | 1.44/0.37 | 2.64/0.29 | 1.44/0.25 | 0.6/0.2 | 2.2/0.3 | 1.63/0.22 | 2.73/0.23 |)

[//]: # (| ORB-SLAM2 | Stereo | 0.88/0.31 | 0.77/0.28 | 0.62/0.26 | 0.89/0.27 | 1.03/0.31 | 0.86/0.25 | 0.62/0.29 |)

[//]: # ()
[//]: # (Metric3D makes mono-SLAM scale-aware, like stereo systems.)

[//]: # ()
[//]: # (#### KITTI sequence videos - YouTube)

[//]: # ([2011_09_30_drive_0028](https://youtu.be/gcTB4MgVCLQ) /)

[//]: # ([2011_09_30_drive_0033](https://youtu.be/He581fmoPP4) /)

[//]: # ([2011_09_30_drive_0034](https://youtu.be/I3PkukQ3_F8))

[//]: # ()
[//]: # (#### Estimated pose)

[//]: # ([2011_09_30_drive_0033](https://drive.google.com/file/d/1SMXWzLYrEdmBe6uYMR9ShtDXeFDewChv/view?usp=drive_link) / )

[//]: # ([2011_09_30_drive_0034](https://drive.google.com/file/d/1ONU4GxpvTlgW0TjReF1R2i-WFxbbjQPG/view?usp=drive_link) /)

[//]: # ([2011_10_03_drive_0042](https://drive.google.com/file/d/19fweg6p1Q6TjJD2KlD7EMA_aV4FIeQUD/view?usp=drive_link))

[//]: # ()
[//]: # (#### Pointcloud files)

[//]: # ([2011_09_30_drive_0033](https://drive.google.com/file/d/1K0o8DpUmLf-f_rue0OX1VaHlldpHBAfw/view?usp=drive_link) /)

[//]: # ([2011_09_30_drive_0034](https://drive.google.com/file/d/1bvZ6JwMRyvi07H7Z2VD_0NX1Im8qraZo/view?usp=drive_link) /)

[//]: # ([2011_10_03_drive_0042](https://drive.google.com/file/d/1Vw59F8nN5ApWdLeGKXvYgyS9SNKHKy4x/view?usp=drive_link))

## 🔨 Installation
### One-line Installation
For the ViT models, use the following environment:
```bash
pip install -r requirements_v2.txt
```

For the ConvNeXt-L model, use:
```bash
pip install -r requirements_v1.txt
```

### dataset annotation components
To work with off-the-shelf depth datasets, we need to generate JSON annotations compatible with this codebase, organized as follows:
```
dict(
    'files': list(
        dict(
            'rgb': 'data/kitti_demo/rgb/xxx.png',
            'depth': 'data/kitti_demo/depth/xxx.png',
            'depth_scale': 1000.0,  # the depth scale of the GT depth image
            'cam_in': [fx, fy, cx, cy],
        ),

        dict(
            ...
        ),

        ...
    )
)
```
To generate such annotations, please refer to the "Inference" section; a minimal sketch is also shown below.
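A minimal sketch of generating such a file (it mirrors ```data/gene_annos_kitti_demo.py``` added in this commit; the dataset root, intrinsics, and depth scale below are placeholders for your own data):
```python
import json
import os
import os.path as osp

data_root = 'data/my_dataset'           # hypothetical dataset root
cam_in = [707.0, 707.0, 604.0, 180.0]   # placeholder [fx, fy, cx, cy]
depth_scale = 256.0                     # divisor mapping png values to meters

files = []
for rgb_file in sorted(os.listdir(osp.join(data_root, 'rgb'))):
    rgb_path = osp.join(data_root, 'rgb', rgb_file)
    depth_path = rgb_path.replace('/rgb/', '/depth/')
    files.append(dict(rgb=rgb_path, depth=depth_path,
                      depth_scale=depth_scale, cam_in=cam_in))

with open(osp.join(data_root, 'test_annotations.json'), 'w') as f:
    json.dump(dict(files=files), f)
```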
### configs
In ```mono/configs``` we provide different config setups.

Intrinsics of the canonical camera are set below:
```
canonical_space = dict(
    img_size=(512, 960),
    focal_length=1000.0,
),
```
where cx and cy are set to half of the image size.
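As a concrete example (a sketch only; it assumes ```img_size``` is ordered as (height, width), consistent with the configs in this commit), the canonical intrinsics implied by the block above are:
```python
# Canonical [fx, fy, cx, cy] implied by the config above,
# assuming img_size = (height, width) = (512, 960).
h, w = 512, 960
fx = fy = 1000.0
cx, cy = w / 2.0, h / 2.0            # principal point at the image center
canonical_cam_in = [fx, fy, cx, cy]  # -> [1000.0, 1000.0, 480.0, 256.0]
```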
Inference settings are defined as
```
depth_range=(0, 1),
depth_normalize=(0.3, 150),
crop_size = (512, 1088),
```
where images are first resized to the ```crop_size``` and then fed into the model.
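A minimal sketch of that preprocessing step, assuming a plain resize to ```crop_size``` (the released pipeline may additionally pad or rescale the intrinsics):
```python
import cv2

crop_size = (512, 1088)  # (height, width), from the config above

def preprocess(rgb):
    # cv2.resize expects (width, height); rgb is an HxWx3 uint8 array
    return cv2.resize(rgb, (crop_size[1], crop_size[0]),
                      interpolation=cv2.INTER_LINEAR)
```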
## ✈️ Inference
### Download Checkpoint
| | Encoder | Decoder | Link |
|:----:|:-------------------:|:-----------------:|:---------------------------------------------------------------------------------------------------:|
| v1-T | ConvNeXt-Tiny | Hourglass-Decoder | Coming soon |
| v1-L | ConvNeXt-Large | Hourglass-Decoder | [Download](https://drive.google.com/file/d/1KVINiBkVpJylx_6z1lAC7CQ4kmn-RJRN/view?usp=drive_link) |
| v2-S | DINO2reg-ViT-Small | RAFT-4iter | [Download](https://drive.google.com/file/d/1YfmvXwpWmhLg3jSxnhT7LvY0yawlXcr_/view?usp=drive_link) |
| v2-L | DINO2reg-ViT-Large | RAFT-8iter | [Download](https://drive.google.com/file/d/1eT2gG-kwsVzNy5nJrbm4KC-9DbNKyLnr/view?usp=drive_link) |
| v2-g | DINO2reg-ViT-giant2 | RAFT-8iter | Coming soon |

### Dataset Mode
1. Put the trained checkpoint file ```model.pth``` in ```weight/```.
2. Generate the data annotation following ```data/gene_annos_kitti_demo.py```; each entry includes 'rgb' and, optionally, 'intrinsic', 'depth', and 'depth_scale'.
3. Change the 'test_data_path' in ```test_*.sh``` to the path of your ```*.json``` file.
4. Run ```source test_kitti.sh``` or ```source test_nyu.sh```.

### In-the-Wild Mode
1. Put the trained checkpoint file ```model.pth``` in ```weight/```.
2. Change the 'test_data_path' in ```test.sh``` to the image folder path.
3. Run ```source test_vit.sh``` for transformers and ```source test.sh``` for convnets.

Since no intrinsics are provided in this mode, we fall back on 9 default focal-length settings (a hedged sketch of such a sweep is shown below).
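The candidate grid and the ```run_model``` helper in this sketch are hypothetical, not the released implementation; they only illustrate trying several focal lengths when none is known:
```python
# Hypothetical focal-length sweep for intrinsics-free inference.
def sweep_focal_lengths(rgb, run_model, num_settings=9):
    h, w = rgb.shape[:2]
    outputs = []
    for i in range(num_settings):
        scale = 0.5 + 0.25 * i             # assumed candidate grid
        f = scale * max(h, w)              # candidate focal length in pixels
        cam_in = [f, f, w / 2.0, h / 2.0]  # principal point at the center
        outputs.append((f, run_model(rgb, cam_in)))
    return outputs
```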
## ❓ Q & A
### Q1: Why do the depth maps look good but the point clouds are distorted?
Because the focal length is not set properly! Please find a proper focal length by modifying the code [here](mono/utils/do_test.py#309) yourself.
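To see why the focal length matters, here is a minimal pinhole-unprojection sketch (standard geometry, not code from this repo): an fx/fy that is too small stretches the recovered points laterally, which is exactly the distortion described above.
```python
import numpy as np

def unproject(depth, cam_in):
    """Back-project an HxW metric depth map (meters) into an Nx3 point cloud."""
    fx, fy, cx, cy = cam_in
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    x = (u - cx) * depth / fx  # a wrong fx stretches or squeezes x
    y = (v - cy) * depth / fy  # a wrong fy stretches or squeezes y
    return np.stack([x, y, depth], axis=-1).reshape(-1, 3)
```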
### Q2: Why is point-cloud generation so slow?
Because the images are too large! Use smaller ones instead.

### Q3: Why are the predicted depth maps not satisfactory?
First, make sure all black padding regions at the image boundaries are cropped out, then try again.
Besides, Metric3D is not almighty. Some objects (chandeliers, drones, ...) and camera views (aerial view, BEV, ...) do not occur frequently in the training datasets. We will dig deeper into this and release more powerful solutions.

## 📧 Citation
```
@article{hu2024metric3dv2,
  title={A Versatile Monocular Geometric Foundation Model for Zero-shot Metric Depth and Surface Normal Estimation},
  author={Hu, Mu and Yin, Wei and Zhang, Chi and Cai, Zhipeng and Long, Xiaoxiao and Chen, Hao and Wang, Kaixuan and Yu, Gang and Shen, Chunhua and Shen, Shaojie},
  booktitle={arXiv},
  year={2024}
}
```
```
@article{yin2023metric,
  title={Metric3D: Towards Zero-shot Metric 3D Prediction from A Single Image},
  author={Yin, Wei and Zhang, Chi and Chen, Hao and Cai, Zhipeng and Yu, Gang and Wang, Kaixuan and Chen, Xiaozhi and Shen, Chunhua},
  booktitle={ICCV},
  year={2023}
}
```

## License and Contact

The *Metric 3D* code is under a 2-clause BSD license for non-commercial usage. For further questions, contact Dr. Yvan Yin [yvanwy@outlook.com] and Mr. Mu Hu [mhuam@connect.ust.hk].
data/gene_annos_kitti_demo.py
ADDED
@@ -0,0 +1,32 @@
if __name__=='__main__':
    import os
    import os.path as osp
    import numpy as np
    import cv2
    import json

    code_root = '/mnt/nas/share/home/xugk/MetricDepth_test/'

    data_root = osp.join(code_root, 'data/kitti_demo')
    split_root = code_root

    files = []
    rgb_root = osp.join(data_root, 'rgb')
    depth_root = osp.join(data_root, 'depth')
    for rgb_file in os.listdir(rgb_root):
        # store paths relative to code_root by splitting it off
        rgb_path = osp.join(rgb_root, rgb_file).split(split_root)[-1]
        depth_path = rgb_path.replace('/rgb/', '/depth/')
        cam_in = [707.0493, 707.0493, 604.0814, 180.5066]  # KITTI [fx, fy, cx, cy]
        depth_scale = 256.  # KITTI depth pngs store depth * 256

        meta_data = {}
        meta_data['cam_in'] = cam_in
        meta_data['rgb'] = rgb_path
        meta_data['depth'] = depth_path
        meta_data['depth_scale'] = depth_scale
        files.append(meta_data)
    files_dict = dict(files=files)

    with open(osp.join(code_root, 'data/kitti_demo/test_annotations.json'), 'w') as f:
        json.dump(files_dict, f)
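A quick, hypothetical sanity check for the file this script writes (not part of the commit): load it back and confirm every entry carries the expected keys.
```python
import json

with open('data/kitti_demo/test_annotations.json') as f:
    annos = json.load(f)
for meta in annos['files']:
    assert {'rgb', 'depth', 'depth_scale', 'cam_in'} <= meta.keys()
print(f"{len(annos['files'])} samples OK")
```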
data/gene_annos_nyu_demo.py
ADDED
@@ -0,0 +1,31 @@
if __name__=='__main__':
    import os
    import os.path as osp
    import numpy as np
    import cv2
    import json

    code_root = '/mnt/nas/share/home/xugk/MetricDepth_test/'

    data_root = osp.join(code_root, 'data/nyu_demo')
    split_root = code_root

    files = []
    rgb_root = osp.join(data_root, 'rgb')
    depth_root = osp.join(data_root, 'depth')
    for rgb_file in os.listdir(rgb_root):
        rgb_path = osp.join(rgb_root, rgb_file).split(split_root)[-1]
        depth_path = rgb_path.replace('.jpg', '.png').replace('/rgb_', '/sync_depth_').replace('/rgb/', '/depth/')
        cam_in = [518.8579, 519.46961, 325.58245, 253.73617]
        depth_scale = 1000.

        meta_data = {}
        meta_data['cam_in'] = cam_in
        meta_data['rgb'] = rgb_path
        meta_data['depth'] = depth_path
        meta_data['depth_scale'] = depth_scale
        files.append(meta_data)
    files_dict = dict(files=files)

    with open(osp.join(code_root, 'data/nyu_demo/test_annotations.json'), 'w') as f:
        json.dump(files_dict, f)
data/kitti_demo/depth/0000000005.png
ADDED
Git LFS Details
data/kitti_demo/depth/0000000050.png
ADDED
Git LFS Details
data/kitti_demo/depth/0000000100.png
ADDED
Git LFS Details
data/kitti_demo/rgb/0000000005.png
ADDED
Git LFS Details
data/kitti_demo/rgb/0000000050.png
ADDED
Git LFS Details
data/kitti_demo/rgb/0000000100.png
ADDED
Git LFS Details
data/kitti_demo/test_annotations.json
ADDED
@@ -0,0 +1 @@
{"files": [{"cam_in": [707.0493, 707.0493, 604.0814, 180.5066], "rgb": "data/kitti_demo/rgb/0000000050.png", "depth": "data/kitti_demo/depth/0000000050.png", "depth_scale": 256.0}, {"cam_in": [707.0493, 707.0493, 604.0814, 180.5066], "rgb": "data/kitti_demo/rgb/0000000100.png", "depth": "data/kitti_demo/depth/0000000100.png", "depth_scale": 256.0}, {"cam_in": [707.0493, 707.0493, 604.0814, 180.5066], "rgb": "data/kitti_demo/rgb/0000000005.png", "depth": "data/kitti_demo/depth/0000000005.png", "depth_scale": 256.0}]}
data/nyu_demo/depth/sync_depth_00000.png
ADDED
Git LFS Details
data/nyu_demo/depth/sync_depth_00050.png
ADDED
Git LFS Details
data/nyu_demo/depth/sync_depth_00100.png
ADDED
Git LFS Details
data/nyu_demo/rgb/rgb_00000.jpg
ADDED
data/nyu_demo/rgb/rgb_00050.jpg
ADDED
data/nyu_demo/rgb/rgb_00100.jpg
ADDED
data/nyu_demo/test_annotations.json
ADDED
@@ -0,0 +1 @@
{"files": [{"cam_in": [518.8579, 519.46961, 325.58245, 253.73617], "rgb": "data/nyu_demo/rgb/rgb_00000.jpg", "depth": "data/nyu_demo/depth/sync_depth_00000.png", "depth_scale": 1000.0}, {"cam_in": [518.8579, 519.46961, 325.58245, 253.73617], "rgb": "data/nyu_demo/rgb/rgb_00050.jpg", "depth": "data/nyu_demo/depth/sync_depth_00050.png", "depth_scale": 1000.0}, {"cam_in": [518.8579, 519.46961, 325.58245, 253.73617], "rgb": "data/nyu_demo/rgb/rgb_00100.jpg", "depth": "data/nyu_demo/depth/sync_depth_00100.png", "depth_scale": 1000.0}]}
data/wild_demo/david-kohler-VFRTXGw1VjU-unsplash.jpg
ADDED
data/wild_demo/jonathan-borba-CnthDZXCdoY-unsplash.jpg
ADDED
data/wild_demo/randy-fath-G1yhU1Ej-9A-unsplash.jpg
ADDED
data_info/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .public_datasets import *
from .pretrained_weight import *
data_info/pretrained_weight.py
ADDED
@@ -0,0 +1,16 @@
mldb_info={}

mldb_info['checkpoint']={
    'mldb_root': '/mnt/nas/share/home/xugk/ckpt', # NOTE: modify it to the pretrained ckpt root

    # pretrained weight for convnext
    'convnext_tiny': 'convnext/convnext_tiny_22k_1k_384.pth',
    'convnext_small': 'convnext/convnext_small_22k_1k_384.pth',
    'convnext_base': 'convnext/convnext_base_22k_1k_384.pth',
    'convnext_large': 'convnext/convnext_large_22k_1k_384.pth',
    'vit_large': 'vit/dinov2_vitl14_pretrain.pth',
    'vit_small_reg': 'vit/dinov2_vits14_reg4_pretrain.pth',
    'vit_large_reg': 'vit/dinov2_vitl14_reg4_pretrain.pth',
    'vit_giant2_reg': 'vit/dinov2_vitg14_reg4_pretrain.pth',
}
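A small sketch of how these entries can be resolved to absolute checkpoint paths (an assumed usage pattern, not code from this commit). Note that since ```data_info/__init__.py``` star-imports both modules, the ```mldb_info``` defined in ```pretrained_weight.py``` shadows the one from ```public_datasets.py```, so this sketch imports from the module directly:
```python
import os.path as osp
from data_info.pretrained_weight import mldb_info

ckpt = mldb_info['checkpoint']
vit_large_path = osp.join(ckpt['mldb_root'], ckpt['vit_large'])
# -> '/mnt/nas/share/home/xugk/ckpt/vit/dinov2_vitl14_pretrain.pth'
```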
data_info/public_datasets.py
ADDED
@@ -0,0 +1,7 @@
mldb_info = {}

mldb_info['NYU']={
    'mldb_root': '/mnt/nas/share/home/xugk/data/',
    'data_root': 'nyu',
    'test_annotations_path': 'nyu/test_annotation.json',
}
media/gifs/demo_1.gif
ADDED
Git LFS Details
media/gifs/demo_12.gif
ADDED
Git LFS Details
media/gifs/demo_2.gif
ADDED
Git LFS Details
media/gifs/demo_22.gif
ADDED
Git LFS Details
media/screenshots/challenge.PNG
ADDED
media/screenshots/page2.png
ADDED
Git LFS Details
media/screenshots/pipeline.png
ADDED
Git LFS Details
mono/configs/HourglassDecoder/convlarge.0.3_150.py
ADDED
@@ -0,0 +1,25 @@
_base_=[
    '../_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py',
    '../_base_/datasets/_data_base_.py',
    '../_base_/default_runtime.py',
]

model = dict(
    backbone=dict(
        pretrained=False,
    )
)

# configs of the canonical space
data_basic=dict(
    canonical_space = dict(
        img_size=(512, 960),
        focal_length=1000.0,
    ),
    depth_range=(0, 1),
    depth_normalize=(0.3, 150),
    crop_size = (544, 1216),
)

batchsize_per_gpu = 2
thread_per_gpu = 4
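These configs use mmcv-style ```_base_``` inheritance: fields from the listed base files are merged, and keys redefined locally (e.g. ```crop_size``` or ```backbone.pretrained```) override the base values. A hedged sketch of loading one, assuming an mmcv ```Config``` loader is what the repo uses:
```python
from mmcv import Config  # assumption: an mmcv-style config loader

cfg = Config.fromfile('mono/configs/HourglassDecoder/convlarge.0.3_150.py')
print(cfg.data_basic.crop_size)       # (544, 1216), overridden locally
print(cfg.model.backbone.pretrained)  # False, overriding the base file's True
```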
mono/configs/HourglassDecoder/test_kitti_convlarge.0.3_150.py
ADDED
@@ -0,0 +1,25 @@
_base_=[
    '../_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py',
    '../_base_/datasets/_data_base_.py',
    '../_base_/default_runtime.py',
]

model = dict(
    backbone=dict(
        pretrained=False,
    )
)

# configs of the canonical space
data_basic=dict(
    canonical_space = dict(
        img_size=(512, 960),
        focal_length=1000.0,
    ),
    depth_range=(0, 1),
    depth_normalize=(0.3, 150),
    crop_size = (512, 1088),
)

batchsize_per_gpu = 2
thread_per_gpu = 4
mono/configs/HourglassDecoder/test_nyu_convlarge.0.3_150.py
ADDED
@@ -0,0 +1,25 @@
_base_=[
    '../_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py',
    '../_base_/datasets/_data_base_.py',
    '../_base_/default_runtime.py',
]

model = dict(
    backbone=dict(
        pretrained=False,
    )
)

# configs of the canonical space
data_basic=dict(
    canonical_space = dict(
        img_size=(512, 960),
        focal_length=1000.0,
    ),
    depth_range=(0, 1),
    depth_normalize=(0.3, 150),
    crop_size = (480, 1216),
)

batchsize_per_gpu = 2
thread_per_gpu = 4
mono/configs/HourglassDecoder/vit.raft5.large.py
ADDED
@@ -0,0 +1,33 @@
_base_=[
    '../_base_/models/encoder_decoder/dino_vit_large_reg.dpt_raft.py',
    '../_base_/datasets/_data_base_.py',
    '../_base_/default_runtime.py',
]

import numpy as np
model=dict(
    decode_head=dict(
        type='RAFTDepthNormalDPT5',
        iters=8,
        n_downsample=2,
        detach=False,
    )
)


max_value = 200
# configs of the canonical space
data_basic=dict(
    canonical_space = dict(
        # img_size=(540, 960),
        focal_length=1000.0,
    ),
    depth_range=(0, 1),
    depth_normalize=(0.1, max_value),
    crop_size = (616, 1064),  # %28 = 0
    clip_depth_range=(0.1, 200),
    vit_size=(616, 1064)
)

batchsize_per_gpu = 1
thread_per_gpu = 1
mono/configs/HourglassDecoder/vit.raft5.small.py
ADDED
@@ -0,0 +1,33 @@
_base_=[
    '../_base_/models/encoder_decoder/dino_vit_small_reg.dpt_raft.py',
    '../_base_/datasets/_data_base_.py',
    '../_base_/default_runtime.py',
]

import numpy as np
model=dict(
    decode_head=dict(
        type='RAFTDepthNormalDPT5',
        iters=4,
        n_downsample=2,
        detach=False,
    )
)


max_value = 200
# configs of the canonical space
data_basic=dict(
    canonical_space = dict(
        # img_size=(540, 960),
        focal_length=1000.0,
    ),
    depth_range=(0, 1),
    depth_normalize=(0.1, max_value),
    crop_size = (616, 1064),  # %28 = 0
    clip_depth_range=(0.1, 200),
    vit_size=(616, 1064)
)

batchsize_per_gpu = 1
thread_per_gpu = 1
mono/configs/__init__.py
ADDED
@@ -0,0 +1 @@
mono/configs/_base_/_data_base_.py
ADDED
@@ -0,0 +1,13 @@
# canonical camera setting and basic data setting
# we set it the same as the E300 camera (crop version)
#
data_basic=dict(
    canonical_space = dict(
        img_size=(540, 960),
        focal_length=1196.0,
    ),
    depth_range=(0.9, 150),
    depth_normalize=(0.006, 1.001),
    crop_size = (512, 960),
    clip_depth_range=(0.9, 150),
)
mono/configs/_base_/datasets/_data_base_.py
ADDED
@@ -0,0 +1,12 @@
# canonical camera setting and basic data setting
#
data_basic=dict(
    canonical_space = dict(
        img_size=(540, 960),
        focal_length=1196.0,
    ),
    depth_range=(0.9, 150),
    depth_normalize=(0.006, 1.001),
    crop_size = (512, 960),
    clip_depth_range=(0.9, 150),
)
mono/configs/_base_/default_runtime.py
ADDED
@@ -0,0 +1,4 @@
load_from = None
cudnn_benchmark = True
test_metrics = ['abs_rel', 'rmse', 'silog', 'delta1', 'delta2', 'delta3', 'rmse_log', 'log10', 'sq_rel']
mono/configs/_base_/models/backbones/convnext_large.py
ADDED
@@ -0,0 +1,16 @@
#_base_ = ['./_model_base_.py',]

#'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-large_3rdparty_in21k_20220301-e6e0ea0a.pth'
model = dict(
    #type='EncoderDecoderAuxi',
    backbone=dict(
        type='convnext_large',
        pretrained=True,
        in_22k=True,
        out_indices=[0, 1, 2, 3],
        drop_path_rate=0.4,
        layer_scale_init_value=1.0,
        checkpoint='data/pretrained_weight_repo/convnext/convnext_large_22k_1k_384.pth',
        prefix='backbones.',
        out_channels=[192, 384, 768, 1536]),
    )
mono/configs/_base_/models/backbones/dino_vit_large.py
ADDED
@@ -0,0 +1,7 @@
model = dict(
    backbone=dict(
        type='vit_large',
        prefix='backbones.',
        out_channels=[1024, 1024, 1024, 1024],
        drop_path_rate = 0.0),
    )
mono/configs/_base_/models/backbones/dino_vit_large_reg.py
ADDED
@@ -0,0 +1,7 @@
model = dict(
    backbone=dict(
        type='vit_large_reg',
        prefix='backbones.',
        out_channels=[1024, 1024, 1024, 1024],
        drop_path_rate = 0.0),
    )
mono/configs/_base_/models/backbones/dino_vit_small_reg.py
ADDED
@@ -0,0 +1,7 @@
model = dict(
    backbone=dict(
        type='vit_small_reg',
        prefix='backbones.',
        out_channels=[384, 384, 384, 384],
        drop_path_rate = 0.0),
    )
mono/configs/_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py
ADDED
@@ -0,0 +1,10 @@
# model settings
_base_ = ['../backbones/convnext_large.py',]
model = dict(
    type='DensePredModel',
    decode_head=dict(
        type='HourglassDecoder',
        in_channels=[192, 384, 768, 1536],
        decoder_channel=[128, 128, 256, 512],
        prefix='decode_heads.'),
    )
mono/configs/_base_/models/encoder_decoder/dino_vit_large.dpt_raft.py
ADDED
@@ -0,0 +1,20 @@
# model settings
_base_ = ['../backbones/dino_vit_large.py']
model = dict(
    type='DensePredModel',
    decode_head=dict(
        type='RAFTDepthDPT',
        in_channels=[1024, 1024, 1024, 1024],
        use_cls_token=True,
        feature_channels = [256, 512, 1024, 1024], # [2/7, 1/7, 1/14, 1/14]
        decoder_channels = [128, 256, 512, 1024, 1024], # [4/7, 2/7, 1/7, 1/14, 1/14]
        up_scale = 7,
        hidden_channels=[128, 128, 128, 128], # [x_4, x_8, x_16, x_32] [192, 384, 768, 1536]
        n_gru_layers=3,
        n_downsample=2,
        iters=12,
        slow_fast_gru=True,
        corr_radius=4,
        corr_levels=4,
        prefix='decode_heads.'),
    )
mono/configs/_base_/models/encoder_decoder/dino_vit_large_reg.dpt_raft.py
ADDED
@@ -0,0 +1,19 @@
# model settings
_base_ = ['../backbones/dino_vit_large_reg.py']
model = dict(
    type='DensePredModel',
    decode_head=dict(
        type='RAFTDepthDPT',
        in_channels=[1024, 1024, 1024, 1024],
        use_cls_token=True,
        feature_channels = [256, 512, 1024, 1024], # [2/7, 1/7, 1/14, 1/14]
        decoder_channels = [128, 256, 512, 1024, 1024], # [4/7, 2/7, 1/7, 1/14, 1/14]
        up_scale = 7,
        hidden_channels=[128, 128, 128, 128], # [x_4, x_8, x_16, x_32] [192, 384, 768, 1536]
        n_gru_layers=3,
        n_downsample=2,
        iters=3,
        slow_fast_gru=True,
        num_register_tokens=4,
        prefix='decode_heads.'),
    )
mono/configs/_base_/models/encoder_decoder/dino_vit_small_reg.dpt_raft.py
ADDED
@@ -0,0 +1,19 @@
# model settings
_base_ = ['../backbones/dino_vit_small_reg.py']
model = dict(
    type='DensePredModel',
    decode_head=dict(
        type='RAFTDepthDPT',
        in_channels=[384, 384, 384, 384],
        use_cls_token=True,
        feature_channels = [96, 192, 384, 768], # [2/7, 1/7, 1/14, 1/14]
        decoder_channels = [48, 96, 192, 384, 384], # [-, 1/4, 1/7, 1/14, 1/14]
        up_scale = 7,
        hidden_channels=[48, 48, 48, 48], # [x_4, x_8, x_16, x_32] [1/4, 1/7, 1/14, -]
        n_gru_layers=3,
        n_downsample=2,
        iters=3,
        slow_fast_gru=True,
        num_register_tokens=4,
        prefix='decode_heads.'),
    )
mono/model/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .monodepth_model import DepthModel
# from .__base_model__ import BaseDepthModel


__all__ = ['DepthModel', 'BaseDepthModel']