CHSTR commited on
Commit
265ae36
·
verified ·
1 Parent(s): 23f154c

Upload src

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. src/__pycache__/model_LN_prompt.cpython-310.pyc +0 -0
  2. src/__pycache__/options.cpython-310.pyc +0 -0
  3. src/dinov2/__init__.py +6 -0
  4. src/dinov2/__pycache__/__init__.cpython-310.pyc +0 -0
  5. src/dinov2/configs/__init__.py +22 -0
  6. src/dinov2/configs/eval/vitb14_pretrain.yaml +6 -0
  7. src/dinov2/configs/eval/vitb14_reg4_pretrain.yaml +9 -0
  8. src/dinov2/configs/eval/vitg14_pretrain.yaml +7 -0
  9. src/dinov2/configs/eval/vitg14_reg4_pretrain.yaml +10 -0
  10. src/dinov2/configs/eval/vitl14_pretrain.yaml +6 -0
  11. src/dinov2/configs/eval/vitl14_reg4_pretrain.yaml +9 -0
  12. src/dinov2/configs/eval/vits14_pretrain.yaml +6 -0
  13. src/dinov2/configs/eval/vits14_reg4_pretrain.yaml +9 -0
  14. src/dinov2/configs/ssl_default_config.yaml +118 -0
  15. src/dinov2/configs/train/vitg14.yaml +26 -0
  16. src/dinov2/configs/train/vitl14.yaml +26 -0
  17. src/dinov2/configs/train/vitl16_short.yaml +6 -0
  18. src/dinov2/data/__init__.py +10 -0
  19. src/dinov2/data/adapters.py +28 -0
  20. src/dinov2/data/augmentations.py +118 -0
  21. src/dinov2/data/collate.py +49 -0
  22. src/dinov2/data/datasets/__init__.py +7 -0
  23. src/dinov2/data/datasets/decoders.py +31 -0
  24. src/dinov2/data/datasets/extended.py +38 -0
  25. src/dinov2/data/datasets/image_net.py +290 -0
  26. src/dinov2/data/datasets/image_net_22k.py +302 -0
  27. src/dinov2/data/loaders.py +222 -0
  28. src/dinov2/data/masking.py +86 -0
  29. src/dinov2/data/samplers.py +229 -0
  30. src/dinov2/data/transforms.py +91 -0
  31. src/dinov2/distributed/__init__.py +270 -0
  32. src/dinov2/eval/__init__.py +4 -0
  33. src/dinov2/eval/depth/__init__.py +4 -0
  34. src/dinov2/eval/depth/models/__init__.py +10 -0
  35. src/dinov2/eval/depth/models/backbones/__init__.py +6 -0
  36. src/dinov2/eval/depth/models/backbones/vision_transformer.py +16 -0
  37. src/dinov2/eval/depth/models/builder.py +49 -0
  38. src/dinov2/eval/depth/models/decode_heads/__init__.py +7 -0
  39. src/dinov2/eval/depth/models/decode_heads/decode_head.py +225 -0
  40. src/dinov2/eval/depth/models/decode_heads/dpt_head.py +270 -0
  41. src/dinov2/eval/depth/models/decode_heads/linear_head.py +89 -0
  42. src/dinov2/eval/depth/models/depther/__init__.py +7 -0
  43. src/dinov2/eval/depth/models/depther/base.py +194 -0
  44. src/dinov2/eval/depth/models/depther/encoder_decoder.py +236 -0
  45. src/dinov2/eval/depth/models/losses/__init__.py +7 -0
  46. src/dinov2/eval/depth/models/losses/gradientloss.py +69 -0
  47. src/dinov2/eval/depth/models/losses/sigloss.py +65 -0
  48. src/dinov2/eval/depth/ops/__init__.py +6 -0
  49. src/dinov2/eval/depth/ops/wrappers.py +28 -0
  50. src/dinov2/eval/knn.py +404 -0
src/__pycache__/model_LN_prompt.cpython-310.pyc ADDED
Binary file (2.7 kB). View file
 
src/__pycache__/options.cpython-310.pyc ADDED
Binary file (634 Bytes). View file
 
src/dinov2/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ __version__ = "0.0.1"
src/dinov2/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (157 Bytes). View file
 
src/dinov2/configs/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import pathlib
7
+
8
+ from omegaconf import OmegaConf
9
+
10
+
11
+ def load_config(config_name: str):
12
+ config_filename = config_name + ".yaml"
13
+ return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename)
14
+
15
+
16
+ dinov2_default_config = load_config("ssl_default_config")
17
+
18
+
19
+ def load_and_merge_config(config_name: str):
20
+ default_config = OmegaConf.create(dinov2_default_config)
21
+ loaded_config = load_config(config_name)
22
+ return OmegaConf.merge(default_config, loaded_config)
src/dinov2/configs/eval/vitb14_pretrain.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ student:
2
+ arch: vit_base
3
+ patch_size: 14
4
+ crops:
5
+ global_crops_size: 518 # this is to set up the position embeddings properly
6
+ local_crops_size: 98
src/dinov2/configs/eval/vitb14_reg4_pretrain.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ student:
2
+ arch: vit_base
3
+ patch_size: 14
4
+ num_register_tokens: 4
5
+ interpolate_antialias: true
6
+ interpolate_offset: 0.0
7
+ crops:
8
+ global_crops_size: 518 # this is to set up the position embeddings properly
9
+ local_crops_size: 98
src/dinov2/configs/eval/vitg14_pretrain.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ student:
2
+ arch: vit_giant2
3
+ patch_size: 14
4
+ ffn_layer: swiglufused
5
+ crops:
6
+ global_crops_size: 518 # this is to set up the position embeddings properly
7
+ local_crops_size: 98
src/dinov2/configs/eval/vitg14_reg4_pretrain.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ student:
2
+ arch: vit_giant2
3
+ patch_size: 14
4
+ ffn_layer: swiglufused
5
+ num_register_tokens: 4
6
+ interpolate_antialias: true
7
+ interpolate_offset: 0.0
8
+ crops:
9
+ global_crops_size: 518 # this is to set up the position embeddings properly
10
+ local_crops_size: 98
src/dinov2/configs/eval/vitl14_pretrain.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ student:
2
+ arch: vit_large
3
+ patch_size: 14
4
+ crops:
5
+ global_crops_size: 518 # this is to set up the position embeddings properly
6
+ local_crops_size: 98
src/dinov2/configs/eval/vitl14_reg4_pretrain.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ student:
2
+ arch: vit_large
3
+ patch_size: 14
4
+ num_register_tokens: 4
5
+ interpolate_antialias: true
6
+ interpolate_offset: 0.0
7
+ crops:
8
+ global_crops_size: 518 # this is to set up the position embeddings properly
9
+ local_crops_size: 98
src/dinov2/configs/eval/vits14_pretrain.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ student:
2
+ arch: vit_small
3
+ patch_size: 14
4
+ crops:
5
+ global_crops_size: 518 # this is to set up the position embeddings properly
6
+ local_crops_size: 98
src/dinov2/configs/eval/vits14_reg4_pretrain.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ student:
2
+ arch: vit_small
3
+ patch_size: 14
4
+ num_register_tokens: 4
5
+ interpolate_antialias: true
6
+ interpolate_offset: 0.0
7
+ crops:
8
+ global_crops_size: 518 # this is to set up the position embeddings properly
9
+ local_crops_size: 98
src/dinov2/configs/ssl_default_config.yaml ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ WEIGHTS: ''
3
+ compute_precision:
4
+ grad_scaler: true
5
+ teacher:
6
+ backbone:
7
+ sharding_strategy: SHARD_GRAD_OP
8
+ mixed_precision:
9
+ param_dtype: fp16
10
+ reduce_dtype: fp16
11
+ buffer_dtype: fp32
12
+ dino_head:
13
+ sharding_strategy: SHARD_GRAD_OP
14
+ mixed_precision:
15
+ param_dtype: fp16
16
+ reduce_dtype: fp16
17
+ buffer_dtype: fp32
18
+ ibot_head:
19
+ sharding_strategy: SHARD_GRAD_OP
20
+ mixed_precision:
21
+ param_dtype: fp16
22
+ reduce_dtype: fp16
23
+ buffer_dtype: fp32
24
+ student:
25
+ backbone:
26
+ sharding_strategy: SHARD_GRAD_OP
27
+ mixed_precision:
28
+ param_dtype: fp16
29
+ reduce_dtype: fp16
30
+ buffer_dtype: fp32
31
+ dino_head:
32
+ sharding_strategy: SHARD_GRAD_OP
33
+ mixed_precision:
34
+ param_dtype: fp16
35
+ reduce_dtype: fp32
36
+ buffer_dtype: fp32
37
+ ibot_head:
38
+ sharding_strategy: SHARD_GRAD_OP
39
+ mixed_precision:
40
+ param_dtype: fp16
41
+ reduce_dtype: fp32
42
+ buffer_dtype: fp32
43
+ dino:
44
+ loss_weight: 1.0
45
+ head_n_prototypes: 65536
46
+ head_bottleneck_dim: 256
47
+ head_nlayers: 3
48
+ head_hidden_dim: 2048
49
+ koleo_loss_weight: 0.1
50
+ ibot:
51
+ loss_weight: 1.0
52
+ mask_sample_probability: 0.5
53
+ mask_ratio_min_max:
54
+ - 0.1
55
+ - 0.5
56
+ separate_head: false
57
+ head_n_prototypes: 65536
58
+ head_bottleneck_dim: 256
59
+ head_nlayers: 3
60
+ head_hidden_dim: 2048
61
+ train:
62
+ batch_size_per_gpu: 64
63
+ dataset_path: ImageNet:split=TRAIN
64
+ output_dir: .
65
+ saveckp_freq: 20
66
+ seed: 0
67
+ num_workers: 10
68
+ OFFICIAL_EPOCH_LENGTH: 1250
69
+ cache_dataset: true
70
+ centering: "centering" # or "sinkhorn_knopp"
71
+ student:
72
+ arch: vit_large
73
+ patch_size: 16
74
+ drop_path_rate: 0.3
75
+ layerscale: 1.0e-05
76
+ drop_path_uniform: true
77
+ pretrained_weights: ''
78
+ ffn_layer: "mlp"
79
+ block_chunks: 0
80
+ qkv_bias: true
81
+ proj_bias: true
82
+ ffn_bias: true
83
+ num_register_tokens: 0
84
+ interpolate_antialias: false
85
+ interpolate_offset: 0.1
86
+ teacher:
87
+ momentum_teacher: 0.992
88
+ final_momentum_teacher: 1
89
+ warmup_teacher_temp: 0.04
90
+ teacher_temp: 0.07
91
+ warmup_teacher_temp_epochs: 30
92
+ optim:
93
+ epochs: 100
94
+ weight_decay: 0.04
95
+ weight_decay_end: 0.4
96
+ base_lr: 0.004 # learning rate for a batch size of 1024
97
+ lr: 0. # will be set after applying scaling rule
98
+ warmup_epochs: 10
99
+ min_lr: 1.0e-06
100
+ clip_grad: 3.0
101
+ freeze_last_layer_epochs: 1
102
+ scaling_rule: sqrt_wrt_1024
103
+ patch_embed_lr_mult: 0.2
104
+ layerwise_decay: 0.9
105
+ adamw_beta1: 0.9
106
+ adamw_beta2: 0.999
107
+ crops:
108
+ global_crops_scale:
109
+ - 0.32
110
+ - 1.0
111
+ local_crops_number: 8
112
+ local_crops_scale:
113
+ - 0.05
114
+ - 0.32
115
+ global_crops_size: 224
116
+ local_crops_size: 96
117
+ evaluation:
118
+ eval_period_iterations: 12500
src/dinov2/configs/train/vitg14.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dino:
2
+ head_n_prototypes: 131072
3
+ head_bottleneck_dim: 384
4
+ ibot:
5
+ separate_head: true
6
+ head_n_prototypes: 131072
7
+ train:
8
+ batch_size_per_gpu: 12
9
+ dataset_path: ImageNet22k
10
+ centering: sinkhorn_knopp
11
+ student:
12
+ arch: vit_giant2
13
+ patch_size: 14
14
+ drop_path_rate: 0.4
15
+ ffn_layer: swiglufused
16
+ block_chunks: 4
17
+ teacher:
18
+ momentum_teacher: 0.994
19
+ optim:
20
+ epochs: 500
21
+ weight_decay_end: 0.2
22
+ base_lr: 2.0e-04 # learning rate for a batch size of 1024
23
+ warmup_epochs: 80
24
+ layerwise_decay: 1.0
25
+ crops:
26
+ local_crops_size: 98
src/dinov2/configs/train/vitl14.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dino:
2
+ head_n_prototypes: 131072
3
+ head_bottleneck_dim: 384
4
+ ibot:
5
+ separate_head: true
6
+ head_n_prototypes: 131072
7
+ train:
8
+ batch_size_per_gpu: 32
9
+ dataset_path: ImageNet22k
10
+ centering: sinkhorn_knopp
11
+ student:
12
+ arch: vit_large
13
+ patch_size: 14
14
+ drop_path_rate: 0.4
15
+ ffn_layer: swiglufused
16
+ block_chunks: 4
17
+ teacher:
18
+ momentum_teacher: 0.994
19
+ optim:
20
+ epochs: 500
21
+ weight_decay_end: 0.2
22
+ base_lr: 2.0e-04 # learning rate for a batch size of 1024
23
+ warmup_epochs: 80
24
+ layerwise_decay: 1.0
25
+ crops:
26
+ local_crops_size: 98
src/dinov2/configs/train/vitl16_short.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # this corresponds to the default config
2
+ train:
3
+ dataset_path: ImageNet:split=TRAIN
4
+ batch_size_per_gpu: 64
5
+ student:
6
+ block_chunks: 4
src/dinov2/data/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .adapters import DatasetWithEnumeratedTargets
7
+ from .loaders import make_data_loader, make_dataset, SamplerType
8
+ from .collate import collate_data_and_cast
9
+ from .masking import MaskingGenerator
10
+ from .augmentations import DataAugmentationDINO
src/dinov2/data/adapters.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Any, Tuple
7
+
8
+ from torch.utils.data import Dataset
9
+
10
+
11
+ class DatasetWithEnumeratedTargets(Dataset):
12
+ def __init__(self, dataset):
13
+ self._dataset = dataset
14
+
15
+ def get_image_data(self, index: int) -> bytes:
16
+ return self._dataset.get_image_data(index)
17
+
18
+ def get_target(self, index: int) -> Tuple[Any, int]:
19
+ target = self._dataset.get_target(index)
20
+ return (index, target)
21
+
22
+ def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]:
23
+ image, target = self._dataset[index]
24
+ target = index if target is None else target
25
+ return image, (index, target)
26
+
27
+ def __len__(self) -> int:
28
+ return len(self._dataset)
src/dinov2/data/augmentations.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+
8
+ from torchvision import transforms
9
+
10
+ from .transforms import (
11
+ GaussianBlur,
12
+ make_normalize_transform,
13
+ )
14
+
15
+
16
+ logger = logging.getLogger("dinov2")
17
+
18
+
19
+ class DataAugmentationDINO(object):
20
+ def __init__(
21
+ self,
22
+ global_crops_scale,
23
+ local_crops_scale,
24
+ local_crops_number,
25
+ global_crops_size=224,
26
+ local_crops_size=96,
27
+ ):
28
+ self.global_crops_scale = global_crops_scale
29
+ self.local_crops_scale = local_crops_scale
30
+ self.local_crops_number = local_crops_number
31
+ self.global_crops_size = global_crops_size
32
+ self.local_crops_size = local_crops_size
33
+
34
+ logger.info("###################################")
35
+ logger.info("Using data augmentation parameters:")
36
+ logger.info(f"global_crops_scale: {global_crops_scale}")
37
+ logger.info(f"local_crops_scale: {local_crops_scale}")
38
+ logger.info(f"local_crops_number: {local_crops_number}")
39
+ logger.info(f"global_crops_size: {global_crops_size}")
40
+ logger.info(f"local_crops_size: {local_crops_size}")
41
+ logger.info("###################################")
42
+
43
+ # random resized crop and flip
44
+ self.geometric_augmentation_global = transforms.Compose(
45
+ [
46
+ transforms.RandomResizedCrop(
47
+ global_crops_size, scale=global_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC
48
+ ),
49
+ transforms.RandomHorizontalFlip(p=0.5),
50
+ ]
51
+ )
52
+
53
+ self.geometric_augmentation_local = transforms.Compose(
54
+ [
55
+ transforms.RandomResizedCrop(
56
+ local_crops_size, scale=local_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC
57
+ ),
58
+ transforms.RandomHorizontalFlip(p=0.5),
59
+ ]
60
+ )
61
+
62
+ # color distorsions / blurring
63
+ color_jittering = transforms.Compose(
64
+ [
65
+ transforms.RandomApply(
66
+ [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
67
+ p=0.8,
68
+ ),
69
+ transforms.RandomGrayscale(p=0.2),
70
+ ]
71
+ )
72
+
73
+ global_transfo1_extra = GaussianBlur(p=1.0)
74
+
75
+ global_transfo2_extra = transforms.Compose(
76
+ [
77
+ GaussianBlur(p=0.1),
78
+ transforms.RandomSolarize(threshold=128, p=0.2),
79
+ ]
80
+ )
81
+
82
+ local_transfo_extra = GaussianBlur(p=0.5)
83
+
84
+ # normalization
85
+ self.normalize = transforms.Compose(
86
+ [
87
+ transforms.ToTensor(),
88
+ make_normalize_transform(),
89
+ ]
90
+ )
91
+
92
+ self.global_transfo1 = transforms.Compose([color_jittering, global_transfo1_extra, self.normalize])
93
+ self.global_transfo2 = transforms.Compose([color_jittering, global_transfo2_extra, self.normalize])
94
+ self.local_transfo = transforms.Compose([color_jittering, local_transfo_extra, self.normalize])
95
+
96
+ def __call__(self, image):
97
+ output = {}
98
+
99
+ # global crops:
100
+ im1_base = self.geometric_augmentation_global(image)
101
+ global_crop_1 = self.global_transfo1(im1_base)
102
+
103
+ im2_base = self.geometric_augmentation_global(image)
104
+ global_crop_2 = self.global_transfo2(im2_base)
105
+
106
+ output["global_crops"] = [global_crop_1, global_crop_2]
107
+
108
+ # global crops for teacher:
109
+ output["global_crops_teacher"] = [global_crop_1, global_crop_2]
110
+
111
+ # local crops:
112
+ local_crops = [
113
+ self.local_transfo(self.geometric_augmentation_local(image)) for _ in range(self.local_crops_number)
114
+ ]
115
+ output["local_crops"] = local_crops
116
+ output["offsets"] = ()
117
+
118
+ return output
src/dinov2/data/collate.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import random
8
+
9
+
10
+ def collate_data_and_cast(samples_list, mask_ratio_tuple, mask_probability, dtype, n_tokens=None, mask_generator=None):
11
+ # dtype = torch.half # TODO: Remove
12
+
13
+ n_global_crops = len(samples_list[0][0]["global_crops"])
14
+ n_local_crops = len(samples_list[0][0]["local_crops"])
15
+
16
+ collated_global_crops = torch.stack([s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list])
17
+
18
+ collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list])
19
+
20
+ B = len(collated_global_crops)
21
+ N = n_tokens
22
+ n_samples_masked = int(B * mask_probability)
23
+ probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1)
24
+ upperbound = 0
25
+ masks_list = []
26
+ for i in range(0, n_samples_masked):
27
+ prob_min = probs[i]
28
+ prob_max = probs[i + 1]
29
+ masks_list.append(torch.BoolTensor(mask_generator(int(N * random.uniform(prob_min, prob_max)))))
30
+ upperbound += int(N * prob_max)
31
+ for i in range(n_samples_masked, B):
32
+ masks_list.append(torch.BoolTensor(mask_generator(0)))
33
+
34
+ random.shuffle(masks_list)
35
+
36
+ collated_masks = torch.stack(masks_list).flatten(1)
37
+ mask_indices_list = collated_masks.flatten().nonzero().flatten()
38
+
39
+ masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks]
40
+
41
+ return {
42
+ "collated_global_crops": collated_global_crops.to(dtype),
43
+ "collated_local_crops": collated_local_crops.to(dtype),
44
+ "collated_masks": collated_masks,
45
+ "mask_indices_list": mask_indices_list,
46
+ "masks_weight": masks_weight,
47
+ "upperbound": upperbound,
48
+ "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long),
49
+ }
src/dinov2/data/datasets/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .image_net import ImageNet
7
+ from .image_net_22k import ImageNet22k
src/dinov2/data/datasets/decoders.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from io import BytesIO
7
+ from typing import Any
8
+
9
+ from PIL import Image
10
+
11
+
12
+ class Decoder:
13
+ def decode(self) -> Any:
14
+ raise NotImplementedError
15
+
16
+
17
+ class ImageDataDecoder(Decoder):
18
+ def __init__(self, image_data: bytes) -> None:
19
+ self._image_data = image_data
20
+
21
+ def decode(self) -> Image:
22
+ f = BytesIO(self._image_data)
23
+ return Image.open(f).convert(mode="RGB")
24
+
25
+
26
+ class TargetDecoder(Decoder):
27
+ def __init__(self, target: Any):
28
+ self._target = target
29
+
30
+ def decode(self) -> Any:
31
+ return self._target
src/dinov2/data/datasets/extended.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Any, Tuple
7
+
8
+ from torchvision.datasets import VisionDataset
9
+
10
+ from .decoders import TargetDecoder, ImageDataDecoder
11
+
12
+
13
+ class ExtendedVisionDataset(VisionDataset):
14
+ def __init__(self, *args, **kwargs) -> None:
15
+ super().__init__(*args, **kwargs) # type: ignore
16
+
17
+ def get_image_data(self, index: int) -> bytes:
18
+ raise NotImplementedError
19
+
20
+ def get_target(self, index: int) -> Any:
21
+ raise NotImplementedError
22
+
23
+ def __getitem__(self, index: int) -> Tuple[Any, Any]:
24
+ try:
25
+ image_data = self.get_image_data(index)
26
+ image = ImageDataDecoder(image_data).decode()
27
+ except Exception as e:
28
+ raise RuntimeError(f"can not read image for sample {index}") from e
29
+ target = self.get_target(index)
30
+ target = TargetDecoder(target).decode()
31
+
32
+ if self.transforms is not None:
33
+ image, target = self.transforms(image, target)
34
+
35
+ return image, target
36
+
37
+ def __len__(self) -> int:
38
+ raise NotImplementedError
src/dinov2/data/datasets/image_net.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import csv
7
+ from enum import Enum
8
+ import logging
9
+ import os
10
+ from typing import Callable, List, Optional, Tuple, Union
11
+
12
+ import numpy as np
13
+
14
+ from .extended import ExtendedVisionDataset
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+ _Target = int
19
+
20
+
21
+ class _Split(Enum):
22
+ TRAIN = "train"
23
+ VAL = "val"
24
+ TEST = "test" # NOTE: torchvision does not support the test split
25
+
26
+ @property
27
+ def length(self) -> int:
28
+ split_lengths = {
29
+ _Split.TRAIN: 1_281_167,
30
+ _Split.VAL: 50_000,
31
+ _Split.TEST: 100_000,
32
+ }
33
+ return split_lengths[self]
34
+
35
+ def get_dirname(self, class_id: Optional[str] = None) -> str:
36
+ return self.value if class_id is None else os.path.join(self.value, class_id)
37
+
38
+ def get_image_relpath(self, actual_index: int, class_id: Optional[str] = None) -> str:
39
+ dirname = self.get_dirname(class_id)
40
+ if self == _Split.TRAIN:
41
+ basename = f"{class_id}_{actual_index}"
42
+ else: # self in (_Split.VAL, _Split.TEST):
43
+ basename = f"ILSVRC2012_{self.value}_{actual_index:08d}"
44
+ return os.path.join(dirname, basename + ".JPEG")
45
+
46
+ def parse_image_relpath(self, image_relpath: str) -> Tuple[str, int]:
47
+ assert self != _Split.TEST
48
+ dirname, filename = os.path.split(image_relpath)
49
+ class_id = os.path.split(dirname)[-1]
50
+ basename, _ = os.path.splitext(filename)
51
+ actual_index = int(basename.split("_")[-1])
52
+ return class_id, actual_index
53
+
54
+
55
+ class ImageNet(ExtendedVisionDataset):
56
+ Target = Union[_Target]
57
+ Split = Union[_Split]
58
+
59
+ def __init__(
60
+ self,
61
+ *,
62
+ split: "ImageNet.Split",
63
+ root: str,
64
+ extra: str,
65
+ transforms: Optional[Callable] = None,
66
+ transform: Optional[Callable] = None,
67
+ target_transform: Optional[Callable] = None,
68
+ ) -> None:
69
+ super().__init__(root, transforms, transform, target_transform)
70
+ self._extra_root = extra
71
+ self._split = split
72
+
73
+ self._entries = None
74
+ self._class_ids = None
75
+ self._class_names = None
76
+
77
+ @property
78
+ def split(self) -> "ImageNet.Split":
79
+ return self._split
80
+
81
+ def _get_extra_full_path(self, extra_path: str) -> str:
82
+ return os.path.join(self._extra_root, extra_path)
83
+
84
+ def _load_extra(self, extra_path: str) -> np.ndarray:
85
+ extra_full_path = self._get_extra_full_path(extra_path)
86
+ return np.load(extra_full_path, mmap_mode="r")
87
+
88
+ def _save_extra(self, extra_array: np.ndarray, extra_path: str) -> None:
89
+ extra_full_path = self._get_extra_full_path(extra_path)
90
+ os.makedirs(self._extra_root, exist_ok=True)
91
+ np.save(extra_full_path, extra_array)
92
+
93
+ @property
94
+ def _entries_path(self) -> str:
95
+ return f"entries-{self._split.value.upper()}.npy"
96
+
97
+ @property
98
+ def _class_ids_path(self) -> str:
99
+ return f"class-ids-{self._split.value.upper()}.npy"
100
+
101
+ @property
102
+ def _class_names_path(self) -> str:
103
+ return f"class-names-{self._split.value.upper()}.npy"
104
+
105
+ def _get_entries(self) -> np.ndarray:
106
+ if self._entries is None:
107
+ self._entries = self._load_extra(self._entries_path)
108
+ assert self._entries is not None
109
+ return self._entries
110
+
111
+ def _get_class_ids(self) -> np.ndarray:
112
+ if self._split == _Split.TEST:
113
+ assert False, "Class IDs are not available in TEST split"
114
+ if self._class_ids is None:
115
+ self._class_ids = self._load_extra(self._class_ids_path)
116
+ assert self._class_ids is not None
117
+ return self._class_ids
118
+
119
+ def _get_class_names(self) -> np.ndarray:
120
+ if self._split == _Split.TEST:
121
+ assert False, "Class names are not available in TEST split"
122
+ if self._class_names is None:
123
+ self._class_names = self._load_extra(self._class_names_path)
124
+ assert self._class_names is not None
125
+ return self._class_names
126
+
127
+ def find_class_id(self, class_index: int) -> str:
128
+ class_ids = self._get_class_ids()
129
+ return str(class_ids[class_index])
130
+
131
+ def find_class_name(self, class_index: int) -> str:
132
+ class_names = self._get_class_names()
133
+ return str(class_names[class_index])
134
+
135
+ def get_image_data(self, index: int) -> bytes:
136
+ entries = self._get_entries()
137
+ actual_index = entries[index]["actual_index"]
138
+
139
+ class_id = self.get_class_id(index)
140
+
141
+ image_relpath = self.split.get_image_relpath(actual_index, class_id)
142
+ image_full_path = os.path.join(self.root, image_relpath)
143
+ with open(image_full_path, mode="rb") as f:
144
+ image_data = f.read()
145
+ return image_data
146
+
147
+ def get_target(self, index: int) -> Optional[Target]:
148
+ entries = self._get_entries()
149
+ class_index = entries[index]["class_index"]
150
+ return None if self.split == _Split.TEST else int(class_index)
151
+
152
+ def get_targets(self) -> Optional[np.ndarray]:
153
+ entries = self._get_entries()
154
+ return None if self.split == _Split.TEST else entries["class_index"]
155
+
156
+ def get_class_id(self, index: int) -> Optional[str]:
157
+ entries = self._get_entries()
158
+ class_id = entries[index]["class_id"]
159
+ return None if self.split == _Split.TEST else str(class_id)
160
+
161
+ def get_class_name(self, index: int) -> Optional[str]:
162
+ entries = self._get_entries()
163
+ class_name = entries[index]["class_name"]
164
+ return None if self.split == _Split.TEST else str(class_name)
165
+
166
+ def __len__(self) -> int:
167
+ entries = self._get_entries()
168
+ assert len(entries) == self.split.length
169
+ return len(entries)
170
+
171
+ def _load_labels(self, labels_path: str) -> List[Tuple[str, str]]:
172
+ labels_full_path = os.path.join(self.root, labels_path)
173
+ labels = []
174
+
175
+ try:
176
+ with open(labels_full_path, "r") as f:
177
+ reader = csv.reader(f)
178
+ for row in reader:
179
+ class_id, class_name = row
180
+ labels.append((class_id, class_name))
181
+ except OSError as e:
182
+ raise RuntimeError(f'can not read labels file "{labels_full_path}"') from e
183
+
184
+ return labels
185
+
186
+ def _dump_entries(self) -> None:
187
+ split = self.split
188
+ if split == ImageNet.Split.TEST:
189
+ dataset = None
190
+ sample_count = split.length
191
+ max_class_id_length, max_class_name_length = 0, 0
192
+ else:
193
+ labels_path = "labels.txt"
194
+ logger.info(f'loading labels from "{labels_path}"')
195
+ labels = self._load_labels(labels_path)
196
+
197
+ # NOTE: Using torchvision ImageFolder for consistency
198
+ from torchvision.datasets import ImageFolder
199
+
200
+ dataset_root = os.path.join(self.root, split.get_dirname())
201
+ dataset = ImageFolder(dataset_root)
202
+ sample_count = len(dataset)
203
+ max_class_id_length, max_class_name_length = -1, -1
204
+ for sample in dataset.samples:
205
+ _, class_index = sample
206
+ class_id, class_name = labels[class_index]
207
+ max_class_id_length = max(len(class_id), max_class_id_length)
208
+ max_class_name_length = max(len(class_name), max_class_name_length)
209
+
210
+ dtype = np.dtype(
211
+ [
212
+ ("actual_index", "<u4"),
213
+ ("class_index", "<u4"),
214
+ ("class_id", f"U{max_class_id_length}"),
215
+ ("class_name", f"U{max_class_name_length}"),
216
+ ]
217
+ )
218
+ entries_array = np.empty(sample_count, dtype=dtype)
219
+
220
+ if split == ImageNet.Split.TEST:
221
+ old_percent = -1
222
+ for index in range(sample_count):
223
+ percent = 100 * (index + 1) // sample_count
224
+ if percent > old_percent:
225
+ logger.info(f"creating entries: {percent}%")
226
+ old_percent = percent
227
+
228
+ actual_index = index + 1
229
+ class_index = np.uint32(-1)
230
+ class_id, class_name = "", ""
231
+ entries_array[index] = (actual_index, class_index, class_id, class_name)
232
+ else:
233
+ class_names = {class_id: class_name for class_id, class_name in labels}
234
+
235
+ assert dataset
236
+ old_percent = -1
237
+ for index in range(sample_count):
238
+ percent = 100 * (index + 1) // sample_count
239
+ if percent > old_percent:
240
+ logger.info(f"creating entries: {percent}%")
241
+ old_percent = percent
242
+
243
+ image_full_path, class_index = dataset.samples[index]
244
+ image_relpath = os.path.relpath(image_full_path, self.root)
245
+ class_id, actual_index = split.parse_image_relpath(image_relpath)
246
+ class_name = class_names[class_id]
247
+ entries_array[index] = (actual_index, class_index, class_id, class_name)
248
+
249
+ logger.info(f'saving entries to "{self._entries_path}"')
250
+ self._save_extra(entries_array, self._entries_path)
251
+
252
+ def _dump_class_ids_and_names(self) -> None:
253
+ split = self.split
254
+ if split == ImageNet.Split.TEST:
255
+ return
256
+
257
+ entries_array = self._load_extra(self._entries_path)
258
+
259
+ max_class_id_length, max_class_name_length, max_class_index = -1, -1, -1
260
+ for entry in entries_array:
261
+ class_index, class_id, class_name = (
262
+ entry["class_index"],
263
+ entry["class_id"],
264
+ entry["class_name"],
265
+ )
266
+ max_class_index = max(int(class_index), max_class_index)
267
+ max_class_id_length = max(len(str(class_id)), max_class_id_length)
268
+ max_class_name_length = max(len(str(class_name)), max_class_name_length)
269
+
270
+ class_count = max_class_index + 1
271
+ class_ids_array = np.empty(class_count, dtype=f"U{max_class_id_length}")
272
+ class_names_array = np.empty(class_count, dtype=f"U{max_class_name_length}")
273
+ for entry in entries_array:
274
+ class_index, class_id, class_name = (
275
+ entry["class_index"],
276
+ entry["class_id"],
277
+ entry["class_name"],
278
+ )
279
+ class_ids_array[class_index] = class_id
280
+ class_names_array[class_index] = class_name
281
+
282
+ logger.info(f'saving class IDs to "{self._class_ids_path}"')
283
+ self._save_extra(class_ids_array, self._class_ids_path)
284
+
285
+ logger.info(f'saving class names to "{self._class_names_path}"')
286
+ self._save_extra(class_names_array, self._class_names_path)
287
+
288
+ def dump_extra(self) -> None:
289
+ self._dump_entries()
290
+ self._dump_class_ids_and_names()
src/dinov2/data/datasets/image_net_22k.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from dataclasses import dataclass
7
+ from enum import Enum
8
+ from functools import lru_cache
9
+ from gzip import GzipFile
10
+ from io import BytesIO
11
+ from mmap import ACCESS_READ, mmap
12
+ import os
13
+ from typing import Any, Callable, List, Optional, Set, Tuple
14
+ import warnings
15
+
16
+ import numpy as np
17
+
18
+ from .extended import ExtendedVisionDataset
19
+
20
+
21
+ _Labels = int
22
+
23
+ _DEFAULT_MMAP_CACHE_SIZE = 16 # Warning: This can exhaust file descriptors
24
+
25
+
26
+ @dataclass
27
+ class _ClassEntry:
28
+ block_offset: int
29
+ maybe_filename: Optional[str] = None
30
+
31
+
32
+ @dataclass
33
+ class _Entry:
34
+ class_index: int # noqa: E701
35
+ start_offset: int
36
+ end_offset: int
37
+ filename: str
38
+
39
+
40
+ class _Split(Enum):
41
+ TRAIN = "train"
42
+ VAL = "val"
43
+
44
+ @property
45
+ def length(self) -> int:
46
+ return {
47
+ _Split.TRAIN: 11_797_647,
48
+ _Split.VAL: 561_050,
49
+ }[self]
50
+
51
+ def entries_path(self):
52
+ return f"imagenet21kp_{self.value}.txt"
53
+
54
+
55
+ def _get_tarball_path(class_id: str) -> str:
56
+ return f"{class_id}.tar"
57
+
58
+
59
+ def _make_mmap_tarball(tarballs_root: str, mmap_cache_size: int):
60
+ @lru_cache(maxsize=mmap_cache_size)
61
+ def _mmap_tarball(class_id: str) -> mmap:
62
+ tarball_path = _get_tarball_path(class_id)
63
+ tarball_full_path = os.path.join(tarballs_root, tarball_path)
64
+ with open(tarball_full_path) as f:
65
+ return mmap(fileno=f.fileno(), length=0, access=ACCESS_READ)
66
+
67
+ return _mmap_tarball
68
+
69
+
70
+ class ImageNet22k(ExtendedVisionDataset):
71
+ _GZIPPED_INDICES: Set[int] = {
72
+ 841_545,
73
+ 1_304_131,
74
+ 2_437_921,
75
+ 2_672_079,
76
+ 2_795_676,
77
+ 2_969_786,
78
+ 6_902_965,
79
+ 6_903_550,
80
+ 6_903_628,
81
+ 7_432_557,
82
+ 7_432_589,
83
+ 7_813_809,
84
+ 8_329_633,
85
+ 10_296_990,
86
+ 10_417_652,
87
+ 10_492_265,
88
+ 10_598_078,
89
+ 10_782_398,
90
+ 10_902_612,
91
+ 11_203_736,
92
+ 11_342_890,
93
+ 11_397_596,
94
+ 11_589_762,
95
+ 11_705_103,
96
+ 12_936_875,
97
+ 13_289_782,
98
+ }
99
+ Labels = _Labels
100
+
101
+ def __init__(
102
+ self,
103
+ *,
104
+ root: str,
105
+ extra: str,
106
+ transforms: Optional[Callable] = None,
107
+ transform: Optional[Callable] = None,
108
+ target_transform: Optional[Callable] = None,
109
+ mmap_cache_size: int = _DEFAULT_MMAP_CACHE_SIZE,
110
+ ) -> None:
111
+ super().__init__(root, transforms, transform, target_transform)
112
+ self._extra_root = extra
113
+
114
+ entries_path = self._get_entries_path(root)
115
+ self._entries = self._load_extra(entries_path)
116
+
117
+ class_ids_path = self._get_class_ids_path(root)
118
+ self._class_ids = self._load_extra(class_ids_path)
119
+
120
+ self._gzipped_indices = ImageNet22k._GZIPPED_INDICES
121
+ self._mmap_tarball = _make_mmap_tarball(self._tarballs_root, mmap_cache_size)
122
+
123
+ def _get_entries_path(self, root: Optional[str] = None) -> str:
124
+ return "entries.npy"
125
+
126
+ def _get_class_ids_path(self, root: Optional[str] = None) -> str:
127
+ return "class-ids.npy"
128
+
129
+ def _find_class_ids(self, path: str) -> List[str]:
130
+ class_ids = []
131
+
132
+ with os.scandir(path) as entries:
133
+ for entry in entries:
134
+ root, ext = os.path.splitext(entry.name)
135
+ if ext != ".tar":
136
+ continue
137
+ class_ids.append(root)
138
+
139
+ return sorted(class_ids)
140
+
141
+ def _load_entries_class_ids(self, root: Optional[str] = None) -> Tuple[List[_Entry], List[str]]:
142
+ root = self.get_root(root)
143
+ entries: List[_Entry] = []
144
+ class_ids = self._find_class_ids(root)
145
+
146
+ for class_index, class_id in enumerate(class_ids):
147
+ path = os.path.join(root, "blocks", f"{class_id}.log")
148
+ class_entries = []
149
+
150
+ try:
151
+ with open(path) as f:
152
+ for line in f:
153
+ line = line.rstrip()
154
+ block, filename = line.split(":")
155
+ block_offset = int(block[6:])
156
+ filename = filename[1:]
157
+
158
+ maybe_filename = None
159
+ if filename != "** Block of NULs **":
160
+ maybe_filename = filename
161
+ _, ext = os.path.splitext(filename)
162
+ # assert ext == ".JPEG"
163
+
164
+ class_entry = _ClassEntry(block_offset, maybe_filename)
165
+ class_entries.append(class_entry)
166
+ except OSError as e:
167
+ raise RuntimeError(f'can not read blocks file "{path}"') from e
168
+
169
+ assert class_entries[-1].maybe_filename is None
170
+
171
+ for class_entry1, class_entry2 in zip(class_entries, class_entries[1:]):
172
+ assert class_entry1.block_offset <= class_entry2.block_offset
173
+ start_offset = 512 * class_entry1.block_offset
174
+ end_offset = 512 * class_entry2.block_offset
175
+ assert class_entry1.maybe_filename is not None
176
+ filename = class_entry1.maybe_filename
177
+ entry = _Entry(class_index, start_offset, end_offset, filename)
178
+ # Skip invalid image files (PIL throws UnidentifiedImageError)
179
+ if filename == "n06470073_47249.JPEG":
180
+ continue
181
+ entries.append(entry)
182
+
183
+ return entries, class_ids
184
+
185
+ def _load_extra(self, extra_path: str) -> np.ndarray:
186
+ extra_root = self._extra_root
187
+ extra_full_path = os.path.join(extra_root, extra_path)
188
+ return np.load(extra_full_path, mmap_mode="r")
189
+
190
+ def _save_extra(self, extra_array: np.ndarray, extra_path: str) -> None:
191
+ extra_root = self._extra_root
192
+ extra_full_path = os.path.join(extra_root, extra_path)
193
+ os.makedirs(extra_root, exist_ok=True)
194
+ np.save(extra_full_path, extra_array)
195
+
196
+ @property
197
+ def _tarballs_root(self) -> str:
198
+ return self.root
199
+
200
+ def find_class_id(self, class_index: int) -> str:
201
+ return str(self._class_ids[class_index])
202
+
203
+ def get_image_data(self, index: int) -> bytes:
204
+ entry = self._entries[index]
205
+ class_id = entry["class_id"]
206
+ class_mmap = self._mmap_tarball(class_id)
207
+
208
+ start_offset, end_offset = entry["start_offset"], entry["end_offset"]
209
+ try:
210
+ mapped_data = class_mmap[start_offset:end_offset]
211
+ data = mapped_data[512:] # Skip entry header block
212
+
213
+ if len(data) >= 2 and tuple(data[:2]) == (0x1F, 0x8B):
214
+ assert index in self._gzipped_indices, f"unexpected gzip header for sample {index}"
215
+ with GzipFile(fileobj=BytesIO(data)) as g:
216
+ data = g.read()
217
+ except Exception as e:
218
+ raise RuntimeError(f"can not retrieve image data for sample {index} " f'from "{class_id}" tarball') from e
219
+
220
+ return data
221
+
222
+ def get_target(self, index: int) -> Any:
223
+ return int(self._entries[index]["class_index"])
224
+
225
+ def get_targets(self) -> np.ndarray:
226
+ return self._entries["class_index"]
227
+
228
+ def get_class_id(self, index: int) -> str:
229
+ return str(self._entries[index]["class_id"])
230
+
231
+ def get_class_ids(self) -> np.ndarray:
232
+ return self._entries["class_id"]
233
+
234
+ def __getitem__(self, index: int) -> Tuple[Any, Any]:
235
+ with warnings.catch_warnings():
236
+ warnings.simplefilter("ignore")
237
+ return super().__getitem__(index)
238
+
239
+ def __len__(self) -> int:
240
+ return len(self._entries)
241
+
242
+ def _dump_entries(self, *args, **kwargs) -> None:
243
+ entries, class_ids = self._load_entries_class_ids(*args, **kwargs)
244
+
245
+ max_class_id_length, max_filename_length, max_class_index = -1, -1, -1
246
+ for entry in entries:
247
+ class_id = class_ids[entry.class_index]
248
+ max_class_index = max(entry.class_index, max_class_index)
249
+ max_class_id_length = max(len(class_id), max_class_id_length)
250
+ max_filename_length = max(len(entry.filename), max_filename_length)
251
+
252
+ dtype = np.dtype(
253
+ [
254
+ ("class_index", "<u4"),
255
+ ("class_id", f"U{max_class_id_length}"),
256
+ ("start_offset", "<u4"),
257
+ ("end_offset", "<u4"),
258
+ ("filename", f"U{max_filename_length}"),
259
+ ]
260
+ )
261
+ sample_count = len(entries)
262
+ entries_array = np.empty(sample_count, dtype=dtype)
263
+ for i, entry in enumerate(entries):
264
+ class_index = entry.class_index
265
+ class_id = class_ids[class_index]
266
+ start_offset = entry.start_offset
267
+ end_offset = entry.end_offset
268
+ filename = entry.filename
269
+ entries_array[i] = (
270
+ class_index,
271
+ class_id,
272
+ start_offset,
273
+ end_offset,
274
+ filename,
275
+ )
276
+
277
+ entries_path = self._get_entries_path(*args, **kwargs)
278
+ self._save_extra(entries_array, entries_path)
279
+
280
+ def _dump_class_ids(self, *args, **kwargs) -> None:
281
+ entries_path = self._get_entries_path(*args, **kwargs)
282
+ entries_array = self._load_extra(entries_path)
283
+
284
+ max_class_id_length, max_class_index = -1, -1
285
+ for entry in entries_array:
286
+ class_index, class_id = entry["class_index"], entry["class_id"]
287
+ max_class_index = max(int(class_index), max_class_index)
288
+ max_class_id_length = max(len(str(class_id)), max_class_id_length)
289
+
290
+ class_ids_array = np.empty(max_class_index + 1, dtype=f"U{max_class_id_length}")
291
+ for entry in entries_array:
292
+ class_index, class_id = entry["class_index"], entry["class_id"]
293
+ class_ids_array[class_index] = class_id
294
+ class_ids_path = self._get_class_ids_path(*args, **kwargs)
295
+ self._save_extra(class_ids_array, class_ids_path)
296
+
297
+ def _dump_extra(self, *args, **kwargs) -> None:
298
+ self._dump_entries(*args, *kwargs)
299
+ self._dump_class_ids(*args, *kwargs)
300
+
301
+ def dump_extra(self, root: Optional[str] = None) -> None:
302
+ return self._dump_extra(root)
src/dinov2/data/loaders.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from enum import Enum
8
+ from typing import Any, Callable, List, Optional, TypeVar
9
+
10
+ import torch
11
+ from torch.utils.data import Sampler
12
+
13
+ from .datasets import ImageNet, ImageNet22k
14
+ from .samplers import EpochSampler, InfiniteSampler, ShardedInfiniteSampler
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ class SamplerType(Enum):
21
+ DISTRIBUTED = 0
22
+ EPOCH = 1
23
+ INFINITE = 2
24
+ SHARDED_INFINITE = 3
25
+ SHARDED_INFINITE_NEW = 4
26
+
27
+
28
+ def _make_bool_str(b: bool) -> str:
29
+ return "yes" if b else "no"
30
+
31
+
32
+ def _make_sample_transform(image_transform: Optional[Callable] = None, target_transform: Optional[Callable] = None):
33
+ def transform(sample):
34
+ image, target = sample
35
+ if image_transform is not None:
36
+ image = image_transform(image)
37
+ if target_transform is not None:
38
+ target = target_transform(target)
39
+ return image, target
40
+
41
+ return transform
42
+
43
+
44
+ def _parse_dataset_str(dataset_str: str):
45
+ tokens = dataset_str.split(":")
46
+
47
+ name = tokens[0]
48
+ kwargs = {}
49
+
50
+ for token in tokens[1:]:
51
+ key, value = token.split("=")
52
+ assert key in ("root", "extra", "split")
53
+ kwargs[key] = value
54
+
55
+ if name == "ImageNet":
56
+ class_ = ImageNet
57
+ if "split" in kwargs:
58
+ kwargs["split"] = ImageNet.Split[kwargs["split"]]
59
+ elif name == "ImageNet22k":
60
+ class_ = ImageNet22k
61
+ else:
62
+ raise ValueError(f'Unsupported dataset "{name}"')
63
+
64
+ return class_, kwargs
65
+
66
+
67
+ def make_dataset(
68
+ *,
69
+ dataset_str: str,
70
+ transform: Optional[Callable] = None,
71
+ target_transform: Optional[Callable] = None,
72
+ ):
73
+ """
74
+ Creates a dataset with the specified parameters.
75
+
76
+ Args:
77
+ dataset_str: A dataset string description (e.g. ImageNet:split=TRAIN).
78
+ transform: A transform to apply to images.
79
+ target_transform: A transform to apply to targets.
80
+
81
+ Returns:
82
+ The created dataset.
83
+ """
84
+ logger.info(f'using dataset: "{dataset_str}"')
85
+
86
+ class_, kwargs = _parse_dataset_str(dataset_str)
87
+ dataset = class_(transform=transform, target_transform=target_transform, **kwargs)
88
+
89
+ logger.info(f"# of dataset samples: {len(dataset):,d}")
90
+
91
+ # Aggregated datasets do not expose (yet) these attributes, so add them.
92
+ if not hasattr(dataset, "transform"):
93
+ setattr(dataset, "transform", transform)
94
+ if not hasattr(dataset, "target_transform"):
95
+ setattr(dataset, "target_transform", target_transform)
96
+
97
+ return dataset
98
+
99
+
100
+ def _make_sampler(
101
+ *,
102
+ dataset,
103
+ type: Optional[SamplerType] = None,
104
+ shuffle: bool = False,
105
+ seed: int = 0,
106
+ size: int = -1,
107
+ advance: int = 0,
108
+ ) -> Optional[Sampler]:
109
+ sample_count = len(dataset)
110
+
111
+ if type == SamplerType.INFINITE:
112
+ logger.info("sampler: infinite")
113
+ if size > 0:
114
+ raise ValueError("sampler size > 0 is invalid")
115
+ return InfiniteSampler(
116
+ sample_count=sample_count,
117
+ shuffle=shuffle,
118
+ seed=seed,
119
+ advance=advance,
120
+ )
121
+ elif type in (SamplerType.SHARDED_INFINITE, SamplerType.SHARDED_INFINITE_NEW):
122
+ logger.info("sampler: sharded infinite")
123
+ if size > 0:
124
+ raise ValueError("sampler size > 0 is invalid")
125
+ # TODO: Remove support for old shuffling
126
+ use_new_shuffle_tensor_slice = type == SamplerType.SHARDED_INFINITE_NEW
127
+ return ShardedInfiniteSampler(
128
+ sample_count=sample_count,
129
+ shuffle=shuffle,
130
+ seed=seed,
131
+ advance=advance,
132
+ use_new_shuffle_tensor_slice=use_new_shuffle_tensor_slice,
133
+ )
134
+ elif type == SamplerType.EPOCH:
135
+ logger.info("sampler: epoch")
136
+ if advance > 0:
137
+ raise NotImplementedError("sampler advance > 0 is not supported")
138
+ size = size if size > 0 else sample_count
139
+ logger.info(f"# of samples / epoch: {size:,d}")
140
+ return EpochSampler(
141
+ size=size,
142
+ sample_count=sample_count,
143
+ shuffle=shuffle,
144
+ seed=seed,
145
+ )
146
+ elif type == SamplerType.DISTRIBUTED:
147
+ logger.info("sampler: distributed")
148
+ if size > 0:
149
+ raise ValueError("sampler size > 0 is invalid")
150
+ if advance > 0:
151
+ raise ValueError("sampler advance > 0 is invalid")
152
+ return torch.utils.data.DistributedSampler(
153
+ dataset=dataset,
154
+ shuffle=shuffle,
155
+ seed=seed,
156
+ drop_last=False,
157
+ )
158
+
159
+ logger.info("sampler: none")
160
+ return None
161
+
162
+
163
+ T = TypeVar("T")
164
+
165
+
166
+ def make_data_loader(
167
+ *,
168
+ dataset,
169
+ batch_size: int,
170
+ num_workers: int,
171
+ shuffle: bool = True,
172
+ seed: int = 0,
173
+ sampler_type: Optional[SamplerType] = SamplerType.INFINITE,
174
+ sampler_size: int = -1,
175
+ sampler_advance: int = 0,
176
+ drop_last: bool = True,
177
+ persistent_workers: bool = False,
178
+ collate_fn: Optional[Callable[[List[T]], Any]] = None,
179
+ ):
180
+ """
181
+ Creates a data loader with the specified parameters.
182
+
183
+ Args:
184
+ dataset: A dataset (third party, LaViDa or WebDataset).
185
+ batch_size: The size of batches to generate.
186
+ num_workers: The number of workers to use.
187
+ shuffle: Whether to shuffle samples.
188
+ seed: The random seed to use.
189
+ sampler_type: Which sampler to use: EPOCH, INFINITE, SHARDED_INFINITE, SHARDED_INFINITE_NEW, DISTRIBUTED or None.
190
+ sampler_size: The number of images per epoch (when applicable) or -1 for the entire dataset.
191
+ sampler_advance: How many samples to skip (when applicable).
192
+ drop_last: Whether the last non-full batch of data should be dropped.
193
+ persistent_workers: maintain the workers Dataset instances alive after a dataset has been consumed once.
194
+ collate_fn: Function that performs batch collation
195
+ """
196
+
197
+ sampler = _make_sampler(
198
+ dataset=dataset,
199
+ type=sampler_type,
200
+ shuffle=shuffle,
201
+ seed=seed,
202
+ size=sampler_size,
203
+ advance=sampler_advance,
204
+ )
205
+
206
+ logger.info("using PyTorch data loader")
207
+ data_loader = torch.utils.data.DataLoader(
208
+ dataset,
209
+ sampler=sampler,
210
+ batch_size=batch_size,
211
+ num_workers=num_workers,
212
+ pin_memory=True,
213
+ drop_last=drop_last,
214
+ persistent_workers=persistent_workers,
215
+ collate_fn=collate_fn,
216
+ )
217
+
218
+ try:
219
+ logger.info(f"# of batches: {len(data_loader):,d}")
220
+ except TypeError: # data loader has no length
221
+ logger.info("infinite data loader")
222
+ return data_loader
src/dinov2/data/masking.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import random
7
+ import math
8
+ import numpy as np
9
+
10
+
11
+ class MaskingGenerator:
12
+ def __init__(
13
+ self,
14
+ input_size,
15
+ num_masking_patches=None,
16
+ min_num_patches=4,
17
+ max_num_patches=None,
18
+ min_aspect=0.3,
19
+ max_aspect=None,
20
+ ):
21
+ if not isinstance(input_size, tuple):
22
+ input_size = (input_size,) * 2
23
+ self.height, self.width = input_size
24
+
25
+ self.num_patches = self.height * self.width
26
+ self.num_masking_patches = num_masking_patches
27
+
28
+ self.min_num_patches = min_num_patches
29
+ self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches
30
+
31
+ max_aspect = max_aspect or 1 / min_aspect
32
+ self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect))
33
+
34
+ def __repr__(self):
35
+ repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
36
+ self.height,
37
+ self.width,
38
+ self.min_num_patches,
39
+ self.max_num_patches,
40
+ self.num_masking_patches,
41
+ self.log_aspect_ratio[0],
42
+ self.log_aspect_ratio[1],
43
+ )
44
+ return repr_str
45
+
46
+ def get_shape(self):
47
+ return self.height, self.width
48
+
49
+ def _mask(self, mask, max_mask_patches):
50
+ delta = 0
51
+ for _ in range(10):
52
+ target_area = random.uniform(self.min_num_patches, max_mask_patches)
53
+ aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
54
+ h = int(round(math.sqrt(target_area * aspect_ratio)))
55
+ w = int(round(math.sqrt(target_area / aspect_ratio)))
56
+ if w < self.width and h < self.height:
57
+ top = random.randint(0, self.height - h)
58
+ left = random.randint(0, self.width - w)
59
+
60
+ num_masked = mask[top : top + h, left : left + w].sum()
61
+ # Overlap
62
+ if 0 < h * w - num_masked <= max_mask_patches:
63
+ for i in range(top, top + h):
64
+ for j in range(left, left + w):
65
+ if mask[i, j] == 0:
66
+ mask[i, j] = 1
67
+ delta += 1
68
+
69
+ if delta > 0:
70
+ break
71
+ return delta
72
+
73
+ def __call__(self, num_masking_patches=0):
74
+ mask = np.zeros(shape=self.get_shape(), dtype=bool)
75
+ mask_count = 0
76
+ while mask_count < num_masking_patches:
77
+ max_mask_patches = num_masking_patches - mask_count
78
+ max_mask_patches = min(max_mask_patches, self.max_num_patches)
79
+
80
+ delta = self._mask(mask, max_mask_patches)
81
+ if delta == 0:
82
+ break
83
+ else:
84
+ mask_count += delta
85
+
86
+ return mask
src/dinov2/data/samplers.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import itertools
7
+ from typing import Any, Optional
8
+ import warnings
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch.utils.data.sampler import Sampler
13
+
14
+ import dinov2.distributed as distributed
15
+
16
+
17
+ class EpochSampler(Sampler):
18
+ def __init__(
19
+ self,
20
+ *,
21
+ size: int,
22
+ sample_count: int,
23
+ shuffle: bool = False,
24
+ seed: int = 0,
25
+ start: Optional[int] = None,
26
+ step: Optional[int] = None,
27
+ ):
28
+ self._size = size
29
+ self._sample_count = sample_count
30
+ self._shuffle = shuffle
31
+ self._seed = seed
32
+ self._start = distributed.get_global_rank() if start is None else start
33
+ self._step = distributed.get_global_size() if step is None else step
34
+ self._epoch = 0
35
+
36
+ def __iter__(self):
37
+ count = (self._size + self._sample_count - 1) // self._sample_count
38
+ tiled_indices = np.tile(np.arange(self._sample_count), count)
39
+ if self._shuffle:
40
+ seed = self._seed * self._epoch if self._seed != 0 else self._epoch
41
+ rng = np.random.default_rng(seed)
42
+ iterable = rng.choice(tiled_indices, self._size, replace=False)
43
+ else:
44
+ iterable = tiled_indices[: self._size]
45
+
46
+ yield from itertools.islice(iterable, self._start, None, self._step)
47
+
48
+ def __len__(self):
49
+ return (self._size - self._start + self._step - 1) // self._step
50
+
51
+ def set_epoch(self, epoch):
52
+ self._epoch = epoch
53
+
54
+
55
+ def _get_numpy_dtype(size: int) -> Any:
56
+ return np.int32 if size <= 2**31 else np.int64
57
+
58
+
59
+ def _get_torch_dtype(size: int) -> Any:
60
+ return torch.int32 if size <= 2**31 else torch.int64
61
+
62
+
63
+ def _generate_randperm_indices(*, size: int, generator: torch.Generator):
64
+ """Generate the indices of a random permutation."""
65
+ dtype = _get_torch_dtype(size)
66
+ # This is actually matching PyTorch's CPU implementation, see: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorFactories.cpp#L900-L921
67
+ perm = torch.arange(size, dtype=dtype)
68
+ for i in range(size):
69
+ j = torch.randint(i, size, size=(1,), generator=generator).item()
70
+
71
+ # Always swap even if no-op
72
+ value = perm[j].item()
73
+ perm[j] = perm[i].item()
74
+ perm[i] = value
75
+ yield value
76
+
77
+
78
+ class InfiniteSampler(Sampler):
79
+ def __init__(
80
+ self,
81
+ *,
82
+ sample_count: int,
83
+ shuffle: bool = False,
84
+ seed: int = 0,
85
+ start: Optional[int] = None,
86
+ step: Optional[int] = None,
87
+ advance: int = 0,
88
+ ):
89
+ self._sample_count = sample_count
90
+ self._seed = seed
91
+ self._shuffle = shuffle
92
+ self._start = distributed.get_global_rank() if start is None else start
93
+ self._step = distributed.get_global_size() if step is None else step
94
+ self._advance = advance
95
+
96
+ def __iter__(self):
97
+ if self._shuffle:
98
+ iterator = self._shuffled_iterator()
99
+ else:
100
+ iterator = self._iterator()
101
+
102
+ yield from itertools.islice(iterator, self._advance, None)
103
+
104
+ def _iterator(self):
105
+ assert not self._shuffle
106
+
107
+ while True:
108
+ iterable = range(self._sample_count)
109
+ yield from itertools.islice(iterable, self._start, None, self._step)
110
+
111
+ def _shuffled_iterator(self):
112
+ assert self._shuffle
113
+
114
+ # Instantiate a generator here (rather than in the ctor) to keep the class
115
+ # picklable (requirement of mp.spawn)
116
+ generator = torch.Generator().manual_seed(self._seed)
117
+
118
+ while True:
119
+ iterable = _generate_randperm_indices(size=self._sample_count, generator=generator)
120
+ yield from itertools.islice(iterable, self._start, None, self._step)
121
+
122
+
123
+ # The following function is somewhat equivalent to _new_shuffle_tensor_slice below,
124
+ # but avoids a full in-place random permutation generation.
125
+ def _shuffle_tensor_slice(
126
+ *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator
127
+ ) -> np.ndarray:
128
+ stop = len(tensor)
129
+ count = stop // step
130
+ drop_count = stop - step * count
131
+ if drop_count:
132
+ warnings.warn(f"# of dropped samples: {drop_count}")
133
+
134
+ dtype = _get_numpy_dtype(stop)
135
+ result = np.empty(count, dtype=dtype)
136
+
137
+ for i in range(count):
138
+ j = torch.randint(0, i + 1, size=(1,), generator=generator).item() if i > 0 else 0
139
+
140
+ result[i] = result[j]
141
+ result[j] = tensor[start + i * step].item()
142
+
143
+ return result
144
+
145
+
146
+ def _new_shuffle_tensor_slice(
147
+ *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator
148
+ ) -> np.ndarray:
149
+ stop = len(tensor)
150
+ count = stop // step
151
+ dtype = torch.int64 # Needed for using randperm result as indices
152
+ count = stop // step
153
+ drop_count = stop - step * count
154
+ if drop_count:
155
+ warnings.warn(f"# of dropped samples: {drop_count}")
156
+ indices = torch.randperm(count, dtype=dtype, generator=generator)
157
+ return tensor[start::step][indices].numpy()
158
+
159
+
160
+ def _make_seed(seed: int, start: int, iter_count: int) -> int:
161
+ # NOTE: Tried a few variants (including iter_count << 32), this one worked best.
162
+ return seed + start + (iter_count << 24)
163
+
164
+
165
+ class ShardedInfiniteSampler(Sampler):
166
+ def __init__(
167
+ self,
168
+ *,
169
+ sample_count: int,
170
+ shuffle: bool = False,
171
+ seed: int = 0,
172
+ start: Optional[int] = None,
173
+ step: Optional[int] = None,
174
+ advance: int = 0,
175
+ use_new_shuffle_tensor_slice: bool = False,
176
+ ):
177
+ self._sample_count = sample_count
178
+ self._seed = seed
179
+ self._shuffle = shuffle
180
+ self._start = distributed.get_global_rank() if start is None else start
181
+ self._step = distributed.get_global_size() if step is None else step
182
+ self._advance = advance
183
+ self._iter_count = 0
184
+ self._shuffle_tensor_slice_fn = (
185
+ _new_shuffle_tensor_slice if use_new_shuffle_tensor_slice else _shuffle_tensor_slice
186
+ )
187
+
188
+ def __iter__(self):
189
+ iter_count = self._advance // self._sample_count
190
+ if iter_count > 0:
191
+ self._advance -= iter_count * self._sample_count
192
+ self._iter_count += iter_count
193
+
194
+ if self._shuffle:
195
+ iterator = self._shuffled_iterator()
196
+ else:
197
+ iterator = self._iterator()
198
+
199
+ yield from itertools.islice(iterator, self._advance, None)
200
+
201
+ def _iterator(self):
202
+ assert not self._shuffle
203
+
204
+ while True:
205
+ iterable = range(self._sample_count)
206
+ yield from itertools.islice(iterable, self._start, None, self._step)
207
+
208
+ def _shuffled_iterator(self):
209
+ assert self._shuffle
210
+
211
+ # Instantiate a generator here (rather than in the ctor) to be keep the class
212
+ # picklable (requirement of mp.spawn)
213
+ generator = torch.Generator()
214
+
215
+ # Always shuffle everything first
216
+ generator.manual_seed(self._seed)
217
+ dtype = _get_torch_dtype(self._sample_count)
218
+ perm = torch.randperm(self._sample_count, dtype=dtype, generator=generator)
219
+
220
+ while True:
221
+ # Re-seed on each iteration to allow skipping whole permutations
222
+ seed = _make_seed(self._seed, self._start, self._iter_count)
223
+ generator.manual_seed(seed)
224
+
225
+ iterable = self._shuffle_tensor_slice_fn(
226
+ tensor=perm, start=self._start, step=self._step, generator=generator
227
+ )
228
+ yield from iterable
229
+ self._iter_count += 1
src/dinov2/data/transforms.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Sequence
7
+
8
+ import torch
9
+ from torchvision import transforms
10
+
11
+
12
+ class GaussianBlur(transforms.RandomApply):
13
+ """
14
+ Apply Gaussian Blur to the PIL image.
15
+ """
16
+
17
+ def __init__(self, *, p: float = 0.5, radius_min: float = 0.1, radius_max: float = 2.0):
18
+ # NOTE: torchvision is applying 1 - probability to return the original image
19
+ keep_p = 1 - p
20
+ transform = transforms.GaussianBlur(kernel_size=9, sigma=(radius_min, radius_max))
21
+ super().__init__(transforms=[transform], p=keep_p)
22
+
23
+
24
+ class MaybeToTensor(transforms.ToTensor):
25
+ """
26
+ Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor, or keep as is if already a tensor.
27
+ """
28
+
29
+ def __call__(self, pic):
30
+ """
31
+ Args:
32
+ pic (PIL Image, numpy.ndarray or torch.tensor): Image to be converted to tensor.
33
+ Returns:
34
+ Tensor: Converted image.
35
+ """
36
+ if isinstance(pic, torch.Tensor):
37
+ return pic
38
+ return super().__call__(pic)
39
+
40
+
41
+ # Use timm's names
42
+ IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
43
+ IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
44
+
45
+
46
+ def make_normalize_transform(
47
+ mean: Sequence[float] = IMAGENET_DEFAULT_MEAN,
48
+ std: Sequence[float] = IMAGENET_DEFAULT_STD,
49
+ ) -> transforms.Normalize:
50
+ return transforms.Normalize(mean=mean, std=std)
51
+
52
+
53
+ # This roughly matches torchvision's preset for classification training:
54
+ # https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L6-L44
55
+ def make_classification_train_transform(
56
+ *,
57
+ crop_size: int = 224,
58
+ interpolation=transforms.InterpolationMode.BICUBIC,
59
+ hflip_prob: float = 0.5,
60
+ mean: Sequence[float] = IMAGENET_DEFAULT_MEAN,
61
+ std: Sequence[float] = IMAGENET_DEFAULT_STD,
62
+ ):
63
+ transforms_list = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)]
64
+ if hflip_prob > 0.0:
65
+ transforms_list.append(transforms.RandomHorizontalFlip(hflip_prob))
66
+ transforms_list.extend(
67
+ [
68
+ MaybeToTensor(),
69
+ make_normalize_transform(mean=mean, std=std),
70
+ ]
71
+ )
72
+ return transforms.Compose(transforms_list)
73
+
74
+
75
+ # This matches (roughly) torchvision's preset for classification evaluation:
76
+ # https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L47-L69
77
+ def make_classification_eval_transform(
78
+ *,
79
+ resize_size: int = 256,
80
+ interpolation=transforms.InterpolationMode.BICUBIC,
81
+ crop_size: int = 224,
82
+ mean: Sequence[float] = IMAGENET_DEFAULT_MEAN,
83
+ std: Sequence[float] = IMAGENET_DEFAULT_STD,
84
+ ) -> transforms.Compose:
85
+ transforms_list = [
86
+ transforms.Resize(resize_size, interpolation=interpolation),
87
+ transforms.CenterCrop(crop_size),
88
+ MaybeToTensor(),
89
+ make_normalize_transform(mean=mean, std=std),
90
+ ]
91
+ return transforms.Compose(transforms_list)
src/dinov2/distributed/__init__.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import random
8
+ import re
9
+ import socket
10
+ from typing import Dict, List
11
+
12
+ import torch
13
+ import torch.distributed as dist
14
+
15
+ _LOCAL_RANK = -1
16
+ _LOCAL_WORLD_SIZE = -1
17
+
18
+
19
+ def is_enabled() -> bool:
20
+ """
21
+ Returns:
22
+ True if distributed training is enabled
23
+ """
24
+ return dist.is_available() and dist.is_initialized()
25
+
26
+
27
+ def get_global_size() -> int:
28
+ """
29
+ Returns:
30
+ The number of processes in the process group
31
+ """
32
+ return dist.get_world_size() if is_enabled() else 1
33
+
34
+
35
+ def get_global_rank() -> int:
36
+ """
37
+ Returns:
38
+ The rank of the current process within the global process group.
39
+ """
40
+ return dist.get_rank() if is_enabled() else 0
41
+
42
+
43
+ def get_local_rank() -> int:
44
+ """
45
+ Returns:
46
+ The rank of the current process within the local (per-machine) process group.
47
+ """
48
+ if not is_enabled():
49
+ return 0
50
+ assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE
51
+ return _LOCAL_RANK
52
+
53
+
54
+ def get_local_size() -> int:
55
+ """
56
+ Returns:
57
+ The size of the per-machine process group,
58
+ i.e. the number of processes per machine.
59
+ """
60
+ if not is_enabled():
61
+ return 1
62
+ assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE
63
+ return _LOCAL_WORLD_SIZE
64
+
65
+
66
+ def is_main_process() -> bool:
67
+ """
68
+ Returns:
69
+ True if the current process is the main one.
70
+ """
71
+ return get_global_rank() == 0
72
+
73
+
74
+ def _restrict_print_to_main_process() -> None:
75
+ """
76
+ This function disables printing when not in the main process
77
+ """
78
+ import builtins as __builtin__
79
+
80
+ builtin_print = __builtin__.print
81
+
82
+ def print(*args, **kwargs):
83
+ force = kwargs.pop("force", False)
84
+ if is_main_process() or force:
85
+ builtin_print(*args, **kwargs)
86
+
87
+ __builtin__.print = print
88
+
89
+
90
+ def _get_master_port(seed: int = 0) -> int:
91
+ MIN_MASTER_PORT, MAX_MASTER_PORT = (20_000, 60_000)
92
+
93
+ master_port_str = os.environ.get("MASTER_PORT")
94
+ if master_port_str is None:
95
+ rng = random.Random(seed)
96
+ return rng.randint(MIN_MASTER_PORT, MAX_MASTER_PORT)
97
+
98
+ return int(master_port_str)
99
+
100
+
101
+ def _get_available_port() -> int:
102
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
103
+ # A "" host address means INADDR_ANY i.e. binding to all interfaces.
104
+ # Note this is not compatible with IPv6.
105
+ s.bind(("", 0))
106
+ port = s.getsockname()[1]
107
+ return port
108
+
109
+
110
+ _TORCH_DISTRIBUTED_ENV_VARS = (
111
+ "MASTER_ADDR",
112
+ "MASTER_PORT",
113
+ "RANK",
114
+ "WORLD_SIZE",
115
+ "LOCAL_RANK",
116
+ "LOCAL_WORLD_SIZE",
117
+ )
118
+
119
+
120
+ def _collect_env_vars() -> Dict[str, str]:
121
+ return {env_var: os.environ[env_var] for env_var in _TORCH_DISTRIBUTED_ENV_VARS if env_var in os.environ}
122
+
123
+
124
+ def _is_slurm_job_process() -> bool:
125
+ return "SLURM_JOB_ID" in os.environ
126
+
127
+
128
+ def _parse_slurm_node_list(s: str) -> List[str]:
129
+ nodes = []
130
+ # Extract "hostname", "hostname[1-2,3,4-5]," substrings
131
+ p = re.compile(r"(([^\[]+)(?:\[([^\]]+)\])?),?")
132
+ for m in p.finditer(s):
133
+ prefix, suffixes = s[m.start(2) : m.end(2)], s[m.start(3) : m.end(3)]
134
+ for suffix in suffixes.split(","):
135
+ span = suffix.split("-")
136
+ if len(span) == 1:
137
+ nodes.append(prefix + suffix)
138
+ else:
139
+ width = len(span[0])
140
+ start, end = int(span[0]), int(span[1]) + 1
141
+ nodes.extend([prefix + f"{i:0{width}}" for i in range(start, end)])
142
+ return nodes
143
+
144
+
145
+ def _check_env_variable(key: str, new_value: str):
146
+ # Only check for difference with preset environment variables
147
+ if key in os.environ and os.environ[key] != new_value:
148
+ raise RuntimeError(f"Cannot export environment variables as {key} is already set")
149
+
150
+
151
+ class _TorchDistributedEnvironment:
152
+ def __init__(self):
153
+ self.master_addr = "127.0.0.1"
154
+ self.master_port = 0
155
+ self.rank = -1
156
+ self.world_size = -1
157
+ self.local_rank = -1
158
+ self.local_world_size = -1
159
+
160
+ if _is_slurm_job_process():
161
+ return self._set_from_slurm_env()
162
+
163
+ env_vars = _collect_env_vars()
164
+ if not env_vars:
165
+ # Environment is not set
166
+ pass
167
+ elif len(env_vars) == len(_TORCH_DISTRIBUTED_ENV_VARS):
168
+ # Environment is fully set
169
+ return self._set_from_preset_env()
170
+ else:
171
+ # Environment is partially set
172
+ collected_env_vars = ", ".join(env_vars.keys())
173
+ raise RuntimeError(f"Partially set environment: {collected_env_vars}")
174
+
175
+ if torch.cuda.device_count() > 0:
176
+ return self._set_from_local()
177
+
178
+ raise RuntimeError("Can't initialize PyTorch distributed environment")
179
+
180
+ # Slurm job created with sbatch, submitit, etc...
181
+ def _set_from_slurm_env(self):
182
+ # logger.info("Initialization from Slurm environment")
183
+ job_id = int(os.environ["SLURM_JOB_ID"])
184
+ node_count = int(os.environ["SLURM_JOB_NUM_NODES"])
185
+ nodes = _parse_slurm_node_list(os.environ["SLURM_JOB_NODELIST"])
186
+ assert len(nodes) == node_count
187
+
188
+ self.master_addr = nodes[0]
189
+ self.master_port = _get_master_port(seed=job_id)
190
+ self.rank = int(os.environ["SLURM_PROCID"])
191
+ self.world_size = int(os.environ["SLURM_NTASKS"])
192
+ assert self.rank < self.world_size
193
+ self.local_rank = int(os.environ["SLURM_LOCALID"])
194
+ self.local_world_size = self.world_size // node_count
195
+ assert self.local_rank < self.local_world_size
196
+
197
+ # Single node job with preset environment (i.e. torchrun)
198
+ def _set_from_preset_env(self):
199
+ # logger.info("Initialization from preset environment")
200
+ self.master_addr = os.environ["MASTER_ADDR"]
201
+ self.master_port = os.environ["MASTER_PORT"]
202
+ self.rank = int(os.environ["RANK"])
203
+ self.world_size = int(os.environ["WORLD_SIZE"])
204
+ assert self.rank < self.world_size
205
+ self.local_rank = int(os.environ["LOCAL_RANK"])
206
+ self.local_world_size = int(os.environ["LOCAL_WORLD_SIZE"])
207
+ assert self.local_rank < self.local_world_size
208
+
209
+ # Single node and GPU job (i.e. local script run)
210
+ def _set_from_local(self):
211
+ # logger.info("Initialization from local")
212
+ self.master_addr = "127.0.0.1"
213
+ self.master_port = _get_available_port()
214
+ self.rank = 0
215
+ self.world_size = 1
216
+ self.local_rank = 0
217
+ self.local_world_size = 1
218
+
219
+ def export(self, *, overwrite: bool) -> "_TorchDistributedEnvironment":
220
+ # See the "Environment variable initialization" section from
221
+ # https://pytorch.org/docs/stable/distributed.html for the complete list of
222
+ # environment variables required for the env:// initialization method.
223
+ env_vars = {
224
+ "MASTER_ADDR": self.master_addr,
225
+ "MASTER_PORT": str(self.master_port),
226
+ "RANK": str(self.rank),
227
+ "WORLD_SIZE": str(self.world_size),
228
+ "LOCAL_RANK": str(self.local_rank),
229
+ "LOCAL_WORLD_SIZE": str(self.local_world_size),
230
+ }
231
+ if not overwrite:
232
+ for k, v in env_vars.items():
233
+ _check_env_variable(k, v)
234
+
235
+ os.environ.update(env_vars)
236
+ return self
237
+
238
+
239
+ def enable(*, set_cuda_current_device: bool = True, overwrite: bool = False, allow_nccl_timeout: bool = False):
240
+ """Enable distributed mode
241
+
242
+ Args:
243
+ set_cuda_current_device: If True, call torch.cuda.set_device() to set the
244
+ current PyTorch CUDA device to the one matching the local rank.
245
+ overwrite: If True, overwrites already set variables. Else fails.
246
+ """
247
+
248
+ global _LOCAL_RANK, _LOCAL_WORLD_SIZE
249
+ if _LOCAL_RANK >= 0 or _LOCAL_WORLD_SIZE >= 0:
250
+ raise RuntimeError("Distributed mode has already been enabled")
251
+ torch_env = _TorchDistributedEnvironment()
252
+ torch_env.export(overwrite=overwrite)
253
+
254
+ if set_cuda_current_device:
255
+ torch.cuda.set_device(torch_env.local_rank)
256
+
257
+ if allow_nccl_timeout:
258
+ # This allows to use torch distributed timeout in a NCCL backend
259
+ key, value = "NCCL_ASYNC_ERROR_HANDLING", "1"
260
+ if not overwrite:
261
+ _check_env_variable(key, value)
262
+ os.environ[key] = value
263
+
264
+ dist.init_process_group(backend="nccl")
265
+ dist.barrier()
266
+
267
+ # Finalize setup
268
+ _LOCAL_RANK = torch_env.local_rank
269
+ _LOCAL_WORLD_SIZE = torch_env.local_world_size
270
+ _restrict_print_to_main_process()
src/dinov2/eval/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
src/dinov2/eval/depth/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
src/dinov2/eval/depth/models/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .backbones import * # noqa: F403
7
+ from .builder import BACKBONES, DEPTHER, HEADS, LOSSES, build_backbone, build_depther, build_head, build_loss
8
+ from .decode_heads import * # noqa: F403
9
+ from .depther import * # noqa: F403
10
+ from .losses import * # noqa: F403
src/dinov2/eval/depth/models/backbones/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .vision_transformer import DinoVisionTransformer
src/dinov2/eval/depth/models/backbones/vision_transformer.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from mmcv.runner import BaseModule
7
+
8
+ from ..builder import BACKBONES
9
+
10
+
11
+ @BACKBONES.register_module()
12
+ class DinoVisionTransformer(BaseModule):
13
+ """Vision Transformer."""
14
+
15
+ def __init__(self, *args, **kwargs):
16
+ super().__init__()
src/dinov2/eval/depth/models/builder.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import warnings
7
+
8
+ from mmcv.cnn import MODELS as MMCV_MODELS
9
+ from mmcv.cnn.bricks.registry import ATTENTION as MMCV_ATTENTION
10
+ from mmcv.utils import Registry
11
+
12
+ MODELS = Registry("models", parent=MMCV_MODELS)
13
+ ATTENTION = Registry("attention", parent=MMCV_ATTENTION)
14
+
15
+
16
+ BACKBONES = MODELS
17
+ NECKS = MODELS
18
+ HEADS = MODELS
19
+ LOSSES = MODELS
20
+ DEPTHER = MODELS
21
+
22
+
23
+ def build_backbone(cfg):
24
+ """Build backbone."""
25
+ return BACKBONES.build(cfg)
26
+
27
+
28
+ def build_neck(cfg):
29
+ """Build neck."""
30
+ return NECKS.build(cfg)
31
+
32
+
33
+ def build_head(cfg):
34
+ """Build head."""
35
+ return HEADS.build(cfg)
36
+
37
+
38
+ def build_loss(cfg):
39
+ """Build loss."""
40
+ return LOSSES.build(cfg)
41
+
42
+
43
+ def build_depther(cfg, train_cfg=None, test_cfg=None):
44
+ """Build depther."""
45
+ if train_cfg is not None or test_cfg is not None:
46
+ warnings.warn("train_cfg and test_cfg is deprecated, " "please specify them in model", UserWarning)
47
+ assert cfg.get("train_cfg") is None or train_cfg is None, "train_cfg specified in both outer field and model field "
48
+ assert cfg.get("test_cfg") is None or test_cfg is None, "test_cfg specified in both outer field and model field "
49
+ return DEPTHER.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg))
src/dinov2/eval/depth/models/decode_heads/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .dpt_head import DPTHead
7
+ from .linear_head import BNHead
src/dinov2/eval/depth/models/decode_heads/decode_head.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import copy
7
+ from abc import ABCMeta, abstractmethod
8
+
9
+ import mmcv
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn as nn
13
+ from mmcv.runner import BaseModule, auto_fp16, force_fp32
14
+
15
+ from ...ops import resize
16
+ from ..builder import build_loss
17
+
18
+
19
+ class DepthBaseDecodeHead(BaseModule, metaclass=ABCMeta):
20
+ """Base class for BaseDecodeHead.
21
+
22
+ Args:
23
+ in_channels (List): Input channels.
24
+ channels (int): Channels after modules, before conv_depth.
25
+ conv_cfg (dict|None): Config of conv layers. Default: None.
26
+ act_cfg (dict): Config of activation layers.
27
+ Default: dict(type='ReLU')
28
+ loss_decode (dict): Config of decode loss.
29
+ Default: dict(type='SigLoss').
30
+ sampler (dict|None): The config of depth map sampler.
31
+ Default: None.
32
+ align_corners (bool): align_corners argument of F.interpolate.
33
+ Default: False.
34
+ min_depth (int): Min depth in dataset setting.
35
+ Default: 1e-3.
36
+ max_depth (int): Max depth in dataset setting.
37
+ Default: None.
38
+ norm_cfg (dict|None): Config of norm layers.
39
+ Default: None.
40
+ classify (bool): Whether predict depth in a cls.-reg. manner.
41
+ Default: False.
42
+ n_bins (int): The number of bins used in cls. step.
43
+ Default: 256.
44
+ bins_strategy (str): The discrete strategy used in cls. step.
45
+ Default: 'UD'.
46
+ norm_strategy (str): The norm strategy on cls. probability
47
+ distribution. Default: 'linear'
48
+ scale_up (str): Whether predict depth in a scale-up manner.
49
+ Default: False.
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ in_channels,
55
+ channels=96,
56
+ conv_cfg=None,
57
+ act_cfg=dict(type="ReLU"),
58
+ loss_decode=dict(type="SigLoss", valid_mask=True, loss_weight=10),
59
+ sampler=None,
60
+ align_corners=False,
61
+ min_depth=1e-3,
62
+ max_depth=None,
63
+ norm_cfg=None,
64
+ classify=False,
65
+ n_bins=256,
66
+ bins_strategy="UD",
67
+ norm_strategy="linear",
68
+ scale_up=False,
69
+ ):
70
+ super(DepthBaseDecodeHead, self).__init__()
71
+
72
+ self.in_channels = in_channels
73
+ self.channels = channels
74
+ self.conv_cfg = conv_cfg
75
+ self.act_cfg = act_cfg
76
+ if isinstance(loss_decode, dict):
77
+ self.loss_decode = build_loss(loss_decode)
78
+ elif isinstance(loss_decode, (list, tuple)):
79
+ self.loss_decode = nn.ModuleList()
80
+ for loss in loss_decode:
81
+ self.loss_decode.append(build_loss(loss))
82
+ self.align_corners = align_corners
83
+ self.min_depth = min_depth
84
+ self.max_depth = max_depth
85
+ self.norm_cfg = norm_cfg
86
+ self.classify = classify
87
+ self.n_bins = n_bins
88
+ self.scale_up = scale_up
89
+
90
+ if self.classify:
91
+ assert bins_strategy in ["UD", "SID"], "Support bins_strategy: UD, SID"
92
+ assert norm_strategy in ["linear", "softmax", "sigmoid"], "Support norm_strategy: linear, softmax, sigmoid"
93
+
94
+ self.bins_strategy = bins_strategy
95
+ self.norm_strategy = norm_strategy
96
+ self.softmax = nn.Softmax(dim=1)
97
+ self.conv_depth = nn.Conv2d(channels, n_bins, kernel_size=3, padding=1, stride=1)
98
+ else:
99
+ self.conv_depth = nn.Conv2d(channels, 1, kernel_size=3, padding=1, stride=1)
100
+
101
+ self.fp16_enabled = False
102
+ self.relu = nn.ReLU()
103
+ self.sigmoid = nn.Sigmoid()
104
+
105
+ def extra_repr(self):
106
+ """Extra repr."""
107
+ s = f"align_corners={self.align_corners}"
108
+ return s
109
+
110
+ @auto_fp16()
111
+ @abstractmethod
112
+ def forward(self, inputs, img_metas):
113
+ """Placeholder of forward function."""
114
+ pass
115
+
116
+ def forward_train(self, img, inputs, img_metas, depth_gt, train_cfg):
117
+ """Forward function for training.
118
+ Args:
119
+ inputs (list[Tensor]): List of multi-level img features.
120
+ img_metas (list[dict]): List of image info dict where each dict
121
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
122
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
123
+ For details on the values of these keys see
124
+ `depth/datasets/pipelines/formatting.py:Collect`.
125
+ depth_gt (Tensor): GT depth
126
+ train_cfg (dict): The training config.
127
+
128
+ Returns:
129
+ dict[str, Tensor]: a dictionary of loss components
130
+ """
131
+ depth_pred = self.forward(inputs, img_metas)
132
+ losses = self.losses(depth_pred, depth_gt)
133
+
134
+ log_imgs = self.log_images(img[0], depth_pred[0], depth_gt[0], img_metas[0])
135
+ losses.update(**log_imgs)
136
+
137
+ return losses
138
+
139
+ def forward_test(self, inputs, img_metas, test_cfg):
140
+ """Forward function for testing.
141
+ Args:
142
+ inputs (list[Tensor]): List of multi-level img features.
143
+ img_metas (list[dict]): List of image info dict where each dict
144
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
145
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
146
+ For details on the values of these keys see
147
+ `depth/datasets/pipelines/formatting.py:Collect`.
148
+ test_cfg (dict): The testing config.
149
+
150
+ Returns:
151
+ Tensor: Output depth map.
152
+ """
153
+ return self.forward(inputs, img_metas)
154
+
155
+ def depth_pred(self, feat):
156
+ """Prediction each pixel."""
157
+ if self.classify:
158
+ logit = self.conv_depth(feat)
159
+
160
+ if self.bins_strategy == "UD":
161
+ bins = torch.linspace(self.min_depth, self.max_depth, self.n_bins, device=feat.device)
162
+ elif self.bins_strategy == "SID":
163
+ bins = torch.logspace(self.min_depth, self.max_depth, self.n_bins, device=feat.device)
164
+
165
+ # following Adabins, default linear
166
+ if self.norm_strategy == "linear":
167
+ logit = torch.relu(logit)
168
+ eps = 0.1
169
+ logit = logit + eps
170
+ logit = logit / logit.sum(dim=1, keepdim=True)
171
+ elif self.norm_strategy == "softmax":
172
+ logit = torch.softmax(logit, dim=1)
173
+ elif self.norm_strategy == "sigmoid":
174
+ logit = torch.sigmoid(logit)
175
+ logit = logit / logit.sum(dim=1, keepdim=True)
176
+
177
+ output = torch.einsum("ikmn,k->imn", [logit, bins]).unsqueeze(dim=1)
178
+
179
+ else:
180
+ if self.scale_up:
181
+ output = self.sigmoid(self.conv_depth(feat)) * self.max_depth
182
+ else:
183
+ output = self.relu(self.conv_depth(feat)) + self.min_depth
184
+ return output
185
+
186
+ @force_fp32(apply_to=("depth_pred",))
187
+ def losses(self, depth_pred, depth_gt):
188
+ """Compute depth loss."""
189
+ loss = dict()
190
+ depth_pred = resize(
191
+ input=depth_pred, size=depth_gt.shape[2:], mode="bilinear", align_corners=self.align_corners, warning=False
192
+ )
193
+ if not isinstance(self.loss_decode, nn.ModuleList):
194
+ losses_decode = [self.loss_decode]
195
+ else:
196
+ losses_decode = self.loss_decode
197
+ for loss_decode in losses_decode:
198
+ if loss_decode.loss_name not in loss:
199
+ loss[loss_decode.loss_name] = loss_decode(depth_pred, depth_gt)
200
+ else:
201
+ loss[loss_decode.loss_name] += loss_decode(depth_pred, depth_gt)
202
+ return loss
203
+
204
+ def log_images(self, img_path, depth_pred, depth_gt, img_meta):
205
+ show_img = copy.deepcopy(img_path.detach().cpu().permute(1, 2, 0))
206
+ show_img = show_img.numpy().astype(np.float32)
207
+ show_img = mmcv.imdenormalize(
208
+ show_img,
209
+ img_meta["img_norm_cfg"]["mean"],
210
+ img_meta["img_norm_cfg"]["std"],
211
+ img_meta["img_norm_cfg"]["to_rgb"],
212
+ )
213
+ show_img = np.clip(show_img, 0, 255)
214
+ show_img = show_img.astype(np.uint8)
215
+ show_img = show_img[:, :, ::-1]
216
+ show_img = show_img.transpose(0, 2, 1)
217
+ show_img = show_img.transpose(1, 0, 2)
218
+
219
+ depth_pred = depth_pred / torch.max(depth_pred)
220
+ depth_gt = depth_gt / torch.max(depth_gt)
221
+
222
+ depth_pred_color = copy.deepcopy(depth_pred.detach().cpu())
223
+ depth_gt_color = copy.deepcopy(depth_gt.detach().cpu())
224
+
225
+ return {"img_rgb": show_img, "img_depth_pred": depth_pred_color, "img_depth_gt": depth_gt_color}
src/dinov2/eval/depth/models/decode_heads/dpt_head.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from mmcv.cnn import ConvModule, Linear, build_activation_layer
11
+ from mmcv.runner import BaseModule
12
+
13
+ from ...ops import resize
14
+ from ..builder import HEADS
15
+ from .decode_head import DepthBaseDecodeHead
16
+
17
+
18
+ class Interpolate(nn.Module):
19
+ def __init__(self, scale_factor, mode, align_corners=False):
20
+ super(Interpolate, self).__init__()
21
+ self.interp = nn.functional.interpolate
22
+ self.scale_factor = scale_factor
23
+ self.mode = mode
24
+ self.align_corners = align_corners
25
+
26
+ def forward(self, x):
27
+ x = self.interp(x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners)
28
+ return x
29
+
30
+
31
+ class HeadDepth(nn.Module):
32
+ def __init__(self, features):
33
+ super(HeadDepth, self).__init__()
34
+ self.head = nn.Sequential(
35
+ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
36
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
37
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
38
+ nn.ReLU(),
39
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
40
+ )
41
+
42
+ def forward(self, x):
43
+ x = self.head(x)
44
+ return x
45
+
46
+
47
+ class ReassembleBlocks(BaseModule):
48
+ """ViTPostProcessBlock, process cls_token in ViT backbone output and
49
+ rearrange the feature vector to feature map.
50
+ Args:
51
+ in_channels (int): ViT feature channels. Default: 768.
52
+ out_channels (List): output channels of each stage.
53
+ Default: [96, 192, 384, 768].
54
+ readout_type (str): Type of readout operation. Default: 'ignore'.
55
+ patch_size (int): The patch size. Default: 16.
56
+ init_cfg (dict, optional): Initialization config dict. Default: None.
57
+ """
58
+
59
+ def __init__(
60
+ self, in_channels=768, out_channels=[96, 192, 384, 768], readout_type="ignore", patch_size=16, init_cfg=None
61
+ ):
62
+ super(ReassembleBlocks, self).__init__(init_cfg)
63
+
64
+ assert readout_type in ["ignore", "add", "project"]
65
+ self.readout_type = readout_type
66
+ self.patch_size = patch_size
67
+
68
+ self.projects = nn.ModuleList(
69
+ [
70
+ ConvModule(
71
+ in_channels=in_channels,
72
+ out_channels=out_channel,
73
+ kernel_size=1,
74
+ act_cfg=None,
75
+ )
76
+ for out_channel in out_channels
77
+ ]
78
+ )
79
+
80
+ self.resize_layers = nn.ModuleList(
81
+ [
82
+ nn.ConvTranspose2d(
83
+ in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0
84
+ ),
85
+ nn.ConvTranspose2d(
86
+ in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0
87
+ ),
88
+ nn.Identity(),
89
+ nn.Conv2d(
90
+ in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1
91
+ ),
92
+ ]
93
+ )
94
+ if self.readout_type == "project":
95
+ self.readout_projects = nn.ModuleList()
96
+ for _ in range(len(self.projects)):
97
+ self.readout_projects.append(
98
+ nn.Sequential(Linear(2 * in_channels, in_channels), build_activation_layer(dict(type="GELU")))
99
+ )
100
+
101
+ def forward(self, inputs):
102
+ assert isinstance(inputs, list)
103
+ out = []
104
+ for i, x in enumerate(inputs):
105
+ assert len(x) == 2
106
+ x, cls_token = x[0], x[1]
107
+ feature_shape = x.shape
108
+ if self.readout_type == "project":
109
+ x = x.flatten(2).permute((0, 2, 1))
110
+ readout = cls_token.unsqueeze(1).expand_as(x)
111
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
112
+ x = x.permute(0, 2, 1).reshape(feature_shape)
113
+ elif self.readout_type == "add":
114
+ x = x.flatten(2) + cls_token.unsqueeze(-1)
115
+ x = x.reshape(feature_shape)
116
+ else:
117
+ pass
118
+ x = self.projects[i](x)
119
+ x = self.resize_layers[i](x)
120
+ out.append(x)
121
+ return out
122
+
123
+
124
+ class PreActResidualConvUnit(BaseModule):
125
+ """ResidualConvUnit, pre-activate residual unit.
126
+ Args:
127
+ in_channels (int): number of channels in the input feature map.
128
+ act_cfg (dict): dictionary to construct and config activation layer.
129
+ norm_cfg (dict): dictionary to construct and config norm layer.
130
+ stride (int): stride of the first block. Default: 1
131
+ dilation (int): dilation rate for convs layers. Default: 1.
132
+ init_cfg (dict, optional): Initialization config dict. Default: None.
133
+ """
134
+
135
+ def __init__(self, in_channels, act_cfg, norm_cfg, stride=1, dilation=1, init_cfg=None):
136
+ super(PreActResidualConvUnit, self).__init__(init_cfg)
137
+
138
+ self.conv1 = ConvModule(
139
+ in_channels,
140
+ in_channels,
141
+ 3,
142
+ stride=stride,
143
+ padding=dilation,
144
+ dilation=dilation,
145
+ norm_cfg=norm_cfg,
146
+ act_cfg=act_cfg,
147
+ bias=False,
148
+ order=("act", "conv", "norm"),
149
+ )
150
+
151
+ self.conv2 = ConvModule(
152
+ in_channels,
153
+ in_channels,
154
+ 3,
155
+ padding=1,
156
+ norm_cfg=norm_cfg,
157
+ act_cfg=act_cfg,
158
+ bias=False,
159
+ order=("act", "conv", "norm"),
160
+ )
161
+
162
+ def forward(self, inputs):
163
+ inputs_ = inputs.clone()
164
+ x = self.conv1(inputs)
165
+ x = self.conv2(x)
166
+ return x + inputs_
167
+
168
+
169
+ class FeatureFusionBlock(BaseModule):
170
+ """FeatureFusionBlock, merge feature map from different stages.
171
+ Args:
172
+ in_channels (int): Input channels.
173
+ act_cfg (dict): The activation config for ResidualConvUnit.
174
+ norm_cfg (dict): Config dict for normalization layer.
175
+ expand (bool): Whether expand the channels in post process block.
176
+ Default: False.
177
+ align_corners (bool): align_corner setting for bilinear upsample.
178
+ Default: True.
179
+ init_cfg (dict, optional): Initialization config dict. Default: None.
180
+ """
181
+
182
+ def __init__(self, in_channels, act_cfg, norm_cfg, expand=False, align_corners=True, init_cfg=None):
183
+ super(FeatureFusionBlock, self).__init__(init_cfg)
184
+
185
+ self.in_channels = in_channels
186
+ self.expand = expand
187
+ self.align_corners = align_corners
188
+
189
+ self.out_channels = in_channels
190
+ if self.expand:
191
+ self.out_channels = in_channels // 2
192
+
193
+ self.project = ConvModule(self.in_channels, self.out_channels, kernel_size=1, act_cfg=None, bias=True)
194
+
195
+ self.res_conv_unit1 = PreActResidualConvUnit(in_channels=self.in_channels, act_cfg=act_cfg, norm_cfg=norm_cfg)
196
+ self.res_conv_unit2 = PreActResidualConvUnit(in_channels=self.in_channels, act_cfg=act_cfg, norm_cfg=norm_cfg)
197
+
198
+ def forward(self, *inputs):
199
+ x = inputs[0]
200
+ if len(inputs) == 2:
201
+ if x.shape != inputs[1].shape:
202
+ res = resize(inputs[1], size=(x.shape[2], x.shape[3]), mode="bilinear", align_corners=False)
203
+ else:
204
+ res = inputs[1]
205
+ x = x + self.res_conv_unit1(res)
206
+ x = self.res_conv_unit2(x)
207
+ x = resize(x, scale_factor=2, mode="bilinear", align_corners=self.align_corners)
208
+ x = self.project(x)
209
+ return x
210
+
211
+
212
+ @HEADS.register_module()
213
+ class DPTHead(DepthBaseDecodeHead):
214
+ """Vision Transformers for Dense Prediction.
215
+ This head is implemented of `DPT <https://arxiv.org/abs/2103.13413>`_.
216
+ Args:
217
+ embed_dims (int): The embed dimension of the ViT backbone.
218
+ Default: 768.
219
+ post_process_channels (List): Out channels of post process conv
220
+ layers. Default: [96, 192, 384, 768].
221
+ readout_type (str): Type of readout operation. Default: 'ignore'.
222
+ patch_size (int): The patch size. Default: 16.
223
+ expand_channels (bool): Whether expand the channels in post process
224
+ block. Default: False.
225
+ """
226
+
227
+ def __init__(
228
+ self,
229
+ embed_dims=768,
230
+ post_process_channels=[96, 192, 384, 768],
231
+ readout_type="ignore",
232
+ patch_size=16,
233
+ expand_channels=False,
234
+ **kwargs
235
+ ):
236
+ super(DPTHead, self).__init__(**kwargs)
237
+
238
+ self.in_channels = self.in_channels
239
+ self.expand_channels = expand_channels
240
+ self.reassemble_blocks = ReassembleBlocks(embed_dims, post_process_channels, readout_type, patch_size)
241
+
242
+ self.post_process_channels = [
243
+ channel * math.pow(2, i) if expand_channels else channel for i, channel in enumerate(post_process_channels)
244
+ ]
245
+ self.convs = nn.ModuleList()
246
+ for channel in self.post_process_channels:
247
+ self.convs.append(ConvModule(channel, self.channels, kernel_size=3, padding=1, act_cfg=None, bias=False))
248
+ self.fusion_blocks = nn.ModuleList()
249
+ for _ in range(len(self.convs)):
250
+ self.fusion_blocks.append(FeatureFusionBlock(self.channels, self.act_cfg, self.norm_cfg))
251
+ self.fusion_blocks[0].res_conv_unit1 = None
252
+ self.project = ConvModule(self.channels, self.channels, kernel_size=3, padding=1, norm_cfg=self.norm_cfg)
253
+ self.num_fusion_blocks = len(self.fusion_blocks)
254
+ self.num_reassemble_blocks = len(self.reassemble_blocks.resize_layers)
255
+ self.num_post_process_channels = len(self.post_process_channels)
256
+ assert self.num_fusion_blocks == self.num_reassemble_blocks
257
+ assert self.num_reassemble_blocks == self.num_post_process_channels
258
+ self.conv_depth = HeadDepth(self.channels)
259
+
260
+ def forward(self, inputs, img_metas):
261
+ assert len(inputs) == self.num_reassemble_blocks
262
+ x = [inp for inp in inputs]
263
+ x = self.reassemble_blocks(x)
264
+ x = [self.convs[i](feature) for i, feature in enumerate(x)]
265
+ out = self.fusion_blocks[0](x[-1])
266
+ for i in range(1, len(self.fusion_blocks)):
267
+ out = self.fusion_blocks[i](out, x[-(i + 1)])
268
+ out = self.project(out)
269
+ out = self.depth_pred(out)
270
+ return out
src/dinov2/eval/depth/models/decode_heads/linear_head.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from ...ops import resize
10
+ from ..builder import HEADS
11
+ from .decode_head import DepthBaseDecodeHead
12
+
13
+
14
+ @HEADS.register_module()
15
+ class BNHead(DepthBaseDecodeHead):
16
+ """Just a batchnorm."""
17
+
18
+ def __init__(self, input_transform="resize_concat", in_index=(0, 1, 2, 3), upsample=1, **kwargs):
19
+ super().__init__(**kwargs)
20
+ self.input_transform = input_transform
21
+ self.in_index = in_index
22
+ self.upsample = upsample
23
+ # self.bn = nn.SyncBatchNorm(self.in_channels)
24
+ if self.classify:
25
+ self.conv_depth = nn.Conv2d(self.channels, self.n_bins, kernel_size=1, padding=0, stride=1)
26
+ else:
27
+ self.conv_depth = nn.Conv2d(self.channels, 1, kernel_size=1, padding=0, stride=1)
28
+
29
+ def _transform_inputs(self, inputs):
30
+ """Transform inputs for decoder.
31
+ Args:
32
+ inputs (list[Tensor]): List of multi-level img features.
33
+ Returns:
34
+ Tensor: The transformed inputs
35
+ """
36
+
37
+ if "concat" in self.input_transform:
38
+ inputs = [inputs[i] for i in self.in_index]
39
+ if "resize" in self.input_transform:
40
+ inputs = [
41
+ resize(
42
+ input=x,
43
+ size=[s * self.upsample for s in inputs[0].shape[2:]],
44
+ mode="bilinear",
45
+ align_corners=self.align_corners,
46
+ )
47
+ for x in inputs
48
+ ]
49
+ inputs = torch.cat(inputs, dim=1)
50
+ elif self.input_transform == "multiple_select":
51
+ inputs = [inputs[i] for i in self.in_index]
52
+ else:
53
+ inputs = inputs[self.in_index]
54
+
55
+ return inputs
56
+
57
+ def _forward_feature(self, inputs, img_metas=None, **kwargs):
58
+ """Forward function for feature maps before classifying each pixel with
59
+ ``self.cls_seg`` fc.
60
+ Args:
61
+ inputs (list[Tensor]): List of multi-level img features.
62
+ Returns:
63
+ feats (Tensor): A tensor of shape (batch_size, self.channels,
64
+ H, W) which is feature map for last layer of decoder head.
65
+ """
66
+ # accept lists (for cls token)
67
+ inputs = list(inputs)
68
+ for i, x in enumerate(inputs):
69
+ if len(x) == 2:
70
+ x, cls_token = x[0], x[1]
71
+ if len(x.shape) == 2:
72
+ x = x[:, :, None, None]
73
+ cls_token = cls_token[:, :, None, None].expand_as(x)
74
+ inputs[i] = torch.cat((x, cls_token), 1)
75
+ else:
76
+ x = x[0]
77
+ if len(x.shape) == 2:
78
+ x = x[:, :, None, None]
79
+ inputs[i] = x
80
+ x = self._transform_inputs(inputs)
81
+ # feats = self.bn(x)
82
+ return x
83
+
84
+ def forward(self, inputs, img_metas=None, **kwargs):
85
+ """Forward function."""
86
+ output = self._forward_feature(inputs, img_metas=img_metas, **kwargs)
87
+ output = self.depth_pred(output)
88
+
89
+ return output
src/dinov2/eval/depth/models/depther/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .base import BaseDepther
7
+ from .encoder_decoder import DepthEncoderDecoder
src/dinov2/eval/depth/models/depther/base.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from abc import ABCMeta, abstractmethod
7
+ from collections import OrderedDict
8
+
9
+ import torch
10
+ import torch.distributed as dist
11
+ from mmcv.runner import BaseModule, auto_fp16
12
+
13
+
14
+ class BaseDepther(BaseModule, metaclass=ABCMeta):
15
+ """Base class for depther."""
16
+
17
+ def __init__(self, init_cfg=None):
18
+ super(BaseDepther, self).__init__(init_cfg)
19
+ self.fp16_enabled = False
20
+
21
+ @property
22
+ def with_neck(self):
23
+ """bool: whether the depther has neck"""
24
+ return hasattr(self, "neck") and self.neck is not None
25
+
26
+ @property
27
+ def with_auxiliary_head(self):
28
+ """bool: whether the depther has auxiliary head"""
29
+ return hasattr(self, "auxiliary_head") and self.auxiliary_head is not None
30
+
31
+ @property
32
+ def with_decode_head(self):
33
+ """bool: whether the depther has decode head"""
34
+ return hasattr(self, "decode_head") and self.decode_head is not None
35
+
36
+ @abstractmethod
37
+ def extract_feat(self, imgs):
38
+ """Placeholder for extract features from images."""
39
+ pass
40
+
41
+ @abstractmethod
42
+ def encode_decode(self, img, img_metas):
43
+ """Placeholder for encode images with backbone and decode into a
44
+ semantic depth map of the same size as input."""
45
+ pass
46
+
47
+ @abstractmethod
48
+ def forward_train(self, imgs, img_metas, **kwargs):
49
+ """Placeholder for Forward function for training."""
50
+ pass
51
+
52
+ @abstractmethod
53
+ def simple_test(self, img, img_meta, **kwargs):
54
+ """Placeholder for single image test."""
55
+ pass
56
+
57
+ @abstractmethod
58
+ def aug_test(self, imgs, img_metas, **kwargs):
59
+ """Placeholder for augmentation test."""
60
+ pass
61
+
62
+ def forward_test(self, imgs, img_metas, **kwargs):
63
+ """
64
+ Args:
65
+ imgs (List[Tensor]): the outer list indicates test-time
66
+ augmentations and inner Tensor should have a shape NxCxHxW,
67
+ which contains all images in the batch.
68
+ img_metas (List[List[dict]]): the outer list indicates test-time
69
+ augs (multiscale, flip, etc.) and the inner list indicates
70
+ images in a batch.
71
+ """
72
+ for var, name in [(imgs, "imgs"), (img_metas, "img_metas")]:
73
+ if not isinstance(var, list):
74
+ raise TypeError(f"{name} must be a list, but got " f"{type(var)}")
75
+ num_augs = len(imgs)
76
+ if num_augs != len(img_metas):
77
+ raise ValueError(f"num of augmentations ({len(imgs)}) != " f"num of image meta ({len(img_metas)})")
78
+ # all images in the same aug batch all of the same ori_shape and pad
79
+ # shape
80
+ for img_meta in img_metas:
81
+ ori_shapes = [_["ori_shape"] for _ in img_meta]
82
+ assert all(shape == ori_shapes[0] for shape in ori_shapes)
83
+ img_shapes = [_["img_shape"] for _ in img_meta]
84
+ assert all(shape == img_shapes[0] for shape in img_shapes)
85
+ pad_shapes = [_["pad_shape"] for _ in img_meta]
86
+ assert all(shape == pad_shapes[0] for shape in pad_shapes)
87
+
88
+ if num_augs == 1:
89
+ return self.simple_test(imgs[0], img_metas[0], **kwargs)
90
+ else:
91
+ return self.aug_test(imgs, img_metas, **kwargs)
92
+
93
+ @auto_fp16(apply_to=("img",))
94
+ def forward(self, img, img_metas, return_loss=True, **kwargs):
95
+ """Calls either :func:`forward_train` or :func:`forward_test` depending
96
+ on whether ``return_loss`` is ``True``.
97
+
98
+ Note this setting will change the expected inputs. When
99
+ ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
100
+ and List[dict]), and when ``resturn_loss=False``, img and img_meta
101
+ should be double nested (i.e. List[Tensor], List[List[dict]]), with
102
+ the outer list indicating test time augmentations.
103
+ """
104
+ if return_loss:
105
+ return self.forward_train(img, img_metas, **kwargs)
106
+ else:
107
+ return self.forward_test(img, img_metas, **kwargs)
108
+
109
+ def train_step(self, data_batch, optimizer, **kwargs):
110
+ """The iteration step during training.
111
+
112
+ This method defines an iteration step during training, except for the
113
+ back propagation and optimizer updating, which are done in an optimizer
114
+ hook. Note that in some complicated cases or models, the whole process
115
+ including back propagation and optimizer updating is also defined in
116
+ this method, such as GAN.
117
+
118
+ Args:
119
+ data (dict): The output of dataloader.
120
+ optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
121
+ runner is passed to ``train_step()``. This argument is unused
122
+ and reserved.
123
+
124
+ Returns:
125
+ dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
126
+ ``num_samples``.
127
+ ``loss`` is a tensor for back propagation, which can be a
128
+ weighted sum of multiple losses.
129
+ ``log_vars`` contains all the variables to be sent to the
130
+ logger.
131
+ ``num_samples`` indicates the batch size (when the model is
132
+ DDP, it means the batch size on each GPU), which is used for
133
+ averaging the logs.
134
+ """
135
+ losses = self(**data_batch)
136
+
137
+ # split losses and images
138
+ real_losses = {}
139
+ log_imgs = {}
140
+ for k, v in losses.items():
141
+ if "img" in k:
142
+ log_imgs[k] = v
143
+ else:
144
+ real_losses[k] = v
145
+
146
+ loss, log_vars = self._parse_losses(real_losses)
147
+
148
+ outputs = dict(loss=loss, log_vars=log_vars, num_samples=len(data_batch["img_metas"]), log_imgs=log_imgs)
149
+
150
+ return outputs
151
+
152
+ def val_step(self, data_batch, **kwargs):
153
+ """The iteration step during validation.
154
+
155
+ This method shares the same signature as :func:`train_step`, but used
156
+ during val epochs. Note that the evaluation after training epochs is
157
+ not implemented with this method, but an evaluation hook.
158
+ """
159
+ output = self(**data_batch, **kwargs)
160
+ return output
161
+
162
+ @staticmethod
163
+ def _parse_losses(losses):
164
+ """Parse the raw outputs (losses) of the network.
165
+
166
+ Args:
167
+ losses (dict): Raw output of the network, which usually contain
168
+ losses and other necessary information.
169
+
170
+ Returns:
171
+ tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor
172
+ which may be a weighted sum of all losses, log_vars contains
173
+ all the variables to be sent to the logger.
174
+ """
175
+ log_vars = OrderedDict()
176
+ for loss_name, loss_value in losses.items():
177
+ if isinstance(loss_value, torch.Tensor):
178
+ log_vars[loss_name] = loss_value.mean()
179
+ elif isinstance(loss_value, list):
180
+ log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
181
+ else:
182
+ raise TypeError(f"{loss_name} is not a tensor or list of tensors")
183
+
184
+ loss = sum(_value for _key, _value in log_vars.items() if "loss" in _key)
185
+
186
+ log_vars["loss"] = loss
187
+ for loss_name, loss_value in log_vars.items():
188
+ # reduce loss when distributed training
189
+ if dist.is_available() and dist.is_initialized():
190
+ loss_value = loss_value.data.clone()
191
+ dist.all_reduce(loss_value.div_(dist.get_world_size()))
192
+ log_vars[loss_name] = loss_value.item()
193
+
194
+ return loss, log_vars
src/dinov2/eval/depth/models/depther/encoder_decoder.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+ from ...models import builder
10
+ from ...models.builder import DEPTHER
11
+ from ...ops import resize
12
+ from .base import BaseDepther
13
+
14
+
15
+ def add_prefix(inputs, prefix):
16
+ """Add prefix for dict.
17
+
18
+ Args:
19
+ inputs (dict): The input dict with str keys.
20
+ prefix (str): The prefix to add.
21
+
22
+ Returns:
23
+
24
+ dict: The dict with keys updated with ``prefix``.
25
+ """
26
+
27
+ outputs = dict()
28
+ for name, value in inputs.items():
29
+ outputs[f"{prefix}.{name}"] = value
30
+
31
+ return outputs
32
+
33
+
34
+ @DEPTHER.register_module()
35
+ class DepthEncoderDecoder(BaseDepther):
36
+ """Encoder Decoder depther.
37
+
38
+ EncoderDecoder typically consists of backbone, (neck) and decode_head.
39
+ """
40
+
41
+ def __init__(self, backbone, decode_head, neck=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None):
42
+ super(DepthEncoderDecoder, self).__init__(init_cfg)
43
+ if pretrained is not None:
44
+ assert backbone.get("pretrained") is None, "both backbone and depther set pretrained weight"
45
+ backbone.pretrained = pretrained
46
+ self.backbone = builder.build_backbone(backbone)
47
+ self._init_decode_head(decode_head)
48
+
49
+ if neck is not None:
50
+ self.neck = builder.build_neck(neck)
51
+
52
+ self.train_cfg = train_cfg
53
+ self.test_cfg = test_cfg
54
+
55
+ assert self.with_decode_head
56
+
57
+ def _init_decode_head(self, decode_head):
58
+ """Initialize ``decode_head``"""
59
+ self.decode_head = builder.build_head(decode_head)
60
+ self.align_corners = self.decode_head.align_corners
61
+
62
+ def extract_feat(self, img):
63
+ """Extract features from images."""
64
+ x = self.backbone(img)
65
+ if self.with_neck:
66
+ x = self.neck(x)
67
+ return x
68
+
69
+ def encode_decode(self, img, img_metas, rescale=True, size=None):
70
+ """Encode images with backbone and decode into a depth estimation
71
+ map of the same size as input."""
72
+ x = self.extract_feat(img)
73
+ out = self._decode_head_forward_test(x, img_metas)
74
+ # crop the pred depth to the certain range.
75
+ out = torch.clamp(out, min=self.decode_head.min_depth, max=self.decode_head.max_depth)
76
+ if rescale:
77
+ if size is None:
78
+ if img_metas is not None:
79
+ size = img_metas[0]["ori_shape"][:2]
80
+ else:
81
+ size = img.shape[2:]
82
+ out = resize(input=out, size=size, mode="bilinear", align_corners=self.align_corners)
83
+ return out
84
+
85
+ def _decode_head_forward_train(self, img, x, img_metas, depth_gt, **kwargs):
86
+ """Run forward function and calculate loss for decode head in
87
+ training."""
88
+ losses = dict()
89
+ loss_decode = self.decode_head.forward_train(img, x, img_metas, depth_gt, self.train_cfg, **kwargs)
90
+ losses.update(add_prefix(loss_decode, "decode"))
91
+ return losses
92
+
93
+ def _decode_head_forward_test(self, x, img_metas):
94
+ """Run forward function and calculate loss for decode head in
95
+ inference."""
96
+ depth_pred = self.decode_head.forward_test(x, img_metas, self.test_cfg)
97
+ return depth_pred
98
+
99
+ def forward_dummy(self, img):
100
+ """Dummy forward function."""
101
+ depth = self.encode_decode(img, None)
102
+
103
+ return depth
104
+
105
+ def forward_train(self, img, img_metas, depth_gt, **kwargs):
106
+ """Forward function for training.
107
+
108
+ Args:
109
+ img (Tensor): Input images.
110
+ img_metas (list[dict]): List of image info dict where each dict
111
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
112
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
113
+ For details on the values of these keys see
114
+ `depth/datasets/pipelines/formatting.py:Collect`.
115
+ depth_gt (Tensor): Depth gt
116
+ used if the architecture supports depth estimation task.
117
+
118
+ Returns:
119
+ dict[str, Tensor]: a dictionary of loss components
120
+ """
121
+
122
+ x = self.extract_feat(img)
123
+
124
+ losses = dict()
125
+
126
+ # the last of x saves the info from neck
127
+ loss_decode = self._decode_head_forward_train(img, x, img_metas, depth_gt, **kwargs)
128
+
129
+ losses.update(loss_decode)
130
+
131
+ return losses
132
+
133
+ def whole_inference(self, img, img_meta, rescale, size=None):
134
+ """Inference with full image."""
135
+ depth_pred = self.encode_decode(img, img_meta, rescale, size=size)
136
+
137
+ return depth_pred
138
+
139
+ def slide_inference(self, img, img_meta, rescale):
140
+ """Inference by sliding-window with overlap.
141
+
142
+ If h_crop > h_img or w_crop > w_img, the small patch will be used to
143
+ decode without padding.
144
+ """
145
+
146
+ h_stride, w_stride = self.test_cfg.stride
147
+ h_crop, w_crop = self.test_cfg.crop_size
148
+ batch_size, _, h_img, w_img = img.size()
149
+ h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
150
+ w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
151
+ preds = img.new_zeros((batch_size, 1, h_img, w_img))
152
+ count_mat = img.new_zeros((batch_size, 1, h_img, w_img))
153
+ for h_idx in range(h_grids):
154
+ for w_idx in range(w_grids):
155
+ y1 = h_idx * h_stride
156
+ x1 = w_idx * w_stride
157
+ y2 = min(y1 + h_crop, h_img)
158
+ x2 = min(x1 + w_crop, w_img)
159
+ y1 = max(y2 - h_crop, 0)
160
+ x1 = max(x2 - w_crop, 0)
161
+ crop_img = img[:, :, y1:y2, x1:x2]
162
+ depth_pred = self.encode_decode(crop_img, img_meta, rescale)
163
+ preds += F.pad(depth_pred, (int(x1), int(preds.shape[3] - x2), int(y1), int(preds.shape[2] - y2)))
164
+
165
+ count_mat[:, :, y1:y2, x1:x2] += 1
166
+ assert (count_mat == 0).sum() == 0
167
+ if torch.onnx.is_in_onnx_export():
168
+ # cast count_mat to constant while exporting to ONNX
169
+ count_mat = torch.from_numpy(count_mat.cpu().detach().numpy()).to(device=img.device)
170
+ preds = preds / count_mat
171
+ return preds
172
+
173
+ def inference(self, img, img_meta, rescale, size=None):
174
+ """Inference with slide/whole style.
175
+
176
+ Args:
177
+ img (Tensor): The input image of shape (N, 3, H, W).
178
+ img_meta (dict): Image info dict where each dict has: 'img_shape',
179
+ 'scale_factor', 'flip', and may also contain
180
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
181
+ For details on the values of these keys see
182
+ `depth/datasets/pipelines/formatting.py:Collect`.
183
+ rescale (bool): Whether rescale back to original shape.
184
+
185
+ Returns:
186
+ Tensor: The output depth map.
187
+ """
188
+
189
+ assert self.test_cfg.mode in ["slide", "whole"]
190
+ ori_shape = img_meta[0]["ori_shape"]
191
+ assert all(_["ori_shape"] == ori_shape for _ in img_meta)
192
+ if self.test_cfg.mode == "slide":
193
+ depth_pred = self.slide_inference(img, img_meta, rescale)
194
+ else:
195
+ depth_pred = self.whole_inference(img, img_meta, rescale, size=size)
196
+ output = depth_pred
197
+ flip = img_meta[0]["flip"]
198
+ if flip:
199
+ flip_direction = img_meta[0]["flip_direction"]
200
+ assert flip_direction in ["horizontal", "vertical"]
201
+ if flip_direction == "horizontal":
202
+ output = output.flip(dims=(3,))
203
+ elif flip_direction == "vertical":
204
+ output = output.flip(dims=(2,))
205
+
206
+ return output
207
+
208
+ def simple_test(self, img, img_meta, rescale=True):
209
+ """Simple test with single image."""
210
+ depth_pred = self.inference(img, img_meta, rescale)
211
+ if torch.onnx.is_in_onnx_export():
212
+ # our inference backend only support 4D output
213
+ depth_pred = depth_pred.unsqueeze(0)
214
+ return depth_pred
215
+ depth_pred = depth_pred.cpu().numpy()
216
+ # unravel batch dim
217
+ depth_pred = list(depth_pred)
218
+ return depth_pred
219
+
220
+ def aug_test(self, imgs, img_metas, rescale=True):
221
+ """Test with augmentations.
222
+
223
+ Only rescale=True is supported.
224
+ """
225
+ # aug_test rescale all imgs back to ori_shape for now
226
+ assert rescale
227
+ # to save memory, we get augmented depth logit inplace
228
+ depth_pred = self.inference(imgs[0], img_metas[0], rescale)
229
+ for i in range(1, len(imgs)):
230
+ cur_depth_pred = self.inference(imgs[i], img_metas[i], rescale, size=depth_pred.shape[-2:])
231
+ depth_pred += cur_depth_pred
232
+ depth_pred /= len(imgs)
233
+ depth_pred = depth_pred.cpu().numpy()
234
+ # unravel batch dim
235
+ depth_pred = list(depth_pred)
236
+ return depth_pred
src/dinov2/eval/depth/models/losses/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .gradientloss import GradientLoss
7
+ from .sigloss import SigLoss
src/dinov2/eval/depth/models/losses/gradientloss.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from ...models.builder import LOSSES
10
+
11
+
12
+ @LOSSES.register_module()
13
+ class GradientLoss(nn.Module):
14
+ """GradientLoss.
15
+
16
+ Adapted from https://www.cs.cornell.edu/projects/megadepth/
17
+
18
+ Args:
19
+ valid_mask (bool): Whether filter invalid gt (gt > 0). Default: True.
20
+ loss_weight (float): Weight of the loss. Default: 1.0.
21
+ max_depth (int): When filtering invalid gt, set a max threshold. Default: None.
22
+ """
23
+
24
+ def __init__(self, valid_mask=True, loss_weight=1.0, max_depth=None, loss_name="loss_grad"):
25
+ super(GradientLoss, self).__init__()
26
+ self.valid_mask = valid_mask
27
+ self.loss_weight = loss_weight
28
+ self.max_depth = max_depth
29
+ self.loss_name = loss_name
30
+
31
+ self.eps = 0.001 # avoid grad explode
32
+
33
+ def gradientloss(self, input, target):
34
+ input_downscaled = [input] + [input[:: 2 * i, :: 2 * i] for i in range(1, 4)]
35
+ target_downscaled = [target] + [target[:: 2 * i, :: 2 * i] for i in range(1, 4)]
36
+
37
+ gradient_loss = 0
38
+ for input, target in zip(input_downscaled, target_downscaled):
39
+ if self.valid_mask:
40
+ mask = target > 0
41
+ if self.max_depth is not None:
42
+ mask = torch.logical_and(target > 0, target <= self.max_depth)
43
+ N = torch.sum(mask)
44
+ else:
45
+ mask = torch.ones_like(target)
46
+ N = input.numel()
47
+ input_log = torch.log(input + self.eps)
48
+ target_log = torch.log(target + self.eps)
49
+ log_d_diff = input_log - target_log
50
+
51
+ log_d_diff = torch.mul(log_d_diff, mask)
52
+
53
+ v_gradient = torch.abs(log_d_diff[0:-2, :] - log_d_diff[2:, :])
54
+ v_mask = torch.mul(mask[0:-2, :], mask[2:, :])
55
+ v_gradient = torch.mul(v_gradient, v_mask)
56
+
57
+ h_gradient = torch.abs(log_d_diff[:, 0:-2] - log_d_diff[:, 2:])
58
+ h_mask = torch.mul(mask[:, 0:-2], mask[:, 2:])
59
+ h_gradient = torch.mul(h_gradient, h_mask)
60
+
61
+ gradient_loss += (torch.sum(h_gradient) + torch.sum(v_gradient)) / N
62
+
63
+ return gradient_loss
64
+
65
+ def forward(self, depth_pred, depth_gt):
66
+ """Forward function."""
67
+
68
+ gradient_loss = self.loss_weight * self.gradientloss(depth_pred, depth_gt)
69
+ return gradient_loss
src/dinov2/eval/depth/models/losses/sigloss.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from ...models.builder import LOSSES
10
+
11
+
12
+ @LOSSES.register_module()
13
+ class SigLoss(nn.Module):
14
+ """SigLoss.
15
+
16
+ This follows `AdaBins <https://arxiv.org/abs/2011.14141>`_.
17
+
18
+ Args:
19
+ valid_mask (bool): Whether filter invalid gt (gt > 0). Default: True.
20
+ loss_weight (float): Weight of the loss. Default: 1.0.
21
+ max_depth (int): When filtering invalid gt, set a max threshold. Default: None.
22
+ warm_up (bool): A simple warm up stage to help convergence. Default: False.
23
+ warm_iter (int): The number of warm up stage. Default: 100.
24
+ """
25
+
26
+ def __init__(
27
+ self, valid_mask=True, loss_weight=1.0, max_depth=None, warm_up=False, warm_iter=100, loss_name="sigloss"
28
+ ):
29
+ super(SigLoss, self).__init__()
30
+ self.valid_mask = valid_mask
31
+ self.loss_weight = loss_weight
32
+ self.max_depth = max_depth
33
+ self.loss_name = loss_name
34
+
35
+ self.eps = 0.001 # avoid grad explode
36
+
37
+ # HACK: a hack implementation for warmup sigloss
38
+ self.warm_up = warm_up
39
+ self.warm_iter = warm_iter
40
+ self.warm_up_counter = 0
41
+
42
+ def sigloss(self, input, target):
43
+ if self.valid_mask:
44
+ valid_mask = target > 0
45
+ if self.max_depth is not None:
46
+ valid_mask = torch.logical_and(target > 0, target <= self.max_depth)
47
+ input = input[valid_mask]
48
+ target = target[valid_mask]
49
+
50
+ if self.warm_up:
51
+ if self.warm_up_counter < self.warm_iter:
52
+ g = torch.log(input + self.eps) - torch.log(target + self.eps)
53
+ g = 0.15 * torch.pow(torch.mean(g), 2)
54
+ self.warm_up_counter += 1
55
+ return torch.sqrt(g)
56
+
57
+ g = torch.log(input + self.eps) - torch.log(target + self.eps)
58
+ Dg = torch.var(g) + 0.15 * torch.pow(torch.mean(g), 2)
59
+ return torch.sqrt(Dg)
60
+
61
+ def forward(self, depth_pred, depth_gt):
62
+ """Forward function."""
63
+
64
+ loss_depth = self.loss_weight * self.sigloss(depth_pred, depth_gt)
65
+ return loss_depth
src/dinov2/eval/depth/ops/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .wrappers import resize
src/dinov2/eval/depth/ops/wrappers.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import warnings
7
+
8
+ import torch.nn.functional as F
9
+
10
+
11
+ def resize(input, size=None, scale_factor=None, mode="nearest", align_corners=None, warning=False):
12
+ if warning:
13
+ if size is not None and align_corners:
14
+ input_h, input_w = tuple(int(x) for x in input.shape[2:])
15
+ output_h, output_w = tuple(int(x) for x in size)
16
+ if output_h > input_h or output_w > output_h:
17
+ if (
18
+ (output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1)
19
+ and (output_h - 1) % (input_h - 1)
20
+ and (output_w - 1) % (input_w - 1)
21
+ ):
22
+ warnings.warn(
23
+ f"When align_corners={align_corners}, "
24
+ "the output would more aligned if "
25
+ f"input size {(input_h, input_w)} is `x+1` and "
26
+ f"out size {(output_h, output_w)} is `nx+1`"
27
+ )
28
+ return F.interpolate(input, size, scale_factor, mode, align_corners)
src/dinov2/eval/knn.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ from functools import partial
8
+ import json
9
+ import logging
10
+ import os
11
+ import sys
12
+ from typing import List, Optional
13
+
14
+ import torch
15
+ from torch.nn.functional import one_hot, softmax
16
+
17
+ import dinov2.distributed as distributed
18
+ from dinov2.data import SamplerType, make_data_loader, make_dataset
19
+ from dinov2.data.transforms import make_classification_eval_transform
20
+ from dinov2.eval.metrics import AccuracyAveraging, build_topk_accuracy_metric
21
+ from dinov2.eval.setup import get_args_parser as get_setup_args_parser
22
+ from dinov2.eval.setup import setup_and_build_model
23
+ from dinov2.eval.utils import ModelWithNormalize, evaluate, extract_features
24
+
25
+
26
+ logger = logging.getLogger("dinov2")
27
+
28
+
29
+ def get_args_parser(
30
+ description: Optional[str] = None,
31
+ parents: Optional[List[argparse.ArgumentParser]] = None,
32
+ add_help: bool = True,
33
+ ):
34
+ parents = parents or []
35
+ setup_args_parser = get_setup_args_parser(parents=parents, add_help=False)
36
+ parents = [setup_args_parser]
37
+ parser = argparse.ArgumentParser(
38
+ description=description,
39
+ parents=parents,
40
+ add_help=add_help,
41
+ )
42
+ parser.add_argument(
43
+ "--train-dataset",
44
+ dest="train_dataset_str",
45
+ type=str,
46
+ help="Training dataset",
47
+ )
48
+ parser.add_argument(
49
+ "--val-dataset",
50
+ dest="val_dataset_str",
51
+ type=str,
52
+ help="Validation dataset",
53
+ )
54
+ parser.add_argument(
55
+ "--nb_knn",
56
+ nargs="+",
57
+ type=int,
58
+ help="Number of NN to use. 20 is usually working the best.",
59
+ )
60
+ parser.add_argument(
61
+ "--temperature",
62
+ type=float,
63
+ help="Temperature used in the voting coefficient",
64
+ )
65
+ parser.add_argument(
66
+ "--gather-on-cpu",
67
+ action="store_true",
68
+ help="Whether to gather the train features on cpu, slower"
69
+ "but useful to avoid OOM for large datasets (e.g. ImageNet22k).",
70
+ )
71
+ parser.add_argument(
72
+ "--batch-size",
73
+ type=int,
74
+ help="Batch size.",
75
+ )
76
+ parser.add_argument(
77
+ "--n-per-class-list",
78
+ nargs="+",
79
+ type=int,
80
+ help="Number to take per class",
81
+ )
82
+ parser.add_argument(
83
+ "--n-tries",
84
+ type=int,
85
+ help="Number of tries",
86
+ )
87
+ parser.set_defaults(
88
+ train_dataset_str="ImageNet:split=TRAIN",
89
+ val_dataset_str="ImageNet:split=VAL",
90
+ nb_knn=[10, 20, 100, 200],
91
+ temperature=0.07,
92
+ batch_size=256,
93
+ n_per_class_list=[-1],
94
+ n_tries=1,
95
+ )
96
+ return parser
97
+
98
+
99
+ class KnnModule(torch.nn.Module):
100
+ """
101
+ Gets knn of test features from all processes on a chunk of the train features
102
+
103
+ Each rank gets a chunk of the train features as well as a chunk of the test features.
104
+ In `compute_neighbors`, for each rank one after the other, its chunk of test features
105
+ is sent to all devices, partial knns are computed with each chunk of train features
106
+ then collated back on the original device.
107
+ """
108
+
109
+ def __init__(self, train_features, train_labels, nb_knn, T, device, num_classes=1000):
110
+ super().__init__()
111
+
112
+ self.global_rank = distributed.get_global_rank()
113
+ self.global_size = distributed.get_global_size()
114
+
115
+ self.device = device
116
+ self.train_features_rank_T = train_features.chunk(self.global_size)[self.global_rank].T.to(self.device)
117
+ self.candidates = train_labels.chunk(self.global_size)[self.global_rank].view(1, -1).to(self.device)
118
+
119
+ self.nb_knn = nb_knn
120
+ self.max_k = max(self.nb_knn)
121
+ self.T = T
122
+ self.num_classes = num_classes
123
+
124
+ def _get_knn_sims_and_labels(self, similarity, train_labels):
125
+ topk_sims, indices = similarity.topk(self.max_k, largest=True, sorted=True)
126
+ neighbors_labels = torch.gather(train_labels, 1, indices)
127
+ return topk_sims, neighbors_labels
128
+
129
+ def _similarity_for_rank(self, features_rank, source_rank):
130
+ # Send the features from `source_rank` to all ranks
131
+ broadcast_shape = torch.tensor(features_rank.shape).to(self.device)
132
+ torch.distributed.broadcast(broadcast_shape, source_rank)
133
+
134
+ broadcasted = features_rank
135
+ if self.global_rank != source_rank:
136
+ broadcasted = torch.zeros(*broadcast_shape, dtype=features_rank.dtype, device=self.device)
137
+ torch.distributed.broadcast(broadcasted, source_rank)
138
+
139
+ # Compute the neighbors for `source_rank` among `train_features_rank_T`
140
+ similarity_rank = torch.mm(broadcasted, self.train_features_rank_T)
141
+ candidate_labels = self.candidates.expand(len(similarity_rank), -1)
142
+ return self._get_knn_sims_and_labels(similarity_rank, candidate_labels)
143
+
144
+ def _gather_all_knn_for_rank(self, topk_sims, neighbors_labels, target_rank):
145
+ # Gather all neighbors for `target_rank`
146
+ topk_sims_rank = retrieved_rank = None
147
+ if self.global_rank == target_rank:
148
+ topk_sims_rank = [torch.zeros_like(topk_sims) for _ in range(self.global_size)]
149
+ retrieved_rank = [torch.zeros_like(neighbors_labels) for _ in range(self.global_size)]
150
+
151
+ torch.distributed.gather(topk_sims, topk_sims_rank, dst=target_rank)
152
+ torch.distributed.gather(neighbors_labels, retrieved_rank, dst=target_rank)
153
+
154
+ if self.global_rank == target_rank:
155
+ # Perform a second top-k on the k * global_size retrieved neighbors
156
+ topk_sims_rank = torch.cat(topk_sims_rank, dim=1)
157
+ retrieved_rank = torch.cat(retrieved_rank, dim=1)
158
+ results = self._get_knn_sims_and_labels(topk_sims_rank, retrieved_rank)
159
+ return results
160
+ return None
161
+
162
+ def compute_neighbors(self, features_rank):
163
+ for rank in range(self.global_size):
164
+ topk_sims, neighbors_labels = self._similarity_for_rank(features_rank, rank)
165
+ results = self._gather_all_knn_for_rank(topk_sims, neighbors_labels, rank)
166
+ if results is not None:
167
+ topk_sims_rank, neighbors_labels_rank = results
168
+ return topk_sims_rank, neighbors_labels_rank
169
+
170
+ def forward(self, features_rank):
171
+ """
172
+ Compute the results on all values of `self.nb_knn` neighbors from the full `self.max_k`
173
+ """
174
+ assert all(k <= self.max_k for k in self.nb_knn)
175
+
176
+ topk_sims, neighbors_labels = self.compute_neighbors(features_rank)
177
+ batch_size = neighbors_labels.shape[0]
178
+ topk_sims_transform = softmax(topk_sims / self.T, 1)
179
+ matmul = torch.mul(
180
+ one_hot(neighbors_labels, num_classes=self.num_classes),
181
+ topk_sims_transform.view(batch_size, -1, 1),
182
+ )
183
+ probas_for_k = {k: torch.sum(matmul[:, :k, :], 1) for k in self.nb_knn}
184
+ return probas_for_k
185
+
186
+
187
+ class DictKeysModule(torch.nn.Module):
188
+ def __init__(self, keys):
189
+ super().__init__()
190
+ self.keys = keys
191
+
192
+ def forward(self, features_dict, targets):
193
+ for k in self.keys:
194
+ features_dict = features_dict[k]
195
+ return {"preds": features_dict, "target": targets}
196
+
197
+
198
+ def create_module_dict(*, module, n_per_class_list, n_tries, nb_knn, train_features, train_labels):
199
+ modules = {}
200
+ mapping = create_class_indices_mapping(train_labels)
201
+ for npc in n_per_class_list:
202
+ if npc < 0: # Only one try needed when using the full data
203
+ full_module = module(
204
+ train_features=train_features,
205
+ train_labels=train_labels,
206
+ nb_knn=nb_knn,
207
+ )
208
+ modules["full"] = ModuleDictWithForward({"1": full_module})
209
+ continue
210
+ all_tries = {}
211
+ for t in range(n_tries):
212
+ final_indices = filter_train(mapping, npc, seed=t)
213
+ k_list = list(set(nb_knn + [npc]))
214
+ k_list = sorted([el for el in k_list if el <= npc])
215
+ all_tries[str(t)] = module(
216
+ train_features=train_features[final_indices],
217
+ train_labels=train_labels[final_indices],
218
+ nb_knn=k_list,
219
+ )
220
+ modules[f"{npc} per class"] = ModuleDictWithForward(all_tries)
221
+
222
+ return ModuleDictWithForward(modules)
223
+
224
+
225
+ def filter_train(mapping, n_per_class, seed):
226
+ torch.manual_seed(seed)
227
+ final_indices = []
228
+ for k in mapping.keys():
229
+ index = torch.randperm(len(mapping[k]))[:n_per_class]
230
+ final_indices.append(mapping[k][index])
231
+ return torch.cat(final_indices).squeeze()
232
+
233
+
234
+ def create_class_indices_mapping(labels):
235
+ unique_labels, inverse = torch.unique(labels, return_inverse=True)
236
+ mapping = {unique_labels[i]: (inverse == i).nonzero() for i in range(len(unique_labels))}
237
+ return mapping
238
+
239
+
240
+ class ModuleDictWithForward(torch.nn.ModuleDict):
241
+ def forward(self, *args, **kwargs):
242
+ return {k: module(*args, **kwargs) for k, module in self._modules.items()}
243
+
244
+
245
+ def eval_knn(
246
+ model,
247
+ train_dataset,
248
+ val_dataset,
249
+ accuracy_averaging,
250
+ nb_knn,
251
+ temperature,
252
+ batch_size,
253
+ num_workers,
254
+ gather_on_cpu,
255
+ n_per_class_list=[-1],
256
+ n_tries=1,
257
+ ):
258
+ model = ModelWithNormalize(model)
259
+
260
+ logger.info("Extracting features for train set...")
261
+ train_features, train_labels = extract_features(
262
+ model, train_dataset, batch_size, num_workers, gather_on_cpu=gather_on_cpu
263
+ )
264
+ logger.info(f"Train features created, shape {train_features.shape}.")
265
+
266
+ val_dataloader = make_data_loader(
267
+ dataset=val_dataset,
268
+ batch_size=batch_size,
269
+ num_workers=num_workers,
270
+ sampler_type=SamplerType.DISTRIBUTED,
271
+ drop_last=False,
272
+ shuffle=False,
273
+ persistent_workers=True,
274
+ )
275
+ num_classes = train_labels.max() + 1
276
+ metric_collection = build_topk_accuracy_metric(accuracy_averaging, num_classes=num_classes)
277
+
278
+ device = torch.cuda.current_device()
279
+ partial_module = partial(KnnModule, T=temperature, device=device, num_classes=num_classes)
280
+ knn_module_dict = create_module_dict(
281
+ module=partial_module,
282
+ n_per_class_list=n_per_class_list,
283
+ n_tries=n_tries,
284
+ nb_knn=nb_knn,
285
+ train_features=train_features,
286
+ train_labels=train_labels,
287
+ )
288
+ postprocessors, metrics = {}, {}
289
+ for n_per_class, knn_module in knn_module_dict.items():
290
+ for t, knn_try in knn_module.items():
291
+ postprocessors = {
292
+ **postprocessors,
293
+ **{(n_per_class, t, k): DictKeysModule([n_per_class, t, k]) for k in knn_try.nb_knn},
294
+ }
295
+ metrics = {**metrics, **{(n_per_class, t, k): metric_collection.clone() for k in knn_try.nb_knn}}
296
+ model_with_knn = torch.nn.Sequential(model, knn_module_dict)
297
+
298
+ # ============ evaluation ... ============
299
+ logger.info("Start the k-NN classification.")
300
+ _, results_dict = evaluate(model_with_knn, val_dataloader, postprocessors, metrics, device)
301
+
302
+ # Averaging the results over the n tries for each value of n_per_class
303
+ for n_per_class, knn_module in knn_module_dict.items():
304
+ first_try = list(knn_module.keys())[0]
305
+ k_list = knn_module[first_try].nb_knn
306
+ for k in k_list:
307
+ keys = results_dict[(n_per_class, first_try, k)].keys() # keys are e.g. `top-1` and `top-5`
308
+ results_dict[(n_per_class, k)] = {
309
+ key: torch.mean(torch.stack([results_dict[(n_per_class, t, k)][key] for t in knn_module.keys()]))
310
+ for key in keys
311
+ }
312
+ for t in knn_module.keys():
313
+ del results_dict[(n_per_class, t, k)]
314
+
315
+ return results_dict
316
+
317
+
318
+ def eval_knn_with_model(
319
+ model,
320
+ output_dir,
321
+ train_dataset_str="ImageNet:split=TRAIN",
322
+ val_dataset_str="ImageNet:split=VAL",
323
+ nb_knn=(10, 20, 100, 200),
324
+ temperature=0.07,
325
+ autocast_dtype=torch.float,
326
+ accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY,
327
+ transform=None,
328
+ gather_on_cpu=False,
329
+ batch_size=256,
330
+ num_workers=5,
331
+ n_per_class_list=[-1],
332
+ n_tries=1,
333
+ ):
334
+ transform = transform or make_classification_eval_transform()
335
+
336
+ train_dataset = make_dataset(
337
+ dataset_str=train_dataset_str,
338
+ transform=transform,
339
+ )
340
+ val_dataset = make_dataset(
341
+ dataset_str=val_dataset_str,
342
+ transform=transform,
343
+ )
344
+
345
+ with torch.cuda.amp.autocast(dtype=autocast_dtype):
346
+ results_dict_knn = eval_knn(
347
+ model=model,
348
+ train_dataset=train_dataset,
349
+ val_dataset=val_dataset,
350
+ accuracy_averaging=accuracy_averaging,
351
+ nb_knn=nb_knn,
352
+ temperature=temperature,
353
+ batch_size=batch_size,
354
+ num_workers=num_workers,
355
+ gather_on_cpu=gather_on_cpu,
356
+ n_per_class_list=n_per_class_list,
357
+ n_tries=n_tries,
358
+ )
359
+
360
+ results_dict = {}
361
+ if distributed.is_main_process():
362
+ for knn_ in results_dict_knn.keys():
363
+ top1 = results_dict_knn[knn_]["top-1"].item() * 100.0
364
+ top5 = results_dict_knn[knn_]["top-5"].item() * 100.0
365
+ results_dict[f"{knn_} Top 1"] = top1
366
+ results_dict[f"{knn_} Top 5"] = top5
367
+ logger.info(f"{knn_} classifier result: Top1: {top1:.2f} Top5: {top5:.2f}")
368
+
369
+ metrics_file_path = os.path.join(output_dir, "results_eval_knn.json")
370
+ with open(metrics_file_path, "a") as f:
371
+ for k, v in results_dict.items():
372
+ f.write(json.dumps({k: v}) + "\n")
373
+
374
+ if distributed.is_enabled():
375
+ torch.distributed.barrier()
376
+ return results_dict
377
+
378
+
379
+ def main(args):
380
+ model, autocast_dtype = setup_and_build_model(args)
381
+ eval_knn_with_model(
382
+ model=model,
383
+ output_dir=args.output_dir,
384
+ train_dataset_str=args.train_dataset_str,
385
+ val_dataset_str=args.val_dataset_str,
386
+ nb_knn=args.nb_knn,
387
+ temperature=args.temperature,
388
+ autocast_dtype=autocast_dtype,
389
+ accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY,
390
+ transform=None,
391
+ gather_on_cpu=args.gather_on_cpu,
392
+ batch_size=args.batch_size,
393
+ num_workers=5,
394
+ n_per_class_list=args.n_per_class_list,
395
+ n_tries=args.n_tries,
396
+ )
397
+ return 0
398
+
399
+
400
+ if __name__ == "__main__":
401
+ description = "DINOv2 k-NN evaluation"
402
+ args_parser = get_args_parser(description=description)
403
+ args = args_parser.parse_args()
404
+ sys.exit(main(args))