MODEL:
  WEIGHTS: ''
compute_precision:
  grad_scaler: true
  teacher:
    backbone:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    dino_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    ibot_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
  student:
    backbone:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    dino_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp32
        buffer_dtype: fp32
    ibot_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp32
        buffer_dtype: fp32
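# The compute_precision block above sets up per-submodule FSDP.
# SHARD_GRAD_OP (roughly ZeRO stage 2) shards gradients and optimizer state
# across ranks while keeping parameters materialized between forward and
# backward; param_dtype is the compute dtype, reduce_dtype the dtype used
# for gradient reduction, buffer_dtype the dtype of module buffers. The
# student heads reduce gradients in fp32 for stability, while the teacher,
# updated by EMA rather than by gradients, can safely stay in fp16.
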
dino:
  loss_weight: 1.0
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_nlayers: 3
  head_hidden_dim: 2048
  koleo_loss_weight: 0.1
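# The dino block above is the image-level self-distillation loss. Its head
# is a 3-layer MLP (hidden dim 2048) projecting to a 256-d bottleneck and
# then onto 65536 prototypes. koleo_loss_weight adds the KoLeo regularizer,
# a differential-entropy term that spreads features apart within a batch.
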
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  mask_ratio_min_max:
  - 0.1
  - 0.5
  separate_head: false
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_nlayers: 3
  head_hidden_dim: 2048
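# The ibot block above is the patch-level masked-prediction loss. Each image
# is selected for masking with probability 0.5, and a masked image has a
# uniformly drawn fraction of its patches in [0.1, 0.5] masked out. With
# separate_head: false the iBOT loss shares the DINO head, so the head_*
# settings here should only take effect when separate_head is true.
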
train:
  batch_size_per_gpu: 64
  dataset_path: ImageNet:split=TRAIN
  output_dir: .
  saveckp_freq: 20
  seed: 0
  num_workers: 10
  OFFICIAL_EPOCH_LENGTH: 1250
  cache_dataset: true
  centering: "centering" # or "sinkhorn_knopp"
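# An "epoch" above is a fixed OFFICIAL_EPOCH_LENGTH of 1250 iterations rather
# than a full pass over the dataset, so optim.epochs: 100 amounts to
# 100 * 1250 = 125,000 training iterations. centering selects how teacher
# outputs are normalized before the loss: classic DINO centering or
# Sinkhorn-Knopp optimal-transport normalization.
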
student:
  arch: vit_large
  patch_size: 16
  drop_path_rate: 0.3
  layerscale: 1.0e-05
  drop_path_uniform: true
  pretrained_weights: ''
  ffn_layer: "mlp"
  block_chunks: 0
  qkv_bias: true
  proj_bias: true
  ffn_bias: true
  num_register_tokens: 0
  interpolate_antialias: false
  interpolate_offset: 0.1
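# The student above is a ViT-L/16 backbone with stochastic depth
# (drop_path_rate: 0.3, applied uniformly across blocks rather than scaled
# with depth) and LayerScale initialized to 1e-5. num_register_tokens: 0
# disables register tokens; the interpolate_* keys control how positional
# embeddings are resampled for non-default image sizes.
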
teacher:
  momentum_teacher: 0.992
  final_momentum_teacher: 1
  warmup_teacher_temp: 0.04
  teacher_temp: 0.07
  warmup_teacher_temp_epochs: 30
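# The teacher above is an exponential moving average of the student, roughly
# t <- m * t + (1 - m) * s, with momentum m scheduled from 0.992 up to 1.0
# over the course of training. Its softmax temperature is warmed from 0.04
# to 0.07 during the first 30 epochs; the lower initial temperature gives
# sharper targets early in training.
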
optim:
  epochs: 100
  weight_decay: 0.04
  weight_decay_end: 0.4
  base_lr: 0.004 # learning rate for a batch size of 1024
  lr: 0. # will be set after applying scaling rule
  warmup_epochs: 10
  min_lr: 1.0e-06
  clip_grad: 3.0
  freeze_last_layer_epochs: 1
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  layerwise_decay: 0.9
  adamw_beta1: 0.9
  adamw_beta2: 0.999
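# The optimizer above is AdamW with cosine schedules; weight decay ramps from
# 0.04 to 0.4. lr: 0. is a placeholder: with scaling_rule sqrt_wrt_1024 the
# training script is expected to derive
#   lr = base_lr * sqrt(global_batch_size / 1024).
# For example, 16 GPUs * 64 images/GPU = 1024 gives lr = 0.004, while
# 8 GPUs * 64 = 512 gives 0.004 * sqrt(512 / 1024) ≈ 0.0028.
# patch_embed_lr_mult: 0.2 runs the patch embedding at one fifth of the base
# lr, layerwise_decay: 0.9 geometrically lowers the lr of earlier blocks, and
# freeze_last_layer_epochs: 1 freezes the head's last layer for the first
# epoch, a standard DINO stabilization trick.
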
crops:
  global_crops_scale:
  - 0.32
  - 1.0
  local_crops_number: 8
  local_crops_scale:
  - 0.05
  - 0.32
  global_crops_size: 224
  local_crops_size: 96
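# The crops block above is the usual multi-crop setup: two 224 px global
# crops covering 32-100% of the image area (the global crop count is fixed
# by the training code, not set here) and eight 96 px local crops covering
# 5-32%. Global crops feed both teacher and student; local crops feed only
# the student.
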
evaluation:
  eval_period_iterations: 12500
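# With an OFFICIAL_EPOCH_LENGTH of 1250, evaluating every 12,500 iterations
# corresponds to once every 10 "epochs".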