|
|
|
|
|
scratch:
|
|
resolution: 1024
|
|
train_batch_size: 1
|
|
num_train_workers: 10
|
|
num_frames: 8
|
|
max_num_objects: 3
|
|
base_lr: 5.0e-6
|
|
vision_lr: 3.0e-06
|
|
phases_per_epoch: 1
|
|
num_epochs: 40
|
|
|
|
dataset:
|
|
|
|
img_folder: null
|
|
gt_folder: null
|
|
file_list_txt: training/assets/MOSE_sample_train_list.txt
|
|
multiplier: 2
|
|
|
|
|
|
vos:
|
|
train_transforms:
|
|
- _target_: training.dataset.transforms.ComposeAPI
|
|
transforms:
|
|
- _target_: training.dataset.transforms.RandomHorizontalFlip
|
|
consistent_transform: True
|
|
- _target_: training.dataset.transforms.RandomAffine
|
|
degrees: 25
|
|
shear: 20
|
|
image_interpolation: bilinear
|
|
consistent_transform: True
|
|
- _target_: training.dataset.transforms.RandomResizeAPI
|
|
sizes: ${scratch.resolution}
|
|
square: true
|
|
consistent_transform: True
|
|
- _target_: training.dataset.transforms.ColorJitter
|
|
consistent_transform: True
|
|
brightness: 0.1
|
|
contrast: 0.03
|
|
saturation: 0.03
|
|
hue: null
|
|
- _target_: training.dataset.transforms.RandomGrayscale
|
|
p: 0.05
|
|
consistent_transform: True
|
|
- _target_: training.dataset.transforms.ColorJitter
|
|
consistent_transform: False
|
|
brightness: 0.1
|
|
contrast: 0.05
|
|
saturation: 0.05
|
|
hue: null
|
|
- _target_: training.dataset.transforms.ToTensorAPI
|
|
- _target_: training.dataset.transforms.NormalizeAPI
|
|
mean: [0.485, 0.456, 0.406]
|
|
std: [0.229, 0.224, 0.225]
|
|
|
|
trainer:
|
|
_target_: training.trainer.Trainer
|
|
mode: train_only
|
|
max_epochs: ${times:${scratch.num_epochs},${scratch.phases_per_epoch}}
|
|
accelerator: cuda
|
|
seed_value: 123
|
|
|
|
model:
|
|
_target_: training.model.sam2.SAM2Train
|
|
image_encoder:
|
|
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
|
scalp: 1
|
|
trunk:
|
|
_target_: sam2.modeling.backbones.hieradet.Hiera
|
|
embed_dim: 112
|
|
num_heads: 2
|
|
drop_path_rate: 0.1
|
|
neck:
|
|
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
|
position_encoding:
|
|
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
|
num_pos_feats: 256
|
|
normalize: true
|
|
scale: null
|
|
temperature: 10000
|
|
d_model: 256
|
|
backbone_channel_list: [896, 448, 224, 112]
|
|
fpn_top_down_levels: [2, 3]
|
|
fpn_interp_model: nearest
|
|
|
|
memory_attention:
|
|
_target_: sam2.modeling.memory_attention.MemoryAttention
|
|
d_model: 256
|
|
pos_enc_at_input: true
|
|
layer:
|
|
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
|
activation: relu
|
|
dim_feedforward: 2048
|
|
dropout: 0.1
|
|
pos_enc_at_attn: false
|
|
self_attention:
|
|
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
|
rope_theta: 10000.0
|
|
feat_sizes: [32, 32]
|
|
embedding_dim: 256
|
|
num_heads: 1
|
|
downsample_rate: 1
|
|
dropout: 0.1
|
|
d_model: 256
|
|
pos_enc_at_cross_attn_keys: true
|
|
pos_enc_at_cross_attn_queries: false
|
|
cross_attention:
|
|
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
|
rope_theta: 10000.0
|
|
feat_sizes: [32, 32]
|
|
rope_k_repeat: True
|
|
embedding_dim: 256
|
|
num_heads: 1
|
|
downsample_rate: 1
|
|
dropout: 0.1
|
|
kv_in_dim: 64
|
|
num_layers: 4
|
|
|
|
memory_encoder:
|
|
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
|
out_dim: 64
|
|
position_encoding:
|
|
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
|
num_pos_feats: 64
|
|
normalize: true
|
|
scale: null
|
|
temperature: 10000
|
|
mask_downsampler:
|
|
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
|
kernel_size: 3
|
|
stride: 2
|
|
padding: 1
|
|
fuser:
|
|
_target_: sam2.modeling.memory_encoder.Fuser
|
|
layer:
|
|
_target_: sam2.modeling.memory_encoder.CXBlock
|
|
dim: 256
|
|
kernel_size: 7
|
|
padding: 3
|
|
layer_scale_init_value: 1e-6
|
|
use_dwconv: True
|
|
num_layers: 2
|
|
|
|
num_maskmem: 7
|
|
image_size: ${scratch.resolution}
|
|
|
|
sigmoid_scale_for_mem_enc: 20.0
|
|
sigmoid_bias_for_mem_enc: -10.0
|
|
use_mask_input_as_output_without_sam: true
|
|
|
|
directly_add_no_mem_embed: true
|
|
no_obj_embed_spatial: true
|
|
|
|
use_high_res_features_in_sam: true
|
|
|
|
multimask_output_in_sam: true
|
|
|
|
iou_prediction_use_sigmoid: True
|
|
|
|
use_obj_ptrs_in_encoder: true
|
|
add_tpos_enc_to_obj_ptrs: true
|
|
proj_tpos_enc_in_obj_ptrs: true
|
|
use_signed_tpos_enc_to_obj_ptrs: true
|
|
only_obj_ptrs_in_the_past_for_eval: true
|
|
|
|
pred_obj_scores: true
|
|
pred_obj_scores_mlp: true
|
|
fixed_no_obj_ptr: true
|
|
|
|
multimask_output_for_tracking: true
|
|
use_multimask_token_for_obj_ptr: true
|
|
multimask_min_pt_num: 0
|
|
multimask_max_pt_num: 1
|
|
use_mlp_for_obj_ptr_proj: true
|
|
|
|
|
|
|
|
|
|
|
|
prob_to_use_pt_input_for_train: 0.5
|
|
prob_to_use_pt_input_for_eval: 0.0
|
|
prob_to_use_box_input_for_train: 0.5
|
|
prob_to_use_box_input_for_eval: 0.0
|
|
prob_to_sample_from_gt_for_train: 0.1
|
|
num_frames_to_correct_for_train: 2
|
|
num_frames_to_correct_for_eval: 1
|
|
rand_frames_to_correct_for_train: True
|
|
add_all_frames_to_correct_as_cond: True
|
|
|
|
num_init_cond_frames_for_train: 2
|
|
rand_init_cond_frames_for_train: True
|
|
num_correction_pt_per_frame: 7
|
|
use_act_ckpt_iterative_pt_sampling: false
|
|
|
|
|
|
|
|
num_init_cond_frames_for_eval: 1
|
|
forward_backbone_per_frame_for_eval: True
|
|
|
|
|
|
data:
|
|
train:
|
|
_target_: training.dataset.sam2_datasets.TorchTrainMixedDataset
|
|
phases_per_epoch: ${scratch.phases_per_epoch}
|
|
batch_sizes:
|
|
- ${scratch.train_batch_size}
|
|
|
|
datasets:
|
|
- _target_: training.dataset.utils.RepeatFactorWrapper
|
|
dataset:
|
|
_target_: training.dataset.utils.ConcatDataset
|
|
datasets:
|
|
- _target_: training.dataset.vos_dataset.VOSDataset
|
|
transforms: ${vos.train_transforms}
|
|
training: true
|
|
video_dataset:
|
|
_target_: training.dataset.vos_raw_dataset.PNGRawDataset
|
|
img_folder: ${dataset.img_folder}
|
|
gt_folder: ${dataset.gt_folder}
|
|
file_list_txt: ${dataset.file_list_txt}
|
|
sampler:
|
|
_target_: training.dataset.vos_sampler.RandomUniformSampler
|
|
num_frames: ${scratch.num_frames}
|
|
max_num_objects: ${scratch.max_num_objects}
|
|
multiplier: ${dataset.multiplier}
|
|
shuffle: True
|
|
num_workers: ${scratch.num_train_workers}
|
|
pin_memory: True
|
|
drop_last: True
|
|
collate_fn:
|
|
_target_: training.utils.data_utils.collate_fn
|
|
_partial_: true
|
|
dict_key: all
|
|
|
|
optim:
|
|
amp:
|
|
enabled: True
|
|
amp_dtype: bfloat16
|
|
|
|
optimizer:
|
|
_target_: torch.optim.AdamW
|
|
|
|
gradient_clip:
|
|
_target_: training.optimizer.GradientClipper
|
|
max_norm: 0.1
|
|
norm_type: 2
|
|
|
|
param_group_modifiers:
|
|
- _target_: training.optimizer.layer_decay_param_modifier
|
|
_partial_: True
|
|
layer_decay_value: 0.9
|
|
apply_to: 'image_encoder.trunk'
|
|
overrides:
|
|
- pattern: '*pos_embed*'
|
|
value: 1.0
|
|
|
|
options:
|
|
lr:
|
|
- scheduler:
|
|
_target_: fvcore.common.param_scheduler.CosineParamScheduler
|
|
start_value: ${scratch.base_lr}
|
|
end_value: ${divide:${scratch.base_lr},10}
|
|
- scheduler:
|
|
_target_: fvcore.common.param_scheduler.CosineParamScheduler
|
|
start_value: ${scratch.vision_lr}
|
|
end_value: ${divide:${scratch.vision_lr},10}
|
|
param_names:
|
|
- 'image_encoder.*'
|
|
weight_decay:
|
|
- scheduler:
|
|
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
|
value: 0.1
|
|
- scheduler:
|
|
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
|
value: 0.0
|
|
param_names:
|
|
- '*bias*'
|
|
module_cls_names: ['torch.nn.LayerNorm']
|
|
|
|
loss:
|
|
all:
|
|
_target_: training.loss_fns.MultiStepMultiMasksAndIous
|
|
weight_dict:
|
|
loss_mask: 20
|
|
loss_dice: 1
|
|
loss_iou: 1
|
|
loss_class: 1
|
|
supervise_all_iou: true
|
|
iou_use_l1_loss: true
|
|
pred_obj_scores: true
|
|
focal_gamma_obj_score: 0.0
|
|
focal_alpha_obj_score: -1.0
|
|
|
|
distributed:
|
|
backend: nccl
|
|
find_unused_parameters: True
|
|
|
|
logging:
|
|
tensorboard_writer:
|
|
_target_: training.utils.logger.make_tensorboard_logger
|
|
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
|
flush_secs: 120
|
|
should_log: True
|
|
log_dir: ${launcher.experiment_log_dir}/logs
|
|
log_freq: 10
|
|
|
|
|
|
checkpoint:
|
|
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
|
save_freq: 0
|
|
model_weight_initializer:
|
|
_partial_: True
|
|
_target_: training.utils.checkpoint_utils.load_state_dict_into_model
|
|
strict: True
|
|
ignore_unexpected_keys: null
|
|
ignore_missing_keys: null
|
|
|
|
state_dict:
|
|
_target_: training.utils.checkpoint_utils.load_checkpoint_and_apply_kernels
|
|
checkpoint_path: ./checkpoints/sam2.1_hiera_base_plus.pt
|
|
ckpt_state_dict_keys: ['model']
|
|
|
|
launcher:
|
|
num_nodes: 1
|
|
gpus_per_node: 8
|
|
experiment_log_dir: null
|
|
|
|
|
|
submitit:
|
|
partition: null
|
|
account: null
|
|
qos: null
|
|
cpus_per_task: 10
|
|
use_cluster: false
|
|
timeout_hour: 24
|
|
name: null
|
|
port_range: [10000, 65000]
|
|
|
|
|