# Source: ObjCtrl-2.5D/configs/svd_320_576_cameractrl.yaml
# (author: wzhouxiff, commit 38e3f9b "init", 2.26 kB)
# Output location and pretrained checkpoint layout.
output_dir: 'output/cameractrl_model'
# Placeholder value: replace with a real path before use.
pretrained_model_path: '[replace with SVD root path]'
unet_subfolder: 'unet'

# UNet block class names; four entries for the down path, four for the up path.
down_block_types:
  - 'CrossAttnDownBlockSpatioTemporalPoseCond'
  - 'CrossAttnDownBlockSpatioTemporalPoseCond'
  - 'CrossAttnDownBlockSpatioTemporalPoseCond'
  - 'DownBlockSpatioTemporal'
up_block_types:
  - 'UpBlockSpatioTemporal'
  - 'CrossAttnUpBlockSpatioTemporalPoseCond'
  - 'CrossAttnUpBlockSpatioTemporalPoseCond'
  - 'CrossAttnUpBlockSpatioTemporalPoseCond'
# Training dataset options.
# The child keys are indented so that they belong to `train_data`. At column 0
# they would be top-level keys: `train_data` itself would be null and the keys
# would collide with the identically named keys that follow `validation_data`.
train_data:
  # Placeholder value: replace with a real path before use.
  root_path: '[replace RealEstate10K root path]'
  # presumably resolved relative to root_path — confirm against the dataset loader
  annotation_json: 'annotations/train.json'
  sample_stride: 8
  sample_n_frames: 14
  relative_pose: true
  zero_t_first_frame: true
  # Two-element size; presumably [height, width] — confirm.
  sample_size: [320, 576]
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false
# Validation dataset options.
# The child keys are indented so that they belong to `validation_data`. At
# column 0 they would be top-level keys: `validation_data` itself would be null
# and the keys would duplicate the ones that follow `train_data`.
validation_data:
  # Placeholder value: replace with a real path before use.
  root_path: '[replace RealEstate10K root path]'
  # presumably resolved relative to root_path — confirm against the dataset loader
  annotation_json: 'annotations/validation.json'
  sample_stride: 8
  sample_n_frames: 14
  relative_pose: true
  zero_t_first_frame: true
  # Two-element size; presumably [height, width] — confirm.
  sample_size: [320, 576]
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false
  # NOTE(review): assumed to be a dataset option and therefore nested here;
  # the original indentation was lost — confirm against the dataset class.
  return_clip_name: true

# NOTE(review): assumed to be a top-level training option rather than a
# dataset option — confirm against the training entry point.
random_null_image_ratio: 0.15
# Keyword arguments for the pose encoder.
# The child keys are indented so that they form the `pose_encoder_kwargs`
# mapping; at column 0 the mapping would be null and every option below would
# become an unrelated top-level key.
pose_encoder_kwargs:
  downscale_factor: 8
  # Same four values as attention_processor_kwargs.pose_feature_dimensions;
  # presumably the two must be kept in sync — confirm.
  channels: [320, 640, 1280, 1280]
  nums_rb: 2
  cin: 384
  ksize: 1
  sk: true
  use_conv: false
  compression_factor: 1
  temporal_attention_nhead: 8
  attention_block_types: ['Temporal_Self']
  temporal_position_encoding: true
  # Same value as sample_n_frames in the dataset sections; presumably it must
  # be at least the number of sampled frames — confirm.
  temporal_position_encoding_max_len: 14
# Keyword arguments for the pose-conditioned attention processors.
# The child keys are indented so that they form the
# `attention_processor_kwargs` mapping; at column 0 the mapping would be null
# and every option below would become an unrelated top-level key.
attention_processor_kwargs:
  add_spatial: false
  add_temporal: true
  attn_processor_name: 'attn1'
  # Same four values as pose_encoder_kwargs.channels; presumably the two must
  # be kept in sync — confirm.
  pose_feature_dimensions: [320, 640, 1280, 1280]
  query_condition: true
  key_value_condition: true
  # NOTE(review): assumed to be the last key of this mapping; the original
  # indentation was lost — confirm against the attention processor signature.
  scale: 1.0
# Run control.
do_sanity_check: true
sample_before_training: false

# Training length and validation cadence.
# presumably -1 means "not limited by epochs" — confirm against the trainer
max_train_epoch: -1
max_train_steps: 50000
validation_steps: 2500
validation_steps_tuple: [500]

# Optimizer.
learning_rate: 3.0e-5

# Noise parameters.
# NOTE(review): P_mean / P_std look like the mean and std of a log-normal
# noise-level distribution — confirm against the training loop.
P_mean: 0.7
P_std: 1.6
condition_image_noise_mean: -3.0
condition_image_noise_std: 0.5
# Conditioning / latent handling flags.
sample_latent: true
first_image_cond: true
# Sampling settings; presumably used for validation sampling — confirm.
num_inference_steps: 25
min_guidance_scale: 1.0
max_guidance_scale: 3.0
# Data loading and batch size.
num_workers: 8
train_batch_size: 1
# Checkpoint cadence.
# presumably -1 disables epoch-based checkpointing — confirm against the trainer
checkpointing_epochs: -1
checkpointing_steps: 10000
# Precision and attention backend.
mixed_precision_training: false
enable_xformers_memory_efficient_attention: true
# Seed and logging interval.
global_seed: 42
logger_interval: 10