# Spaces: Running on Zero
# Output directory and pretrained-model settings.
output_dir: "output/cameractrl_model"
# Placeholder value: replace with a real path before use.
pretrained_model_path: "[replace with SVD root path]"
unet_subfolder: "unet"
# Four block-type names each for the down and up paths; the named classes are
# defined outside this file.
down_block_types: ['CrossAttnDownBlockSpatioTemporalPoseCond', 'CrossAttnDownBlockSpatioTemporalPoseCond', 'CrossAttnDownBlockSpatioTemporalPoseCond', 'DownBlockSpatioTemporal']
up_block_types: ['UpBlockSpatioTemporal', 'CrossAttnUpBlockSpatioTemporalPoseCond', 'CrossAttnUpBlockSpatioTemporalPoseCond', 'CrossAttnUpBlockSpatioTemporalPoseCond']
# Training dataset settings.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation; confirm that all keys below belong under train_data.
train_data:
  # Placeholder value: replace with a real path before use.
  root_path: "[replace RealEstate10K root path]"
  annotation_json: "annotations/train.json"
  sample_stride: 8
  sample_n_frames: 14
  relative_pose: true
  zero_t_first_frame: true
  sample_size: [320, 576]  # presumably [height, width] -- TODO confirm
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false
# Validation dataset settings.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation; confirm that all keys below (including return_clip_name)
# belong under validation_data.
validation_data:
  # Placeholder value: replace with a real path before use.
  root_path: "[replace RealEstate10K root path]"
  annotation_json: "annotations/validation.json"
  sample_stride: 8
  sample_n_frames: 14
  relative_pose: true
  zero_t_first_frame: true
  sample_size: [320, 576]  # presumably [height, width] -- TODO confirm
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false
  return_clip_name: true
# NOTE(review): kept as a top-level key (not part of the preceding dataset
# section); confirm against the consumer of this config.
random_null_image_ratio: 0.15
# Keyword arguments for the pose encoder.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation; confirm that all keys below belong under pose_encoder_kwargs.
pose_encoder_kwargs:
  downscale_factor: 8
  channels: [320, 640, 1280, 1280]
  nums_rb: 2
  cin: 384
  ksize: 1
  sk: true
  use_conv: false
  compression_factor: 1
  temporal_attention_nhead: 8
  attention_block_types: ["Temporal_Self", ]
  temporal_position_encoding: true
  # Same value as sample_n_frames in the dataset sections of this file.
  temporal_position_encoding_max_len: 14
# Keyword arguments for the attention processor.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation; confirm that all keys below belong under
# attention_processor_kwargs.
attention_processor_kwargs:
  add_spatial: false
  add_temporal: true
  attn_processor_name: 'attn1'
  # Same values as pose_encoder_kwargs.channels in this file.
  pose_feature_dimensions: [320, 640, 1280, 1280]
  query_condition: true
  key_value_condition: true
  scale: 1.0
# Top-level run settings. Grouping comments below are editorial only; every
# key stays at the top level.

# Pre-training checks.
do_sanity_check: true
sample_before_training: false

# Training length and validation cadence.
# NOTE(review): -1 presumably means "not limited by epochs" -- confirm.
max_train_epoch: -1
max_train_steps: 50000
validation_steps: 2500
validation_steps_tuple: [500, ]

# Optimisation and noise-distribution parameters.
learning_rate: 3.e-5
P_mean: 0.7
P_std: 1.6
condition_image_noise_mean: -3.0
condition_image_noise_std: 0.5
sample_latent: true
first_image_cond: true

# Sampling settings.
num_inference_steps: 25
min_guidance_scale: 1.0
max_guidance_scale: 3.0

# Data loading.
num_workers: 8
train_batch_size: 1

# Checkpointing.
# NOTE(review): -1 presumably disables epoch-based checkpointing -- confirm.
checkpointing_epochs: -1
checkpointing_steps: 10000

# Runtime options.
mixed_precision_training: false
enable_xformers_memory_efficient_attention: true

global_seed: 42
logger_interval: 10