# Source: Hugging Face Spaces - yslan / ObjCtrl-2.5D (Running on Zero)
# File: ObjCtrl-2.5D / configs / svd_320_576_cameractrl.yaml
# Last commit: "init" by wzhouxiff (38e3f9b, 20 days ago)
# Size: 2.26 kB
	output_dir: "output/cameractrl_model"
	pretrained_model_path: "[replace with SVD root path]"
	unet_subfolder: "unet"
	down_block_types: ['CrossAttnDownBlockSpatioTemporalPoseCond', 'CrossAttnDownBlockSpatioTemporalPoseCond', 'CrossAttnDownBlockSpatioTemporalPoseCond', 'DownBlockSpatioTemporal']
	up_block_types: ['UpBlockSpatioTemporal', 'CrossAttnUpBlockSpatioTemporalPoseCond', 'CrossAttnUpBlockSpatioTemporalPoseCond', 'CrossAttnUpBlockSpatioTemporalPoseCond']

	train_data:
	root_path: "[replace RealEstate10K root path]"
	annotation_json: "annotations/train.json"
	sample_stride: 8
	sample_n_frames: 14
	relative_pose: true
	zero_t_first_frame: true
	sample_size: [320, 576]
	rescale_fxy: true
	shuffle_frames: false
	use_flip: false

	validation_data:
	root_path: "[replace RealEstate10K root path]"
	annotation_json: "annotations/validation.json"
	sample_stride: 8
	sample_n_frames: 14
	relative_pose: true
	zero_t_first_frame: true
	sample_size: [320, 576]
	rescale_fxy: true
	shuffle_frames: false
	use_flip: false
	return_clip_name: true

	random_null_image_ratio: 0.15

	pose_encoder_kwargs:
	downscale_factor: 8
	channels: [320, 640, 1280, 1280]
	nums_rb: 2
	cin: 384
	ksize: 1
	sk: true
	use_conv: false
	compression_factor: 1
	temporal_attention_nhead: 8
	attention_block_types: ["Temporal_Self", ]
	temporal_position_encoding: true
	temporal_position_encoding_max_len: 14

	attention_processor_kwargs:
	add_spatial: false
	add_temporal: true
	attn_processor_name: 'attn1'
	pose_feature_dimensions: [320, 640, 1280, 1280]
	query_condition: true
	key_value_condition: true
	scale: 1.0

	do_sanity_check: true
	sample_before_training: false

	max_train_epoch: -1
	max_train_steps: 50000
	validation_steps: 2500
	validation_steps_tuple: [500, ]

	learning_rate: 3.e-5

	P_mean: 0.7
	P_std: 1.6
	condition_image_noise_mean: -3.0
	condition_image_noise_std: 0.5
	sample_latent: true
	first_image_cond: true

	num_inference_steps: 25
	min_guidance_scale: 1.0
	max_guidance_scale: 3.0

	num_workers: 8
	train_batch_size: 1
	checkpointing_epochs: -1
	checkpointing_steps: 10000

	mixed_precision_training: false
	enable_xformers_memory_efficient_attention: true

	global_seed: 42
	logger_interval: 10