---
# Training configuration for a motion-module fine-tuning run.
#
# NOTE(review): this file was restored from a copy in which every key had been
# collapsed onto a single line (which is not parseable YAML). The nesting below
# was inferred from the empty-valued parent keys and from key names; confirm
# it against the code that consumes this config before relying on it.

image_finetune: false

output_dir: "outputs"
pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5"

# Extra keyword arguments for the UNet.
# NOTE(review): `motion_module_type` and `motion_module_kwargs` are assumed to
# belong inside this mapping - confirm.
unet_additional_kwargs:
  use_motion_module: true
  motion_module_resolutions: [1, 2, 4, 8]
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false

  motion_module_type: Vanilla
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types: ["Temporal_Self", "Temporal_Self"]
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 24
    temporal_attention_dim_div: 1
    zero_initialize: true

# Keyword arguments for the noise scheduler.
noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false

# Training dataset location and sampling settings.
# The two paths are machine-specific absolute paths; change them to match the
# environment this config is run in.
train_data:
  csv_path: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv"
  video_folder: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val"
  sample_size: 256
  sample_stride: 4
  sample_n_frames: 16

# Validation prompts and sampling settings.
# NOTE(review): `num_inference_steps` and `guidance_scale` are assumed to be
# children of `validation_data` - confirm.
# NOTE(review): "Christma" in the second prompt looks like a typo for
# "Christmas"; it is left unchanged to preserve the existing prompt text.
validation_data:
  prompts:
    - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons."
    - "A drone view of celebration with Christma tree and fireworks, starry sky - background."
    - "Robot dancing in times square."
    - "Pacific coast, carmel by the sea ocean and waves."
  num_inference_steps: 25
  guidance_scale: 8.0

# Name fragments selecting which modules are trained.
# NOTE(review): how these strings are matched (prefix, substring, ...) is not
# visible here - check the training script.
trainable_modules:
  - "motion_modules."

# Empty string as in the original; presumably means "no checkpoint" - confirm.
unet_checkpoint_path: ""

learning_rate: 1.0e-4
train_batch_size: 4

# NOTE(review): -1 presumably disables the epoch-based limit/interval so the
# step-based values apply - confirm against the training script.
max_train_epoch: -1
max_train_steps: 100
checkpointing_epochs: -1
checkpointing_steps: 60

validation_steps: 5000
validation_steps_tuple: [2, 50]

global_seed: 42
mixed_precision_training: true
enable_xformers_memory_efficient_attention: true

is_debug: false