# pytorch_lightning==2.2.2
seed_everything: 33
trainer:
  accelerator: auto
  strategy: auto
  devices: '1'
  num_nodes: 1
  precision: 16-mixed
  logger: false
model:
  class_path: diffusion_trainer.streaming_svd.StreamingSVD
  init_args:
    vfi:
      class_path: modules.params.vfi.VFIParams
      init_args:
        ckpt_path_local: checkpoint/VFI/ours.pkl
        ckpt_path_global: https://drive.google.com/file/d/1XCNoyhA1RX3m8W-XJK8H8inH47l36kxP/view?usp=sharing
    i2v_enhance:
      class_path: modules.params.i2v_enhance.I2VEnhanceParams
      init_args:
        ckpt_path_local: checkpoint/i2v_enhance/
        ckpt_path_global: ali-vilab/i2vgen-xl
    module_loader:
      class_path: modules.loader.module_loader.GenericModuleLoader
      init_args:
        pipeline_repo: stabilityai/stable-video-diffusion-img2vid-xt
        pipeline_obj: streamingt2v_pipeline
        set_prediction_type: ''
        module_names:
        - network_config
        - model
        - controlnet
        - denoiser
        - conditioner
        - first_stage_model
        - sampler
        - svd_pipeline
        module_config:
          controlnet:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.control.controlnet.ControlNet
              cls_func: from_unet
              cls_func_fast_dev_run: ''
              kwargs_diffusers: null
              model_params:
                merging_mode: addition
                zero_conv_mode: Identity
                frame_expansion: none
                downsample_controlnet_cond: true
                use_image_encoder_normalization: true
                use_controlnet_mask: false
                condition_encoder: ''
                conditioning_embedding_out_channels:
                - 32
                - 96
                - 256
                - 512
              kwargs_diff_trainer_params: null
              args: []
              dependent_modules:
                model: model
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
          network_config:
            class_path: models.diffusion.video_model.VideoUNet
            init_args:
              in_channels: 8
              model_channels: 320
              out_channels: 4
              num_res_blocks: 2
              num_conditional_frames: null
              attention_resolutions:
              - 4
              - 2
              - 1
              dropout: 0.0
              channel_mult:
              - 1
              - 2
              - 4
              - 4
              conv_resample: true
              dims: 2
              num_classes: sequential
              use_checkpoint: false
              num_heads: -1
              num_head_channels: 64
              num_heads_upsample: -1
              use_scale_shift_norm: false
              resblock_updown: false
              transformer_depth: 1
              transformer_depth_middle: null
              context_dim: 1024
              time_downup: false
              time_context_dim: null
              extra_ff_mix_layer: true
              use_spatial_context: true
              merge_strategy: learned_with_images
              merge_factor: 0.5
              spatial_transformer_attn_type: softmax-xformers
              video_kernel_size:
              - 3
              - 1
              - 1
              use_linear_in_transformer: true
              adm_in_channels: 768
              disable_temporal_crossattention: false
              max_ddpm_temb_period: 10000
              merging_mode: attention_cross_attention
              controlnet_mode: true
              use_apm: false
          model:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.svd.sgm.modules.diffusionmodules.wrappers.OpenAIWrapper
              cls_func: ''
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                compile_model: false
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args: []
              dependent_modules:
                diffusion_model: network_config
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
          denoiser:
            class_path: models.svd.sgm.modules.diffusionmodules.denoiser.Denoiser
            init_args:
              scaling_config:
                target: models.svd.sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
          sampler:
            class_path: models.svd.sgm.modules.diffusionmodules.sampling.EulerEDMSampler
            init_args:
              s_churn: 0.0
              s_tmin: 0.0
              s_tmax: .inf
              s_noise: 1.0
              discretization_config:
                target: models.diffusion.discretizer.AlignYourSteps
                params:
                  sigma_max: 700.0
              num_steps: 30
              guider_config:
                target: models.svd.sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
                params:
                  max_scale: 3.0
                  min_scale: 1.5
                  num_frames: 25
              verbose: false
              device: cuda
          conditioner:
            class_path: models.svd.sgm.modules.GeneralConditioner
            init_args:
              emb_models:
              - is_trainable: false
                input_key: cond_frames_without_noise
                target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
                params:
                  n_cond_frames: 1
                  n_copies: 1
                  open_clip_embedding_config:
                    target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
                    params:
                      freeze: true
              - input_key: fps_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              - input_key: motion_bucket_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              - input_key: cond_frames
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
                params:
                  disable_encoder_autocast: true
                  n_cond_frames: 1
                  n_copies: 1
                  is_ae: true
                  encoder_config:
                    target: models.svd.sgm.models.autoencoder.AutoencoderKLModeOnly
                    params:
                      embed_dim: 4
                      monitor: val/rec_loss
                      ddconfig:
                        attn_type: vanilla-xformers
                        double_z: true
                        z_channels: 4
                        resolution: 256
                        in_channels: 3
                        out_ch: 3
                        ch: 128
                        ch_mult:
                        - 1
                        - 2
                        - 4
                        - 4
                        num_res_blocks: 2
                        attn_resolutions: []
                        dropout: 0.0
                      lossconfig:
                        target: torch.nn.Identity
              - input_key: cond_aug
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
          first_stage_model:
            class_path: models.svd.sgm.AutoencodingEngine
            init_args:
              encoder_config:
                target: models.svd.sgm.modules.diffusionmodules.model.Encoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
              decoder_config:
                target: models.svd.sgm.modules.autoencoding.temporal_ae.VideoDecoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                  video_kernel_size:
                  - 3
                  - 1
                  - 1
              loss_config:
                target: torch.nn.Identity
              regularizer_config:
                target: models.svd.sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
              optimizer_config: null
              lr_g_factor: 1.0
              trainable_ae_params: null
              ae_optimizer_args: null
              trainable_disc_params: null
              disc_optimizer_args: null
              disc_start_iter: 0
              diff_boost_factor: 3.0
              ckpt_engine: null
              ckpt_path: null
              additional_decode_keys: null
              ema_decay: null
              monitor: null
              input_key: jpg
          svd_pipeline:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: diffusers.StableVideoDiffusionPipeline
              cls_func: from_pretrained
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                torch_dtype: torch.float16
                variant: fp16
                use_safetensors: true
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args:
              - stabilityai/stable-video-diffusion-img2vid-xt
              dependent_modules: null
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
        root_cls: null
    diff_trainer_params:
      class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.DiffusionTrainerParams
      init_args:
        scale_factor: 0.18215
        streamingsvd_ckpt:
          class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.CheckpointDescriptor
          init_args:
            ckpt_path_local: checkpoint/StreamingSVD/model.safetensors
            ckpt_path_global: PAIR/StreamingSVD/resolve/main/model.safetensors
        disable_first_stage_autocast: true
    inference_params:
      class_path: modules.params.diffusion.inference_params.T2VInferenceParams
      init_args:
        n_autoregressive_generations: 2  # Number of autoregressive generations for StreamingSVD
        num_conditional_frames: 7  # TODO: verify whether this value is actually used
        anchor_frames: '6'  # The (value + 1)-th frame is used as the CLIP encoding for StreamingSVD
        reset_seed_per_generation: true  # If true, the seed is reset on every generation
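# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the script name `main.py` and the config file
# name `streaming_svd.yaml` are illustrative, not taken from this repository).
# A pytorch_lightning LightningCLI entry point consumes a config like this via
# the --config flag; with the default subcommands that would look like:
#
#   python main.py predict --config streaming_svd.yaml
#
# Individual keys can still be overridden on the command line, e.g.
# `--seed_everything 42` or `--trainer.devices 2`.
# ---------------------------------------------------------------------------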