{ "vae": { "_class_name": "AutoencoderKL", "_diffusers_version": "0.26.1", "_name_or_path": "damo-vilab/text-to-video-ms-1.7b", "act_fn": "silu", "block_out_channels": [ 128, 256, 512, 512 ], "down_block_types": [ "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D" ], "force_upcast": true, "in_channels": 3, "latent_channels": 4, "layers_per_block": 2, "norm_num_groups": 32, "out_channels": 3, "sample_size": 512, "scaling_factor": 0.18215, "up_block_types": [ "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D" ] }, "unet": { "_class_name": "UNet3DConditionModel", "_diffusers_version": "0.26.1", "_name_or_path": "damo-vilab/text-to-video-ms-1.7b", "act_fn": "silu", "attention_head_dim": 64, "block_out_channels": [ 320, 640, 1280, 1280 ], "concat": false, "cross_attention_dim": 1024, "down_block_types": [ "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D" ], "downsample_padding": 1, "in_channels": 4, "layers_per_block": 2, "merging_mode": "attention_cross_attention", "mid_block_scale_factor": 1, "norm_eps": 0.00001, "norm_num_groups": 32, "out_channels": 4, "sample_size": 32, "up_block_types": [ "UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D" ], "use_channel_expansion": false, "use_fps_conditioning": false, "use_image_embedding": true, "use_image_tokens": true, "use_repeat_context_img": true }, "resampler": { "_class_name": "ImageEmbeddingContextResampler", "_diffusers_version": "0.26.1", "cross_attention_dim": 1024, "expansion_factor": 16, "inner_dim": 1280 }, "controlnet": { "_class_name": "ControlNetModel", "_diffusers_version": "0.26.1", "act_fn": "silu", "attention_head_dim": 64, "block_out_channels": [ 320, 640, 1280, 1280 ], "class_embed_type": null, "conditioning_embedding_out_channels": [ 32, 96, 256, 512 ], "controlnet_conditioning_channel_order": "rgb", "cross_attention_dim": 1024, "down_block_types": [ "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D" ], "downsample_controlnet_cond": true, "downsample_padding": 1, "flip_sin_to_cos": true, "frame_expansion": "none", "freq_shift": 0, "global_pool_conditions": false, "in_channels": 4, "layers_per_block": 2, "merging_mode": "addition", "mid_block_scale_factor": 1, "norm_eps": 0.00001, "norm_num_groups": 32, "num_class_embeds": null, "num_frames": 8, "num_frames_conditioning": 8, "num_tranformers": 1, "only_cross_attention": false, "projection_class_embeddings_input_dim": null, "resnet_time_scale_shift": "default", "upcast_attention": false, "use_controlnet_mask": false, "use_image_embedding": false, "use_image_encoder_normalization": false, "use_image_tokens": false, "use_linear_projection": false, "use_repeat_context_img": true, "zero_conv_mode": "Identity" }, "text_encoder": { "_name_or_path": "damo-vilab/text-to-video-ms-1.7b", "architectures": [ "CLIPTextModel" ], "attention_dropout": 0, "bos_token_id": 0, "dropout": 0, "eos_token_id": 2, "hidden_act": "gelu", "hidden_size": 1024, "initializer_factor": 1, "initializer_range": 0.02, "intermediate_size": 4096, "layer_norm_eps": 0.00001, "max_position_embeddings": 77, "model_type": "clip_text_model", "num_attention_heads": 16, "num_hidden_layers": 23, "pad_token_id": 1, "projection_dim": 512, "torch_dtype": "float32", "transformers_version": "4.39.0", "vocab_size": 49408 }, "tokenizer": { "model": "ali-vilab/text-to-video-ms-1.7b", "subfolder": "tokenizer" }, "scheduler": { "_class_name": "DDIMScheduler", "_diffusers_version": "0.26.1", "beta_end": 0.012, "beta_schedule": "scaled_linear", "beta_start": 0.00085, "clip_sample": false, "clip_sample_range": 1, "dynamic_thresholding_ratio": 0.995, "num_train_timesteps": 1000, "prediction_type": "epsilon", "rescale_betas_zero_snr": false, "sample_max_value": 1, "set_alpha_to_one": false, "skip_prk_steps": true, "steps_offset": 1, "thresholding": false, "timestep_spacing": "leading", "trained_betas": null }, "num_frames": 16, "num_frames_conditioning": 8, "temp_attend_on_uncond_include_past": false, "temp_attend_on_neighborhood_of_condition_frames": false, "temporal_self_attention_mask_included_itself": false, "temporal_self_attention_only_on_conditioning": false, "spatial_attend_on_condition_frames": false, "image_encoder_version": "laion2b_s32b_b79k" }