alibaba-pai
/

EasyAnimateV3-XL-2-InP-512x512

@@ -1,10 +1,11 @@
 ---
-license: apache-2.0
-language:
-- en
-tags:
-- text-generation-inference
 ---
 # 📷 EasyAnimate | An End-to-End Solution for High-Resolution and Long Video Generation
 😊 EasyAnimate is an end-to-end solution for generating high-resolution and long videos. It can be used to train transformer-based diffusion generators, train VAEs for processing long videos, and preprocess metadata.
@@ -57,13 +58,11 @@ cd ../../
 # Model zoo
 EasyAnimateV3:
-| Name | Type | Storage Space | Url | Hugging Face | Description |
 |--|--|--|--|--|--|
-| EasyAnimateV3-XL-2-InP-512x512.tar | EasyAnimateV3 | 16.2GB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Diffusion_Transformer/EasyAnimateV3-XL-2-InP-512x512.tar) | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-512x512) | EasyAnimateV3 official weights for 512x512 image to video resolution. Training with 144 frames and fps 24 |
-| EasyAnimateV3-XL-2-InP-768x768.tar | EasyAnimateV3 | 16.2GB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Diffusion_Transformer/EasyAnimateV3-XL-2-InP-768x768.tar) | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-768x768) | EasyAnimateV3 official weights for 768x768 image to video resolution. Training with 144 frames and fps 24 |
-| EasyAnimateV3-XL-2-InP-960x960.tar | EasyAnimateV3 | 16.2GB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Diffusion_Transformer/EasyAnimateV3-XL-2-InP-960x960.tar) | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-960x960) | EasyAnimateV3 official weights for 960x960 image to video resolution. Training with 144 frames and fps 24 |
-| easyanimatev3_minimalism_lora.safetensors | Lora of Pixart | 485.1MB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Personalized_Model/easyanimatev2_minimalism_lora.safetensors) | - | A LoRA trained on a specific type of images. Images can be downloaded from [Url](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/asset/v2/Minimalism.zip). |
 # Algorithm Details
 ### 1. Data Preprocessing

 ---
+frameworks:
+- Pytorch
+license: Apache License 2.0
+tasks:
+- text-to-video-synthesis
 ---
 # 📷 EasyAnimate | An End-to-End Solution for High-Resolution and Long Video Generation
 😊 EasyAnimate is an end-to-end solution for generating high-resolution and long videos. It can be used to train transformer-based diffusion generators, train VAEs for processing long videos, and preprocess metadata.
 # Model zoo
 EasyAnimateV3:
+| Name | Type | Storage Space | Hugging Face | Model Scope | Description |
 |--|--|--|--|--|--|
+| EasyAnimateV3-XL-2-InP-512x512.tar | EasyAnimateV3 | 18.2GB | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-512x512) | [😄Link](https://modelscope.cn/models/PAI/EasyAnimateV3-XL-2-InP-512x512) | EasyAnimateV3 official weights for text-and-image-to-video generation at 512x512 resolution. Trained with 144 frames at 24 fps. |
+| EasyAnimateV3-XL-2-InP-768x768.tar | EasyAnimateV3 | 18.2GB | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-768x768) | [😄Link](https://modelscope.cn/models/PAI/EasyAnimateV3-XL-2-InP-768x768) | EasyAnimateV3 official weights for text-and-image-to-video generation at 768x768 resolution. Trained with 144 frames at 24 fps. |
+| EasyAnimateV3-XL-2-InP-960x960.tar | EasyAnimateV3 | 18.2GB | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-960x960) | [😄Link](https://modelscope.cn/models/PAI/EasyAnimateV3-XL-2-InP-960x960) | EasyAnimateV3 official weights for text-and-image-to-video generation at 960x960 resolution. Trained with 144 frames at 24 fps. |
 # Algorithm Details
 ### 1. Data Preprocessing

transformer/config.json CHANGED Viewed

@@ -1,7 +1,9 @@
 {
   "_class_name": "Transformer3DModel",
-  "_diffusers_version": "0.27.0",
   "activation_fn": "gelu-approximate",
   "attention_bias": true,
   "attention_head_dim": 72,
   "attention_type": "default",
@@ -13,11 +15,14 @@
   "decay": 0.9999,
   "double_self_attention": false,
   "dropout": 0.0,
   "enable_uvit": true,
   "fake_3d": false,
   "in_channels": 12,
   "inv_gamma": 1.0,
   "min_decay": 0.0,
   "motion_module_kwargs_even": {
     "attention_block_types": [
       "Temporal_Self",
@@ -26,6 +31,7 @@
     "block_size": 1,
     "num_attention_heads": 16,
     "num_transformer_block": 1,
     "temporal_attention_dim_div": 1,
     "temporal_position_encoding": true,
     "temporal_position_encoding_max_len": 4096
@@ -38,6 +44,7 @@
     "block_size": 1,
     "num_attention_heads": 16,
     "num_transformer_block": 1,
     "temporal_attention_dim_div": 1,
     "temporal_position_encoding": true,
     "temporal_position_encoding_max_len": 4096
@@ -57,6 +64,8 @@
   "patch_3d": false,
   "patch_size": 2,
   "power": 0.6666666666666666,
   "sample_size": 64,
   "time_patch_size": null,
   "time_position_encoding_before_transformer": false,

 {
   "_class_name": "Transformer3DModel",
+  "_diffusers_version": "0.30.1",
   "activation_fn": "gelu-approximate",
+  "add_noise_in_inpaint_model": false,
+  "after_norm": false,
   "attention_bias": true,
   "attention_head_dim": 72,
   "attention_type": "default",
   "decay": 0.9999,
   "double_self_attention": false,
   "dropout": 0.0,
+  "enable_clip_in_inpaint": true,
+  "enable_text_attention_mask": true,
   "enable_uvit": true,
   "fake_3d": false,
   "in_channels": 12,
   "inv_gamma": 1.0,
   "min_decay": 0.0,
+  "motion_module_kwargs": null,
   "motion_module_kwargs_even": {
     "attention_block_types": [
       "Temporal_Self",
     "block_size": 1,
     "num_attention_heads": 16,
     "num_transformer_block": 1,
+    "remove_time_embedding_in_photo": false,
     "temporal_attention_dim_div": 1,
     "temporal_position_encoding": true,
     "temporal_position_encoding_max_len": 4096
     "block_size": 1,
     "num_attention_heads": 16,
     "num_transformer_block": 1,
+    "remove_time_embedding_in_photo": false,
     "temporal_attention_dim_div": 1,
     "temporal_position_encoding": true,
     "temporal_position_encoding_max_len": 4096
   "patch_3d": false,
   "patch_size": 2,
   "power": 0.6666666666666666,
+  "qk_norm": false,
+  "resize_inpaint_mask_directly": false,
   "sample_size": 64,
   "time_patch_size": null,
   "time_position_encoding_before_transformer": false,

vae/config.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "_class_name": "AutoencoderKL",
-  "_diffusers_version": "0.22.0.dev0",
   "act_fn": "silu",
   "block_out_channels": [
     128,
@@ -8,9 +8,18 @@
     512,
     512
   ],
   "down_block_types": [
-    "SpatialDownBlock3D",
-    "SpatialTemporalDownBlock3D",
     "SpatialTemporalDownBlock3D",
     "SpatialTemporalDownBlock3D"
   ],
@@ -18,19 +27,31 @@
   "in_channels": 3,
   "latent_channels": 4,
   "layers_per_block": 2,
   "norm_num_groups": 32,
   "out_channels": 3,
   "sample_size": 256,
   "scaling_factor": 0.18215,
   "slice_compression_vae": true,
-  "use_tiling": true,
-  "mid_block_attention_type": "3d",
-  "mini_batch_encoder": 8,
-  "mini_batch_decoder": 2,
   "up_block_types": [
-    "SpatialUpBlock3D",
-    "SpatialTemporalUpBlock3D",
     "SpatialTemporalUpBlock3D",
     "SpatialTemporalUpBlock3D"
-  ]
 }

 {
+  "_class_name": "AutoencoderKLMagvit",
+  "_diffusers_version": "0.30.1",
   "act_fn": "silu",
   "block_out_channels": [
     128,
     512,
     512
   ],
+  "cache_compression_vae": false,
+  "cache_mag_vae": false,
+  "ch": 128,
+  "ch_mult": [
+    1,
+    2,
+    4,
+    4
+  ],
   "down_block_types": [
+    "SpatialDownBlock3D",
+    "SpatialTemporalDownBlock3D",
     "SpatialTemporalDownBlock3D",
     "SpatialTemporalDownBlock3D"
   ],
   "in_channels": 3,
   "latent_channels": 4,
   "layers_per_block": 2,
+  "mid_block_attention_type": "3d",
+  "mid_block_num_attention_heads": 1,
+  "mid_block_type": "MidBlock3D",
+  "mid_block_use_attention": true,
+  "mini_batch_decoder": 2,
+  "mini_batch_encoder": 8,
   "norm_num_groups": 32,
+  "num_attention_heads": 1,
   "out_channels": 3,
   "sample_size": 256,
   "scaling_factor": 0.18215,
   "slice_compression_vae": true,
+  "slice_mag_vae": true,
+  "spatial_group_norm": false,
+  "tile_overlap_factor": 0.25,
+  "tile_sample_min_size": 384,
   "up_block_types": [
+    "SpatialUpBlock3D",
+    "SpatialTemporalUpBlock3D",
     "SpatialTemporalUpBlock3D",
     "SpatialTemporalUpBlock3D"
+  ],
+  "upcast_vae": false,
+  "use_gc_blocks": null,
+  "use_tiling": true,
+  "use_tiling_decoder": false,
+  "use_tiling_encoder": false
 }