Logging to ./
creating model and diffusion...
creating 3DAE...
length of vit_decoder.blocks: 24
init pos_embed with sincos
length of vit_decoder.blocks: 24
ignore dim_up_mlp: True
AE(
  (encoder): MVEncoderGSDynamicInp(
    (conv_in): Conv2d(10, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (down): ModuleList(
      (0): Module(
        (block): ModuleList(
          (0): ResnetBlock(
            (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
            (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 64, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
        )
        (attn): ModuleList()
        (downsample): Downsample(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2))
        )
      )
      (1): Module(
        (block): ModuleList(
          (0): ResnetBlock(
            (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
            (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (nin_shortcut): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1))
          )
        )
        (attn): ModuleList()
        (downsample): Downsample(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2))
        )
      )
      (2): Module(
        (block): ModuleList(
          (0): ResnetBlock(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))
          )
        )
        (attn): ModuleList()
        (downsample): Downsample(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2))
        )
      )
      (3): Module(
        (block): ModuleList(
          (0): ResnetBlock(
            (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
            (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
        )
        (attn): ModuleList()
      )
    )
    (mid): Module(
      (block_1): ResnetBlock(
        (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
        (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (attn_1): SpatialTransformer3D(
        (norm): GroupNorm(32, 256, eps=1e-06, affine=True)
        (proj_in): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))
        (transformer_blocks): ModuleList(
          (0): BasicTransformerBlock3D(
            (attn1): MemoryEfficientCrossAttention(
              (to_q): Linear(in_features=512, out_features=512, bias=False)
              (to_k): Linear(in_features=512, out_features=512, bias=False)
              (q_norm): Identity()
              (k_norm): Identity()
              (to_v): Linear(in_features=512, out_features=512, bias=False)
              (to_out): Sequential(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Dropout(p=0.0, inplace=False)
              )
            )
            (ff): FeedForward(
              (net): Sequential(
                (0): GEGLU(
                  (proj): Linear(in_features=512, out_features=4096, bias=True)
                )
                (1): Dropout(p=0.0, inplace=False)
                (2): Linear(in_features=2048, out_features=512, bias=True)
              )
            )
            (attn2): MemoryEfficientCrossAttention(
              (to_q): Linear(in_features=512, out_features=512, bias=False)
              (to_k): Linear(in_features=512, out_features=512, bias=False)
              (q_norm): Identity()
              (k_norm): Identity()
              (to_v): Linear(in_features=512, out_features=512, bias=False)
              (to_out): Sequential(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Dropout(p=0.0, inplace=False)
              )
            )
            (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          )
        )
        (proj_out): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
      )
      (block_2): ResnetBlock(
        (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
        (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (norm_out): GroupNorm(32, 256, eps=1e-06, affine=True)
    (conv_out): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (decoder): RodinSR_256_fusionv6_ConvQuant_liteSR_dinoInit3DAttn_SD_B_3L_C_withrollout_withSD_D_ditDecoder(
    (superresolution): ModuleDict(
      (ldm_upsample): PatchEmbedTriplane(
        (proj): Conv2d(12, 3072, kernel_size=(2, 2), stride=(2, 2), groups=3)
        (norm): Identity()
      )
      (quant_conv): Conv2d(24, 24, kernel_size=(1, 1), stride=(1, 1), groups=3)
      (conv_sr): Decoder(
        (conv_in): Conv2d(1024, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (mid): Module(
          (block_1): ResnetBlock(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
          (attn_1): MemoryEfficientAttnBlock(
            (norm): GroupNorm(32, 128, eps=1e-06, affine=True)
            (q): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
            (k): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
            (v): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
            (proj_out): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
          )
          (block_2): ResnetBlock(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
        )
        (up): ModuleList(
          (0): Module(
            (block): ModuleList(
              (0): ResnetBlock(
                (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
                (conv1): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 32, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (nin_shortcut): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
              )
              (1): ResnetBlock(
                (norm1): GroupNorm(32, 32, eps=1e-06, affine=True)
                (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 32, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              )
            )
            (attn): ModuleList()
          )
          (1): Module(
            (block): ModuleList(
              (0-1): 2 x ResnetBlock(
                (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
                (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 64, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              )
            )
            (attn): ModuleList()
            (upsample): Upsample(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
          )
          (2): Module(
            (block): ModuleList(
              (0): ResnetBlock(
                (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
                (conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 64, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (nin_shortcut): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1))
              )
              (1): ResnetBlock(
                (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
                (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 64, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              )
            )
            (attn): ModuleList()
            (upsample): Upsample(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
          )
          (3): Module(
            (block): ModuleList(
              (0-1): 2 x ResnetBlock(
                (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
                (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              )
            )
            (attn): ModuleList()
            (upsample): Upsample(
              (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
          )
        )
        (norm_out): GroupNorm(32, 32, eps=1e-06, affine=True)
        (conv_out): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (vit_decoder): DiT2(
      (blocks): ModuleList(
        (0-23): 24 x DiTBlock2(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=False)
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=False)
          (attn): MemEffAttention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (q_norm): Identity()
            (k_norm): Identity()
          )
          (mlp): FusedMLP(
            (mlp): Sequential(
              (0): Linear(in_features=1024, out_features=4096, bias=False)
              (1): FusedDropoutBias(
                (activation_pytorch): GELU(approximate='none')
              )
              (2): Linear(in_features=4096, out_features=1024, bias=False)
              (3): FusedDropoutBias(
                (activation_pytorch): Identity()
              )
            )
          )
          (adaLN_modulation): Sequential(
            (0): SiLU()
            (1): Linear(in_features=1024, out_features=6144, bias=True)
          )
        )
      )
    )
    (triplane_decoder): Triplane(
      (renderer): ImportanceRenderer(
        (ray_marcher): MipRayMarcher2()
      )
      (ray_sampler): PatchRaySampler()
      (decoder): OSGDecoder(
        (net): Sequential(
          (0): FullyConnectedLayer(in_features=32, out_features=64, activation=linear)
          (1): Softplus(beta=1.0, threshold=20.0)
          (2): FullyConnectedLayer(in_features=64, out_features=4, activation=linear)
        )
      )
    )
    (decoder_pred): None
  )
)
create dataset
joint_denoise_rec_model enables AMP to accelerate training
mark joint_denoise_rec_model loading
loading model from huggingface: yslan/LN3Diff/checkpoints/objaverse/objaverse-dit/i23d/model_joint_denoise_rec_model2990000.safetensors...
mark joint_denoise_rec_model loading finished
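The "joint_denoise_rec_model enables AMP to accelerate training" line refers to automatic mixed precision. As a point of reference only, the standard torch.cuda.amp pattern such a message alludes to looks like the sketch below; model, batch, optimizer, and loss_fn are placeholders, and this is not the project's actual training loop.

    import torch

    # Generic mixed-precision training step (sketch only, not the repo's trainer).
    scaler = torch.cuda.amp.GradScaler()

    def train_step(model, batch, optimizer, loss_fn):
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast():       # run forward + loss in reduced precision where safe
            loss = loss_fn(model(batch))
        scaler.scale(loss).backward()         # scale the loss to avoid fp16 gradient underflow
        scaler.step(optimizer)                # unscale gradients and apply the optimizer step
        scaler.update()                       # adjust the loss scale for the next iteration
        return loss.detach()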
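The final "loading model from huggingface" line names a safetensors checkpoint on the Hugging Face Hub. A minimal sketch of fetching and inspecting that file manually with huggingface_hub and safetensors might look like the following; the split of the printed path into repo_id "yslan/LN3Diff" plus a file path is an assumption, not something the log itself confirms.

    from huggingface_hub import hf_hub_download
    from safetensors.torch import load_file

    # Path taken from the log line above; the repo_id/filename split is assumed.
    ckpt_path = hf_hub_download(
        repo_id="yslan/LN3Diff",
        filename="checkpoints/objaverse/objaverse-dit/i23d/model_joint_denoise_rec_model2990000.safetensors",
    )

    state_dict = load_file(ckpt_path)   # dict mapping parameter names to torch tensors
    print(f"{len(state_dict)} tensors in checkpoint")
    for name in list(state_dict)[:5]:   # peek at a few entries before loading them into the model
        print(name, tuple(state_dict[name].shape))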
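For orientation, the smallest piece of the dump above, the triplane feature head (decoder): OSGDecoder, maps a 32-dim sampled triplane feature to 4 output channels through a 64-unit Softplus MLP. The snippet below re-creates only those printed shapes in plain PyTorch; FullyConnectedLayer is the project's own layer and is replaced here by nn.Linear, and the density/colour split of the 4 outputs is an assumption, so treat this as a shape-matching sketch rather than the actual implementation.

    import torch
    import torch.nn as nn

    # Shape-matching stand-in for the printed OSGDecoder.net (nn.Linear replaces FullyConnectedLayer).
    osg_head = nn.Sequential(
        nn.Linear(32, 64),                      # (0): 32 -> 64
        nn.Softplus(beta=1.0, threshold=20.0),  # (1): Softplus, as printed
        nn.Linear(64, 4),                       # (2): 64 -> 4
    )

    sampled = torch.randn(4096, 32)             # e.g. per-point features sampled from the triplane
    out = osg_head(sampled)                     # (4096, 4); assumed to carry density + colour features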