# File size: 3,996 Bytes
# ba30f60
# Top-level scalar settings. Key names suggest conditioning-image size and
# isosurface-extraction parameters; confirm meanings and units against the
# code that consumes this config.
cond_image_size: 512
isosurface_resolution: 160
isosurface_threshold: 9.5
radius: 0.87
# Point-diffusion setup: the keys from here through `s_churn` are scalar
# settings for the point-diffusion stage. Judging by their names they cover
# xyz/rgb scaling and bias, training/inference step counts, the noise
# schedule, and sampler parameters (verify against the consumer).
scale_factor_xyz: 9.0
scale_factor_rgb: 9.0
bias_xyz: 0.0
bias_rgb: -4.5
train_time_steps: 1024
inference_time_steps: 32
mean_type: epsilon
var_type: fixed_small
diffu_sched: sigmoid
diffu_sched_exp: 12.0
guidance_scale: 3.0
sigma_max: 120.0
s_churn: 5.0
# Camera embedder for the point-diffusion (`pdiff_`) stage. The `_cls` key
# holds a dotted class path; the mapping under `pdiff_camera_embedder`
# presumably carries that class's settings (confirm against the loader).
pdiff_camera_embedder_cls: spar3d.models.camera.LinearCameraEmbedder
pdiff_camera_embedder:
  # NOTE(review): 25 presumably = 16 (4x4 c2w) + 9 (3x3 intrinsics) - confirm.
  in_channels: 25
  out_channels: 768
  # Names of the conditioning inputs fed to the embedder.
  conditions:
    - c2w_cond
    - intrinsic_normed_cond
# Image tokenizer for the point-diffusion (`pdiff_`) stage: dotted class path
# followed by its settings mapping.
pdiff_image_tokenizer_cls: spar3d.models.tokenizers.image.DINOV2SingleImageTokenizer
pdiff_image_tokenizer:
  # Identifier of the pretrained DINOv2 checkpoint to load.
  pretrained_model_name_or_path: "facebook/dinov2-large"
  width: 512
  height: 512
  # NOTE(review): equals pdiff_camera_embedder.out_channels (768); presumably
  # these must stay in sync - confirm.
  modulation_cond_dim: 768
# Denoiser backbone for the point-diffusion (`pdiff_`) stage: dotted class
# path followed by its settings mapping.
pdiff_backbone_cls: spar3d.models.transformers.point_diffusion.PointEDenoiser
pdiff_backbone:
  # NOTE(review): 6 presumably = xyz + rgb per point - confirm.
  in_channels: 6
  out_channels: 6
  num_attention_heads: 16
  num_layers: 24
  width: 1024
  cond_dim: 1024
# Camera embedder for the main model (no `pdiff_` prefix): dotted class path
# followed by its settings mapping.
camera_embedder_cls: spar3d.models.camera.LinearCameraEmbedder
camera_embedder:
  # NOTE(review): 25 presumably = 16 (4x4 c2w) + 9 (3x3 intrinsics) - confirm.
  in_channels: 25
  out_channels: 768
  # Names of the conditioning inputs fed to the embedder.
  conditions:
    - c2w_cond
    - intrinsic_normed_cond
# Image tokenizer for the main model (no `pdiff_` prefix): dotted class path
# followed by its settings mapping.
image_tokenizer_cls: spar3d.models.tokenizers.image.DINOV2SingleImageTokenizer
image_tokenizer:
  # Identifier of the pretrained DINOv2 checkpoint to load.
  pretrained_model_name_or_path: "facebook/dinov2-large"
  width: 512
  height: 512
  # NOTE(review): equals camera_embedder.out_channels (768); presumably these
  # must stay in sync - confirm.
  modulation_cond_dim: 768
# Point-cloud tokenizer: dotted class path followed by its settings mapping.
point_embedder_cls: spar3d.models.tokenizers.point.TransformerPointTokenizer
point_embedder:
  # NOTE(review): 6 presumably = xyz + rgb per point - confirm.
  in_channels: 6
  out_channels: 1024
  num_attention_heads: 16
  attention_head_dim: 32
  num_layers: 12
# Triplane positional-embedding tokenizer: dotted class path followed by its
# settings mapping.
tokenizer_cls: spar3d.models.tokenizers.triplane.TriplaneLearnablePositionalEmbedding
tokenizer:
  plane_size: 96
  num_channels: 1024
# Main transformer backbone: dotted class path followed by its settings
# mapping.
backbone_cls: spar3d.models.transformers.backbone.TwoStreamInterleaveTransformer
backbone:
  num_attention_heads: 16
  attention_head_dim: 64
  raw_triplane_channels: 1024
  triplane_channels: 1024
  raw_image_channels: 1024  # DINO features
  num_latents: 1792
  num_blocks: 4
  num_basic_blocks: 3
# Upsampling post-processor: dotted class path followed by its settings
# mapping.
post_processor_cls: spar3d.models.network.PixelShuffleUpsampleNetwork
post_processor:
  in_channels: 1024
  out_channels: 40
  scale_factor: 4
  conv_layers: 4
# Material decoder MLP: dotted class path followed by its settings mapping.
decoder_cls: spar3d.models.network.MaterialMLP
decoder:
  # NOTE(review): 120 presumably = 3 planes x 40 post_processor out_channels -
  # confirm.
  in_channels: 120
  n_neurons: 64
  activation: silu
  # One entry per output head; each head names its output and sets its own
  # width, depth, and (optionally) output activation.
  heads:
    - name: density
      out_channels: 1
      n_hidden_layers: 2
      output_activation: trunc_exp
    - name: features
      out_channels: 3
      n_hidden_layers: 3
      output_activation: sigmoid
    - name: perturb_normal
      out_channels: 3
      n_hidden_layers: 3
      output_activation: normalize_channel_last
    # This head sets no output_activation; the consumer's default applies.
    - name: vertex_offset
      out_channels: 3
      n_hidden_layers: 2
# Image-level material estimator: dotted class path followed by its settings
# mapping.
image_estimator_cls: spar3d.models.image_estimator.clip_based_estimator.ClipBasedHeadEstimator
image_estimator:
  # One entry per estimated quantity.
  heads:
    - name: roughness
      out_channels: 1
      n_hidden_layers: 3
      output_activation: linear
      distribution_eval: mean
      add_to_decoder_features: true
      output_bias: 1.0
      shape: [-1, 1, 1]
    # NOTE(review): metallic uses `mode` where roughness uses `mean`; confirm
    # the difference is intentional.
    - name: metallic
      out_channels: 1
      n_hidden_layers: 3
      output_activation: linear
      distribution_eval: mode
      add_to_decoder_features: true
      output_bias: 1.0
      shape: [-1, 1, 1]
# Global (scene-level) estimator producing a RENI latent code: dotted class
# path followed by its settings mapping.
global_estimator_cls: spar3d.models.global_estimator.reni_estimator.ReniLatentCodeEstimator
global_estimator:
  triplane_features: 1024
  n_layers: 2
  pool: max
  # Settings forwarded to the RENI environment-map model.
  reni_env_config:
    # Settings for the RENI field itself. Booleans are written in canonical
    # lowercase form.
    reni_config:
      axis_of_invariance: z
      conditioning: Attention
      encoded_input: Directions
      equivariance: SO2
      first_omega_0: 30.0
      fixed_decoder: false
      hidden_features: 128
      hidden_layers: 9
      hidden_omega_0: 30.0
      invariant_function: VN
      last_layer_linear: true
      latent_dim: 49
      mapping_features: 128
      mapping_layers: 5
      num_attention_heads: 8
      num_attention_layers: 6
      old_implementation: false
      out_features: 3
      output_activation: exp
      positional_encoding: NeRF
    # NOTE(review): placed beside `reni_config` (environment-map resolution)
    # rather than inside it; the nesting level is inferred - confirm against
    # the consuming config class.
    resolution: 64
|