# Top-level reconstruction settings.
# NOTE(review): the meanings below are inferred from the key names only;
# confirm against the code that consumes this config.

# Presumably the side length (pixels) of the conditioning image; same value
# as the 512x512 width/height given to the image tokenizers in this file.
cond_image_size: 512
# Presumably the grid resolution used for isosurface (mesh) extraction.
isosurface_resolution: 160
# Presumably the field level at which the isosurface is extracted.
isosurface_threshold: 9.5
# Presumably the extent of the reconstructed volume — TODO confirm units.
radius: 0.87

# Setup point diffusion
# Scale and bias values for the xyz and rgb point channels.
# NOTE(review): presumably used to normalise point positions/colours for the
# diffusion model; the exact formula (and whether the bias is applied before
# or after scaling) is not visible here — confirm in the point-diffusion code.
scale_factor_xyz: 9.0
scale_factor_rgb: 9.0
bias_xyz: 0.0
bias_rgb: -4.5

# Diffusion schedule and sampler settings.
# NOTE(review): semantics are inferred from the key names; confirm against
# the sampler implementation before changing any value.

# Presumably the number of diffusion steps at training time vs. the (smaller)
# number of steps used when sampling.
train_time_steps: 1024
inference_time_steps: 32
# Presumably what the denoiser predicts (epsilon = the noise) and which
# variance parameterisation is used.
mean_type: epsilon
var_type: fixed_small
# Noise schedule name and its shape parameter (presumably only meaningful
# for this schedule type — confirm).
diffu_sched: sigmoid
diffu_sched_exp: 12.0
# Presumably the classifier-free guidance weight used at inference.
guidance_scale: 3.0
# Presumably sampler parameters (maximum noise level, stochastic churn).
sigma_max: 120.0
s_churn: 5.0

# Camera embedder for the point-diffusion ("pdiff") stage.
# The *_cls key holds a dotted class path; the mapping below it presumably
# holds that class's constructor settings — confirm against the loader.
pdiff_camera_embedder_cls: spar3d.models.camera.LinearCameraEmbedder
pdiff_camera_embedder:
  # NOTE(review): 25 is presumably 16 (flattened 4x4 camera-to-world) plus
  # 9 (flattened 3x3 normalised intrinsics) — confirm against the embedder.
  in_channels: 25
  # Same value as pdiff_image_tokenizer.modulation_cond_dim (768).
  out_channels: 768
  # Names of the conditioning inputs fed to the embedder.
  conditions: [c2w_cond, intrinsic_normed_cond]

# Image tokenizer for the point-diffusion ("pdiff") stage.
pdiff_image_tokenizer_cls: spar3d.models.tokenizers.image.DINOV2SingleImageTokenizer
pdiff_image_tokenizer:
  # Pretrained model identifier (presumably a Hugging Face Hub id) or a
  # local path.
  pretrained_model_name_or_path: 'facebook/dinov2-large'
  # Input resolution; same value as cond_image_size (512).
  width: 512
  height: 512
  # Same value as pdiff_camera_embedder.out_channels (768); presumably the
  # camera embedding is what modulates the tokenizer — confirm.
  modulation_cond_dim: 768

# Denoiser network for the point-diffusion ("pdiff") stage.
pdiff_backbone_cls: spar3d.models.transformers.point_diffusion.PointEDenoiser
pdiff_backbone:
  # 6 channels per point in and out — presumably xyz + rgb, in line with the
  # separate xyz/rgb scale and bias settings in this file (confirm).
  in_channels: 6
  out_channels: 6
  num_attention_heads: 16
  num_layers: 24
  # Transformer width and conditioning width are both 1024.
  # NOTE(review): 1024 is presumably the DINOv2-large feature size — confirm.
  width: 1024
  cond_dim: 1024

# Camera embedder for the main (non-"pdiff") stage; its settings are
# identical to pdiff_camera_embedder in this file.
camera_embedder_cls: spar3d.models.camera.LinearCameraEmbedder
camera_embedder:
  # NOTE(review): 25 is presumably 16 (flattened 4x4 camera-to-world) plus
  # 9 (flattened 3x3 normalised intrinsics) — confirm against the embedder.
  in_channels: 25
  # Same value as image_tokenizer.modulation_cond_dim (768).
  out_channels: 768
  # Names of the conditioning inputs fed to the embedder.
  conditions: [c2w_cond, intrinsic_normed_cond]

# Image tokenizer for the main (non-"pdiff") stage; its settings are
# identical to pdiff_image_tokenizer in this file.
image_tokenizer_cls: spar3d.models.tokenizers.image.DINOV2SingleImageTokenizer
image_tokenizer:
  # Pretrained model identifier (presumably a Hugging Face Hub id) or a
  # local path.
  pretrained_model_name_or_path: 'facebook/dinov2-large'
  # Input resolution; same value as cond_image_size (512).
  width: 512
  height: 512
  # Same value as camera_embedder.out_channels (768); presumably the camera
  # embedding is what modulates the tokenizer — confirm.
  modulation_cond_dim: 768

# Tokenizer that embeds the point cloud for the main stage.
point_embedder_cls: spar3d.models.tokenizers.point.TransformerPointTokenizer
point_embedder:
  # 6 input channels per point — same as pdiff_backbone.out_channels, so
  # presumably the diffusion output (xyz + rgb) is what gets tokenized.
  in_channels: 6
  # Token width; same value as the 1024-channel triplane/backbone settings.
  out_channels: 1024
  # NOTE(review): 16 heads x 32 dims = 512, which differs from the 1024
  # out_channels (the main backbone uses 16 x 64 = 1024). Confirm this is
  # intentional and that the tokenizer projects between the two widths.
  num_attention_heads: 16
  attention_head_dim: 32
  num_layers: 12

# Learnable positional embedding that provides the triplane tokens.
tokenizer_cls: spar3d.models.tokenizers.triplane.TriplaneLearnablePositionalEmbedding
tokenizer:
  # Presumably the side length of each plane, in tokens — confirm.
  plane_size: 96
  # Same value as backbone.raw_triplane_channels (1024).
  num_channels: 1024

# Main transformer backbone operating on triplane and image streams.
backbone_cls: spar3d.models.transformers.backbone.TwoStreamInterleaveTransformer
backbone:
  # 16 heads x 64 dims = 1024, equal to triplane_channels below.
  num_attention_heads: 16
  attention_head_dim: 64
  # Same value as tokenizer.num_channels (1024).
  raw_triplane_channels: 1024
  triplane_channels: 1024
  raw_image_channels: 1024 # DINO features
  # NOTE(review): presumably the number of latent tokens in the second
  # stream; how it relates to plane_size is not visible here — confirm.
  num_latents: 1792
  # NOTE(review): presumably num_blocks interleave stages, each containing
  # num_basic_blocks transformer blocks — confirm against the backbone class.
  num_blocks: 4
  num_basic_blocks: 3

# Upsampling network applied to the backbone's triplane output.
post_processor_cls: spar3d.models.network.PixelShuffleUpsampleNetwork
post_processor:
  # Same value as backbone.triplane_channels (1024).
  in_channels: 1024
  # 40 channels out; note decoder.in_channels is 120 = 3 x 40.
  out_channels: 40
  # NOTE(review): presumably the spatial upsampling factor, i.e. a 96-wide
  # plane would become 384 wide — confirm.
  scale_factor: 4
  conv_layers: 4

# MLP decoder with one output head per predicted quantity.
decoder_cls: spar3d.models.network.MaterialMLP
decoder:
  # 120 = 3 x post_processor.out_channels (40); presumably the features of
  # the three planes concatenated — confirm.
  in_channels: 120
  n_neurons: 64
  activation: silu
  # One entry per head: name, output width, hidden-layer count and (where
  # given) the activation applied to the head's output.
  heads:
    - name: density
      out_channels: 1
      n_hidden_layers: 2
      output_activation: trunc_exp
    - name: features
      out_channels: 3
      n_hidden_layers: 3
      output_activation: sigmoid
    - name: perturb_normal
      out_channels: 3
      n_hidden_layers: 3
      output_activation: normalize_channel_last
    # No output_activation is set for this head; presumably the decoder's
    # default (possibly none) applies — confirm.
    - name: vertex_offset
      out_channels: 3
      n_hidden_layers: 2

# Image-level estimator with one head per material parameter.
image_estimator_cls: spar3d.models.image_estimator.clip_based_estimator.ClipBasedHeadEstimator
image_estimator:
  # Both heads output one channel with shape [-1, 1, 1], a linear output
  # activation and an output bias of 1.0.
  # NOTE(review): roughness is evaluated with `mean` while metallic uses
  # `mode` — confirm this asymmetry is intentional.
  heads:
    - name: roughness
      out_channels: 1
      n_hidden_layers: 3
      output_activation: linear
      distribution_eval: mean
      # Presumably appends this estimate to the decoder's feature output.
      add_to_decoder_features: true
      output_bias: 1.0
      shape: [-1, 1, 1]
    - name: metallic
      out_channels: 1
      n_hidden_layers: 3
      output_activation: linear
      distribution_eval: mode
      # Presumably appends this estimate to the decoder's feature output.
      add_to_decoder_features: true
      output_bias: 1.0
      shape: [-1, 1, 1]

# Global estimator that predicts a RENI latent code (presumably describing
# the environment illumination — confirm against the estimator class).
global_estimator_cls: spar3d.models.global_estimator.reni_estimator.ReniLatentCodeEstimator
global_estimator:
  # Same value as backbone.triplane_channels (1024).
  triplane_features: 1024
  n_layers: 2
  # Pooling operator; presumably reduces triplane features to one vector.
  pool: max
  reni_env_config:
    # Settings for the RENI model itself.
    # NOTE(review): these presumably must match the pretrained RENI
    # checkpoint; do not change values without confirming.
    reni_config:
      axis_of_invariance: z
      conditioning: Attention
      encoded_input: Directions
      equivariance: SO2
      first_omega_0: 30.0
      fixed_decoder: false
      hidden_features: 128
      hidden_layers: 9
      hidden_omega_0: 30.0
      invariant_function: VN
      last_layer_linear: true
      latent_dim: 49
      mapping_features: 128
      mapping_layers: 5
      num_attention_heads: 8
      num_attention_layers: 6
      old_implementation: false
      out_features: 3
      output_activation: exp
      positional_encoding: NeRF
    # Presumably the resolution of the environment map produced by the
    # RENI model — confirm.
    resolution: 64