Spaces: Running on Zero

02alexander committed
Commit: 71d5bf5
Parent(s): 344c16f

copy code to this repo

This view is limited to 50 files because it contains too many changes. See the raw diff for the full set.
- .gitignore +2 -0
- CMakeLists.txt +0 -18
- Cargo.lock +0 -7
- Cargo.toml +0 -198
- README.md +2 -39
- app.py +308 -0
- configs/instant-mesh-base.yaml +22 -0
- configs/instant-mesh-large.yaml +22 -0
- configs/instant-nerf-base.yaml +21 -0
- configs/instant-nerf-large.yaml +21 -0
- examples/bird.jpg +0 -0
- examples/bubble_mart_blue.png +0 -0
- examples/cake.jpg +0 -0
- examples/cartoon_dinosaur.png +0 -0
- examples/chair_armed.png +0 -0
- examples/chair_comfort.jpg +0 -0
- examples/chair_wood.jpg +0 -0
- examples/chest.jpg +0 -0
- examples/cute_horse.jpg +0 -0
- examples/cute_tiger.jpg +0 -0
- examples/earphone.jpg +0 -0
- examples/fox.jpg +0 -0
- examples/fruit.jpg +0 -0
- examples/fruit_elephant.jpg +0 -0
- examples/genshin_building.png +0 -0
- examples/genshin_teapot.png +0 -0
- examples/hatsune_miku.png +0 -0
- examples/house2.jpg +0 -0
- examples/mushroom_teapot.jpg +0 -0
- examples/pikachu.png +0 -0
- examples/plant.jpg +0 -0
- examples/robot.jpg +0 -0
- examples/sea_turtle.png +0 -0
- examples/skating_shoe.jpg +0 -0
- examples/sorting_board.png +0 -0
- examples/sword.png +0 -0
- examples/toy_car.jpg +0 -0
- examples/watermelon.png +0 -0
- examples/whitedog.png +0 -0
- examples/x_teapot.jpg +0 -0
- examples/x_toyduck.jpg +0 -0
- main.py +0 -11
- requirements.txt +27 -1
- src/__init__.py +0 -0
- src/data/__init__.py +0 -0
- src/data/objaverse.py +322 -0
- src/lib.rs +0 -1
- src/main.cpp +0 -8
- src/main.rs +0 -5
- src/model.py +313 -0
.gitignore
CHANGED
@@ -20,3 +20,5 @@ __pycache__
 .mypy_cache
 .ruff_cache
 venv
+
+shell.nix
CMakeLists.txt
DELETED
@@ -1,18 +0,0 @@
cmake_minimum_required(VERSION 3.16...3.27)

project(PROJ_NAME LANGUAGES CXX)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if(NOT DEFINED CMAKE_CXX_STANDARD)
    set(CMAKE_CXX_STANDARD 17)
endif()

# Rerun:
include(FetchContent)
FetchContent_Declare(rerun_sdk URL https://github.com/rerun-io/rerun/releases/download/0.15.1/rerun_cpp_sdk.zip)
FetchContent_MakeAvailable(rerun_sdk)

add_executable(PROJ_NAME src/main.cpp)
target_link_libraries(PROJ_NAME rerun_sdk)
target_include_directories(PROJ_NAME PRIVATE src)
Cargo.lock
DELETED
@@ -1,7 +0,0 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "new_project_name"
version = "0.1.0"
Cargo.toml
DELETED
@@ -1,198 +0,0 @@
[package]
authors = ["rerun.io <opensource@rerun.io>"]
categories = [] # TODO: fill in if you plan on publishing the crate
description = "" # TODO: fill in if you plan on publishing the crate
edition = "2021"
homepage = "https://github.com/rerun-io/new_repo_name"
include = ["LICENSE-APACHE", "LICENSE-MIT", "**/*.rs", "Cargo.toml"]
keywords = [] # TODO: fill in if you plan on publishing the crate
license = "MIT OR Apache-2.0"
name = "new_project_name"
publish = false # TODO: set to `true` if you plan on publishing the crate
readme = "README.md"
repository = "https://github.com/rerun-io/new_repo_name"
rust-version = "1.76"
version = "0.1.0"

[package.metadata.docs.rs]
all-features = true
targets = ["x86_64-unknown-linux-gnu", "wasm32-unknown-unknown"]


[features]
default = []


[dependencies]


[dev-dependencies]


[patch.crates-io]


[lints]
workspace = true


[workspace.lints.rust]
unsafe_code = "deny"

elided_lifetimes_in_paths = "warn"
future_incompatible = "warn"
nonstandard_style = "warn"
rust_2018_idioms = "warn"
rust_2021_prelude_collisions = "warn"
semicolon_in_expressions_from_macros = "warn"
trivial_numeric_casts = "warn"
unsafe_op_in_unsafe_fn = "warn" # `unsafe_op_in_unsafe_fn` may become the default in future Rust versions: https://github.com/rust-lang/rust/issues/71668
unused_extern_crates = "warn"
unused_import_braces = "warn"
unused_lifetimes = "warn"

trivial_casts = "allow"
unused_qualifications = "allow"

[workspace.lints.rustdoc]
all = "warn"
missing_crate_level_docs = "warn"

# See also clippy.toml
[workspace.lints.clippy]
as_ptr_cast_mut = "warn"
await_holding_lock = "warn"
bool_to_int_with_if = "warn"
char_lit_as_u8 = "warn"
checked_conversions = "warn"
clear_with_drain = "warn"
cloned_instead_of_copied = "warn"
dbg_macro = "warn"
debug_assert_with_mut_call = "warn"
derive_partial_eq_without_eq = "warn"
disallowed_macros = "warn" # See clippy.toml
disallowed_methods = "warn" # See clippy.toml
disallowed_names = "warn" # See clippy.toml
disallowed_script_idents = "warn" # See clippy.toml
disallowed_types = "warn" # See clippy.toml
doc_link_with_quotes = "warn"
doc_markdown = "warn"
empty_enum = "warn"
enum_glob_use = "warn"
equatable_if_let = "warn"
exit = "warn"
expl_impl_clone_on_copy = "warn"
explicit_deref_methods = "warn"
explicit_into_iter_loop = "warn"
explicit_iter_loop = "warn"
fallible_impl_from = "warn"
filter_map_next = "warn"
flat_map_option = "warn"
float_cmp_const = "warn"
fn_params_excessive_bools = "warn"
fn_to_numeric_cast_any = "warn"
from_iter_instead_of_collect = "warn"
get_unwrap = "warn"
if_let_mutex = "warn"
implicit_clone = "warn"
imprecise_flops = "warn"
index_refutable_slice = "warn"
inefficient_to_string = "warn"
infinite_loop = "warn"
into_iter_without_iter = "warn"
invalid_upcast_comparisons = "warn"
iter_not_returning_iterator = "warn"
iter_on_empty_collections = "warn"
iter_on_single_items = "warn"
iter_over_hash_type = "warn"
iter_without_into_iter = "warn"
large_digit_groups = "warn"
large_include_file = "warn"
large_stack_arrays = "warn"
large_stack_frames = "warn"
large_types_passed_by_value = "warn"
let_underscore_untyped = "warn"
let_unit_value = "warn"
linkedlist = "warn"
lossy_float_literal = "warn"
macro_use_imports = "warn"
manual_assert = "warn"
manual_clamp = "warn"
manual_instant_elapsed = "warn"
manual_let_else = "warn"
manual_ok_or = "warn"
manual_string_new = "warn"
map_err_ignore = "warn"
map_flatten = "warn"
map_unwrap_or = "warn"
match_on_vec_items = "warn"
match_same_arms = "warn"
match_wild_err_arm = "warn"
match_wildcard_for_single_variants = "warn"
mem_forget = "warn"
mismatched_target_os = "warn"
mismatching_type_param_order = "warn"
missing_assert_message = "warn"
missing_enforced_import_renames = "warn"
missing_errors_doc = "warn"
missing_safety_doc = "warn"
mut_mut = "warn"
mutex_integer = "warn"
needless_borrow = "warn"
needless_continue = "warn"
needless_for_each = "warn"
needless_pass_by_ref_mut = "warn"
needless_pass_by_value = "warn"
negative_feature_names = "warn"
nonstandard_macro_braces = "warn"
option_option = "warn"
path_buf_push_overwrite = "warn"
ptr_as_ptr = "warn"
ptr_cast_constness = "warn"
pub_without_shorthand = "warn"
rc_mutex = "warn"
readonly_write_lock = "warn"
redundant_type_annotations = "warn"
ref_option_ref = "warn"
rest_pat_in_fully_bound_structs = "warn"
same_functions_in_if_condition = "warn"
semicolon_if_nothing_returned = "warn"
should_panic_without_expect = "warn"
significant_drop_tightening = "warn"
single_match_else = "warn"
str_to_string = "warn"
string_add = "warn"
string_add_assign = "warn"
string_lit_as_bytes = "warn"
string_lit_chars_any = "warn"
string_to_string = "warn"
suspicious_command_arg_space = "warn"
suspicious_xor_used_as_pow = "warn"
todo = "warn"
too_many_lines = "warn"
trailing_empty_array = "warn"
trait_duplication_in_bounds = "warn"
tuple_array_conversions = "warn"
unchecked_duration_subtraction = "warn"
undocumented_unsafe_blocks = "warn"
unimplemented = "warn"
uninhabited_references = "warn"
uninlined_format_args = "warn"
unnecessary_box_returns = "warn"
unnecessary_safety_doc = "warn"
unnecessary_struct_initialization = "warn"
unnecessary_wraps = "warn"
unnested_or_patterns = "warn"
unused_peekable = "warn"
unused_rounding = "warn"
unused_self = "warn"
unwrap_used = "warn"
use_self = "warn"
useless_transmute = "warn"
verbose_file_reads = "warn"
wildcard_dependencies = "warn"
wildcard_imports = "warn"
zero_sized_map_values = "warn"

manual_range_contains = "allow" # this one is just worse imho
ref_patterns = "allow" # It's nice to avoid ref pattern, but there are some situations that are hard (impossible?) to express without.
README.md
CHANGED
@@ -1,40 +1,3 @@
-
-Template for our private and public repos, containing CI, CoC, etc
+## Fork of the [InstantMesh space]() but with [Rerun](https://www.rerun.io) for visualization
 
-
-
-This template should be the default for any repository of any kind, including:
-* Rust projects
-* C++ projects
-* Python projects
-* Other stuff
-
-This template includes
-* License files
-* Code of Conduct
-* Helpers for checking and linting Rust code
-  - `cargo-clippy`
-  - `cargo-deny`
-  - `rust-toolchain`
-  - …
-* CI for:
-  - Spell checking
-  - Link checking
-  - C++ checks
-  - Python checks
-  - Rust checks
-
-
-## How to use
-Start by clicking "Use this template" at https://github.com/rerun-io/rerun_template/ or follow [these instructions](https://docs.github.com/en/free-pro-team@latest/github/creating-cloning-and-archiving-repositories/creating-a-repository-from-a-template).
-
-Then follow these steps:
-* Run `scripts/template_update.py init --languages cpp,rust,python` to delete files you don't need (give the languages you need support for)
-* Search and replace all instances of `new_repo_name` with the name of the repository.
-* Search and replace all instances of `new_project_name` with the name of the project (crate/binary name).
-* Search for `TODO` and fill in all those places
-* Replace this `README.md` with something better
-* Commit!
-
-In the future you can always update this repository with the latest changes from the template by running:
-* `scripts/template_update.py update --languages cpp,rust,python`
+The resulting Huggingface space can be found [here.](https://huggingface.co/spaces/rerun/InstantMesh)
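Aside (not part of the commit): the core thing this fork adds is streaming Rerun data into the Gradio UI. Below is a minimal, hedged sketch of that pattern, assuming rerun-sdk>=0.16,<0.17 and gradio_rerun as pinned in requirements.txt; the "points" entity and the random data are illustrative only.

# Hedged sketch: stream Rerun data into a Gradio app via gradio_rerun.
import gradio as gr
import numpy as np
import rerun as rr
from gradio_rerun import Rerun


@rr.thread_local_stream("rerun_gradio_sketch")
def stream_points(n_steps):
    stream = rr.binary_stream()  # in-memory .rrd stream consumed by the viewer component
    for step in range(int(n_steps)):
        rr.set_time_sequence("step", step)
        rr.log("points", rr.Points3D(np.random.rand(100, 3)))
        yield stream.read()  # push everything logged so far to the browser


with gr.Blocks() as demo:
    n_steps = gr.Number(value=10, label="Steps", precision=0)
    viewer = Rerun(streaming=True, height=400)
    gr.Button("Run").click(stream_points, inputs=[n_steps], outputs=[viewer])

if __name__ == "__main__":
    demo.launch()

app.py below follows the same shape: a generator yields stream.read() while the InstantMesh pipeline fills the stream.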
app.py
ADDED
@@ -0,0 +1,308 @@
from __future__ import annotations

import os
import shutil
import threading
from queue import SimpleQueue
from typing import Any

import gradio as gr
import numpy as np
import rembg
import rerun as rr
import rerun.blueprint as rrb
import spaces
import torch
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
from einops import rearrange
from gradio_rerun import Rerun
from huggingface_hub import hf_hub_download
from omegaconf import OmegaConf
from PIL import Image
from pytorch_lightning import seed_everything
from torchvision.transforms import v2

from src.models.lrm_mesh import InstantMesh
from src.utils.camera_util import (
    FOV_to_intrinsics,
    get_circular_camera_poses,
    get_zero123plus_input_cameras,
)
from src.utils.infer_util import remove_background, resize_foreground
from src.utils.train_util import instantiate_from_config


def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
    """Get the rendering camera parameters."""
    c2ws = get_circular_camera_poses(M=M, radius=radius, elevation=elevation)
    if is_flexicubes:
        cameras = torch.linalg.inv(c2ws)
        cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1, 1)
    else:
        extrinsics = c2ws.flatten(-2)
        intrinsics = FOV_to_intrinsics(50.0).unsqueeze(0).repeat(M, 1, 1).float().flatten(-2)
        cameras = torch.cat([extrinsics, intrinsics], dim=-1)
        cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1)
    return cameras


###############################################################################
# Configuration.
###############################################################################


def find_cuda():
    # Check if CUDA_HOME or CUDA_PATH environment variables are set
    cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")

    if cuda_home and os.path.exists(cuda_home):
        return cuda_home

    # Search for the nvcc executable in the system's PATH
    nvcc_path = shutil.which("nvcc")

    if nvcc_path:
        # Remove the 'bin/nvcc' part to get the CUDA installation path
        cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
        return cuda_path

    return None


cuda_path = find_cuda()

if cuda_path:
    print(f"CUDA installation found at: {cuda_path}")
else:
    print("CUDA installation not found")

config_path = "configs/instant-mesh-large.yaml"
config = OmegaConf.load(config_path)
config_name = os.path.basename(config_path).replace(".yaml", "")
model_config = config.model_config
infer_config = config.infer_config

IS_FLEXICUBES = True if config_name.startswith("instant-mesh") else False

device = torch.device("cuda")

# load diffusion model
print("Loading diffusion model ...")
pipeline = DiffusionPipeline.from_pretrained(
    "sudo-ai/zero123plus-v1.2",
    custom_pipeline="zero123plus",
    torch_dtype=torch.float16,
)
pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, timestep_spacing="trailing")

# load custom white-background UNet
unet_ckpt_path = hf_hub_download(
    repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model"
)
state_dict = torch.load(unet_ckpt_path, map_location="cpu")
pipeline.unet.load_state_dict(state_dict, strict=True)

pipeline = pipeline.to(device)
print(f"type(pipeline)={type(pipeline)}")

# load reconstruction model
print("Loading reconstruction model ...")
model_ckpt_path = hf_hub_download(
    repo_id="TencentARC/InstantMesh", filename="instant_mesh_large.ckpt", repo_type="model"
)
model: InstantMesh = instantiate_from_config(model_config)
state_dict = torch.load(model_ckpt_path, map_location="cpu")["state_dict"]
state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith("lrm_generator.") and "source_camera" not in k}
model.load_state_dict(state_dict, strict=True)

model = model.to(device)

print("Loading Finished!")


def check_input_image(input_image):
    if input_image is None:
        raise gr.Error("No image uploaded!")


def preprocess(input_image, do_remove_background):
    rembg_session = rembg.new_session() if do_remove_background else None

    if do_remove_background:
        input_image = remove_background(input_image, rembg_session)
        input_image = resize_foreground(input_image, 0.85)

    return input_image


def pipeline_callback(
    log_queue: SimpleQueue, pipe: Any, step_index: int, timestep: float, callback_kwargs: dict[str, Any]
) -> dict[str, Any]:
    latents = callback_kwargs["latents"]
    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]  # type: ignore[attr-defined]
    image = pipe.image_processor.postprocess(image, output_type="np").squeeze()  # type: ignore[attr-defined]

    log_queue.put(("mvs", rr.Image(image)))
    log_queue.put(("latents", rr.Tensor(latents.squeeze())))

    return callback_kwargs


def generate_mvs(log_queue, input_image, sample_steps, sample_seed):
    seed_everything(sample_seed)

    return pipeline(
        input_image,
        num_inference_steps=sample_steps,
        callback_on_step_end=lambda *args, **kwargs: pipeline_callback(log_queue, *args, **kwargs),
    ).images[0]


def make3d(log_queue, images: Image.Image):
    global model
    if IS_FLEXICUBES:
        model.init_flexicubes_geometry(device, use_renderer=False)
    model = model.eval()

    images = np.asarray(images, dtype=np.float32) / 255.0
    images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float()  # (3, 960, 640)
    images = rearrange(images, "c (n h) (m w) -> (n m) c h w", n=3, m=2)  # (6, 3, 320, 320)

    input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device)

    images = images.unsqueeze(0).to(device)
    images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)

    with torch.no_grad():
        # get triplane
        planes = model.forward_planes(images, input_cameras)

        # get mesh
        mesh_out = model.extract_mesh(
            planes,
            use_texture_map=False,
            **infer_config,
        )

        vertices, faces, vertex_colors = mesh_out

        log_queue.put((
            "mesh",
            rr.Mesh3D(vertex_positions=vertices, vertex_colors=vertex_colors, triangle_indices=faces),
        ))

    return mesh_out


def generate_blueprint() -> rrb.Blueprint:
    return rrb.Blueprint(
        rrb.Horizontal(
            rrb.Spatial3DView(origin="mesh"),
            rrb.Grid(
                rrb.Spatial2DView(origin="z123image"),
                rrb.Spatial2DView(origin="preprocessed_image"),
                rrb.Spatial2DView(origin="mvs"),
                rrb.TensorView(
                    origin="latents",
                ),
            ),
            column_shares=[1, 1],
        ),
        collapse_panels=True,
    )


def compute(log_queue, input_image, do_remove_background, sample_steps, sample_seed):
    preprocessed_image = preprocess(input_image, do_remove_background)
    log_queue.put(("preprocessed_image", rr.Image(preprocessed_image)))

    z123_image = generate_mvs(log_queue, preprocessed_image, sample_steps, sample_seed)
    log_queue.put(("z123image", rr.Image(z123_image)))

    _mesh_out = make3d(log_queue, z123_image)

    log_queue.put("done")


@spaces.GPU
@rr.thread_local_stream("InstantMesh")
def log_to_rr(input_image, do_remove_background, sample_steps, sample_seed):
    log_queue = SimpleQueue()

    stream = rr.binary_stream()

    blueprint = generate_blueprint()
    rr.send_blueprint(blueprint)
    yield stream.read()

    handle = threading.Thread(
        target=compute, args=[log_queue, input_image, do_remove_background, sample_steps, sample_seed]
    )
    handle.start()
    while True:
        msg = log_queue.get()
        if msg == "done":
            break
        else:
            entity_path, entity = msg
            rr.log(entity_path, entity)
            yield stream.read()
    handle.join()


_HEADER_ = """
<h2><b>Duplicate of the <a href='https://huggingface.co/spaces/TencentARC/InstantMesh'>InstantMesh space</a> that uses <a href='https://rerun.io/'>Rerun</a> for visualization.</b></h2>
<h2><a href='https://github.com/TencentARC/InstantMesh' target='_blank'><b>InstantMesh: Efficient 3D Mesh Generation from a Single Image with Sparse-view Large Reconstruction Models</b></a></h2>

**InstantMesh** is a feed-forward framework for efficient 3D mesh generation from a single image based on the LRM/Instant3D architecture.

Technical report: <a href='https://arxiv.org/abs/2404.07191' target='_blank'>ArXiv</a>.
Source code: <a href='https://github.com/rerun-io/hf-example-instant-mesh'>Github</a>.
"""

with gr.Blocks() as demo:
    gr.Markdown(_HEADER_)
    with gr.Row(variant="panel"):
        with gr.Column(scale=1):
            with gr.Row():
                input_image = gr.Image(
                    label="Input Image",
                    image_mode="RGBA",
                    sources="upload",
                    # width=256,
                    # height=256,
                    type="pil",
                    elem_id="content_image",
                )
            with gr.Row():
                with gr.Group():
                    do_remove_background = gr.Checkbox(label="Remove Background", value=True)
                    sample_seed = gr.Number(value=42, label="Seed Value", precision=0)

                    sample_steps = gr.Slider(label="Sample Steps", minimum=30, maximum=75, value=75, step=5)

            with gr.Row():
                submit = gr.Button("Generate", elem_id="generate", variant="primary")

            with gr.Row(variant="panel"):
                gr.Examples(
                    examples=[os.path.join("examples", img_name) for img_name in sorted(os.listdir("examples"))],
                    inputs=[input_image],
                    label="Examples",
                    cache_examples=False,
                    examples_per_page=16,
                )

        with gr.Column(scale=2):
            viewer = Rerun(streaming=True, height=800)

            with gr.Row():
                gr.Markdown("""Try a different <b>seed value</b> if the result is unsatisfying (Default: 42).""")

    mv_images = gr.State()

    submit.click(fn=check_input_image, inputs=[input_image]).success(
        fn=log_to_rr, inputs=[input_image, do_remove_background, sample_steps, sample_seed], outputs=[viewer]
    )

demo.launch()
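Design note on app.py above: the GPU work runs in a worker thread (compute), while rr.log() happens on the generator's thread that owns the thread-local recording, so results are handed over through a SimpleQueue. The intermediate diffusion previews come from diffusers' callback_on_step_end hook. A stripped-down, hedged sketch of that callback, mirroring pipeline_callback() and assuming diffusers==0.28.2 as pinned; the "mvs" entity path matches the blueprint above.

# Hedged sketch: pull intermediate previews out of the denoising loop.
from queue import SimpleQueue
from typing import Any

import rerun as rr

log_queue: SimpleQueue = SimpleQueue()


def on_step_end(pipe: Any, step_index: int, timestep: float, callback_kwargs: dict[str, Any]) -> dict[str, Any]:
    # "latents" is in callback_on_step_end_tensor_inputs by default, so diffusers hands it to us.
    latents = callback_kwargs["latents"]
    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
    image = pipe.image_processor.postprocess(image, output_type="np").squeeze()
    # Hand the preview to the UI thread; it calls rr.log("mvs", ...) and yields the stream.
    log_queue.put(("mvs", rr.Image(image)))
    return callback_kwargs


# Passed to the pipeline call, e.g.:
#   pipeline(input_image, num_inference_steps=75, callback_on_step_end=on_step_end)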
configs/instant-mesh-base.yaml
ADDED
@@ -0,0 +1,22 @@
model_config:
  target: src.models.lrm_mesh.InstantMesh
  params:
    encoder_feat_dim: 768
    encoder_freeze: false
    encoder_model_name: facebook/dino-vitb16
    transformer_dim: 1024
    transformer_layers: 12
    transformer_heads: 16
    triplane_low_res: 32
    triplane_high_res: 64
    triplane_dim: 40
    rendering_samples_per_ray: 96
    grid_res: 128
    grid_scale: 2.1


infer_config:
  unet_path: ckpts/diffusion_pytorch_model.bin
  model_path: ckpts/instant_mesh_base.ckpt
  texture_resolution: 1024
  render_resolution: 512
configs/instant-mesh-large.yaml
ADDED
@@ -0,0 +1,22 @@
model_config:
  target: src.models.lrm_mesh.InstantMesh
  params:
    encoder_feat_dim: 768
    encoder_freeze: false
    encoder_model_name: facebook/dino-vitb16
    transformer_dim: 1024
    transformer_layers: 16
    transformer_heads: 16
    triplane_low_res: 32
    triplane_high_res: 64
    triplane_dim: 80
    rendering_samples_per_ray: 128
    grid_res: 128
    grid_scale: 2.1


infer_config:
  unet_path: ckpts/diffusion_pytorch_model.bin
  model_path: ckpts/instant_mesh_large.ckpt
  texture_resolution: 1024
  render_resolution: 512
configs/instant-nerf-base.yaml
ADDED
@@ -0,0 +1,21 @@
model_config:
  target: src.models.lrm.InstantNeRF
  params:
    encoder_feat_dim: 768
    encoder_freeze: false
    encoder_model_name: facebook/dino-vitb16
    transformer_dim: 1024
    transformer_layers: 12
    transformer_heads: 16
    triplane_low_res: 32
    triplane_high_res: 64
    triplane_dim: 40
    rendering_samples_per_ray: 96


infer_config:
  unet_path: ckpts/diffusion_pytorch_model.bin
  model_path: ckpts/instant_nerf_base.ckpt
  mesh_threshold: 10.0
  mesh_resolution: 256
  render_resolution: 384
configs/instant-nerf-large.yaml
ADDED
@@ -0,0 +1,21 @@
model_config:
  target: src.models.lrm.InstantNeRF
  params:
    encoder_feat_dim: 768
    encoder_freeze: false
    encoder_model_name: facebook/dino-vitb16
    transformer_dim: 1024
    transformer_layers: 16
    transformer_heads: 16
    triplane_low_res: 32
    triplane_high_res: 64
    triplane_dim: 80
    rendering_samples_per_ray: 128


infer_config:
  unet_path: ckpts/diffusion_pytorch_model.bin
  model_path: ckpts/instant_nerf_large.ckpt
  mesh_threshold: 10.0
  mesh_resolution: 256
  render_resolution: 384
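Aside (not part of the commit): these YAML files follow the target/params convention. The sketch below mirrors the top of app.py and assumes instantiate_from_config() imports the class named in `target` and constructs it with `params`.

# Hedged sketch: how app.py consumes one of these configs.
from omegaconf import OmegaConf

from src.utils.train_util import instantiate_from_config

config = OmegaConf.load("configs/instant-mesh-large.yaml")
model = instantiate_from_config(config.model_config)  # -> src.models.lrm_mesh.InstantMesh
infer_config = config.infer_config                    # checkpoint paths and render settings
print(type(model).__name__, infer_config.render_resolution)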
examples/bird.jpg
ADDED
examples/bubble_mart_blue.png
ADDED
examples/cake.jpg
ADDED
examples/cartoon_dinosaur.png
ADDED
examples/chair_armed.png
ADDED
examples/chair_comfort.jpg
ADDED
examples/chair_wood.jpg
ADDED
examples/chest.jpg
ADDED
examples/cute_horse.jpg
ADDED
examples/cute_tiger.jpg
ADDED
examples/earphone.jpg
ADDED
examples/fox.jpg
ADDED
examples/fruit.jpg
ADDED
examples/fruit_elephant.jpg
ADDED
examples/genshin_building.png
ADDED
examples/genshin_teapot.png
ADDED
examples/hatsune_miku.png
ADDED
examples/house2.jpg
ADDED
examples/mushroom_teapot.jpg
ADDED
examples/pikachu.png
ADDED
examples/plant.jpg
ADDED
examples/robot.jpg
ADDED
examples/sea_turtle.png
ADDED
examples/skating_shoe.jpg
ADDED
examples/sorting_board.png
ADDED
examples/sword.png
ADDED
examples/toy_car.jpg
ADDED
examples/watermelon.png
ADDED
examples/whitedog.png
ADDED
examples/x_teapot.jpg
ADDED
examples/x_toyduck.jpg
ADDED
main.py
DELETED
@@ -1,11 +0,0 @@
#!/usr/bin/env python3

from __future__ import annotations


def main() -> None:
    pass


if __name__ == "__main__":
    main()
requirements.txt
CHANGED
@@ -1 +1,27 @@
-
+spaces
+torch==2.1.0
+torchvision==0.16.0
+torchaudio==2.1.0
+pytorch-lightning==2.1.2
+einops
+omegaconf
+deepspeed
+torchmetrics
+webdataset
+accelerate
+tensorboard
+PyMCubes
+trimesh
+rembg
+transformers
+diffusers==0.28.2
+bitsandbytes
+imageio[ffmpeg]
+xatlas
+plyfile
+xformers==0.0.22.post7
+git+https://github.com/NVlabs/nvdiffrast/
+huggingface-hub
+gradio_client >= 0.12
+rerun-sdk>=0.16.0,<0.17.0
+gradio_rerun
src/__init__.py
ADDED
File without changes
src/data/__init__.py
ADDED
File without changes
src/data/objaverse.py
ADDED
@@ -0,0 +1,322 @@
from __future__ import annotations

import json
import math
import os
from pathlib import Path

import cv2
import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import webdataset as wds
from PIL import Image
from torch.utils.data import Dataset
from torch.utils.data.distributed import DistributedSampler

from src.utils.camera_util import (
    FOV_to_intrinsics,
    center_looking_at_camera_pose,
    get_surrounding_views,
)
from src.utils.train_util import instantiate_from_config


class DataModuleFromConfig(pl.LightningDataModule):
    def __init__(
        self,
        batch_size=8,
        num_workers=4,
        train=None,
        validation=None,
        test=None,
        **kwargs,
    ):
        super().__init__()

        self.batch_size = batch_size
        self.num_workers = num_workers

        self.dataset_configs = dict()
        if train is not None:
            self.dataset_configs['train'] = train
        if validation is not None:
            self.dataset_configs['validation'] = validation
        if test is not None:
            self.dataset_configs['test'] = test

    def setup(self, stage):

        if stage in ['fit']:
            self.datasets = dict((k, instantiate_from_config(self.dataset_configs[k])) for k in self.dataset_configs)
        else:
            raise NotImplementedError

    def train_dataloader(self):

        sampler = DistributedSampler(self.datasets['train'])
        return wds.WebLoader(self.datasets['train'], batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False, sampler=sampler)

    def val_dataloader(self):

        sampler = DistributedSampler(self.datasets['validation'])
        return wds.WebLoader(self.datasets['validation'], batch_size=1, num_workers=self.num_workers, shuffle=False, sampler=sampler)

    def test_dataloader(self):

        return wds.WebLoader(self.datasets['test'], batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)


class ObjaverseData(Dataset):
    def __init__(self,
        root_dir='objaverse/',
        meta_fname='valid_paths.json',
        input_image_dir='rendering_random_32views',
        target_image_dir='rendering_random_32views',
        input_view_num=6,
        target_view_num=2,
        total_view_n=32,
        fov=50,
        camera_rotation=True,
        validation=False,
    ):
        self.root_dir = Path(root_dir)
        self.input_image_dir = input_image_dir
        self.target_image_dir = target_image_dir

        self.input_view_num = input_view_num
        self.target_view_num = target_view_num
        self.total_view_n = total_view_n
        self.fov = fov
        self.camera_rotation = camera_rotation

        with open(os.path.join(root_dir, meta_fname)) as f:
            filtered_dict = json.load(f)
        paths = filtered_dict['good_objs']
        self.paths = paths

        self.depth_scale = 4.0

        len(self.paths)
        print('============= length of dataset %d =============' % len(self.paths))

    def __len__(self):
        return len(self.paths)

    def load_im(self, path, color):
        """Replace background pixel with random color in rendering."""
        pil_img = Image.open(path)

        image = np.asarray(pil_img, dtype=np.float32) / 255.
        alpha = image[:, :, 3:]
        image = image[:, :, :3] * alpha + color * (1 - alpha)

        image = torch.from_numpy(image).permute(2, 0, 1).contiguous().float()
        alpha = torch.from_numpy(alpha).permute(2, 0, 1).contiguous().float()
        return image, alpha

    def __getitem__(self, index):
        # load data
        while True:
            input_image_path = os.path.join(self.root_dir, self.input_image_dir, self.paths[index])
            target_image_path = os.path.join(self.root_dir, self.target_image_dir, self.paths[index])

            indices = np.random.choice(range(self.total_view_n), self.input_view_num + self.target_view_num, replace=False)
            input_indices = indices[:self.input_view_num]
            target_indices = indices[self.input_view_num:]

            '''background color, default: white'''
            bg_white = [1., 1., 1.]
            bg_black = [0., 0., 0.]

            image_list = []
            alpha_list = []
            depth_list = []
            normal_list = []
            pose_list = []

            try:
                input_cameras = np.load(os.path.join(input_image_path, 'cameras.npz'))['cam_poses']
                for idx in input_indices:
                    image, alpha = self.load_im(os.path.join(input_image_path, '%03d.png' % idx), bg_white)
                    normal, _ = self.load_im(os.path.join(input_image_path, '%03d_normal.png' % idx), bg_black)
                    depth = cv2.imread(os.path.join(input_image_path, '%03d_depth.png' % idx), cv2.IMREAD_UNCHANGED) / 255.0 * self.depth_scale
                    depth = torch.from_numpy(depth).unsqueeze(0)
                    pose = input_cameras[idx]
                    pose = np.concatenate([pose, np.array([[0, 0, 0, 1]])], axis=0)

                    image_list.append(image)
                    alpha_list.append(alpha)
                    depth_list.append(depth)
                    normal_list.append(normal)
                    pose_list.append(pose)

                target_cameras = np.load(os.path.join(target_image_path, 'cameras.npz'))['cam_poses']
                for idx in target_indices:
                    image, alpha = self.load_im(os.path.join(target_image_path, '%03d.png' % idx), bg_white)
                    normal, _ = self.load_im(os.path.join(target_image_path, '%03d_normal.png' % idx), bg_black)
                    depth = cv2.imread(os.path.join(target_image_path, '%03d_depth.png' % idx), cv2.IMREAD_UNCHANGED) / 255.0 * self.depth_scale
                    depth = torch.from_numpy(depth).unsqueeze(0)
                    pose = target_cameras[idx]
                    pose = np.concatenate([pose, np.array([[0, 0, 0, 1]])], axis=0)

                    image_list.append(image)
                    alpha_list.append(alpha)
                    depth_list.append(depth)
                    normal_list.append(normal)
                    pose_list.append(pose)

            except Exception as e:
                print(e)
                index = np.random.randint(0, len(self.paths))
                continue

            break

        images = torch.stack(image_list, dim=0).float()  # (6+V, 3, H, W)
        alphas = torch.stack(alpha_list, dim=0).float()  # (6+V, 1, H, W)
        depths = torch.stack(depth_list, dim=0).float()  # (6+V, 1, H, W)
        normals = torch.stack(normal_list, dim=0).float()  # (6+V, 3, H, W)
        w2cs = torch.from_numpy(np.stack(pose_list, axis=0)).float()  # (6+V, 4, 4)
        c2ws = torch.linalg.inv(w2cs).float()

        normals = normals * 2.0 - 1.0
        normals = F.normalize(normals, dim=1)
        normals = (normals + 1.0) / 2.0
        normals = torch.lerp(torch.zeros_like(normals), normals, alphas)

        # random rotation along z axis
        if self.camera_rotation:
            degree = np.random.uniform(0, math.pi * 2)
            rot = torch.tensor([
                [np.cos(degree), -np.sin(degree), 0, 0],
                [np.sin(degree), np.cos(degree), 0, 0],
                [0, 0, 1, 0],
                [0, 0, 0, 1],
            ]).unsqueeze(0).float()
            c2ws = torch.matmul(rot, c2ws)

            # rotate normals
            N, _, H, W = normals.shape
            normals = normals * 2.0 - 1.0
            normals = torch.matmul(rot[:, :3, :3], normals.view(N, 3, -1)).view(N, 3, H, W)
            normals = F.normalize(normals, dim=1)
            normals = (normals + 1.0) / 2.0
            normals = torch.lerp(torch.zeros_like(normals), normals, alphas)

        # random scaling
        if np.random.rand() < 0.5:
            scale = np.random.uniform(0.8, 1.0)
            c2ws[:, :3, 3] *= scale
            depths *= scale

        # instrinsics of perspective cameras
        K = FOV_to_intrinsics(self.fov)
        Ks = K.unsqueeze(0).repeat(self.input_view_num + self.target_view_num, 1, 1).float()

        data = {
            'input_images': images[:self.input_view_num],       # (6, 3, H, W)
            'input_alphas': alphas[:self.input_view_num],       # (6, 1, H, W)
            'input_depths': depths[:self.input_view_num],       # (6, 1, H, W)
            'input_normals': normals[:self.input_view_num],     # (6, 3, H, W)
            'input_c2ws': c2ws_input[:self.input_view_num],     # (6, 4, 4)
            'input_Ks': Ks[:self.input_view_num],               # (6, 3, 3)

            # lrm generator input and supervision
            'target_images': images[self.input_view_num:],      # (V, 3, H, W)
            'target_alphas': alphas[self.input_view_num:],      # (V, 1, H, W)
            'target_depths': depths[self.input_view_num:],      # (V, 1, H, W)
            'target_normals': normals[self.input_view_num:],    # (V, 3, H, W)
            'target_c2ws': c2ws[self.input_view_num:],          # (V, 4, 4)
            'target_Ks': Ks[self.input_view_num:],              # (V, 3, 3)

            'depth_available': 1,
        }
        return data


class ValidationData(Dataset):
    def __init__(self,
        root_dir='objaverse/',
        input_view_num=6,
        input_image_size=256,
        fov=50,
    ):
        self.root_dir = Path(root_dir)
        self.input_view_num = input_view_num
        self.input_image_size = input_image_size
        self.fov = fov

        self.paths = sorted(os.listdir(self.root_dir))
        print('============= length of dataset %d =============' % len(self.paths))

        cam_distance = 2.5
        azimuths = np.array([30, 90, 150, 210, 270, 330])
        elevations = np.array([30, -20, 30, -20, 30, -20])
        azimuths = np.deg2rad(azimuths)
        elevations = np.deg2rad(elevations)

        x = cam_distance * np.cos(elevations) * np.cos(azimuths)
        y = cam_distance * np.cos(elevations) * np.sin(azimuths)
        z = cam_distance * np.sin(elevations)

        cam_locations = np.stack([x, y, z], axis=-1)
        cam_locations = torch.from_numpy(cam_locations).float()
        c2ws = center_looking_at_camera_pose(cam_locations)
        self.c2ws = c2ws.float()
        self.Ks = FOV_to_intrinsics(self.fov).unsqueeze(0).repeat(6, 1, 1).float()

        render_c2ws = get_surrounding_views(M=8, radius=cam_distance)
        render_Ks = FOV_to_intrinsics(self.fov).unsqueeze(0).repeat(render_c2ws.shape[0], 1, 1)
        self.render_c2ws = render_c2ws.float()
        self.render_Ks = render_Ks.float()

    def __len__(self):
        return len(self.paths)

    def load_im(self, path, color):
        """Replace background pixel with random color in rendering."""
        pil_img = Image.open(path)
        pil_img = pil_img.resize((self.input_image_size, self.input_image_size), resample=Image.BICUBIC)

        image = np.asarray(pil_img, dtype=np.float32) / 255.
        if image.shape[-1] == 4:
            alpha = image[:, :, 3:]
            image = image[:, :, :3] * alpha + color * (1 - alpha)
        else:
            alpha = np.ones_like(image[:, :, :1])

        image = torch.from_numpy(image).permute(2, 0, 1).contiguous().float()
        alpha = torch.from_numpy(alpha).permute(2, 0, 1).contiguous().float()
        return image, alpha

    def __getitem__(self, index):
        # load data
        input_image_path = os.path.join(self.root_dir, self.paths[index])

        '''background color, default: white'''
        # color = np.random.uniform(0.48, 0.52)
        bkg_color = [1.0, 1.0, 1.0]

        image_list = []
        alpha_list = []

        for idx in range(self.input_view_num):
            image, alpha = self.load_im(os.path.join(input_image_path, f'{idx:03d}.png'), bkg_color)
            image_list.append(image)
            alpha_list.append(alpha)

        images = torch.stack(image_list, dim=0).float()  # (6+V, 3, H, W)
        alphas = torch.stack(alpha_list, dim=0).float()  # (6+V, 1, H, W)

        data = {
            'input_images': images,         # (6, 3, H, W)
            'input_alphas': alphas,         # (6, 1, H, W)
            'input_c2ws': self.c2ws,        # (6, 4, 4)
            'input_Ks': self.Ks,            # (6, 3, 3)

            'render_c2ws': self.render_c2ws,
            'render_Ks': self.render_Ks,
        }
        return data
src/lib.rs
DELETED
@@ -1 +0,0 @@
//! Example of a Rust library.
src/main.cpp
DELETED
@@ -1,8 +0,0 @@
#include <cstdio>

#include <rerun.hpp>

int main(int argc, const char* argv[]) {
    printf("Hello, World!\n");
    return 0;
}
src/main.rs
DELETED
@@ -1,5 +0,0 @@
//! Example of a Rust binary.

fn main() {
    println!("Hello, PROJ_NAME!");
}
src/model.py
ADDED
@@ -0,0 +1,313 @@
from __future__ import annotations

import os

import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from einops import rearrange, repeat
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
from torchvision.transforms import v2
from torchvision.utils import make_grid, save_image

from src.utils.train_util import instantiate_from_config


class MVRecon(pl.LightningModule):
    def __init__(
        self,
        lrm_generator_config,
        lrm_path=None,
        input_size=256,
        render_size=192,
    ):
        super().__init__()

        self.input_size = input_size
        self.render_size = render_size

        # init modules
        self.lrm_generator = instantiate_from_config(lrm_generator_config)
        if lrm_path is not None:
            lrm_ckpt = torch.load(lrm_path)
            self.lrm_generator.load_state_dict(lrm_ckpt['weights'], strict=False)

        self.lpips = LearnedPerceptualImagePatchSimilarity(net_type='vgg')

        self.validation_step_outputs = []

    def on_fit_start(self):
        if self.global_rank == 0:
            os.makedirs(os.path.join(self.logdir, 'images'), exist_ok=True)
            os.makedirs(os.path.join(self.logdir, 'images_val'), exist_ok=True)

    def prepare_batch_data(self, batch):
        lrm_generator_input = {}
        render_gt = {}  # for supervision

        # input images
        images = batch['input_images']
        images = v2.functional.resize(
            images, self.input_size, interpolation=3, antialias=True).clamp(0, 1)

        lrm_generator_input['images'] = images.to(self.device)

        # input cameras and render cameras
        input_c2ws = batch['input_c2ws'].flatten(-2)
        input_Ks = batch['input_Ks'].flatten(-2)
        target_c2ws = batch['target_c2ws'].flatten(-2)
        target_Ks = batch['target_Ks'].flatten(-2)
        render_cameras_input = torch.cat([input_c2ws, input_Ks], dim=-1)
        render_cameras_target = torch.cat([target_c2ws, target_Ks], dim=-1)
        render_cameras = torch.cat([render_cameras_input, render_cameras_target], dim=1)

        input_extrinsics = input_c2ws[:, :, :12]
        input_intrinsics = torch.stack([
            input_Ks[:, :, 0], input_Ks[:, :, 4],
            input_Ks[:, :, 2], input_Ks[:, :, 5],
        ], dim=-1)
        cameras = torch.cat([input_extrinsics, input_intrinsics], dim=-1)

        # add noise to input cameras
        cameras = cameras + torch.rand_like(cameras) * 0.04 - 0.02

        lrm_generator_input['cameras'] = cameras.to(self.device)
        lrm_generator_input['render_cameras'] = render_cameras.to(self.device)

        # target images
        target_images = torch.cat([batch['input_images'], batch['target_images']], dim=1)
        target_depths = torch.cat([batch['input_depths'], batch['target_depths']], dim=1)
        target_alphas = torch.cat([batch['input_alphas'], batch['target_alphas']], dim=1)

        # random crop
        render_size = np.random.randint(self.render_size, 513)
        target_images = v2.functional.resize(
            target_images, render_size, interpolation=3, antialias=True).clamp(0, 1)
        target_depths = v2.functional.resize(
            target_depths, render_size, interpolation=0, antialias=True)
        target_alphas = v2.functional.resize(
            target_alphas, render_size, interpolation=0, antialias=True)

        crop_params = v2.RandomCrop.get_params(
            target_images, output_size=(self.render_size, self.render_size))
        target_images = v2.functional.crop(target_images, *crop_params)
        target_depths = v2.functional.crop(target_depths, *crop_params)[:, :, 0:1]
        target_alphas = v2.functional.crop(target_alphas, *crop_params)[:, :, 0:1]

        lrm_generator_input['render_size'] = render_size
        lrm_generator_input['crop_params'] = crop_params

        render_gt['target_images'] = target_images.to(self.device)
        render_gt['target_depths'] = target_depths.to(self.device)
        render_gt['target_alphas'] = target_alphas.to(self.device)

        return lrm_generator_input, render_gt

    def prepare_validation_batch_data(self, batch):
        lrm_generator_input = {}

        # input images
        images = batch['input_images']
        images = v2.functional.resize(
            images, self.input_size, interpolation=3, antialias=True).clamp(0, 1)

        lrm_generator_input['images'] = images.to(self.device)

        input_c2ws = batch['input_c2ws'].flatten(-2)
        input_Ks = batch['input_Ks'].flatten(-2)

        input_extrinsics = input_c2ws[:, :, :12]
        input_intrinsics = torch.stack([
            input_Ks[:, :, 0], input_Ks[:, :, 4],
            input_Ks[:, :, 2], input_Ks[:, :, 5],
        ], dim=-1)
        cameras = torch.cat([input_extrinsics, input_intrinsics], dim=-1)

        lrm_generator_input['cameras'] = cameras.to(self.device)

        render_c2ws = batch['render_c2ws'].flatten(-2)
        render_Ks = batch['render_Ks'].flatten(-2)
        render_cameras = torch.cat([render_c2ws, render_Ks], dim=-1)

        lrm_generator_input['render_cameras'] = render_cameras.to(self.device)
        lrm_generator_input['render_size'] = 384
        lrm_generator_input['crop_params'] = None

        return lrm_generator_input

    def forward_lrm_generator(
        self,
        images,
        cameras,
        render_cameras,
        render_size=192,
        crop_params=None,
        chunk_size=1,
    ):
        planes = torch.utils.checkpoint.checkpoint(
            self.lrm_generator.forward_planes,
            images,
            cameras,
            use_reentrant=False,
        )
        frames = []
        for i in range(0, render_cameras.shape[1], chunk_size):
            frames.append(
                torch.utils.checkpoint.checkpoint(
                    self.lrm_generator.synthesizer,
                    planes,
                    cameras=render_cameras[:, i:i+chunk_size],
                    render_size=render_size,
                    crop_params=crop_params,
                    use_reentrant=False
                )
            )
        frames = {
            k: torch.cat([r[k] for r in frames], dim=1)
            for k in frames[0].keys()
        }
        return frames

    def forward(self, lrm_generator_input):
        images = lrm_generator_input['images']
        cameras = lrm_generator_input['cameras']
        render_cameras = lrm_generator_input['render_cameras']
        render_size = lrm_generator_input['render_size']
        crop_params = lrm_generator_input['crop_params']

        out = self.forward_lrm_generator(
            images,
            cameras,
            render_cameras,
            render_size=render_size,
            crop_params=crop_params,
            chunk_size=1,
        )
        render_images = torch.clamp(out['images_rgb'], 0.0, 1.0)
        render_depths = out['images_depth']
        render_alphas = torch.clamp(out['images_weight'], 0.0, 1.0)

        out = {
            'render_images': render_images,
            'render_depths': render_depths,
            'render_alphas': render_alphas,
        }
        return out

    def training_step(self, batch, batch_idx):
        lrm_generator_input, render_gt = self.prepare_batch_data(batch)

        render_out = self.forward(lrm_generator_input)

        loss, loss_dict = self.compute_loss(render_out, render_gt)

        self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True)

        if self.global_step % 1000 == 0 and self.global_rank == 0:
            B, N, C, H, W = render_gt['target_images'].shape
            N_in = lrm_generator_input['images'].shape[1]

            input_images = v2.functional.resize(
                lrm_generator_input['images'], (H, W), interpolation=3, antialias=True).clamp(0, 1)
            input_images = torch.cat(
                [input_images, torch.ones(B, N-N_in, C, H, W).to(input_images)], dim=1)

            input_images = rearrange(
                input_images, 'b n c h w -> b c h (n w)')
            target_images = rearrange(
                render_gt['target_images'], 'b n c h w -> b c h (n w)')
            render_images = rearrange(
                render_out['render_images'], 'b n c h w -> b c h (n w)')
            target_alphas = rearrange(
                repeat(render_gt['target_alphas'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
            render_alphas = rearrange(
                repeat(render_out['render_alphas'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
            target_depths = rearrange(
                repeat(render_gt['target_depths'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
            render_depths = rearrange(
                repeat(render_out['render_depths'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
            MAX_DEPTH = torch.max(target_depths)
            target_depths = target_depths / MAX_DEPTH * target_alphas
            render_depths = render_depths / MAX_DEPTH

            grid = torch.cat([
                input_images,
                target_images, render_images,
                target_alphas, render_alphas,
                target_depths, render_depths,
            ], dim=-2)
            grid = make_grid(grid, nrow=target_images.shape[0], normalize=True, value_range=(0, 1))

            save_image(grid, os.path.join(self.logdir, 'images', f'train_{self.global_step:07d}.png'))

        return loss

    def compute_loss(self, render_out, render_gt):
        # NOTE: the rgb value range of OpenLRM is [0, 1]
        render_images = render_out['render_images']
        target_images = render_gt['target_images'].to(render_images)
        render_images = rearrange(render_images, 'b n ... -> (b n) ...') * 2.0 - 1.0
        target_images = rearrange(target_images, 'b n ... -> (b n) ...') * 2.0 - 1.0

        loss_mse = F.mse_loss(render_images, target_images)
        loss_lpips = 2.0 * self.lpips(render_images, target_images)

        render_alphas = render_out['render_alphas']
        target_alphas = render_gt['target_alphas']
        loss_mask = F.mse_loss(render_alphas, target_alphas)

        loss = loss_mse + loss_lpips + loss_mask

        prefix = 'train'
        loss_dict = {}
        loss_dict.update({f'{prefix}/loss_mse': loss_mse})
        loss_dict.update({f'{prefix}/loss_lpips': loss_lpips})
        loss_dict.update({f'{prefix}/loss_mask': loss_mask})
        loss_dict.update({f'{prefix}/loss': loss})

        return loss, loss_dict

    @torch.no_grad()
    def validation_step(self, batch, batch_idx):
        lrm_generator_input = self.prepare_validation_batch_data(batch)

        render_out = self.forward(lrm_generator_input)
        render_images = render_out['render_images']
        render_images = rearrange(render_images, 'b n c h w -> b c h (n w)')

        self.validation_step_outputs.append(render_images)

    def on_validation_epoch_end(self):
        images = torch.cat(self.validation_step_outputs, dim=-1)

        all_images = self.all_gather(images)
        all_images = rearrange(all_images, 'r b c h w -> (r b) c h w')

        if self.global_rank == 0:
            image_path = os.path.join(self.logdir, 'images_val', f'val_{self.global_step:07d}.png')

            grid = make_grid(all_images, nrow=1, normalize=True, value_range=(0, 1))
            save_image(grid, image_path)
            print(f"Saved image to {image_path}")

        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        lr = self.learning_rate

        params = []

        lrm_params_fast, lrm_params_slow = [], []
        for n, p in self.lrm_generator.named_parameters():
            if 'adaLN_modulation' in n or 'camera_embedder' in n:
                lrm_params_fast.append(p)
            else:
                lrm_params_slow.append(p)
        params.append({"params": lrm_params_fast, "lr": lr, "weight_decay": 0.01 })
        params.append({"params": lrm_params_slow, "lr": lr / 10.0, "weight_decay": 0.01 })

        optimizer = torch.optim.AdamW(params, lr=lr, betas=(0.90, 0.95))
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 3000, eta_min=lr/4)

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}
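Aside (not part of the commit): src/model.py defines a pytorch_lightning LightningModule, so a training run would look roughly like the sketch below. configure_optimizers() and on_fit_start() read self.learning_rate and self.logdir, which are not set in __init__, so they are assumed to be attached by the training script; every concrete value here is illustrative only.

# Hedged sketch: driving MVRecon with a Lightning Trainer.
import pytorch_lightning as pl
from omegaconf import OmegaConf

from src.model import MVRecon

lrm_generator_config = OmegaConf.load("configs/instant-nerf-large.yaml").model_config  # illustrative choice
model = MVRecon(lrm_generator_config, input_size=256, render_size=192)
model.learning_rate = 4e-4   # read by configure_optimizers()
model.logdir = "logs/"       # read by on_fit_start() and the image-dump hooks

trainer = pl.Trainer(accelerator="gpu", devices=1, max_steps=3000)
# trainer.fit(model, datamodule=...)  # e.g. src.data.objaverse.DataModuleFromConfig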