Spaces:

LuLing
/

IScene-demo

Running on Zero

App Files Files Community

LuLing committed 20 days ago

Commit

1ff1642

verified ·

1 Parent(s): a54d77c

initial zerogpu demo

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +8 -0
CITATION.cff +32 -0
LICENSE +21 -0
NOTICE +25 -0
README.md +13 -7
app.py +59 -0
examples/DL3DV/DL3DV-garden-rgb.png +3 -0
examples/DL3DV/DL3DV-garden-seg.png +0 -0
examples/DL3DV/DL3DV-table-chair-set-rgb.png +3 -0
examples/DL3DV/DL3DV-table-chair-set-seg.png +0 -0
examples/DL3DV/DL3DV-tables-rgb.png +3 -0
examples/DL3DV/DL3DV-tables-seg.png +0 -0
examples/Gen3DSR/Gen3DSR_scene1_rgb.png +3 -0
examples/Gen3DSR/Gen3DSR_scene1_seg.png +0 -0
examples/MIDI-example/cartoon_style_07_rgb.png +3 -0
examples/MIDI-example/cartoon_style_07_seg.png +0 -0
examples/Scenethesis/SAM-3D-testing-case_rgb.png +3 -0
examples/Scenethesis/SAM-3D-testing-case_seg.png +0 -0
examples/Scenethesis/children_playroom2_rgb.png +3 -0
examples/Scenethesis/children_playroom2_seg.png +0 -0
examples/Scenethesis/scenethesis-reading-corner-rgb.png +0 -0
examples/Scenethesis/scenethesis-reading-corner-seg.png +0 -0
examples/outdoor/scene_beach2_rgb.png +3 -0
examples/outdoor/scene_beach2_seg.png +0 -0
interactive_demo.py +585 -0
iscene/inference/__init__.py +0 -0
iscene/inference/inferencer.py +503 -0
iscene/inference/segmentation_utils.py +77 -0
iscene/trellis/__init__.py +7 -0
iscene/trellis/models/__init__.py +55 -0
iscene/trellis/models/image_conditioner.py +134 -0
iscene/trellis/models/sparse_structure_flow.py +201 -0
iscene/trellis/models/sparse_structure_sc_flow.py +111 -0
iscene/trellis/models/sparse_structure_vae.py +306 -0
iscene/trellis/models/structured_latent_flow.py +267 -0
iscene/trellis/models/structured_latent_vae/__init__.py +4 -0
iscene/trellis/models/structured_latent_vae/base.py +117 -0
iscene/trellis/models/structured_latent_vae/decoder_gs.py +122 -0
iscene/trellis/models/structured_latent_vae/decoder_mesh.py +167 -0
iscene/trellis/modules/attention/__init__.py +36 -0
iscene/trellis/modules/attention/full_attn.py +140 -0
iscene/trellis/modules/attention/modules.py +342 -0
iscene/trellis/modules/attention_resample.py +77 -0
iscene/trellis/modules/norm.py +24 -0
iscene/trellis/modules/sparse/__init__.py +102 -0
iscene/trellis/modules/sparse/attention/__init__.py +4 -0
iscene/trellis/modules/sparse/attention/full_attn.py +215 -0
iscene/trellis/modules/sparse/attention/modules.py +139 -0
iscene/trellis/modules/sparse/attention/serialized_attn.py +193 -0
iscene/trellis/modules/sparse/attention/windowed_attn.py +150 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/DL3DV/DL3DV-garden-rgb.png filter=lfs diff=lfs merge=lfs -text
+examples/DL3DV/DL3DV-table-chair-set-rgb.png filter=lfs diff=lfs merge=lfs -text
+examples/DL3DV/DL3DV-tables-rgb.png filter=lfs diff=lfs merge=lfs -text
+examples/Gen3DSR/Gen3DSR_scene1_rgb.png filter=lfs diff=lfs merge=lfs -text
+examples/MIDI-example/cartoon_style_07_rgb.png filter=lfs diff=lfs merge=lfs -text
+examples/Scenethesis/SAM-3D-testing-case_rgb.png filter=lfs diff=lfs merge=lfs -text
+examples/Scenethesis/children_playroom2_rgb.png filter=lfs diff=lfs merge=lfs -text
+examples/outdoor/scene_beach2_rgb.png filter=lfs diff=lfs merge=lfs -text

CITATION.cff ADDED Viewed

	@@ -0,0 +1,32 @@

+cff-version: 1.2.0
+title: "I-Scene: 3D Instance Models are Implicit Generalizable Spatial Learners"
+message: "If you use I-Scene, please cite the I-Scene paper and the TRELLIS paper."
+url: "https://luling06.github.io/I-Scene-web-page/"
+repository-code: "https://github.com/LuLing06/I-Scene-project"
+authors:
+  - family-names: "Ling"
+    given-names: "Lu"
+  - family-names: "Ge"
+    given-names: "Yunhao"
+  - family-names: "Sheng"
+    given-names: "Yichen"
+  - family-names: "Bera"
+    given-names: "Aniket"
+date-released: 2026-05-05
+references:
+  - type: article
+    title: "I-Scene: 3D Instance Models are Implicit Generalizable Spatial Learners"
+    authors:
+      - family-names: "Ling"
+        given-names: "Lu"
+      - family-names: "Ge"
+        given-names: "Yunhao"
+      - family-names: "Sheng"
+        given-names: "Yichen"
+      - family-names: "Bera"
+        given-names: "Aniket"
+    journal: "arXiv preprint arXiv:2512.13683"
+    year: 2025
+  - type: article
+    title: "Structured 3D Latents for Scalable and Versatile 3D Generation"
+    url: "https://trellis3d.github.io/"

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2026 Lu Ling
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

NOTICE ADDED Viewed

	@@ -0,0 +1,25 @@

+I-Scene
+This repository contains the I-Scene inference code and the IScene-v1 model
+package for segmentation-conditioned 3D scene generation.
+IScene-v1:
+  Model package: IScene-v1
+  Project: https://luling06.github.io/I-Scene-web-page/
+  Code: https://github.com/LuLing06/I-Scene-project
+  Hugging Face repository: https://huggingface.co/LuLing/IScene
+  Contents: IScene-specific checkpoint files and inference configuration
+  Base model: microsoft/TRELLIS-image-large
+I-Scene builds on TRELLIS, the image-to-3D generation framework released by
+Microsoft under the MIT License.
+TRELLIS:
+  Repository: https://github.com/microsoft/TRELLIS
+  Model: https://huggingface.co/microsoft/TRELLIS-image-large
+  Paper: Structured 3D Latents for Scalable and Versatile 3D Generation
+The IScene-v1 model package provides I-Scene-specific checkpoint files and loads
+TRELLIS base components from `microsoft/TRELLIS-image-large`. The TRELLIS
+copyright notice and license terms should be preserved when redistributing code
+or model packages derived from TRELLIS.

README.md CHANGED Viewed

@@ -1,15 +1,21 @@
 ---
-title: IScene Demo
-emoji: 📉
-colorFrom: gray
 colorTo: yellow
 sdk: gradio
-sdk_version: 6.14.0
-python_version: '3.12'
 app_file: app.py
 pinned: false
 license: mit
-short_description: I-Scene online demo
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: I-Scene Demo
+emoji: 🏠
+colorFrom: yellow
 colorTo: yellow
 sdk: gradio
+sdk_version: 4.44.1
 app_file: app.py
 pinned: false
+suggested_hardware: zero-a10g
 license: mit
+short_description: Interactive I-Scene 3D scene generation demo
 ---
+# I-Scene Demo
+This Space runs the I-Scene interactive demo with the public checkpoint:
+https://huggingface.co/LuLing/IScene
+The first run may be slow because model checkpoints need to be downloaded and cached.

app.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from __future__ import annotations
+import functools
+import os
+os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "False")
+os.environ.setdefault("HF_HOME", "/data/.cache/huggingface")
+os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache/huggingface/transformers")
+import spaces
+import torch
+import interactive_demo
+def _configure_runtime_device() -> None:
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = torch.bfloat16 if device == "cuda" else torch.float32
+    if interactive_demo.DEVICE != device or interactive_demo.DTYPE != dtype:
+        interactive_demo._sam_cache.clear()
+    interactive_demo.DEVICE = device
+    interactive_demo.DTYPE = dtype
+# Keep references to the original demo handlers; the GPU-decorated wrappers
+# defined below delegate to these after the module attributes are replaced.
+_run_segmentation = interactive_demo.run_segmentation
+_run_gaussian_preview = interactive_demo.run_gaussian_preview
+_run_glb_export = interactive_demo.run_glb_export
+@spaces.GPU(duration=120)
+def run_segmentation(*args, **kwargs):
+    _configure_runtime_device()
+    return _run_segmentation(*args, **kwargs)
+@spaces.GPU(duration=180)
+def run_gaussian_preview(*args, **kwargs):
+    _configure_runtime_device()
+    return _run_gaussian_preview(*args, **kwargs)
+@spaces.GPU(duration=240)
+def run_glb_export(*args, **kwargs):
+    _configure_runtime_device()
+    yield from _run_glb_export(*args, **kwargs)
+# Replace the demo module's handlers with the GPU-decorated wrappers before
+# build_demo() runs, since build_demo() looks the handlers up by module-level name.
+interactive_demo.run_segmentation = run_segmentation
+interactive_demo.run_gaussian_preview = run_gaussian_preview
+interactive_demo.run_glb_export = run_glb_export
+# Model ids can be overridden through environment variables; an empty
+# ISCENE_BASE_MODEL is treated the same as an unset one.
+interactive_demo.MODEL_ID = os.environ.get("ISCENE_MODEL", interactive_demo.DEFAULT_MODEL)
+interactive_demo.BASE_MODEL_ID = os.environ.get("ISCENE_BASE_MODEL") or None
+# Make sure the output and upload directories exist before the UI is built.
+interactive_demo.DEFAULT_OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
+interactive_demo.UPLOAD_ROOT.mkdir(parents=True, exist_ok=True)
+demo = interactive_demo.build_demo()
+demo.queue()
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

examples/DL3DV/DL3DV-garden-rgb.png ADDED Viewed

Git LFS Details

SHA256: e26cc42b8ed2312941ab632f8846d21d9cecd1f5dea18ba34798d7e80d2a22fe
Pointer size: 132 Bytes
Size of remote file: 4.37 MB

examples/DL3DV/DL3DV-garden-seg.png ADDED Viewed

examples/DL3DV/DL3DV-table-chair-set-rgb.png ADDED Viewed

Git LFS Details

SHA256: ae76f97c7f09e4932d6a73810891e41ebbaa6cc6c61ce093ae17b175a6a7dd48
Pointer size: 131 Bytes
Size of remote file: 877 kB

examples/DL3DV/DL3DV-table-chair-set-seg.png ADDED Viewed

examples/DL3DV/DL3DV-tables-rgb.png ADDED Viewed

Git LFS Details

SHA256: f30330e4d6661733f3bc551a02c1567a69942806822f5a4de1b758cfa51a6cf4
Pointer size: 132 Bytes
Size of remote file: 3.35 MB

examples/DL3DV/DL3DV-tables-seg.png ADDED Viewed

examples/Gen3DSR/Gen3DSR_scene1_rgb.png ADDED Viewed

Git LFS Details

SHA256: f1fe5da5fc2b15a427ce59833d3e17488ccedc2237ffc52f003badfb7d08b833
Pointer size: 132 Bytes
Size of remote file: 1.93 MB

examples/Gen3DSR/Gen3DSR_scene1_seg.png ADDED Viewed

examples/MIDI-example/cartoon_style_07_rgb.png ADDED Viewed

Git LFS Details

SHA256: 9142b580956a91ee0120df93b2698fd6347293f7d079e54bb71846b94d088cb3
Pointer size: 132 Bytes
Size of remote file: 1.07 MB

examples/MIDI-example/cartoon_style_07_seg.png ADDED Viewed

examples/Scenethesis/SAM-3D-testing-case_rgb.png ADDED Viewed

Git LFS Details

SHA256: 6aaa5403abdbd8d1034f6d817b72eb8306829e1e40174ed3f8211206c12e618a
Pointer size: 132 Bytes
Size of remote file: 2.24 MB

examples/Scenethesis/SAM-3D-testing-case_seg.png ADDED Viewed

examples/Scenethesis/children_playroom2_rgb.png ADDED Viewed

Git LFS Details

SHA256: b0eec676a580885e65e8ba59a6e325729a35f2ba27a079c45ce6e2b990958a05
Pointer size: 131 Bytes
Size of remote file: 453 kB

examples/Scenethesis/children_playroom2_seg.png ADDED Viewed

examples/Scenethesis/scenethesis-reading-corner-rgb.png ADDED Viewed

examples/Scenethesis/scenethesis-reading-corner-seg.png ADDED Viewed

examples/outdoor/scene_beach2_rgb.png ADDED Viewed

Git LFS Details

SHA256: efa3144f87a622310adacc3cc2b212a52a80e75b9486cf864628606b4c42a009
Pointer size: 131 Bytes
Size of remote file: 525 kB

examples/outdoor/scene_beach2_seg.png ADDED Viewed

interactive_demo.py ADDED Viewed

	@@ -0,0 +1,585 @@

+"""Interactive I-Scene demo.
+Run from the repository root:
+    python interactive_demo.py
+"""
+from __future__ import annotations
+import argparse
+import os
+import uuid
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "False")
+import gradio as gr
+import numpy as np
+import torch
+from gradio_image_prompter import ImagePrompter
+from gradio_litmodel3d import LitModel3D
+from PIL import Image
+from transformers import AutoModelForMaskGeneration, AutoProcessor
+from iscene.inference.inferencer import ISceneInferencer
+# Directory containing this file; examples and outputs are resolved against it.
+REPO_ROOT = Path(__file__).resolve().parent
+# Model selection. MODEL_ID / BASE_MODEL_ID are module globals that main()
+# (or an importing launcher) may overwrite before the first inference.
+DEFAULT_MODEL = "LuLing/IScene"
+MODEL_ID = DEFAULT_MODEL
+BASE_MODEL_ID: str | None = None
+# Initial values of the "Seed" and "GLB mesh simplify ratio" controls.
+DEFAULT_SEED = 43
+DEFAULT_SIMPLIFY = 0.95
+# Generated scenes go under DEFAULT_OUTPUT_ROOT; uploads and SAM masks under UPLOAD_ROOT.
+DEFAULT_OUTPUT_ROOT = REPO_ROOT / "outputs" / "demo"
+UPLOAD_ROOT = DEFAULT_OUTPUT_ROOT / "_uploads"
+# Size every uploaded image is resized to by resize_prompt_image().
+TARGET_SIZE = (512, 512)
+# Device and dtype used when loading SAM models.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
+# Dropdown label -> Hugging Face model id.
+SAM_MODELS = {
+    "sam-vit-huge (best quality, 636M)": "facebook/sam-vit-huge",
+    "sam-vit-large (balanced, 308M)": "facebook/sam-vit-large",
+    "sam-vit-base (fastest, 91M)": "facebook/sam-vit-base",
+}
+# Introductory text rendered at the top of the page.
+MARKDOWN = """
+# I-Scene Interactive Demo
+Generate a 3D scene from one image.
+Workflow:
+1. Pick an example, or upload an image and draw boxes around objects.
+2. Use the example mask, or click **Run SAM Segmentation** to create a mask.
+3. Click **Generate Gaussian Splatting Preview** to create and preview `scene_pred.ply`.
+4. Click **Generate GLB** only when you need mesh assets.
+5. To save each instance in the scene, run the inference code with the same RGB/mask; `run_inference.py` writes per-instance assets alongside the scene output.
+Note: The first run may be slow because the model checkpoint needs to be downloaded and cached.
+"""
+# Example RGB images, relative to the examples directory, in display order.
+EXAMPLE_ORDER = [
+    "Scenethesis/SAM-3D-testing-case_rgb.png",
+    "Gen3DSR/Gen3DSR_scene1_rgb.png",
+    "MIDI-example/cartoon_style_07_rgb.png",
+    "Scenethesis/children_playroom2_rgb.png",
+    "Scenethesis/scenethesis-reading-corner-rgb.png",
+    "DL3DV/DL3DV-garden-rgb.png",
+    "DL3DV/DL3DV-table-chair-set-rgb.png",
+    "DL3DV/DL3DV-tables-rgb.png",
+    "outdoor/scene_beach2_rgb.png",
+]
+def _discover_examples() -> list[tuple[str, Path, Path]]:
+    examples_root = REPO_ROOT / "examples"
+    pairs: list[tuple[str, Path, Path]] = []
+    for rel_name in EXAMPLE_ORDER:
+        rgb_path = examples_root / rel_name
+        if not rgb_path.exists():
+            continue
+        seg_path = None
+        if "_rgb" in rgb_path.name:
+            seg_path = rgb_path.with_name(rgb_path.name.replace("_rgb", "_seg"))
+        elif "-rgb" in rgb_path.name:
+            seg_path = rgb_path.with_name(rgb_path.name.replace("-rgb", "-seg"))
+        if seg_path is None or not seg_path.exists():
+            continue
+        rel = rgb_path.relative_to(examples_root)
+        case_name = rgb_path.stem.replace("_rgb", "").replace("-rgb", "")
+        label = f"{rel.parent.as_posix()} / {case_name}"
+        pairs.append((label, rgb_path, seg_path))
+    return pairs
+# Example pairs found on disk at import time.
+EXAMPLES = _discover_examples()
+# One row per example for gr.Examples: an ImagePrompter-style dict for the RGB
+# image followed by the mask file path.
+EXAMPLE_ROWS = [[{"image": str(rgb)}, str(mask)] for _, rgb, mask in EXAMPLES]
+@dataclass
+class DemoRunState:
+    """Inputs of the latest Gaussian preview run, reused by the GLB export."""
+    # Path of the RGB image that was saved for the run.
+    rgb_path: str
+    # Path of the instance mask used for the run.
+    mask_path: str
+    # Directory the scene outputs were written to.
+    output_dir: str
+    # Seed passed to the inferencer.
+    seed: int
+    # Simplify ratio passed to the inferencer for the preview run.
+    simplify: float
+# Loaded SAM (processor, model) pairs, keyed by Hugging Face model id.
+_sam_cache: dict[str, tuple[AutoProcessor, AutoModelForMaskGeneration]] = {}
+# Loaded inferencers, keyed by (model id, base model id or "").
+_inferencer_cache: dict[tuple[str, str], ISceneInferencer] = {}
+def _make_session_dir(request: gr.Request | None, root: Path = UPLOAD_ROOT) -> Path:
+    session_hash = getattr(request, "session_hash", None) or uuid.uuid4().hex[:10]
+    path = root / session_hash
+    path.mkdir(parents=True, exist_ok=True)
+    return path
+def _timestamped_output_dir(request: gr.Request | None) -> Path:
+    session_hash = getattr(request, "session_hash", None) or uuid.uuid4().hex[:10]
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    return DEFAULT_OUTPUT_ROOT / f"{timestamp}_{session_hash}"
+def _get_prompt_image(image_prompts: Any) -> Image.Image | None:
+    if image_prompts is None:
+        return None
+    if isinstance(image_prompts, dict):
+        image = image_prompts.get("image")
+    else:
+        image = image_prompts
+    if image is None:
+        return None
+    if isinstance(image, Image.Image):
+        return image.convert("RGB")
+    return Image.open(image).convert("RGB")
+def _save_prompt_rgb(image_prompts: Any, request: gr.Request | None) -> Path:
+    image = _get_prompt_image(image_prompts)
+    if image is None:
+        raise gr.Error("Please upload an RGB image.")
+    session_dir = _make_session_dir(request)
+    path = session_dir / "input_rgb.png"
+    image.save(path)
+    return path
+def _resolve_mask_path(mask_path: str | None) -> Path:
+    if not mask_path:
+        raise gr.Error("Please choose an example or run SAM segmentation first.")
+    path = Path(mask_path)
+    if not path.exists():
+        raise gr.Error(f"Mask file does not exist: {path}")
+    return path
+def _get_inferencer() -> ISceneInferencer:
+    key = (MODEL_ID, BASE_MODEL_ID or "")
+    if key not in _inferencer_cache:
+        _inferencer_cache[key] = ISceneInferencer.from_pretrained(MODEL_ID, base_model_id=BASE_MODEL_ID)
+    return _inferencer_cache[key]
+def _get_sam_model(model_choice: str) -> tuple[AutoProcessor, AutoModelForMaskGeneration]:
+    model_id = SAM_MODELS[model_choice]
+    if model_id in _sam_cache:
+        return _sam_cache[model_id]
+    processor = AutoProcessor.from_pretrained(model_id)
+    segmentator = AutoModelForMaskGeneration.from_pretrained(model_id).to(DEVICE, DTYPE)
+    segmentator.eval()
+    _sam_cache[model_id] = (processor, segmentator)
+    return processor, segmentator
+def _boxes_from_prompts(image_prompts: Any) -> list[list[list[int]]]:
+    points = image_prompts.get("points", []) if isinstance(image_prompts, dict) else []
+    if not points:
+        raise gr.Error("Please draw at least one box before running SAM segmentation.")
+    boxes = []
+    for box in points:
+        x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[3]), int(box[4])
+        x_min, x_max = sorted((x1, x2))
+        y_min, y_max = sorted((y1, y2))
+        if x_max <= x_min or y_max <= y_min:
+            continue
+        boxes.append([x_min, y_min, x_max, y_max])
+    if not boxes:
+        raise gr.Error("No valid boxes were drawn.")
+    return [boxes]
+def _mask_to_polygon(mask: np.ndarray) -> list[list[int]] | None:
+    import cv2
+    contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    if not contours:
+        return None
+    contour = max(contours, key=cv2.contourArea)
+    return contour.reshape(-1, 2).tolist()
+def _polygon_to_mask(polygon: list[list[int]], image_shape: tuple[int, int]) -> np.ndarray:
+    import cv2
+    mask = np.zeros(image_shape, dtype=np.uint8)
+    cv2.fillPoly(mask, [np.array(polygon, dtype=np.int32)], color=(1,))
+    return mask
+def _refine_masks(
+    masks: torch.Tensor,
+    *,
+    polygon_refinement: bool,
+    mask_threshold: float,
+) -> list[np.ndarray]:
+    masks = masks.detach().cpu().float()
+    if masks.ndim == 5:
+        masks = masks[:, :, 0]
+    if masks.ndim == 4:
+        masks = masks.mean(dim=1)
+    masks = (masks > mask_threshold).numpy().astype(np.uint8)
+    refined = [mask for mask in masks]
+    if polygon_refinement:
+        for idx, mask in enumerate(refined):
+            polygon = _mask_to_polygon(mask)
+            if polygon is not None:
+                refined[idx] = _polygon_to_mask(polygon, mask.shape)
+    return refined
+def _palette() -> list[int]:
+    colors = [0, 0, 0]
+    hue = 0.0
+    golden_ratio = 0.618033988749895
+    for _ in range(1, 256):
+        hue = (hue + golden_ratio) % 1.0
+        h = hue * 6.0
+        c = 0.81
+        x = c * (1 - abs(h % 2 - 1))
+        m = 0.09
+        if h < 1:
+            r, g, b = c, x, 0
+        elif h < 2:
+            r, g, b = x, c, 0
+        elif h < 3:
+            r, g, b = 0, c, x
+        elif h < 4:
+            r, g, b = 0, x, c
+        elif h < 5:
+            r, g, b = x, 0, c
+        else:
+            r, g, b = c, 0, x
+        colors.extend([int((r + m) * 255), int((g + m) * 255), int((b + m) * 255)])
+    return colors
+def _label_mask_to_pil(label_map: np.ndarray) -> Image.Image:
+    if label_map.max(initial=0) < 256:
+        image = Image.fromarray(label_map.astype(np.uint8), mode="P")
+        image.putpalette(_palette())
+        return image
+    encoded = np.zeros((*label_map.shape, 3), dtype=np.uint8)
+    encoded[..., 0] = label_map & 255
+    encoded[..., 1] = (label_map >> 8) & 255
+    return Image.fromarray(encoded, mode="RGB")
+def resize_prompt_image(image_prompts: Any) -> Any:
+    image = _get_prompt_image(image_prompts)
+    if image is None:
+        return image_prompts
+    resized = image.resize(TARGET_SIZE, Image.Resampling.LANCZOS)
+    UPLOAD_ROOT.mkdir(parents=True, exist_ok=True)
+    path = UPLOAD_ROOT / f"prompt_{uuid.uuid4().hex[:10]}.png"
+    resized.save(path)
+    return {"image": str(path), "points": []}
+def reset_uploaded_image(image_prompts: Any) -> tuple[Any, None, str]:
+    return resize_prompt_image(image_prompts), None, ""
+def remember_example_mask_path(_image_prompts: Any, mask_path: str) -> str:
+    return str(mask_path)
+@torch.no_grad()
+def run_segmentation(
+    image_prompts: Any,
+    model_choice: str,
+    polygon_refinement: bool,
+    mask_threshold: float,
+    request: gr.Request,
+) -> tuple[str, str]:
+    """Segment the drawn boxes with SAM and save the result as a label image.
+    Args:
+        image_prompts: ImagePrompter value holding the image and drawn boxes.
+        model_choice: key of ``SAM_MODELS`` selecting the SAM checkpoint.
+        polygon_refinement: forwarded to ``_refine_masks``.
+        mask_threshold: forwarded to ``_refine_masks``.
+        request: Gradio request, used to pick the session directory.
+    Returns:
+        The saved mask path twice: once for the mask preview and once for
+        the hidden mask-path textbox.
+    Raises:
+        gr.Error: if no image was uploaded or no valid box was drawn.
+    """
+    image = _get_prompt_image(image_prompts)
+    if image is None:
+        raise gr.Error("Please upload an RGB image before running segmentation.")
+    boxes = _boxes_from_prompts(image_prompts)
+    processor, segmentator = _get_sam_model(model_choice)
+    # Inputs are moved to the model's own device and dtype.
+    inputs = processor(images=image, input_boxes=boxes, return_tensors="pt").to(segmentator.device, segmentator.dtype)
+    outputs = segmentator(**inputs)
+    # Only one image is processed, so take the first entry of the result.
+    # NOTE(review): post_process_masks may already binarize its output, in
+    # which case mask_threshold below is applied to averaged 0/1 values
+    # rather than to logits -- confirm against the transformers SAM docs.
+    masks = processor.post_process_masks(
+        masks=outputs.pred_masks,
+        original_sizes=inputs.original_sizes,
+        reshaped_input_sizes=inputs.reshaped_input_sizes,
+    )[0]
+    masks = _refine_masks(masks, polygon_refinement=polygon_refinement, mask_threshold=mask_threshold)
+    # PIL's size is (width, height); the label map is (height, width).
+    label_map = np.zeros(image.size[::-1], dtype=np.uint32)
+    # Labels start at 1 (0 is background); where masks overlap, the later box wins.
+    for idx, mask in enumerate(masks, start=1):
+        label_map[mask > 0] = idx
+    mask_image = _label_mask_to_pil(label_map)
+    session_dir = _make_session_dir(request)
+    raw_path = session_dir / "sam_mask.png"
+    mask_image.save(raw_path)
+    torch.cuda.empty_cache()
+    return str(raw_path), str(raw_path)
+def run_gaussian_preview(
+    image_prompts: Any,
+    mask_path: str | None,
+    seed: int,
+    simplify: float,
+    output_dir_text: str,
+    request: gr.Request,
+) -> tuple[str, dict[str, Any], dict[str, Any], str, DemoRunState]:
+    rgb_path = _save_prompt_rgb(image_prompts, request)
+    mask_path = _resolve_mask_path(mask_path)
+    output_dir = Path(output_dir_text).expanduser() if output_dir_text.strip() else _timestamped_output_dir(request)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    inferencer = _get_inferencer()
+    inferencer.infer_and_save_scene(
+        scene_rgb_path=rgb_path,
+        instance_seg_path=mask_path,
+        output_dir=output_dir,
+        overwrite=True,
+        save_dbg=False,
+        simplify=float(simplify),
+        only_3dgs=True,
+        seed=int(seed),
+    )
+    scene_ply = output_dir / "scene_pred.ply"
+    if not scene_ply.exists():
+        raise gr.Error(f"Generation finished but scene_pred.ply was not found in {output_dir}")
+    state = DemoRunState(
+        rgb_path=str(rgb_path),
+        mask_path=str(mask_path),
+        output_dir=str(output_dir),
+        seed=int(seed),
+        simplify=float(simplify),
+    )
+    torch.cuda.empty_cache()
+    return (
+        str(scene_ply),
+        gr.update(value=str(scene_ply), interactive=True),
+        gr.update(value=None, interactive=False),
+        "",
+        state,
+    )
+def _progress_bar(percent: int) -> str:
+    percent = max(0, min(100, int(percent)))
+    return f"""
+    <div style="height: 14px; width: 100%; background: #ece7dc; border-radius: 999px; overflow: hidden; border: 1px solid #d8cbb7;">
+      <div style="height: 100%; width: {percent}%; background: linear-gradient(90deg, #b77a2f, #e0b15a); transition: width 0.4s ease;"></div>
+    </div>
+    """
+def run_glb_export(
+    state: DemoRunState | dict[str, Any] | None,
+    simplify: float,
+) -> Any:
+    if state is None:
+        raise gr.Error("Please run GS preview first so the demo knows which RGB/mask/output directory to use.")
+    if isinstance(state, dict):
+        state = DemoRunState(**state)
+    output_dir = Path(state.output_dir)
+    yield gr.update(value=None, interactive=False), _progress_bar(5), gr.update(value=None)
+    inferencer = _get_inferencer()
+    yield gr.update(value=None, interactive=False), _progress_bar(15), gr.update(value=None)
+    inferencer.infer_and_save_scene(
+        scene_rgb_path=state.rgb_path,
+        instance_seg_path=state.mask_path,
+        output_dir=output_dir,
+        overwrite=True,
+        save_dbg=False,
+        simplify=float(simplify),
+        only_3dgs=False,
+        seed=int(state.seed),
+    )
+    scene_glb = output_dir / "scene_pred.glb"
+    if not scene_glb.exists():
+        raise gr.Error(f"GLB export finished but scene_pred.glb was not found in {output_dir}")
+    torch.cuda.empty_cache()
+    yield gr.update(value=str(scene_glb), interactive=True), _progress_bar(100), str(scene_glb)
+def clear_glb_outputs() -> tuple[dict[str, Any], str, None, dict[str, Any]]:
+    return gr.update(value=None, interactive=False), "", None, gr.update(value=None)
+def build_demo() -> gr.Blocks:
+    """Build the Gradio Blocks UI and wire its events to the handlers in this module.
+    The left column holds the image prompt, the SAM controls, the mask
+    preview and the generation settings. The right column holds the 3D
+    preview, the GLB preview and both download buttons.
+    Returns:
+        The assembled ``gr.Blocks`` app; it is neither queued nor launched here.
+    """
+    with gr.Blocks(title="I-Scene Interactive Demo", delete_cache=(3600, 3600)) as demo:
+        gr.Markdown(MARKDOWN)
+        # Holds the DemoRunState of the latest preview run (None until one succeeds).
+        run_state = gr.State(None)
+        with gr.Row():
+            # Left column: inputs and settings.
+            with gr.Column(scale=1):
+                image_prompts = ImagePrompter(
+                    label="RGB image (upload, then optionally draw boxes for SAM)",
+                    type="pil",
+                    height=520,
+                )
+                with gr.Row():
+                    segment_button = gr.Button("Run SAM Segmentation", variant="secondary")
+                with gr.Accordion("Segmentation settings", open=False):
+                    sam_model = gr.Dropdown(
+                        choices=list(SAM_MODELS.keys()),
+                        value="sam-vit-huge (best quality, 636M)",
+                        label="SAM model",
+                    )
+                    mask_threshold = gr.Slider(
+                        minimum=-1.0,
+                        maximum=1.0,
+                        value=0.0,
+                        step=0.05,
+                        label="Mask threshold",
+                    )
+                    polygon_refinement = gr.Checkbox(
+                        label="Polygon refinement",
+                        value=False,
+                    )
+                sam_mask_preview = gr.Image(
+                    label="Instance mask",
+                    type="filepath",
+                    format="png",
+                    height=260,
+                )
+                # Hidden field carrying the mask file path passed to generation.
+                mask_path_value = gr.Textbox(visible=False)
+                with gr.Accordion("Generation settings", open=False):
+                    seed = gr.Number(label="Seed", value=DEFAULT_SEED, precision=0)
+                    simplify = gr.Slider(
+                        minimum=0.5,
+                        maximum=1.0,
+                        value=DEFAULT_SIMPLIFY,
+                        step=0.01,
+                        label="GLB mesh simplify ratio",
+                    )
+                    # NOTE(review): free-form server-side path; consider hiding
+                    # or restricting it on a public deployment.
+                    output_dir = gr.Textbox(
+                        label="Output directory (optional)",
+                        placeholder="Leave empty to use outputs/demo/<timestamp>_<session>",
+                    )
+                generate_gs_button = gr.Button("Generate Gaussian Splatting Preview", variant="primary", size="lg")
+            # Right column: previews and downloads.
+            with gr.Column(scale=1):
+                preview = LitModel3D(
+                    label="3D preview",
+                    exposure=10.0,
+                    height=520,
+                )
+                download_gs = gr.DownloadButton(
+                    label="Download Gaussian Splatting PLY",
+                    interactive=False,
+                )
+                with gr.Row():
+                    generate_glb_button = gr.Button("Generate GLB", variant="secondary")
+                glb_progress = gr.HTML(value="")
+                glb_preview = gr.Model3D(
+                    label="GLB mesh preview",
+                    clear_color=(0.98, 0.96, 0.91, 1.0),
+                    display_mode="solid",
+                    height=360,
+                )
+                download_glb = gr.DownloadButton(
+                    label="Download Mesh GLB",
+                    interactive=False,
+                )
+        # A new upload is resized and clears the mask preview and mask path.
+        image_prompts.upload(
+            reset_uploaded_image,
+            inputs=[image_prompts],
+            outputs=[image_prompts, sam_mask_preview, mask_path_value],
+        )
+        segment_button.click(
+            run_segmentation,
+            inputs=[image_prompts, sam_model, polygon_refinement, mask_threshold],
+            outputs=[sam_mask_preview, mask_path_value],
+        )
+        # Clear the GLB widgets and run state first, then run the preview.
+        generate_gs_button.click(
+            clear_glb_outputs,
+            outputs=[download_glb, glb_progress, run_state, glb_preview],
+            show_progress="hidden",
+        ).then(
+            run_gaussian_preview,
+            inputs=[
+                image_prompts,
+                mask_path_value,
+                seed,
+                simplify,
+                output_dir,
+            ],
+            outputs=[preview, download_gs, download_glb, glb_progress, run_state],
+            show_progress="full",
+        )
+        # The GLB export reports progress through glb_progress, so the
+        # built-in progress indicator is hidden.
+        generate_glb_button.click(
+            run_glb_export,
+            inputs=[run_state, simplify],
+            outputs=[download_glb, glb_progress, glb_preview],
+            show_progress="hidden",
+        )
+        with gr.Row():
+            # Clicking an example fills the image and mask preview, and
+            # copies the mask path into the hidden textbox.
+            gr.Examples(
+                examples=EXAMPLE_ROWS,
+                inputs=[image_prompts, sam_mask_preview],
+                outputs=[mask_path_value],
+                fn=remember_example_mask_path,
+                cache_examples=False,
+                label="Examples",
+                run_on_click=True,
+            )
+    return demo
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--server_name", default="0.0.0.0")
+    parser.add_argument("--server_port", type=int, default=7860)
+    parser.add_argument("--share", action="store_true")
+    parser.add_argument("--model", default=DEFAULT_MODEL, help="I-Scene model id or local model package path.")
+    parser.add_argument(
+        "--base_model",
+        default=None,
+        help="Optional TRELLIS base model id or local mirror path. Defaults to the model package metadata.",
+    )
+    return parser.parse_args()
def main() -> None:
    """Entry point: parse CLI options, prepare directories, launch the demo."""
    global MODEL_ID, BASE_MODEL_ID

    cli_args = parse_args()
    # The model selection is published through module-level globals.
    MODEL_ID = cli_args.model
    BASE_MODEL_ID = cli_args.base_model

    # Make sure the working directories exist before the UI is built.
    DEFAULT_OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
    UPLOAD_ROOT.mkdir(parents=True, exist_ok=True)

    app = build_demo()
    app.queue()
    launch_options = {
        "server_name": cli_args.server_name,
        "server_port": cli_args.server_port,
        "share": cli_args.share,
    }
    app.launch(**launch_options)


if __name__ == "__main__":
    main()

iscene/inference/__init__.py ADDED Viewed

File without changes

iscene/inference/inferencer.py ADDED Viewed

	@@ -0,0 +1,503 @@

+from __future__ import annotations
+import json
+import logging
+from pathlib import Path
+import numpy as np
+from PIL import Image
+from plyfile import PlyData, PlyElement
+import torch
+import trimesh
+from tqdm import tqdm
+from ..trellis.pipelines import TrellisImageTo3DSceneContextPipeline
+from ..trellis.modules import sparse as sp
+from .segmentation_utils import load_scene_and_instance_masks, segmentation_to_id_map
# Fallback TRELLIS base model id, used by ISceneInferencer.setup_pipeline when
# neither the caller nor the package metadata provides one.
DEFAULT_BASE_MODEL_ID = "microsoft/TRELLIS-image-large"
# Sampler settings handed to the sparse-structure sampling stage.
SPARSE_STRUCTURE_SAMPLER_PARAMS = {"steps": 25, "cfg_strength": 3.0}
# Sampler settings handed to the structured-latent (SLAT) sampling stage.
SLAT_SAMPLER_PARAMS = {"steps": 25, "cfg_strength": 3.0}
def _resolve_package_file(model_id_or_path: str | Path, filename: str, revision: str | None = None) -> Path:
    """Locate ``filename`` inside a model package.

    ``model_id_or_path`` is first tried as a local directory (with ``~``
    expanded). When the file is not found there, the value is passed to
    ``hf_hub_download`` as a repository id together with ``revision``.

    Returns:
        Path to the local file or to the file downloaded from the hub.
    """
    candidate = Path(model_id_or_path).expanduser() / filename
    if not candidate.exists():
        # Imported lazily so local packages never need the hub client.
        from huggingface_hub import hf_hub_download

        downloaded = hf_hub_download(str(model_id_or_path), filename, revision=revision)
        candidate = Path(downloaded)
    return candidate
+class ISceneInferencer:
+    def __init__(
+        self,
+        model_id_or_path: str | Path,
+        *,
+        base_model_id: str | Path | None = None,
+        revision: str | None = None,
+        base_revision: str | None = None,
+    ):
+        self.model_id_or_path = str(model_id_or_path)
+        self.base_model_id = str(base_model_id) if base_model_id is not None else None
+        self.revision = revision
+        self.base_revision = base_revision
+        self.pipeline = None
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_id_or_path: str | Path,
+        *,
+        base_model_id: str | Path | None = None,
+        revision: str | None = None,
+        base_revision: str | None = None,
+    ) -> "ISceneInferencer":
+        return cls(
+            model_id_or_path,
+            base_model_id=base_model_id,
+            revision=revision,
+            base_revision=base_revision,
+        )
+    def _load_release_metadata(self) -> dict:
+        metadata_path = _resolve_package_file(self.model_id_or_path, "iscene_config.json", revision=self.revision)
+        with open(metadata_path, "r") as f:
+            metadata = json.load(f)
+        required_keys = {
+            "base_model_id",
+            "config_file",
+            "denoiser_checkpoint",
+            "image_conditioner_checkpoint",
+        }
+        missing = sorted(required_keys - set(metadata))
+        if missing:
+            raise ValueError(f"IScene model package is missing required metadata keys: {missing}")
+        return metadata
+    def setup_pipeline(self):
+        metadata = self._load_release_metadata()
+        config_file = _resolve_package_file(self.model_id_or_path, metadata["config_file"], revision=self.revision)
+        denoiser_checkpoint = _resolve_package_file(
+            self.model_id_or_path,
+            metadata["denoiser_checkpoint"],
+            revision=self.revision,
+        )
+        image_conditioner_checkpoint = _resolve_package_file(
+            self.model_id_or_path,
+            metadata["image_conditioner_checkpoint"],
+            revision=self.revision,
+        )
+        base_model_id = self.base_model_id or metadata.get("base_model_id", DEFAULT_BASE_MODEL_ID)
+        pipeline, cfg = TrellisImageTo3DSceneContextPipeline.from_pretrained(
+            str(base_model_id),
+            config_file=config_file,
+            denoiser_checkpoint=denoiser_checkpoint,
+            image_conditioner_checkpoint=image_conditioner_checkpoint,
+            revision=self.base_revision,
+        )
+        pipeline.cuda()
+        pipeline.set_exp_cfg(cfg)
+        return pipeline
+    def infer_and_save_scene(
+        self,
+        scene_rgb_path: str | Path,
+        instance_seg_path: str | Path,
+        output_dir: str | Path,
+        overwrite: bool = True,
+        save_dbg: bool = False,
+        simplify: float = 0.95,
+        only_3dgs: bool = False,
+        seed: int = 42,
+        verbose: bool = False,
+    ) -> None:
+        scene_results = self.infer_scene_instances(
+            scene_rgb_path,
+            instance_seg_path,
+            seed=seed,
+            only_3dgs=only_3dgs,
+            save_dbg=save_dbg,
+            verbose=verbose,
+        )
+        self.save_scene_outputs(
+            scene_results,
+            output_dir,
+            overwrite=overwrite,
+            save_dbg=save_dbg,
+            simplify=simplify,
+            only_3dgs=only_3dgs,
+            verbose=verbose,
+        )
+    @staticmethod
+    def _prepare_instance_inputs(
+        scene_rgb_path: str | Path,
+        instance_seg_path: str | Path,
+        input_loader=load_scene_and_instance_masks,
+    ):
+        scene_rgb, instance_masks, label_ids = input_loader(
+            scene_rgb_path,
+            instance_seg_path,
+        )
+        scene_mask = (segmentation_to_id_map(Image.open(instance_seg_path)) > 0).astype("uint8") * 255
+        scene_mask_pil = Image.fromarray(scene_mask)
+        return scene_rgb, instance_masks, scene_mask_pil, label_ids
+    @staticmethod
+    @torch.no_grad()
+    def _sample_sparse_structure(
+        pipeline,
+        *,
+        scene_rgb,
+        scene_mask,
+        instance_masks,
+        seed: int,
+        sparse_structure_sampler_params: dict,
+        collect_debug: bool,
+        verbose: bool,
+    ) -> dict | None:
+        if scene_rgb is None or not instance_masks:
+            logging.warning("Empty input lists for sparse-structure inference.")
+            return None
+        preprocessed_list = []
+        dbg_rets = [] if collect_debug else None
+        for instance_mask in instance_masks:
+            preprocessed, dbg_ret = pipeline.preprocess_image(
+                scene_rgb,
+                scene_mask,
+                instance_mask,
+                return_debug=collect_debug,
+            )
+            preprocessed_list.append(preprocessed)
+            if collect_debug and dbg_rets is not None:
+                dbg_rets.append(dbg_ret)
+        exp_setting = getattr(pipeline.exp_cfg.dataset.args, "exp_setting", "")
+        slot_names = ["scene_space_instance"]
+        if "global" in exp_setting:
+            slot_names.append("scene_space_scene")
+        if "local" in exp_setting:
+            slot_names.append("canonical_space_instance")
+        ss_cond, slat_cond, resolved_batch_size, num_slots = pipeline.get_cond_batch(preprocessed_list)
+        if len(slot_names) != num_slots:
+            slot_names = [f"slot_{i}" for i in range(num_slots)]
+        torch.manual_seed(seed)
+        coords = pipeline.sample_sparse_structure(
+            ss_cond,
+            num_samples=resolved_batch_size,
+            sampler_params=sparse_structure_sampler_params,
+            verbose=verbose,
+        )
+        results = {
+            "coords": coords,
+            "num_instances": resolved_batch_size,
+            "num_slots": num_slots,
+            "slot_names": slot_names,
+            "slat_cond": slat_cond,
+        }
+        if collect_debug:
+            results["dbg_ret_list"] = dbg_rets
+        return results
+    @torch.no_grad()
+    def infer_scene_instances(
+        self,
+        scene_rgb_path: str | Path,
+        instance_seg_path: str | Path,
+        seed: int = 42,
+        only_3dgs: bool = False,
+        save_dbg: bool = False,
+        verbose: bool = False,
+    ):
+        scene_rgb, instance_masks, scene_mask_pil, label_ids = self._prepare_instance_inputs(
+            scene_rgb_path,
+            instance_seg_path,
+        )
+        if not instance_masks:
+            logging.warning("No foreground instances found in segmentation.")
+            return None
+        if self.pipeline is None:
+            self.pipeline = self.setup_pipeline()
+        stage1_results = self._sample_sparse_structure(
+            self.pipeline,
+            scene_rgb=scene_rgb,
+            scene_mask=scene_mask_pil,
+            instance_masks=instance_masks,
+            seed=seed,
+            sparse_structure_sampler_params=SPARSE_STRUCTURE_SAMPLER_PARAMS,
+            collect_debug=save_dbg,
+            verbose=verbose,
+        )
+        if stage1_results is None:
+            return None
+        coords = stage1_results["coords"]
+        slat = self.pipeline.sample_slat(
+            stage1_results["slat_cond"],
+            coords,
+            sampler_params=SLAT_SAMPLER_PARAMS,
+            verbose=verbose,
+        )
+        num_instances = stage1_results["num_instances"]
+        num_slots = stage1_results["num_slots"]
+        slot_names = stage1_results["slot_names"]
+        total_slots = num_instances * num_slots
+        scene_slot_idx = slot_names.index("scene_space_scene") if "scene_space_scene" in slot_names else -1
+        skipped_slot_ids = {
+            instance_idx * num_slots + scene_slot_idx
+            for instance_idx in range(num_instances)
+        } if scene_slot_idx >= 0 else set()
+        unique_batch_ids = torch.unique(slat.coords[:, 0]).sort()[0]
+        decode_formats = ["gaussian"] if only_3dgs else ["mesh", "gaussian"]
+        decoded_results = {fmt: [None] * total_slots for fmt in decode_formats}
+        for bid in tqdm(unique_batch_ids, desc="Decoding assets", disable=not verbose):
+            bid_int = int(bid.item())
+            if bid_int in skipped_slot_ids:
+                continue
+            mask = slat.coords[:, 0] == bid
+            sample_coords = slat.coords[mask].clone()
+            sample_coords[:, 0] = 0
+            sample_slat = sp.SparseTensor(
+                feats=slat.feats[mask],
+                coords=sample_coords,
+            )
+            sample_decoded = self.pipeline.decode_slat(sample_slat, decode_formats)
+            for fmt, values in decoded_results.items():
+                if fmt in sample_decoded:
+                    values[bid_int] = sample_decoded[fmt]
+        scene_results = {
+            **decoded_results,
+            "coords": coords,
+            "num_instances": num_instances,
+            "num_slots": num_slots,
+            "slot_names": slot_names,
+        }
+        if save_dbg:
+            scene_results["dbg_ret_list"] = stage1_results.get("dbg_ret_list", [])
+        scene_results["label_ids"] = label_ids
+        if save_dbg:
+            scene_results["scene_rgb"] = scene_rgb
+            scene_results["instance_masks"] = instance_masks
+        return scene_results
+    def save_scene_outputs(
+        self,
+        scene_results,
+        output_dir: str | Path,
+        overwrite: bool = True,
+        save_dbg: bool = False,
+        simplify: float = 0.95,
+        only_3dgs: bool = False,
+        verbose: bool = False,
+    ) -> None:
+        if scene_results is None:
+            return
+        out_dir = Path(output_dir)
+        out_dir.mkdir(parents=True, exist_ok=True)
+        if overwrite:
+            for stale_scene_slot in out_dir.glob("instance_*_scene_space_scene.*"):
+                stale_scene_slot.unlink()
+            for stale_instance_slot in out_dir.glob("instance_*_scene_space_instance.*"):
+                stale_instance_slot.unlink()
+            for stale_scene_slot in out_dir.glob("scene_space_scene.*"):
+                stale_scene_slot.unlink()
+        label_ids = scene_results.get("label_ids", [])
+        slot_names = scene_results.get("slot_names", [])
+        num_instances = int(scene_results.get("num_instances", len(label_ids)))
+        num_slots = int(scene_results.get("num_slots", len(slot_names) if slot_names else 0))
+        meshes = scene_results.get("mesh")
+        gaussians = scene_results.get("gaussian")
+        coords = scene_results.get("coords")
+        if gaussians is None:
+            raise ValueError("scene_results must contain gaussian outputs.")
+        if not only_3dgs and meshes is None:
+            raise ValueError("scene_results must contain mesh outputs when only_3dgs=False.")
+        if num_slots <= 0:
+            num_slots = max(1, len(gaussians) // max(num_instances, 1))
+        if not slot_names or len(slot_names) != num_slots:
+            slot_names = [f"slot_{i}" for i in range(num_slots)]
+        scene_slot_idx = slot_names.index("scene_space_scene") if "scene_space_scene" in slot_names else -1
+        instance_slot_idx = slot_names.index("scene_space_instance") if "scene_space_instance" in slot_names else 0
+        if only_3dgs:
+            instance_ply_paths: list[str] = []
+            for instance_idx in tqdm(range(num_instances), desc="Saving Gaussian assets", disable=not verbose):
+                label_id = label_ids[instance_idx] if instance_idx < len(label_ids) else instance_idx
+                for slot_idx in range(num_slots):
+                    if slot_idx != instance_slot_idx or slot_idx == scene_slot_idx:
+                        continue
+                    flat_idx = instance_idx * num_slots + slot_idx
+                    ply_path = out_dir / f"instance_{int(label_id):02d}.ply"
+                    if ply_path.exists() and not overwrite:
+                        instance_ply_paths.append(str(ply_path))
+                        continue
+                    gaussian = gaussians[flat_idx]
+                    if gaussian is None:
+                        continue
+                    gaussian[0].save_ply(str(ply_path))
+                    instance_ply_paths.append(str(ply_path))
+            if instance_ply_paths:
+                scene_ply_path = out_dir / "scene_pred.ply"
+                if overwrite or not scene_ply_path.exists():
+                    merge_gaussian_ply_files(instance_ply_paths, str(scene_ply_path))
+        else:
+            from ..trellis.utils import postprocessing_utils
+            instance_glbs: list[Path] = []
+            for instance_idx in tqdm(range(num_instances), desc="Exporting GLB assets", disable=not verbose):
+                label_id = label_ids[instance_idx] if instance_idx < len(label_ids) else instance_idx
+                for slot_idx in range(num_slots):
+                    if slot_idx != instance_slot_idx or slot_idx == scene_slot_idx:
+                        continue
+                    flat_idx = instance_idx * num_slots + slot_idx
+                    out_path = out_dir / f"instance_{int(label_id):02d}.glb"
+                    if out_path.exists() and not overwrite:
+                        instance_glbs.append(out_path)
+                        continue
+                    gaussian = gaussians[flat_idx]
+                    mesh = meshes[flat_idx]
+                    if gaussian is None or mesh is None:
+                        continue
+                    glb = postprocessing_utils.to_glb(
+                        gaussian[0],
+                        mesh[0],
+                        simplify=simplify,
+                        texture_size=1024,
+                        verbose=False,
+                    )
+                    out_path.parent.mkdir(parents=True, exist_ok=True)
+                    glb.export(str(out_path))
+                    instance_glbs.append(out_path)
+            if instance_glbs:
+                scene_mesh = self._merge_instance_glbs_to_scene(sorted(instance_glbs))
+                scene_mesh.export(str(out_dir / "scene_pred.glb"))
+        if save_dbg:
+            self._save_debug_outputs(
+                scene_results,
+                out_dir,
+                label_ids=label_ids,
+                slot_names=slot_names,
+                num_slots=num_slots,
+                coords=coords,
+            )
+    def _save_debug_outputs(
+        self,
+        scene_results,
+        out_dir: Path,
+        *,
+        label_ids: list[int],
+        slot_names: list[str],
+        num_slots: int,
+        coords,
+    ) -> None:
+        scene_rgb = scene_results.get("scene_rgb")
+        instance_masks = scene_results.get("instance_masks")
+        dbg_ret_list = scene_results.get("dbg_ret_list", [])
+        num_instances = int(scene_results.get("num_instances", len(label_ids)))
+        for instance_idx in range(num_instances):
+            label_id = label_ids[instance_idx] if instance_idx < len(label_ids) else instance_idx
+            if scene_rgb is not None:
+                scene_rgb.save(str(out_dir / f"instance_{int(label_id):02d}_scene_rgb.png"))
+            if instance_masks is not None and instance_idx < len(instance_masks):
+                instance_masks[instance_idx].save(str(out_dir / f"instance_{int(label_id):02d}_instance_mask.png"))
+            if dbg_ret_list and instance_idx < len(dbg_ret_list):
+                dbg_ret = dbg_ret_list[instance_idx]
+                if "instance_rgb_canonical_tensor" in dbg_ret:
+                    canonical_np = dbg_ret["instance_rgb_canonical_tensor"].cpu().numpy().transpose(1, 2, 0)
+                    canonical_np = np.clip(canonical_np * 255.0, 0, 255).astype(np.uint8)
+                    Image.fromarray(canonical_np).save(
+                        str(out_dir / f"instance_{int(label_id):02d}_canonical_space_instance_rgb.png")
+                    )
+            if coords is not None:
+                for slot_idx in range(num_slots):
+                    flat_idx = instance_idx * num_slots + slot_idx
+                    coord_path = out_dir / f"instance_{int(label_id):02d}_{slot_names[slot_idx]}_coords.ply"
+                    save_sparse_coords_as_ply(coords[coords[:, 0] == flat_idx], str(coord_path))
+    @staticmethod
+    def _merge_instance_glbs_to_scene(instance_mesh_paths):
+        aggregated = trimesh.Scene()
+        for idx, mesh_path in enumerate(sorted(Path(p) for p in instance_mesh_paths)):
+            try:
+                loaded = trimesh.load(str(mesh_path))
+            except Exception as exc:
+                logging.warning("Failed to load %s for scene aggregation: %s", mesh_path, exc)
+                continue
+            stem = mesh_path.stem
+            if hasattr(loaded, "geometry"):
+                for sub_idx, (sub_name, geometry) in enumerate(loaded.geometry.items()):
+                    base_name = f"{stem}_{sub_name}" if sub_name else stem
+                    node_name = base_name if base_name not in aggregated.geometry else f"{base_name}_{sub_idx}"
+                    aggregated.add_geometry(geometry, node_name=node_name)
+            else:
+                node_name = stem if stem not in aggregated.geometry else f"{stem}_{idx}"
+                aggregated.add_geometry(loaded, node_name=node_name)
+        return aggregated
def save_sparse_coords_as_ply(coords, output_path: str, resolution: int = 64) -> None:
    """Export sparse voxel coordinates as a point cloud file.

    The first column of ``coords`` is dropped and the remaining columns are
    mapped from voxel indices to voxel centres in ``[-1, 1]``. The last two
    axes are then swapped and the new last axis is negated before export.

    Args:
        coords: 2-D tensor whose columns after the first are spatial indices
            (presumably ``(batch, x, y, z)`` rows — confirm against caller).
        output_path: Destination file path.
        resolution: Grid size used to normalise the indices.
    """
    voxel_indices = coords[:, 1:].float().cpu().numpy()
    centres = (voxel_indices + 0.5) / resolution * 2.0 - 1.0
    # Fancy indexing copies, so the in-place negation below is safe.
    reordered = centres[:, [0, 2, 1]]
    reordered[:, 2] = -reordered[:, 2]
    cloud = trimesh.points.PointCloud(reordered)
    cloud.export(output_path)
def merge_gaussian_ply_files(ply_paths: list[str], output_path: str) -> None:
    """Concatenate the ``vertex`` elements of several PLY files into one file.

    Missing files are skipped silently; unreadable files are logged and
    skipped. Nothing is written when no input could be read.
    """
    vertex_chunks = []
    for ply_path in ply_paths:
        if not Path(ply_path).exists():
            continue
        try:
            ply = PlyData.read(str(ply_path))
        except Exception as exc:
            logging.warning("Failed to read %s: %s", ply_path, exc)
            continue
        vertex_chunks.append(ply["vertex"].data)

    if vertex_chunks:
        merged_vertices = np.concatenate(vertex_chunks)
        merged_element = PlyElement.describe(merged_vertices, "vertex")
        PlyData([merged_element]).write(output_path)

iscene/inference/segmentation_utils.py ADDED Viewed

	@@ -0,0 +1,77 @@

+from __future__ import annotations
+from pathlib import Path
+from typing import Union
+import numpy as np
+from PIL import Image
def load_rgb_image(image_path: Union[str, Path]) -> Image.Image:
    """Open an image and return it in RGB mode.

    Images with an alpha band (``RGBA``/``LA``) or a ``transparency`` entry
    are first converted to RGBA and alpha-composited over a fully transparent
    black canvas before the final RGB conversion.
    """
    img = Image.open(image_path)
    has_alpha = img.mode in ("RGBA", "LA") or ("transparency" in img.info)
    if not has_alpha:
        return img.convert("RGB")
    rgba = img.convert("RGBA")
    canvas = Image.new("RGBA", rgba.size, (0, 0, 0, 0))
    composited = Image.alpha_composite(canvas, rgba)
    return composited.convert("RGB")
def segmentation_to_id_map(segmentation: Image.Image) -> np.ndarray:
    """Decode an instance segmentation image into one integer label per pixel.

    The layout is decided from ``np.array(segmentation)``:

    * 2-D array: the values are the labels.
    * 1 or 2 channels: the first channel is the label. A second channel is an
      alpha band (``LA``/``PA`` images) and carries no label information.
    * 3 or more channels: the first three are read as R, G, B. When all three
      are identical the image is a grey label map and R is returned; otherwise
      the label is ``R + (G << 8) + (B << 16)``. With an all-zero B channel
      this equals the legacy 16-bit ``R + (G << 8)`` packing.

    Returns:
        A ``uint32`` array with the image's height and width. Any other array
        shape yields an all-zero map.
    """
    seg_array = np.array(segmentation)
    if seg_array.ndim == 2:
        return seg_array.astype(np.uint32)
    if seg_array.ndim != 3 or seg_array.shape[2] < 1:
        return np.zeros(seg_array.shape[:2], dtype=np.uint32)

    channels = seg_array[..., :3].astype(np.uint32)
    if channels.shape[2] < 3:
        # Single band, or band + alpha: never pack alpha into the label.
        return channels[..., 0]

    r = channels[..., 0]
    g = channels[..., 1]
    b = channels[..., 2]
    if np.array_equal(r, g) and np.array_equal(r, b):
        return r
    # Full RGB packing; no separate R/G-only branch is needed because the
    # B term vanishes whenever the blue channel is empty.
    return r + (g << 8) + (b << 16)
def load_scene_and_instance_masks(
    rgb_image_path: Union[str, Path],
    segmentation_path: Union[str, Path],
) -> tuple[Image.Image, list[Image.Image], list[int]]:
    """
    Load a scene image together with one binary mask per labelled instance.
    The RGB image is resized to the segmentation's size. Every positive label
    found in the decoded segmentation produces one 0/255 mask; labels are
    returned in ascending order, aligned with the masks.
    The segmentation can be single-channel label IDs, palette IDs, packed 16-bit
    R/G IDs, or RGB color-coded instance IDs.
    """
    segmentation = Image.open(segmentation_path)
    scene_rgb = load_rgb_image(rgb_image_path).resize(segmentation.size)
    id_map = segmentation_to_id_map(segmentation)

    # Label 0 is background; everything above it is an instance.
    found_ids = np.unique(id_map)
    label_ids = sorted(int(value) for value in found_ids[found_ids > 0].tolist())

    instance_masks: list[Image.Image] = [
        Image.fromarray(np.where(id_map == label_id, 255, 0).astype(np.uint8))
        for label_id in label_ids
    ]
    return scene_rgb, instance_masks, label_ids

iscene/trellis/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

"""Lightweight TRELLIS components used by IScene inference.
Subpackages are imported by their direct users. Keeping this package init small
avoids importing optional rendering dependencies for Gaussian-only inference.
"""
# Subpackage names exposed through ``__all__``; nothing is imported eagerly here.
__all__ = ["models", "modules", "pipelines", "renderers", "representations", "utils"]

iscene/trellis/models/__init__.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import importlib
# Maps each lazily exported class name to the submodule (relative to this
# package) that defines it; consumed by the module-level __getattr__.
__attributes = {
    "SparseStructureDecoder": "sparse_structure_vae",
    "SparseStructureSceneContextFlowModel": "sparse_structure_sc_flow",
    "SLatGaussianDecoder": "structured_latent_vae.decoder_gs",
    "SLatMeshDecoder": "structured_latent_vae.decoder_mesh",
    "SLatFlowModel": "structured_latent_flow",
    "ImageConditioner": "image_conditioner",
}
# The public API is exactly the set of lazily resolvable class names.
__all__ = list(__attributes.keys())
def __getattr__(name):
    """Resolve an exported model class on first access.

    The submodule listed for ``name`` is imported, the class is fetched from
    it and stored in this module's globals so later lookups bypass this hook.

    Raises:
        AttributeError: If ``name`` is not one of the exported classes.
    """
    submodule = __attributes.get(name)
    if submodule is None:
        raise AttributeError(f"module {__name__} has no attribute {name}")
    resolved = getattr(importlib.import_module(f".{submodule}", __name__), name)
    globals()[name] = resolved
    return resolved
def from_pretrained(path: str, revision: str | None = None, **kwargs):
    """
    Load a model from a pretrained checkpoint.
    Args:
        path: The path to the checkpoint. Can be either local path or a Hugging Face model name.
              NOTE: config file and model file should take the name f'{path}.json' and f'{path}.safetensors' respectively.
              A Hugging Face name has the form '<owner>/<repo>/<model name>'.
        revision: Revision forwarded to the Hugging Face download; unused for local checkpoints.
        **kwargs: Additional arguments for the model constructor.
    Returns:
        The model named by the config's 'name' entry, built from the config's
        'args' plus **kwargs, with the checkpoint weights loaded.
    Raises:
        ValueError: If no local checkpoint exists and `path` is not of the
            form '<owner>/<repo>/<model name>'.
    """
    import os
    import json
    from safetensors.torch import load_file
    config_file = f"{path}.json"
    model_file = f"{path}.safetensors"
    is_local = os.path.exists(config_file) and os.path.exists(model_file)
    if not is_local:
        from huggingface_hub import hf_hub_download
        path_parts = path.split('/')
        # Fail early with a readable message instead of an IndexError or an
        # opaque hub lookup for a file literally called ".json".
        if len(path_parts) < 3 or not all(path_parts[:2]) or not path_parts[-1]:
            raise ValueError(
                f"Checkpoint {path!r} was not found locally and is not a Hugging Face "
                "path of the form '<owner>/<repo>/<model name>'."
            )
        repo_id = f'{path_parts[0]}/{path_parts[1]}'
        model_name = '/'.join(path_parts[2:])
        config_file = hf_hub_download(repo_id, f"{model_name}.json", revision=revision)
        model_file = hf_hub_download(repo_id, f"{model_name}.safetensors", revision=revision)
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(config_file, 'r', encoding='utf-8') as f:
        config = json.load(f)
    model = __getattr__(config['name'])(**config['args'], **kwargs)
    model.load_state_dict(load_file(model_file))
    return model

iscene/trellis/models/image_conditioner.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import torch
+import torch.nn as nn
+from torchvision import transforms
+import torch.nn.functional as F
+import logging
+from ..modules.utils import convert_module_to_f32
+from ..utils import dist_utils
class ImageConditioner(nn.Module):
    """Frozen DINOv2 backbone that turns image batches into patch-token features.

    The backbone is loaded through ``torch.hub`` and, when ``cond_in_channels``
    exceeds the width of its patch-embedding convolution, that convolution is
    widened so stacked inputs (several RGB images plus extra channels) can be
    fed in a single pass. All backbone parameters have ``requires_grad=False``.
    """

    def __init__(self, image_cond_model: str = 'dinov2_vitl14_reg', cond_in_channels: int = 10, use_fp16: bool = True):
        """
        Args:
            image_cond_model: Name of the DINOv2 entry point to load from torch.hub.
            cond_in_channels: Number of input channels the backbone must accept.
            use_fp16: Selects float16 (else float32) as the dtype inputs are cast to.
        """
        super().__init__()
        self.image_cond_model_name = image_cond_model
        self.cond_in_channels = cond_in_channels
        self._init_image_cond_model()
        if use_fp16:
            self.convert_to_fp16()
        # NOTE(review): convert_to_fp16 leaves the weights untouched while
        # forward() casts inputs to this dtype — confirm callers run under autocast.
        self.dtype = torch.float16 if use_fp16 else torch.float32

    def convert_to_fp16(self):
        """No-op: only logs that the conditioner is not converted to fp16."""
        logging.info('Image conditioner does not support fp16, skip this.')

    def convert_to_fp32(self):
        """Convert the backbone to fp32 via ``convert_module_to_f32``."""
        # NOTE(review): the message says "skip" but the conversion below does run.
        logging.info('Image conditioner does not support fp32, skip this.')
        self.base_img_conditioner.apply(convert_module_to_f32)

    def forward(self, image: torch.Tensor):
        """Encode a batch of images into layer-normalised patch tokens.

        Args:
            image: Batched tensor ``(B, C, H, W)``. It is cast to ``self.dtype``
                and moved to CUDA. With ``C == 3`` the ImageNet normalisation
                transform is applied; otherwise every complete 3-channel group
                is normalised with the ImageNet statistics and any trailing
                group of fewer than three channels is passed through unchanged.

        Returns:
            The backbone's ``x_prenorm`` features after a parameter-free layer
            norm over the last dimension.

        Raises:
            ValueError: If ``image`` is not a tensor.
        """
        if not isinstance(image, torch.Tensor):
            raise ValueError(f"Unsupported type of image: {type(image)}")
        assert image.ndim == 4, "Image tensor should be batched (B, C, H, W)"
        image = image.to(self.dtype).cuda()
        if image.shape[1] == 3:
            base_img = self.base_transform(image)
        else:
            # Handle multi-channel input (e.g. 7 channels: RGB + RGB + Mask)
            # We normalize every 3-channel block using ImageNet stats, and leave the rest as is.
            mean = torch.tensor([0.485, 0.456, 0.406], device=image.device, dtype=image.dtype).view(1, 3, 1, 1)
            std = torch.tensor([0.229, 0.224, 0.225], device=image.device, dtype=image.dtype).view(1, 3, 1, 1)
            chunks = []
            for start in range(0, image.shape[1], 3):
                # Slicing clamps at the channel count, so the last chunk may be short.
                chunk = image[:, start:start + 3]
                if chunk.shape[1] == 3:
                    chunk = (chunk - mean) / std
                chunks.append(chunk)
            base_img = torch.cat(chunks, dim=1)
        features = self.base_img_conditioner(base_img, is_training=True)['x_prenorm']
        return F.layer_norm(features, features.shape[-1:])

    def _init_image_cond_model(self):
        """
        Initialize the image conditioning model.
        Loads DINOv2 from torch.hub, puts it in eval mode on CUDA, widens its
        patch embedding when more than three input channels are requested and
        freezes all of its parameters.
        """
        with dist_utils.local_master_first():
            dinov2_model = torch.hub.load('facebookresearch/dinov2', self.image_cond_model_name, pretrained=True)
        dinov2_model.eval().cuda()
        transform = transforms.Compose([
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        self.base_img_conditioner = dinov2_model
        self.base_transform = transform
        if self.cond_in_channels > 3:
            self.base_img_conditioner = self.expand_dinov2_model(self.base_img_conditioner, self.cond_in_channels)
        self.set_param_requires_grad(self.base_img_conditioner, False)

    def set_param_requires_grad(self, model, requires_grad: bool):
        """Set ``requires_grad`` on every parameter of ``model``."""
        for param in model.parameters():
            param.requires_grad = requires_grad

    def expand_dinov2_model(self, dinov2_model, cond_in_channels: int):
        """
        Expand the DINOv2 patch embedding to accept additional input channels.
        The new channels are initialised with copies of the leading (up to
        three) original input-channel weights, repeated until
        ``cond_in_channels`` is reached. The model is modified in place and
        returned; it is left untouched when the projection is already wide enough.
        """
        # locate the patch-embedding projection conv for both hf Dinov2Model and torch.hub model
        if hasattr(dinov2_model, 'embeddings'):
            proj = dinov2_model.embeddings.patch_embeddings.projection
        elif hasattr(dinov2_model, 'patch_embed'):
            proj = dinov2_model.patch_embed.proj
        else:
            raise RuntimeError('Cannot locate patch-embedding projection in DINOv2 model.')
        if proj.weight.shape[1] < cond_in_channels:
            weight = proj.weight.detach()  # (out_channels, in_channels, k, k)
            # Count from the projection's real width rather than assuming 3,
            # so the widened weight always has exactly cond_in_channels inputs.
            base_channels = weight.shape[1]
            extra = []
            channels_left = cond_in_channels - base_channels
            while channels_left > 0:
                take = min(3, base_channels, channels_left)
                extra.append(weight[:, :take].clone())
                channels_left -= take
            new_weight = torch.cat([weight] + extra, dim=1)
            new_proj = torch.nn.Conv2d(
                in_channels=cond_in_channels,
                out_channels=weight.shape[0],
                kernel_size=proj.kernel_size,
                stride=proj.stride,
                padding=proj.padding,
                bias=(proj.bias is not None),
            )
            new_proj.weight.data = new_weight
            if proj.bias is not None:
                new_proj.bias.data = proj.bias.data.clone()
            # replace inside the model
            if hasattr(dinov2_model, 'embeddings'):
                dinov2_model.embeddings.patch_embeddings.projection = new_proj
            else:
                dinov2_model.patch_embed.proj = new_proj
        return dinov2_model

iscene/trellis/models/sparse_structure_flow.py ADDED Viewed

	@@ -0,0 +1,201 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from ..modules.utils import convert_module_to_f16, convert_module_to_f32
+from ..modules.transformer import AbsolutePositionEmbedder, ModulatedTransformerCrossBlock
+from ..modules.spatial import patchify, unpatchify
+import copy
+from pathlib import Path
class TimestepEmbedder(nn.Module):
    """
    Turns scalar timesteps into vectors: sinusoidal features followed by a
    two-layer MLP with a SiLU in between.
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        layers = [
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        ]
        self.mlp = nn.Sequential(*layers)
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Build sinusoidal embeddings for a batch of timesteps.
        Args:
            t: a 1-D Tensor of N indices, one per batch element.
                These may be fractional.
            dim: the dimension of the output.
            max_period: controls the minimum frequency of the embeddings.
        Returns:
            an (N, D) Tensor laid out as [cos | sin], with one trailing zero
            column when ``dim`` is odd.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        exponent = -np.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        freqs = torch.exp(exponent).to(device=t.device)
        angles = t[:, None].float() * freqs[None]
        pieces = [torch.cos(angles), torch.sin(angles)]
        if dim % 2:
            # Pad odd dimensions with a single zero column.
            pieces.append(torch.zeros_like(angles[:, :1]))
        return torch.cat(pieces, dim=-1)

    def forward(self, t):
        frequency_features = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(frequency_features)
+class SparseStructureFlowModel(nn.Module):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        model_channels: int,
+        cond_channels: int,
+        out_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        patch_size: int = 2,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        share_mod: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+    ):
+        super().__init__()
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.cond_channels = cond_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.num_heads = num_heads or model_channels // num_head_channels
+        self.mlp_ratio = mlp_ratio
+        self.patch_size = patch_size
+        self.pe_mode = pe_mode
+        self.use_fp16 = use_fp16
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.qk_rms_norm = qk_rms_norm
+        self.qk_rms_norm_cross = qk_rms_norm_cross
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.t_embedder = TimestepEmbedder(model_channels)
+        if share_mod:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(model_channels, 6 * model_channels, bias=True)
+            )
+        if pe_mode == "ape":
+            pos_embedder = AbsolutePositionEmbedder(model_channels, 3)
+            coords = torch.meshgrid(*[torch.arange(res, device=self.device) for res in [resolution // patch_size] * 3], indexing='ij')
+            coords = torch.stack(coords, dim=-1).reshape(-1, 3)
+            pos_emb = pos_embedder(coords)
+            self.register_buffer("pos_emb", pos_emb)
+        self.input_layer = nn.Linear(in_channels * patch_size**3, model_channels)
+        self.blocks = nn.ModuleList([
+            ModulatedTransformerCrossBlock(
+                model_channels,
+                cond_channels,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                attn_mode='full',
+                use_checkpoint=self.use_checkpoint,
+                use_rope=(pe_mode == "rope"),
+                share_mod=share_mod,
+                qk_rms_norm=self.qk_rms_norm,
+                qk_rms_norm_cross=self.qk_rms_norm_cross,
+            )
+            for _ in range(num_blocks)
+        ])
+        self.out_layer = nn.Linear(model_channels, out_channels * patch_size**3)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        self.blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.blocks.apply(convert_module_to_f32)
+    def initialize_weights(self) -> None:
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers in DiT blocks:
+        if self.share_mod:
+            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
+        else:
+            for block in self.blocks:
+                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def forward(self, x: torch.Tensor, t: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
+        assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
+                f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"
+        h = patchify(x, self.patch_size)
+        h = h.view(*h.shape[:2], -1).permute(0, 2, 1).contiguous()
+        h = self.input_layer(h)
+        h = h + self.pos_emb[None]
+        t_emb = self.t_embedder(t)
+        if self.share_mod:
+            t_emb = self.adaLN_modulation(t_emb)
+        t_emb = t_emb.type(self.dtype)
+        h = h.type(self.dtype)
+        cond = cond.type(self.dtype)
+        for block in self.blocks:
+            h = block(h, t_emb, cond)
+        h = h.type(x.dtype)
+        h = F.layer_norm(h, h.shape[-1:])
+        h = self.out_layer(h)
+        h = h.permute(0, 2, 1).view(h.shape[0], h.shape[2], *[self.resolution // self.patch_size] * 3)
+        h = unpatchify(h, self.patch_size).contiguous()
+        return h

iscene/trellis/models/sparse_structure_sc_flow.py ADDED Viewed

	@@ -0,0 +1,111 @@

+from typing import *
+import torch
+import torch.nn.functional as F
+from ..modules.utils import convert_module_to_f16
+from ..modules.spatial import patchify, unpatchify
+from pathlib import Path
+from .sparse_structure_flow import SparseStructureFlowModel
+class SparseStructureSceneContextFlowModel(SparseStructureFlowModel):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        model_channels: int,
+        cond_channels: int,
+        out_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        patch_size: int = 2,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        share_mod: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+        pretrained_base: Optional[str] = None,
+        scene_context_attn_num: int = 5,
+        learning_pattern: Literal['full-finetune'] = 'full-finetune',
+        exp_setting: str = "global local",
+        type_embedding_type = None,
+        k_bias_scale = 0.2,
+    ):
+        super().__init__(resolution, in_channels, model_channels, cond_channels, out_channels, num_blocks, num_heads, num_head_channels, mlp_ratio, patch_size, pe_mode, use_fp16, use_checkpoint, share_mod, qk_rms_norm, qk_rms_norm_cross)
+        assert pretrained_base is not None, 'pretrained_base is required for SparseStructureSceneContextFlowModel'
+        assert Path(pretrained_base).exists(), f'Pretrained base model {pretrained_base} not found'
+        self.scene_context_attn_num = scene_context_attn_num
+        # load the base model
+        if Path(pretrained_base).suffix == '.pt':
+            self.load_state_dict(torch.load(pretrained_base, map_location='cpu'), strict=True)
+        elif Path(pretrained_base).suffix == '.safetensors':
+            from safetensors.torch import load_file
+            self.load_state_dict(load_file(pretrained_base), strict=True)
+        else:
+            raise ValueError(f'Invalid pretrained base model {pretrained_base}')
+        # hijack some blocks to use scene context attention
+        block_num = len(self.blocks)
+        start_idx = block_num // 2 - scene_context_attn_num // 2
+        for i in range(scene_context_attn_num):
+            self.blocks[start_idx + i].is_scene_context = True
+            self.blocks[start_idx + i].num_instances = len(exp_setting.split(' ')) + 1
+            if type_embedding_type is not None:
+                enable_gate = 'enable_gate' in type_embedding_type
+                enable_k_bias = 'enable_k_bias' in type_embedding_type
+                k_bias_scale = k_bias_scale
+                self.blocks[start_idx + i].self_attn.initialize_positional_encoding(self.blocks[start_idx + i].num_instances - 1,
+                                                                                    enable_gate=enable_gate,
+                                                                                    enable_k_bias=enable_k_bias,
+                                                                                    k_bias_scale=k_bias_scale)
+        if use_fp16:
+            self.convert_to_fp16()
+        if learning_pattern != 'full-finetune':
+            raise ValueError(f'Unsupported learning pattern for release inference: {learning_pattern}')
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        for block in self.blocks:
+            block.apply(convert_module_to_f16)
+    def forward(self, x: torch.Tensor, t: torch.Tensor, cond: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        x: B, N, C, [resolution, resolution, resolution]
+        cond: B, N, C, H, W
+        """
+        B, N, C, *rest = x.shape
+        x = x.view(B * N, C, *rest)
+        B, N, T, C = cond.shape
+        cond = cond.view(B * N, T, C)
+        t = t.repeat_interleave(N, dim=0)
+        h = patchify(x, self.patch_size)
+        h = h.view(*h.shape[:2], -1).permute(0, 2, 1).contiguous()
+        h = self.input_layer(h)
+        h = h + self.pos_emb[None]
+        t_emb = self.t_embedder(t)
+        if self.share_mod:
+            t_emb = self.adaLN_modulation(t_emb)
+        t_emb = t_emb.type(self.dtype)
+        h = h.type(self.dtype)
+        cond = cond.type(self.dtype)
+        for block in self.blocks:
+            h = block(x=h, mod=t_emb, context=cond)
+        h = h.type(x.dtype)
+        h = F.layer_norm(h, h.shape[-1:])
+        h = self.out_layer(h)
+        h = h.permute(0, 2, 1).view(h.shape[0], h.shape[2], *[self.resolution // self.patch_size] * 3)
+        h = unpatchify(h, self.patch_size).contiguous()
+        h = h.view(B, N, *h.shape[1:])
+        return h

iscene/trellis/models/sparse_structure_vae.py ADDED Viewed

	@@ -0,0 +1,306 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ..modules.norm import GroupNorm32, ChannelLayerNorm32
+from ..modules.spatial import pixel_shuffle_3d
+from ..modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
+def norm_layer(norm_type: str, *args, **kwargs) -> nn.Module:
+    """
+    Return a normalization layer.
+    """
+    if norm_type == "group":
+        return GroupNorm32(32, *args, **kwargs)
+    elif norm_type == "layer":
+        return ChannelLayerNorm32(*args, **kwargs)
+    else:
+        raise ValueError(f"Invalid norm type {norm_type}")
+class ResBlock3d(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        out_channels: Optional[int] = None,
+        norm_type: Literal["group", "layer"] = "layer",
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.norm1 = norm_layer(norm_type, channels)
+        self.norm2 = norm_layer(norm_type, self.out_channels)
+        self.conv1 = nn.Conv3d(channels, self.out_channels, 3, padding=1)
+        self.conv2 = zero_module(nn.Conv3d(self.out_channels, self.out_channels, 3, padding=1))
+        self.skip_connection = nn.Conv3d(channels, self.out_channels, 1) if channels != self.out_channels else nn.Identity()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = self.norm1(x)
+        h = F.silu(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = F.silu(h)
+        h = self.conv2(h)
+        h = h + self.skip_connection(x)
+        return h
+class DownsampleBlock3d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        mode: Literal["conv", "avgpool"] = "conv",
+    ):
+        assert mode in ["conv", "avgpool"], f"Invalid mode {mode}"
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if mode == "conv":
+            self.conv = nn.Conv3d(in_channels, out_channels, 2, stride=2)
+        elif mode == "avgpool":
+            assert in_channels == out_channels, "Pooling mode requires in_channels to be equal to out_channels"
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, "conv"):
+            return self.conv(x)
+        else:
+            return F.avg_pool3d(x, 2)
+class UpsampleBlock3d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        mode: Literal["conv", "nearest"] = "conv",
+    ):
+        assert mode in ["conv", "nearest"], f"Invalid mode {mode}"
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if mode == "conv":
+            self.conv = nn.Conv3d(in_channels, out_channels*8, 3, padding=1)
+        elif mode == "nearest":
+            assert in_channels == out_channels, "Nearest mode requires in_channels to be equal to out_channels"
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, "conv"):
+            x = self.conv(x)
+            return pixel_shuffle_3d(x, 2)
+        else:
+            return F.interpolate(x, scale_factor=2, mode="nearest")
+class SparseStructureEncoder(nn.Module):
+    r"""
+    Encoder for Sparse Structure (\mathcal{E}_S in the paper Sec. 3.3).
+    A stack of 3D residual blocks with a downsampling block between consecutive channel
+    stages, followed by middle residual blocks and a head that predicts the mean and
+    log-variance of the latent.
+    Args:
+        in_channels (int): Channels of the input.
+        latent_channels (int): Channels of the latent representation.
+        num_res_blocks (int): Number of residual blocks at each resolution.
+        channels (List[int]): Channels of the encoder blocks.
+        num_res_blocks_middle (int): Number of residual blocks in the middle.
+        norm_type (Literal["group", "layer"]): Type of normalization layer.
+        use_fp16 (bool): Whether to use FP16.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        latent_channels: int,
+        num_res_blocks: int,
+        channels: List[int],
+        num_res_blocks_middle: int = 2,
+        norm_type: Literal["group", "layer"] = "layer",
+        use_fp16: bool = False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.latent_channels = latent_channels
+        self.num_res_blocks = num_res_blocks
+        self.channels = channels
+        self.num_res_blocks_middle = num_res_blocks_middle
+        self.norm_type = norm_type
+        self.use_fp16 = use_fp16
+        # dtype the torso (blocks + middle_block) runs in; forward() casts around it.
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.input_layer = nn.Conv3d(in_channels, channels[0], 3, padding=1)
+        # Per stage: num_res_blocks residual blocks, then a downsample to the next stage's width
+        # (no downsample after the last stage).
+        # NOTE(review): norm_type is not passed to ResBlock3d, so these blocks use its default
+        # ("layer"); norm_type only reaches out_layer. Confirm this is intended.
+        self.blocks = nn.ModuleList([])
+        for i, ch in enumerate(channels):
+            self.blocks.extend([
+                ResBlock3d(ch, ch)
+                for _ in range(num_res_blocks)
+            ])
+            if i < len(channels) - 1:
+                self.blocks.append(
+                    DownsampleBlock3d(ch, channels[i+1])
+                )
+        self.middle_block = nn.Sequential(*[
+            ResBlock3d(channels[-1], channels[-1])
+            for _ in range(num_res_blocks_middle)
+        ])
+        # Head outputs 2 * latent_channels: mean and log-variance, split in forward().
+        self.out_layer = nn.Sequential(
+            norm_layer(norm_type, channels[-1]),
+            nn.SiLU(),
+            nn.Conv3d(channels[-1], latent_channels*2, 3, padding=1)
+        )
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        Only blocks and middle_block are converted; input_layer and out_layer are left untouched.
+        """
+        self.use_fp16 = True
+        self.dtype = torch.float16
+        self.blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        Only blocks and middle_block are converted; input_layer and out_layer are left untouched.
+        """
+        self.use_fp16 = False
+        self.dtype = torch.float32
+        self.blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x: torch.Tensor, sample_posterior: bool = False, return_raw: bool = False) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+        """
+        Encode x into a latent.
+        Args:
+            x: input grid with in_channels channels.
+            sample_posterior: when True, return mean + std * noise; otherwise return the mean.
+            return_raw: when True, return (z, mean, logvar) instead of z alone.
+        Returns:
+            z, or the tuple (z, mean, logvar) when return_raw is True.
+        """
+        h = self.input_layer(x)
+        # Torso runs in self.dtype; cast back to the input dtype before the head.
+        h = h.type(self.dtype)
+        for block in self.blocks:
+            h = block(h)
+        h = self.middle_block(h)
+        h = h.type(x.dtype)
+        h = self.out_layer(h)
+        # Channel axis holds [mean | logvar].
+        mean, logvar = h.chunk(2, dim=1)
+        if sample_posterior:
+            std = torch.exp(0.5 * logvar)
+            z = mean + std * torch.randn_like(std)
+        else:
+            z = mean
+        if return_raw:
+            return z, mean, logvar
+        return z
+class SparseStructureDecoder(nn.Module):
+    r"""
+    Decoder for Sparse Structure (\mathcal{D}_S in the paper Sec. 3.3).
+    Middle residual blocks at the first channel width, then a stack of residual blocks
+    with an upsampling block between consecutive channel stages, then an output head.
+    Args:
+        out_channels (int): Channels of the output.
+        latent_channels (int): Channels of the latent representation.
+        num_res_blocks (int): Number of residual blocks at each resolution.
+        channels (List[int]): Channels of the decoder blocks.
+        num_res_blocks_middle (int): Number of residual blocks in the middle.
+        norm_type (Literal["group", "layer"]): Type of normalization layer.
+        use_fp16 (bool): Whether to use FP16.
+    """
+    def __init__(
+        self,
+        out_channels: int,
+        latent_channels: int,
+        num_res_blocks: int,
+        channels: List[int],
+        num_res_blocks_middle: int = 2,
+        norm_type: Literal["group", "layer"] = "layer",
+        use_fp16: bool = False,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.latent_channels = latent_channels
+        self.num_res_blocks = num_res_blocks
+        self.channels = channels
+        self.num_res_blocks_middle = num_res_blocks_middle
+        self.norm_type = norm_type
+        self.use_fp16 = use_fp16
+        # dtype the torso (middle_block + blocks) runs in; forward() casts around it.
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.input_layer = nn.Conv3d(latent_channels, channels[0], 3, padding=1)
+        # NOTE(review): norm_type is not passed to ResBlock3d here or below, so those blocks use
+        # its default ("layer"); norm_type only reaches out_layer. Confirm this is intended.
+        self.middle_block = nn.Sequential(*[
+            ResBlock3d(channels[0], channels[0])
+            for _ in range(num_res_blocks_middle)
+        ])
+        # Per stage: num_res_blocks residual blocks, then an upsample to the next stage's width
+        # (no upsample after the last stage).
+        self.blocks = nn.ModuleList([])
+        for i, ch in enumerate(channels):
+            self.blocks.extend([
+                ResBlock3d(ch, ch)
+                for _ in range(num_res_blocks)
+            ])
+            if i < len(channels) - 1:
+                self.blocks.append(
+                    UpsampleBlock3d(ch, channels[i+1])
+                )
+        self.out_layer = nn.Sequential(
+            norm_layer(norm_type, channels[-1]),
+            nn.SiLU(),
+            nn.Conv3d(channels[-1], out_channels, 3, padding=1)
+        )
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        Only blocks and middle_block are converted; input_layer and out_layer are left untouched.
+        """
+        self.use_fp16 = True
+        self.dtype = torch.float16
+        self.blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        Only blocks and middle_block are converted; input_layer and out_layer are left untouched.
+        """
+        self.use_fp16 = False
+        self.dtype = torch.float32
+        self.blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Decode a latent grid x (latent_channels channels) into a grid with out_channels channels.
+        """
+        h = self.input_layer(x)
+        # Torso runs in self.dtype; cast back to the input dtype before the head.
+        h = h.type(self.dtype)
+        h = self.middle_block(h)
+        for block in self.blocks:
+            h = block(h)
+        h = h.type(x.dtype)
+        h = self.out_layer(h)
+        return h

iscene/trellis/models/structured_latent_flow.py ADDED Viewed

	@@ -0,0 +1,267 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from ..modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
+from ..modules.transformer import AbsolutePositionEmbedder
+from ..modules.norm import LayerNorm32
+from ..modules import sparse as sp
+from ..modules.sparse.transformer import ModulatedSparseTransformerCrossBlock
+from .sparse_structure_flow import TimestepEmbedder
+class SparseResBlock3d(nn.Module):
+    """
+    Residual block on sparse tensors, modulated by an embedding vector.
+    Optionally resamples its input (down or up by a factor of 2) before both the
+    residual branch and the skip path.
+    Args:
+        channels: input feature channels.
+        emb_channels: width of the embedding passed to forward().
+        out_channels: output feature channels; falls back to channels when falsy.
+        downsample: apply sp.SparseDownsample(2) to the input first.
+        upsample: apply sp.SparseUpsample(2) to the input first (exclusive with downsample).
+    """
+    def __init__(
+        self,
+        channels: int,
+        emb_channels: int,
+        out_channels: Optional[int] = None,
+        downsample: bool = False,
+        upsample: bool = False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.emb_channels = emb_channels
+        self.out_channels = out_channels or channels
+        self.downsample = downsample
+        self.upsample = upsample
+        assert not (downsample and upsample), "Cannot downsample and upsample at the same time"
+        self.norm1 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
+        # No learned affine on norm2: scale and shift come from the embedding in forward().
+        self.norm2 = LayerNorm32(self.out_channels, elementwise_affine=False, eps=1e-6)
+        self.conv1 = sp.SparseConv3d(channels, self.out_channels, 3)
+        self.conv2 = zero_module(sp.SparseConv3d(self.out_channels, self.out_channels, 3))
+        # Projects the embedding to 2 * out_channels, split into (scale, shift) in forward().
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(emb_channels, 2 * self.out_channels, bias=True),
+        )
+        self.skip_connection = sp.SparseLinear(channels, self.out_channels) if channels != self.out_channels else nn.Identity()
+        self.updown = None
+        if self.downsample:
+            self.updown = sp.SparseDownsample(2)
+        elif self.upsample:
+            self.updown = sp.SparseUpsample(2)
+    def _updown(self, x: sp.SparseTensor) -> sp.SparseTensor:
+        """Apply the configured resampling module to x, or return x unchanged when there is none."""
+        if self.updown is not None:
+            x = self.updown(x)
+        return x
+    def forward(self, x: sp.SparseTensor, emb: torch.Tensor) -> sp.SparseTensor:
+        """
+        Apply the block to sparse tensor x, modulated by emb.
+        Args:
+            x: input sparse tensor with `channels` feature channels.
+            emb: embedding fed to emb_layers (presumably (batch, emb_channels) - confirm with callers).
+        Returns:
+            Sparse tensor with out_channels feature channels: residual branch + skip path.
+        """
+        emb_out = self.emb_layers(emb).type(x.dtype)
+        scale, shift = torch.chunk(emb_out, 2, dim=1)
+        # Resample first so the residual branch and the skip path see the same tensor.
+        x = self._updown(x)
+        h = x.replace(self.norm1(x.feats))
+        h = h.replace(F.silu(h.feats))
+        h = self.conv1(h)
+        # Modulate the normalised features with the embedding-derived scale and shift.
+        h = h.replace(self.norm2(h.feats)) * (1 + scale) + shift
+        h = h.replace(F.silu(h.feats))
+        h = self.conv2(h)
+        h = h + self.skip_connection(x)
+        return h
+class SLatFlowModel(nn.Module):
+    """
+    Timestep- and condition-modulated sparse transformer with optional sparse
+    residual input/output stages around it.
+    Pipeline in forward(): input projection -> input res blocks (each stage ends with a
+    downsampling block) -> transformer blocks -> output res blocks (each stage starts
+    with an upsampling block, optionally fed skip features) -> layer norm -> output projection.
+    Args:
+        resolution: stored on the instance; not otherwise used in this class.
+        in_channels: feature channels of the input sparse tensor.
+        model_channels: token width inside the transformer.
+        cond_channels: channel width of the conditioning tokens, forwarded to every block.
+        out_channels: feature channels of the output sparse tensor.
+        num_blocks: number of transformer blocks.
+        num_heads: attention heads; when falsy, model_channels // num_head_channels is used.
+        num_head_channels: channels per head, only used to derive num_heads.
+        mlp_ratio: forwarded to every transformer block.
+        patch_size: must be a power of two with log2(patch_size) == len(io_block_channels)
+            when io_block_channels is given.
+        num_io_res_blocks: residual blocks per input/output stage (including the resampling one).
+        io_block_channels: channel width of each input/output stage; None disables the stages.
+        pe_mode: "ape" adds an absolute position embedding computed from the coordinates;
+            "rope" is forwarded to the blocks as use_rope=True.
+        use_fp16: convert input blocks, transformer blocks and output blocks to float16.
+        use_checkpoint: forwarded to every transformer block.
+        use_skip_connection: concatenate input-stage features onto the matching output block's input.
+        share_mod: use a single model-level adaLN modulation layer instead of per-block ones.
+        qk_rms_norm: forwarded to every transformer block.
+        qk_rms_norm_cross: forwarded to every transformer block.
+    """
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        model_channels: int,
+        cond_channels: int,
+        out_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        patch_size: int = 2,
+        num_io_res_blocks: int = 2,
+        io_block_channels: Optional[List[int]] = None,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        use_skip_connection: bool = True,
+        share_mod: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+    ):
+        super().__init__()
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.cond_channels = cond_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.num_heads = num_heads or model_channels // num_head_channels
+        self.mlp_ratio = mlp_ratio
+        self.patch_size = patch_size
+        self.num_io_res_blocks = num_io_res_blocks
+        self.io_block_channels = io_block_channels
+        self.pe_mode = pe_mode
+        self.use_fp16 = use_fp16
+        self.use_checkpoint = use_checkpoint
+        self.use_skip_connection = use_skip_connection
+        self.share_mod = share_mod
+        self.qk_rms_norm = qk_rms_norm
+        self.qk_rms_norm_cross = qk_rms_norm_cross
+        # dtype the torso runs in; forward() casts features, modulation and cond to it.
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        if self.io_block_channels is not None:
+            # Each IO stage resamples by 2, so patch_size fixes how many stages there must be.
+            assert int(np.log2(patch_size)) == np.log2(patch_size), "Patch size must be a power of 2"
+            assert np.log2(patch_size) == len(io_block_channels), "Number of IO ResBlocks must match the number of stages"
+        self.t_embedder = TimestepEmbedder(model_channels)
+        if share_mod:
+            # Shared modulation projection applied once to the timestep embedding in forward().
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(model_channels, 6 * model_channels, bias=True)
+            )
+        if pe_mode == "ape":
+            self.pos_embedder = AbsolutePositionEmbedder(model_channels)
+        self.input_layer = sp.SparseLinear(in_channels, model_channels if io_block_channels is None else io_block_channels[0])
+        # Input stages: (num_io_res_blocks - 1) same-width blocks, then one downsampling block
+        # that moves to the next stage's width (model_channels after the last stage).
+        self.input_blocks = nn.ModuleList([])
+        if io_block_channels is not None:
+            for chs, next_chs in zip(io_block_channels, io_block_channels[1:] + [model_channels]):
+                self.input_blocks.extend([
+                    SparseResBlock3d(
+                        chs,
+                        model_channels,
+                        out_channels=chs,
+                    )
+                    for _ in range(num_io_res_blocks-1)
+                ])
+                self.input_blocks.append(
+                    SparseResBlock3d(
+                        chs,
+                        model_channels,
+                        out_channels=next_chs,
+                        downsample=True,
+                    )
+                )
+        self.blocks = nn.ModuleList([
+            ModulatedSparseTransformerCrossBlock(
+                model_channels,
+                cond_channels,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                attn_mode='full',
+                use_checkpoint=self.use_checkpoint,
+                use_rope=(pe_mode == "rope"),
+                share_mod=self.share_mod,
+                qk_rms_norm=self.qk_rms_norm,
+                qk_rms_norm_cross=self.qk_rms_norm_cross,
+            )
+            for _ in range(num_blocks)
+        ])
+        # Output stages mirror the input stages in reverse: one upsampling block, then
+        # (num_io_res_blocks - 1) same-width blocks. Input widths are doubled when skip
+        # features are concatenated in forward().
+        self.out_blocks = nn.ModuleList([])
+        if io_block_channels is not None:
+            for chs, prev_chs in zip(reversed(io_block_channels), [model_channels] + list(reversed(io_block_channels[1:]))):
+                self.out_blocks.append(
+                    SparseResBlock3d(
+                        prev_chs * 2 if self.use_skip_connection else prev_chs,
+                        model_channels,
+                        out_channels=chs,
+                        upsample=True,
+                    )
+                )
+                self.out_blocks.extend([
+                    SparseResBlock3d(
+                        chs * 2 if self.use_skip_connection else chs,
+                        model_channels,
+                        out_channels=chs,
+                    )
+                    for _ in range(num_io_res_blocks-1)
+                ])
+        self.out_layer = sp.SparseLinear(model_channels if io_block_channels is None else io_block_channels[0], out_channels)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        Covers input_blocks, blocks and out_blocks; self.dtype is not changed here.
+        """
+        self.input_blocks.apply(convert_module_to_f16)
+        self.blocks.apply(convert_module_to_f16)
+        self.out_blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        Covers input_blocks, blocks and out_blocks; self.dtype is not changed here.
+        """
+        self.input_blocks.apply(convert_module_to_f32)
+        self.blocks.apply(convert_module_to_f32)
+        self.out_blocks.apply(convert_module_to_f32)
+    def initialize_weights(self) -> None:
+        """
+        Initialise all parameters; called once from __init__.
+        The generic Linear init runs first and the later steps overwrite it.
+        """
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers in DiT blocks:
+        if self.share_mod:
+            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
+        else:
+            for block in self.blocks:
+                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def forward(self, x: sp.SparseTensor, t: torch.Tensor, cond: torch.Tensor) -> sp.SparseTensor:
+        """
+        Run the model on one sparse batch.
+        Args:
+            x: input sparse tensor with in_channels feature channels.
+            t: timesteps for the timestep embedder.
+            cond: conditioning tensor handed to every transformer block as its third argument.
+        Returns:
+            Sparse tensor with out_channels feature channels; the output projection runs in x.dtype.
+        """
+        h = self.input_layer(x).type(self.dtype)
+        t_emb = self.t_embedder(t)
+        if self.share_mod:
+            t_emb = self.adaLN_modulation(t_emb)
+        t_emb = t_emb.type(self.dtype)
+        cond = cond.type(self.dtype)
+        # Features after every input block, consumed in reverse order by the output blocks.
+        skips = []
+        # pack with input blocks
+        for block in self.input_blocks:
+            h = block(h, t_emb)
+            skips.append(h.feats)
+        if self.pe_mode == "ape":
+            # Column 0 of coords is dropped (presumably the batch index - confirm in sp.SparseTensor).
+            h = h + self.pos_embedder(h.coords[:, 1:]).type(self.dtype)
+        for block in self.blocks:
+            h = block(h, t_emb, cond)
+        # unpack with output blocks
+        for block, skip in zip(self.out_blocks, reversed(skips)):
+            if self.use_skip_connection:
+                # Concatenate along the channel axis; out block input widths were doubled for this.
+                h = block(h.replace(torch.cat([h.feats, skip], dim=1)), t_emb)
+            else:
+                h = block(h, t_emb)
+        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
+        h = self.out_layer(h.type(x.dtype))
+        return h

iscene/trellis/models/structured_latent_vae/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Re-export the two structured-latent decoders; __all__ limits `import *` to these names.
+from .decoder_gs import SLatGaussianDecoder
+from .decoder_mesh import SLatMeshDecoder
+__all__ = ["SLatGaussianDecoder", "SLatMeshDecoder"]

iscene/trellis/models/structured_latent_vae/base.py ADDED Viewed

	@@ -0,0 +1,117 @@

+from typing import *
+import torch
+import torch.nn as nn
+from ...modules.utils import convert_module_to_f16, convert_module_to_f32
+from ...modules import sparse as sp
+from ...modules.transformer import AbsolutePositionEmbedder
+from ...modules.sparse.transformer import SparseTransformerBlock
+def block_attn_config(self):
+    """
+    Yield the attention configuration of each block of the model.
+    One 5-tuple ``(attn_mode, window_size, shift_sequence, shift_window, serialize_mode)``
+    is produced per block, in block order.
+    Args:
+        self: object exposing ``num_blocks``, ``attn_mode`` and ``window_size``.
+    Raises:
+        ValueError: if ``attn_mode`` is not a supported mode. Without this check an
+            unknown mode yields nothing and the model ends up with no blocks at all.
+    """
+    for i in range(self.num_blocks):
+        # Odd-numbered blocks use the shifted variant of the chosen scheme.
+        parity = i % 2
+        if self.attn_mode == "shift_window":
+            yield "serialized", self.window_size, 0, (16 * parity,) * 3, sp.SerializeMode.Z_ORDER
+        elif self.attn_mode == "shift_sequence":
+            yield "serialized", self.window_size, self.window_size // 2 * parity, (0, 0, 0), sp.SerializeMode.Z_ORDER
+        elif self.attn_mode == "shift_order":
+            # Cycle through the first four serialization orders.
+            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[i % 4]
+        elif self.attn_mode == "full":
+            yield "full", None, None, None, None
+        elif self.attn_mode == "swin":
+            yield "windowed", self.window_size, None, self.window_size // 2 * parity, None
+        else:
+            raise ValueError(f"Unknown attention mode: {self.attn_mode}")
+class SparseTransformerBase(nn.Module):
+    """
+    Sparse transformer backbone without an output head.
+    Holds the input projection, the optional absolute position embedder and the
+    stack of transformer blocks; encoders and decoders build on top of it.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        model_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
+        window_size: Optional[int] = None,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__()
+        # Without an explicit head count, derive it from the channel width.
+        if num_heads:
+            self.num_heads = num_heads
+        else:
+            self.num_heads = model_channels // num_head_channels
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.num_blocks = num_blocks
+        self.window_size = window_size
+        self.mlp_ratio = mlp_ratio
+        self.attn_mode = attn_mode
+        self.pe_mode = pe_mode
+        self.use_fp16 = use_fp16
+        self.use_checkpoint = use_checkpoint
+        self.qk_rms_norm = qk_rms_norm
+        if use_fp16:
+            self.dtype = torch.float16
+        else:
+            self.dtype = torch.float32
+        if pe_mode == "ape":
+            self.pos_embedder = AbsolutePositionEmbedder(model_channels)
+        self.input_layer = sp.SparseLinear(in_channels, model_channels)
+        # One block per entry of block_attn_config, which reads num_blocks,
+        # attn_mode and window_size from the attributes set above.
+        torso = []
+        for blk_mode, blk_window, blk_shift_seq, blk_shift_win, blk_order in block_attn_config(self):
+            torso.append(
+                SparseTransformerBlock(
+                    model_channels,
+                    num_heads=self.num_heads,
+                    mlp_ratio=self.mlp_ratio,
+                    attn_mode=blk_mode,
+                    window_size=blk_window,
+                    shift_sequence=blk_shift_seq,
+                    shift_window=blk_shift_win,
+                    serialize_mode=blk_order,
+                    use_checkpoint=self.use_checkpoint,
+                    use_rope=(pe_mode == "rope"),
+                    qk_rms_norm=self.qk_rms_norm,
+                )
+            )
+        self.blocks = nn.ModuleList(torso)
+    @property
+    def device(self) -> torch.device:
+        """
+        Device of the first parameter of the model.
+        """
+        first_param = next(self.parameters())
+        return first_param.device
+    def convert_to_fp16(self) -> None:
+        """
+        Apply the float16 conversion helper to the transformer blocks.
+        """
+        self.blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Apply the float32 conversion helper to the transformer blocks.
+        """
+        self.blocks.apply(convert_module_to_f32)
+    def initialize_weights(self) -> None:
+        """
+        Xavier-initialize every `nn.Linear` weight and zero its bias, if any.
+        """
+        def _init_linear(module):
+            if not isinstance(module, nn.Linear):
+                return
+            torch.nn.init.xavier_uniform_(module.weight)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0)
+        self.apply(_init_linear)
+    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
+        """
+        Project the input, add the position embedding when `pe_mode` is "ape",
+        cast to the working dtype and run all transformer blocks.
+        """
+        hidden = self.input_layer(x)
+        if self.pe_mode == "ape":
+            # Column 0 of coords is dropped; the remaining columns feed the position embedder.
+            hidden = hidden + self.pos_embedder(x.coords[:, 1:])
+        hidden = hidden.type(self.dtype)
+        for block in self.blocks:
+            hidden = block(hidden)
+        return hidden

iscene/trellis/models/structured_latent_vae/decoder_gs.py ADDED Viewed

	@@ -0,0 +1,122 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ...modules import sparse as sp
+from ...utils.random_utils import hammersley_sequence
+from .base import SparseTransformerBase
+from ...representations import Gaussian
+class SLatGaussianDecoder(SparseTransformerBase):
+    """
+    Decoder that turns a sparse latent into one `Gaussian` representation per sample.
+    The transformer torso is followed by a layer norm and a linear head whose
+    channels are split according to `self.layout`.
+    """
+    def __init__(
+        self,
+        resolution: int,
+        model_channels: int,
+        latent_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
+        window_size: int = 8,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+        representation_config: dict = None,
+    ):
+        super().__init__(
+            in_channels=latent_channels,
+            model_channels=model_channels,
+            num_blocks=num_blocks,
+            num_heads=num_heads,
+            num_head_channels=num_head_channels,
+            mlp_ratio=mlp_ratio,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            pe_mode=pe_mode,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.resolution = resolution
+        self.rep_config = representation_config
+        # The layout fixes out_channels, which the output head needs.
+        self._calc_layout()
+        self.out_layer = sp.SparseLinear(model_channels, self.out_channels)
+        self._build_perturbation()
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    def initialize_weights(self) -> None:
+        """
+        Run the base initialization, then zero the output head.
+        """
+        super().initialize_weights()
+        for tensor in (self.out_layer.weight, self.out_layer.bias):
+            nn.init.constant_(tensor, 0)
+    def _build_perturbation(self) -> None:
+        """
+        Register the `offset_perturbation` buffer: one 3D Hammersley sample per
+        gaussian, mapped to [-1, 1], divided by `voxel_size` and passed through atanh.
+        """
+        count = self.rep_config['num_gaussians']
+        samples = [hammersley_sequence(3, idx, count) for idx in range(count)]
+        perturbation = torch.tensor(samples).float() * 2 - 1
+        perturbation = perturbation / self.rep_config['voxel_size']
+        perturbation = torch.atanh(perturbation).to(self.device)
+        self.register_buffer('offset_perturbation', perturbation)
+    def _calc_layout(self) -> None:
+        """
+        Build `self.layout` (shape, size and channel range of each gaussian
+        attribute) and set `self.out_channels` to the total channel count.
+        """
+        n = self.rep_config['num_gaussians']
+        # (attribute, reshaped shape, channel count); the order defines the channel ranges.
+        fields = [
+            ('_xyz', (n, 3), n * 3),
+            ('_features_dc', (n, 1, 3), n * 3),
+            ('_scaling', (n, 3), n * 3),
+            ('_rotation', (n, 4), n * 4),
+            ('_opacity', (n, 1), n),
+        ]
+        self.layout = {}
+        cursor = 0
+        for name, shape, size in fields:
+            self.layout[name] = {'shape': shape, 'size': size, 'range': (cursor, cursor + size)}
+            cursor += size
+        self.out_channels = cursor
+    def to_representation(self, x: sp.SparseTensor) -> List[Gaussian]:
+        """
+        Convert a batch of network outputs to 3D representations.
+        Args:
+            x: The [N x * x C] sparse tensor output by the network.
+        Returns:
+            list with one `Gaussian` per sample in the batch
+        """
+        results = []
+        for i in range(x.shape[0]):
+            gaussians = Gaussian(
+                sh_degree=0,
+                aabb=[-0.5, -0.5, -0.5, 1.0, 1.0, 1.0],
+                mininum_kernel_size=self.rep_config['3d_filter_kernel_size'],
+                scaling_bias=self.rep_config['scaling_bias'],
+                opacity_bias=self.rep_config['opacity_bias'],
+                scaling_activation=self.rep_config['scaling_activation'],
+            )
+            rows = x.layout[i]
+            sample_feats = x.feats[rows]
+            # Voxel centers in [0, 1]: coords columns 1.. plus half a voxel, over the resolution.
+            centers = (x.coords[rows][:, 1:].float() + 0.5) / self.resolution
+            for name, spec in self.layout.items():
+                lo, hi = spec['range']
+                chunk = sample_feats[:, lo:hi].reshape(-1, *spec['shape'])
+                if name != '_xyz':
+                    values = chunk.flatten(0, 1)
+                    values = values * self.rep_config['lr'][name]
+                    setattr(gaussians, name, values)
+                    continue
+                # Positions are predicted as bounded offsets around the voxel center.
+                offset = chunk * self.rep_config['lr'][name]
+                if self.rep_config['perturb_offset']:
+                    offset = offset + self.offset_perturbation
+                offset = torch.tanh(offset) / self.resolution * 0.5 * self.rep_config['voxel_size']
+                positions = centers.unsqueeze(1) + offset
+                setattr(gaussians, name, positions.flatten(0, 1))
+            results.append(gaussians)
+        return results
+    def forward(self, x: sp.SparseTensor) -> List[Gaussian]:
+        """
+        Decode `x`: torso, cast back to the input dtype, layer norm, output
+        head, then conversion to `Gaussian` representations.
+        """
+        hidden = super().forward(x)
+        hidden = hidden.type(x.dtype)
+        normed = F.layer_norm(hidden.feats, hidden.feats.shape[-1:])
+        hidden = self.out_layer(hidden.replace(normed))
+        return self.to_representation(hidden)

iscene/trellis/models/structured_latent_vae/decoder_mesh.py ADDED Viewed

	@@ -0,0 +1,167 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from ...modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
+from ...modules import sparse as sp
+from .base import SparseTransformerBase
+from ...representations import MeshExtractResult
+from ...representations.mesh import SparseFeatures2Mesh
+class SparseSubdivideBlock3d(nn.Module):
+    """
+    A 3D residual block that subdivides a sparse tensor.
+    Args:
+        channels: channels in the inputs.
+        resolution: resolution of the input; `out_resolution` is recorded as twice this value.
+        out_channels: if specified, the number of output channels (defaults to `channels`).
+        num_groups: the number of groups for the group norms.
+    """
+    def __init__(
+        self,
+        channels: int,
+        resolution: int,
+        out_channels: Optional[int] = None,
+        num_groups: int = 32
+    ):
+        super().__init__()
+        self.channels = channels
+        self.resolution = resolution
+        self.out_resolution = resolution * 2
+        self.out_channels = out_channels if out_channels else channels
+        # All convolutions of this block use the same indice key, named after the output resolution.
+        conv_key = f"res_{self.out_resolution}"
+        self.act_layers = nn.Sequential(
+            sp.SparseGroupNorm32(num_groups, channels),
+            sp.SparseSiLU()
+        )
+        self.sub = sp.SparseSubdivide()
+        self.out_layers = nn.Sequential(
+            sp.SparseConv3d(channels, self.out_channels, 3, indice_key=conv_key),
+            sp.SparseGroupNorm32(num_groups, self.out_channels),
+            sp.SparseSiLU(),
+            zero_module(sp.SparseConv3d(self.out_channels, self.out_channels, 3, indice_key=conv_key)),
+        )
+        # A 1x1x1 convolution on the skip path is only needed when the channel count changes.
+        if self.out_channels != channels:
+            self.skip_connection = sp.SparseConv3d(channels, self.out_channels, 1, indice_key=conv_key)
+        else:
+            self.skip_connection = nn.Identity()
+    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
+        """
+        Apply the block to a sparse tensor.
+        Args:
+            x: the input sparse tensor.
+        Returns:
+            the subdivided main branch plus the subdivided skip branch.
+        """
+        main = self.act_layers(x)
+        main = self.sub(main)
+        skip = self.sub(x)
+        main = self.out_layers(main)
+        return main + self.skip_connection(skip)
+class SLatMeshDecoder(SparseTransformerBase):
+    """
+    Decoder that turns a sparse latent into one mesh per sample.
+    The transformer torso is followed by two subdivide blocks, a linear head
+    and a `SparseFeatures2Mesh` extractor.
+    """
+    def __init__(
+        self,
+        resolution: int,
+        model_channels: int,
+        latent_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
+        window_size: int = 8,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+        representation_config: dict = None,
+    ):
+        super().__init__(
+            in_channels=latent_channels,
+            model_channels=model_channels,
+            num_blocks=num_blocks,
+            num_heads=num_heads,
+            num_head_channels=num_head_channels,
+            mlp_ratio=mlp_ratio,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            pe_mode=pe_mode,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.resolution = resolution
+        # The signature defaults to None; fall back to an empty config so the
+        # lookup below does not fail on it.
+        self.rep_config = {} if representation_config is None else representation_config
+        # Two subdivide blocks are stacked below and each doubles the resolution,
+        # so the features reaching the extractor are at resolution * 4.
+        # NOTE(review): assumes `res` is the resolution of the features the extractor
+        # receives - confirm against SparseFeatures2Mesh.
+        self.mesh_extractor = SparseFeatures2Mesh(res=self.resolution * 4, use_color=self.rep_config.get('use_color', False))
+        self.out_channels = self.mesh_extractor.feats_channels
+        self.upsample = nn.ModuleList([
+            SparseSubdivideBlock3d(
+                channels=model_channels,
+                resolution=resolution,
+                out_channels=model_channels // 4
+            ),
+            SparseSubdivideBlock3d(
+                channels=model_channels // 4,
+                resolution=resolution * 2,
+                out_channels=model_channels // 8
+            )
+        ])
+        self.out_layer = sp.SparseLinear(model_channels // 8, self.out_channels)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    def initialize_weights(self) -> None:
+        """
+        Run the base initialization, then zero the output head.
+        """
+        super().initialize_weights()
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso and the upsample blocks of the model to float16.
+        """
+        super().convert_to_fp16()
+        self.upsample.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso and the upsample blocks of the model to float32.
+        """
+        super().convert_to_fp32()
+        self.upsample.apply(convert_module_to_f32)
+    def to_representation(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
+        """
+        Convert a batch of network outputs to 3D representations.
+        Args:
+            x: The [N x * x C] sparse tensor output by the network.
+        Returns:
+            list with one extracted mesh per sample in the batch
+        """
+        return [
+            self.mesh_extractor(x[i], training=self.training)
+            for i in range(x.shape[0])
+        ]
+    def forward(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
+        """
+        Decode `x`: torso, upsample blocks, cast back to the input dtype,
+        output head, then mesh extraction.
+        """
+        h = super().forward(x)
+        for block in self.upsample:
+            h = block(h)
+        h = h.type(x.dtype)
+        h = self.out_layer(h)
+        return self.to_representation(h)

iscene/trellis/modules/attention/__init__.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from typing import *
+BACKEND = 'flash_attn'
+DEBUG = False
+# Backend names that can be selected through ATTN_BACKEND or set_backend.
+_SUPPORTED_BACKENDS = ('xformers', 'flash_attn', 'sdpa', 'naive')
+def __from_env():
+    """
+    Initialise BACKEND and DEBUG from the ATTN_BACKEND and ATTN_DEBUG
+    environment variables, then print the backend in use.
+    An unrecognised ATTN_BACKEND leaves the default in place and is reported
+    instead of being dropped silently.
+    """
+    import os
+    global BACKEND
+    global DEBUG
+    env_attn_backend = os.environ.get('ATTN_BACKEND')
+    env_attn_debug = os.environ.get('ATTN_DEBUG')
+    if env_attn_backend is not None:
+        if env_attn_backend in _SUPPORTED_BACKENDS:
+            BACKEND = env_attn_backend
+        else:
+            print(f"[ATTENTION] Ignoring unknown ATTN_BACKEND {env_attn_backend!r}, expected one of {_SUPPORTED_BACKENDS}")
+    if env_attn_debug is not None:
+        # Only the exact string '1' enables debug mode.
+        DEBUG = env_attn_debug == '1'
+    print(f"[ATTENTION] Using backend: {BACKEND}")
+__from_env()
+def set_backend(backend: Literal['xformers', 'flash_attn', 'sdpa', 'naive']):
+    """
+    Set the module-level BACKEND.
+    NOTE(review): modules that did `from . import BACKEND` hold a copy taken at
+    import time and will not see this change - confirm which call sites rely on it.
+    """
+    global BACKEND
+    BACKEND = backend
+def set_debug(debug: bool):
+    """
+    Set the module-level DEBUG flag.
+    """
+    global DEBUG
+    DEBUG = debug
+from .full_attn import *
+from .modules import *

iscene/trellis/modules/attention/full_attn.py ADDED Viewed

	@@ -0,0 +1,140 @@

+from typing import *
+import torch
+import math
+from . import DEBUG, BACKEND
+if BACKEND == 'xformers':
+    import xformers.ops as xops
+elif BACKEND == 'flash_attn':
+    import flash_attn
+elif BACKEND == 'sdpa':
+    from torch.nn.functional import scaled_dot_product_attention as sdpa
+elif BACKEND == 'naive':
+    pass
+else:
+    raise ValueError(f"Unknown attention backend: {BACKEND}")
+__all__ = [
+    'scaled_dot_product_attention',
+]
+def _naive_sdpa(q, k, v):
+    """
+    Reference scaled dot product attention built from plain tensor ops.
+    Inputs and output use the [N, L, H, C] layout; the heads axis is moved
+    next to the batch axis for the matrix products and moved back afterwards.
+    """
+    q_heads = q.permute(0, 2, 1, 3)   # [N, H, Lq, C]
+    k_heads = k.permute(0, 2, 1, 3)   # [N, H, Lk, C]
+    v_heads = v.permute(0, 2, 1, 3)   # [N, H, Lk, C]
+    scale = 1 / math.sqrt(q_heads.size(-1))
+    scores = torch.matmul(q_heads, k_heads.transpose(-2, -1)) * scale
+    weights = torch.softmax(scores, dim=-1)
+    attended = torch.matmul(weights, v_heads)
+    return attended.permute(0, 2, 1, 3)   # [N, L, H, C]
+@overload
+def scaled_dot_product_attention(qkv: torch.Tensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention.
+    Args:
+        qkv (torch.Tensor): A [N, L, 3, H, C] tensor containing Qs, Ks, and Vs.
+    """
+    ...
+@overload
+def scaled_dot_product_attention(q: torch.Tensor, kv: torch.Tensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention.
+    Args:
+        q (torch.Tensor): A [N, L, H, C] tensor containing Qs.
+        kv (torch.Tensor): A [N, L, 2, H, C] tensor containing Ks and Vs.
+    """
+    ...
+@overload
+def scaled_dot_product_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention.
+    Args:
+        q (torch.Tensor): A [N, L, H, Ci] tensor containing Qs.
+        k (torch.Tensor): A [N, L, H, Ci] tensor containing Ks.
+        v (torch.Tensor): A [N, L, H, Co] tensor containing Vs.
+    Note:
+        k and v are assumed to have the same coordinate map.
+    """
+    ...
+def scaled_dot_product_attention(*args, **kwargs):
+    """
+    Apply scaled dot product attention with the backend named by BACKEND.
+    Takes one packed tensor (`qkv`), two tensors (`q`, `kv`) or three tensors
+    (`q`, `k`, `v`), given positionally and/or by keyword.
+    Returns:
+        The attention output in [N, L, H, C] layout.
+    Raises:
+        AssertionError: on a wrong argument count, a missing keyword or an invalid shape.
+        ValueError: if BACKEND is not a known backend.
+    """
+    arg_names_dict = {
+        1: ['qkv'],
+        2: ['q', 'kv'],
+        3: ['q', 'k', 'v']
+    }
+    num_all_args = len(args) + len(kwargs)
+    assert num_all_args in arg_names_dict, f"Invalid number of arguments, got {num_all_args}, expected 1, 2, or 3"
+    arg_names = arg_names_dict[num_all_args]
+    for key in arg_names[len(args):]:
+        assert key in kwargs, f"Missing argument {key}"
+    # Positional arguments fill the leading names; the checks above guarantee
+    # that the keywords are exactly the remaining ones.
+    bound = dict(zip(arg_names, args))
+    bound.update(kwargs)
+    if num_all_args == 1:
+        qkv = bound['qkv']
+        assert len(qkv.shape) == 5 and qkv.shape[2] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, L, 3, H, C]"
+    elif num_all_args == 2:
+        q = bound['q']
+        kv = bound['kv']
+        assert q.shape[0] == kv.shape[0], f"Batch size mismatch, got {q.shape[0]} and {kv.shape[0]}"
+        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, C]"
+        assert len(kv.shape) == 5, f"Invalid shape for kv, got {kv.shape}, expected [N, L, 2, H, C]"
+    elif num_all_args == 3:
+        q = bound['q']
+        k = bound['k']
+        v = bound['v']
+        assert q.shape[0] == k.shape[0] == v.shape[0], f"Batch size mismatch, got {q.shape[0]}, {k.shape[0]}, and {v.shape[0]}"
+        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, Ci]"
+        assert len(k.shape) == 4, f"Invalid shape for k, got {k.shape}, expected [N, L, H, Ci]"
+        assert len(v.shape) == 4, f"Invalid shape for v, got {v.shape}, expected [N, L, H, Co]"
+    if BACKEND == 'flash_attn':
+        # flash_attn has entry points for each packing, so nothing is unpacked here.
+        if num_all_args == 1:
+            out = flash_attn.flash_attn_qkvpacked_func(qkv)
+        elif num_all_args == 2:
+            out = flash_attn.flash_attn_kvpacked_func(q, kv)
+        elif num_all_args == 3:
+            out = flash_attn.flash_attn_func(q, k, v)
+        return out
+    if BACKEND not in ('xformers', 'sdpa', 'naive'):
+        raise ValueError(f"Unknown attention module: {BACKEND}")
+    # The remaining backends take separate q, k and v tensors.
+    if num_all_args == 1:
+        q, k, v = qkv.unbind(dim=2)
+    elif num_all_args == 2:
+        k, v = kv.unbind(dim=2)
+    if BACKEND == 'xformers':
+        out = xops.memory_efficient_attention(q, k, v)
+    elif BACKEND == 'sdpa':
+        q = q.permute(0, 2, 1, 3)   # [N, H, L, C]
+        k = k.permute(0, 2, 1, 3)   # [N, H, L, C]
+        v = v.permute(0, 2, 1, 3)   # [N, H, L, C]
+        out = sdpa(q, k, v)         # [N, H, L, C]
+        out = out.permute(0, 2, 1, 3)   # [N, L, H, C]
+    else:
+        out = _naive_sdpa(q, k, v)
+    return out

iscene/trellis/modules/attention/modules.py ADDED Viewed

	@@ -0,0 +1,342 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .full_attn import scaled_dot_product_attention
+from einops import rearrange
+class MultiHeadRMSNorm(nn.Module):
+    """
+    Normalization over the last axis with a learnable per-head gain.
+    The input is L2-normalized in float32, multiplied by `gamma` (shape
+    [heads, dim]) and by sqrt(dim), then cast back to the input dtype.
+    """
+    def __init__(self, dim: int, heads: int):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.gamma = nn.Parameter(torch.ones(heads, dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        in_dtype = x.dtype
+        unit = F.normalize(x.float(), dim=-1)
+        scaled = unit * self.gamma * self.scale
+        return scaled.to(in_dtype)
+class RotaryPositionEmbedder(nn.Module):
+    """
+    Rotary position embedding for queries and keys.
+    Each of the `in_channels` position coordinates gets
+    `hidden_size // in_channels // 2` rotation frequencies; complex channels
+    left over after that are padded with unit phases, i.e. not rotated.
+    """
+    def __init__(self, hidden_size: int, in_channels: int = 3):
+        super().__init__()
+        assert hidden_size % 2 == 0, "Hidden size must be divisible by 2"
+        self.hidden_size = hidden_size
+        self.in_channels = in_channels
+        self.freq_dim = hidden_size // in_channels // 2
+        self.freqs = torch.arange(self.freq_dim, dtype=torch.float32) / self.freq_dim
+        self.freqs = 1.0 / (10000 ** self.freqs)
+    def _get_phases(self, indices: torch.Tensor) -> torch.Tensor:
+        """
+        Return unit complex phases of shape [len(indices), freq_dim] for a flat tensor of positions.
+        """
+        # freqs is a plain attribute (not a buffer), so it is moved to the right device on use.
+        self.freqs = self.freqs.to(indices.device)
+        phases = torch.outer(indices, self.freqs)
+        phases = torch.polar(torch.ones_like(phases), phases)
+        return phases
+    def _rotary_embedding(self, x: torch.Tensor, phases: torch.Tensor) -> torch.Tensor:
+        """
+        Rotate consecutive channel pairs of `x` (viewed as complex numbers) by `phases`.
+        """
+        x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        x_rotated = x_complex * phases
+        x_embed = torch.view_as_real(x_rotated).reshape(*x_rotated.shape[:-1], -1).to(x.dtype)
+        return x_embed
+    def forward(self, q: torch.Tensor, k: torch.Tensor, indices: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            q (torch.Tensor): [..., N, D] tensor of queries
+            k (torch.Tensor): [..., N, D] tensor of keys
+            indices (torch.Tensor): [..., N, C] tensor of spatial positions
+        Returns:
+            The rotated (q, k) pair, with the shapes and dtypes of the inputs.
+        """
+        if indices is None:
+            # NOTE(review): this fallback builds positions without the trailing
+            # channel axis described above - verify against callers before relying on it.
+            indices = torch.arange(q.shape[-2], device=q.device)
+            if len(q.shape) > 2:
+                indices = indices.unsqueeze(0).expand(q.shape[:-2] + (-1,))
+        phases = self._get_phases(indices.reshape(-1)).reshape(*indices.shape[:-1], -1)
+        # The phase channels live on the LAST axis. Measuring axis 1 is only
+        # right for 2D phases and picks up the token axis once batch dims exist.
+        missing = self.hidden_size // 2 - phases.shape[-1]
+        if missing > 0:
+            pad_shape = (*phases.shape[:-1], missing)
+            # Unit magnitude, zero angle: the padded channels pass through unchanged.
+            identity = torch.polar(
+                torch.ones(pad_shape, device=phases.device),
+                torch.zeros(pad_shape, device=phases.device)
+            )
+            phases = torch.cat([phases, identity], dim=-1)
+        q_embed = self._rotary_embedding(q, phases)
+        k_embed = self._rotary_embedding(k, phases)
+        return q_embed, k_embed
+class MultiHeadAttention(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        ctx_channels: Optional[int]=None,
+        type: Literal["self", "cross"] = "self",
+        attn_mode: Literal["full", "windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        qkv_bias: bool = True,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        """
+        Build the projections for self- or cross-attention.
+        Args:
+            channels: width of the input and output features; must be divisible by `num_heads`.
+            num_heads: number of attention heads.
+            ctx_channels: width of the cross-attention context (defaults to `channels`).
+            type: "self" or "cross".
+            attn_mode: "full" or "windowed"; "windowed" raises NotImplementedError.
+            window_size: stored on the module as given.
+            shift_window: stored on the module as given.
+            qkv_bias: whether the q/k/v projections have a bias.
+            use_rope: create a rotary position embedder.
+            qk_rms_norm: create RMS norms for queries and keys.
+        """
+        super().__init__()
+        assert channels % num_heads == 0
+        assert type in ["self", "cross"], f"Invalid attention type: {type}"
+        assert attn_mode in ["full", "windowed"], f"Invalid attention mode: {attn_mode}"
+        assert type == "self" or attn_mode == "full", "Cross-attention only supports full attention"
+        if attn_mode == "windowed":
+            raise NotImplementedError("Windowed attention is not yet implemented")
+        head_dim = channels // num_heads
+        self.channels = channels
+        self.head_dim = head_dim
+        self.ctx_channels = channels if ctx_channels is None else ctx_channels
+        self.num_heads = num_heads
+        self._type = type
+        self.attn_mode = attn_mode
+        self.window_size = window_size
+        self.shift_window = shift_window
+        self.use_rope = use_rope
+        self.qk_rms_norm = qk_rms_norm
+        if type == "cross":
+            # Queries come from x, keys and values from the context.
+            self.to_q = nn.Linear(channels, channels, bias=qkv_bias)
+            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias)
+        else:
+            # One fused projection produces q, k and v.
+            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias)
+        if qk_rms_norm:
+            self.q_rms_norm = MultiHeadRMSNorm(head_dim, num_heads)
+            self.k_rms_norm = MultiHeadRMSNorm(head_dim, num_heads)
+        self.to_out = nn.Linear(channels, channels)
+        if use_rope:
+            self.rope = RotaryPositionEmbedder(channels)
+        # Off until initialize_positional_encoding() is called.
+        self.use_positional_encoding = False
+    def initialize_positional_encoding(self, num_external_sources: int = 2, enable_gate: bool = True, enable_k_bias: bool = False, k_bias_scale: float = 0.1):
+        """
+        Switch on the per-source key adjustments used by `scene_context_attn`.
+        Args:
+            num_external_sources: number of external sources to allocate parameters for.
+            enable_gate: create `ext_gate`, a zero-initialized [sources, heads] parameter.
+            enable_k_bias: create `k_type_bias`, a zero-initialized [sources, heads, head_dim] parameter.
+            k_bias_scale: factor stored as `ext_k_bias_scale`.
+        """
+        self.use_positional_encoding = True
+        # Record which of the optional mechanisms are active.
+        self.enable_ext_gate = bool(enable_gate)
+        self.enable_ext_k_bias = bool(enable_k_bias)
+        self.ext_k_bias_scale = float(k_bias_scale)
+        if self.enable_ext_gate:
+            # One gate value per (source, head).
+            gate_init = torch.zeros((num_external_sources, self.num_heads))
+            self.ext_gate = nn.Parameter(gate_init)
+        if self.enable_ext_k_bias:
+            # One additive key bias vector per (source, head).
+            bias_init = torch.zeros(num_external_sources, self.num_heads, self.head_dim)
+            self.k_type_bias = nn.Parameter(bias_init)
+    def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None, indices: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Apply self- or cross-attention to a dense [B, L, C] tensor.
+        Args:
+            x: input tokens; the source of the queries.
+            context: tokens providing keys and values; only read for cross-attention.
+            indices: positions handed to the rotary embedder in the self-attention path.
+        Returns:
+            [B, L, C] tensor after the output projection.
+        """
+        B, L, _ = x.shape
+        if self._type != "self":
+            # Cross-attention: queries from x, packed keys/values from the context.
+            ctx_len = context.shape[1]
+            q = self.to_q(x).reshape(B, L, self.num_heads, -1)
+            kv = self.to_kv(context).reshape(B, ctx_len, 2, self.num_heads, -1)
+            if not self.qk_rms_norm:
+                h = scaled_dot_product_attention(q, kv)
+            else:
+                q = self.q_rms_norm(q)
+                k, v = kv.unbind(dim=2)
+                k = self.k_rms_norm(k)
+                h = scaled_dot_product_attention(q, k, v)
+        else:
+            qkv = self.to_qkv(x).reshape(B, L, 3, self.num_heads, -1)
+            if self.use_rope:
+                # Rotate q and k, then repack so the packed call below still applies.
+                q, k, v = qkv.unbind(dim=2)
+                q, k = self.rope(q, k, indices)
+                qkv = torch.stack([q, k, v], dim=2)
+            if self.attn_mode == "full":
+                if not self.qk_rms_norm:
+                    h = scaled_dot_product_attention(qkv)
+                else:
+                    q, k, v = qkv.unbind(dim=2)
+                    q = self.q_rms_norm(q)
+                    k = self.k_rms_norm(k)
+                    h = scaled_dot_product_attention(q, k, v)
+            elif self.attn_mode == "windowed":
+                raise NotImplementedError("Windowed attention is not yet implemented")
+        merged = h.reshape(B, L, -1)
+        return self.to_out(merged)
+    def mi_attention(self, x: torch.Tensor, num_instances: int, indices: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Multi-instance self-attention.
+        The batch axis of `x` is read as (scene, instance) with `num_instances`
+        consecutive entries per scene. Queries stay per instance, while every
+        instance attends to the keys/values of all instances of its scene.
+        Args:
+            x: (B*N, L, C) input tokens.
+            num_instances: N, the number of instances per scene.
+            indices: optional positions for the rotary embedder.
+        Returns:
+            (B*N, L, C) tensor after the output projection.
+        """
+        total, L, _ = x.shape
+        # Fused projection, then split into q, k, v of shape (B*N, L, H, D).
+        q, k, v = self.to_qkv(x).reshape(total, L, 3, self.num_heads, -1).unbind(dim=2)
+        if self.use_rope:
+            q, k = self.rope(q, k, indices)
+        if self.qk_rms_norm:
+            q = self.q_rms_norm(q)
+            k = self.k_rms_norm(k)
+        # Concatenate the tokens of all instances of a scene: (B*N, L, H, D) -> (B, N*L, H, D).
+        k_scene = rearrange(k, '(b n) l h d -> b (n l) h d', n=num_instances)
+        v_scene = rearrange(v, '(b n) l h d -> b (n l) h d', n=num_instances)
+        # Give each instance of scene b its own copy of that scene's keys/values:
+        # (B, N*L, H, D) -> (B, N, N*L, H, D) -> (B*N, N*L, H, D).
+        k_all = k_scene.unsqueeze(1).expand(-1, num_instances, -1, -1, -1).flatten(0, 1)
+        v_all = v_scene.unsqueeze(1).expand(-1, num_instances, -1, -1, -1).flatten(0, 1)
+        # q: (B*N, L, H, D), k_all / v_all: (B*N, N*L, H, D) -> attended: (B*N, L, H, D)
+        attended = scaled_dot_product_attention(q, k_all, v_all)
+        return self.to_out(attended.reshape(total, L, -1))
+    def scene_context_attn(self, x: torch.Tensor, context: torch.Tensor, num_instances=3, indices: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Self-attention over groups of `num_instances` consecutive batch entries.
+        Slot 0 of each group attends to the keys/values of every slot in the
+        group; the keys of slots 1.. may first be biased and gated per source
+        when positional encoding is enabled. Slots 1.. attend only to their own,
+        unmodified keys/values.
+        Args:
+            x: (B, L, C) input tokens, with B a multiple of `num_instances`.
+            context: not read by this method.
+            num_instances: number of slots per group.
+            indices: optional positions for the rotary embedder.
+        Returns:
+            (B, L, C) tensor after the output projection.
+        """
+        B, L, _ = x.shape
+        # Fused projection, then rotary embedding / QK RMS norm as configured.
+        q, k, v = self.to_qkv(x).reshape(B, L, 3, self.num_heads, -1).unbind(dim=2)
+        if self.use_rope:
+            q, k = self.rope(q, k, indices)
+        if self.qk_rms_norm:
+            q = self.q_rms_norm(q)
+            k = self.k_rms_norm(k)
+        # Split the batch axis into groups: (bp, num_instances, L, H, C)
+        grouping = '(bp ni) L h c -> bp ni L h c'
+        qp = rearrange(q, grouping, ni=num_instances)
+        kp = rearrange(k, grouping, ni=num_instances)
+        vp = rearrange(v, grouping, ni=num_instances)
+        # Keys seen by slot 0: its own keys followed by the (possibly adjusted) keys of the other slots.
+        slot0_keys = [kp[:, 0]]
+        for slot in range(1, num_instances):
+            keys = kp[:, slot]  # (bp, L, H, C)
+            if self.use_positional_encoding:
+                # External sources are numbered from 0, slots from 1.
+                source = slot - 1
+                if getattr(self, 'enable_ext_k_bias', False):
+                    bias = torch.tanh(self.k_type_bias[source])[None, None, :, :].to(dtype=keys.dtype, device=keys.device)
+                    keys = keys + self.ext_k_bias_scale * bias
+                if getattr(self, 'enable_ext_gate', False):
+                    alpha = torch.sigmoid(self.ext_gate[source])[None, None, :, None].to(dtype=keys.dtype, device=keys.device)
+                    keys = keys * alpha
+            slot0_keys.append(keys)
+        k_full = torch.cat(slot0_keys, dim=1)  # (bp, num_instances * L, H, C)
+        v_full = torch.cat(list(vp.unbind(dim=1)), dim=1)
+        outputs = [scaled_dot_product_attention(qp[:, 0], k_full, v_full)]
+        # The remaining slots use plain self-attention on their own tokens.
+        for slot in range(1, num_instances):
+            outputs.append(scaled_dot_product_attention(qp[:, slot], kp[:, slot], vp[:, slot]))
+        # Stitch the slots back together: (bp, ni, L, H, C) -> (B, L, H, C) -> (B, L, C_all) -> linear proj
+        stacked = torch.stack(outputs, dim=1)
+        merged = rearrange(stacked, 'bp ni L h c -> (bp ni) L h c').reshape(B, L, -1)
+        return self.to_out(merged)
+    def self_attn_join_external(self, x: torch.Tensor, external_tokens: Union[torch.Tensor, List[torch.Tensor]], indices: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Self-attention where queries come from x, and keys/values are augmented
+        with one or more external token sequences. All projections (Q/K/V) use
+        this module's own projection weights to keep them in the same space.
+        Args:
+            x: (B, Lq, C) queries from the current stream
+            external_tokens: either a tensor (B, Lext, C) or a list of tensors
+                              each of shape (B, Lext_i, C)
+            indices: optional rotary indices
+        Returns:
+            (B, Lq, C) attended output
+        """
+        assert self._type == "self", "self_attn_join_external is only valid for self-attention"
+        if isinstance(external_tokens, torch.Tensor):
+            external_list: List[torch.Tensor] = [external_tokens]
+        else:
+            external_list = list(external_tokens)
+        B, Lq, C = x.shape
+        # Project Q/K/V for x
+        qkv = self.to_qkv(x).reshape(B, Lq, 3, self.num_heads, -1)
+        q, k, v = qkv.unbind(dim=2)
+        if self.use_rope:
+            q, k = self.rope(q, k, indices)
+        # Optional Q/K RMSNorm
+        if self.qk_rms_norm:
+            q = self.q_rms_norm(q)
+            k = self.k_rms_norm(k)
+        # Project only K/V for external tokens using the SAME to_qkv weights
+        k_ext_list: List[torch.Tensor] = []
+        v_ext_list: List[torch.Tensor] = []
+        for i, ext in enumerate(external_list):
+            assert ext.dim() == 3, f"external token must be 3D (B, L, C), got {ext.shape}"
+            assert ext.shape[0] == B, f"Batch size mismatch: ext B={ext.shape[0]} vs x B={B}"
+            # Do not alter raw external token content; avoid adding source/type embedding to ext tokens
+            ext_qkv = self.to_qkv(ext).reshape(ext.shape[0], ext.shape[1], 3, self.num_heads, -1)
+            _, k_ext, v_ext = ext_qkv.unbind(dim=2)
+            if self.use_rope:
+                # apply RoPE to external K; use K as both inputs to get rotated K
+                _, k_ext = self.rope(k_ext, k_ext, indices)
+            if self.qk_rms_norm:
+                k_ext = self.k_rms_norm(k_ext)
+            if self.use_positional_encoding:
+                # Optional per-head K type bias (vector) applied after RoPE/RMSNorm
+                if getattr(self, 'enable_ext_k_bias', False):
+                    bias_vec = torch.tanh(self.k_type_bias[i])[None, None, :, :].to(k_ext.dtype)
+                    k_ext = k_ext + self.ext_k_bias_scale * bias_vec
+                # Optional per-head gate to modulate influence of external keys only (values unchanged)
+                if getattr(self, 'enable_ext_gate', False):
+                    alpha = torch.sigmoid(self.ext_gate[i])[None, None, :, None].to(k_ext.dtype)
+                    k_ext = k_ext * alpha
+            k_ext_list.append(k_ext)
+            v_ext_list.append(v_ext)
+        # Concatenate K/V along sequence dimension
+        if len(k_ext_list) > 0:
+            k_cat = torch.cat([k] + k_ext_list, dim=1)
+            v_cat = torch.cat([v] + v_ext_list, dim=1)
+        else:
+            k_cat, v_cat = k, v
+        # Attention and output
+        h = scaled_dot_product_attention(q, k_cat, v_cat)
+        h = h.reshape(B, Lq, -1)
+        h = self.to_out(h)
+        return h

iscene/trellis/modules/attention_resample.py ADDED Viewed

	@@ -0,0 +1,77 @@

+from __future__ import annotations
+from typing import Optional
+import torch
+import torch.nn as nn
+try:
+    import flash_attn
+except ImportError:  # pragma: no cover - flash-attn is optional
+    flash_attn = None
+__all__ = ["AttentionResample"]
+class AttentionResample(nn.Module):
+    """Resample a variable-length token sequence to a fixed target length."""
+    def __init__(
+        self,
+        d_model: int = 1024,
+        n_target: int = 4096,
+        *,
+        n_heads: int = 16,
+        use_flash: bool = True,
+    ) -> None:
+        super().__init__()
+        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
+        self.d_model = d_model
+        self.n_target = n_target
+        self.n_heads = n_heads
+        self.head_dim = d_model // n_heads
+        self.scale = self.head_dim ** -0.5
+        self.latent = nn.Parameter(torch.randn(n_target, d_model))
+        self.to_kv = nn.Linear(d_model, 2 * d_model, bias=False)
+        self._flash_available = use_flash and flash_attn is not None
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return a tensor with shape (B, n_target, d_model)."""
+        batch_size, _, dim = x.shape
+        assert dim == self.d_model, f"Expected input dim {self.d_model}, got {dim}"
+        q = self.latent.unsqueeze(0).expand(batch_size, -1, -1)
+        k, v = self.to_kv(x).chunk(2, dim=-1)
+        if self._flash_available:
+            return self._forward_flash(q, k, v)
+        return self._forward_torch(q, k, v)
+    def _forward_torch(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+        batch_size = q.size(0)
+        q = q.view(batch_size, self.n_target, self.n_heads, self.head_dim).transpose(1, 2)
+        k = k.view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
+        v = v.view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
+        attn = torch.matmul(q, k.transpose(-2, -1)) * self.scale
+        weights = torch.softmax(attn, dim=-1, dtype=attn.dtype)
+        out = torch.matmul(weights, v)
+        return out.transpose(1, 2).contiguous().view(batch_size, self.n_target, self.d_model)
+    def _forward_flash(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+        batch_size = q.size(0)
+        q = q.view(batch_size, self.n_target, self.n_heads, self.head_dim).contiguous()
+        k = k.view(batch_size, -1, self.n_heads, self.head_dim).contiguous()
+        v = v.view(batch_size, -1, self.n_heads, self.head_dim).contiguous()
+        assert flash_attn is not None
+        out = flash_attn.flash_attn_func(
+            q,
+            k,
+            v,  # type: ignore[arg-type]
+            causal=False,
+            softmax_scale=self.scale,
+        )
+        return out.reshape(batch_size, self.n_target, self.d_model)

iscene/trellis/modules/norm.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import torch
+import torch.nn as nn
+class LayerNorm32(nn.LayerNorm):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return super().forward(x.float()).type(x.dtype)
+class GroupNorm32(nn.GroupNorm):
+    """
+    A GroupNorm layer that converts to float32 before the forward pass.
+    """
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return super().forward(x.float()).type(x.dtype)
+class ChannelLayerNorm32(LayerNorm32):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        DIM = x.dim()
+        x = x.permute(0, *range(2, DIM), 1).contiguous()
+        x = super().forward(x)
+        x = x.permute(0, DIM-1, *range(1, DIM-1)).contiguous()
+        return x

iscene/trellis/modules/sparse/__init__.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from typing import *
+# Sparse backend name; __from_env() accepts 'spconv' or 'torchsparse' from SPARSE_BACKEND.
+BACKEND = 'spconv'
+# Debug switch; __from_env() sets it to True when SPARSE_DEBUG == '1'.
+DEBUG = False
+# Attention library name; __from_env() accepts 'xformers' or 'flash_attn'.
+ATTN = 'flash_attn'
+def __from_env():
+    import os
+    global BACKEND
+    global DEBUG
+    global ATTN
+    env_sparse_backend = os.environ.get('SPARSE_BACKEND')
+    env_sparse_debug = os.environ.get('SPARSE_DEBUG')
+    env_sparse_attn = os.environ.get('SPARSE_ATTN_BACKEND')
+    if env_sparse_attn is None:
+        env_sparse_attn = os.environ.get('ATTN_BACKEND')
+    if env_sparse_backend is not None and env_sparse_backend in ['spconv', 'torchsparse']:
+        BACKEND = env_sparse_backend
+    if env_sparse_debug is not None:
+        DEBUG = env_sparse_debug == '1'
+    if env_sparse_attn is not None and env_sparse_attn in ['xformers', 'flash_attn']:
+        ATTN = env_sparse_attn
+    print(f"[SPARSE] Backend: {BACKEND}, Attention: {ATTN}")
+__from_env()
+def set_backend(backend: Literal['spconv', 'torchsparse']):
+    """Set the module-level BACKEND value; the argument is stored without validation."""
+    global BACKEND
+    BACKEND = backend
+def set_debug(debug: bool):
+    """Set the module-level DEBUG flag."""
+    global DEBUG
+    DEBUG = debug
+def set_attn(attn: Literal['xformers', 'flash_attn']):
+    """Set the module-level ATTN value; the argument is stored without validation."""
+    global ATTN
+    ATTN = attn
+import importlib
+# Public name -> submodule that defines it; resolved on first access by __getattr__.
+__attributes = {
+    'SparseTensor': 'basic',
+    'sparse_batch_broadcast': 'basic',
+    'sparse_batch_op': 'basic',
+    'sparse_cat': 'basic',
+    'sparse_unbind': 'basic',
+    'SparseGroupNorm': 'norm',
+    'SparseLayerNorm': 'norm',
+    'SparseGroupNorm32': 'norm',
+    'SparseLayerNorm32': 'norm',
+    'SparseReLU': 'nonlinearity',
+    'SparseSiLU': 'nonlinearity',
+    'SparseGELU': 'nonlinearity',
+    'SparseActivation': 'nonlinearity',
+    'SparseLinear': 'linear',
+    'sparse_scaled_dot_product_attention': 'attention',
+    'SerializeMode': 'attention',
+    'sparse_serialized_scaled_dot_product_self_attention': 'attention',
+    'sparse_windowed_scaled_dot_product_self_attention': 'attention',
+    'SparseMultiHeadAttention': 'attention',
+    'SparseConv3d': 'conv',
+    'SparseInverseConv3d': 'conv',
+    'SparseDownsample': 'spatial',
+    'SparseUpsample': 'spatial',
+    'SparseSubdivide' : 'spatial'
+}
+# Submodules that are exposed as attributes of this package (also imported on first access).
+__submodules = ['transformer']
+__all__ = list(__attributes.keys()) + __submodules
+def __getattr__(name):
+    if name not in globals():
+        if name in __attributes:
+            module_name = __attributes[name]
+            module = importlib.import_module(f".{module_name}", __name__)
+            globals()[name] = getattr(module, name)
+        elif name in __submodules:
+            module = importlib.import_module(f".{name}", __name__)
+            globals()[name] = module
+        else:
+            raise AttributeError(f"module {__name__} has no attribute {name}")
+    return globals()[name]
+# For Pylance
+if __name__ == '__main__':
+    from .basic import *
+    from .norm import *
+    from .nonlinearity import *
+    from .linear import *
+    from .attention import *
+    from .conv import *
+    from .spatial import *
+    import transformer

iscene/trellis/modules/sparse/attention/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .full_attn import *
+from .serialized_attn import *
+from .windowed_attn import *
+from .modules import *

iscene/trellis/modules/sparse/attention/full_attn.py ADDED Viewed

	@@ -0,0 +1,215 @@

+from typing import *
+import torch
+from .. import SparseTensor
+from .. import DEBUG, ATTN
+if ATTN == 'xformers':
+    import xformers.ops as xops
+elif ATTN == 'flash_attn':
+    import flash_attn
+else:
+    raise ValueError(f"Unknown attention module: {ATTN}")
+__all__ = [
+    'sparse_scaled_dot_product_attention',
+]
+@overload
+def sparse_scaled_dot_product_attention(qkv: SparseTensor) -> SparseTensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        qkv (SparseTensor): A [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: SparseTensor, kv: Union[SparseTensor, torch.Tensor]) -> SparseTensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        q (SparseTensor): A [N, *, H, C] sparse tensor containing Qs.
+        kv (SparseTensor or torch.Tensor): A [N, *, 2, H, C] sparse tensor or a [N, L, 2, H, C] dense tensor containing Ks and Vs.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: torch.Tensor, kv: SparseTensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention with dense queries and sparse keys/values.
+    Args:
+        q (torch.Tensor): A [N, L, H, C] dense tensor containing Qs.
+        kv (SparseTensor): A [N, *, 2, H, C] sparse tensor containing Ks and Vs.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: SparseTensor, k: SparseTensor, v: SparseTensor) -> SparseTensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        q (SparseTensor): A [N, *, H, Ci] sparse tensor containing Qs.
+        k (SparseTensor): A [N, *, H, Ci] sparse tensor containing Ks.
+        v (SparseTensor): A [N, *, H, Co] sparse tensor containing Vs.
+    Note:
+        k and v are assumed to have the same coordinate map.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: SparseTensor, k: torch.Tensor, v: torch.Tensor) -> SparseTensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        q (SparseTensor): A [N, *, H, Ci] sparse tensor containing Qs.
+        k (torch.Tensor): A [N, L, H, Ci] dense tensor containing Ks.
+        v (torch.Tensor): A [N, L, H, Co] dense tensor containing Vs.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: torch.Tensor, k: SparseTensor, v: SparseTensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention with dense queries and sparse keys/values.
+    Args:
+        q (torch.Tensor): A [N, L, H, Ci] dense tensor containing Qs.
+        k (SparseTensor): A [N, *, H, Ci] sparse tensor containing Ks.
+        v (SparseTensor): A [N, *, H, Co] sparse tensor containing Vs.
+    """
+    ...
+def sparse_scaled_dot_product_attention(*args, **kwargs):
+    arg_names_dict = {
+        1: ['qkv'],
+        2: ['q', 'kv'],
+        3: ['q', 'k', 'v']
+    }
+    num_all_args = len(args) + len(kwargs)
+    assert num_all_args in arg_names_dict, f"Invalid number of arguments, got {num_all_args}, expected 1, 2, or 3"
+    for key in arg_names_dict[num_all_args][len(args):]:
+        assert key in kwargs, f"Missing argument {key}"
+    if num_all_args == 1:
+        qkv = args[0] if len(args) > 0 else kwargs['qkv']
+        assert isinstance(qkv, SparseTensor), f"qkv must be a SparseTensor, got {type(qkv)}"
+        assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"
+        device = qkv.device
+        s = qkv
+        q_seqlen = [qkv.layout[i].stop - qkv.layout[i].start for i in range(qkv.shape[0])]
+        kv_seqlen = q_seqlen
+        qkv = qkv.feats     # [T, 3, H, C]
+    elif num_all_args == 2:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        kv = args[1] if len(args) > 1 else kwargs['kv']
+        assert isinstance(q, SparseTensor) and isinstance(kv, (SparseTensor, torch.Tensor)) or \
+               isinstance(q, torch.Tensor) and isinstance(kv, SparseTensor), \
+               f"Invalid types, got {type(q)} and {type(kv)}"
+        assert q.shape[0] == kv.shape[0], f"Batch size mismatch, got {q.shape[0]} and {kv.shape[0]}"
+        device = q.device
+        if isinstance(q, SparseTensor):
+            assert len(q.shape) == 3, f"Invalid shape for q, got {q.shape}, expected [N, *, H, C]"
+            s = q
+            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
+            q = q.feats     # [T_Q, H, C]
+        else:
+            assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, C]"
+            s = None
+            N, L, H, C = q.shape
+            q_seqlen = [L] * N
+            q = q.reshape(N * L, H, C)   # [T_Q, H, C]
+        if isinstance(kv, SparseTensor):
+            assert len(kv.shape) == 4 and kv.shape[1] == 2, f"Invalid shape for kv, got {kv.shape}, expected [N, *, 2, H, C]"
+            kv_seqlen = [kv.layout[i].stop - kv.layout[i].start for i in range(kv.shape[0])]
+            kv = kv.feats     # [T_KV, 2, H, C]
+        else:
+            assert len(kv.shape) == 5, f"Invalid shape for kv, got {kv.shape}, expected [N, L, 2, H, C]"
+            N, L, _, H, C = kv.shape
+            kv_seqlen = [L] * N
+            kv = kv.reshape(N * L, 2, H, C)   # [T_KV, 2, H, C]
+    elif num_all_args == 3:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        k = args[1] if len(args) > 1 else kwargs['k']
+        v = args[2] if len(args) > 2 else kwargs['v']
+        assert isinstance(q, SparseTensor) and isinstance(k, (SparseTensor, torch.Tensor)) and type(k) == type(v) or \
+               isinstance(q, torch.Tensor) and isinstance(k, SparseTensor) and isinstance(v, SparseTensor), \
+               f"Invalid types, got {type(q)}, {type(k)}, and {type(v)}"
+        assert q.shape[0] == k.shape[0] == v.shape[0], f"Batch size mismatch, got {q.shape[0]}, {k.shape[0]}, and {v.shape[0]}"
+        device = q.device
+        if isinstance(q, SparseTensor):
+            assert len(q.shape) == 3, f"Invalid shape for q, got {q.shape}, expected [N, *, H, Ci]"
+            s = q
+            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
+            q = q.feats     # [T_Q, H, Ci]
+        else:
+            assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, Ci]"
+            s = None
+            N, L, H, CI = q.shape
+            q_seqlen = [L] * N
+            q = q.reshape(N * L, H, CI)  # [T_Q, H, Ci]
+        if isinstance(k, SparseTensor):
+            assert len(k.shape) == 3, f"Invalid shape for k, got {k.shape}, expected [N, *, H, Ci]"
+            assert len(v.shape) == 3, f"Invalid shape for v, got {v.shape}, expected [N, *, H, Co]"
+            kv_seqlen = [k.layout[i].stop - k.layout[i].start for i in range(k.shape[0])]
+            k = k.feats     # [T_KV, H, Ci]
+            v = v.feats     # [T_KV, H, Co]
+        else:
+            assert len(k.shape) == 4, f"Invalid shape for k, got {k.shape}, expected [N, L, H, Ci]"
+            assert len(v.shape) == 4, f"Invalid shape for v, got {v.shape}, expected [N, L, H, Co]"
+            N, L, H, CI, CO = *k.shape, v.shape[-1]
+            kv_seqlen = [L] * N
+            k = k.reshape(N * L, H, CI)     # [T_KV, H, Ci]
+            v = v.reshape(N * L, H, CO)     # [T_KV, H, Co]
+    if DEBUG:
+        if s is not None:
+            for i in range(s.shape[0]):
+                assert (s.coords[s.layout[i]] == i).all(), f"SparseScaledDotProductSelfAttention: batch index mismatch"
+        if num_all_args in [2, 3]:
+            assert q.shape[:2] == [1, sum(q_seqlen)], f"SparseScaledDotProductSelfAttention: q shape mismatch"
+        if num_all_args == 3:
+            assert k.shape[:2] == [1, sum(kv_seqlen)], f"SparseScaledDotProductSelfAttention: k shape mismatch"
+            assert v.shape[:2] == [1, sum(kv_seqlen)], f"SparseScaledDotProductSelfAttention: v shape mismatch"
+    if ATTN == 'xformers':
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=1)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=1)
+        q = q.unsqueeze(0)
+        k = k.unsqueeze(0)
+        v = v.unsqueeze(0)
+        mask = xops.fmha.BlockDiagonalMask.from_seqlens(q_seqlen, kv_seqlen)
+        out = xops.memory_efficient_attention(q, k, v, mask)[0]
+    elif ATTN == 'flash_attn':
+        cu_seqlens_q = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(q_seqlen), dim=0)]).int().to(device)
+        if num_all_args in [2, 3]:
+            cu_seqlens_kv = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(kv_seqlen), dim=0)]).int().to(device)
+        if num_all_args == 1:
+            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens_q, max(q_seqlen))
+        elif num_all_args == 2:
+            out = flash_attn.flash_attn_varlen_kvpacked_func(q, kv, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
+        elif num_all_args == 3:
+            out = flash_attn.flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
+    else:
+        raise ValueError(f"Unknown attention module: {ATTN}")
+    if s is not None:
+        return s.replace(out)
+    else:
+        return out.reshape(N, L, H, -1)

iscene/trellis/modules/sparse/attention/modules.py ADDED Viewed

	@@ -0,0 +1,139 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .. import SparseTensor
+from .full_attn import sparse_scaled_dot_product_attention
+from .serialized_attn import SerializeMode, sparse_serialized_scaled_dot_product_self_attention
+from .windowed_attn import sparse_windowed_scaled_dot_product_self_attention
+from ...attention import RotaryPositionEmbedder
+class SparseMultiHeadRMSNorm(nn.Module):
+    def __init__(self, dim: int, heads: int):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.gamma = nn.Parameter(torch.ones(heads, dim))
+    def forward(self, x: Union[SparseTensor, torch.Tensor]) -> Union[SparseTensor, torch.Tensor]:
+        x_type = x.dtype
+        x = x.float()
+        if isinstance(x, SparseTensor):
+            x = x.replace(F.normalize(x.feats, dim=-1))
+        else:
+            x = F.normalize(x, dim=-1)
+        return (x * self.gamma * self.scale).to(x_type)
+class SparseMultiHeadAttention(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        ctx_channels: Optional[int] = None,
+        type: Literal["self", "cross"] = "self",
+        attn_mode: Literal["full", "serialized", "windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_sequence: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        serialize_mode: Optional[SerializeMode] = None,
+        qkv_bias: bool = True,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__()
+        assert channels % num_heads == 0
+        assert type in ["self", "cross"], f"Invalid attention type: {type}"
+        assert attn_mode in ["full", "serialized", "windowed"], f"Invalid attention mode: {attn_mode}"
+        assert type == "self" or attn_mode == "full", "Cross-attention only supports full attention"
+        assert type == "self" or use_rope is False, "Rotary position embeddings only supported for self-attention"
+        self.channels = channels
+        self.ctx_channels = ctx_channels if ctx_channels is not None else channels
+        self.num_heads = num_heads
+        self._type = type
+        self.attn_mode = attn_mode
+        self.window_size = window_size
+        self.shift_sequence = shift_sequence
+        self.shift_window = shift_window
+        self.serialize_mode = serialize_mode
+        self.use_rope = use_rope
+        self.qk_rms_norm = qk_rms_norm
+        if self._type == "self":
+            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias)
+        else:
+            self.to_q = nn.Linear(channels, channels, bias=qkv_bias)
+            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias)
+        if self.qk_rms_norm:
+            self.q_rms_norm = SparseMultiHeadRMSNorm(channels // num_heads, num_heads)
+            self.k_rms_norm = SparseMultiHeadRMSNorm(channels // num_heads, num_heads)
+        self.to_out = nn.Linear(channels, channels)
+        if use_rope:
+            self.rope = RotaryPositionEmbedder(channels)
+    @staticmethod
+    def _linear(module: nn.Linear, x: Union[SparseTensor, torch.Tensor]) -> Union[SparseTensor, torch.Tensor]:
+        if isinstance(x, SparseTensor):
+            return x.replace(module(x.feats))
+        else:
+            return module(x)
+    @staticmethod
+    def _reshape_chs(x: Union[SparseTensor, torch.Tensor], shape: Tuple[int, ...]) -> Union[SparseTensor, torch.Tensor]:
+        if isinstance(x, SparseTensor):
+            return x.reshape(*shape)
+        else:
+            return x.reshape(*x.shape[:2], *shape)
+    def _fused_pre(self, x: Union[SparseTensor, torch.Tensor], num_fused: int) -> Union[SparseTensor, torch.Tensor]:
+        if isinstance(x, SparseTensor):
+            x_feats = x.feats.unsqueeze(0)
+        else:
+            x_feats = x
+        x_feats = x_feats.reshape(*x_feats.shape[:2], num_fused, self.num_heads, -1)
+        return x.replace(x_feats.squeeze(0)) if isinstance(x, SparseTensor) else x_feats
+    def _rope(self, qkv: SparseTensor) -> SparseTensor:
+        q, k, v = qkv.feats.unbind(dim=1)   # [T, H, C]
+        q, k = self.rope(q, k, qkv.coords[:, 1:])
+        qkv = qkv.replace(torch.stack([q, k, v], dim=1))
+        return qkv
+    def forward(self, x: Union[SparseTensor, torch.Tensor], context: Optional[Union[SparseTensor, torch.Tensor]] = None) -> Union[SparseTensor, torch.Tensor]:
+        if self._type == "self":
+            qkv = self._linear(self.to_qkv, x)
+            qkv = self._fused_pre(qkv, num_fused=3)
+            if self.use_rope:
+                qkv = self._rope(qkv)
+            if self.qk_rms_norm:
+                q, k, v = qkv.unbind(dim=1)
+                q = self.q_rms_norm(q)
+                k = self.k_rms_norm(k)
+                qkv = qkv.replace(torch.stack([q.feats, k.feats, v.feats], dim=1))
+            if self.attn_mode == "full":
+                h = sparse_scaled_dot_product_attention(qkv)
+            elif self.attn_mode == "serialized":
+                h = sparse_serialized_scaled_dot_product_self_attention(
+                    qkv, self.window_size, serialize_mode=self.serialize_mode, shift_sequence=self.shift_sequence, shift_window=self.shift_window
+                )
+            elif self.attn_mode == "windowed":
+                h = sparse_windowed_scaled_dot_product_self_attention(
+                    qkv, self.window_size, shift_window=self.shift_window
+                )
+        else:
+            q = self._linear(self.to_q, x)
+            q = self._reshape_chs(q, (self.num_heads, -1))
+            kv = self._linear(self.to_kv, context)
+            kv = self._fused_pre(kv, num_fused=2)
+            if self.qk_rms_norm:
+                q = self.q_rms_norm(q)
+                k, v = kv.unbind(dim=1)
+                k = self.k_rms_norm(k)
+                kv = kv.replace(torch.stack([k.feats, v.feats], dim=1))
+            h = sparse_scaled_dot_product_attention(q, kv)
+        h = self._reshape_chs(h, (-1,))
+        h = self._linear(self.to_out, h)
+        return h

iscene/trellis/modules/sparse/attention/serialized_attn.py ADDED Viewed

	@@ -0,0 +1,193 @@

+from typing import *
+from enum import Enum
+import torch
+import math
+from .. import SparseTensor
+from .. import DEBUG, ATTN
+if ATTN == 'xformers':
+    import xformers.ops as xops
+elif ATTN == 'flash_attn':
+    import flash_attn
+else:
+    raise ValueError(f"Unknown attention module: {ATTN}")
+__all__ = [
+    'sparse_serialized_scaled_dot_product_self_attention',
+]
+class SerializeMode(Enum):
+    Z_ORDER = 0
+    Z_ORDER_TRANSPOSED = 1
+    HILBERT = 2
+    HILBERT_TRANSPOSED = 3
+SerializeModes = [
+    SerializeMode.Z_ORDER,
+    SerializeMode.Z_ORDER_TRANSPOSED,
+    SerializeMode.HILBERT,
+    SerializeMode.HILBERT_TRANSPOSED
+]
+def calc_serialization(
+    tensor: SparseTensor,
+    window_size: int,
+    serialize_mode: SerializeMode = SerializeMode.Z_ORDER,
+    shift_sequence: int = 0,
+    shift_window: Tuple[int, int, int] = (0, 0, 0)
+) -> Tuple[torch.Tensor, torch.Tensor, List[int], List[int]]:
+    """
+    Calculate serialization and partitioning for a set of coordinates.
+    Args:
+        tensor (SparseTensor): The input tensor.
+        window_size (int): The window size to use.
+        serialize_mode (SerializeMode): The serialization mode to use.
+        shift_sequence (int): The shift of serialized sequence.
+        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.
+    Returns:
+        (torch.Tensor, torch.Tensor, List[int], List[int]):
+            Forwards indices (gather positions into the original features),
+            backwards indices (position of each original point in the gathered
+            sequence), the length of every produced sequence, and the batch
+            index each sequence belongs to.
+    Raises:
+        ValueError: if ``serialize_mode`` is not a known SerializeMode.
+    """
+    fwd_indices = []
+    bwd_indices = []
+    seq_lens = []
+    seq_batch_indices = []
+    # running start offset of each batch element inside the gathered sequence
+    offsets = [0]
+    # vox2seq is imported here rather than at module import time
+    if 'vox2seq' not in globals():
+        import vox2seq
+    # Serialize the input
+    # coords[:, 0] is skipped; the remaining three columns are shifted by shift_window
+    serialize_coords = tensor.coords[:, 1:].clone()
+    serialize_coords += torch.tensor(shift_window, dtype=torch.int32, device=tensor.device).reshape(1, 3)
+    if serialize_mode == SerializeMode.Z_ORDER:
+        code = vox2seq.encode(serialize_coords, mode='z_order', permute=[0, 1, 2])
+    elif serialize_mode == SerializeMode.Z_ORDER_TRANSPOSED:
+        code = vox2seq.encode(serialize_coords, mode='z_order', permute=[1, 0, 2])
+    elif serialize_mode == SerializeMode.HILBERT:
+        code = vox2seq.encode(serialize_coords, mode='hilbert', permute=[0, 1, 2])
+    elif serialize_mode == SerializeMode.HILBERT_TRANSPOSED:
+        code = vox2seq.encode(serialize_coords, mode='hilbert', permute=[1, 0, 2])
+    else:
+        raise ValueError(f"Unknown serialize mode: {serialize_mode}")
+    for bi, s in enumerate(tensor.layout):
+        num_points = s.stop - s.start
+        # ceil(num_points / window_size)
+        num_windows = (num_points + window_size - 1) // window_size
+        # NOTE(review): a batch element with zero points gives num_windows == 0 and
+        # a ZeroDivisionError on the next line — confirm callers never pass one.
+        valid_window_size = num_points / num_windows
+        # ordering of this batch element's points by serialization code
+        to_ordered = torch.argsort(code[s.start:s.stop])
+        if num_windows == 1:
+            # a single sequence containing every point of this batch element
+            fwd_indices.append(to_ordered)
+            # inverse permutation of to_ordered
+            bwd_indices.append(torch.zeros_like(to_ordered).scatter_(0, to_ordered, torch.arange(num_points, device=tensor.device)))
+            fwd_indices[-1] += s.start
+            bwd_indices[-1] += offsets[-1]
+            seq_lens.append(num_points)
+            seq_batch_indices.append(bi)
+            offsets.append(offsets[-1] + seq_lens[-1])
+        else:
+            # Partition the input
+            # Every window holds exactly window_size entries taken around its
+            # midpoint (indices wrap modulo num_points); only the [split[i], split[i+1])
+            # part of window i is referenced by the backwards indices.
+            offset = 0
+            mids = [(i + 0.5) * valid_window_size + shift_sequence for i in range(num_windows)]
+            split = [math.floor(i * valid_window_size + shift_sequence) for i in range(num_windows + 1)]
+            bwd_index = torch.zeros((num_points,), dtype=torch.int64, device=tensor.device)
+            for i in range(num_windows):
+                mid = mids[i]
+                valid_start = split[i]
+                valid_end = split[i + 1]
+                padded_start = math.floor(mid - 0.5 * window_size)
+                padded_end = padded_start + window_size
+                fwd_indices.append(to_ordered[torch.arange(padded_start, padded_end, device=tensor.device) % num_points])
+                # advance to where the valid part starts inside this window
+                offset += valid_start - padded_start
+                bwd_index.scatter_(0, fwd_indices[-1][valid_start-padded_start:valid_end-padded_start], torch.arange(offset, offset + valid_end - valid_start, device=tensor.device))
+                # advance to the start of the next window
+                offset += padded_end - valid_start
+                fwd_indices[-1] += s.start
+            seq_lens.extend([window_size] * num_windows)
+            seq_batch_indices.extend([bi] * num_windows)
+            bwd_indices.append(bwd_index + offsets[-1])
+            offsets.append(offsets[-1] + num_windows * window_size)
+    fwd_indices = torch.cat(fwd_indices)
+    bwd_indices = torch.cat(bwd_indices)
+    return fwd_indices, bwd_indices, seq_lens, seq_batch_indices
+def sparse_serialized_scaled_dot_product_self_attention(
+    qkv: SparseTensor,
+    window_size: int,
+    serialize_mode: SerializeMode = SerializeMode.Z_ORDER,
+    shift_sequence: int = 0,
+    shift_window: Tuple[int, int, int] = (0, 0, 0)
+) -> SparseTensor:
+    """
+    Apply serialized scaled dot product self attention to a sparse tensor.
+    Args:
+        qkv (SparseTensor): [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
+        window_size (int): The window size to use.
+        serialize_mode (SerializeMode): The serialization mode to use.
+        shift_sequence (int): The shift of serialized sequence.
+        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.
+        shift (int): The shift to use.
+    """
+    assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"
+    serialization_spatial_cache_name = f'serialization_{serialize_mode}_{window_size}_{shift_sequence}_{shift_window}'
+    serialization_spatial_cache = qkv.get_spatial_cache(serialization_spatial_cache_name)
+    if serialization_spatial_cache is None:
+        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = calc_serialization(qkv, window_size, serialize_mode, shift_sequence, shift_window)
+        qkv.register_spatial_cache(serialization_spatial_cache_name, (fwd_indices, bwd_indices, seq_lens, seq_batch_indices))
+    else:
+        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = serialization_spatial_cache
+    M = fwd_indices.shape[0]
+    T = qkv.feats.shape[0]
+    H = qkv.feats.shape[2]
+    C = qkv.feats.shape[3]
+    qkv_feats = qkv.feats[fwd_indices]      # [M, 3, H, C]
+    if DEBUG:
+        start = 0
+        qkv_coords = qkv.coords[fwd_indices]
+        for i in range(len(seq_lens)):
+            assert (qkv_coords[start:start+seq_lens[i], 0] == seq_batch_indices[i]).all(), f"SparseWindowedScaledDotProductSelfAttention: batch index mismatch"
+            start += seq_lens[i]
+    if all([seq_len == window_size for seq_len in seq_lens]):
+        B = len(seq_lens)
+        N = window_size
+        qkv_feats = qkv_feats.reshape(B, N, 3, H, C)
+        if ATTN == 'xformers':
+            q, k, v = qkv_feats.unbind(dim=2)                       # [B, N, H, C]
+            out = xops.memory_efficient_attention(q, k, v)          # [B, N, H, C]
+        elif ATTN == 'flash_attn':
+            out = flash_attn.flash_attn_qkvpacked_func(qkv_feats)   # [B, N, H, C]
+        else:
+            raise ValueError(f"Unknown attention module: {ATTN}")
+        out = out.reshape(B * N, H, C)                              # [M, H, C]
+    else:
+        if ATTN == 'xformers':
+            q, k, v = qkv_feats.unbind(dim=1)                       # [M, H, C]
+            q = q.unsqueeze(0)                                      # [1, M, H, C]
+            k = k.unsqueeze(0)                                      # [1, M, H, C]
+            v = v.unsqueeze(0)                                      # [1, M, H, C]
+            mask = xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
+            out = xops.memory_efficient_attention(q, k, v, mask)[0] # [M, H, C]
+        elif ATTN == 'flash_attn':
+            cu_seqlens = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)], dim=0) \
+                        .to(qkv.device).int()
+            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv_feats, cu_seqlens, max(seq_lens)) # [M, H, C]
+    out = out[bwd_indices]      # [T, H, C]
+    if DEBUG:
+        qkv_coords = qkv_coords[bwd_indices]
+        assert torch.equal(qkv_coords, qkv.coords), "SparseWindowedScaledDotProductSelfAttention: coordinate mismatch"
+    return qkv.replace(out)

iscene/trellis/modules/sparse/attention/windowed_attn.py ADDED Viewed

	@@ -0,0 +1,150 @@

+from typing import *
+import torch
+import math
+from .. import SparseTensor
+from .. import DEBUG, ATTN
+# Import only the attention backend selected by ATTN; any other value is
+# rejected here, at import time.
+if ATTN == 'xformers':
+    import xformers.ops as xops
+elif ATTN == 'flash_attn':
+    import flash_attn
+else:
+    raise ValueError(f"Unknown attention module: {ATTN}")
+# Public API of this module.
+__all__ = [
+    'sparse_windowed_scaled_dot_product_self_attention',
+]
+def _lexsort_columns(columns: List[torch.Tensor]) -> torch.Tensor:
+    if not columns:
+        raise ValueError("columns must be non-empty")
+    if columns[0].numel() == 0:
+        return torch.empty(0, dtype=torch.long, device=columns[0].device)
+    cols64 = [col.to(torch.int64) for col in columns]
+    max_vals = [int(col.max().item()) + 1 for col in cols64]
+    key = cols64[0]
+    for col, max_val in zip(cols64[1:], max_vals[1:]):
+        key = key * max_val + col
+    return torch.argsort(key)
+def calc_window_partition(
+    tensor: SparseTensor,
+    window_size: Union[int, Tuple[int, ...]],
+    shift_window: Union[int, Tuple[int, ...]] = 0
+) -> Tuple[torch.Tensor, torch.Tensor, List[int], List[int]]:
+    """
+    Calculate serialization and partitioning for a set of coordinates.
+    Args:
+        tensor (SparseTensor): The input tensor.
+        window_size (int): The window size to use.
+        shift_window (Tuple[int, ...]): The shift of serialized coordinates.
+    Returns:
+        (torch.Tensor): Forwards indices.
+        (torch.Tensor): Backwards indices.
+        (List[int]): Sequence lengths.
+        (List[int]): Sequence batch indices.
+    """
+    DIM = tensor.coords.shape[1] - 1
+    shift_window = (shift_window,) * DIM if isinstance(shift_window, int) else shift_window
+    window_size = (window_size,) * DIM if isinstance(window_size, int) else window_size
+    shifted_coords = tensor.coords.clone().detach()
+    shifted_coords[:, 1:] += torch.tensor(shift_window, device=tensor.device, dtype=torch.int32).unsqueeze(0)
+    fine_coords = shifted_coords[:, 1:].clone()
+    MAX_COORDS = shifted_coords[:, 1:].max(dim=0).values.tolist()
+    NUM_WINDOWS = [math.ceil((mc + 1) / ws) for mc, ws in zip(MAX_COORDS, window_size)]
+    OFFSET = torch.cumprod(torch.tensor([1] + NUM_WINDOWS[::-1]), dim=0).tolist()[::-1]
+    shifted_coords[:, 1:] //= torch.tensor(window_size, device=tensor.device, dtype=torch.int32).unsqueeze(0)
+    shifted_indices = (shifted_coords * torch.tensor(OFFSET, device=tensor.device, dtype=torch.int32).unsqueeze(0)).sum(dim=1)
+    fwd_indices = _lexsort_columns([shifted_indices, fine_coords[:, 0], fine_coords[:, 1], fine_coords[:, 2]])
+    bwd_indices = torch.empty_like(fwd_indices)
+    bwd_indices[fwd_indices] = torch.arange(fwd_indices.shape[0], device=tensor.device)
+    seq_lens = torch.bincount(shifted_indices)
+    seq_batch_indices = torch.arange(seq_lens.shape[0], device=tensor.device, dtype=torch.int32) // OFFSET[0]
+    mask = seq_lens != 0
+    seq_lens = seq_lens[mask].tolist()
+    seq_batch_indices = seq_batch_indices[mask].tolist()
+    return fwd_indices, bwd_indices, seq_lens, seq_batch_indices
+def sparse_windowed_scaled_dot_product_self_attention(
+    qkv: SparseTensor,
+    window_size: int,
+    shift_window: Tuple[int, int, int] = (0, 0, 0)
+) -> SparseTensor:
+    """
+    Apply windowed scaled dot product self attention to a sparse tensor.
+    Args:
+        qkv (SparseTensor): [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
+        window_size (int): The window size to use.
+        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.
+        shift (int): The shift to use.
+    """
+    assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"
+    serialization_spatial_cache_name = f'window_partition_{window_size}_{shift_window}'
+    serialization_spatial_cache = qkv.get_spatial_cache(serialization_spatial_cache_name)
+    if serialization_spatial_cache is None:
+        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = calc_window_partition(qkv, window_size, shift_window)
+        qkv.register_spatial_cache(serialization_spatial_cache_name, (fwd_indices, bwd_indices, seq_lens, seq_batch_indices))
+    else:
+        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = serialization_spatial_cache
+    M = fwd_indices.shape[0]
+    T = qkv.feats.shape[0]
+    H = qkv.feats.shape[2]
+    C = qkv.feats.shape[3]
+    qkv_feats = qkv.feats[fwd_indices]      # [M, 3, H, C]
+    if DEBUG:
+        start = 0
+        qkv_coords = qkv.coords[fwd_indices]
+        for i in range(len(seq_lens)):
+            seq_coords = qkv_coords[start:start+seq_lens[i]]
+            assert (seq_coords[:, 0] == seq_batch_indices[i]).all(), f"SparseWindowedScaledDotProductSelfAttention: batch index mismatch"
+            assert (seq_coords[:, 1:].max(dim=0).values - seq_coords[:, 1:].min(dim=0).values < window_size).all(), \
+                    f"SparseWindowedScaledDotProductSelfAttention: window size exceeded"
+            start += seq_lens[i]
+    if all([seq_len == window_size for seq_len in seq_lens]):
+        B = len(seq_lens)
+        N = window_size
+        qkv_feats = qkv_feats.reshape(B, N, 3, H, C)
+        if ATTN == 'xformers':
+            q, k, v = qkv_feats.unbind(dim=2)                       # [B, N, H, C]
+            out = xops.memory_efficient_attention(q, k, v)          # [B, N, H, C]
+        elif ATTN == 'flash_attn':
+            out = flash_attn.flash_attn_qkvpacked_func(qkv_feats)   # [B, N, H, C]
+        else:
+            raise ValueError(f"Unknown attention module: {ATTN}")
+        out = out.reshape(B * N, H, C)                              # [M, H, C]
+    else:
+        if ATTN == 'xformers':
+            q, k, v = qkv_feats.unbind(dim=1)                       # [M, H, C]
+            q = q.unsqueeze(0)                                      # [1, M, H, C]
+            k = k.unsqueeze(0)                                      # [1, M, H, C]
+            v = v.unsqueeze(0)                                      # [1, M, H, C]
+            mask = xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
+            out = xops.memory_efficient_attention(q, k, v, mask)[0] # [M, H, C]
+        elif ATTN == 'flash_attn':
+            cu_seqlens = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)], dim=0) \
+                        .to(qkv.device).int()
+            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv_feats, cu_seqlens, max(seq_lens)) # [M, H, C]
+    out = out[bwd_indices]      # [T, H, C]
+    if DEBUG:
+        qkv_coords = qkv_coords[bwd_indices]
+        assert torch.equal(qkv_coords, qkv.coords), "SparseWindowedScaledDotProductSelfAttention: coordinate mismatch"
+    return qkv.replace(out)