MohamedRashad committed
Commit 73c350d · 1 Parent(s): 09758c4

Add initial module structure and base classes for samplers and representations

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50)
  1. .gitattributes +2 -0
  2. app.py +321 -0
  3. assets/example_image/T.png +0 -0
  4. assets/example_image/typical_building_building.png +0 -0
  5. assets/example_image/typical_building_castle.png +0 -0
  6. assets/example_image/typical_building_colorful_cottage.png +0 -0
  7. assets/example_image/typical_building_maya_pyramid.png +0 -0
  8. assets/example_image/typical_building_mushroom.png +0 -0
  9. assets/example_image/typical_building_space_station.png +0 -0
  10. assets/example_image/typical_creature_dragon.png +0 -0
  11. assets/example_image/typical_creature_elephant.png +0 -0
  12. assets/example_image/typical_creature_furry.png +0 -0
  13. assets/example_image/typical_creature_quadruped.png +0 -0
  14. assets/example_image/typical_creature_robot_crab.png +0 -0
  15. assets/example_image/typical_creature_robot_dinosour.png +0 -0
  16. assets/example_image/typical_creature_rock_monster.png +0 -0
  17. assets/example_image/typical_humanoid_block_robot.png +0 -0
  18. assets/example_image/typical_humanoid_dragonborn.png +0 -0
  19. assets/example_image/typical_humanoid_dwarf.png +0 -0
  20. assets/example_image/typical_humanoid_goblin.png +0 -0
  21. assets/example_image/typical_humanoid_mech.png +0 -0
  22. assets/example_image/typical_misc_crate.png +0 -0
  23. assets/example_image/typical_misc_fireplace.png +0 -0
  24. assets/example_image/typical_misc_gate.png +0 -0
  25. assets/example_image/typical_misc_lantern.png +0 -0
  26. assets/example_image/typical_misc_magicbook.png +0 -0
  27. assets/example_image/typical_misc_mailbox.png +0 -0
  28. assets/example_image/typical_misc_monster_chest.png +0 -0
  29. assets/example_image/typical_misc_paper_machine.png +0 -0
  30. assets/example_image/typical_misc_phonograph.png +0 -0
  31. assets/example_image/typical_misc_portal2.png +0 -0
  32. assets/example_image/typical_misc_storage_chest.png +0 -0
  33. assets/example_image/typical_misc_telephone.png +0 -0
  34. assets/example_image/typical_misc_television.png +0 -0
  35. assets/example_image/typical_misc_workbench.png +0 -0
  36. assets/example_image/typical_vehicle_biplane.png +0 -0
  37. assets/example_image/typical_vehicle_bulldozer.png +0 -0
  38. assets/example_image/typical_vehicle_cart.png +0 -0
  39. assets/example_image/typical_vehicle_excavator.png +0 -0
  40. assets/example_image/typical_vehicle_helicopter.png +0 -0
  41. assets/example_image/typical_vehicle_locomotive.png +0 -0
  42. assets/example_image/typical_vehicle_pirate_ship.png +0 -0
  43. assets/example_image/weatherworn_misc_paper_machine3.png +0 -0
  44. extensions/nvdiffrast/LICENSE.txt +97 -0
  45. extensions/nvdiffrast/README.md +42 -0
  46. extensions/nvdiffrast/nvdiffrast/__init__.py +9 -0
  47. extensions/nvdiffrast/nvdiffrast/common/antialias.cu +558 -0
  48. extensions/nvdiffrast/nvdiffrast/common/antialias.h +50 -0
  49. extensions/nvdiffrast/nvdiffrast/common/common.cpp +60 -0
  50. extensions/nvdiffrast/nvdiffrast/common/common.h +263 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
37
+ wheels/*.whl filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,321 @@
1
+ import gradio as gr
2
+ import spaces
3
+ from gradio_litmodel3d import LitModel3D
4
+
5
+ import os
6
+ os.environ['SPCONV_ALGO'] = 'native'
7
+ from typing import *
8
+ import torch
9
+ import numpy as np
10
+ import imageio
11
+ import uuid
12
+ from easydict import EasyDict as edict
13
+ from PIL import Image
14
+ from trellis.pipelines import TrellisImageTo3DPipeline
15
+ from trellis.representations import Gaussian, MeshExtractResult
16
+ from trellis.utils import render_utils, postprocessing_utils
17
+ from huggingface_hub import InferenceClient
18
+
19
+ client = InferenceClient(api_key=os.environ["HF_API_KEY"])
20
+
21
+ def generate_t2i_prompt(item_name):
22
+ llm_prompt_template = """You are tasked with creating a concise yet highly detailed description of an item to be used for generating an image in a game development pipeline. The image should show the **entire item** with no parts cropped or hidden. The background should always be plain and monocolor, with no focus on it.
23
+
24
+ ### Guidelines:
25
+ 1. **Whole Item Focus**: The description should emphasize the full item, ensuring it is clearly depicted in the image.
26
+ 2. **Concise Details**: Use vivid but compact language to describe the item's shape, materials, textures, colors, and unique features. Avoid unnecessary elaboration or context.
27
+ 3. **No Background Details**: Specify that the background is plain and monocolor without describing it further.
28
+
29
+ ### Examples:
30
+ Item: "Golden Pocket Watch"
31
+ A vintage golden pocket watch with intricate floral engravings, polished metal, and Roman numerals on its clock face. Its chain is smooth and reflective, completing the elegant design.
32
+
33
+ Item: "Crystal Vase"
34
+ A tall crystal vase with a fluted top edge, clear polished surface, and delicate floral engravings. The crystal glimmers subtly, showing off its refined craftsmanship.
35
+
36
+ Now generate a concise description for the item: "{item_name}"
37
+ Focus on the item itself, ensuring it is fully described; specify a plain, white background, and keep the output no longer than 77 tokens.
38
+ """
39
+
40
+ messages = [
41
+ {
42
+ "role": "user",
43
+ "content": llm_prompt_template.format(item_name=item_name)
44
+ }
45
+ ]
46
+
47
+ completion = client.chat.completions.create(
48
+ model="Qwen/Qwen2.5-72B-Instruct",
49
+ messages=messages,
50
+ max_tokens=500
51
+ )
52
+ object_t2i_prompt = completion.choices[0].message.content
53
+ print(object_t2i_prompt)
54
+
55
+ return object_t2i_prompt
56
+
57
+ # generate_t2i_prompt("Golden Islamic Mosque")
58
+ # exit()
59
+
60
+ def generate_item_image(object_t2i_prompt):
61
+ image = client.text_to_image(object_t2i_prompt, model="black-forest-labs/FLUX.1-dev", width=1024, height=1024, guidance_scale=3.5, num_inference_steps=28)
62
+ trial_id, processed_image = preprocess_image(image)
63
+ return trial_id, processed_image
64
+
65
+
66
+ MAX_SEED = np.iinfo(np.int32).max
67
+ TMP_DIR = "/tmp/Trellis-demo"
68
+
69
+ os.makedirs(TMP_DIR, exist_ok=True)
70
+
71
+
72
+ def preprocess_image(image: Image.Image) -> Tuple[str, Image.Image]:
73
+ """
74
+ Preprocess the input image.
75
+
76
+ Args:
77
+ image (Image.Image): The input image.
78
+
79
+ Returns:
80
+ str: uuid of the trial.
81
+ Image.Image: The preprocessed image.
82
+ """
83
+ trial_id = str(uuid.uuid4())
84
+ processed_image = pipeline.preprocess_image(image)
85
+ processed_image.save(f"{TMP_DIR}/{trial_id}.png")
86
+ return trial_id, processed_image
87
+
88
+
89
+ def pack_state(gs: Gaussian, mesh: MeshExtractResult, trial_id: str) -> dict:
90
+ return {
91
+ 'gaussian': {
92
+ **gs.init_params,
93
+ '_xyz': gs._xyz.cpu().numpy(),
94
+ '_features_dc': gs._features_dc.cpu().numpy(),
95
+ '_scaling': gs._scaling.cpu().numpy(),
96
+ '_rotation': gs._rotation.cpu().numpy(),
97
+ '_opacity': gs._opacity.cpu().numpy(),
98
+ },
99
+ 'mesh': {
100
+ 'vertices': mesh.vertices.cpu().numpy(),
101
+ 'faces': mesh.faces.cpu().numpy(),
102
+ },
103
+ 'trial_id': trial_id,
104
+ }
105
+
106
+
107
+ def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
108
+ gs = Gaussian(
109
+ aabb=state['gaussian']['aabb'],
110
+ sh_degree=state['gaussian']['sh_degree'],
111
+ mininum_kernel_size=state['gaussian']['mininum_kernel_size'],
112
+ scaling_bias=state['gaussian']['scaling_bias'],
113
+ opacity_bias=state['gaussian']['opacity_bias'],
114
+ scaling_activation=state['gaussian']['scaling_activation'],
115
+ )
116
+ gs._xyz = torch.tensor(state['gaussian']['_xyz'], device='cuda')
117
+ gs._features_dc = torch.tensor(state['gaussian']['_features_dc'], device='cuda')
118
+ gs._scaling = torch.tensor(state['gaussian']['_scaling'], device='cuda')
119
+ gs._rotation = torch.tensor(state['gaussian']['_rotation'], device='cuda')
120
+ gs._opacity = torch.tensor(state['gaussian']['_opacity'], device='cuda')
121
+
122
+ mesh = edict(
123
+ vertices=torch.tensor(state['mesh']['vertices'], device='cuda'),
124
+ faces=torch.tensor(state['mesh']['faces'], device='cuda'),
125
+ )
126
+
127
+ return gs, mesh, state['trial_id']
128
+
129
+
130
+ @spaces.GPU
131
+ def image_to_3d(trial_id: str, seed: int, randomize_seed: bool, ss_guidance_strength: float, ss_sampling_steps: int, slat_guidance_strength: float, slat_sampling_steps: int) -> Tuple[dict, str]:
132
+ """
133
+ Convert an image to a 3D model.
134
+
135
+ Args:
136
+ trial_id (str): The uuid of the trial.
137
+ seed (int): The random seed.
138
+ randomize_seed (bool): Whether to randomize the seed.
139
+ ss_guidance_strength (float): The guidance strength for sparse structure generation.
140
+ ss_sampling_steps (int): The number of sampling steps for sparse structure generation.
141
+ slat_guidance_strength (float): The guidance strength for structured latent generation.
142
+ slat_sampling_steps (int): The number of sampling steps for structured latent generation.
143
+
144
+ Returns:
145
+ dict: The information of the generated 3D model.
146
+ str: The path to the video of the 3D model.
147
+ """
148
+ if randomize_seed:
149
+ seed = np.random.randint(0, MAX_SEED)
150
+ outputs = pipeline.run(
151
+ Image.open(f"{TMP_DIR}/{trial_id}.png"),
152
+ seed=seed,
153
+ formats=["gaussian", "mesh"],
154
+ preprocess_image=False,
155
+ sparse_structure_sampler_params={
156
+ "steps": ss_sampling_steps,
157
+ "cfg_strength": ss_guidance_strength,
158
+ },
159
+ slat_sampler_params={
160
+ "steps": slat_sampling_steps,
161
+ "cfg_strength": slat_guidance_strength,
162
+ },
163
+ )
164
+ video = render_utils.render_video(outputs['gaussian'][0], num_frames=120)['color']
165
+ video_geo = render_utils.render_video(outputs['mesh'][0], num_frames=120)['normal']
166
+ video = [np.concatenate([video[i], video_geo[i]], axis=1) for i in range(len(video))]
167
+ trial_id = uuid.uuid4()
168
+ video_path = f"{TMP_DIR}/{trial_id}.mp4"
169
+ os.makedirs(os.path.dirname(video_path), exist_ok=True)
170
+ imageio.mimsave(video_path, video, fps=15)
171
+ state = pack_state(outputs['gaussian'][0], outputs['mesh'][0], trial_id)
172
+ return state, video_path
173
+
174
+
175
+ @spaces.GPU
176
+ def extract_glb(state: dict, mesh_simplify: float, texture_size: int) -> Tuple[str, str]:
177
+ """
178
+ Extract a GLB file from the 3D model.
179
+
180
+ Args:
181
+ state (dict): The state of the generated 3D model.
182
+ mesh_simplify (float): The mesh simplification factor.
183
+ texture_size (int): The texture resolution.
184
+
185
+ Returns:
186
+ str: The path to the extracted GLB file.
187
+ """
188
+ gs, mesh, trial_id = unpack_state(state)
189
+ glb = postprocessing_utils.to_glb(gs, mesh, simplify=mesh_simplify, texture_size=texture_size, verbose=False)
190
+ glb_path = f"{TMP_DIR}/{trial_id}.glb"
191
+ glb.export(glb_path)
192
+ return glb_path, glb_path
193
+
194
+
195
+ def activate_button() -> gr.Button:
196
+ return gr.Button(interactive=True)
197
+
198
+
199
+ def deactivate_button() -> gr.Button:
200
+ return gr.Button(interactive=False)
201
+
202
+
203
+ with gr.Blocks(title="Game Items Generator") as demo:
204
+ gr.HTML("<h1 style='text-align: center;'>Game Items Generator</h1>")
205
+ gr.Markdown("""
206
+ ## Text or Image to 3D Asset with [TRELLIS](https://trellis3d.github.io/)
207
+ - Describe, in a few simple words, the item you want for your game and click "Enhance Prompt" to generate a text-to-image prompt.
208
+ - Click "Generate Image" to generate an image of the item, or bypass the previous steps and upload your own image.
209
+ - Click "Generate 3D video" to create a 3D asset. If the image has an alpha channel, it will be used as the mask; otherwise, `rembg` is used to remove the background.
210
+ - If you find the generated 3D asset satisfactory, click "Extract GLB" to extract the GLB file and download it.
211
+ """)
212
+
213
+ with gr.Row():
214
+ with gr.Column():
215
+ with gr.Row():
216
+ item_text_field = gr.Textbox(label="Item Name", placeholder="Enter the name of the item", lines=2, scale=4)
217
+ enhance_prompt_btn = gr.Button("Enhance Prompt", variant="primary", scale=1)
218
+ generate_image_btn = gr.Button("Generate Image", variant="primary")
219
+ image_prompt = gr.Image(label="Image Prompt", image_mode="RGBA", type="pil", height=300)
220
+
221
+ with gr.Accordion(label="Generation Settings", open=False):
222
+ seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
223
+ randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
224
+ gr.Markdown("Stage 1: Sparse Structure Generation")
225
+ with gr.Row():
226
+ ss_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
227
+ ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
228
+ gr.Markdown("Stage 2: Structured Latent Generation")
229
+ with gr.Row():
230
+ slat_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=3.0, step=0.1)
231
+ slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
232
+
233
+ generate_btn = gr.Button("Generate 3D video")
234
+
235
+ with gr.Accordion(label="GLB Extraction Settings", open=False):
236
+ mesh_simplify = gr.Slider(0.9, 0.98, label="Simplify", value=0.95, step=0.01)
237
+ texture_size = gr.Slider(512, 2048, label="Texture Size", value=1024, step=512)
238
+
239
+ extract_glb_btn = gr.Button("Extract GLB", interactive=False)
240
+
241
+ with gr.Column():
242
+ video_output = gr.Video(label="Generated 3D Asset", autoplay=True, loop=True, height=300)
243
+ model_output = LitModel3D(label="Extracted GLB", exposure=20.0, height=300)
244
+ download_glb = gr.DownloadButton(label="Download GLB", interactive=False)
245
+
246
+ trial_id = gr.Textbox(visible=False)
247
+ output_buf = gr.State()
248
+
249
+ # Example images at the bottom of the page
250
+ with gr.Row():
251
+ examples = gr.Examples(
252
+ examples=[
253
+ f'assets/example_image/{image}'
254
+ for image in os.listdir("assets/example_image")
255
+ ],
256
+ inputs=[image_prompt],
257
+ fn=preprocess_image,
258
+ outputs=[trial_id, image_prompt],
259
+ run_on_click=True,
260
+ examples_per_page=64,
261
+ )
262
+
263
+ # Handlers
264
+ enhance_prompt_btn.click(
265
+ generate_t2i_prompt,
266
+ inputs=[item_text_field],
267
+ outputs=[item_text_field],
268
+ )
269
+ generate_image_btn.click(
270
+ generate_item_image,
271
+ inputs=[item_text_field],
272
+ outputs=[trial_id, image_prompt],
273
+ )
274
+ image_prompt.upload(
275
+ preprocess_image,
276
+ inputs=[image_prompt],
277
+ outputs=[trial_id, image_prompt],
278
+ )
279
+ image_prompt.clear(
280
+ lambda: '',
281
+ outputs=[trial_id],
282
+ )
283
+
284
+ generate_btn.click(
285
+ image_to_3d,
286
+ inputs=[trial_id, seed, randomize_seed, ss_guidance_strength, ss_sampling_steps, slat_guidance_strength, slat_sampling_steps],
287
+ outputs=[output_buf, video_output],
288
+ ).then(
289
+ activate_button,
290
+ outputs=[extract_glb_btn],
291
+ )
292
+
293
+ video_output.clear(
294
+ deactivate_button,
295
+ outputs=[extract_glb_btn],
296
+ )
297
+
298
+ extract_glb_btn.click(
299
+ extract_glb,
300
+ inputs=[output_buf, mesh_simplify, texture_size],
301
+ outputs=[model_output, download_glb],
302
+ ).then(
303
+ activate_button,
304
+ outputs=[download_glb],
305
+ )
306
+
307
+ model_output.clear(
308
+ deactivate_button,
309
+ outputs=[download_glb],
310
+ )
311
+
312
+
313
+ # Launch the Gradio app
314
+ if __name__ == "__main__":
315
+ pipeline = TrellisImageTo3DPipeline.from_pretrained("JeffreyXiang/TRELLIS-image-large")
316
+ pipeline.cuda()
317
+ try:
318
+ pipeline.preprocess_image(Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))) # Preload rembg
319
+ except Exception:
320
+ pass
321
+ demo.launch()
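The handlers above already show the full pipeline wiring; for readers who want to drive the same flow without the Gradio UI, here is a minimal headless sketch. It reuses only APIs that app.py itself imports (`TrellisImageTo3DPipeline.from_pretrained`, `pipeline.preprocess_image`, `pipeline.run`, `render_utils.render_video`, `postprocessing_utils.to_glb`); the input path, seed, and output filenames are illustrative placeholders, not part of the commit.

```python
# Headless sketch mirroring app.py's image -> 3D -> GLB flow (illustrative; paths are placeholders).
import imageio
from PIL import Image
from trellis.pipelines import TrellisImageTo3DPipeline
from trellis.utils import render_utils, postprocessing_utils

pipeline = TrellisImageTo3DPipeline.from_pretrained("JeffreyXiang/TRELLIS-image-large")
pipeline.cuda()

image = pipeline.preprocess_image(Image.open("my_item.png"))  # background removal / cropping

outputs = pipeline.run(
    image,
    seed=42,
    formats=["gaussian", "mesh"],
    preprocess_image=False,  # already preprocessed above
    sparse_structure_sampler_params={"steps": 12, "cfg_strength": 7.5},
    slat_sampler_params={"steps": 12, "cfg_strength": 3.0},
)

# Turntable preview of the Gaussian representation, same as the app's video output.
frames = render_utils.render_video(outputs["gaussian"][0], num_frames=120)["color"]
imageio.mimsave("preview.mp4", frames, fps=15)

# Textured GLB export with the UI's default extraction settings.
glb = postprocessing_utils.to_glb(
    outputs["gaussian"][0], outputs["mesh"][0],
    simplify=0.95, texture_size=1024, verbose=False,
)
glb.export("item.glb")
```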
assets/example_image/T.png ADDED
assets/example_image/typical_building_building.png ADDED
assets/example_image/typical_building_castle.png ADDED
assets/example_image/typical_building_colorful_cottage.png ADDED
assets/example_image/typical_building_maya_pyramid.png ADDED
assets/example_image/typical_building_mushroom.png ADDED
assets/example_image/typical_building_space_station.png ADDED
assets/example_image/typical_creature_dragon.png ADDED
assets/example_image/typical_creature_elephant.png ADDED
assets/example_image/typical_creature_furry.png ADDED
assets/example_image/typical_creature_quadruped.png ADDED
assets/example_image/typical_creature_robot_crab.png ADDED
assets/example_image/typical_creature_robot_dinosour.png ADDED
assets/example_image/typical_creature_rock_monster.png ADDED
assets/example_image/typical_humanoid_block_robot.png ADDED
assets/example_image/typical_humanoid_dragonborn.png ADDED
assets/example_image/typical_humanoid_dwarf.png ADDED
assets/example_image/typical_humanoid_goblin.png ADDED
assets/example_image/typical_humanoid_mech.png ADDED
assets/example_image/typical_misc_crate.png ADDED
assets/example_image/typical_misc_fireplace.png ADDED
assets/example_image/typical_misc_gate.png ADDED
assets/example_image/typical_misc_lantern.png ADDED
assets/example_image/typical_misc_magicbook.png ADDED
assets/example_image/typical_misc_mailbox.png ADDED
assets/example_image/typical_misc_monster_chest.png ADDED
assets/example_image/typical_misc_paper_machine.png ADDED
assets/example_image/typical_misc_phonograph.png ADDED
assets/example_image/typical_misc_portal2.png ADDED
assets/example_image/typical_misc_storage_chest.png ADDED
assets/example_image/typical_misc_telephone.png ADDED
assets/example_image/typical_misc_television.png ADDED
assets/example_image/typical_misc_workbench.png ADDED
assets/example_image/typical_vehicle_biplane.png ADDED
assets/example_image/typical_vehicle_bulldozer.png ADDED
assets/example_image/typical_vehicle_cart.png ADDED
assets/example_image/typical_vehicle_excavator.png ADDED
assets/example_image/typical_vehicle_helicopter.png ADDED
assets/example_image/typical_vehicle_locomotive.png ADDED
assets/example_image/typical_vehicle_pirate_ship.png ADDED
assets/example_image/weatherworn_misc_paper_machine3.png ADDED
extensions/nvdiffrast/LICENSE.txt ADDED
@@ -0,0 +1,97 @@
1
+ Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
2
+
3
+
4
+ Nvidia Source Code License (1-Way Commercial)
5
+
6
+ =======================================================================
7
+
8
+ 1. Definitions
9
+
10
+ "Licensor" means any person or entity that distributes its Work.
11
+
12
+ "Software" means the original work of authorship made available under
13
+ this License.
14
+
15
+ "Work" means the Software and any additions to or derivative works of
16
+ the Software that are made available under this License.
17
+
18
+ The terms "reproduce," "reproduction," "derivative works," and
19
+ "distribution" have the meaning as provided under U.S. copyright law;
20
+ provided, however, that for the purposes of this License, derivative
21
+ works shall not include works that remain separable from, or merely
22
+ link (or bind by name) to the interfaces of, the Work.
23
+
24
+ Works, including the Software, are "made available" under this License
25
+ by including in or with the Work either (a) a copyright notice
26
+ referencing the applicability of this License to the Work, or (b) a
27
+ copy of this License.
28
+
29
+ 2. License Grants
30
+
31
+ 2.1 Copyright Grant. Subject to the terms and conditions of this
32
+ License, each Licensor grants to you a perpetual, worldwide,
33
+ non-exclusive, royalty-free, copyright license to reproduce,
34
+ prepare derivative works of, publicly display, publicly perform,
35
+ sublicense and distribute its Work and any resulting derivative
36
+ works in any form.
37
+
38
+ 3. Limitations
39
+
40
+ 3.1 Redistribution. You may reproduce or distribute the Work only
41
+ if (a) you do so under this License, (b) you include a complete
42
+ copy of this License with your distribution, and (c) you retain
43
+ without modification any copyright, patent, trademark, or
44
+ attribution notices that are present in the Work.
45
+
46
+ 3.2 Derivative Works. You may specify that additional or different
47
+ terms apply to the use, reproduction, and distribution of your
48
+ derivative works of the Work ("Your Terms") only if (a) Your Terms
49
+ provide that the use limitation in Section 3.3 applies to your
50
+ derivative works, and (b) you identify the specific derivative
51
+ works that are subject to Your Terms. Notwithstanding Your Terms,
52
+ this License (including the redistribution requirements in Section
53
+ 3.1) will continue to apply to the Work itself.
54
+
55
+ 3.3 Use Limitation. The Work and any derivative works thereof only
56
+ may be used or intended for use non-commercially. The Work or
57
+ derivative works thereof may be used or intended for use by Nvidia
58
+ or its affiliates commercially or non-commercially. As used herein,
59
+ "non-commercially" means for research or evaluation purposes only
60
+ and not for any direct or indirect monetary gain.
61
+
62
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim
63
+ against any Licensor (including any claim, cross-claim or
64
+ counterclaim in a lawsuit) to enforce any patents that you allege
65
+ are infringed by any Work, then your rights under this License from
66
+ such Licensor (including the grant in Section 2.1) will terminate
67
+ immediately.
68
+
69
+ 3.5 Trademarks. This License does not grant any rights to use any
70
+ Licensor's or its affiliates' names, logos, or trademarks, except
71
+ as necessary to reproduce the notices described in this License.
72
+
73
+ 3.6 Termination. If you violate any term of this License, then your
74
+ rights under this License (including the grant in Section 2.1) will
75
+ terminate immediately.
76
+
77
+ 4. Disclaimer of Warranty.
78
+
79
+ THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
80
+ KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
81
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
82
+ NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
83
+ THIS LICENSE.
84
+
85
+ 5. Limitation of Liability.
86
+
87
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
88
+ THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
89
+ SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
90
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
91
+ OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
92
+ (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
93
+ LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
94
+ COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
95
+ THE POSSIBILITY OF SUCH DAMAGES.
96
+
97
+ =======================================================================
extensions/nvdiffrast/README.md ADDED
@@ -0,0 +1,42 @@
1
+ ## Nvdiffrast &ndash; Modular Primitives for High-Performance Differentiable Rendering
2
+
3
+ ![Teaser image](./docs/img/teaser.png)
4
+
5
+ **Modular Primitives for High-Performance Differentiable Rendering**<br>
6
+ Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
7
+ [http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
8
+
9
+ Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
10
+ Please refer to &#x261E;&#x261E; [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) &#x261C;&#x261C; for more information.
11
+
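As a quick orientation, the typical `nvdiffrast.torch` workflow is a rasterize → interpolate → antialias chain. The following is a minimal sketch for a single triangle, assuming a CUDA-capable GPU and the `nvdiffrast.torch` module built by this extension; the tensor shapes and values are illustrative, not taken from this repository.

```python
# Minimal sketch of the rasterize -> interpolate -> antialias chain (illustrative values).
import torch
import nvdiffrast.torch as dr

glctx = dr.RasterizeCudaContext()  # a RasterizeGLContext can be used instead

# One triangle: clip-space positions [1, V, 4], int32 indices [T, 3], per-vertex RGB [1, V, 3].
pos = torch.tensor([[[-0.8, -0.8, 0.0, 1.0],
                     [ 0.8, -0.8, 0.0, 1.0],
                     [ 0.0,  0.8, 0.0, 1.0]]], device="cuda", requires_grad=True)
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device="cuda")
col = torch.tensor([[[1.0, 0.0, 0.0],
                     [0.0, 1.0, 0.0],
                     [0.0, 0.0, 1.0]]], device="cuda", requires_grad=True)

rast, _ = dr.rasterize(glctx, pos, tri, resolution=[512, 512])  # coverage, barycentrics, triangle ids
out, _ = dr.interpolate(col, rast, tri)                         # interpolate vertex attributes per pixel
out = dr.antialias(out, rast, pos, tri)                         # analytic silhouette AA (see antialias.cu below)
out.sum().backward()                                            # gradients flow back to pos and col
```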
12
+ ## Licenses
13
+
14
+ Copyright &copy; 2020&ndash;2024, NVIDIA Corporation. All rights reserved.
15
+
16
+ This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
17
+
18
+ For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
19
+
20
+ We do not currently accept outside code contributions in the form of pull requests.
21
+
22
+ Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
23
+ [sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
24
+ originally shared under
25
+ [MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
26
+ Mesh and texture stored as part of `samples/data/earth.npz` are derived from
27
+ [3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
28
+ model originally made available under
29
+ [TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
30
+
31
+ ## Citation
32
+
33
+ ```
34
+ @article{Laine2020diffrast,
35
+ title = {Modular Primitives for High-Performance Differentiable Rendering},
36
+ author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
37
+ journal = {ACM Transactions on Graphics},
38
+ year = {2020},
39
+ volume = {39},
40
+ number = {6}
41
+ }
42
+ ```
extensions/nvdiffrast/nvdiffrast/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ __version__ = '0.3.3'
extensions/nvdiffrast/nvdiffrast/common/antialias.cu ADDED
@@ -0,0 +1,558 @@
1
+ // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include "antialias.h"
10
+
11
+ //------------------------------------------------------------------------
12
+ // Helpers.
13
+
14
+ #define F32_MAX (3.402823466e+38f)
15
+ static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; }
16
+ static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); }
17
+ static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2)
18
+ {
19
+ bool g10 = rational_gt(n1, n0, d1, d0);
20
+ bool g20 = rational_gt(n2, n0, d2, d0);
21
+ bool g21 = rational_gt(n2, n1, d2, d1);
22
+ if (g20 && g21) return 2;
23
+ if (g10) return 1;
24
+ return 0;
25
+ }
26
+
27
+ //------------------------------------------------------------------------
28
+ // Format of antialiasing work items stored in work buffer. Usually accessed directly as int4.
29
+
30
+ struct AAWorkItem
31
+ {
32
+ enum
33
+ {
34
+ EDGE_MASK = 3, // Edge index in lowest bits.
35
+ FLAG_DOWN_BIT = 2, // Down instead of right.
36
+ FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle.
37
+ };
38
+
39
+ int px, py; // Pixel x, y.
40
+ unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags.
41
+ float alpha; // Antialiasing alpha value. Zero if no AA.
42
+ };
43
+
44
+ //------------------------------------------------------------------------
45
+ // Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html
46
+
47
+ #define JENKINS_MAGIC (0x9e3779b9u)
48
+ static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c)
49
+ {
50
+ a -= b; a -= c; a ^= (c>>13);
51
+ b -= c; b -= a; b ^= (a<<8);
52
+ c -= a; c -= b; c ^= (b>>13);
53
+ a -= b; a -= c; a ^= (c>>12);
54
+ b -= c; b -= a; b ^= (a<<16);
55
+ c -= a; c -= b; c ^= (b>>5);
56
+ a -= b; a -= c; a ^= (c>>3);
57
+ b -= c; b -= a; b ^= (a<<10);
58
+ c -= a; c -= b; c ^= (b>>15);
59
+ }
60
+
61
+ // Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip.
62
+ class HashIndex
63
+ {
64
+ public:
65
+ __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key)
66
+ {
67
+ m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1; // This should work until triangle count exceeds 1073741824.
68
+ m_idx = (uint32_t)(key & 0xffffffffu);
69
+ m_skip = (uint32_t)(key >> 32);
70
+ uint32_t dummy = JENKINS_MAGIC;
71
+ jenkins_mix(m_idx, m_skip, dummy);
72
+ m_idx &= m_mask;
73
+ m_skip &= m_mask;
74
+ m_skip |= 1;
75
+ }
76
+ __device__ __forceinline__ int get(void) const { return m_idx; }
77
+ __device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; }
78
+ private:
79
+ uint32_t m_idx, m_skip, m_mask;
80
+ };
81
+
82
+ static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v)
83
+ {
84
+ HashIndex idx(p, key);
85
+ while(1)
86
+ {
87
+ uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key);
88
+ if (prev == 0 || prev == key)
89
+ break;
90
+ idx.next();
91
+ }
92
+ int* q = (int*)&p.evHash[idx.get()];
93
+ int a = atomicCAS(q+2, 0, v);
94
+ if (a != 0 && a != v)
95
+ atomicCAS(q+3, 0, v);
96
+ }
97
+
98
+ static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key)
99
+ {
100
+ HashIndex idx(p, key);
101
+ while(1)
102
+ {
103
+ uint4 entry = p.evHash[idx.get()];
104
+ uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32);
105
+ if (k == key || k == 0)
106
+ return make_int2((int)entry.z, (int)entry.w);
107
+ idx.next();
108
+ }
109
+ }
110
+
111
+ static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn)
112
+ {
113
+ if (va == vb)
114
+ return;
115
+
116
+ uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
117
+ uint64_t v1 = (uint32_t)max(va, vb) + 1;
118
+ uint64_t vk = v0 | (v1 << 32); // hash key
119
+ hash_insert(p, vk, vn + 1);
120
+ }
121
+
122
+ static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr)
123
+ {
124
+ if (va == vb)
125
+ return -1;
126
+
127
+ uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
128
+ uint64_t v1 = (uint32_t)max(va, vb) + 1;
129
+ uint64_t vk = v0 | (v1 << 32); // hash key
130
+ int2 vn = hash_find(p, vk) - 1;
131
+ if (vn.x == vr) return vn.y;
132
+ if (vn.y == vr) return vn.x;
133
+ return -1;
134
+ }
135
+
136
+ //------------------------------------------------------------------------
137
+ // Mesh analysis kernel.
138
+
139
+ __global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p)
140
+ {
141
+ int idx = threadIdx.x + blockIdx.x * blockDim.x;
142
+ if (idx >= p.numTriangles)
143
+ return;
144
+
145
+ int v0 = p.tri[idx * 3 + 0];
146
+ int v1 = p.tri[idx * 3 + 1];
147
+ int v2 = p.tri[idx * 3 + 2];
148
+
149
+ if (v0 < 0 || v0 >= p.numVertices ||
150
+ v1 < 0 || v1 >= p.numVertices ||
151
+ v2 < 0 || v2 >= p.numVertices)
152
+ return;
153
+
154
+ if (v0 == v1 || v1 == v2 || v2 == v0)
155
+ return;
156
+
157
+ evhash_insert_vertex(p, v1, v2, v0);
158
+ evhash_insert_vertex(p, v2, v0, v1);
159
+ evhash_insert_vertex(p, v0, v1, v2);
160
+ }
161
+
162
+ //------------------------------------------------------------------------
163
+ // Discontinuity finder kernel.
164
+
165
+ __global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p)
166
+ {
167
+ // Calculate pixel position.
168
+ int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x;
169
+ int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y;
170
+ int pz = blockIdx.z;
171
+ if (px >= p.width || py >= p.height || pz >= p.n)
172
+ return;
173
+
174
+ // Pointer to our TriIdx and fetch.
175
+ int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3;
176
+ float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other.
177
+
178
+ // Look right, clamp at edge.
179
+ int pidx1 = pidx0;
180
+ if (px < p.width - 1)
181
+ pidx1 += 4;
182
+ float tri1 = p.rasterOut[pidx1];
183
+
184
+ // Look down, clamp at edge.
185
+ int pidx2 = pidx0;
186
+ if (py < p.height - 1)
187
+ pidx2 += p.width << 2;
188
+ float tri2 = p.rasterOut[pidx2];
189
+
190
+ // Determine amount of work.
191
+ int count = 0;
192
+ if (tri1 != tri0) count = 1;
193
+ if (tri2 != tri0) count += 1;
194
+ if (!count)
195
+ return; // Exit warp.
196
+
197
+ // Coalesce work counter update to once per CTA.
198
+ __shared__ int s_temp;
199
+ s_temp = 0;
200
+ __syncthreads();
201
+ int idx = atomicAdd(&s_temp, count);
202
+ __syncthreads();
203
+ if (idx == 0)
204
+ {
205
+ int base = atomicAdd(&p.workBuffer[0].x, s_temp);
206
+ s_temp = base + 1; // don't clobber the counters in first slot.
207
+ }
208
+ __syncthreads();
209
+ idx += s_temp;
210
+
211
+ // Write to memory.
212
+ if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0);
213
+ if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0);
214
+ }
215
+
216
+ //------------------------------------------------------------------------
217
+ // Forward analysis kernel.
218
+
219
+ __global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p)
220
+ {
221
+ __shared__ int s_base;
222
+ int workCount = p.workBuffer[0].x;
223
+ for(;;)
224
+ {
225
+ // Persistent threads work fetcher.
226
+ __syncthreads();
227
+ if (threadIdx.x == 0)
228
+ s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK);
229
+ __syncthreads();
230
+ int thread_idx = s_base + threadIdx.x;
231
+ if (thread_idx >= workCount)
232
+ return;
233
+
234
+ int4* pItem = p.workBuffer + thread_idx + 1;
235
+ int4 item = *pItem;
236
+ int px = item.x;
237
+ int py = item.y;
238
+ int pz = (int)(((unsigned int)item.z) >> 16);
239
+ int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
240
+
241
+ int pixel0 = px + p.width * (py + p.height * pz);
242
+ int pixel1 = pixel0 + (d ? p.width : 1);
243
+ float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1];
244
+ float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1];
245
+ int tri0 = float_to_triidx(zt0.y) - 1;
246
+ int tri1 = float_to_triidx(zt1.y) - 1;
247
+
248
+ // Select triangle based on background / depth.
249
+ int tri = (tri0 >= 0) ? tri0 : tri1;
250
+ if (tri0 >= 0 && tri1 >= 0)
251
+ tri = (zt0.x < zt1.x) ? tri0 : tri1;
252
+ if (tri == tri1)
253
+ {
254
+ // Calculate with respect to the neighbor pixel if we chose that triangle.
255
+ px += 1 - d;
256
+ py += d;
257
+ }
258
+
259
+ // Bail out if triangle index is corrupt.
260
+ if (tri < 0 || tri >= p.numTriangles)
261
+ continue;
262
+
263
+ // Fetch vertex indices.
264
+ int vi0 = p.tri[tri * 3 + 0];
265
+ int vi1 = p.tri[tri * 3 + 1];
266
+ int vi2 = p.tri[tri * 3 + 2];
267
+
268
+ // Bail out if vertex indices are corrupt.
269
+ if (vi0 < 0 || vi0 >= p.numVertices ||
270
+ vi1 < 0 || vi1 >= p.numVertices ||
271
+ vi2 < 0 || vi2 >= p.numVertices)
272
+ continue;
273
+
274
+ // Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists.
275
+ int op0 = evhash_find_vertex(p, vi2, vi1, vi0);
276
+ int op1 = evhash_find_vertex(p, vi0, vi2, vi1);
277
+ int op2 = evhash_find_vertex(p, vi1, vi0, vi2);
278
+
279
+ // Instance mode: Adjust vertex indices based on minibatch index.
280
+ if (p.instance_mode)
281
+ {
282
+ int vbase = pz * p.numVertices;
283
+ vi0 += vbase;
284
+ vi1 += vbase;
285
+ vi2 += vbase;
286
+ if (op0 >= 0) op0 += vbase;
287
+ if (op1 >= 0) op1 += vbase;
288
+ if (op2 >= 0) op2 += vbase;
289
+ }
290
+
291
+ // Fetch vertex positions.
292
+ float4 p0 = ((float4*)p.pos)[vi0];
293
+ float4 p1 = ((float4*)p.pos)[vi1];
294
+ float4 p2 = ((float4*)p.pos)[vi2];
295
+ float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0];
296
+ float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1];
297
+ float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2];
298
+
299
+ // Project vertices to pixel space.
300
+ float w0 = 1.f / p0.w;
301
+ float w1 = 1.f / p1.w;
302
+ float w2 = 1.f / p2.w;
303
+ float ow0 = 1.f / o0.w;
304
+ float ow1 = 1.f / o1.w;
305
+ float ow2 = 1.f / o2.w;
306
+ float fx = (float)px + .5f - p.xh;
307
+ float fy = (float)py + .5f - p.yh;
308
+ float x0 = p0.x * w0 * p.xh - fx;
309
+ float y0 = p0.y * w0 * p.yh - fy;
310
+ float x1 = p1.x * w1 * p.xh - fx;
311
+ float y1 = p1.y * w1 * p.yh - fy;
312
+ float x2 = p2.x * w2 * p.xh - fx;
313
+ float y2 = p2.y * w2 * p.yh - fy;
314
+ float ox0 = o0.x * ow0 * p.xh - fx;
315
+ float oy0 = o0.y * ow0 * p.yh - fy;
316
+ float ox1 = o1.x * ow1 * p.xh - fx;
317
+ float oy1 = o1.y * ow1 * p.yh - fy;
318
+ float ox2 = o2.x * ow2 * p.xh - fx;
319
+ float oy2 = o2.y * ow2 * p.yh - fy;
320
+
321
+ // Signs to kill non-silhouette edges.
322
+ float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself.
323
+ float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings.
324
+ float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1);
325
+ float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2);
326
+
327
+ // If no matching signs anywhere, skip the rest.
328
+ if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb))
329
+ {
330
+ // XY flip for horizontal edges.
331
+ if (d)
332
+ {
333
+ swap(x0, y0);
334
+ swap(x1, y1);
335
+ swap(x2, y2);
336
+ }
337
+
338
+ float dx0 = x2 - x1;
339
+ float dx1 = x0 - x2;
340
+ float dx2 = x1 - x0;
341
+ float dy0 = y2 - y1;
342
+ float dy1 = y0 - y2;
343
+ float dy2 = y1 - y0;
344
+
345
+ // Check if an edge crosses between us and the neighbor pixel.
346
+ float dc = -F32_MAX;
347
+ float ds = (tri == tri0) ? 1.f : -1.f;
348
+ float d0 = ds * (x1*dy0 - y1*dx0);
349
+ float d1 = ds * (x2*dy1 - y2*dx1);
350
+ float d2 = ds * (x0*dy2 - y0*dx2);
351
+
352
+ if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f;
353
+ if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f;
354
+ if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f;
355
+
356
+ int di = max_idx3(d0, d1, d2, dy0, dy1, dy2);
357
+ if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0;
358
+ if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1;
359
+ if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2;
360
+ float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy.
361
+
362
+ // Adjust output image if a suitable edge was found.
363
+ if (dc > -eps && dc < 1.f + eps)
364
+ {
365
+ dc = fminf(fmaxf(dc, 0.f), 1.f);
366
+ float alpha = ds * (.5f - dc);
367
+ const float* pColor0 = p.color + pixel0 * p.channels;
368
+ const float* pColor1 = p.color + pixel1 * p.channels;
369
+ float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
370
+ for (int i=0; i < p.channels; i++)
371
+ atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i]));
372
+
373
+ // Rewrite the work item's flags and alpha. Keep original px, py.
374
+ unsigned int flags = pz << 16;
375
+ flags |= di;
376
+ flags |= d << AAWorkItem::FLAG_DOWN_BIT;
377
+ flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT;
378
+ ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha));
379
+ }
380
+ }
381
+ }
382
+ }
383
+
384
+ //------------------------------------------------------------------------
385
+ // Gradient kernel.
386
+
387
+ __global__ void AntialiasGradKernel(const AntialiasKernelParams p)
388
+ {
389
+ // Temporary space for coalesced atomics.
390
+ CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK);
391
+ __shared__ int s_base; // Work counter communication across entire CTA.
392
+
393
+ int workCount = p.workBuffer[0].x;
394
+
395
+ for(;;)
396
+ {
397
+ // Persistent threads work fetcher.
398
+ __syncthreads();
399
+ if (threadIdx.x == 0)
400
+ s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK);
401
+ __syncthreads();
402
+ int thread_idx = s_base + threadIdx.x;
403
+ if (thread_idx >= workCount)
404
+ return;
405
+
406
+ // Read work item filled out by forward kernel.
407
+ int4 item = p.workBuffer[thread_idx + 1];
408
+ unsigned int amask = __ballot_sync(0xffffffffu, item.w);
409
+ if (item.w == 0)
410
+ continue; // No effect.
411
+
412
+ // Unpack work item and replicate setup from forward analysis kernel.
413
+ int px = item.x;
414
+ int py = item.y;
415
+ int pz = (int)(((unsigned int)item.z) >> 16);
416
+ int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
417
+ float alpha = __int_as_float(item.w);
418
+ int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1;
419
+ int di = item.z & AAWorkItem::EDGE_MASK;
420
+ float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31));
421
+ int pixel0 = px + p.width * (py + p.height * pz);
422
+ int pixel1 = pixel0 + (d ? p.width : 1);
423
+ int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1;
424
+ if (tri1)
425
+ {
426
+ px += 1 - d;
427
+ py += d;
428
+ }
429
+
430
+ // Bail out if triangle index is corrupt.
431
+ bool triFail = (tri < 0 || tri >= p.numTriangles);
432
+ amask = __ballot_sync(amask, !triFail);
433
+ if (triFail)
434
+ continue;
435
+
436
+ // Outgoing color gradients.
437
+ float* pGrad0 = p.gradColor + pixel0 * p.channels;
438
+ float* pGrad1 = p.gradColor + pixel1 * p.channels;
439
+
440
+ // Incoming color gradients.
441
+ const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
442
+
443
+ // Position gradient weight based on colors and incoming gradients.
444
+ float dd = 0.f;
445
+ const float* pColor0 = p.color + pixel0 * p.channels;
446
+ const float* pColor1 = p.color + pixel1 * p.channels;
447
+
448
+ // Loop over channels and accumulate.
449
+ for (int i=0; i < p.channels; i++)
450
+ {
451
+ float dy = pDy[i];
452
+ if (dy != 0.f)
453
+ {
454
+ // Update position gradient weight.
455
+ dd += dy * (pColor1[i] - pColor0[i]);
456
+
457
+ // Update color gradients. No coalescing because all have different targets.
458
+ float v = alpha * dy;
459
+ atomicAdd(&pGrad0[i], -v);
460
+ atomicAdd(&pGrad1[i], v);
461
+ }
462
+ }
463
+
464
+ // If position weight is zero, skip the rest.
465
+ bool noGrad = (dd == 0.f);
466
+ amask = __ballot_sync(amask, !noGrad);
467
+ if (noGrad)
468
+ continue;
469
+
470
+ // Fetch vertex indices of the active edge and their positions.
471
+ int i1 = (di < 2) ? (di + 1) : 0;
472
+ int i2 = (i1 < 2) ? (i1 + 1) : 0;
473
+ int vi1 = p.tri[3 * tri + i1];
474
+ int vi2 = p.tri[3 * tri + i2];
475
+
476
+ // Bail out if vertex indices are corrupt.
477
+ bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices);
478
+ amask = __ballot_sync(amask, !vtxFail);
479
+ if (vtxFail)
480
+ continue;
481
+
482
+ // Instance mode: Adjust vertex indices based on minibatch index.
483
+ if (p.instance_mode)
484
+ {
485
+ vi1 += pz * p.numVertices;
486
+ vi2 += pz * p.numVertices;
487
+ }
488
+
489
+ // Fetch vertex positions.
490
+ float4 p1 = ((float4*)p.pos)[vi1];
491
+ float4 p2 = ((float4*)p.pos)[vi2];
492
+
493
+ // Project vertices to pixel space.
494
+ float pxh = p.xh;
495
+ float pyh = p.yh;
496
+ float fx = (float)px + .5f - pxh;
497
+ float fy = (float)py + .5f - pyh;
498
+
499
+ // XY flip for horizontal edges.
500
+ if (d)
501
+ {
502
+ swap(p1.x, p1.y);
503
+ swap(p2.x, p2.y);
504
+ swap(pxh, pyh);
505
+ swap(fx, fy);
506
+ }
507
+
508
+ // Gradient calculation setup.
509
+ float w1 = 1.f / p1.w;
510
+ float w2 = 1.f / p2.w;
511
+ float x1 = p1.x * w1 * pxh - fx;
512
+ float y1 = p1.y * w1 * pyh - fy;
513
+ float x2 = p2.x * w2 * pxh - fx;
514
+ float y2 = p2.y * w2 * pyh - fy;
515
+ float dx = x2 - x1;
516
+ float dy = y2 - y1;
517
+ float db = x1*dy - y1*dx;
518
+
519
+ // Compute inverse delta-y with epsilon.
520
+ float ep = copysignf(1e-3f, dy); // ~1/1000 pixel.
521
+ float iy = 1.f / (dy + ep);
522
+
523
+ // Compute position gradients.
524
+ float dby = db * iy;
525
+ float iw1 = -w1 * iy * dd;
526
+ float iw2 = w2 * iy * dd;
527
+ float gp1x = iw1 * pxh * y2;
528
+ float gp2x = iw2 * pxh * y1;
529
+ float gp1y = iw1 * pyh * (dby - x2);
530
+ float gp2y = iw2 * pyh * (dby - x1);
531
+ float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1;
532
+ float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2;
533
+
534
+ // XY flip the gradients.
535
+ if (d)
536
+ {
537
+ swap(gp1x, gp1y);
538
+ swap(gp2x, gp2y);
539
+ }
540
+
541
+ // Kill position gradients if alpha was saturated.
542
+ if (fabsf(alpha) >= 0.5f)
543
+ {
544
+ gp1x = gp1y = gp1w = 0.f;
545
+ gp2x = gp2y = gp2w = 0.f;
546
+ }
547
+
548
+ // Initialize coalesced atomics. Match both triangle ID and edge index.
549
+ // Also note that some threads may be inactive.
550
+ CA_SET_GROUP_MASK(tri ^ (di << 30), amask);
551
+
552
+ // Accumulate gradients.
553
+ caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w);
554
+ caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w);
555
+ }
556
+ }
557
+
558
+ //------------------------------------------------------------------------
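The edge-vertex hash above (`HashIndex`, `hash_insert`, `hash_find`, `evhash_*`) is the least obvious part of the forward pass, so here is a small single-threaded Python model of the same idea: canonical edge keys, a key-derived start index and odd skip for linear probing, and up to two "opposite vertex" slots per edge. This is an illustrative sketch only; the CUDA version derives its probe sequence from a Jenkins mix and uses atomics for concurrent inserts.

```python
# Single-threaded Python model of the edge -> opposite-vertex hash used above (illustrative only).

def probe_sequence(key, table_size):
    """Odd-skip linear probing: start index and skip are derived from the key, skip forced odd."""
    mask = table_size - 1                        # table_size is a power of two
    idx = hash((key, 0)) & mask                  # stand-in for the Jenkins mix of the key bits
    skip = (hash((key, 1)) & mask) | 1           # key-dependent skip; odd so every slot is reachable
    while True:
        yield idx
        idx = (idx + skip) & mask

def insert_edge(table, va, vb, vn):
    """Record vn as a vertex opposite edge (va, vb); each edge keeps at most two opposite vertices."""
    key = (min(va, vb), max(va, vb))             # canonical vertex order, as in evhash_insert_vertex
    for idx in probe_sequence(key, len(table)):
        if table[idx] is None:
            table[idx] = (key, [])
        if table[idx][0] == key:
            slots = table[idx][1]
            if vn not in slots and len(slots) < 2:
                slots.append(vn)
            return                               # otherwise keep probing past a foreign key

def find_opposite(table, va, vb, vr):
    """Vertex opposite edge (va, vb) on the other triangle than the one containing vr, or -1."""
    key = (min(va, vb), max(va, vb))
    for idx in probe_sequence(key, len(table)):
        if table[idx] is None:
            return -1                            # edge never inserted -> treat as silhouette
        if table[idx][0] == key:
            others = [v for v in table[idx][1] if v != vr]
            return others[0] if others else -1

# Two triangles sharing edge (1, 2); their opposite vertices are 0 and 3.
table = [None] * 16
for a, b, c in [(0, 1, 2), (2, 1, 3)]:
    insert_edge(table, b, c, a); insert_edge(table, c, a, b); insert_edge(table, a, b, c)
print(find_opposite(table, 1, 2, 0))             # -> 3 (interior edge, neighbor's opposite vertex)
print(find_opposite(table, 0, 1, 2))             # -> -1 (boundary edge, only one triangle)
```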
extensions/nvdiffrast/nvdiffrast/common/antialias.h ADDED
@@ -0,0 +1,50 @@
1
+ // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+ #include "common.h"
11
+
12
+ //------------------------------------------------------------------------
13
+ // Constants and helpers.
14
+
15
+ #define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32
16
+ #define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8
17
+ #define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256
18
+ #define AA_MESH_KERNEL_THREADS_PER_BLOCK 256
19
+ #define AA_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 24) ? 4 : 8) // With more than 16777216 triangles (alloc >= 33554432) use smallest possible value of 4 to conserve memory, otherwise use 8 for fewer collisions.
20
+ #define AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 24) ? 2 : 3)
21
+ #define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256
22
+
23
+ //------------------------------------------------------------------------
24
+ // CUDA kernel params.
25
+
26
+ struct AntialiasKernelParams
27
+ {
28
+ const float* color; // Incoming color buffer.
29
+ const float* rasterOut; // Incoming rasterizer output buffer.
30
+ const int* tri; // Incoming triangle buffer.
31
+ const float* pos; // Incoming position buffer.
32
+ float* output; // Output buffer of forward kernel.
33
+ const float* dy; // Incoming gradients.
34
+ float* gradColor; // Output buffer, color gradient.
35
+ float* gradPos; // Output buffer, position gradient.
36
+ int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters.
37
+ uint4* evHash; // Edge-vertex hash.
38
+ int allocTriangles; // Number of triangles accommodated by evHash. Always power of two.
39
+ int numTriangles; // Number of triangles.
40
+ int numVertices; // Number of vertices.
41
+ int width; // Input width.
42
+ int height; // Input height.
43
+ int n; // Minibatch size.
44
+ int channels; // Channel count in color input.
45
+ float xh, yh; // Transfer to pixel space.
46
+ int instance_mode; // 0=normal, 1=instance mode.
47
+ int tri_const; // 1 if triangle array is known to be constant.
48
+ };
49
+
50
+ //------------------------------------------------------------------------
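The two `AA_*HASH_ELEMENTS_PER_TRIANGLE` macros and the `allocTriangles` field encode a simple sizing rule for `evHash`: the triangle capacity is a power of two, and each triangle gets 8 hash entries for small meshes but only 4 once the allocation crosses the 16.7M-triangle mark. A short Python illustration, assuming the host-side setup (not part of this diff) rounds the triangle count up to the next power of two:

```python
# Illustrative sizing of the edge-vertex hash, following the macros above (host-side rounding assumed).
def ev_hash_entries(num_triangles: int) -> int:
    alloc = 1
    while alloc < num_triangles:                # assumed: round triangle count up to a power of two
        alloc <<= 1
    per_tri = 4 if alloc >= 33_554_432 else 8   # threshold from the macro comment (alloc >= 33554432)
    return alloc * per_tri                      # number of uint4 slots in evHash

print(ev_hash_entries(100_000))       # -> 1_048_576   (131072 triangles * 8 entries)
print(ev_hash_entries(40_000_000))    # -> 268_435_456 (67108864 triangles * 4 entries)
```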
extensions/nvdiffrast/nvdiffrast/common/common.cpp ADDED
@@ -0,0 +1,60 @@
1
+ // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include <cuda_runtime.h>
10
+
11
+ //------------------------------------------------------------------------
12
+ // Block and grid size calculators for kernel launches.
13
+
14
+ dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height)
15
+ {
16
+ int maxThreads = maxWidth * maxHeight;
17
+ if (maxThreads <= 1 || (width * height) <= 1)
18
+ return dim3(1, 1, 1); // Degenerate.
19
+
20
+ // Start from max size.
21
+ int bw = maxWidth;
22
+ int bh = maxHeight;
23
+
24
+ // Optimizations for weirdly sized buffers.
25
+ if (width < bw)
26
+ {
27
+ // Decrease block width to smallest power of two that covers the buffer width.
28
+ while ((bw >> 1) >= width)
29
+ bw >>= 1;
30
+
31
+ // Maximize height.
32
+ bh = maxThreads / bw;
33
+ if (bh > height)
34
+ bh = height;
35
+ }
36
+ else if (height < bh)
37
+ {
38
+ // Halve height and double width until fits completely inside buffer vertically.
39
+ while (bh > height)
40
+ {
41
+ bh >>= 1;
42
+ if (bw < width)
43
+ bw <<= 1;
44
+ }
45
+ }
46
+
47
+ // Done.
48
+ return dim3(bw, bh, 1);
49
+ }
50
+
51
+ dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth)
52
+ {
53
+ dim3 gridSize;
54
+ gridSize.x = (width - 1) / blockSize.x + 1;
55
+ gridSize.y = (height - 1) / blockSize.y + 1;
56
+ gridSize.z = (depth - 1) / blockSize.z + 1;
57
+ return gridSize;
58
+ }
59
+
60
+ //------------------------------------------------------------------------
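`getLaunchGridSize` is just per-axis ceiling division over the block shape chosen by `getLaunchBlockSize`. A quick illustrative check of that formula in Python:

```python
# Per-axis ceiling division, as in getLaunchGridSize above (illustrative check).
def grid_size(block, extent):
    """Blocks per axis: ceil(extent / block), written as (extent - 1) // block + 1."""
    return tuple((e - 1) // b + 1 for b, e in zip(block, extent))

print(grid_size((32, 8, 1), (1920, 1080, 4)))   # -> (60, 135, 4)
```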
extensions/nvdiffrast/nvdiffrast/common/common.h ADDED
@@ -0,0 +1,263 @@
1
+ // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+ #include <cuda.h>
11
+ #include <stdint.h>
12
+
13
+ //------------------------------------------------------------------------
14
+ // C++ helper function prototypes.
15
+
16
+ dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height);
17
+ dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth);
18
+
19
+ //------------------------------------------------------------------------
20
+ // The rest is CUDA device code specific stuff.
21
+
22
+ #ifdef __CUDACC__
23
+
24
+ //------------------------------------------------------------------------
25
+ // Helpers for CUDA vector types.
26
+
27
+ static __device__ __forceinline__ float2& operator*= (float2& a, const float2& b) { a.x *= b.x; a.y *= b.y; return a; }
28
+ static __device__ __forceinline__ float2& operator+= (float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; }
29
+ static __device__ __forceinline__ float2& operator-= (float2& a, const float2& b) { a.x -= b.x; a.y -= b.y; return a; }
30
+ static __device__ __forceinline__ float2& operator*= (float2& a, float b) { a.x *= b; a.y *= b; return a; }
31
+ static __device__ __forceinline__ float2& operator+= (float2& a, float b) { a.x += b; a.y += b; return a; }
32
+ static __device__ __forceinline__ float2& operator-= (float2& a, float b) { a.x -= b; a.y -= b; return a; }
33
+ static __device__ __forceinline__ float2 operator* (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); }
34
+ static __device__ __forceinline__ float2 operator+ (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); }
35
+ static __device__ __forceinline__ float2 operator- (const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); }
36
+ static __device__ __forceinline__ float2 operator* (const float2& a, float b) { return make_float2(a.x * b, a.y * b); }
37
+ static __device__ __forceinline__ float2 operator+ (const float2& a, float b) { return make_float2(a.x + b, a.y + b); }
38
+ static __device__ __forceinline__ float2 operator- (const float2& a, float b) { return make_float2(a.x - b, a.y - b); }
39
+ static __device__ __forceinline__ float2 operator* (float a, const float2& b) { return make_float2(a * b.x, a * b.y); }
40
+ static __device__ __forceinline__ float2 operator+ (float a, const float2& b) { return make_float2(a + b.x, a + b.y); }
41
+ static __device__ __forceinline__ float2 operator- (float a, const float2& b) { return make_float2(a - b.x, a - b.y); }
42
+ static __device__ __forceinline__ float2 operator- (const float2& a) { return make_float2(-a.x, -a.y); }
+ static __device__ __forceinline__ float3& operator*= (float3& a, const float3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
+ static __device__ __forceinline__ float3& operator+= (float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+ static __device__ __forceinline__ float3& operator-= (float3& a, const float3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+ static __device__ __forceinline__ float3& operator*= (float3& a, float b) { a.x *= b; a.y *= b; a.z *= b; return a; }
+ static __device__ __forceinline__ float3& operator+= (float3& a, float b) { a.x += b; a.y += b; a.z += b; return a; }
+ static __device__ __forceinline__ float3& operator-= (float3& a, float b) { a.x -= b; a.y -= b; a.z -= b; return a; }
+ static __device__ __forceinline__ float3 operator* (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
+ static __device__ __forceinline__ float3 operator+ (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
+ static __device__ __forceinline__ float3 operator- (const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
+ static __device__ __forceinline__ float3 operator* (const float3& a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); }
+ static __device__ __forceinline__ float3 operator+ (const float3& a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); }
+ static __device__ __forceinline__ float3 operator- (const float3& a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); }
+ static __device__ __forceinline__ float3 operator* (float a, const float3& b) { return make_float3(a * b.x, a * b.y, a * b.z); }
+ static __device__ __forceinline__ float3 operator+ (float a, const float3& b) { return make_float3(a + b.x, a + b.y, a + b.z); }
+ static __device__ __forceinline__ float3 operator- (float a, const float3& b) { return make_float3(a - b.x, a - b.y, a - b.z); }
+ static __device__ __forceinline__ float3 operator- (const float3& a) { return make_float3(-a.x, -a.y, -a.z); }
+ static __device__ __forceinline__ float4& operator*= (float4& a, const float4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
+ static __device__ __forceinline__ float4& operator+= (float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+ static __device__ __forceinline__ float4& operator-= (float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+ static __device__ __forceinline__ float4& operator*= (float4& a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
+ static __device__ __forceinline__ float4& operator+= (float4& a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
+ static __device__ __forceinline__ float4& operator-= (float4& a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
+ static __device__ __forceinline__ float4 operator* (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+ static __device__ __forceinline__ float4 operator+ (const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+ static __device__ __forceinline__ float4 operator- (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+ static __device__ __forceinline__ float4 operator* (const float4& a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); }
+ static __device__ __forceinline__ float4 operator+ (const float4& a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); }
+ static __device__ __forceinline__ float4 operator- (const float4& a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); }
+ static __device__ __forceinline__ float4 operator* (float a, const float4& b) { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); }
+ static __device__ __forceinline__ float4 operator+ (float a, const float4& b) { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); }
+ static __device__ __forceinline__ float4 operator- (float a, const float4& b) { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); }
+ static __device__ __forceinline__ float4 operator- (const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); }
+ static __device__ __forceinline__ int2& operator*= (int2& a, const int2& b) { a.x *= b.x; a.y *= b.y; return a; }
+ static __device__ __forceinline__ int2& operator+= (int2& a, const int2& b) { a.x += b.x; a.y += b.y; return a; }
+ static __device__ __forceinline__ int2& operator-= (int2& a, const int2& b) { a.x -= b.x; a.y -= b.y; return a; }
+ static __device__ __forceinline__ int2& operator*= (int2& a, int b) { a.x *= b; a.y *= b; return a; }
+ static __device__ __forceinline__ int2& operator+= (int2& a, int b) { a.x += b; a.y += b; return a; }
+ static __device__ __forceinline__ int2& operator-= (int2& a, int b) { a.x -= b; a.y -= b; return a; }
+ static __device__ __forceinline__ int2 operator* (const int2& a, const int2& b) { return make_int2(a.x * b.x, a.y * b.y); }
+ static __device__ __forceinline__ int2 operator+ (const int2& a, const int2& b) { return make_int2(a.x + b.x, a.y + b.y); }
+ static __device__ __forceinline__ int2 operator- (const int2& a, const int2& b) { return make_int2(a.x - b.x, a.y - b.y); }
+ static __device__ __forceinline__ int2 operator* (const int2& a, int b) { return make_int2(a.x * b, a.y * b); }
+ static __device__ __forceinline__ int2 operator+ (const int2& a, int b) { return make_int2(a.x + b, a.y + b); }
+ static __device__ __forceinline__ int2 operator- (const int2& a, int b) { return make_int2(a.x - b, a.y - b); }
+ static __device__ __forceinline__ int2 operator* (int a, const int2& b) { return make_int2(a * b.x, a * b.y); }
+ static __device__ __forceinline__ int2 operator+ (int a, const int2& b) { return make_int2(a + b.x, a + b.y); }
+ static __device__ __forceinline__ int2 operator- (int a, const int2& b) { return make_int2(a - b.x, a - b.y); }
+ static __device__ __forceinline__ int2 operator- (const int2& a) { return make_int2(-a.x, -a.y); }
+ static __device__ __forceinline__ int3& operator*= (int3& a, const int3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
+ static __device__ __forceinline__ int3& operator+= (int3& a, const int3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+ static __device__ __forceinline__ int3& operator-= (int3& a, const int3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+ static __device__ __forceinline__ int3& operator*= (int3& a, int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
+ static __device__ __forceinline__ int3& operator+= (int3& a, int b) { a.x += b; a.y += b; a.z += b; return a; }
+ static __device__ __forceinline__ int3& operator-= (int3& a, int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
+ static __device__ __forceinline__ int3 operator* (const int3& a, const int3& b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); }
+ static __device__ __forceinline__ int3 operator+ (const int3& a, const int3& b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); }
+ static __device__ __forceinline__ int3 operator- (const int3& a, const int3& b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); }
+ static __device__ __forceinline__ int3 operator* (const int3& a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); }
+ static __device__ __forceinline__ int3 operator+ (const int3& a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); }
+ static __device__ __forceinline__ int3 operator- (const int3& a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); }
+ static __device__ __forceinline__ int3 operator* (int a, const int3& b) { return make_int3(a * b.x, a * b.y, a * b.z); }
+ static __device__ __forceinline__ int3 operator+ (int a, const int3& b) { return make_int3(a + b.x, a + b.y, a + b.z); }
+ static __device__ __forceinline__ int3 operator- (int a, const int3& b) { return make_int3(a - b.x, a - b.y, a - b.z); }
+ static __device__ __forceinline__ int3 operator- (const int3& a) { return make_int3(-a.x, -a.y, -a.z); }
+ static __device__ __forceinline__ int4& operator*= (int4& a, const int4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
+ static __device__ __forceinline__ int4& operator+= (int4& a, const int4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+ static __device__ __forceinline__ int4& operator-= (int4& a, const int4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+ static __device__ __forceinline__ int4& operator*= (int4& a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
+ static __device__ __forceinline__ int4& operator+= (int4& a, int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
+ static __device__ __forceinline__ int4& operator-= (int4& a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
+ static __device__ __forceinline__ int4 operator* (const int4& a, const int4& b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+ static __device__ __forceinline__ int4 operator+ (const int4& a, const int4& b) { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+ static __device__ __forceinline__ int4 operator- (const int4& a, const int4& b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+ static __device__ __forceinline__ int4 operator* (const int4& a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); }
+ static __device__ __forceinline__ int4 operator+ (const int4& a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); }
+ static __device__ __forceinline__ int4 operator- (const int4& a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); }
+ static __device__ __forceinline__ int4 operator* (int a, const int4& b) { return make_int4(a * b.x, a * b.y, a * b.z, a * b.w); }
+ static __device__ __forceinline__ int4 operator+ (int a, const int4& b) { return make_int4(a + b.x, a + b.y, a + b.z, a + b.w); }
+ static __device__ __forceinline__ int4 operator- (int a, const int4& b) { return make_int4(a - b.x, a - b.y, a - b.z, a - b.w); }
+ static __device__ __forceinline__ int4 operator- (const int4& a) { return make_int4(-a.x, -a.y, -a.z, -a.w); }
+ static __device__ __forceinline__ uint2& operator*= (uint2& a, const uint2& b) { a.x *= b.x; a.y *= b.y; return a; }
+ static __device__ __forceinline__ uint2& operator+= (uint2& a, const uint2& b) { a.x += b.x; a.y += b.y; return a; }
+ static __device__ __forceinline__ uint2& operator-= (uint2& a, const uint2& b) { a.x -= b.x; a.y -= b.y; return a; }
+ static __device__ __forceinline__ uint2& operator*= (uint2& a, unsigned int b) { a.x *= b; a.y *= b; return a; }
+ static __device__ __forceinline__ uint2& operator+= (uint2& a, unsigned int b) { a.x += b; a.y += b; return a; }
+ static __device__ __forceinline__ uint2& operator-= (uint2& a, unsigned int b) { a.x -= b; a.y -= b; return a; }
+ static __device__ __forceinline__ uint2 operator* (const uint2& a, const uint2& b) { return make_uint2(a.x * b.x, a.y * b.y); }
+ static __device__ __forceinline__ uint2 operator+ (const uint2& a, const uint2& b) { return make_uint2(a.x + b.x, a.y + b.y); }
+ static __device__ __forceinline__ uint2 operator- (const uint2& a, const uint2& b) { return make_uint2(a.x - b.x, a.y - b.y); }
+ static __device__ __forceinline__ uint2 operator* (const uint2& a, unsigned int b) { return make_uint2(a.x * b, a.y * b); }
+ static __device__ __forceinline__ uint2 operator+ (const uint2& a, unsigned int b) { return make_uint2(a.x + b, a.y + b); }
+ static __device__ __forceinline__ uint2 operator- (const uint2& a, unsigned int b) { return make_uint2(a.x - b, a.y - b); }
+ static __device__ __forceinline__ uint2 operator* (unsigned int a, const uint2& b) { return make_uint2(a * b.x, a * b.y); }
+ static __device__ __forceinline__ uint2 operator+ (unsigned int a, const uint2& b) { return make_uint2(a + b.x, a + b.y); }
+ static __device__ __forceinline__ uint2 operator- (unsigned int a, const uint2& b) { return make_uint2(a - b.x, a - b.y); }
+ static __device__ __forceinline__ uint3& operator*= (uint3& a, const uint3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
+ static __device__ __forceinline__ uint3& operator+= (uint3& a, const uint3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+ static __device__ __forceinline__ uint3& operator-= (uint3& a, const uint3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+ static __device__ __forceinline__ uint3& operator*= (uint3& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
+ static __device__ __forceinline__ uint3& operator+= (uint3& a, unsigned int b) { a.x += b; a.y += b; a.z += b; return a; }
+ static __device__ __forceinline__ uint3& operator-= (uint3& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
+ static __device__ __forceinline__ uint3 operator* (const uint3& a, const uint3& b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); }
+ static __device__ __forceinline__ uint3 operator+ (const uint3& a, const uint3& b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); }
+ static __device__ __forceinline__ uint3 operator- (const uint3& a, const uint3& b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); }
+ static __device__ __forceinline__ uint3 operator* (const uint3& a, unsigned int b) { return make_uint3(a.x * b, a.y * b, a.z * b); }
+ static __device__ __forceinline__ uint3 operator+ (const uint3& a, unsigned int b) { return make_uint3(a.x + b, a.y + b, a.z + b); }
+ static __device__ __forceinline__ uint3 operator- (const uint3& a, unsigned int b) { return make_uint3(a.x - b, a.y - b, a.z - b); }
+ static __device__ __forceinline__ uint3 operator* (unsigned int a, const uint3& b) { return make_uint3(a * b.x, a * b.y, a * b.z); }
+ static __device__ __forceinline__ uint3 operator+ (unsigned int a, const uint3& b) { return make_uint3(a + b.x, a + b.y, a + b.z); }
+ static __device__ __forceinline__ uint3 operator- (unsigned int a, const uint3& b) { return make_uint3(a - b.x, a - b.y, a - b.z); }
+ static __device__ __forceinline__ uint4& operator*= (uint4& a, const uint4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
+ static __device__ __forceinline__ uint4& operator+= (uint4& a, const uint4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+ static __device__ __forceinline__ uint4& operator-= (uint4& a, const uint4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+ static __device__ __forceinline__ uint4& operator*= (uint4& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
+ static __device__ __forceinline__ uint4& operator+= (uint4& a, unsigned int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
+ static __device__ __forceinline__ uint4& operator-= (uint4& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
+ static __device__ __forceinline__ uint4 operator* (const uint4& a, const uint4& b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+ static __device__ __forceinline__ uint4 operator+ (const uint4& a, const uint4& b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+ static __device__ __forceinline__ uint4 operator- (const uint4& a, const uint4& b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+ static __device__ __forceinline__ uint4 operator* (const uint4& a, unsigned int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); }
+ static __device__ __forceinline__ uint4 operator+ (const uint4& a, unsigned int b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); }
+ static __device__ __forceinline__ uint4 operator- (const uint4& a, unsigned int b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); }
+ static __device__ __forceinline__ uint4 operator* (unsigned int a, const uint4& b) { return make_uint4(a * b.x, a * b.y, a * b.z, a * b.w); }
+ static __device__ __forceinline__ uint4 operator+ (unsigned int a, const uint4& b) { return make_uint4(a + b.x, a + b.y, a + b.z, a + b.w); }
+ static __device__ __forceinline__ uint4 operator- (unsigned int a, const uint4& b) { return make_uint4(a - b.x, a - b.y, a - b.z, a - b.w); }
+
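For context, a brief hypothetical example (not part of this diff) of the element-wise operators defined above; the kernel and buffer names are illustrative only.

    // Per-element lerp between two float3 buffers using the overloads above:
    // operator- (float3, float3), operator* (float, float3), operator+ (float3, float3).
    __global__ void lerpExample(float3* out, const float3* a, const float3* b, float t, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            out[i] = a[i] + t * (b[i] - a[i]);
    }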
+ template<class T> static __device__ __forceinline__ T zero_value(void);
+ template<> __device__ __forceinline__ float zero_value<float> (void) { return 0.f; }
+ template<> __device__ __forceinline__ float2 zero_value<float2>(void) { return make_float2(0.f, 0.f); }
+ template<> __device__ __forceinline__ float4 zero_value<float4>(void) { return make_float4(0.f, 0.f, 0.f, 0.f); }
+ static __device__ __forceinline__ float3 make_float3(const float2& a, float b) { return make_float3(a.x, a.y, b); }
+ static __device__ __forceinline__ float4 make_float4(const float3& a, float b) { return make_float4(a.x, a.y, a.z, b); }
+ static __device__ __forceinline__ float4 make_float4(const float2& a, const float2& b) { return make_float4(a.x, a.y, b.x, b.y); }
+ static __device__ __forceinline__ int3 make_int3(const int2& a, int b) { return make_int3(a.x, a.y, b); }
+ static __device__ __forceinline__ int4 make_int4(const int3& a, int b) { return make_int4(a.x, a.y, a.z, b); }
+ static __device__ __forceinline__ int4 make_int4(const int2& a, const int2& b) { return make_int4(a.x, a.y, b.x, b.y); }
+ static __device__ __forceinline__ uint3 make_uint3(const uint2& a, unsigned int b) { return make_uint3(a.x, a.y, b); }
+ static __device__ __forceinline__ uint4 make_uint4(const uint3& a, unsigned int b) { return make_uint4(a.x, a.y, a.z, b); }
+ static __device__ __forceinline__ uint4 make_uint4(const uint2& a, const uint2& b) { return make_uint4(a.x, a.y, b.x, b.y); }
+
+ template<class T> static __device__ __forceinline__ void swap(T& a, T& b) { T temp = a; a = b; b = temp; }
+
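As a hypothetical illustration (not part of this diff) of the helpers just above: make_float4(const float3&, float) builds a homogeneous point, and the zero_value<> specializations provide a typed zero for templated accumulation. Note that zero_value is only specialized for float, float2 and float4 here; all names below are illustrative.

    static __device__ __forceinline__ float4 toHomogeneous(const float3& p)
    {
        return make_float4(p, 1.f);      // promote xyz to a homogeneous xyzw point
    }

    template<class T>
    static __device__ __forceinline__ T sumPair(const T& a, const T& b)
    {
        T acc = zero_value<T>();         // T = float, float2, or float4
        acc += a;                        // relies on the operator+= overloads above
        acc += b;
        return acc;
    }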
+ //------------------------------------------------------------------------
+ // Triangle ID <-> float32 conversion functions to support very large triangle IDs.
+ //
+ // Values up to and including 16777216 (also, negative values) are converted trivially and retain
+ // compatibility with previous versions. Larger values are mapped to unique float32 values that are not equal
+ // to the ID. The largest value that converts to float32 and back without generating inf or nan is 889192447.
+
+ static __device__ __forceinline__ int float_to_triidx(float x) { if (x <= 16777216.f) return (int)x; return __float_as_int(x) - 0x4a800000; }
+ static __device__ __forceinline__ float triidx_to_float(int x) { if (x <= 0x01000000) return (float)x; return __int_as_float(0x4a800000 + x); }
+
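A small hypothetical sanity check (not part of this diff) of the round trip described in the comment above: an ID is stored in a float buffer via triidx_to_float and recovered with float_to_triidx; the two are inverses for IDs from 0 up to 889192447.

    static __device__ __forceinline__ bool triidxRoundTripOk(int id)
    {
        float stored = triidx_to_float(id);      // value written to a float32 buffer
        int   back   = float_to_triidx(stored);  // value a later pass reads back
        return back == id;                       // holds for 0 <= id <= 889192447
    }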
+ //------------------------------------------------------------------------
+ // Coalesced atomics. These are all done via macros.
+
+ #if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
+
+ #define CA_TEMP _ca_temp
+ #define CA_TEMP_PARAM float* CA_TEMP
+ #define CA_DECLARE_TEMP(threads_per_block) \
+     __shared__ float CA_TEMP[(threads_per_block)]
+
+ #define CA_SET_GROUP_MASK(group, thread_mask) \
+     bool _ca_leader; \
+     float* _ca_ptr; \
+     do { \
+         int tidx = threadIdx.x + blockDim.x * threadIdx.y; \
+         int lane = tidx & 31; \
+         int warp = tidx >> 5; \
+         int tmask = __match_any_sync((thread_mask), (group)); \
+         int leader = __ffs(tmask) - 1; \
+         _ca_leader = (leader == lane); \
+         _ca_ptr = &_ca_temp[((warp << 5) + leader)]; \
+     } while(0)
+
+ #define CA_SET_GROUP(group) \
+     CA_SET_GROUP_MASK((group), 0xffffffffu)
+
+ #define caAtomicAdd(ptr, value) \
+     do { \
+         if (_ca_leader) \
+             *_ca_ptr = 0.f; \
+         atomicAdd(_ca_ptr, (value)); \
+         if (_ca_leader) \
+             atomicAdd((ptr), *_ca_ptr); \
+     } while(0)
+
+ #define caAtomicAdd3_xyw(ptr, x, y, w) \
+     do { \
+         caAtomicAdd((ptr), (x)); \
+         caAtomicAdd((ptr)+1, (y)); \
+         caAtomicAdd((ptr)+3, (w)); \
+     } while(0)
+
+ #define caAtomicAddTexture(ptr, level, idx, value) \
+     do { \
+         CA_SET_GROUP((idx) ^ ((level) << 27)); \
+         caAtomicAdd((ptr)+(idx), (value)); \
+     } while(0)
+
+ //------------------------------------------------------------------------
+ // Disable atomic coalescing for compute capability lower than 7.x
+
+ #else // __CUDA_ARCH__ >= 700
+ #define CA_TEMP _ca_temp
+ #define CA_TEMP_PARAM float CA_TEMP
+ #define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
+ #define CA_SET_GROUP_MASK(group, thread_mask)
+ #define CA_SET_GROUP(group)
+ #define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
+ #define caAtomicAdd3_xyw(ptr, x, y, w) \
+     do { \
+         atomicAdd((ptr), (x)); \
+         atomicAdd((ptr)+1, (y)); \
+         atomicAdd((ptr)+3, (w)); \
+     } while(0)
+ #define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
+ #endif // __CUDA_ARCH__ >= 700
+
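For context, a minimal hypothetical sketch (not part of this diff) of how the macros above compose inside a kernel: the per-block scratch is declared once, threads are grouped by the slot they target, and the grouped add issues one global atomic per group. The kernel, buffers, and the fixed 256-thread block are assumptions for illustration; note that with the full-warp mask used by CA_SET_GROUP, every lane of the warp must reach the macro, so out-of-range lanes contribute zero instead of returning early.

    __global__ void scatterAddExample(float* grad, const int* targetIdx, const float* value, int n)
    {
        CA_DECLARE_TEMP(256);                        // shared scratch; launch with 256 threads per block
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        bool valid = (i < n);
        int   idx = valid ? targetIdx[i] : 0;        // keep all lanes participating in the warp match
        float val = valid ? value[i]     : 0.f;      // ...while out-of-range lanes add zero
        CA_SET_GROUP(idx);                           // group lanes of this warp that hit the same slot
        caAtomicAdd(grad + idx, val);                // one global atomicAdd per group instead of per lane
    }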
+ //------------------------------------------------------------------------
+ #endif // __CUDACC__