Yuxiao319 committed on
Commit 5ac6f37 · 1 Parent(s): 8ca9794

wonder3d_plus

Files changed (1)
  1. gradio_app.py +39 -18
gradio_app.py CHANGED
@@ -23,14 +23,17 @@ from typing import Dict, Optional, Tuple, List
 from dataclasses import dataclass
 import huggingface_hub
 from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
-from mvdiffusion.models.unet_mv2d_condition import UNetMV2DConditionModel
-from mvdiffusion.data.single_image_dataset import SingleImageDataset as MVDiffusionDataset
-from mvdiffusion.pipelines.pipeline_mvdiffusion_image import MVDiffusionImagePipeline
+from mv_diffusion_30.models.unet_mv2d_condition import UNetMV2DConditionModel
+from mv_diffusion_30.data.single_image_dataset import SingleImageDataset as MVDiffusionDataset
+from mv_diffusion_30.pipelines.pipeline_mvdiffusion_image import MVDiffusionImagePipeline
 from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler
 from einops import rearrange
 import numpy as np
 from transformers import SamModel, SamProcessor
 
+
+
+
 def save_image(tensor):
     ndarr = tensor.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
     # pdb.set_trace()
@@ -48,6 +51,7 @@ Generate consistent multi-view normals maps and color images.
 <div>
 The demo does not include the mesh reconstruction part, please visit <a href="https://github.com/xxlong0/Wonder3D/">our github repo</a> to get a textured mesh.
 </div>
+<span style="font-weight: bold; color: #d9534f;">- 2024.11.5 We shift our ckpt to the a more powerful model [Wonder3D_Plus] that supports both orthogonal and perspective camera settings and further improves generalizability.</span>
 '''
 _GPU_ID = 0
 
@@ -57,30 +61,34 @@ if not hasattr(Image, 'Resampling'):
 
 
 def sam_init():
-    model = SamModel.from_pretrained("facebook/sam-vit-large").to("cuda")
-    processor = SamProcessor.from_pretrained("facebook/sam-vit-large")
+    model = SamModel.from_pretrained("facebook/sam-vit-huge").to("cuda")
+    processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
     return model, processor
 
 def sam_segment(sam_model, sam_processor, input_image, *bbox_coords):
+    input_points = [[[bbox_coords[2] - bbox_coords[0], bbox_coords[3] - bbox_coords[1]]]]
     bbox = torch.tensor(bbox_coords, dtype=torch.float32)
     bbox = bbox.unsqueeze(0).unsqueeze(0)
     image = np.asarray(input_image)
 
     start_time = time.time()
 
-    inputs = sam_processor(input_image.convert('RGB'), input_boxes=bbox, return_tensors="pt", do_resize=False).to("cuda")
+    inputs = sam_processor(input_image, input_boxes=bbox, return_tensors="pt", do_resize=False).to("cuda")
 
     outputs = sam_model(**inputs, multimask_output=False)
-    masks = sam_processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+    masks = sam_processor.image_processor.post_process_masks(outputs.pred_masks.cpu(),
+                                                             inputs["original_sizes"].cpu(),
+                                                             inputs["reshaped_input_sizes"].cpu(), )
 
     print(f"SAM Time: {time.time() - start_time:.3f}s")
     out_image = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8)
     out_image[:, :, :3] = image
     out_image_bbox = out_image.copy()
 
-    out_image_bbox[:, :, 3] = masks[-1].cpu().detach().numpy().astype(np.uint8) * 255
+    foreground_mask = masks[-1][-1, -1, ...] * 1.
+    out_image_bbox[:, :, 3] = foreground_mask.cpu().detach().numpy().astype(np.uint8) * 255
     torch.cuda.empty_cache()
-    return Image.fromarray(out_image_bbox, mode='RGBA')
+    return Image.fromarray(out_image_bbox, mode='RGBA')
 
 def expand2square(pil_img, background_color):
     width, height = pil_img.size
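
For reviewers who want to exercise the box-prompted segmentation path on its own, the sketch below uses the same transformers SAM API that sam_segment relies on (SamModel, SamProcessor, post_process_masks) with the new facebook/sam-vit-huge checkpoint. The image path and box coordinates are placeholders, and a CUDA device is assumed, as in the app.

# Standalone sketch of box-prompted SAM segmentation (illustrative values only).
import numpy as np
import torch
from PIL import Image
from transformers import SamModel, SamProcessor

model = SamModel.from_pretrained("facebook/sam-vit-huge").to("cuda")
processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")

input_image = Image.open("example.png").convert("RGB")   # placeholder path
box = [30, 40, 480, 500]                                  # placeholder (x0, y0, x1, y1) in pixels

# One image, one box: input_boxes has shape [batch, num_boxes, 4].
inputs = processor(input_image, input_boxes=[[box]], return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model(**inputs, multimask_output=False)

masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(),
    inputs["original_sizes"].cpu(),
    inputs["reshaped_input_sizes"].cpu(),
)
# masks[0] is [num_boxes, num_masks, H, W]; with one box and multimask_output=False
# the single predicted mask becomes the alpha channel, as in sam_segment.
alpha = masks[0][0, 0].numpy().astype(np.uint8) * 255
rgba = np.dstack([np.asarray(input_image), alpha])
Image.fromarray(rgba, mode="RGBA").save("segmented.png")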
@@ -142,7 +150,7 @@ def load_wonder3d_pipeline(cfg):
     feature_extractor = CLIPImageProcessor.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="feature_extractor", revision=cfg.revision)
     vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", revision=cfg.revision)
     unet = UNetMV2DConditionModel.from_pretrained_2d(cfg.pretrained_unet_path, subfolder="unet", revision=cfg.revision, **cfg.unet_from_pretrained_kwargs)
-    unet.enable_xformers_memory_efficient_attention()
+    # unet.enable_xformers_memory_efficient_attention()
 
     # Move text_encode and vae to gpu and cast to weight_dtype
     image_encoder.to(dtype=weight_dtype)
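
The xformers call is commented out rather than made conditional; if memory-efficient attention is still wanted on machines where xformers happens to be installed, a guarded variant such as the sketch below (not part of this commit) is a common pattern with diffusers models.

# Sketch only: enable xformers attention when available, otherwise keep defaults.
from diffusers.utils import is_xformers_available

def maybe_enable_xformers(model) -> None:
    """Call enable_xformers_memory_efficient_attention() only if xformers is installed."""
    if is_xformers_available():
        model.enable_xformers_memory_efficient_attention()

# Usage in load_wonder3d_pipeline would then be: maybe_enable_xformers(unet)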
@@ -160,24 +168,28 @@ def load_wonder3d_pipeline(cfg):
     # sys.main_lock = threading.Lock()
     return pipeline
 
-from mvdiffusion.data.single_image_dataset import SingleImageDataset
-def prepare_data(single_image, crop_size):
+from mv_diffusion_30.data.single_image_dataset import SingleImageDataset
+def prepare_data(single_image, crop_size, input_camera_type):
     dataset = SingleImageDataset(
         root_dir = None,
         num_views = 6,
         img_wh=[256, 256],
         bg_color='white',
         crop_size=crop_size,
-        single_image=single_image
+        single_image=single_image,
+        load_cam_type=True,
+        cam_types=[input_camera_type]
     )
     return dataset[0]
 
 
-def run_pipeline(pipeline, cfg, single_image, guidance_scale, steps, seed, crop_size):
+def run_pipeline(pipeline, cfg, single_image, guidance_scale, steps, seed, crop_size, input_camera_type):
     import pdb
     # pdb.set_trace()
 
-    batch = prepare_data(single_image, crop_size)
+
+
+    batch = prepare_data(single_image, crop_size, input_camera_type)
 
     pipeline.set_progress_bar_config(disable=True)
     seed = int(seed)
@@ -244,13 +256,14 @@ class TestConfig:
 
     cond_on_normals: bool
     cond_on_colors: bool
+    load_task: bool
 
 
 def run_demo():
     from utils.misc import load_config
     from omegaconf import OmegaConf
     # parse YAML config to OmegaConf
-    cfg = load_config("./configs/mvdiffusion-joint-ortho-6views.yaml")
+    cfg = load_config("./configs/mvdiffusion-joint-plus.yaml")
     # print(cfg)
     schema = OmegaConf.structured(TestConfig)
     cfg = OmegaConf.merge(schema, cfg)
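
The config handling above follows the usual OmegaConf pattern: load the YAML, build a structured schema from the TestConfig dataclass, and merge so that missing or mistyped keys (including the new load_task field) fail loudly. A self-contained sketch with stand-in names, not the real TestConfig or mvdiffusion-joint-plus.yaml:

# Illustrative OmegaConf structured-config merge (names are stand-ins).
from dataclasses import dataclass
from omegaconf import OmegaConf

@dataclass
class DemoConfig:                      # stand-in for TestConfig
    pretrained_model_name_or_path: str = "path/to/model"
    num_views: int = 6
    load_task: bool = False            # field added by this commit

yaml_cfg = OmegaConf.create({"num_views": 6, "load_task": True})

schema = OmegaConf.structured(DemoConfig)
cfg = OmegaConf.merge(schema, yaml_cfg)  # type-checks values against the dataclass
print(cfg.load_task)                     # True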
@@ -302,7 +315,7 @@ def run_demo():
             output_processing = gr.CheckboxGroup(['Background Removal'], label='Output Image Postprocessing', value=[])
             with gr.Row():
                 with gr.Column():
-                    scale_slider = gr.Slider(1, 5, value=3, step=1,
+                    scale_slider = gr.Slider(1, 5, value=2, step=1,
                                              label='Classifier Free Guidance Scale')
                 with gr.Column():
                     steps_slider = gr.Slider(15, 100, value=50, step=1,
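
For context on the slider: the classifier-free guidance scale weights how strongly the denoiser follows the image condition. A generic sketch of the standard combination used by diffusers-style pipelines (illustrative, not code from this repo):

# Generic classifier-free guidance combination (illustrative).
import torch

def apply_cfg(noise_uncond: torch.Tensor, noise_cond: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # scale = 1.0 disables guidance; larger values push harder toward the condition.
    return noise_uncond + guidance_scale * (noise_cond - noise_uncond)

uncond = torch.zeros(1, 4, 32, 32)
cond = torch.randn(1, 4, 32, 32)
print(apply_cfg(uncond, cond, guidance_scale=2.0).shape)  # torch.Size([1, 4, 32, 32])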
@@ -312,6 +325,14 @@
                     seed = gr.Number(42, label='Seed')
                 with gr.Column():
                     crop_size = gr.Number(192, label='Crop size')
+            with gr.Row():
+                camera_type = gr.Radio(
+                    choices=[("Orthogonal Camera", "ortho"), ("Perspective Camera", "persp")],
+                    value="ortho",
+                    label="Camera Type"
+                )
+
+
             # crop_size = 192
             run_btn = gr.Button('Generate', variant='primary', interactive=True)
             with gr.Row():
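
The camera selector uses Gradio's (display label, value) tuple form for choices, so downstream callbacks receive the short strings "ortho" / "persp" rather than the labels. A minimal standalone sketch (hypothetical demo, assuming a Gradio version that supports tuple choices, as the app's gr.Radio call does):

# Minimal sketch: tuple choices pass the value string, not the display label.
import gradio as gr

def describe(camera_type: str) -> str:
    return f"Selected camera type: {camera_type}"

with gr.Blocks() as demo:
    camera_type = gr.Radio(
        choices=[("Orthogonal Camera", "ortho"), ("Perspective Camera", "persp")],
        value="ortho",
        label="Camera Type",
    )
    out = gr.Textbox(label="Result")
    camera_type.change(describe, inputs=camera_type, outputs=out)

if __name__ == "__main__":
    demo.launch()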
@@ -338,7 +359,7 @@
                 inputs=[input_image, input_processing],
                 outputs=[processed_image_highres, processed_image], queue=True
             ).success(fn=partial(run_pipeline, pipeline, cfg),
-                      inputs=[processed_image_highres, scale_slider, steps_slider, seed, crop_size],
+                      inputs=[processed_image_highres, scale_slider, steps_slider, seed, crop_size, camera_type],
                       outputs=[view_1, view_2, view_3, view_4, view_5, view_6,
                                normal_1, normal_2, normal_3, normal_4, normal_5, normal_6,
                                view_gallery, normal_gallery]
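
On the wiring: partial(run_pipeline, pipeline, cfg) pre-binds the first two parameters, and Gradio supplies the remaining ones positionally in the order of the inputs list, which is why camera_type must be appended both to that list and to run_pipeline's signature. A stripped-down sketch with dummy stand-ins:

# Illustrative argument flow behind .success(fn=partial(run_pipeline, pipeline, cfg), inputs=[...]).
from functools import partial

def run_pipeline(pipeline, cfg, single_image, guidance_scale, steps, seed, crop_size, input_camera_type):
    return f"{input_camera_type}: {steps} steps, CFG {guidance_scale}, seed {seed}, crop {crop_size}"

pipeline, cfg = object(), object()     # stand-ins for the loaded pipeline and config
fn = partial(run_pipeline, pipeline, cfg)

# Gradio effectively calls fn(processed_image, scale, steps, seed, crop_size, camera_type):
print(fn("image.png", 2, 50, 42, 192, "ortho"))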
 