diff --git a/app copy.py b/app copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..3def2652ab03477f2e1b9c0f9c14e9c5c61122a1
--- /dev/null
+++ b/app copy.py
@@ -0,0 +1,740 @@
+try:
+ import spaces
+except:
+ pass
+
+import os
+import gradio as gr
+
+import torch
+from gradio_image_prompter import ImagePrompter
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+from omegaconf import OmegaConf
+from PIL import Image
+import numpy as np
+from copy import deepcopy
+import cv2
+
+import torch.nn.functional as F
+import torchvision
+from einops import rearrange
+import tempfile
+
+from objctrl_2_5d.utils.ui_utils import process_image, get_camera_pose, get_subject_points, get_points, undo_points, mask_image
+from ZoeDepth.zoedepth.utils.misc import colorize
+
+from cameractrl.inference import get_pipeline
+from objctrl_2_5d.utils.examples import examples, sync_points
+
+from objctrl_2_5d.utils.objmask_util import RT2Plucker, Unprojected, roll_with_ignore_multidim, dilate_mask_pytorch
+from objctrl_2_5d.utils.filter_utils import get_freq_filter, freq_mix_3d
+
+
+### Title and Description ###
+#### Description ####
+title = r"""
+ObjCtrl-2.5D: Training-free Object Control with Camera Poses
+"""
+# subtitle = r"""Deployed on SVD Generation
+# """
+important_link = r"""
+
+"""
+
+authors = r"""
+
+"""
+
+affiliation = r"""
+
+"""
+
+description = r"""
+Official Gradio demo for ObjCtrl-2.5D: Training-free Object Control with Camera Poses.
+🔥 ObjCtrl-2.5D enables object motion control in an I2V-generated video by transforming 2D trajectories into 3D using depth and then converting them into camera poses,
+thereby leveraging the existing camera motion control module for object motion control without requiring additional training.
+"""
+
+article = r"""
+If ObjCtrl-2.5D is helpful, please help to ⭐ the GitHub repo. Thanks!
+[![GitHub Stars](https://img.shields.io/github/stars/TencentARC%2FMotionCtrl)](https://github.com/TencentARC/MotionCtrl)
+
+---
+
+📝 **Citation**
+
+If our work is useful for your research, please consider citing:
+```bibtex
+@inproceedings{wang2024motionctrl,
+ title={Motionctrl: A unified and flexible motion controller for video generation},
+ author={Wang, Zhouxia and Yuan, Ziyang and Wang, Xintao and Li, Yaowei and Chen, Tianshui and Xia, Menghan and Luo, Ping and Shan, Ying},
+ booktitle={ACM SIGGRAPH 2024 Conference Papers},
+ pages={1--11},
+ year={2024}
+}
+```
+
+📧 **Contact**
+
+If you have any questions, please feel free to reach out to me at zhouzi1212@gmail.com.
+
+"""
+
+# -------------- initialization --------------
+
+CAMERA_MODE = ["Traj2Cam", "Rotate", "Clockwise", "Translate"]
+
+# select the device for computation
+if torch.cuda.is_available():
+ device = torch.device("cuda")
+elif torch.backends.mps.is_available():
+ device = torch.device("mps")
+else:
+ device = torch.device("cpu")
+print(f"using device: {device}")
+
+# segmentation model
+segmentor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny", cache_dir="ckpt", device=device)
+
+# depth model
+d_model_NK = torch.hub.load('./ZoeDepth', 'ZoeD_NK', source='local', pretrained=True).to(device)
+
+# cameractrl model
+config = "configs/svd_320_576_cameractrl.yaml"
+model_id = "stabilityai/stable-video-diffusion-img2vid"
+ckpt = "checkpoints/CameraCtrl_svd.ckpt"
+if not os.path.exists(ckpt):
+ os.makedirs("checkpoints", exist_ok=True)
+    os.system("wget -c 'https://huggingface.co/hehao13/CameraCtrl_SVD_ckpts/resolve/main/CameraCtrl_svd.ckpt?download=true' -O checkpoints/CameraCtrl_svd.ckpt")
+model_config = OmegaConf.load(config)
+
+
+pipeline = get_pipeline(model_id, "unet", model_config['down_block_types'], model_config['up_block_types'],
+ model_config['pose_encoder_kwargs'], model_config['attention_processor_kwargs'],
+ ckpt, True, device)
+
+# segmentor = None
+# d_model_NK = None
+# pipeline = None
+
+### run the demo ##
+# @spaces.GPU(duration=5)
+def segment(canvas, image, logits):
+ if logits is not None:
+ logits *= 32.0
+ _, points = get_subject_points(canvas)
+ image = np.array(image)
+
+ with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+ segmentor.set_image(image)
+ input_points = []
+ input_boxes = []
+ for p in points:
+ [x1, y1, _, x2, y2, _] = p
+ if x2==0 and y2==0:
+ input_points.append([x1, y1])
+ else:
+ input_boxes.append([x1, y1, x2, y2])
+ if len(input_points) == 0:
+ input_points = None
+ input_labels = None
+ else:
+ input_points = np.array(input_points)
+ input_labels = np.ones(len(input_points))
+ if len(input_boxes) == 0:
+ input_boxes = None
+ else:
+ input_boxes = np.array(input_boxes)
+ masks, _, logits = segmentor.predict(
+ point_coords=input_points,
+ point_labels=input_labels,
+ box=input_boxes,
+ multimask_output=False,
+ return_logits=True,
+ mask_input=logits,
+ )
+ mask = masks > 0
+ masked_img = mask_image(image, mask[0], color=[252, 140, 90], alpha=0.9)
+ masked_img = Image.fromarray(masked_img)
+
+ return mask[0], masked_img, masked_img, logits / 32.0
+
+# @spaces.GPU(duration=5)
+def get_depth(image, points):
+
+ depth = d_model_NK.infer_pil(image)
+ colored_depth = colorize(depth, cmap='gray_r') # [h, w, 4] 0-255
+
+ depth_img = deepcopy(colored_depth[:, :, :3])
+ if len(points) > 0:
+ for idx, point in enumerate(points):
+ if idx % 2 == 0:
+ cv2.circle(depth_img, tuple(point), 10, (255, 0, 0), -1)
+ else:
+ cv2.circle(depth_img, tuple(point), 10, (0, 0, 255), -1)
+ if idx > 0:
+ cv2.arrowedLine(depth_img, points[idx-1], points[idx], (255, 255, 255), 4, tipLength=0.5)
+
+ return depth, depth_img, colored_depth[:, :, :3]
+
+
+# @spaces.GPU(duration=80)
+def run_objctrl_2_5d(condition_image,
+ mask,
+ depth,
+ RTs,
+ bg_mode,
+ shared_wapring_latents,
+ scale_wise_masks,
+ rescale,
+ seed,
+ ds, dt,
+ num_inference_steps=25):
+
+ DEBUG = False
+
+ if DEBUG:
+ cur_OUTPUT_PATH = 'outputs/tmp'
+ os.makedirs(cur_OUTPUT_PATH, exist_ok=True)
+
+ # num_inference_steps=25
+ min_guidance_scale = 1.0
+ max_guidance_scale = 3.0
+
+ area_ratio = 0.3
+ depth_scale_ = 5.2
+ center_margin = 10
+
+ height, width = 320, 576
+ num_frames = 14
+
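+    # Pinhole intrinsics for the SVD resolution: focal lengths set to the image width,
+    # principal point at the image center; divided by the image width/height below to
+    # yield the normalized fx, fy, cx, cy passed to RT2Plucker.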
+ intrinsics = np.array([[float(width), float(width), float(width) / 2, float(height) / 2]])
+ intrinsics = np.repeat(intrinsics, num_frames, axis=0) # [n_frame, 4]
+ fx = intrinsics[0, 0] / width
+ fy = intrinsics[0, 1] / height
+ cx = intrinsics[0, 2] / width
+ cy = intrinsics[0, 3] / height
+
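+    # Intrinsics at the latent resolution (1/8 of the frame), used when unprojecting/warping the mask and noise.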
+ down_scale = 8
+ H, W = height // down_scale, width // down_scale
+ K = np.array([[width / down_scale, 0, W / 2], [0, width / down_scale, H / 2], [0, 0, 1]])
+
+ seed = int(seed)
+
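+    # Mean depth inside a small window at the image center, used as the reference when rescaling depth.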
+ center_h_margin, center_w_margin = center_margin, center_margin
+ depth_center = np.mean(depth[height//2-center_h_margin:height//2+center_h_margin, width//2-center_w_margin:width//2+center_w_margin])
+
+ if rescale > 0:
+ depth_rescale = round(depth_scale_ * rescale / depth_center, 2)
+ else:
+ depth_rescale = 1.0
+
+ depth = depth * depth_rescale
+
+ depth_down = F.interpolate(torch.tensor(depth).unsqueeze(0).unsqueeze(0),
+ (H, W), mode='bilinear', align_corners=False).squeeze().numpy() # [H, W]
+
+ ## latent
+ generator = torch.Generator()
+ generator.manual_seed(seed)
+
+ latents_org = pipeline.prepare_latents(
+ 1,
+ 14,
+ 8,
+ height,
+ width,
+ pipeline.dtype,
+ device,
+ generator,
+ None,
+ )
+ latents_org = latents_org / pipeline.scheduler.init_noise_sigma
+
+ cur_plucker_embedding, _, _ = RT2Plucker(RTs, RTs.shape[0], (height, width), fx, fy, cx, cy) # 6, V, H, W
+ cur_plucker_embedding = cur_plucker_embedding.to(device)
+ cur_plucker_embedding = cur_plucker_embedding[None, ...] # b 6 f h w
+ cur_plucker_embedding = cur_plucker_embedding.permute(0, 2, 1, 3, 4) # b f 6 h w
+ cur_plucker_embedding = cur_plucker_embedding[:, :num_frames, ...]
+ cur_pose_features = pipeline.pose_encoder(cur_plucker_embedding)
+
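+    # Background pose features: "Fixed" freezes the background at the first camera, "Reverse" plays the
+    # trajectory backwards, "Free" leaves it unconstrained (zeros are used in the blending below).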
+ # bg_mode = ["Fixed", "Reverse", "Free"]
+ if bg_mode == "Fixed":
+ fix_RTs = np.repeat(RTs[0][None, ...], num_frames, axis=0) # [n_frame, 4, 3]
+ fix_plucker_embedding, _, _ = RT2Plucker(fix_RTs, num_frames, (height, width), fx, fy, cx, cy) # 6, V, H, W
+ fix_plucker_embedding = fix_plucker_embedding.to(device)
+ fix_plucker_embedding = fix_plucker_embedding[None, ...] # b 6 f h w
+ fix_plucker_embedding = fix_plucker_embedding.permute(0, 2, 1, 3, 4) # b f 6 h w
+ fix_plucker_embedding = fix_plucker_embedding[:, :num_frames, ...]
+ fix_pose_features = pipeline.pose_encoder(fix_plucker_embedding)
+
+ elif bg_mode == "Reverse":
+ bg_plucker_embedding, _, _ = RT2Plucker(RTs[::-1], RTs.shape[0], (height, width), fx, fy, cx, cy) # 6, V, H, W
+ bg_plucker_embedding = bg_plucker_embedding.to(device)
+ bg_plucker_embedding = bg_plucker_embedding[None, ...] # b 6 f h w
+ bg_plucker_embedding = bg_plucker_embedding.permute(0, 2, 1, 3, 4) # b f 6 h w
+ bg_plucker_embedding = bg_plucker_embedding[:, :num_frames, ...]
+ fix_pose_features = pipeline.pose_encoder(bg_plucker_embedding)
+
+ else:
+ fix_pose_features = None
+
+ #### preparing mask
+
+ mask = Image.fromarray(mask)
+ mask = mask.resize((W, H))
+ mask = np.array(mask).astype(np.float32)
+ mask = np.expand_dims(mask, axis=-1)
+
+    # visualize mask
+ if DEBUG:
+ mask_sum_vis = mask[..., 0]
+ mask_sum_vis = (mask_sum_vis * 255.0).astype(np.uint8)
+ mask_sum_vis = Image.fromarray(mask_sum_vis)
+
+ mask_sum_vis.save(f'{cur_OUTPUT_PATH}/org_mask.png')
+
+ try:
+ warped_masks = Unprojected(mask, depth_down, RTs, H=H, W=W, K=K)
+
+ warped_masks.insert(0, mask)
+
+ except:
+ # mask to bbox
+        print('!!! Mask is too small to warp; falling back to a bbox-based shift')
+ mask = mask[:, :, 0]
+ coords = cv2.findNonZero(mask)
+ x, y, w, h = cv2.boundingRect(coords)
+ # mask[y:y+h, x:x+w] = 1.0
+
+ center_x, center_y = x + w // 2, y + h // 2
+ center_z = depth_down[center_y, center_x]
+
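+        # Fallback: track only the bbox center. Unproject it to a world-space point with the first camera,
+        # then reproject it with each frame's camera to get per-frame 2D shifts for the mask.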
+ # RTs [n_frame, 3, 4] to [n_frame, 4, 4] , add [0, 0, 0, 1]
+ RTs = np.concatenate([RTs, np.array([[[0, 0, 0, 1]]] * num_frames)], axis=1)
+
+ # RTs: world to camera
+ P0 = np.array([center_x, center_y, 1])
+ Pc0 = np.linalg.inv(K) @ P0 * center_z
+ pw = np.linalg.inv(RTs[0]) @ np.array([Pc0[0], Pc0[1], center_z, 1]) # [4]
+
+ P = [np.array([center_x, center_y])]
+ for i in range(1, num_frames):
+ Pci = RTs[i] @ pw
+ Pi = K @ Pci[:3] / Pci[2]
+ P.append(Pi[:2])
+
+ warped_masks = [mask]
+ for i in range(1, num_frames):
+ shift_x = int(round(P[i][0] - P[0][0]))
+ shift_y = int(round(P[i][1] - P[0][1]))
+
+ cur_mask = roll_with_ignore_multidim(mask, [shift_y, shift_x])
+ warped_masks.append(cur_mask)
+
+
+ warped_masks = [v[..., None] for v in warped_masks]
+
+    warped_masks = np.stack(warped_masks, axis=0) # [f, h, w, 1]
+ warped_masks = np.repeat(warped_masks, 3, axis=-1) # [f, h, w, 3]
+
+ mask_sum = np.sum(warped_masks, axis=0, keepdims=True) # [1, H, W, 3]
+ mask_sum[mask_sum > 1.0] = 1.0
+ mask_sum = mask_sum[0,:,:, 0]
+
+ if DEBUG:
+        ## visualize warp mask
+ warp_masks_vis = torch.tensor(warped_masks)
+ warp_masks_vis = (warp_masks_vis * 255.0).to(torch.uint8)
+ torchvision.io.write_video(f'{cur_OUTPUT_PATH}/warped_masks.mp4', warp_masks_vis, fps=10, video_codec='h264', options={'crf': '10'})
+
+        # visualize mask
+ mask_sum_vis = mask_sum
+ mask_sum_vis = (mask_sum_vis * 255.0).astype(np.uint8)
+ mask_sum_vis = Image.fromarray(mask_sum_vis)
+
+ mask_sum_vis.save(f'{cur_OUTPUT_PATH}/merged_mask.png')
+
+ if scale_wise_masks:
+ min_area = H * W * area_ratio # cal in downscale
+ non_zero_len = mask_sum.sum()
+
+ print(f'non_zero_len: {non_zero_len}, min_area: {min_area}')
+
+ if non_zero_len > min_area:
+ kernel_sizes = [1, 1, 1, 3]
+ elif non_zero_len > min_area * 0.5:
+ kernel_sizes = [3, 1, 1, 5]
+ else:
+ kernel_sizes = [5, 3, 3, 7]
+ else:
+ kernel_sizes = [1, 1, 1, 1]
+
+ mask = torch.from_numpy(mask_sum) # [h, w]
+ mask = mask[None, None, ...] # [1, 1, h, w]
+ mask = F.interpolate(mask, (height, width), mode='bilinear', align_corners=False) # [1, 1, H, W]
+ # mask = mask.repeat(1, num_frames, 1, 1) # [1, f, H, W]
+ mask = mask.to(pipeline.dtype).to(device)
+
+ ##### Mask End ######
+
+ ### Got blending pose features Start ###
+
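+    # Blend pose features per encoder scale: object (current trajectory) camera features inside the
+    # dilated mask, background features (fixed / reversed / zero) outside of it.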
+ pose_features = []
+ for i in range(0, len(cur_pose_features)):
+ kernel_size = kernel_sizes[i]
+ h, w = cur_pose_features[i].shape[-2:]
+
+ if fix_pose_features is None:
+ pose_features.append(torch.zeros_like(cur_pose_features[i]))
+ else:
+ pose_features.append(fix_pose_features[i])
+
+ cur_mask = F.interpolate(mask, (h, w), mode='bilinear', align_corners=False)
+ cur_mask = dilate_mask_pytorch(cur_mask, kernel_size=kernel_size) # [1, 1, H, W]
+ cur_mask = cur_mask.repeat(1, num_frames, 1, 1) # [1, f, H, W]
+
+ if DEBUG:
+            # visualize mask
+ mask_vis = cur_mask[0, 0].cpu().numpy() * 255.0
+ mask_vis = Image.fromarray(mask_vis.astype(np.uint8))
+ mask_vis.save(f'{cur_OUTPUT_PATH}/mask_k{kernel_size}_scale{i}.png')
+
+ cur_mask = cur_mask[None, ...] # [1, 1, f, H, W]
+ pose_features[-1] = cur_pose_features[i] * cur_mask + pose_features[-1] * (1 - cur_mask)
+
+ ### Got blending pose features End ###
+
+ ##### Warp Noise Start ######
+
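+    # Shared Warping Latents: warp the first frame's initial noise along the camera trajectory so the
+    # masked object region reuses the same noise across frames; outside the mask the original per-frame
+    # noise is kept, and the result is then frequency-mixed with fresh noise below.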
+ if shared_wapring_latents:
+        noise = latents_org[0, 0].data.cpu().numpy().copy() # [4, 40, 72]
+ noise = np.transpose(noise, (1, 2, 0)) # [40, 72, 4]
+
+ try:
+ warp_noise = Unprojected(noise, depth_down, RTs, H=H, W=W, K=K)
+ warp_noise.insert(0, noise)
+ except:
+            print('!!! Noise is too small to warp; falling back to a bbox-based shift')
+
+ warp_noise = [noise]
+ for i in range(1, num_frames):
+ shift_x = int(round(P[i][0] - P[0][0]))
+ shift_y = int(round(P[i][1] - P[0][1]))
+
+ cur_noise= roll_with_ignore_multidim(noise, [shift_y, shift_x])
+ warp_noise.append(cur_noise)
+
+ warp_noise = np.stack(warp_noise, axis=0) # [f, h, w, 4]
+
+ if DEBUG:
+            ## visualize warp noise
+ warp_noise_vis = torch.tensor(warp_noise)[..., :3] * torch.tensor(warped_masks)
+ warp_noise_vis = (warp_noise_vis - warp_noise_vis.min()) / (warp_noise_vis.max() - warp_noise_vis.min())
+ warp_noise_vis = (warp_noise_vis * 255.0).to(torch.uint8)
+
+ torchvision.io.write_video(f'{cur_OUTPUT_PATH}/warp_noise.mp4', warp_noise_vis, fps=10, video_codec='h264', options={'crf': '10'})
+
+
+ warp_latents = torch.tensor(warp_noise).permute(0, 3, 1, 2).to(latents_org.device).to(latents_org.dtype) # [frame, 4, H, W]
+ warp_latents = warp_latents.unsqueeze(0) # [1, frame, 4, H, W]
+
+ warped_masks = torch.tensor(warped_masks).permute(0, 3, 1, 2).unsqueeze(0) # [1, frame, 3, H, W]
+ mask_extend = torch.concat([warped_masks, warped_masks[:,:,0:1]], dim=2) # [1, frame, 4, H, W]
+ mask_extend = mask_extend.to(latents_org.device).to(latents_org.dtype)
+
+ warp_latents = warp_latents * mask_extend + latents_org * (1 - mask_extend)
+ warp_latents = warp_latents.permute(0, 2, 1, 3, 4)
+ random_noise = latents_org.clone().permute(0, 2, 1, 3, 4)
+
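+        # Frequency-domain blending: mix the warped latents with the original random noise through a 3D
+        # Butterworth filter, keeping the low-frequency content of the warped latents and taking the
+        # remaining high frequencies from the unwarped noise.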
+ filter_shape = warp_latents.shape
+
+ freq_filter = get_freq_filter(
+ filter_shape,
+ device = device,
+ filter_type='butterworth',
+ n=4,
+ d_s=ds,
+ d_t=dt
+ )
+
+ warp_latents = freq_mix_3d(warp_latents, random_noise, freq_filter)
+ warp_latents = warp_latents.permute(0, 2, 1, 3, 4)
+
+ else:
+ warp_latents = latents_org.clone()
+
+ generator.manual_seed(42)
+
+ with torch.no_grad():
+ result = pipeline(
+ image=condition_image,
+ pose_embedding=cur_plucker_embedding,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ num_inference_steps=num_inference_steps,
+ min_guidance_scale=min_guidance_scale,
+ max_guidance_scale=max_guidance_scale,
+ do_image_process=True,
+ generator=generator,
+ output_type='pt',
+ pose_features= pose_features,
+ latents = warp_latents
+ ).frames[0].cpu() #[f, c, h, w]
+
+
+ result = rearrange(result, 'f c h w -> f h w c')
+ result = (result * 255.0).to(torch.uint8)
+
+ video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
+ torchvision.io.write_video(video_path, result, fps=10, video_codec='h264', options={'crf': '8'})
+
+ return video_path
+
+# -------------- UI definition --------------
+with gr.Blocks() as demo:
+ # layout definition
+ gr.Markdown(title)
+ gr.Markdown(authors)
+ gr.Markdown(affiliation)
+ gr.Markdown(important_link)
+ gr.Markdown(description)
+
+
+ # with gr.Row():
+ # gr.Markdown("""# Repositioning the Subject within Image """)
+ mask = gr.State(value=None) # store mask
+ removal_mask = gr.State(value=None) # store removal mask
+ selected_points = gr.State([]) # store points
+ selected_points_text = gr.Textbox(label="Selected Points", visible=False)
+
+ original_image = gr.State(value=None) # store original input image
+ masked_original_image = gr.State(value=None) # store masked input image
+ mask_logits = gr.State(value=None) # store mask logits
+
+ depth = gr.State(value=None) # store depth
+ org_depth_image = gr.State(value=None) # store original depth image
+
+ camera_pose = gr.State(value=None) # store camera pose
+
+ with gr.Column():
+
+ outlines = """
+        There are 5 steps in total to complete the task.
+        - Step 1: Input an image and crop it to a suitable size;
+        - Step 2: Attain the subject mask;
+        - Step 3: Get depth and draw the trajectory;
+        - Step 4: Get the camera pose from the trajectory or customize it;
+        - Step 5: Generate the final video.
+ """
+
+ gr.Markdown(outlines)
+
+
+ with gr.Row():
+ with gr.Column():
+ # Step 1: Input Image
+ step1_dec = """
+ Step 1: Input Image
+ - Select the region using a bounding box, aiming for a ratio close to 320:576 (height:width).
+ - All provided images in `Examples` are in 320 x 576 resolution. Simply press `Process` to proceed.
+ """
+ step1 = gr.Markdown(step1_dec)
+ raw_input = ImagePrompter(type="pil", label="Raw Image", show_label=True, interactive=True)
+ # left_up_point = gr.Textbox(value = "-1 -1", label="Left Up Point", interactive=True)
+ process_button = gr.Button("Process")
+
+ with gr.Column():
+ # Step 2: Get Subject Mask
+ step2_dec = """
+ Step 2: Get Subject Mask
+            - Use bounding boxes or points to select the subject.
+            - Press `Segment Subject` to get the mask. It can be refined iteratively by updating the points.
+ """
+ step2 = gr.Markdown(step2_dec)
+ canvas = ImagePrompter(type="pil", label="Input Image", show_label=True, interactive=True) # for mask painting
+
+ select_button = gr.Button("Segment Subject")
+
+ with gr.Row():
+ with gr.Column():
+ mask_dec = """
+ Mask Result
+                    - For visualization purposes only. No need to interact.
+ """
+ mask_vis = gr.Markdown(mask_dec)
+ mask_output = gr.Image(type="pil", label="Mask", show_label=True, interactive=False)
+ with gr.Column():
+ # Step 3: Get Depth and Draw Trajectory
+ step3_dec = """
+ Step 3: Get Depth and Draw Trajectory
+ - Press `Get Depth` to get the depth image.
+ - Draw the trajectory by selecting points on the depth image. No more than 14 points.
+ - Press `Undo point` to remove all points.
+ """
+ step3 = gr.Markdown(step3_dec)
+ depth_image = gr.Image(type="pil", label="Depth Image", show_label=True, interactive=False)
+ with gr.Row():
+ depth_button = gr.Button("Get Depth")
+ undo_button = gr.Button("Undo point")
+
+ with gr.Row():
+ with gr.Column():
+ # Step 4: Trajectory to Camera Pose or Get Camera Pose
+ step4_dec = """
+ Step 4: Get camera pose from trajectory or customize it
+            - Option 1: Transform the 2D trajectory into camera poses using depth. `Rescale` is used for depth alignment; a larger value speeds up the object motion.
+ - Option 2: Rotate the camera with a specific `Angle`.
+ - Option 3: Rotate the camera clockwise or counterclockwise with a specific `Angle`.
+ - Option 4: Translate the camera with `Tx` (Pan Left/Right), `Ty` (Pan Up/Down), `Tz` (Zoom In/Out) and `Speed`.
+ """
+ step4 = gr.Markdown(step4_dec)
+ camera_pose_vis = gr.Plot(None, label='Camera Pose')
+ with gr.Row():
+ with gr.Column():
+ speed = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.0, label="Speed", interactive=True)
+ rescale = gr.Slider(minimum=0.0, maximum=10, step=0.1, value=1.0, label="Rescale", interactive=True)
+ # traj2pose_button = gr.Button("Option1: Trajectory to Camera Pose")
+
+ angle = gr.Slider(minimum=-360, maximum=360, step=1, value=60, label="Angle", interactive=True)
+ # rotation_button = gr.Button("Option2: Rotate")
+ # clockwise_button = gr.Button("Option3: Clockwise")
+ with gr.Column():
+
+ Tx = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tx", interactive=True)
+ Ty = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Ty", interactive=True)
+ Tz = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tz", interactive=True)
+ # translation_button = gr.Button("Option4: Translate")
+ with gr.Row():
+ camera_option = gr.Radio(choices = CAMERA_MODE, label='Camera Options', value=CAMERA_MODE[0], interactive=True)
+ with gr.Row():
+ get_camera_pose_button = gr.Button("Get Camera Pose")
+
+ with gr.Column():
+ # Step 5: Get the final generated video
+ step5_dec = """
+ Step 5: Get the final generated video
+ - 3 modes for background: Fixed, Reverse, Free.
+ - Enable Scale-wise Masks for better object control.
+            - Optionally enable Shared Warping Latents and set the stop frequency for the spatial (`ds`) and temporal (`dt`) dimensions. A larger stop frequency can lead to artifacts.
+ """
+ step5 = gr.Markdown(step5_dec)
+ generated_video = gr.Video(None, label='Generated Video')
+
+ with gr.Row():
+ seed = gr.Textbox(value = "42", label="Seed", interactive=True)
+ # num_inference_steps = gr.Slider(minimum=1, maximum=100, step=1, value=25, label="Number of Inference Steps", interactive=True)
+ bg_mode = gr.Radio(choices = ["Fixed", "Reverse", "Free"], label="Background Mode", value="Fixed", interactive=True)
+ # swl_mode = gr.Radio(choices = ["Enable SWL", "Disable SWL"], label="Shared Warping Latent", value="Disable SWL", interactive=True)
+ scale_wise_masks = gr.Checkbox(label="Enable Scale-wise Masks", interactive=True, value=True)
+ with gr.Row():
+ with gr.Column():
+ shared_wapring_latents = gr.Checkbox(label="Enable Shared Warping Latents", interactive=True)
+ with gr.Column():
+ ds = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="ds", interactive=True)
+ dt = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="dt", interactive=True)
+
+ generated_button = gr.Button("Generate")
+
+
+
+ # # event definition
+ process_button.click(
+ fn = process_image,
+ inputs = [raw_input],
+ outputs = [original_image, canvas]
+ )
+
+ select_button.click(
+ segment,
+ [canvas, original_image, mask_logits],
+ [mask, mask_output, masked_original_image, mask_logits]
+ )
+
+ depth_button.click(
+ get_depth,
+ [original_image, selected_points],
+ [depth, depth_image, org_depth_image]
+ )
+
+ depth_image.select(
+ get_points,
+ [depth_image, selected_points],
+ [depth_image, selected_points],
+ )
+ undo_button.click(
+ undo_points,
+ [org_depth_image],
+ [depth_image, selected_points]
+ )
+
+ get_camera_pose_button.click(
+ get_camera_pose(CAMERA_MODE),
+ [camera_option, selected_points, depth, mask, rescale, angle, Tx, Ty, Tz, speed],
+ [camera_pose, camera_pose_vis, rescale]
+ )
+
+ generated_button.click(
+ run_objctrl_2_5d,
+ [
+ original_image,
+ mask,
+ depth,
+ camera_pose,
+ bg_mode,
+ shared_wapring_latents,
+ scale_wise_masks,
+ rescale,
+ seed,
+ ds,
+ dt,
+ # num_inference_steps
+ ],
+ [generated_video],
+ )
+
+ gr.Examples(
+ examples=examples,
+ inputs=[
+ raw_input,
+ rescale,
+ speed,
+ angle,
+ Tx,
+ Ty,
+ Tz,
+ camera_option,
+ bg_mode,
+ shared_wapring_latents,
+ scale_wise_masks,
+ ds,
+ dt,
+ seed,
+ selected_points_text # selected_points
+ ],
+ outputs=[generated_video],
+ examples_per_page=10
+ )
+
+ selected_points_text.change(
+ sync_points,
+ inputs=[selected_points_text],
+ outputs=[selected_points]
+ )
+
+
+ gr.Markdown(article)
+
+
+demo.queue().launch(share=True)
diff --git a/app.py b/app.py
index a0ff86394ceb8e4bb476f09029b8dac724bec92f..ba36a556966966c415efee0133c8139616239610 100644
--- a/app.py
+++ b/app.py
@@ -1,12 +1,18 @@
-import spaces
+try:
+ import spaces
+except:
+ pass
+
import os
import gradio as gr
+import json
+import ast
import torch
from gradio_image_prompter import ImagePrompter
from sam2.sam2_image_predictor import SAM2ImagePredictor
from omegaconf import OmegaConf
-from PIL import Image
+from PIL import Image, ImageDraw
import numpy as np
from copy import deepcopy
import cv2
@@ -16,7 +22,7 @@ import torchvision
from einops import rearrange
import tempfile
-from objctrl_2_5d.utils.ui_utils import process_image, get_camera_pose, get_subject_points, get_points, undo_points, mask_image
+from objctrl_2_5d.utils.ui_utils import process_image, get_camera_pose, get_subject_points, get_points, undo_points, mask_image, traj2cam, get_mid_params
from ZoeDepth.zoedepth.utils.misc import colorize
from cameractrl.inference import get_pipeline
@@ -25,7 +31,6 @@ from objctrl_2_5d.utils.examples import examples, sync_points
from objctrl_2_5d.utils.objmask_util import RT2Plucker, Unprojected, roll_with_ignore_multidim, dilate_mask_pytorch
from objctrl_2_5d.utils.filter_utils import get_freq_filter, freq_mix_3d
-
### Title and Description ###
#### Description ####
title = r"""ObjCtrl-2.5D: Training-free Object Control with Camera Poses
"""
@@ -85,9 +90,40 @@ If you have any questions, please feel free to reach me out at zhouzi1212@gma
"""
+# pre-defined parameters
+DEBUG = False
+
+if DEBUG:
+ cur_OUTPUT_PATH = 'outputs/tmp'
+ os.makedirs(cur_OUTPUT_PATH, exist_ok=True)
+
+# num_inference_steps=25
+min_guidance_scale = 1.0
+max_guidance_scale = 3.0
+
+area_ratio = 0.3
+depth_scale_ = 5.2
+center_margin = 10
+
+height, width = 320, 576
+num_frames = 14
+
+intrinsics = np.array([[float(width), float(width), float(width) / 2, float(height) / 2]])
+intrinsics = np.repeat(intrinsics, num_frames, axis=0) # [n_frame, 4]
+fx = intrinsics[0, 0] / width
+fy = intrinsics[0, 1] / height
+cx = intrinsics[0, 2] / width
+cy = intrinsics[0, 3] / height
+
+down_scale = 8
+H, W = height // down_scale, width // down_scale
+K = np.array([[width / down_scale, 0, W / 2], [0, width / down_scale, H / 2], [0, 0, 1]])
+
+
# -------------- initialization --------------
-CAMERA_MODE = ["Traj2Cam", "Rotate", "Clockwise", "Translate"]
+# CAMERA_MODE = ["Traj2Cam", "Rotate", "Clockwise", "Translate"]
+CAMERA_MODE = ["None", "ZoomIn", "ZoomOut", "PanRight", "PanLeft", "TiltUp", "TiltDown", "ClockWise", "Anti-CW", "Rotate60"]
# select the device for computation
if torch.cuda.is_available():
@@ -96,11 +132,9 @@ elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
device = torch.device("cpu")
- device = torch.device("cuda")
- print(f"Force device to {device} due to ZeroGPU")
print(f"using device: {device}")
-# segmentation model
+# # segmentation model
segmentor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny", cache_dir="ckpt", device=device)
# depth model
@@ -126,7 +160,7 @@ pipeline = get_pipeline(model_id, "unet", model_config['down_block_types'], mode
# pipeline = None
### run the demo ##
-@spaces.GPU(duration=5)
+# @spaces.GPU(duration=5)
def segment(canvas, image, logits):
if logits is not None:
logits *= 32.0
@@ -165,28 +199,9 @@ def segment(canvas, image, logits):
masked_img = mask_image(image, mask[0], color=[252, 140, 90], alpha=0.9)
masked_img = Image.fromarray(masked_img)
- return mask[0], masked_img, masked_img, logits / 32.0
-
-@spaces.GPU(duration=5)
-def get_depth(image, points):
-
- depth = d_model_NK.infer_pil(image)
- colored_depth = colorize(depth, cmap='gray_r') # [h, w, 4] 0-255
-
- depth_img = deepcopy(colored_depth[:, :, :3])
- if len(points) > 0:
- for idx, point in enumerate(points):
- if idx % 2 == 0:
- cv2.circle(depth_img, tuple(point), 10, (255, 0, 0), -1)
- else:
- cv2.circle(depth_img, tuple(point), 10, (0, 0, 255), -1)
- if idx > 0:
- cv2.arrowedLine(depth_img, points[idx-1], points[idx], (255, 255, 255), 4, tipLength=0.5)
-
- return depth, depth_img, colored_depth[:, :, :3]
+ return mask[0], {'image': masked_img, 'points': points}, logits / 32.0
-
-@spaces.GPU(duration=80)
+# @spaces.GPU(duration=80)
def run_objctrl_2_5d(condition_image,
mask,
depth,
@@ -198,35 +213,6 @@ def run_objctrl_2_5d(condition_image,
seed,
ds, dt,
num_inference_steps=25):
-
- DEBUG = False
-
- if DEBUG:
- cur_OUTPUT_PATH = 'outputs/tmp'
- os.makedirs(cur_OUTPUT_PATH, exist_ok=True)
-
- # num_inference_steps=25
- min_guidance_scale = 1.0
- max_guidance_scale = 3.0
-
- area_ratio = 0.3
- depth_scale_ = 5.2
- center_margin = 10
-
- height, width = 320, 576
- num_frames = 14
-
- intrinsics = np.array([[float(width), float(width), float(width) / 2, float(height) / 2]])
- intrinsics = np.repeat(intrinsics, num_frames, axis=0) # [n_frame, 4]
- fx = intrinsics[0, 0] / width
- fy = intrinsics[0, 1] / height
- cx = intrinsics[0, 2] / width
- cy = intrinsics[0, 3] / height
-
- down_scale = 8
- H, W = height // down_scale, width // down_scale
- K = np.array([[width / down_scale, 0, W / 2], [0, width / down_scale, H / 2], [0, 0, 1]])
-
seed = int(seed)
center_h_margin, center_w_margin = center_margin, center_margin
@@ -288,7 +274,7 @@ def run_objctrl_2_5d(condition_image,
fix_pose_features = None
#### preparing mask
-
+
mask = Image.fromarray(mask)
mask = mask.resize((W, H))
mask = np.array(mask).astype(np.float32)
@@ -500,6 +486,97 @@ def run_objctrl_2_5d(condition_image,
return video_path
+
+# UI function
+# @spaces.GPU(duration=5)
+def process_image(raw_image, trajectory_points):
+
+ image, points = raw_image['image'], raw_image['points']
+
+ print(points)
+
+ try:
+        assert len(points) == 1, "Please draw only one bbox"
+ [x1, y1, _, x2, y2, _] = points[0]
+
+ image = image.crop((x1, y1, x2, y2))
+ image = image.resize((width, height))
+ except:
+ image = image.resize((width, height))
+
+ depth = d_model_NK.infer_pil(image)
+ colored_depth = colorize(depth, cmap='gray_r') # [h, w, 4] 0-255
+
+ depth_img = deepcopy(colored_depth[:, :, :3])
+ if len(trajectory_points) > 0:
+ for idx, point in enumerate(trajectory_points):
+ if idx % 2 == 0:
+ cv2.circle(depth_img, tuple(point), 10, (255, 0, 0), -1)
+ else:
+ cv2.circle(depth_img, tuple(point), 10, (0, 0, 255), -1)
+ if idx > 0:
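+                    # Keep the arrow head about 10 px long regardless of segment length (tipLength is a fraction of the line).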
+ line_length = np.sqrt((trajectory_points[idx][0] - trajectory_points[idx-1][0])**2 + (trajectory_points[idx][1] - trajectory_points[idx-1][1])**2)
+ arrow_head_length = 10
+ tip_length = arrow_head_length / line_length
+ cv2.arrowedLine(depth_img, trajectory_points[idx-1], trajectory_points[idx], (0, 255, 0), 4, tipLength=tip_length)
+
+ return image, {'image': image}, depth, depth_img, colored_depth[:, :, :3]
+
+
+
+def draw_points_on_image(img, points):
+ # img = Image.fromarray(np.array(image))
+ draw = ImageDraw.Draw(img)
+
+ for p in points:
+ x1, y1, _, x2, y2, _ = p
+
+ if x2 == 0 and y2 == 0:
+            # Point: cyan dot with a black outline
+ point_radius = 4
+ draw.ellipse(
+ (x1 - point_radius, y1 - point_radius, x1 + point_radius, y1 + point_radius),
+ fill="cyan", outline="black", width=1
+ )
+ else:
+            # Bounding Box: black rectangle outline
+ draw.rectangle([x1, y1, x2, y2], outline="black", width=3)
+
+ return img
+
+# @spaces.GPU(duration=10)
+def from_examples(raw_input, raw_image_points, canvas, seg_image_points, selected_points_text, camera_option, mask_bk):
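+    # Rebuild the full demo state from a cached example: recompute depth, parse the stored points,
+    # derive the camera pose (Traj2Cam when camera_option is "None", otherwise a preset motion),
+    # and redraw the prompt annotations on the raw and segmentation images.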
+
+    selected_points = ast.literal_eval(selected_points_text)
+    mask = np.array(mask_bk)
+    mask = mask[:, :, 0] > 0
+
+ image, _, depth, depth_img, colored_depth = process_image(raw_input, selected_points)
+
+ # get camera pose
+ if camera_option == "None":
+        # traj2cam
+        rescale = 1.0
+        camera_pose, camera_pose_vis, rescale, _ = traj2cam(selected_points, depth, rescale)
+ else:
+ rescale = 0.0
+ angle = 60
+ speed = 4.0
+ camera_pose, camera_pose_vis, rescale = get_camera_pose(CAMERA_MODE)(camera_option, depth, mask, rescale, angle, speed)
+
+ raw_image_points = ast.literal_eval(raw_image_points)
+ seg_image_points = ast.literal_eval(seg_image_points)
+
+ raw_image = draw_points_on_image(raw_input['image'], raw_image_points)
+ seg_image = draw_points_on_image(canvas['image'], seg_image_points)
+
+ return image, mask, depth, depth_img, colored_depth, camera_pose, \
+ camera_pose_vis, rescale, selected_points, \
+ gr.update(value={'image': raw_image, 'points': raw_image_points}), \
+        gr.update(value={'image': seg_image, 'points': seg_image_points})
+
+
# -------------- UI definition --------------
with gr.Blocks() as demo:
# layout definition
@@ -513,12 +590,16 @@ with gr.Blocks() as demo:
# with gr.Row():
# gr.Markdown("""# Repositioning the Subject within Image """)
mask = gr.State(value=None) # store mask
+ mask_bk = gr.Image(type="pil", label="Mask", show_label=True, interactive=False, visible=False)
+
removal_mask = gr.State(value=None) # store removal mask
selected_points = gr.State([]) # store points
selected_points_text = gr.Textbox(label="Selected Points", visible=False)
+ raw_image_points = gr.Textbox(label="Raw Image Points", visible=False)
+ seg_image_points = gr.Textbox(label="Segment Image Points", visible=False)
original_image = gr.State(value=None) # store original input image
- masked_original_image = gr.State(value=None) # store masked input image
+ # masked_original_image = gr.State(value=None) # store masked input image
mask_logits = gr.State(value=None) # store mask logits
depth = gr.State(value=None) # store depth
@@ -526,14 +607,22 @@ with gr.Blocks() as demo:
camera_pose = gr.State(value=None) # store camera pose
+ rescale = gr.Slider(minimum=0.0, maximum=10, step=0.1, value=1.0, label="Rescale", interactive=True, visible=False)
+ angle = gr.Slider(minimum=-360, maximum=360, step=1, value=60, label="Angle", interactive=True, visible=False)
+
+ seed = gr.Textbox(value = "42", label="Seed", interactive=True, visible=False)
+ scale_wise_masks = gr.Checkbox(label="Enable Scale-wise Masks", interactive=True, value=True, visible=False)
+ ds = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.25, label="ds", interactive=True, visible=False)
+ dt = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.1, label="dt", interactive=True, visible=False)
+
with gr.Column():
outlines = """
There are total 5 steps to complete the task.
- - Step 1: Input an image and Crop it to a suitable size;
+        - Step 1: Input an image, crop it to a suitable size, and obtain the depth;
- Step 2: Attain the subject mask;
- - Step 3: Get depth and Draw Trajectory;
- - Step 4: Get camera pose from trajectory or customize it;
+        - Step 3: Draw a trajectory on the depth map, or skip it to use a camera pose;
+        - Step 4: Select a camera pose, or skip;
- Step 5: Generate the final video.
"""
@@ -545,125 +634,92 @@ with gr.Blocks() as demo:
# Step 1: Input Image
step1_dec = """
Step 1: Input Image
- - Select the region using a bounding box, aiming for a ratio close to 320:576 (height:width).
- - All provided images in `Examples` are in 320 x 576 resolution. Simply press `Process` to proceed.
"""
step1 = gr.Markdown(step1_dec)
raw_input = ImagePrompter(type="pil", label="Raw Image", show_label=True, interactive=True)
- # left_up_point = gr.Textbox(value = "-1 -1", label="Left Up Point", interactive=True)
+
+ step1_notes = """
+ - Select the region using a bounding box, aiming for a ratio close to 320:576 (height:width).
+ - If the input is in 320 x 576, press `Process` directly.
+ """
+ notes = gr.Markdown(step1_notes)
+
process_button = gr.Button("Process")
with gr.Column():
# Step 2: Get Subject Mask
step2_dec = """
Step 2: Get Subject Mask
- - Use the bounding boxes or paints to select the subject.
- - Press `Segment Subject` to get the mask. Can be refined iteratively by updating points.
"""
step2 = gr.Markdown(step2_dec)
canvas = ImagePrompter(type="pil", label="Input Image", show_label=True, interactive=True) # for mask painting
+ step2_notes = """
+            - Use bounding boxes or points to select the subject.
+            - Press `Segment Subject` to get the mask. It can be refined iteratively by updating the points.
+ """
+ notes = gr.Markdown(step2_notes)
+
select_button = gr.Button("Segment Subject")
- with gr.Row():
- with gr.Column():
- mask_dec = """
- Mask Result
- - Just for visualization purpose. No need to interact.
- """
- mask_vis = gr.Markdown(mask_dec)
- mask_output = gr.Image(type="pil", label="Mask", show_label=True, interactive=False)
with gr.Column():
# Step 3: Get Depth and Draw Trajectory
step3_dec = """
- Step 3: Get Depth and Draw Trajectory
- - Press `Get Depth` to get the depth image.
- - Draw the trajectory by selecting points on the depth image. No more than 14 points.
- - Press `Undo point` to remove all points.
+ Step 3: Draw Trajectory on Depth or SKIP
+
"""
step3 = gr.Markdown(step3_dec)
depth_image = gr.Image(type="pil", label="Depth Image", show_label=True, interactive=False)
- with gr.Row():
- depth_button = gr.Button("Get Depth")
- undo_button = gr.Button("Undo point")
-
+
+ step3_dec = """
+            - Select points on the depth image. No more than 14 points.
+ - Press `Undo point` to remove all points. Press `Traj2Cam` to get camera poses.
+ """
+ notes = gr.Markdown(step3_dec)
+
+ undo_button = gr.Button("Undo point")
+ traj2cam_button = gr.Button("Traj2Cam")
+
with gr.Row():
+
with gr.Column():
# Step 4: Trajectory to Camera Pose or Get Camera Pose
step4_dec = """
- Step 4: Get camera pose from trajectory or customize it
- - Option 1: Transform the 2D trajectory to camera poses with depth. `Rescale` is used for depth alignment. Larger value can speed up the object motion.
- - Option 2: Rotate the camera with a specific `Angle`.
- - Option 3: Rotate the camera clockwise or counterclockwise with a specific `Angle`.
- - Option 4: Translate the camera with `Tx` (Pan Left/Right), `Ty` (Pan Up/Down), `Tz` (Zoom In/Out) and `Speed`.
+ Step 4: Get Customized Camera Poses or Skip
"""
step4 = gr.Markdown(step4_dec)
camera_pose_vis = gr.Plot(None, label='Camera Pose')
- with gr.Row():
- with gr.Column():
- speed = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.0, label="Speed", interactive=True)
- rescale = gr.Slider(minimum=0.0, maximum=10, step=0.1, value=1.0, label="Rescale", interactive=True)
- # traj2pose_button = gr.Button("Option1: Trajectory to Camera Pose")
-
- angle = gr.Slider(minimum=-360, maximum=360, step=1, value=60, label="Angle", interactive=True)
- # rotation_button = gr.Button("Option2: Rotate")
- # clockwise_button = gr.Button("Option3: Clockwise")
- with gr.Column():
-
- Tx = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tx", interactive=True)
- Ty = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Ty", interactive=True)
- Tz = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tz", interactive=True)
- # translation_button = gr.Button("Option4: Translate")
- with gr.Row():
- camera_option = gr.Radio(choices = CAMERA_MODE, label='Camera Options', value=CAMERA_MODE[0], interactive=True)
- with gr.Row():
- get_camera_pose_button = gr.Button("Get Camera Pose")
+ camera_option = gr.Radio(choices = CAMERA_MODE, label='Camera Options', value=CAMERA_MODE[0], interactive=True)
+ speed = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=4.0, label="Speed", interactive=True, visible=True)
with gr.Column():
# Step 5: Get the final generated video
step5_dec = """
Step 5: Get the final generated video
- - 3 modes for background: Fixed, Reverse, Free.
- - Enable Scale-wise Masks for better object control.
- - Option to enable Shared Warping Latents and set stop frequency for spatial (`ds`) and temporal (`dt`) dimensions. Larger stop frequency will lead to artifacts.
"""
step5 = gr.Markdown(step5_dec)
generated_video = gr.Video(None, label='Generated Video')
- with gr.Row():
- seed = gr.Textbox(value = "42", label="Seed", interactive=True)
- # num_inference_steps = gr.Slider(minimum=1, maximum=100, step=1, value=25, label="Number of Inference Steps", interactive=True)
- bg_mode = gr.Radio(choices = ["Fixed", "Reverse", "Free"], label="Background Mode", value="Fixed", interactive=True)
- # swl_mode = gr.Radio(choices = ["Enable SWL", "Disable SWL"], label="Shared Warping Latent", value="Disable SWL", interactive=True)
- scale_wise_masks = gr.Checkbox(label="Enable Scale-wise Masks", interactive=True, value=True)
- with gr.Row():
- with gr.Column():
- shared_wapring_latents = gr.Checkbox(label="Enable Shared Warping Latents", interactive=True)
- with gr.Column():
- ds = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="ds", interactive=True)
- dt = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="dt", interactive=True)
+ # with gr.Row():
+ bg_mode = gr.Radio(choices = ["Fixed", "Reverse", "Free"], label="Background Mode", value="Fixed", interactive=True)
+ shared_wapring_latents = gr.Checkbox(label="Enable Shared Warping Latents", interactive=True, value=False, visible=True)
generated_button = gr.Button("Generate")
+ get_mid_params_button = gr.Button("Get Mid Params")
# # event definition
process_button.click(
fn = process_image,
- inputs = [raw_input],
- outputs = [original_image, canvas]
+ inputs = [raw_input, selected_points],
+ outputs = [original_image, canvas, depth, depth_image, org_depth_image]
)
select_button.click(
segment,
[canvas, original_image, mask_logits],
- [mask, mask_output, masked_original_image, mask_logits]
- )
-
- depth_button.click(
- get_depth,
- [original_image, selected_points],
- [depth, depth_image, org_depth_image]
+ [mask, canvas, mask_logits]
)
depth_image.select(
@@ -677,9 +733,15 @@ with gr.Blocks() as demo:
[depth_image, selected_points]
)
- get_camera_pose_button.click(
+ traj2cam_button.click(
+ traj2cam,
+ [selected_points, depth, rescale],
+ [camera_pose, camera_pose_vis, rescale, camera_option]
+ )
+
+ camera_option.change(
get_camera_pose(CAMERA_MODE),
- [camera_option, selected_points, depth, mask, rescale, angle, Tx, Ty, Tz, speed],
+ [camera_option, depth, mask, rescale, angle, speed],
[camera_pose, camera_pose_vis, rescale]
)
@@ -701,35 +763,44 @@ with gr.Blocks() as demo:
],
[generated_video],
)
+
+ get_mid_params_button.click(
+ get_mid_params,
+ [raw_input, canvas, mask, selected_points, camera_option, bg_mode, shared_wapring_latents, generated_video]
+ )
+
+ ## Get examples
+ with open('./assets/examples/examples.json', 'r') as f:
+ examples = json.load(f)
+ print(examples)
+
+ # examples = [examples]
+ examples = [v for k, v in examples.items()]
gr.Examples(
examples=examples,
inputs=[
raw_input,
- rescale,
- speed,
- angle,
- Tx,
- Ty,
- Tz,
+ raw_image_points,
+ canvas,
+ seg_image_points,
+ mask_bk,
+ selected_points_text, # selected_points
camera_option,
bg_mode,
shared_wapring_latents,
- scale_wise_masks,
- ds,
- dt,
- seed,
- selected_points_text # selected_points
+ generated_video
],
- outputs=[generated_video],
- examples_per_page=10
+ examples_per_page=20
)
selected_points_text.change(
- sync_points,
- inputs=[selected_points_text],
- outputs=[selected_points]
+ from_examples,
+ inputs=[raw_input, raw_image_points, canvas, seg_image_points, selected_points_text, camera_option, mask_bk],
+ outputs=[original_image, mask, depth, depth_image, org_depth_image, camera_pose, camera_pose_vis, rescale, selected_points, raw_input, canvas]
)
+
+
gr.Markdown(article)
diff --git a/assets/examples/00010/generated_video.mp4 b/assets/examples/00010/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..26601e85dbd20fde53d38ef7d319eedbe32aebfa
Binary files /dev/null and b/assets/examples/00010/generated_video.mp4 differ
diff --git a/assets/examples/00010/mask.png b/assets/examples/00010/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..459f4ba3de4e946483b2b37cfa554372ab2599de
Binary files /dev/null and b/assets/examples/00010/mask.png differ
diff --git a/assets/examples/00010/raw_image.png b/assets/examples/00010/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..17b53a68038d4050676c28e60781f2109cb3beb1
Binary files /dev/null and b/assets/examples/00010/raw_image.png differ
diff --git a/assets/examples/00010/seg_image.png b/assets/examples/00010/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..ac23f6bb72cd980085e985c0e90792cb7f5787f3
Binary files /dev/null and b/assets/examples/00010/seg_image.png differ
diff --git a/assets/examples/00011/generated_video.mp4 b/assets/examples/00011/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..05d52334826bab5f997e59b706df6c024f3e3f1e
Binary files /dev/null and b/assets/examples/00011/generated_video.mp4 differ
diff --git a/assets/examples/00011/mask.png b/assets/examples/00011/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..459f4ba3de4e946483b2b37cfa554372ab2599de
Binary files /dev/null and b/assets/examples/00011/mask.png differ
diff --git a/assets/examples/00011/raw_image.png b/assets/examples/00011/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..17b53a68038d4050676c28e60781f2109cb3beb1
Binary files /dev/null and b/assets/examples/00011/raw_image.png differ
diff --git a/assets/examples/00011/seg_image.png b/assets/examples/00011/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..ac23f6bb72cd980085e985c0e90792cb7f5787f3
Binary files /dev/null and b/assets/examples/00011/seg_image.png differ
diff --git a/assets/examples/00012/generated_video.mp4 b/assets/examples/00012/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..eb4c2bc2838971aa5882d7b204d9fae9a1dfa939
Binary files /dev/null and b/assets/examples/00012/generated_video.mp4 differ
diff --git a/assets/examples/00012/mask.png b/assets/examples/00012/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..897f5c3ac8aa97a8eb73b8b13bdbb85dc0802aa3
Binary files /dev/null and b/assets/examples/00012/mask.png differ
diff --git a/assets/examples/00012/raw_image.png b/assets/examples/00012/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..94bcd57b36cc6b45676ddc825e784fe3553c7113
Binary files /dev/null and b/assets/examples/00012/raw_image.png differ
diff --git a/assets/examples/00012/seg_image.png b/assets/examples/00012/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..542e3361f0ff06c6e1790fe1db3c3f77e7147ec7
Binary files /dev/null and b/assets/examples/00012/seg_image.png differ
diff --git a/assets/examples/00013/generated_video.mp4 b/assets/examples/00013/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..930a037352192cfe7061e6aea292a67c980ca141
Binary files /dev/null and b/assets/examples/00013/generated_video.mp4 differ
diff --git a/assets/examples/00013/mask.png b/assets/examples/00013/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..e80536699170ae05ee81cfb8c01ac35d06fff750
Binary files /dev/null and b/assets/examples/00013/mask.png differ
diff --git a/assets/examples/00013/raw_image.png b/assets/examples/00013/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d395bcf430f1d34841f0ce41374dd6b7fc7e70b
Binary files /dev/null and b/assets/examples/00013/raw_image.png differ
diff --git a/assets/examples/00013/seg_image.png b/assets/examples/00013/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..0b6a1bbf24d6031c620a1864aab20c7cba346f92
Binary files /dev/null and b/assets/examples/00013/seg_image.png differ
diff --git a/assets/examples/00014/generated_video.mp4 b/assets/examples/00014/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..abe5fcb4e8cbc991adc78d2392063e806ef521e6
Binary files /dev/null and b/assets/examples/00014/generated_video.mp4 differ
diff --git a/assets/examples/00014/mask.png b/assets/examples/00014/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d
Binary files /dev/null and b/assets/examples/00014/mask.png differ
diff --git a/assets/examples/00014/raw_image.png b/assets/examples/00014/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82
Binary files /dev/null and b/assets/examples/00014/raw_image.png differ
diff --git a/assets/examples/00014/seg_image.png b/assets/examples/00014/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34
Binary files /dev/null and b/assets/examples/00014/seg_image.png differ
diff --git a/assets/examples/00015/generated_video.mp4 b/assets/examples/00015/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..31c3026c5ff862f37de37a785fc752cdbf0d9e94
Binary files /dev/null and b/assets/examples/00015/generated_video.mp4 differ
diff --git a/assets/examples/00015/mask.png b/assets/examples/00015/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d
Binary files /dev/null and b/assets/examples/00015/mask.png differ
diff --git a/assets/examples/00015/raw_image.png b/assets/examples/00015/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82
Binary files /dev/null and b/assets/examples/00015/raw_image.png differ
diff --git a/assets/examples/00015/seg_image.png b/assets/examples/00015/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34
Binary files /dev/null and b/assets/examples/00015/seg_image.png differ
diff --git a/assets/examples/00016/generated_video.mp4 b/assets/examples/00016/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..84430085d2e9f8898a051117a16f99e973cbc953
Binary files /dev/null and b/assets/examples/00016/generated_video.mp4 differ
diff --git a/assets/examples/00016/mask.png b/assets/examples/00016/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d
Binary files /dev/null and b/assets/examples/00016/mask.png differ
diff --git a/assets/examples/00016/raw_image.png b/assets/examples/00016/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82
Binary files /dev/null and b/assets/examples/00016/raw_image.png differ
diff --git a/assets/examples/00016/seg_image.png b/assets/examples/00016/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34
Binary files /dev/null and b/assets/examples/00016/seg_image.png differ
diff --git a/assets/examples/00017/generated_video.mp4 b/assets/examples/00017/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..5793d5bf0065781ce62e62246e2ad98c0ad78ed0
Binary files /dev/null and b/assets/examples/00017/generated_video.mp4 differ
diff --git a/assets/examples/00017/mask.png b/assets/examples/00017/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d
Binary files /dev/null and b/assets/examples/00017/mask.png differ
diff --git a/assets/examples/00017/raw_image.png b/assets/examples/00017/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82
Binary files /dev/null and b/assets/examples/00017/raw_image.png differ
diff --git a/assets/examples/00017/seg_image.png b/assets/examples/00017/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34
Binary files /dev/null and b/assets/examples/00017/seg_image.png differ
diff --git a/assets/examples/00018/generated_video.mp4 b/assets/examples/00018/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..7f71802934f9996ca9a68c48a44fe448b0d5a6e4
Binary files /dev/null and b/assets/examples/00018/generated_video.mp4 differ
diff --git a/assets/examples/00018/mask.png b/assets/examples/00018/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d
Binary files /dev/null and b/assets/examples/00018/mask.png differ
diff --git a/assets/examples/00018/raw_image.png b/assets/examples/00018/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82
Binary files /dev/null and b/assets/examples/00018/raw_image.png differ
diff --git a/assets/examples/00018/seg_image.png b/assets/examples/00018/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34
Binary files /dev/null and b/assets/examples/00018/seg_image.png differ
diff --git a/assets/examples/00019/generated_video.mp4 b/assets/examples/00019/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..240682e9d11236777b40c053bdef84383f1512b6
Binary files /dev/null and b/assets/examples/00019/generated_video.mp4 differ
diff --git a/assets/examples/00019/mask.png b/assets/examples/00019/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..c4fdda09358dbca78bd53fd90d8de3d8fbead3af
Binary files /dev/null and b/assets/examples/00019/mask.png differ
diff --git a/assets/examples/00019/raw_image.png b/assets/examples/00019/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..61a14083818696524a9cb7e6b15018f0bc8c198c
Binary files /dev/null and b/assets/examples/00019/raw_image.png differ
diff --git a/assets/examples/00019/seg_image.png b/assets/examples/00019/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..0f83f9e5e98c693ce0599e61cd1216f2204540c3
Binary files /dev/null and b/assets/examples/00019/seg_image.png differ
diff --git a/assets/examples/00020/generated_video.mp4 b/assets/examples/00020/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..0d2029da6871f23f514d46df6c676f86bedf8818
Binary files /dev/null and b/assets/examples/00020/generated_video.mp4 differ
diff --git a/assets/examples/00020/mask.png b/assets/examples/00020/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..2028408721baf056a0009bcf4289a7b4737c4879
Binary files /dev/null and b/assets/examples/00020/mask.png differ
diff --git a/assets/examples/00020/raw_image.png b/assets/examples/00020/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..61a14083818696524a9cb7e6b15018f0bc8c198c
Binary files /dev/null and b/assets/examples/00020/raw_image.png differ
diff --git a/assets/examples/00020/seg_image.png b/assets/examples/00020/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..3f70307348cc004e88e7552e000d3a5591b835cc
Binary files /dev/null and b/assets/examples/00020/seg_image.png differ
diff --git a/assets/examples/00021/generated_video.mp4 b/assets/examples/00021/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..01c4ca5b0ebeb80c3d5df93cb3bd315476ab7548
Binary files /dev/null and b/assets/examples/00021/generated_video.mp4 differ
diff --git a/assets/examples/00021/mask.png b/assets/examples/00021/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..75c5da4bb4a2780f5c38eebe0eaf4a8b31014ada
Binary files /dev/null and b/assets/examples/00021/mask.png differ
diff --git a/assets/examples/00021/raw_image.png b/assets/examples/00021/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..ece9435f244fd4468e0fa1c703c6aa5331a3ba2b
Binary files /dev/null and b/assets/examples/00021/raw_image.png differ
diff --git a/assets/examples/00021/seg_image.png b/assets/examples/00021/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..eef5849e3e730e1cb3342b4b52bcd54412e1ac56
Binary files /dev/null and b/assets/examples/00021/seg_image.png differ
diff --git a/assets/examples/00022/generated_video.mp4 b/assets/examples/00022/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..0f7e8f4b1ff2b9385cf43dcc765c08433e71f85f
Binary files /dev/null and b/assets/examples/00022/generated_video.mp4 differ
diff --git a/assets/examples/00022/mask.png b/assets/examples/00022/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e2b0f60eeecfadb656a7f4d3a1f7954467ee9bf
Binary files /dev/null and b/assets/examples/00022/mask.png differ
diff --git a/assets/examples/00022/raw_image.png b/assets/examples/00022/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..0196cc58626b2f55f92a7b3ba3fefcbe56a6cba1
Binary files /dev/null and b/assets/examples/00022/raw_image.png differ
diff --git a/assets/examples/00022/seg_image.png b/assets/examples/00022/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc9a17ad52a4eac0999f0964aeb5817150e5b0b7
Binary files /dev/null and b/assets/examples/00022/seg_image.png differ
diff --git a/assets/examples/00023/generated_video.mp4 b/assets/examples/00023/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..a90558dc9c294000103e182151d37e152a706686
Binary files /dev/null and b/assets/examples/00023/generated_video.mp4 differ
diff --git a/assets/examples/00023/mask.png b/assets/examples/00023/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..3f6032390a8dbaf0449f30ed0fce1f2b821995cc
Binary files /dev/null and b/assets/examples/00023/mask.png differ
diff --git a/assets/examples/00023/raw_image.png b/assets/examples/00023/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d395bcf430f1d34841f0ce41374dd6b7fc7e70b
Binary files /dev/null and b/assets/examples/00023/raw_image.png differ
diff --git a/assets/examples/00023/seg_image.png b/assets/examples/00023/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..ecbbf6cdf35162afd124ddaf39e56088bc8385d1
Binary files /dev/null and b/assets/examples/00023/seg_image.png differ
diff --git a/assets/examples/00024/generated_video.mp4 b/assets/examples/00024/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..71d64366becd3aa945374f9075cf064636d27384
Binary files /dev/null and b/assets/examples/00024/generated_video.mp4 differ
diff --git a/assets/examples/00024/mask.png b/assets/examples/00024/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..ea907a882489f68d9f808fc443b30fea82ab1465
Binary files /dev/null and b/assets/examples/00024/mask.png differ
diff --git a/assets/examples/00024/raw_image.png b/assets/examples/00024/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d395bcf430f1d34841f0ce41374dd6b7fc7e70b
Binary files /dev/null and b/assets/examples/00024/raw_image.png differ
diff --git a/assets/examples/00024/seg_image.png b/assets/examples/00024/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ed2128c07c7d3b6fee249a5ccab6d26a89f8e97
Binary files /dev/null and b/assets/examples/00024/seg_image.png differ
diff --git a/assets/examples/00025/generated_video.mp4 b/assets/examples/00025/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..4eec1e2901ba9509d680fbca789523a703fa2299
Binary files /dev/null and b/assets/examples/00025/generated_video.mp4 differ
diff --git a/assets/examples/00025/mask.png b/assets/examples/00025/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..92228d48e54de9c2a4108fa1e9ee294407d09584
Binary files /dev/null and b/assets/examples/00025/mask.png differ
diff --git a/assets/examples/00025/seg_image.png b/assets/examples/00025/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..a92a2cbdee2600bd5a5f04aa453a8a246c663289
Binary files /dev/null and b/assets/examples/00025/seg_image.png differ
diff --git a/assets/examples/00029/generated_video.mp4 b/assets/examples/00029/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..907e651c50713e2d7012b8e8a7517f05ec1ab7c8
Binary files /dev/null and b/assets/examples/00029/generated_video.mp4 differ
diff --git a/assets/examples/00029/mask.png b/assets/examples/00029/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..e20fabd9794876dd2fba1006185e0ccb5aa06bd4
Binary files /dev/null and b/assets/examples/00029/mask.png differ
diff --git a/assets/examples/00029/raw_image.png b/assets/examples/00029/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..27c1f9850f8b920095cae39fd2953a280ff3b2ff
Binary files /dev/null and b/assets/examples/00029/raw_image.png differ
diff --git a/assets/examples/00029/seg_image.png b/assets/examples/00029/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..d692d0e9f6e87e07f919d105f1bd85d6bb7d79b4
Binary files /dev/null and b/assets/examples/00029/seg_image.png differ
diff --git a/assets/examples/00030/generated_video.mp4 b/assets/examples/00030/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..dfaa99ecacf12e4fc7f16a8721e5e428abb4ebc7
Binary files /dev/null and b/assets/examples/00030/generated_video.mp4 differ
diff --git a/assets/examples/00030/mask.png b/assets/examples/00030/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..e20fabd9794876dd2fba1006185e0ccb5aa06bd4
Binary files /dev/null and b/assets/examples/00030/mask.png differ
diff --git a/assets/examples/00030/raw_image.png b/assets/examples/00030/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..27c1f9850f8b920095cae39fd2953a280ff3b2ff
Binary files /dev/null and b/assets/examples/00030/raw_image.png differ
diff --git a/assets/examples/00030/seg_image.png b/assets/examples/00030/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..d692d0e9f6e87e07f919d105f1bd85d6bb7d79b4
Binary files /dev/null and b/assets/examples/00030/seg_image.png differ
diff --git a/assets/examples/00031/generated_video.mp4 b/assets/examples/00031/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..6e57b81cb1009677ad3724a713a1f08e185c10c2
Binary files /dev/null and b/assets/examples/00031/generated_video.mp4 differ
diff --git a/assets/examples/00031/mask.png b/assets/examples/00031/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..e20fabd9794876dd2fba1006185e0ccb5aa06bd4
Binary files /dev/null and b/assets/examples/00031/mask.png differ
diff --git a/assets/examples/00031/raw_image.png b/assets/examples/00031/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..27c1f9850f8b920095cae39fd2953a280ff3b2ff
Binary files /dev/null and b/assets/examples/00031/raw_image.png differ
diff --git a/assets/examples/00031/seg_image.png b/assets/examples/00031/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..d692d0e9f6e87e07f919d105f1bd85d6bb7d79b4
Binary files /dev/null and b/assets/examples/00031/seg_image.png differ
diff --git a/assets/examples/00032/generated_video.mp4 b/assets/examples/00032/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..31c352cf6d70b66dbe1370192146d1cfcd5f51ad
Binary files /dev/null and b/assets/examples/00032/generated_video.mp4 differ
diff --git a/assets/examples/00032/mask.png b/assets/examples/00032/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..51d1da4d33b2f729bf75af91a4b740d842bf4795
Binary files /dev/null and b/assets/examples/00032/mask.png differ
diff --git a/assets/examples/00032/raw_image.png b/assets/examples/00032/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..06bb3b8d6cccb21af50745833efd0b462fc67215
Binary files /dev/null and b/assets/examples/00032/raw_image.png differ
diff --git a/assets/examples/00032/seg_image.png b/assets/examples/00032/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..f67652257c59a815b08802b5f641c96704fade77
Binary files /dev/null and b/assets/examples/00032/seg_image.png differ
diff --git a/assets/examples/00033/generated_video.mp4 b/assets/examples/00033/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..1e2ecfbb279b035daaa571aa1455c7f42224c011
Binary files /dev/null and b/assets/examples/00033/generated_video.mp4 differ
diff --git a/assets/examples/00033/mask.png b/assets/examples/00033/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..86786228a50a0bd4bce31dde45e28074bfa1bd0e
Binary files /dev/null and b/assets/examples/00033/mask.png differ
diff --git a/assets/examples/00033/raw_image.png b/assets/examples/00033/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..94e184c62148de8ac707dc6ab9d9161d7f57fac3
Binary files /dev/null and b/assets/examples/00033/raw_image.png differ
diff --git a/assets/examples/00033/seg_image.png b/assets/examples/00033/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..012cf5d1fc70661e8ac6bc846e3d5b80fa8c92aa
Binary files /dev/null and b/assets/examples/00033/seg_image.png differ
diff --git a/assets/examples/00034/generated_video.mp4 b/assets/examples/00034/generated_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..bbcbc7d70ace1439d7f0a5bd218bff70b7ea432d
Binary files /dev/null and b/assets/examples/00034/generated_video.mp4 differ
diff --git a/assets/examples/00034/mask.png b/assets/examples/00034/mask.png
new file mode 100644
index 0000000000000000000000000000000000000000..86786228a50a0bd4bce31dde45e28074bfa1bd0e
Binary files /dev/null and b/assets/examples/00034/mask.png differ
diff --git a/assets/examples/00034/raw_image.png b/assets/examples/00034/raw_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..94e184c62148de8ac707dc6ab9d9161d7f57fac3
Binary files /dev/null and b/assets/examples/00034/raw_image.png differ
diff --git a/assets/examples/00034/seg_image.png b/assets/examples/00034/seg_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..012cf5d1fc70661e8ac6bc846e3d5b80fa8c92aa
Binary files /dev/null and b/assets/examples/00034/seg_image.png differ
diff --git a/assets/examples/examples.json b/assets/examples/examples.json
new file mode 100644
index 0000000000000000000000000000000000000000..11f61212b9ffa5fa70028673b783312ae325701d
--- /dev/null
+++ b/assets/examples/examples.json
@@ -0,0 +1,20 @@
+{
+"00025": [{"image": "./assets/examples/00025/raw_image.png"}, "[[23.0, 797.0, 2.0, 2387.0, 2279.0, 3.0]]", {"image": "./assets/examples/00025/seg_image.png"}, "[[5.0, 22.0, 2.0, 167.0, 300.0, 3.0], [113.0, 282.0, 1.0, 0.0, 0.0, 4.0], [193.0, 301.0, 1.0, 0.0, 0.0, 4.0], [78.0, 270.0, 1.0, 0.0, 0.0, 4.0], [90.0, 274.0, 1.0, 0.0, 0.0, 4.0], [143.0, 259.0, 1.0, 0.0, 0.0, 4.0], [181.0, 292.0, 1.0, 0.0, 0.0, 4.0], [216.0, 306.0, 1.0, 0.0, 0.0, 4.0], [209.0, 288.0, 1.0, 0.0, 0.0, 4.0], [214.0, 312.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00025/mask.png", "[[103, 280], [101, 188]]", "None", "Fixed", true, "./assets/examples/00025/generated_video.mp4"],
+"00010": [{"image": "./assets/examples/00010/raw_image.png"}, "[]", {"image": "./assets/examples/00010/seg_image.png"}, "[[37.0, 29.0, 2.0, 502.0, 264.0, 3.0], [524.0, 194.0, 1.0, 0.0, 0.0, 4.0], [554.0, 203.0, 1.0, 0.0, 0.0, 4.0], [567.0, 205.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00010/mask.png", "[[183, 154], [327, 172]]", "None", "Fixed", true, "./assets/examples/00010/generated_video.mp4"],
+"00011": [{"image": "./assets/examples/00011/raw_image.png"}, "[]", {"image": "./assets/examples/00011/seg_image.png"}, "[[37.0, 29.0, 2.0, 502.0, 264.0, 3.0], [524.0, 194.0, 1.0, 0.0, 0.0, 4.0], [554.0, 203.0, 1.0, 0.0, 0.0, 4.0], [567.0, 205.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00011/mask.png", "[[340, 166], [184, 141]]", "None", "Fixed", true, "./assets/examples/00011/generated_video.mp4"],
+"00012": [{"image": "./assets/examples/00012/raw_image.png"}, "[]", {"image": "./assets/examples/00012/seg_image.png"}, "[[311.0, 174.0, 1.0, 0.0, 0.0, 4.0], [257.0, 144.0, 1.0, 0.0, 0.0, 4.0], [210.0, 121.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00012/mask.png", "[[202, 110], [134, 46]]", "None", "Fixed", true, "./assets/examples/00012/generated_video.mp4"],
+"00016": [{"image": "./assets/examples/00016/raw_image.png"}, "[]", {"image": "./assets/examples/00016/seg_image.png"}, "[[98.0, 245.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00016/mask.png", "[[28, 290], [34, 266], [49, 243], [84, 243]]", "None", "Free", true, "./assets/examples/00016/generated_video.mp4"],
+"00017": [{"image": "./assets/examples/00017/raw_image.png"}, "[]", {"image": "./assets/examples/00017/seg_image.png"}, "[[98.0, 245.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00017/mask.png", "[[28, 290], [34, 266], [49, 243], [84, 243]]", "None", "Reverse", true, "./assets/examples/00017/generated_video.mp4"],
+"00018": [{"image": "./assets/examples/00018/raw_image.png"}, "[]", {"image": "./assets/examples/00018/seg_image.png"}, "[[98.0, 245.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00018/mask.png", "[[28, 290], [34, 266], [49, 243], [84, 243]]", "None", "Fixed", true, "./assets/examples/00018/generated_video.mp4"],
+"00020": [{"image": "./assets/examples/00020/raw_image.png"}, "[]", {"image": "./assets/examples/00020/seg_image.png"}, "[[146.0, 34.0, 2.0, 499.0, 305.0, 3.0], [285.0, 260.0, 1.0, 0.0, 0.0, 4.0], [278.0, 308.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00020/mask.png", "[]", "Rotate60", "Fixed", false, "./assets/examples/00020/generated_video.mp4"],
+"00021": [{"image": "./assets/examples/00021/raw_image.png"}, "[]", {"image": "./assets/examples/00021/seg_image.png"}, "[[183.0, 5.0, 2.0, 345.0, 271.0, 3.0], [249.0, 234.0, 1.0, 0.0, 0.0, 4.0], [262.0, 5.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00021/mask.png", "[]", "Rotate60", "Fixed", false, "./assets/examples/00021/generated_video.mp4"],
+"00022": [{"image": "./assets/examples/00022/raw_image.png"}, "[]", {"image": "./assets/examples/00022/seg_image.png"}, "[[184.0, 174.0, 1.0, 0.0, 0.0, 4.0], [151.0, 174.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00022/mask.png", "[[151, 194], [197, 194], [246, 224]]", "Rotate60", "Fixed", true, "./assets/examples/00022/generated_video.mp4"],
+"00024": [{"image": "./assets/examples/00024/raw_image.png"}, "[]", {"image": "./assets/examples/00024/seg_image.png"}, "[[177.0, 11.0, 2.0, 414.0, 312.0, 3.0], [302.0, 32.0, 1.0, 0.0, 0.0, 4.0], [372.0, 224.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00024/mask.png", "[[297, 60], [425, 82], [233, 135], [396, 171]]", "None", "Fixed", false, "./assets/examples/00024/generated_video.mp4"],
+"00023": [{"image": "./assets/examples/00023/raw_image.png"}, "[]", {"image": "./assets/examples/00023/seg_image.png"}, "[[191.0, 7.0, 2.0, 404.0, 312.0, 3.0], [369.0, 228.0, 1.0, 0.0, 0.0, 4.0], [321.0, 210.0, 1.0, 0.0, 0.0, 4.0], [297.0, 195.0, 1.0, 0.0, 0.0, 4.0], [297.0, 248.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00023/mask.png", "[]", "Rotate60", "Fixed", false, "./assets/examples/00023/generated_video.mp4"],
+"00031": [{"image": "./assets/examples/00031/raw_image.png"}, "[]", {"image": "./assets/examples/00031/seg_image.png"}, "[[1.0, 4.0, 2.0, 562.0, 181.0, 3.0], [102.0, 63.0, 1.0, 0.0, 0.0, 4.0], [86.0, 148.0, 1.0, 0.0, 0.0, 4.0], [383.0, 142.0, 1.0, 0.0, 0.0, 4.0], [520.0, 150.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00031/mask.png", "[]", "ZoomIn", "Fixed", false, "./assets/examples/00031/generated_video.mp4"],
+"00029": [{"image": "./assets/examples/00029/raw_image.png"}, "[]", {"image": "./assets/examples/00029/seg_image.png"}, "[[1.0, 4.0, 2.0, 562.0, 181.0, 3.0], [102.0, 63.0, 1.0, 0.0, 0.0, 4.0], [86.0, 148.0, 1.0, 0.0, 0.0, 4.0], [383.0, 142.0, 1.0, 0.0, 0.0, 4.0], [520.0, 150.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00029/mask.png", "[]", "PanLeft", "Fixed", false, "./assets/examples/00029/generated_video.mp4"],
+"00030": [{"image": "./assets/examples/00030/raw_image.png"}, "[]", {"image": "./assets/examples/00030/seg_image.png"}, "[[1.0, 4.0, 2.0, 562.0, 181.0, 3.0], [102.0, 63.0, 1.0, 0.0, 0.0, 4.0], [86.0, 148.0, 1.0, 0.0, 0.0, 4.0], [383.0, 142.0, 1.0, 0.0, 0.0, 4.0], [520.0, 150.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00030/mask.png", "[]", "PanRight", "Fixed", false, "./assets/examples/00030/generated_video.mp4"],
+"00033": [{"image": "./assets/examples/00033/raw_image.png"}, "[]", {"image": "./assets/examples/00033/seg_image.png"}, "[[187.0, 34.0, 2.0, 321.0, 171.0, 3.0], [239.0, 117.0, 1.0, 0.0, 0.0, 4.0], [257.0, 138.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00033/mask.png", "[]", "Anti-CW", "Fixed", true, "./assets/examples/00033/generated_video.mp4"],
+"00034": [{"image": "./assets/examples/00034/raw_image.png"}, "[]", {"image": "./assets/examples/00034/seg_image.png"}, "[[187.0, 34.0, 2.0, 321.0, 171.0, 3.0], [239.0, 117.0, 1.0, 0.0, 0.0, 4.0], [257.0, 138.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00034/mask.png", "[]", "ClockWise", "Fixed", true, "./assets/examples/00034/generated_video.mp4"]
+}
+
diff --git a/objctrl_2_5d/utils/ui_utils.py b/objctrl_2_5d/utils/ui_utils.py
index 4e8af59434e091345be4337e1f9988b767d82e1a..811c99abc273a4276210f3b33db4d7336972c371 100644
--- a/objctrl_2_5d/utils/ui_utils.py
+++ b/objctrl_2_5d/utils/ui_utils.py
@@ -9,6 +9,7 @@ from objctrl_2_5d.utils.vis_camera import vis_camera_rescale
from objctrl_2_5d.utils.objmask_util import trajectory_to_camera_poses_v1
from objctrl_2_5d.utils.customized_cam import rotation, clockwise, pan_and_zoom
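+# Camera motion presets; traj2cam() resets the selector back to CAMERA_MODE[0] ("None").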
+CAMERA_MODE = ["None", "ZoomIn", "ZoomOut", "PanRight", "PanLeft", "TiltUp", "TiltDown", "ClockWise", "Anti-CW", "Rotate60"]
zc_threshold = 0.2
depth_scale_ = 5.2
@@ -29,8 +30,6 @@ def process_image(raw_image):
image, points = raw_image['image'], raw_image['points']
- print(points)
-
try:
assert(len(points)) == 1, "Please select only one point"
[x1, y1, _, x2, y2, _] = points[0]
@@ -88,7 +87,10 @@ def get_points(img,
# draw an arrow from handle point to target point
# if len(points) == idx + 1:
if idx > 0:
- cv2.arrowedLine(img, points[idx-1], points[idx], (255, 255, 255), 4, tipLength=0.5)
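+ # Scale tipLength (a fraction of the segment) so the arrow head stays roughly 10 px long.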
+ line_length = np.sqrt((points[idx][0] - points[idx-1][0])**2 + (points[idx][1] - points[idx-1][1])**2)
+ arrow_head_length = 10
+ tip_length = arrow_head_length / max(line_length, 1e-6) # guard against zero-length segments
+ cv2.arrowedLine(img, points[idx-1], points[idx], (0, 255, 0), 4, tipLength=tip_length)
# points = []
return img if isinstance(img, np.ndarray) else np.array(img), sel_pix
@@ -113,6 +115,9 @@ def interpolate_points(points, num_points):
def traj2cam(traj, depth, rescale):
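+ # No trajectory drawn yet: return empty pose outputs and reset the camera preset to "None".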
+ if len(traj) == 0:
+ return None, None, 0.0, gr.update(value=CAMERA_MODE[0])
+
traj = np.array(traj)
trajectory = interpolate_points(traj, num_frames)
@@ -148,13 +153,13 @@ def traj2cam(traj, depth, rescale):
RTs = traj_w2c[:, :3]
fig = vis_camera_rescale(RTs)
- return RTs, fig, rescale
+ return RTs, fig, rescale, gr.update(value=CAMERA_MODE[0])
def get_rotate_cam(angle, depth):
# mean_depth = np.mean(depth * mask)
center_h_margin, center_w_margin = center_margin, center_margin
depth_center = np.mean(depth[height//2-center_h_margin:height//2+center_h_margin, width//2-center_w_margin:width//2+center_w_margin])
- print(f'rotate depth_center: {depth_center}')
+ # print(f'rotate depth_center: {depth_center}')
RTs = rotation(num_frames, angle, depth_center, depth_center)
fig = vis_camera_rescale(RTs)
@@ -162,47 +167,128 @@ def get_rotate_cam(angle, depth):
return RTs, fig
def get_clockwise_cam(angle, depth, mask):
- mask = mask.astype(np.float32) # [0, 1]
- mean_depth = np.mean(depth * mask)
+ # mask = mask.astype(np.float32) # [0, 1]
+ # mean_depth = np.mean(depth * mask)
# center_h_margin, center_w_margin = center_margin, center_margin
# depth_center = np.mean(depth[height//2-center_h_margin:height//2+center_h_margin, width//2-center_w_margin:width//2+center_w_margin])
RTs = clockwise(angle, num_frames)
- RTs[:, -1, -1] = mean_depth
+ # RTs[:, -1, -1] = mean_depth
fig = vis_camera_rescale(RTs)
return RTs, fig
def get_translate_cam(Tx, Ty, Tz, depth, mask, speed):
- mask = mask.astype(np.float32) # [0, 1]
+ # mask = mask.astype(np.float32) # [0, 1]
- mean_depth = np.mean(depth * mask)
+ # mean_depth = np.mean(depth * mask)
T = np.array([Tx, Ty, Tz])
T = T.reshape(3, 1)
T = T[None, ...].repeat(num_frames, axis=0)
RTs = pan_and_zoom(T, speed, n=num_frames)
- RTs[:, -1, -1] += mean_depth
+ # RTs[:, -1, -1] += mean_depth
fig = vis_camera_rescale(RTs)
return RTs, fig
+
def get_camera_pose(camera_mode):
- def trigger_camera_pose(camera_option, selected_points, depth, mask, rescale, angle, Tx, Ty, Tz, speed):
- if camera_option == camera_mode[0]: # traj2cam
- RTs, fig, rescale = traj2cam(selected_points, depth, rescale)
- elif camera_option == camera_mode[1]: # rotate
- RTs, fig = get_rotate_cam(angle, depth)
- rescale = 0.0
- elif camera_option == camera_mode[2]: # clockwise
+ # camera_mode = ["None", "ZoomIn", "ZoomOut", "PanLeft", "PanRight", "TiltUp", "TiltDown", "ClockWise", "Anti-CW", "Rotate60"]
+ def trigger_camera_pose(camera_option, depth, mask, rescale, angle, speed):
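+ # Dispatch on the selected preset: translations for zoom/pan/tilt, in-plane rotation for (anti-)clockwise, and an orbiting camera for Rotate60.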
+ if camera_option == camera_mode[0]: # None
+ RTs = None
+ fig = None
+ elif camera_option == camera_mode[1]: # ZoomIn
+ RTs, fig = get_translate_cam(0, 0, -1, depth, mask, speed)
+
+ elif camera_option == camera_mode[2]: # ZoomOut
+ RTs, fig = get_translate_cam(0, 0, 1, depth, mask, speed)
+
+ elif camera_option == camera_mode[3]: # PanLeft
+ RTs, fig = get_translate_cam(-1, 0, 0, depth, mask, speed)
+
+ elif camera_option == camera_mode[4]: # PanRight
+ RTs, fig = get_translate_cam(1, 0, 0, depth, mask, speed)
+
+ elif camera_option == camera_mode[5]: # TiltUp
+ RTs, fig = get_translate_cam(0, 1, 0, depth, mask, speed)
+
+ elif camera_option == camera_mode[6]: # TiltDown
+ RTs, fig = get_translate_cam(0, -1, 0, depth, mask, speed)
+
+ elif camera_option == camera_mode[7]: # ClockWise
+ RTs, fig = get_clockwise_cam(-angle, depth, mask)
+
+ elif camera_option == camera_mode[8]: # Anti-CW
RTs, fig = get_clockwise_cam(angle, depth, mask)
- rescale = 0.0
- elif camera_option == camera_mode[3]: # translate
- RTs, fig = get_translate_cam(Tx, Ty, Tz, depth, mask, speed)
- rescale = 0.0
+
+ else: # Rotate60
+ RTs, fig = get_rotate_cam(angle, depth)
+ rescale = 0.0
return RTs, fig, rescale
return trigger_camera_pose
+
+import os
+import shutil
+from glob import glob
+import json
+
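+# Save the current UI state (raw/segmented images, mask, trajectory points, camera option, background mode, generated video)
+# into a new numbered example folder and append its record to examples.json.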
+def get_mid_params(raw_input, canvas, mask, selected_points, camera_option, bg_mode, shared_wapring_latents, generated_video):
+ output_dir = "./assets/examples"
+ os.makedirs(output_dir, exist_ok=True)
+
+ # folders = sorted(glob(output_dir + "/*"))
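+ # New example id: one greater than the largest existing numeric folder name (0 if none exist).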
+ folders = os.listdir(output_dir)
+ folders = [int(folder) for folder in folders if os.path.isdir(os.path.join(output_dir, folder))]
+ num = sorted(folders)[-1] + 1 if folders else 0
+
+ fout = open(os.path.join(output_dir, 'examples.json'), 'a+')
+
+ cur_folder = os.path.join(output_dir, f'{num:05d}')
+ os.makedirs(cur_folder, exist_ok=True)
+
+ raw_image = raw_input['image']
+ raw_points = raw_input['points']
+ seg_image = canvas['image']
+ seg_points = canvas['points']
+
+ mask = Image.fromarray(mask)
+ mask_path = os.path.join(cur_folder, 'mask.png')
+ mask.save(mask_path)
+
+ raw_image_path = os.path.join(cur_folder, 'raw_image.png')
+ seg_image_path = os.path.join(cur_folder, 'seg_image.png')
+
+ raw_image.save(raw_image_path)
+ seg_image.save(seg_image_path)
+
+ gen_path = os.path.join(cur_folder, 'generated_video.mp4')
+ # Copy the generated video into the example folder; shutil avoids shell quoting issues.
+ shutil.copy(generated_video, gen_path)
+
+ # data = [{'image': raw_image_path, 'points': raw_points},
+ # {'image': seg_image_path, 'points': seg_points},
+ # mask_path,
+ # str(selected_points),
+ # camera_option,
+ # bg_mode,
+ # gen_path]
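+ # Record layout mirrors the entries in examples.json: raw image, raw points, segmented image, segmentation points,
+ # mask path, trajectory points, camera option, background mode, shared-latents flag, and the generated video path.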
+ data = {f'{num:05d}': [{'image': raw_image_path},
+ str(raw_points),
+ {'image': seg_image_path},
+ str(seg_points),
+ mask_path,
+ str(selected_points),
+ camera_option,
+ bg_mode,
+ shared_wapring_latents,
+ gen_path]}
+ fout.write(json.dumps(data) + '\n')
+
+ fout.close()
+
+
\ No newline at end of file