diff --git a/app copy.py b/app copy.py new file mode 100644 index 0000000000000000000000000000000000000000..3def2652ab03477f2e1b9c0f9c14e9c5c61122a1 --- /dev/null +++ b/app copy.py @@ -0,0 +1,740 @@ +try: + import spaces +except: + pass + +import os +import gradio as gr + +import torch +from gradio_image_prompter import ImagePrompter +from sam2.sam2_image_predictor import SAM2ImagePredictor +from omegaconf import OmegaConf +from PIL import Image +import numpy as np +from copy import deepcopy +import cv2 + +import torch.nn.functional as F +import torchvision +from einops import rearrange +import tempfile + +from objctrl_2_5d.utils.ui_utils import process_image, get_camera_pose, get_subject_points, get_points, undo_points, mask_image +from ZoeDepth.zoedepth.utils.misc import colorize + +from cameractrl.inference import get_pipeline +from objctrl_2_5d.utils.examples import examples, sync_points + +from objctrl_2_5d.utils.objmask_util import RT2Plucker, Unprojected, roll_with_ignore_multidim, dilate_mask_pytorch +from objctrl_2_5d.utils.filter_utils import get_freq_filter, freq_mix_3d + + +### Title and Description ### +#### Description #### +title = r"""

ObjCtrl-2.5D: Training-free Object Control with Camera Poses

""" +# subtitle = r"""

Deployed on SVD Generation

""" +important_link = r""" +
+ [Paper] +  [Project Page] +  [Code] +
+""" + +authors = r""" +
+ Zhouxia Wang +  Yushi Lan +  Shangchen Zhou +  Chen Change Loy +
+""" + +affiliation = r""" +
+ S-Lab, NTU Singapore +
+""" + +description = r""" +Official Gradio demo for ObjCtrl-2.5D: Training-free Object Control with Camera Poses.
+🔥 ObjCtrl-2.5D enables object motion control in an I2V-generated video by transforming 2D trajectories to 3D using depth and subsequently converting them into camera poses, +thereby leveraging the existing camera motion control module for object motion control without requiring additional training.
+""" + +article = r""" +If ObjCtrl2.5D is helpful, please help to ⭐ the Github Repo. Thanks! +[![GitHub Stars](https://img.shields.io/github/stars/TencentARC%2FMotionCtrl +)](https://github.com/TencentARC/MotionCtrl) + +--- + +📝 **Citation** +
+If our work is useful for your research, please consider citing: +```bibtex +@inproceedings{wang2024motionctrl, + title={Motionctrl: A unified and flexible motion controller for video generation}, + author={Wang, Zhouxia and Yuan, Ziyang and Wang, Xintao and Li, Yaowei and Chen, Tianshui and Xia, Menghan and Luo, Ping and Shan, Ying}, + booktitle={ACM SIGGRAPH 2024 Conference Papers}, + pages={1--11}, + year={2024} +} +``` + +📧 **Contact** +
+If you have any questions, please feel free to reach me out at zhouzi1212@gmail.com. + +""" + +# -------------- initialization -------------- + +CAMERA_MODE = ["Traj2Cam", "Rotate", "Clockwise", "Translate"] + +# select the device for computation +if torch.cuda.is_available(): + device = torch.device("cuda") +elif torch.backends.mps.is_available(): + device = torch.device("mps") +else: + device = torch.device("cpu") +print(f"using device: {device}") + +# segmentation model +segmentor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny", cache_dir="ckpt", device=device) + +# depth model +d_model_NK = torch.hub.load('./ZoeDepth', 'ZoeD_NK', source='local', pretrained=True).to(device) + +# cameractrl model +config = "configs/svd_320_576_cameractrl.yaml" +model_id = "stabilityai/stable-video-diffusion-img2vid" +ckpt = "checkpoints/CameraCtrl_svd.ckpt" +if not os.path.exists(ckpt): + os.makedirs("checkpoints", exist_ok=True) + os.system("wget -c https://huggingface.co/hehao13/CameraCtrl_SVD_ckpts/resolve/main/CameraCtrl_svd.ckpt?download=true") + os.system("mv CameraCtrl_svd.ckpt?download=true checkpoints/CameraCtrl_svd.ckpt") +model_config = OmegaConf.load(config) + + +pipeline = get_pipeline(model_id, "unet", model_config['down_block_types'], model_config['up_block_types'], + model_config['pose_encoder_kwargs'], model_config['attention_processor_kwargs'], + ckpt, True, device) + +# segmentor = None +# d_model_NK = None +# pipeline = None + +### run the demo ## +# @spaces.GPU(duration=5) +def segment(canvas, image, logits): + if logits is not None: + logits *= 32.0 + _, points = get_subject_points(canvas) + image = np.array(image) + + with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16): + segmentor.set_image(image) + input_points = [] + input_boxes = [] + for p in points: + [x1, y1, _, x2, y2, _] = p + if x2==0 and y2==0: + input_points.append([x1, y1]) + else: + input_boxes.append([x1, y1, x2, y2]) + if len(input_points) == 0: + input_points = None + input_labels = None + else: + input_points = np.array(input_points) + input_labels = np.ones(len(input_points)) + if len(input_boxes) == 0: + input_boxes = None + else: + input_boxes = np.array(input_boxes) + masks, _, logits = segmentor.predict( + point_coords=input_points, + point_labels=input_labels, + box=input_boxes, + multimask_output=False, + return_logits=True, + mask_input=logits, + ) + mask = masks > 0 + masked_img = mask_image(image, mask[0], color=[252, 140, 90], alpha=0.9) + masked_img = Image.fromarray(masked_img) + + return mask[0], masked_img, masked_img, logits / 32.0 + +# @spaces.GPU(duration=5) +def get_depth(image, points): + + depth = d_model_NK.infer_pil(image) + colored_depth = colorize(depth, cmap='gray_r') # [h, w, 4] 0-255 + + depth_img = deepcopy(colored_depth[:, :, :3]) + if len(points) > 0: + for idx, point in enumerate(points): + if idx % 2 == 0: + cv2.circle(depth_img, tuple(point), 10, (255, 0, 0), -1) + else: + cv2.circle(depth_img, tuple(point), 10, (0, 0, 255), -1) + if idx > 0: + cv2.arrowedLine(depth_img, points[idx-1], points[idx], (255, 255, 255), 4, tipLength=0.5) + + return depth, depth_img, colored_depth[:, :, :3] + + +# @spaces.GPU(duration=80) +def run_objctrl_2_5d(condition_image, + mask, + depth, + RTs, + bg_mode, + shared_wapring_latents, + scale_wise_masks, + rescale, + seed, + ds, dt, + num_inference_steps=25): + + DEBUG = False + + if DEBUG: + cur_OUTPUT_PATH = 'outputs/tmp' + os.makedirs(cur_OUTPUT_PATH, exist_ok=True) + + # num_inference_steps=25 + 
min_guidance_scale = 1.0 + max_guidance_scale = 3.0 + + area_ratio = 0.3 + depth_scale_ = 5.2 + center_margin = 10 + + height, width = 320, 576 + num_frames = 14 + + intrinsics = np.array([[float(width), float(width), float(width) / 2, float(height) / 2]]) + intrinsics = np.repeat(intrinsics, num_frames, axis=0) # [n_frame, 4] + fx = intrinsics[0, 0] / width + fy = intrinsics[0, 1] / height + cx = intrinsics[0, 2] / width + cy = intrinsics[0, 3] / height + + down_scale = 8 + H, W = height // down_scale, width // down_scale + K = np.array([[width / down_scale, 0, W / 2], [0, width / down_scale, H / 2], [0, 0, 1]]) + + seed = int(seed) + + center_h_margin, center_w_margin = center_margin, center_margin + depth_center = np.mean(depth[height//2-center_h_margin:height//2+center_h_margin, width//2-center_w_margin:width//2+center_w_margin]) + + if rescale > 0: + depth_rescale = round(depth_scale_ * rescale / depth_center, 2) + else: + depth_rescale = 1.0 + + depth = depth * depth_rescale + + depth_down = F.interpolate(torch.tensor(depth).unsqueeze(0).unsqueeze(0), + (H, W), mode='bilinear', align_corners=False).squeeze().numpy() # [H, W] + + ## latent + generator = torch.Generator() + generator.manual_seed(seed) + + latents_org = pipeline.prepare_latents( + 1, + 14, + 8, + height, + width, + pipeline.dtype, + device, + generator, + None, + ) + latents_org = latents_org / pipeline.scheduler.init_noise_sigma + + cur_plucker_embedding, _, _ = RT2Plucker(RTs, RTs.shape[0], (height, width), fx, fy, cx, cy) # 6, V, H, W + cur_plucker_embedding = cur_plucker_embedding.to(device) + cur_plucker_embedding = cur_plucker_embedding[None, ...] # b 6 f h w + cur_plucker_embedding = cur_plucker_embedding.permute(0, 2, 1, 3, 4) # b f 6 h w + cur_plucker_embedding = cur_plucker_embedding[:, :num_frames, ...] + cur_pose_features = pipeline.pose_encoder(cur_plucker_embedding) + + # bg_mode = ["Fixed", "Reverse", "Free"] + if bg_mode == "Fixed": + fix_RTs = np.repeat(RTs[0][None, ...], num_frames, axis=0) # [n_frame, 4, 3] + fix_plucker_embedding, _, _ = RT2Plucker(fix_RTs, num_frames, (height, width), fx, fy, cx, cy) # 6, V, H, W + fix_plucker_embedding = fix_plucker_embedding.to(device) + fix_plucker_embedding = fix_plucker_embedding[None, ...] # b 6 f h w + fix_plucker_embedding = fix_plucker_embedding.permute(0, 2, 1, 3, 4) # b f 6 h w + fix_plucker_embedding = fix_plucker_embedding[:, :num_frames, ...] + fix_pose_features = pipeline.pose_encoder(fix_plucker_embedding) + + elif bg_mode == "Reverse": + bg_plucker_embedding, _, _ = RT2Plucker(RTs[::-1], RTs.shape[0], (height, width), fx, fy, cx, cy) # 6, V, H, W + bg_plucker_embedding = bg_plucker_embedding.to(device) + bg_plucker_embedding = bg_plucker_embedding[None, ...] # b 6 f h w + bg_plucker_embedding = bg_plucker_embedding.permute(0, 2, 1, 3, 4) # b f 6 h w + bg_plucker_embedding = bg_plucker_embedding[:, :num_frames, ...] + fix_pose_features = pipeline.pose_encoder(bg_plucker_embedding) + + else: + fix_pose_features = None + + #### preparing mask + + mask = Image.fromarray(mask) + mask = mask.resize((W, H)) + mask = np.array(mask).astype(np.float32) + mask = np.expand_dims(mask, axis=-1) + + # visulize mask + if DEBUG: + mask_sum_vis = mask[..., 0] + mask_sum_vis = (mask_sum_vis * 255.0).astype(np.uint8) + mask_sum_vis = Image.fromarray(mask_sum_vis) + + mask_sum_vis.save(f'{cur_OUTPUT_PATH}/org_mask.png') + + try: + warped_masks = Unprojected(mask, depth_down, RTs, H=H, W=W, K=K) + + warped_masks.insert(0, mask) + + except: + # mask to bbox + print(f'!!! 
Mask is too small to warp; mask to bbox') + mask = mask[:, :, 0] + coords = cv2.findNonZero(mask) + x, y, w, h = cv2.boundingRect(coords) + # mask[y:y+h, x:x+w] = 1.0 + + center_x, center_y = x + w // 2, y + h // 2 + center_z = depth_down[center_y, center_x] + + # RTs [n_frame, 3, 4] to [n_frame, 4, 4] , add [0, 0, 0, 1] + RTs = np.concatenate([RTs, np.array([[[0, 0, 0, 1]]] * num_frames)], axis=1) + + # RTs: world to camera + P0 = np.array([center_x, center_y, 1]) + Pc0 = np.linalg.inv(K) @ P0 * center_z + pw = np.linalg.inv(RTs[0]) @ np.array([Pc0[0], Pc0[1], center_z, 1]) # [4] + + P = [np.array([center_x, center_y])] + for i in range(1, num_frames): + Pci = RTs[i] @ pw + Pi = K @ Pci[:3] / Pci[2] + P.append(Pi[:2]) + + warped_masks = [mask] + for i in range(1, num_frames): + shift_x = int(round(P[i][0] - P[0][0])) + shift_y = int(round(P[i][1] - P[0][1])) + + cur_mask = roll_with_ignore_multidim(mask, [shift_y, shift_x]) + warped_masks.append(cur_mask) + + + warped_masks = [v[..., None] for v in warped_masks] + + warped_masks = np.stack(warped_masks, axis=0) # [f, h, w] + warped_masks = np.repeat(warped_masks, 3, axis=-1) # [f, h, w, 3] + + mask_sum = np.sum(warped_masks, axis=0, keepdims=True) # [1, H, W, 3] + mask_sum[mask_sum > 1.0] = 1.0 + mask_sum = mask_sum[0,:,:, 0] + + if DEBUG: + ## visulize warp mask + warp_masks_vis = torch.tensor(warped_masks) + warp_masks_vis = (warp_masks_vis * 255.0).to(torch.uint8) + torchvision.io.write_video(f'{cur_OUTPUT_PATH}/warped_masks.mp4', warp_masks_vis, fps=10, video_codec='h264', options={'crf': '10'}) + + # visulize mask + mask_sum_vis = mask_sum + mask_sum_vis = (mask_sum_vis * 255.0).astype(np.uint8) + mask_sum_vis = Image.fromarray(mask_sum_vis) + + mask_sum_vis.save(f'{cur_OUTPUT_PATH}/merged_mask.png') + + if scale_wise_masks: + min_area = H * W * area_ratio # cal in downscale + non_zero_len = mask_sum.sum() + + print(f'non_zero_len: {non_zero_len}, min_area: {min_area}') + + if non_zero_len > min_area: + kernel_sizes = [1, 1, 1, 3] + elif non_zero_len > min_area * 0.5: + kernel_sizes = [3, 1, 1, 5] + else: + kernel_sizes = [5, 3, 3, 7] + else: + kernel_sizes = [1, 1, 1, 1] + + mask = torch.from_numpy(mask_sum) # [h, w] + mask = mask[None, None, ...] # [1, 1, h, w] + mask = F.interpolate(mask, (height, width), mode='bilinear', align_corners=False) # [1, 1, H, W] + # mask = mask.repeat(1, num_frames, 1, 1) # [1, f, H, W] + mask = mask.to(pipeline.dtype).to(device) + + ##### Mask End ###### + + ### Got blending pose features Start ### + + pose_features = [] + for i in range(0, len(cur_pose_features)): + kernel_size = kernel_sizes[i] + h, w = cur_pose_features[i].shape[-2:] + + if fix_pose_features is None: + pose_features.append(torch.zeros_like(cur_pose_features[i])) + else: + pose_features.append(fix_pose_features[i]) + + cur_mask = F.interpolate(mask, (h, w), mode='bilinear', align_corners=False) + cur_mask = dilate_mask_pytorch(cur_mask, kernel_size=kernel_size) # [1, 1, H, W] + cur_mask = cur_mask.repeat(1, num_frames, 1, 1) # [1, f, H, W] + + if DEBUG: + # visulize mask + mask_vis = cur_mask[0, 0].cpu().numpy() * 255.0 + mask_vis = Image.fromarray(mask_vis.astype(np.uint8)) + mask_vis.save(f'{cur_OUTPUT_PATH}/mask_k{kernel_size}_scale{i}.png') + + cur_mask = cur_mask[None, ...] 
# [1, 1, f, H, W] + pose_features[-1] = cur_pose_features[i] * cur_mask + pose_features[-1] * (1 - cur_mask) + + ### Got blending pose features End ### + + ##### Warp Noise Start ###### + + if shared_wapring_latents: + noise = latents_org[0, 0].data.cpu().numpy().copy() #[14, 4, 40, 72] + noise = np.transpose(noise, (1, 2, 0)) # [40, 72, 4] + + try: + warp_noise = Unprojected(noise, depth_down, RTs, H=H, W=W, K=K) + warp_noise.insert(0, noise) + except: + print(f'!!! Noise is too small to warp; mask to bbox') + + warp_noise = [noise] + for i in range(1, num_frames): + shift_x = int(round(P[i][0] - P[0][0])) + shift_y = int(round(P[i][1] - P[0][1])) + + cur_noise= roll_with_ignore_multidim(noise, [shift_y, shift_x]) + warp_noise.append(cur_noise) + + warp_noise = np.stack(warp_noise, axis=0) # [f, h, w, 4] + + if DEBUG: + ## visulize warp noise + warp_noise_vis = torch.tensor(warp_noise)[..., :3] * torch.tensor(warped_masks) + warp_noise_vis = (warp_noise_vis - warp_noise_vis.min()) / (warp_noise_vis.max() - warp_noise_vis.min()) + warp_noise_vis = (warp_noise_vis * 255.0).to(torch.uint8) + + torchvision.io.write_video(f'{cur_OUTPUT_PATH}/warp_noise.mp4', warp_noise_vis, fps=10, video_codec='h264', options={'crf': '10'}) + + + warp_latents = torch.tensor(warp_noise).permute(0, 3, 1, 2).to(latents_org.device).to(latents_org.dtype) # [frame, 4, H, W] + warp_latents = warp_latents.unsqueeze(0) # [1, frame, 4, H, W] + + warped_masks = torch.tensor(warped_masks).permute(0, 3, 1, 2).unsqueeze(0) # [1, frame, 3, H, W] + mask_extend = torch.concat([warped_masks, warped_masks[:,:,0:1]], dim=2) # [1, frame, 4, H, W] + mask_extend = mask_extend.to(latents_org.device).to(latents_org.dtype) + + warp_latents = warp_latents * mask_extend + latents_org * (1 - mask_extend) + warp_latents = warp_latents.permute(0, 2, 1, 3, 4) + random_noise = latents_org.clone().permute(0, 2, 1, 3, 4) + + filter_shape = warp_latents.shape + + freq_filter = get_freq_filter( + filter_shape, + device = device, + filter_type='butterworth', + n=4, + d_s=ds, + d_t=dt + ) + + warp_latents = freq_mix_3d(warp_latents, random_noise, freq_filter) + warp_latents = warp_latents.permute(0, 2, 1, 3, 4) + + else: + warp_latents = latents_org.clone() + + generator.manual_seed(42) + + with torch.no_grad(): + result = pipeline( + image=condition_image, + pose_embedding=cur_plucker_embedding, + height=height, + width=width, + num_frames=num_frames, + num_inference_steps=num_inference_steps, + min_guidance_scale=min_guidance_scale, + max_guidance_scale=max_guidance_scale, + do_image_process=True, + generator=generator, + output_type='pt', + pose_features= pose_features, + latents = warp_latents + ).frames[0].cpu() #[f, c, h, w] + + + result = rearrange(result, 'f c h w -> f h w c') + result = (result * 255.0).to(torch.uint8) + + video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name + torchvision.io.write_video(video_path, result, fps=10, video_codec='h264', options={'crf': '8'}) + + return video_path + +# -------------- UI definition -------------- +with gr.Blocks() as demo: + # layout definition + gr.Markdown(title) + gr.Markdown(authors) + gr.Markdown(affiliation) + gr.Markdown(important_link) + gr.Markdown(description) + + + # with gr.Row(): + # gr.Markdown("""#
Repositioning the Subject within Image
""") + mask = gr.State(value=None) # store mask + removal_mask = gr.State(value=None) # store removal mask + selected_points = gr.State([]) # store points + selected_points_text = gr.Textbox(label="Selected Points", visible=False) + + original_image = gr.State(value=None) # store original input image + masked_original_image = gr.State(value=None) # store masked input image + mask_logits = gr.State(value=None) # store mask logits + + depth = gr.State(value=None) # store depth + org_depth_image = gr.State(value=None) # store original depth image + + camera_pose = gr.State(value=None) # store camera pose + + with gr.Column(): + + outlines = """ + There are total 5 steps to complete the task. + - Step 1: Input an image and Crop it to a suitable size; + - Step 2: Attain the subject mask; + - Step 3: Get depth and Draw Trajectory; + - Step 4: Get camera pose from trajectory or customize it; + - Step 5: Generate the final video. + """ + + gr.Markdown(outlines) + + + with gr.Row(): + with gr.Column(): + # Step 1: Input Image + step1_dec = """ + Step 1: Input Image + - Select the region using a bounding box, aiming for a ratio close to 320:576 (height:width). + - All provided images in `Examples` are in 320 x 576 resolution. Simply press `Process` to proceed. + """ + step1 = gr.Markdown(step1_dec) + raw_input = ImagePrompter(type="pil", label="Raw Image", show_label=True, interactive=True) + # left_up_point = gr.Textbox(value = "-1 -1", label="Left Up Point", interactive=True) + process_button = gr.Button("Process") + + with gr.Column(): + # Step 2: Get Subject Mask + step2_dec = """ + Step 2: Get Subject Mask + - Use the bounding boxes or paints to select the subject. + - Press `Segment Subject` to get the mask. Can be refined iteratively by updating points. + """ + step2 = gr.Markdown(step2_dec) + canvas = ImagePrompter(type="pil", label="Input Image", show_label=True, interactive=True) # for mask painting + + select_button = gr.Button("Segment Subject") + + with gr.Row(): + with gr.Column(): + mask_dec = """ + Mask Result + - Just for visualization purpose. No need to interact. + """ + mask_vis = gr.Markdown(mask_dec) + mask_output = gr.Image(type="pil", label="Mask", show_label=True, interactive=False) + with gr.Column(): + # Step 3: Get Depth and Draw Trajectory + step3_dec = """ + Step 3: Get Depth and Draw Trajectory + - Press `Get Depth` to get the depth image. + - Draw the trajectory by selecting points on the depth image. No more than 14 points. + - Press `Undo point` to remove all points. + """ + step3 = gr.Markdown(step3_dec) + depth_image = gr.Image(type="pil", label="Depth Image", show_label=True, interactive=False) + with gr.Row(): + depth_button = gr.Button("Get Depth") + undo_button = gr.Button("Undo point") + + with gr.Row(): + with gr.Column(): + # Step 4: Trajectory to Camera Pose or Get Camera Pose + step4_dec = """ + Step 4: Get camera pose from trajectory or customize it + - Option 1: Transform the 2D trajectory to camera poses with depth. `Rescale` is used for depth alignment. Larger value can speed up the object motion. + - Option 2: Rotate the camera with a specific `Angle`. + - Option 3: Rotate the camera clockwise or counterclockwise with a specific `Angle`. + - Option 4: Translate the camera with `Tx` (Pan Left/Right), `Ty` (Pan Up/Down), `Tz` (Zoom In/Out) and `Speed`. 
+ """ + step4 = gr.Markdown(step4_dec) + camera_pose_vis = gr.Plot(None, label='Camera Pose') + with gr.Row(): + with gr.Column(): + speed = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.0, label="Speed", interactive=True) + rescale = gr.Slider(minimum=0.0, maximum=10, step=0.1, value=1.0, label="Rescale", interactive=True) + # traj2pose_button = gr.Button("Option1: Trajectory to Camera Pose") + + angle = gr.Slider(minimum=-360, maximum=360, step=1, value=60, label="Angle", interactive=True) + # rotation_button = gr.Button("Option2: Rotate") + # clockwise_button = gr.Button("Option3: Clockwise") + with gr.Column(): + + Tx = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tx", interactive=True) + Ty = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Ty", interactive=True) + Tz = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tz", interactive=True) + # translation_button = gr.Button("Option4: Translate") + with gr.Row(): + camera_option = gr.Radio(choices = CAMERA_MODE, label='Camera Options', value=CAMERA_MODE[0], interactive=True) + with gr.Row(): + get_camera_pose_button = gr.Button("Get Camera Pose") + + with gr.Column(): + # Step 5: Get the final generated video + step5_dec = """ + Step 5: Get the final generated video + - 3 modes for background: Fixed, Reverse, Free. + - Enable Scale-wise Masks for better object control. + - Option to enable Shared Warping Latents and set stop frequency for spatial (`ds`) and temporal (`dt`) dimensions. Larger stop frequency will lead to artifacts. + """ + step5 = gr.Markdown(step5_dec) + generated_video = gr.Video(None, label='Generated Video') + + with gr.Row(): + seed = gr.Textbox(value = "42", label="Seed", interactive=True) + # num_inference_steps = gr.Slider(minimum=1, maximum=100, step=1, value=25, label="Number of Inference Steps", interactive=True) + bg_mode = gr.Radio(choices = ["Fixed", "Reverse", "Free"], label="Background Mode", value="Fixed", interactive=True) + # swl_mode = gr.Radio(choices = ["Enable SWL", "Disable SWL"], label="Shared Warping Latent", value="Disable SWL", interactive=True) + scale_wise_masks = gr.Checkbox(label="Enable Scale-wise Masks", interactive=True, value=True) + with gr.Row(): + with gr.Column(): + shared_wapring_latents = gr.Checkbox(label="Enable Shared Warping Latents", interactive=True) + with gr.Column(): + ds = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="ds", interactive=True) + dt = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="dt", interactive=True) + + generated_button = gr.Button("Generate") + + + + # # event definition + process_button.click( + fn = process_image, + inputs = [raw_input], + outputs = [original_image, canvas] + ) + + select_button.click( + segment, + [canvas, original_image, mask_logits], + [mask, mask_output, masked_original_image, mask_logits] + ) + + depth_button.click( + get_depth, + [original_image, selected_points], + [depth, depth_image, org_depth_image] + ) + + depth_image.select( + get_points, + [depth_image, selected_points], + [depth_image, selected_points], + ) + undo_button.click( + undo_points, + [org_depth_image], + [depth_image, selected_points] + ) + + get_camera_pose_button.click( + get_camera_pose(CAMERA_MODE), + [camera_option, selected_points, depth, mask, rescale, angle, Tx, Ty, Tz, speed], + [camera_pose, camera_pose_vis, rescale] + ) + + generated_button.click( + run_objctrl_2_5d, + [ + original_image, + mask, + depth, + camera_pose, + bg_mode, + shared_wapring_latents, + scale_wise_masks, + 
rescale, + seed, + ds, + dt, + # num_inference_steps + ], + [generated_video], + ) + + gr.Examples( + examples=examples, + inputs=[ + raw_input, + rescale, + speed, + angle, + Tx, + Ty, + Tz, + camera_option, + bg_mode, + shared_wapring_latents, + scale_wise_masks, + ds, + dt, + seed, + selected_points_text # selected_points + ], + outputs=[generated_video], + examples_per_page=10 + ) + + selected_points_text.change( + sync_points, + inputs=[selected_points_text], + outputs=[selected_points] + ) + + + gr.Markdown(article) + + +demo.queue().launch(share=True) diff --git a/app.py b/app.py index a0ff86394ceb8e4bb476f09029b8dac724bec92f..ba36a556966966c415efee0133c8139616239610 100644 --- a/app.py +++ b/app.py @@ -1,12 +1,18 @@ -import spaces +try: + import spaces +except: + pass + import os import gradio as gr +import json +import ast import torch from gradio_image_prompter import ImagePrompter from sam2.sam2_image_predictor import SAM2ImagePredictor from omegaconf import OmegaConf -from PIL import Image +from PIL import Image, ImageDraw import numpy as np from copy import deepcopy import cv2 @@ -16,7 +22,7 @@ import torchvision from einops import rearrange import tempfile -from objctrl_2_5d.utils.ui_utils import process_image, get_camera_pose, get_subject_points, get_points, undo_points, mask_image +from objctrl_2_5d.utils.ui_utils import process_image, get_camera_pose, get_subject_points, get_points, undo_points, mask_image, traj2cam, get_mid_params from ZoeDepth.zoedepth.utils.misc import colorize from cameractrl.inference import get_pipeline @@ -25,7 +31,6 @@ from objctrl_2_5d.utils.examples import examples, sync_points from objctrl_2_5d.utils.objmask_util import RT2Plucker, Unprojected, roll_with_ignore_multidim, dilate_mask_pytorch from objctrl_2_5d.utils.filter_utils import get_freq_filter, freq_mix_3d - ### Title and Description ### #### Description #### title = r"""

ObjCtrl-2.5D: Training-free Object Control with Camera Poses

""" @@ -85,9 +90,40 @@ If you have any questions, please feel free to reach me out at zhouzi1212@gma """ +# pre-defined parameters +DEBUG = False + +if DEBUG: + cur_OUTPUT_PATH = 'outputs/tmp' + os.makedirs(cur_OUTPUT_PATH, exist_ok=True) + +# num_inference_steps=25 +min_guidance_scale = 1.0 +max_guidance_scale = 3.0 + +area_ratio = 0.3 +depth_scale_ = 5.2 +center_margin = 10 + +height, width = 320, 576 +num_frames = 14 + +intrinsics = np.array([[float(width), float(width), float(width) / 2, float(height) / 2]]) +intrinsics = np.repeat(intrinsics, num_frames, axis=0) # [n_frame, 4] +fx = intrinsics[0, 0] / width +fy = intrinsics[0, 1] / height +cx = intrinsics[0, 2] / width +cy = intrinsics[0, 3] / height + +down_scale = 8 +H, W = height // down_scale, width // down_scale +K = np.array([[width / down_scale, 0, W / 2], [0, width / down_scale, H / 2], [0, 0, 1]]) + + # -------------- initialization -------------- -CAMERA_MODE = ["Traj2Cam", "Rotate", "Clockwise", "Translate"] +# CAMERA_MODE = ["Traj2Cam", "Rotate", "Clockwise", "Translate"] +CAMERA_MODE = ["None", "ZoomIn", "ZoomOut", "PanRight", "PanLeft", "TiltUp", "TiltDown", "ClockWise", "Anti-CW", "Rotate60"] # select the device for computation if torch.cuda.is_available(): @@ -96,11 +132,9 @@ elif torch.backends.mps.is_available(): device = torch.device("mps") else: device = torch.device("cpu") - device = torch.device("cuda") - print(f"Force device to {device} due to ZeroGPU") print(f"using device: {device}") -# segmentation model +# # segmentation model segmentor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny", cache_dir="ckpt", device=device) # depth model @@ -126,7 +160,7 @@ pipeline = get_pipeline(model_id, "unet", model_config['down_block_types'], mode # pipeline = None ### run the demo ## -@spaces.GPU(duration=5) +# @spaces.GPU(duration=5) def segment(canvas, image, logits): if logits is not None: logits *= 32.0 @@ -165,28 +199,9 @@ def segment(canvas, image, logits): masked_img = mask_image(image, mask[0], color=[252, 140, 90], alpha=0.9) masked_img = Image.fromarray(masked_img) - return mask[0], masked_img, masked_img, logits / 32.0 - -@spaces.GPU(duration=5) -def get_depth(image, points): - - depth = d_model_NK.infer_pil(image) - colored_depth = colorize(depth, cmap='gray_r') # [h, w, 4] 0-255 - - depth_img = deepcopy(colored_depth[:, :, :3]) - if len(points) > 0: - for idx, point in enumerate(points): - if idx % 2 == 0: - cv2.circle(depth_img, tuple(point), 10, (255, 0, 0), -1) - else: - cv2.circle(depth_img, tuple(point), 10, (0, 0, 255), -1) - if idx > 0: - cv2.arrowedLine(depth_img, points[idx-1], points[idx], (255, 255, 255), 4, tipLength=0.5) - - return depth, depth_img, colored_depth[:, :, :3] + return mask[0], {'image': masked_img, 'points': points}, logits / 32.0 - -@spaces.GPU(duration=80) +# @spaces.GPU(duration=80) def run_objctrl_2_5d(condition_image, mask, depth, @@ -198,35 +213,6 @@ def run_objctrl_2_5d(condition_image, seed, ds, dt, num_inference_steps=25): - - DEBUG = False - - if DEBUG: - cur_OUTPUT_PATH = 'outputs/tmp' - os.makedirs(cur_OUTPUT_PATH, exist_ok=True) - - # num_inference_steps=25 - min_guidance_scale = 1.0 - max_guidance_scale = 3.0 - - area_ratio = 0.3 - depth_scale_ = 5.2 - center_margin = 10 - - height, width = 320, 576 - num_frames = 14 - - intrinsics = np.array([[float(width), float(width), float(width) / 2, float(height) / 2]]) - intrinsics = np.repeat(intrinsics, num_frames, axis=0) # [n_frame, 4] - fx = intrinsics[0, 0] / width - fy = intrinsics[0, 1] / height - cx = 
intrinsics[0, 2] / width - cy = intrinsics[0, 3] / height - - down_scale = 8 - H, W = height // down_scale, width // down_scale - K = np.array([[width / down_scale, 0, W / 2], [0, width / down_scale, H / 2], [0, 0, 1]]) - seed = int(seed) center_h_margin, center_w_margin = center_margin, center_margin @@ -288,7 +274,7 @@ def run_objctrl_2_5d(condition_image, fix_pose_features = None #### preparing mask - + mask = Image.fromarray(mask) mask = mask.resize((W, H)) mask = np.array(mask).astype(np.float32) @@ -500,6 +486,97 @@ def run_objctrl_2_5d(condition_image, return video_path + +# UI function +# @spaces.GPU(duration=5) +def process_image(raw_image, trajectory_points): + + image, points = raw_image['image'], raw_image['points'] + + print(points) + + try: + assert(len(points)) == 1, "Please draw only one bbox" + [x1, y1, _, x2, y2, _] = points[0] + + image = image.crop((x1, y1, x2, y2)) + image = image.resize((width, height)) + except: + image = image.resize((width, height)) + + depth = d_model_NK.infer_pil(image) + colored_depth = colorize(depth, cmap='gray_r') # [h, w, 4] 0-255 + + depth_img = deepcopy(colored_depth[:, :, :3]) + if len(trajectory_points) > 0: + for idx, point in enumerate(trajectory_points): + if idx % 2 == 0: + cv2.circle(depth_img, tuple(point), 10, (255, 0, 0), -1) + else: + cv2.circle(depth_img, tuple(point), 10, (0, 0, 255), -1) + if idx > 0: + line_length = np.sqrt((trajectory_points[idx][0] - trajectory_points[idx-1][0])**2 + (trajectory_points[idx][1] - trajectory_points[idx-1][1])**2) + arrow_head_length = 10 + tip_length = arrow_head_length / line_length + cv2.arrowedLine(depth_img, trajectory_points[idx-1], trajectory_points[idx], (0, 255, 0), 4, tipLength=tip_length) + + return image, {'image': image}, depth, depth_img, colored_depth[:, :, :3] + + + +def draw_points_on_image(img, points): + # img = Image.fromarray(np.array(image)) + draw = ImageDraw.Draw(img) + + for p in points: + x1, y1, _, x2, y2, _ = p + + if x2 == 0 and y2 == 0: + # Point: 青色点带黑边 + point_radius = 4 + draw.ellipse( + (x1 - point_radius, y1 - point_radius, x1 + point_radius, y1 + point_radius), + fill="cyan", outline="black", width=1 + ) + else: + # Bounding Box: 黑色矩形框 + draw.rectangle([x1, y1, x2, y2], outline="black", width=3) + + return img + +# @spaces.GPU(duration=10) +def from_examples(raw_input, raw_image_points, canvas, seg_image_points, selected_points_text, camera_option, mask_bk): + + selected_points = ast.literal_eval(selected_points_text) + mask = np.array(mask_bk) + mask = mask[:,:,0] > 0 + selected_points = ast.literal_eval(selected_points_text) + + image, _, depth, depth_img, colored_depth = process_image(raw_input, selected_points) + + # get camera pose + if camera_option == "None": + # traj2came + rescale = 1.0 + camera_pose, camera_pose_vis, rescale, _ = traj2cam(selected_points, depth , rescale) + else: + rescale = 0.0 + angle = 60 + speed = 4.0 + camera_pose, camera_pose_vis, rescale = get_camera_pose(CAMERA_MODE)(camera_option, depth, mask, rescale, angle, speed) + + raw_image_points = ast.literal_eval(raw_image_points) + seg_image_points = ast.literal_eval(seg_image_points) + + raw_image = draw_points_on_image(raw_input['image'], raw_image_points) + seg_image = draw_points_on_image(canvas['image'], seg_image_points) + + return image, mask, depth, depth_img, colored_depth, camera_pose, \ + camera_pose_vis, rescale, selected_points, \ + gr.update(value={'image': raw_image, 'points': raw_image_points}), \ + gr.update(value={'image': seg_image, 'points': 
seg_image_points}), \ + + # -------------- UI definition -------------- with gr.Blocks() as demo: # layout definition @@ -513,12 +590,16 @@ with gr.Blocks() as demo: # with gr.Row(): # gr.Markdown("""#
Repositioning the Subject within Image
""") mask = gr.State(value=None) # store mask + mask_bk = gr.Image(type="pil", label="Mask", show_label=True, interactive=False, visible=False) + removal_mask = gr.State(value=None) # store removal mask selected_points = gr.State([]) # store points selected_points_text = gr.Textbox(label="Selected Points", visible=False) + raw_image_points = gr.Textbox(label="Raw Image Points", visible=False) + seg_image_points = gr.Textbox(label="Segment Image Points", visible=False) original_image = gr.State(value=None) # store original input image - masked_original_image = gr.State(value=None) # store masked input image + # masked_original_image = gr.State(value=None) # store masked input image mask_logits = gr.State(value=None) # store mask logits depth = gr.State(value=None) # store depth @@ -526,14 +607,22 @@ with gr.Blocks() as demo: camera_pose = gr.State(value=None) # store camera pose + rescale = gr.Slider(minimum=0.0, maximum=10, step=0.1, value=1.0, label="Rescale", interactive=True, visible=False) + angle = gr.Slider(minimum=-360, maximum=360, step=1, value=60, label="Angle", interactive=True, visible=False) + + seed = gr.Textbox(value = "42", label="Seed", interactive=True, visible=False) + scale_wise_masks = gr.Checkbox(label="Enable Scale-wise Masks", interactive=True, value=True, visible=False) + ds = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.25, label="ds", interactive=True, visible=False) + dt = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.1, label="dt", interactive=True, visible=False) + with gr.Column(): outlines = """ There are total 5 steps to complete the task. - - Step 1: Input an image and Crop it to a suitable size; + - Step 1: Input an image and Crop it to a suitable size and attained depth; - Step 2: Attain the subject mask; - - Step 3: Get depth and Draw Trajectory; - - Step 4: Get camera pose from trajectory or customize it; + - Step 3: Draw trajectory on depth map or skip to use camera pose; + - Step 4: Select camera poses or skip. - Step 5: Generate the final video. """ @@ -545,125 +634,92 @@ with gr.Blocks() as demo: # Step 1: Input Image step1_dec = """ Step 1: Input Image - - Select the region using a bounding box, aiming for a ratio close to
320:576
(height:width). - - All provided images in `Examples` are in 320 x 576 resolution. Simply press `Process` to proceed. """ step1 = gr.Markdown(step1_dec) raw_input = ImagePrompter(type="pil", label="Raw Image", show_label=True, interactive=True) - # left_up_point = gr.Textbox(value = "-1 -1", label="Left Up Point", interactive=True) + + step1_notes = """ + - Select the region using a bounding box, aiming for a ratio close to 320:576 (height:width). + - If the input is in 320 x 576, press `Process` directly. + """ + notes = gr.Markdown(step1_notes) + process_button = gr.Button("Process") with gr.Column(): # Step 2: Get Subject Mask step2_dec = """ Step 2: Get Subject Mask - - Use the bounding boxes or paints to select the subject. - - Press `Segment Subject` to get the mask. Can be refined iteratively by updating points. """ step2 = gr.Markdown(step2_dec) canvas = ImagePrompter(type="pil", label="Input Image", show_label=True, interactive=True) # for mask painting + step2_notes = """ + - Use the bounding boxes or points to select the subject. + - Press `Segment Subject` to get the mask. Can be refined iteratively by updating points. + """ + notes = gr.Markdown(step2_notes) + select_button = gr.Button("Segment Subject") - with gr.Row(): - with gr.Column(): - mask_dec = """ - Mask Result - - Just for visualization purpose. No need to interact. - """ - mask_vis = gr.Markdown(mask_dec) - mask_output = gr.Image(type="pil", label="Mask", show_label=True, interactive=False) with gr.Column(): # Step 3: Get Depth and Draw Trajectory step3_dec = """ - Step 3: Get Depth and Draw Trajectory - - Press `Get Depth` to get the depth image. - - Draw the trajectory by selecting points on the depth image. No more than 14 points. - - Press `Undo point` to remove all points. + Step 3: Draw Trajectory on Depth or SKIP + """ step3 = gr.Markdown(step3_dec) depth_image = gr.Image(type="pil", label="Depth Image", show_label=True, interactive=False) - with gr.Row(): - depth_button = gr.Button("Get Depth") - undo_button = gr.Button("Undo point") - + + step3_dec = """ + - Selecting points on the depth image. No more than 14 points. + - Press `Undo point` to remove all points. Press `Traj2Cam` to get camera poses. + """ + notes = gr.Markdown(step3_dec) + + undo_button = gr.Button("Undo point") + traj2cam_button = gr.Button("Traj2Cam") + with gr.Row(): + with gr.Column(): # Step 4: Trajectory to Camera Pose or Get Camera Pose step4_dec = """ - Step 4: Get camera pose from trajectory or customize it - - Option 1: Transform the 2D trajectory to camera poses with depth. `Rescale` is used for depth alignment. Larger value can speed up the object motion. - - Option 2: Rotate the camera with a specific `Angle`. - - Option 3: Rotate the camera clockwise or counterclockwise with a specific `Angle`. - - Option 4: Translate the camera with `Tx` (Pan Left/Right), `Ty` (Pan Up/Down), `Tz` (Zoom In/Out) and `Speed`. 
+ Step 4: Get Customized Camera Poses or Skip """ step4 = gr.Markdown(step4_dec) camera_pose_vis = gr.Plot(None, label='Camera Pose') - with gr.Row(): - with gr.Column(): - speed = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.0, label="Speed", interactive=True) - rescale = gr.Slider(minimum=0.0, maximum=10, step=0.1, value=1.0, label="Rescale", interactive=True) - # traj2pose_button = gr.Button("Option1: Trajectory to Camera Pose") - - angle = gr.Slider(minimum=-360, maximum=360, step=1, value=60, label="Angle", interactive=True) - # rotation_button = gr.Button("Option2: Rotate") - # clockwise_button = gr.Button("Option3: Clockwise") - with gr.Column(): - - Tx = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tx", interactive=True) - Ty = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Ty", interactive=True) - Tz = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tz", interactive=True) - # translation_button = gr.Button("Option4: Translate") - with gr.Row(): - camera_option = gr.Radio(choices = CAMERA_MODE, label='Camera Options', value=CAMERA_MODE[0], interactive=True) - with gr.Row(): - get_camera_pose_button = gr.Button("Get Camera Pose") + camera_option = gr.Radio(choices = CAMERA_MODE, label='Camera Options', value=CAMERA_MODE[0], interactive=True) + speed = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=4.0, label="Speed", interactive=True, visible=True) with gr.Column(): # Step 5: Get the final generated video step5_dec = """ Step 5: Get the final generated video - - 3 modes for background: Fixed, Reverse, Free. - - Enable Scale-wise Masks for better object control. - - Option to enable Shared Warping Latents and set stop frequency for spatial (`ds`) and temporal (`dt`) dimensions. Larger stop frequency will lead to artifacts. 
""" step5 = gr.Markdown(step5_dec) generated_video = gr.Video(None, label='Generated Video') - with gr.Row(): - seed = gr.Textbox(value = "42", label="Seed", interactive=True) - # num_inference_steps = gr.Slider(minimum=1, maximum=100, step=1, value=25, label="Number of Inference Steps", interactive=True) - bg_mode = gr.Radio(choices = ["Fixed", "Reverse", "Free"], label="Background Mode", value="Fixed", interactive=True) - # swl_mode = gr.Radio(choices = ["Enable SWL", "Disable SWL"], label="Shared Warping Latent", value="Disable SWL", interactive=True) - scale_wise_masks = gr.Checkbox(label="Enable Scale-wise Masks", interactive=True, value=True) - with gr.Row(): - with gr.Column(): - shared_wapring_latents = gr.Checkbox(label="Enable Shared Warping Latents", interactive=True) - with gr.Column(): - ds = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="ds", interactive=True) - dt = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="dt", interactive=True) + # with gr.Row(): + bg_mode = gr.Radio(choices = ["Fixed", "Reverse", "Free"], label="Background Mode", value="Fixed", interactive=True) + shared_wapring_latents = gr.Checkbox(label="Enable Shared Warping Latents", interactive=True, value=False, visible=True) generated_button = gr.Button("Generate") + get_mid_params_button = gr.Button("Get Mid Params") # # event definition process_button.click( fn = process_image, - inputs = [raw_input], - outputs = [original_image, canvas] + inputs = [raw_input, selected_points], + outputs = [original_image, canvas, depth, depth_image, org_depth_image] ) select_button.click( segment, [canvas, original_image, mask_logits], - [mask, mask_output, masked_original_image, mask_logits] - ) - - depth_button.click( - get_depth, - [original_image, selected_points], - [depth, depth_image, org_depth_image] + [mask, canvas, mask_logits] ) depth_image.select( @@ -677,9 +733,15 @@ with gr.Blocks() as demo: [depth_image, selected_points] ) - get_camera_pose_button.click( + traj2cam_button.click( + traj2cam, + [selected_points, depth, rescale], + [camera_pose, camera_pose_vis, rescale, camera_option] + ) + + camera_option.change( get_camera_pose(CAMERA_MODE), - [camera_option, selected_points, depth, mask, rescale, angle, Tx, Ty, Tz, speed], + [camera_option, depth, mask, rescale, angle, speed], [camera_pose, camera_pose_vis, rescale] ) @@ -701,35 +763,44 @@ with gr.Blocks() as demo: ], [generated_video], ) + + get_mid_params_button.click( + get_mid_params, + [raw_input, canvas, mask, selected_points, camera_option, bg_mode, shared_wapring_latents, generated_video] + ) + + ## Get examples + with open('./assets/examples/examples.json', 'r') as f: + examples = json.load(f) + print(examples) + + # examples = [examples] + examples = [v for k, v in examples.items()] gr.Examples( examples=examples, inputs=[ raw_input, - rescale, - speed, - angle, - Tx, - Ty, - Tz, + raw_image_points, + canvas, + seg_image_points, + mask_bk, + selected_points_text, # selected_points camera_option, bg_mode, shared_wapring_latents, - scale_wise_masks, - ds, - dt, - seed, - selected_points_text # selected_points + generated_video ], - outputs=[generated_video], - examples_per_page=10 + examples_per_page=20 ) selected_points_text.change( - sync_points, - inputs=[selected_points_text], - outputs=[selected_points] + from_examples, + inputs=[raw_input, raw_image_points, canvas, seg_image_points, selected_points_text, camera_option, mask_bk], + outputs=[original_image, mask, depth, depth_image, org_depth_image, camera_pose, 
camera_pose_vis, rescale, selected_points, raw_input, canvas] ) + + gr.Markdown(article) diff --git a/assets/examples/00010/generated_video.mp4 b/assets/examples/00010/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..26601e85dbd20fde53d38ef7d319eedbe32aebfa Binary files /dev/null and b/assets/examples/00010/generated_video.mp4 differ diff --git a/assets/examples/00010/mask.png b/assets/examples/00010/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..459f4ba3de4e946483b2b37cfa554372ab2599de Binary files /dev/null and b/assets/examples/00010/mask.png differ diff --git a/assets/examples/00010/raw_image.png b/assets/examples/00010/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..17b53a68038d4050676c28e60781f2109cb3beb1 Binary files /dev/null and b/assets/examples/00010/raw_image.png differ diff --git a/assets/examples/00010/seg_image.png b/assets/examples/00010/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..ac23f6bb72cd980085e985c0e90792cb7f5787f3 Binary files /dev/null and b/assets/examples/00010/seg_image.png differ diff --git a/assets/examples/00011/generated_video.mp4 b/assets/examples/00011/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..05d52334826bab5f997e59b706df6c024f3e3f1e Binary files /dev/null and b/assets/examples/00011/generated_video.mp4 differ diff --git a/assets/examples/00011/mask.png b/assets/examples/00011/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..459f4ba3de4e946483b2b37cfa554372ab2599de Binary files /dev/null and b/assets/examples/00011/mask.png differ diff --git a/assets/examples/00011/raw_image.png b/assets/examples/00011/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..17b53a68038d4050676c28e60781f2109cb3beb1 Binary files /dev/null and b/assets/examples/00011/raw_image.png differ diff --git a/assets/examples/00011/seg_image.png b/assets/examples/00011/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..ac23f6bb72cd980085e985c0e90792cb7f5787f3 Binary files /dev/null and b/assets/examples/00011/seg_image.png differ diff --git a/assets/examples/00012/generated_video.mp4 b/assets/examples/00012/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..eb4c2bc2838971aa5882d7b204d9fae9a1dfa939 Binary files /dev/null and b/assets/examples/00012/generated_video.mp4 differ diff --git a/assets/examples/00012/mask.png b/assets/examples/00012/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..897f5c3ac8aa97a8eb73b8b13bdbb85dc0802aa3 Binary files /dev/null and b/assets/examples/00012/mask.png differ diff --git a/assets/examples/00012/raw_image.png b/assets/examples/00012/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..94bcd57b36cc6b45676ddc825e784fe3553c7113 Binary files /dev/null and b/assets/examples/00012/raw_image.png differ diff --git a/assets/examples/00012/seg_image.png b/assets/examples/00012/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..542e3361f0ff06c6e1790fe1db3c3f77e7147ec7 Binary files /dev/null and b/assets/examples/00012/seg_image.png differ diff --git a/assets/examples/00013/generated_video.mp4 b/assets/examples/00013/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..930a037352192cfe7061e6aea292a67c980ca141 Binary files 
/dev/null and b/assets/examples/00013/generated_video.mp4 differ diff --git a/assets/examples/00013/mask.png b/assets/examples/00013/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..e80536699170ae05ee81cfb8c01ac35d06fff750 Binary files /dev/null and b/assets/examples/00013/mask.png differ diff --git a/assets/examples/00013/raw_image.png b/assets/examples/00013/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..1d395bcf430f1d34841f0ce41374dd6b7fc7e70b Binary files /dev/null and b/assets/examples/00013/raw_image.png differ diff --git a/assets/examples/00013/seg_image.png b/assets/examples/00013/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..0b6a1bbf24d6031c620a1864aab20c7cba346f92 Binary files /dev/null and b/assets/examples/00013/seg_image.png differ diff --git a/assets/examples/00014/generated_video.mp4 b/assets/examples/00014/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..abe5fcb4e8cbc991adc78d2392063e806ef521e6 Binary files /dev/null and b/assets/examples/00014/generated_video.mp4 differ diff --git a/assets/examples/00014/mask.png b/assets/examples/00014/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d Binary files /dev/null and b/assets/examples/00014/mask.png differ diff --git a/assets/examples/00014/raw_image.png b/assets/examples/00014/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82 Binary files /dev/null and b/assets/examples/00014/raw_image.png differ diff --git a/assets/examples/00014/seg_image.png b/assets/examples/00014/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34 Binary files /dev/null and b/assets/examples/00014/seg_image.png differ diff --git a/assets/examples/00015/generated_video.mp4 b/assets/examples/00015/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..31c3026c5ff862f37de37a785fc752cdbf0d9e94 Binary files /dev/null and b/assets/examples/00015/generated_video.mp4 differ diff --git a/assets/examples/00015/mask.png b/assets/examples/00015/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d Binary files /dev/null and b/assets/examples/00015/mask.png differ diff --git a/assets/examples/00015/raw_image.png b/assets/examples/00015/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82 Binary files /dev/null and b/assets/examples/00015/raw_image.png differ diff --git a/assets/examples/00015/seg_image.png b/assets/examples/00015/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34 Binary files /dev/null and b/assets/examples/00015/seg_image.png differ diff --git a/assets/examples/00016/generated_video.mp4 b/assets/examples/00016/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..84430085d2e9f8898a051117a16f99e973cbc953 Binary files /dev/null and b/assets/examples/00016/generated_video.mp4 differ diff --git a/assets/examples/00016/mask.png b/assets/examples/00016/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d Binary files /dev/null and b/assets/examples/00016/mask.png 
differ diff --git a/assets/examples/00016/raw_image.png b/assets/examples/00016/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82 Binary files /dev/null and b/assets/examples/00016/raw_image.png differ diff --git a/assets/examples/00016/seg_image.png b/assets/examples/00016/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34 Binary files /dev/null and b/assets/examples/00016/seg_image.png differ diff --git a/assets/examples/00017/generated_video.mp4 b/assets/examples/00017/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5793d5bf0065781ce62e62246e2ad98c0ad78ed0 Binary files /dev/null and b/assets/examples/00017/generated_video.mp4 differ diff --git a/assets/examples/00017/mask.png b/assets/examples/00017/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d Binary files /dev/null and b/assets/examples/00017/mask.png differ diff --git a/assets/examples/00017/raw_image.png b/assets/examples/00017/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82 Binary files /dev/null and b/assets/examples/00017/raw_image.png differ diff --git a/assets/examples/00017/seg_image.png b/assets/examples/00017/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34 Binary files /dev/null and b/assets/examples/00017/seg_image.png differ diff --git a/assets/examples/00018/generated_video.mp4 b/assets/examples/00018/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7f71802934f9996ca9a68c48a44fe448b0d5a6e4 Binary files /dev/null and b/assets/examples/00018/generated_video.mp4 differ diff --git a/assets/examples/00018/mask.png b/assets/examples/00018/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..05b00d1cb6b3c00307c7ff8b4122ab93c326a03d Binary files /dev/null and b/assets/examples/00018/mask.png differ diff --git a/assets/examples/00018/raw_image.png b/assets/examples/00018/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..ca884dbf77e9e2b65bb98253e9bd66784ae24c82 Binary files /dev/null and b/assets/examples/00018/raw_image.png differ diff --git a/assets/examples/00018/seg_image.png b/assets/examples/00018/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..1393e456ec8ddab5a1c01250632242f0cb2c4a34 Binary files /dev/null and b/assets/examples/00018/seg_image.png differ diff --git a/assets/examples/00019/generated_video.mp4 b/assets/examples/00019/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..240682e9d11236777b40c053bdef84383f1512b6 Binary files /dev/null and b/assets/examples/00019/generated_video.mp4 differ diff --git a/assets/examples/00019/mask.png b/assets/examples/00019/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..c4fdda09358dbca78bd53fd90d8de3d8fbead3af Binary files /dev/null and b/assets/examples/00019/mask.png differ diff --git a/assets/examples/00019/raw_image.png b/assets/examples/00019/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..61a14083818696524a9cb7e6b15018f0bc8c198c Binary files /dev/null and b/assets/examples/00019/raw_image.png differ diff --git 
a/assets/examples/00019/seg_image.png b/assets/examples/00019/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..0f83f9e5e98c693ce0599e61cd1216f2204540c3 Binary files /dev/null and b/assets/examples/00019/seg_image.png differ diff --git a/assets/examples/00020/generated_video.mp4 b/assets/examples/00020/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0d2029da6871f23f514d46df6c676f86bedf8818 Binary files /dev/null and b/assets/examples/00020/generated_video.mp4 differ diff --git a/assets/examples/00020/mask.png b/assets/examples/00020/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..2028408721baf056a0009bcf4289a7b4737c4879 Binary files /dev/null and b/assets/examples/00020/mask.png differ diff --git a/assets/examples/00020/raw_image.png b/assets/examples/00020/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..61a14083818696524a9cb7e6b15018f0bc8c198c Binary files /dev/null and b/assets/examples/00020/raw_image.png differ diff --git a/assets/examples/00020/seg_image.png b/assets/examples/00020/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..3f70307348cc004e88e7552e000d3a5591b835cc Binary files /dev/null and b/assets/examples/00020/seg_image.png differ diff --git a/assets/examples/00021/generated_video.mp4 b/assets/examples/00021/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..01c4ca5b0ebeb80c3d5df93cb3bd315476ab7548 Binary files /dev/null and b/assets/examples/00021/generated_video.mp4 differ diff --git a/assets/examples/00021/mask.png b/assets/examples/00021/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..75c5da4bb4a2780f5c38eebe0eaf4a8b31014ada Binary files /dev/null and b/assets/examples/00021/mask.png differ diff --git a/assets/examples/00021/raw_image.png b/assets/examples/00021/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..ece9435f244fd4468e0fa1c703c6aa5331a3ba2b Binary files /dev/null and b/assets/examples/00021/raw_image.png differ diff --git a/assets/examples/00021/seg_image.png b/assets/examples/00021/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..eef5849e3e730e1cb3342b4b52bcd54412e1ac56 Binary files /dev/null and b/assets/examples/00021/seg_image.png differ diff --git a/assets/examples/00022/generated_video.mp4 b/assets/examples/00022/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0f7e8f4b1ff2b9385cf43dcc765c08433e71f85f Binary files /dev/null and b/assets/examples/00022/generated_video.mp4 differ diff --git a/assets/examples/00022/mask.png b/assets/examples/00022/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..2e2b0f60eeecfadb656a7f4d3a1f7954467ee9bf Binary files /dev/null and b/assets/examples/00022/mask.png differ diff --git a/assets/examples/00022/raw_image.png b/assets/examples/00022/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..0196cc58626b2f55f92a7b3ba3fefcbe56a6cba1 Binary files /dev/null and b/assets/examples/00022/raw_image.png differ diff --git a/assets/examples/00022/seg_image.png b/assets/examples/00022/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..cc9a17ad52a4eac0999f0964aeb5817150e5b0b7 Binary files /dev/null and b/assets/examples/00022/seg_image.png differ diff --git a/assets/examples/00023/generated_video.mp4 
b/assets/examples/00023/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a90558dc9c294000103e182151d37e152a706686 Binary files /dev/null and b/assets/examples/00023/generated_video.mp4 differ diff --git a/assets/examples/00023/mask.png b/assets/examples/00023/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..3f6032390a8dbaf0449f30ed0fce1f2b821995cc Binary files /dev/null and b/assets/examples/00023/mask.png differ diff --git a/assets/examples/00023/raw_image.png b/assets/examples/00023/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..1d395bcf430f1d34841f0ce41374dd6b7fc7e70b Binary files /dev/null and b/assets/examples/00023/raw_image.png differ diff --git a/assets/examples/00023/seg_image.png b/assets/examples/00023/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..ecbbf6cdf35162afd124ddaf39e56088bc8385d1 Binary files /dev/null and b/assets/examples/00023/seg_image.png differ diff --git a/assets/examples/00024/generated_video.mp4 b/assets/examples/00024/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..71d64366becd3aa945374f9075cf064636d27384 Binary files /dev/null and b/assets/examples/00024/generated_video.mp4 differ diff --git a/assets/examples/00024/mask.png b/assets/examples/00024/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..ea907a882489f68d9f808fc443b30fea82ab1465 Binary files /dev/null and b/assets/examples/00024/mask.png differ diff --git a/assets/examples/00024/raw_image.png b/assets/examples/00024/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..1d395bcf430f1d34841f0ce41374dd6b7fc7e70b Binary files /dev/null and b/assets/examples/00024/raw_image.png differ diff --git a/assets/examples/00024/seg_image.png b/assets/examples/00024/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..4ed2128c07c7d3b6fee249a5ccab6d26a89f8e97 Binary files /dev/null and b/assets/examples/00024/seg_image.png differ diff --git a/assets/examples/00025/generated_video.mp4 b/assets/examples/00025/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4eec1e2901ba9509d680fbca789523a703fa2299 Binary files /dev/null and b/assets/examples/00025/generated_video.mp4 differ diff --git a/assets/examples/00025/mask.png b/assets/examples/00025/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..92228d48e54de9c2a4108fa1e9ee294407d09584 Binary files /dev/null and b/assets/examples/00025/mask.png differ diff --git a/assets/examples/00025/seg_image.png b/assets/examples/00025/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..a92a2cbdee2600bd5a5f04aa453a8a246c663289 Binary files /dev/null and b/assets/examples/00025/seg_image.png differ diff --git a/assets/examples/00029/generated_video.mp4 b/assets/examples/00029/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..907e651c50713e2d7012b8e8a7517f05ec1ab7c8 Binary files /dev/null and b/assets/examples/00029/generated_video.mp4 differ diff --git a/assets/examples/00029/mask.png b/assets/examples/00029/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..e20fabd9794876dd2fba1006185e0ccb5aa06bd4 Binary files /dev/null and b/assets/examples/00029/mask.png differ diff --git a/assets/examples/00029/raw_image.png b/assets/examples/00029/raw_image.png new file mode 
100644 index 0000000000000000000000000000000000000000..27c1f9850f8b920095cae39fd2953a280ff3b2ff Binary files /dev/null and b/assets/examples/00029/raw_image.png differ diff --git a/assets/examples/00029/seg_image.png b/assets/examples/00029/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..d692d0e9f6e87e07f919d105f1bd85d6bb7d79b4 Binary files /dev/null and b/assets/examples/00029/seg_image.png differ diff --git a/assets/examples/00030/generated_video.mp4 b/assets/examples/00030/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..dfaa99ecacf12e4fc7f16a8721e5e428abb4ebc7 Binary files /dev/null and b/assets/examples/00030/generated_video.mp4 differ diff --git a/assets/examples/00030/mask.png b/assets/examples/00030/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..e20fabd9794876dd2fba1006185e0ccb5aa06bd4 Binary files /dev/null and b/assets/examples/00030/mask.png differ diff --git a/assets/examples/00030/raw_image.png b/assets/examples/00030/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..27c1f9850f8b920095cae39fd2953a280ff3b2ff Binary files /dev/null and b/assets/examples/00030/raw_image.png differ diff --git a/assets/examples/00030/seg_image.png b/assets/examples/00030/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..d692d0e9f6e87e07f919d105f1bd85d6bb7d79b4 Binary files /dev/null and b/assets/examples/00030/seg_image.png differ diff --git a/assets/examples/00031/generated_video.mp4 b/assets/examples/00031/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6e57b81cb1009677ad3724a713a1f08e185c10c2 Binary files /dev/null and b/assets/examples/00031/generated_video.mp4 differ diff --git a/assets/examples/00031/mask.png b/assets/examples/00031/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..e20fabd9794876dd2fba1006185e0ccb5aa06bd4 Binary files /dev/null and b/assets/examples/00031/mask.png differ diff --git a/assets/examples/00031/raw_image.png b/assets/examples/00031/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..27c1f9850f8b920095cae39fd2953a280ff3b2ff Binary files /dev/null and b/assets/examples/00031/raw_image.png differ diff --git a/assets/examples/00031/seg_image.png b/assets/examples/00031/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..d692d0e9f6e87e07f919d105f1bd85d6bb7d79b4 Binary files /dev/null and b/assets/examples/00031/seg_image.png differ diff --git a/assets/examples/00032/generated_video.mp4 b/assets/examples/00032/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..31c352cf6d70b66dbe1370192146d1cfcd5f51ad Binary files /dev/null and b/assets/examples/00032/generated_video.mp4 differ diff --git a/assets/examples/00032/mask.png b/assets/examples/00032/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..51d1da4d33b2f729bf75af91a4b740d842bf4795 Binary files /dev/null and b/assets/examples/00032/mask.png differ diff --git a/assets/examples/00032/raw_image.png b/assets/examples/00032/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..06bb3b8d6cccb21af50745833efd0b462fc67215 Binary files /dev/null and b/assets/examples/00032/raw_image.png differ diff --git a/assets/examples/00032/seg_image.png b/assets/examples/00032/seg_image.png new file mode 100644 index 
0000000000000000000000000000000000000000..f67652257c59a815b08802b5f641c96704fade77 Binary files /dev/null and b/assets/examples/00032/seg_image.png differ diff --git a/assets/examples/00033/generated_video.mp4 b/assets/examples/00033/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1e2ecfbb279b035daaa571aa1455c7f42224c011 Binary files /dev/null and b/assets/examples/00033/generated_video.mp4 differ diff --git a/assets/examples/00033/mask.png b/assets/examples/00033/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..86786228a50a0bd4bce31dde45e28074bfa1bd0e Binary files /dev/null and b/assets/examples/00033/mask.png differ diff --git a/assets/examples/00033/raw_image.png b/assets/examples/00033/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..94e184c62148de8ac707dc6ab9d9161d7f57fac3 Binary files /dev/null and b/assets/examples/00033/raw_image.png differ diff --git a/assets/examples/00033/seg_image.png b/assets/examples/00033/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..012cf5d1fc70661e8ac6bc846e3d5b80fa8c92aa Binary files /dev/null and b/assets/examples/00033/seg_image.png differ diff --git a/assets/examples/00034/generated_video.mp4 b/assets/examples/00034/generated_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bbcbc7d70ace1439d7f0a5bd218bff70b7ea432d Binary files /dev/null and b/assets/examples/00034/generated_video.mp4 differ diff --git a/assets/examples/00034/mask.png b/assets/examples/00034/mask.png new file mode 100644 index 0000000000000000000000000000000000000000..86786228a50a0bd4bce31dde45e28074bfa1bd0e Binary files /dev/null and b/assets/examples/00034/mask.png differ diff --git a/assets/examples/00034/raw_image.png b/assets/examples/00034/raw_image.png new file mode 100644 index 0000000000000000000000000000000000000000..94e184c62148de8ac707dc6ab9d9161d7f57fac3 Binary files /dev/null and b/assets/examples/00034/raw_image.png differ diff --git a/assets/examples/00034/seg_image.png b/assets/examples/00034/seg_image.png new file mode 100644 index 0000000000000000000000000000000000000000..012cf5d1fc70661e8ac6bc846e3d5b80fa8c92aa Binary files /dev/null and b/assets/examples/00034/seg_image.png differ diff --git a/assets/examples/examples.json b/assets/examples/examples.json new file mode 100644 index 0000000000000000000000000000000000000000..11f61212b9ffa5fa70028673b783312ae325701d --- /dev/null +++ b/assets/examples/examples.json @@ -0,0 +1,20 @@ +{ +"00025": [{"image": "./assets/examples/00025/raw_image.png"}, "[[23.0, 797.0, 2.0, 2387.0, 2279.0, 3.0]]", {"image": "./assets/examples/00025/seg_image.png"}, "[[5.0, 22.0, 2.0, 167.0, 300.0, 3.0], [113.0, 282.0, 1.0, 0.0, 0.0, 4.0], [193.0, 301.0, 1.0, 0.0, 0.0, 4.0], [78.0, 270.0, 1.0, 0.0, 0.0, 4.0], [90.0, 274.0, 1.0, 0.0, 0.0, 4.0], [143.0, 259.0, 1.0, 0.0, 0.0, 4.0], [181.0, 292.0, 1.0, 0.0, 0.0, 4.0], [216.0, 306.0, 1.0, 0.0, 0.0, 4.0], [209.0, 288.0, 1.0, 0.0, 0.0, 4.0], [214.0, 312.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00025/mask.png", "[[103, 280], [101, 188]]", "None", "Fixed", true, "./assets/examples/00025/generated_video.mp4"], +"00010": [{"image": "./assets/examples/00010/raw_image.png"}, "[]", {"image": "./assets/examples/00010/seg_image.png"}, "[[37.0, 29.0, 2.0, 502.0, 264.0, 3.0], [524.0, 194.0, 1.0, 0.0, 0.0, 4.0], [554.0, 203.0, 1.0, 0.0, 0.0, 4.0], [567.0, 205.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00010/mask.png", "[[183, 154], [327, 
172]]", "None", "Fixed", true, "./assets/examples/00010/generated_video.mp4"], +"00011": [{"image": "./assets/examples/00011/raw_image.png"}, "[]", {"image": "./assets/examples/00011/seg_image.png"}, "[[37.0, 29.0, 2.0, 502.0, 264.0, 3.0], [524.0, 194.0, 1.0, 0.0, 0.0, 4.0], [554.0, 203.0, 1.0, 0.0, 0.0, 4.0], [567.0, 205.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00011/mask.png", "[[340, 166], [184, 141]]", "None", "Fixed", true, "./assets/examples/00011/generated_video.mp4"], +"00012": [{"image": "./assets/examples/00012/raw_image.png"}, "[]", {"image": "./assets/examples/00012/seg_image.png"}, "[[311.0, 174.0, 1.0, 0.0, 0.0, 4.0], [257.0, 144.0, 1.0, 0.0, 0.0, 4.0], [210.0, 121.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00012/mask.png", "[[202, 110], [134, 46]]", "None", "Fixed", true, "./assets/examples/00012/generated_video.mp4"], +"00016": [{"image": "./assets/examples/00016/raw_image.png"}, "[]", {"image": "./assets/examples/00016/seg_image.png"}, "[[98.0, 245.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00016/mask.png", "[[28, 290], [34, 266], [49, 243], [84, 243]]", "None", "Free", true, "./assets/examples/00016/generated_video.mp4"], +"00017": [{"image": "./assets/examples/00017/raw_image.png"}, "[]", {"image": "./assets/examples/00017/seg_image.png"}, "[[98.0, 245.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00017/mask.png", "[[28, 290], [34, 266], [49, 243], [84, 243]]", "None", "Reverse", true, "./assets/examples/00017/generated_video.mp4"], +"00018": [{"image": "./assets/examples/00018/raw_image.png"}, "[]", {"image": "./assets/examples/00018/seg_image.png"}, "[[98.0, 245.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00018/mask.png", "[[28, 290], [34, 266], [49, 243], [84, 243]]", "None", "Fixed", true, "./assets/examples/00018/generated_video.mp4"], +"00020": [{"image": "./assets/examples/00020/raw_image.png"}, "[]", {"image": "./assets/examples/00020/seg_image.png"}, "[[146.0, 34.0, 2.0, 499.0, 305.0, 3.0], [285.0, 260.0, 1.0, 0.0, 0.0, 4.0], [278.0, 308.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00020/mask.png", "[]", "Rotate60", "Fixed", false, "./assets/examples/00020/generated_video.mp4"], +"00021": [{"image": "./assets/examples/00021/raw_image.png"}, "[]", {"image": "./assets/examples/00021/seg_image.png"}, "[[183.0, 5.0, 2.0, 345.0, 271.0, 3.0], [249.0, 234.0, 1.0, 0.0, 0.0, 4.0], [262.0, 5.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00021/mask.png", "[]", "Rotate60", "Fixed", false, "./assets/examples/00021/generated_video.mp4"], +"00022": [{"image": "./assets/examples/00022/raw_image.png"}, "[]", {"image": "./assets/examples/00022/seg_image.png"}, "[[184.0, 174.0, 1.0, 0.0, 0.0, 4.0], [151.0, 174.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00022/mask.png", "[[151, 194], [197, 194], [246, 224]]", "Rotate60", "Fixed", true, "./assets/examples/00022/generated_video.mp4"], +"00024": [{"image": "./assets/examples/00024/raw_image.png"}, "[]", {"image": "./assets/examples/00024/seg_image.png"}, "[[177.0, 11.0, 2.0, 414.0, 312.0, 3.0], [302.0, 32.0, 1.0, 0.0, 0.0, 4.0], [372.0, 224.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00024/mask.png", "[[297, 60], [425, 82], [233, 135], [396, 171]]", "None", "Fixed", false, "./assets/examples/00024/generated_video.mp4"], +"00023": [{"image": "./assets/examples/00023/raw_image.png"}, "[]", {"image": "./assets/examples/00023/seg_image.png"}, "[[191.0, 7.0, 2.0, 404.0, 312.0, 3.0], [369.0, 228.0, 1.0, 0.0, 0.0, 4.0], [321.0, 210.0, 1.0, 0.0, 0.0, 4.0], [297.0, 195.0, 1.0, 0.0, 0.0, 4.0], [297.0, 248.0, 1.0, 0.0, 0.0, 4.0]]", 
"./assets/examples/00023/mask.png", "[]", "Rotate60", "Fixed", false, "./assets/examples/00023/generated_video.mp4"], +"00031": [{"image": "./assets/examples/00031/raw_image.png"}, "[]", {"image": "./assets/examples/00031/seg_image.png"}, "[[1.0, 4.0, 2.0, 562.0, 181.0, 3.0], [102.0, 63.0, 1.0, 0.0, 0.0, 4.0], [86.0, 148.0, 1.0, 0.0, 0.0, 4.0], [383.0, 142.0, 1.0, 0.0, 0.0, 4.0], [520.0, 150.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00031/mask.png", "[]", "ZoomIn", "Fixed", false, "./assets/examples/00031/generated_video.mp4"], +"00029": [{"image": "./assets/examples/00029/raw_image.png"}, "[]", {"image": "./assets/examples/00029/seg_image.png"}, "[[1.0, 4.0, 2.0, 562.0, 181.0, 3.0], [102.0, 63.0, 1.0, 0.0, 0.0, 4.0], [86.0, 148.0, 1.0, 0.0, 0.0, 4.0], [383.0, 142.0, 1.0, 0.0, 0.0, 4.0], [520.0, 150.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00029/mask.png", "[]", "PanLeft", "Fixed", false, "./assets/examples/00029/generated_video.mp4"], +"00030": [{"image": "./assets/examples/00030/raw_image.png"}, "[]", {"image": "./assets/examples/00030/seg_image.png"}, "[[1.0, 4.0, 2.0, 562.0, 181.0, 3.0], [102.0, 63.0, 1.0, 0.0, 0.0, 4.0], [86.0, 148.0, 1.0, 0.0, 0.0, 4.0], [383.0, 142.0, 1.0, 0.0, 0.0, 4.0], [520.0, 150.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00030/mask.png", "[]", "PanRight", "Fixed", false, "./assets/examples/00030/generated_video.mp4"], +"00033": [{"image": "./assets/examples/00033/raw_image.png"}, "[]", {"image": "./assets/examples/00033/seg_image.png"}, "[[187.0, 34.0, 2.0, 321.0, 171.0, 3.0], [239.0, 117.0, 1.0, 0.0, 0.0, 4.0], [257.0, 138.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00033/mask.png", "[]", "Anti-CW", "Fixed", true, "./assets/examples/00033/generated_video.mp4"], +"00034": [{"image": "./assets/examples/00034/raw_image.png"}, "[]", {"image": "./assets/examples/00034/seg_image.png"}, "[[187.0, 34.0, 2.0, 321.0, 171.0, 3.0], [239.0, 117.0, 1.0, 0.0, 0.0, 4.0], [257.0, 138.0, 1.0, 0.0, 0.0, 4.0]]", "./assets/examples/00034/mask.png", "[]", "ClockWise", "Fixed", true, "./assets/examples/00034/generated_video.mp4"] +} + diff --git a/objctrl_2_5d/utils/ui_utils.py b/objctrl_2_5d/utils/ui_utils.py index 4e8af59434e091345be4337e1f9988b767d82e1a..811c99abc273a4276210f3b33db4d7336972c371 100644 --- a/objctrl_2_5d/utils/ui_utils.py +++ b/objctrl_2_5d/utils/ui_utils.py @@ -9,6 +9,7 @@ from objctrl_2_5d.utils.vis_camera import vis_camera_rescale from objctrl_2_5d.utils.objmask_util import trajectory_to_camera_poses_v1 from objctrl_2_5d.utils.customized_cam import rotation, clockwise, pan_and_zoom +CAMERA_MODE = ["None", "ZoomIn", "ZoomOut", "PanRight", "PanLeft", "TiltUp", "TiltDown", "ClockWise", "Anti-CW", "Rotate60"] zc_threshold = 0.2 depth_scale_ = 5.2 @@ -29,8 +30,6 @@ def process_image(raw_image): image, points = raw_image['image'], raw_image['points'] - print(points) - try: assert(len(points)) == 1, "Please select only one point" [x1, y1, _, x2, y2, _] = points[0] @@ -88,7 +87,10 @@ def get_points(img, # draw an arrow from handle point to target point # if len(points) == idx + 1: if idx > 0: - cv2.arrowedLine(img, points[idx-1], points[idx], (255, 255, 255), 4, tipLength=0.5) + line_length = np.sqrt((points[idx][0] - points[idx-1][0])**2 + (points[idx][1] - points[idx-1][1])**2) + arrow_head_length = 10 + tip_length = arrow_head_length / line_length + cv2.arrowedLine(img, points[idx-1], points[idx], (0, 255, 0), 4, tipLength=tip_length) # points = [] return img if isinstance(img, np.ndarray) else np.array(img), sel_pix @@ -113,6 +115,9 @@ def 
interpolate_points(points, num_points): def traj2cam(traj, depth, rescale): + if len(traj) == 0: + return None, None, 0.0, gr.update(value=CAMERA_MODE[0]) + traj = np.array(traj) trajectory = interpolate_points(traj, num_frames) @@ -148,13 +153,13 @@ def traj2cam(traj, depth, rescale): RTs = traj_w2c[:, :3] fig = vis_camera_rescale(RTs) - return RTs, fig, rescale + return RTs, fig, rescale, gr.update(value=CAMERA_MODE[0]) def get_rotate_cam(angle, depth): # mean_depth = np.mean(depth * mask) center_h_margin, center_w_margin = center_margin, center_margin depth_center = np.mean(depth[height//2-center_h_margin:height//2+center_h_margin, width//2-center_w_margin:width//2+center_w_margin]) - print(f'rotate depth_center: {depth_center}') + # print(f'rotate depth_center: {depth_center}') RTs = rotation(num_frames, angle, depth_center, depth_center) fig = vis_camera_rescale(RTs) @@ -162,47 +167,128 @@ def get_rotate_cam(angle, depth): return RTs, fig def get_clockwise_cam(angle, depth, mask): - mask = mask.astype(np.float32) # [0, 1] - mean_depth = np.mean(depth * mask) + # mask = mask.astype(np.float32) # [0, 1] + # mean_depth = np.mean(depth * mask) # center_h_margin, center_w_margin = center_margin, center_margin # depth_center = np.mean(depth[height//2-center_h_margin:height//2+center_h_margin, width//2-center_w_margin:width//2+center_w_margin]) RTs = clockwise(angle, num_frames) - RTs[:, -1, -1] = mean_depth + # RTs[:, -1, -1] = mean_depth fig = vis_camera_rescale(RTs) return RTs, fig def get_translate_cam(Tx, Ty, Tz, depth, mask, speed): - mask = mask.astype(np.float32) # [0, 1] + # mask = mask.astype(np.float32) # [0, 1] - mean_depth = np.mean(depth * mask) + # mean_depth = np.mean(depth * mask) T = np.array([Tx, Ty, Tz]) T = T.reshape(3, 1) T = T[None, ...].repeat(num_frames, axis=0) RTs = pan_and_zoom(T, speed, n=num_frames) - RTs[:, -1, -1] += mean_depth + # RTs[:, -1, -1] += mean_depth fig = vis_camera_rescale(RTs) return RTs, fig + def get_camera_pose(camera_mode): - def trigger_camera_pose(camera_option, selected_points, depth, mask, rescale, angle, Tx, Ty, Tz, speed): - if camera_option == camera_mode[0]: # traj2cam - RTs, fig, rescale = traj2cam(selected_points, depth, rescale) - elif camera_option == camera_mode[1]: # rotate - RTs, fig = get_rotate_cam(angle, depth) - rescale = 0.0 - elif camera_option == camera_mode[2]: # clockwise + # camera_mode = ["None", "ZoomIn", "ZoomOut", "PanLeft", "PanRight", "TiltUp", "TiltDown", "ClockWise", "Anti-CW", "Rotate60"] + def trigger_camera_pose(camera_option, depth, mask, rescale, angle, speed): + if camera_option == camera_mode[0]: # None + RTs = None + fig = None + elif camera_option == camera_mode[1]: # ZoomIn + RTs, fig = get_translate_cam(0, 0, -1, depth, mask, speed) + + elif camera_option == camera_mode[2]: # ZoomOut + RTs, fig = get_translate_cam(0, 0, 1, depth, mask, speed) + + elif camera_option == camera_mode[3]: # PanLeft + RTs, fig = get_translate_cam(-1, 0, 0, depth, mask, speed) + + elif camera_option == camera_mode[4]: # PanRight + RTs, fig = get_translate_cam(1, 0, 0, depth, mask, speed) + + elif camera_option == camera_mode[5]: # TiltUp + RTs, fig = get_translate_cam(0, 1, 0, depth, mask, speed) + + elif camera_option == camera_mode[6]: # TiltDown + RTs, fig = get_translate_cam(0, -1, 0, depth, mask, speed) + + elif camera_option == camera_mode[7]: # ClockWise + RTs, fig = get_clockwise_cam(-angle, depth, mask) + + elif camera_option == camera_mode[8]: # Anti-CW RTs, fig = get_clockwise_cam(angle, depth, mask) - rescale = 
0.0 - elif camera_option == camera_mode[3]: # translate - RTs, fig = get_translate_cam(Tx, Ty, Tz, depth, mask, speed) - rescale = 0.0 + + else: # Rotate60 + RTs, fig = get_rotate_cam(angle, depth) + rescale = 0.0 return RTs, fig, rescale return trigger_camera_pose + +import os +from glob import glob +import json + +def get_mid_params(raw_input, canvas, mask, selected_points, camera_option, bg_mode, shared_wapring_latents, generated_video): + output_dir = "./assets/examples" + os.makedirs(output_dir, exist_ok=True) + + # folders = sorted(glob(output_dir + "/*")) + folders = os.listdir(output_dir) + folders = [int(folder) for folder in folders if os.path.isdir(os.path.join(output_dir, folder))] + num = sorted(folders)[-1] + 1 if folders else 0 + + fout = open(os.path.join(output_dir, f'examples.json'), 'a+') + + cur_folder = os.path.join(output_dir, f'{num:05d}') + os.makedirs(cur_folder, exist_ok=True) + + raw_image = raw_input['image'] + raw_points = raw_input['points'] + seg_image = canvas['image'] + seg_points = canvas['points'] + + mask = Image.fromarray(mask) + mask_path = os.path.join(cur_folder, 'mask.png') + mask.save(mask_path) + + raw_image_path = os.path.join(cur_folder, 'raw_image.png') + seg_image_path = os.path.join(cur_folder, 'seg_image.png') + + raw_image.save(os.path.join(cur_folder, 'raw_image.png')) + seg_image.save(os.path.join(cur_folder, 'seg_image.png')) + + gen_path = os.path.join(cur_folder, 'generated_video.mp4') + cmd = f"cp {generated_video} {gen_path}" + os.system(cmd) + + # data = [{'image': raw_image_path, 'points': raw_points}, + # {'image': seg_image_path, 'points': seg_points}, + # mask_path, + # str(selected_points), + # camera_option, + # bg_mode, + # gen_path] + data = {f'{num:05d}': [{'image': raw_image_path}, + str(raw_points), + {'image': seg_image_path}, + str(seg_points), + mask_path, + str(selected_points), + camera_option, + bg_mode, + shared_wapring_latents, + gen_path]} + fout.write(json.dumps(data) + '\n') + + fout.close() + + \ No newline at end of file
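
Note on the examples table written by get_mid_params above: each record in assets/examples/examples.json maps a zero-padded example ID to the ordered demo inputs (raw ImagePrompter payload, its prompt points, the segmentation canvas payload and points, the mask path, the selected trajectory points, the camera option, the background mode, the shared-warping-latents flag, and the generated video path). The snippet below is a hypothetical reader, not part of this patch; it assumes the committed file is parsed as a single JSON object and the helper name load_examples is illustrative only.

    import json

    def load_examples(path="./assets/examples/examples.json"):
        # Hypothetical helper: read the committed examples table and return rows
        # in the same column order that get_mid_params writes them.
        with open(path, "r") as f:
            table = json.load(f)
        # Sort by the zero-padded ID so examples appear in a stable order.
        return [table[key] for key in sorted(table)]

    # Usage sketch (assumption): the returned rows could feed the demo's
    # example list, e.g. gr.Examples(examples=load_examples(), inputs=[...]).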