wzhouxiff committed on
Commit
37c595d
·
2 Parent(s): 2b75eb1 7ff78b5
Files changed (3) hide show
  1. README.md +16 -0
  2. app.py +347 -4
  3. objctrl-2.5d_gradio.py +0 -350
README.md CHANGED
@@ -1,3 +1,4 @@
 
1
  title: ObjCtrl 2.5D
2
  colorFrom: purple
3
  colorTo: red
@@ -7,3 +8,18 @@ app_file: app.py
7
  pinned: false
8
  license: apache-2.0
9
  short_description: Training-free object control with camera poses
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <<<<<<< HEAD
2
  title: ObjCtrl 2.5D
3
  colorFrom: purple
4
  colorTo: red
 
8
  pinned: false
9
  license: apache-2.0
10
  short_description: Training-free object control with camera poses
11
+ =======
12
+ ---
13
+ title: ObjCtrl 2.5D
14
+ emoji: ⚡
15
+ colorFrom: blue
16
+ colorTo: purple
17
+ sdk: gradio
18
+ sdk_version: 5.7.0
19
+ app_file: app.py
20
+ pinned: false
21
+ license: apache-2.0
22
+ ---
23
+
24
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
25
+ >>>>>>> 8085650ca5418a3401650b762a3b63d7c2176d32
app.py CHANGED
@@ -1,7 +1,350 @@
 
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import gradio as gr
3
 
4
+ import torch
5
+ from gradio_image_prompter import ImagePrompter
6
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
7
+ from omegaconf import OmegaConf
8
 
9
+ from objctrl_2_5d.utils.ui_utils import process_image, get_camera_pose, run_segment, run_depth, get_points, undo_points
10
+
11
+
12
+ from cameractrl.inference import get_pipeline
13
+ from objctrl_2_5d.objctrl_2_5d import run
14
+ from objctrl_2_5d.utils.examples import examples, sync_points
15
+
16
+
17
+ ### Title and Description ###
18
+ #### Description ####
19
+ title = r"""<h1 align="center">ObjCtrl-2.5D: Training-free Object Control with Camera Poses</h1>"""
20
+ # subtitle = r"""<h2 align="center">Deployed on SVD Generation</h2>"""
21
+ important_link = r"""
22
+ <div align='center'>
23
+ <a href='https://wzhouxiff.github.io/projects/MotionCtrl/assets/paper/MotionCtrl.pdf'>[Paper]</a>
24
+ &ensp; <a href='https://wzhouxiff.github.io/projects/MotionCtrl/'>[Project Page]</a>
25
+ &ensp; <a href='https://github.com/TencentARC/MotionCtrl'>[Code]</a>
26
+ </div>
27
+ """
28
+
29
+ authors = r"""
30
+ <div align='center'>
31
+ <a href='https://wzhouxiff.github.io/'>Zhouxia Wang</a>
32
+ &ensp; <a href='https://nirvanalan.github.io/'>Yushi Lan</a>
33
+ &ensp; <a href='https://shangchenzhou.com/'>Shangchen Zhou</a>
34
+ &ensp; <a href='https://www.mmlab-ntu.com/person/ccloy/index.html'>Chen Change Loy</a>
35
+ </div>
36
+ """
37
+
38
+ affiliation = r"""
39
+ <div align='center'>
40
+ <a href='https://www.mmlab-ntu.com/'>S-Lab, NTU Singapore</a>
41
+ </div>
42
+ """
43
+
44
+ description = r"""
45
+ <b>Official Gradio demo</b> for <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'><b>ObjCtrl-2.5D: Training-free Object Control with Camera Poses</b></a>.<br>
46
+ 🔥 ObjCtrl2.5D enables object motion control in an I2V generated video via transforming 2D trajectories to 3D using depth, subsequently converting them into camera poses,
47
+ thereby leveraging the existing camera motion control module for object motion control without requiring additional training.<br>
48
+ """
49
+
50
+ article = r"""
51
+ If ObjCtrl2.5D is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'>Github Repo</a>. Thanks!
52
+ [![GitHub Stars](https://img.shields.io/github/stars/TencentARC%2FMotionCtrl
53
+ )](https://github.com/TencentARC/MotionCtrl)
54
+
55
+ ---
56
+
57
+ 📝 **Citation**
58
+ <br>
59
+ If our work is useful for your research, please consider citing:
60
+ ```bibtex
61
+ @inproceedings{wang2024motionctrl,
62
+ title={Motionctrl: A unified and flexible motion controller for video generation},
63
+ author={Wang, Zhouxia and Yuan, Ziyang and Wang, Xintao and Li, Yaowei and Chen, Tianshui and Xia, Menghan and Luo, Ping and Shan, Ying},
64
+ booktitle={ACM SIGGRAPH 2024 Conference Papers},
65
+ pages={1--11},
66
+ year={2024}
67
+ }
68
+ ```
69
+
70
+ 📧 **Contact**
71
+ <br>
72
+ If you have any questions, please feel free to reach out to me at <b>zhouzi1212@gmail.com</b>.
73
+
74
+ """
75
+
76
+ # -------------- initialization --------------
77
+
78
+ CAMERA_MODE = ["Traj2Cam", "Rotate", "Clockwise", "Translate"]
79
+
80
+ # select the device for computation
81
+ if torch.cuda.is_available():
82
+ device = torch.device("cuda")
83
+ elif torch.backends.mps.is_available():
84
+ device = torch.device("mps")
85
+ else:
86
+ device = torch.device("cpu")
87
+ print(f"using device: {device}")
88
+
89
+ # segmentation model
90
+ segmentor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny", cache_dir="ckpt", device=device)
91
+
92
+ # depth model
93
+ d_model_NK = torch.hub.load('./ZoeDepth', 'ZoeD_NK', source='local', pretrained=True).to(device)
94
+
95
+ # cameractrl model
96
+ config = "configs/svd_320_576_cameractrl.yaml"
97
+ model_id = "stabilityai/stable-video-diffusion-img2vid"
98
+ ckpt = "checkpoints/CameraCtrl_svd.ckpt"
99
+ if not os.path.exists(ckpt):
100
+ os.makedirs("checkpoints", exist_ok=True)
101
+ os.system("wget -c https://huggingface.co/hehao13/CameraCtrl_SVD_ckpts/resolve/main/CameraCtrl_svd.ckpt?download=true")
102
+ os.system("mv CameraCtrl_svd.ckpt?download=true checkpoints/CameraCtrl_svd.ckpt")
103
+ model_config = OmegaConf.load(config)
104
+
105
+
106
+ pipeline = get_pipeline(model_id, "unet", model_config['down_block_types'], model_config['up_block_types'],
107
+ model_config['pose_encoder_kwargs'], model_config['attention_processor_kwargs'],
108
+ ckpt, True, device)
109
+
110
+ # segmentor = None
111
+ # d_model_NK = None
112
+ # pipeline = None
113
+
114
+
115
+ # -------------- UI definition --------------
116
+ with gr.Blocks() as demo:
117
+ # layout definition
118
+ gr.Markdown(title)
119
+ gr.Markdown(authors)
120
+ gr.Markdown(affiliation)
121
+ gr.Markdown(important_link)
122
+ gr.Markdown(description)
123
+
124
+
125
+ # with gr.Row():
126
+ # gr.Markdown("""# <center>Repositioning the Subject within Image </center>""")
127
+ mask = gr.State(value=None) # store mask
128
+ removal_mask = gr.State(value=None) # store removal mask
129
+ selected_points = gr.State([]) # store points
130
+ selected_points_text = gr.Textbox(label="Selected Points", visible=False)
131
+
132
+ original_image = gr.State(value=None) # store original input image
133
+ masked_original_image = gr.State(value=None) # store masked input image
134
+ mask_logits = gr.State(value=None) # store mask logits
135
+
136
+ depth = gr.State(value=None) # store depth
137
+ org_depth_image = gr.State(value=None) # store original depth image
138
+
139
+ camera_pose = gr.State(value=None) # store camera pose
140
+
141
+ with gr.Column():
142
+
143
+ outlines = """
144
+ <font size="5"><b>There are 5 steps in total to complete the task.</b></font>
145
+ - Step 1: Input an image and Crop it to a suitable size;
146
+ - Step 2: Attain the subject mask;
147
+ - Step 3: Get depth and Draw Trajectory;
148
+ - Step 4: Get camera pose from trajectory or customize it;
149
+ - Step 5: Generate the final video.
150
+ """
151
+
152
+ gr.Markdown(outlines)
153
+
154
+
155
+ with gr.Row():
156
+ with gr.Column():
157
+ # Step 1: Input Image
158
+ step1_dec = """
159
+ <font size="4"><b>Step 1: Input Image</b></font>
160
+ - Select the region using a <mark>bounding box</mark>, aiming for a ratio close to <mark>320:576</mark> (height:width).
161
+ - All provided images in `Examples` are in 320 x 576 resolution. Simply press `Process` to proceed.
162
+ """
163
+ step1 = gr.Markdown(step1_dec)
164
+ raw_input = ImagePrompter(type="pil", label="Raw Image", show_label=True, interactive=True)
165
+ # left_up_point = gr.Textbox(value = "-1 -1", label="Left Up Point", interactive=True)
166
+ process_button = gr.Button("Process")
167
+
168
+ with gr.Column():
169
+ # Step 2: Get Subject Mask
170
+ step2_dec = """
171
+ <font size="4"><b>Step 2: Get Subject Mask</b></font>
172
+ - Use the <mark>bounding boxes</mark> or <mark>paints</mark> to select the subject.
173
+ - Press `Segment Subject` to get the mask. <mark>Can be refined iteratively by updating points</mark>.
174
+ """
175
+ step2 = gr.Markdown(step2_dec)
176
+ canvas = ImagePrompter(type="pil", label="Input Image", show_label=True, interactive=True) # for mask painting
177
+
178
+ select_button = gr.Button("Segment Subject")
179
+
180
+ with gr.Row():
181
+ with gr.Column():
182
+ mask_dec = """
183
+ <font size="4"><b>Mask Result</b></font>
184
+ - Just for visualization purpose. No need to interact.
185
+ """
186
+ mask_vis = gr.Markdown(mask_dec)
187
+ mask_output = gr.Image(type="pil", label="Mask", show_label=True, interactive=False)
188
+ with gr.Column():
189
+ # Step 3: Get Depth and Draw Trajectory
190
+ step3_dec = """
191
+ <font size="4"><b>Step 3: Get Depth and Draw Trajectory</b></font>
192
+ - Press `Get Depth` to get the depth image.
193
+ - Draw the trajectory by selecting points on the depth image. <mark>No more than 14 points</mark>.
194
+ - Press `Undo point` to remove all points.
195
+ """
196
+ step3 = gr.Markdown(step3_dec)
197
+ depth_image = gr.Image(type="pil", label="Depth Image", show_label=True, interactive=False)
198
+ with gr.Row():
199
+ depth_button = gr.Button("Get Depth")
200
+ undo_button = gr.Button("Undo point")
201
+
202
+ with gr.Row():
203
+ with gr.Column():
204
+ # Step 4: Trajectory to Camera Pose or Get Camera Pose
205
+ step4_dec = """
206
+ <font size="4"><b>Step 4: Get camera pose from trajectory or customize it</b></font>
207
+ - Option 1: Transform the 2D trajectory to camera poses with depth. <mark>`Rescale` is used for depth alignment. Larger value can speed up the object motion.</mark>
208
+ - Option 2: Rotate the camera with a specific `Angle`.
209
+ - Option 3: Rotate the camera clockwise or counterclockwise with a specific `Angle`.
210
+ - Option 4: Translate the camera with `Tx` (<mark>Pan Left/Right</mark>), `Ty` (<mark>Pan Up/Down</mark>), `Tz` (<mark>Zoom In/Out</mark>) and `Speed`.
211
+ """
212
+ step4 = gr.Markdown(step4_dec)
213
+ camera_pose_vis = gr.Plot(None, label='Camera Pose')
214
+ with gr.Row():
215
+ with gr.Column():
216
+ speed = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.0, label="Speed", interactive=True)
217
+ rescale = gr.Slider(minimum=0.0, maximum=10, step=0.1, value=1.0, label="Rescale", interactive=True)
218
+ # traj2pose_button = gr.Button("Option1: Trajectory to Camera Pose")
219
+
220
+ angle = gr.Slider(minimum=-360, maximum=360, step=1, value=60, label="Angle", interactive=True)
221
+ # rotation_button = gr.Button("Option2: Rotate")
222
+ # clockwise_button = gr.Button("Option3: Clockwise")
223
+ with gr.Column():
224
+
225
+ Tx = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tx", interactive=True)
226
+ Ty = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Ty", interactive=True)
227
+ Tz = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tz", interactive=True)
228
+ # translation_button = gr.Button("Option4: Translate")
229
+ with gr.Row():
230
+ camera_option = gr.Radio(choices = CAMERA_MODE, label='Camera Options', value=CAMERA_MODE[0], interactive=True)
231
+ with gr.Row():
232
+ get_camera_pose_button = gr.Button("Get Camera Pose")
233
+
234
+ with gr.Column():
235
+ # Step 5: Get the final generated video
236
+ step5_dec = """
237
+ <font size="4"><b>Step 5: Get the final generated video</b></font>
238
+ - 3 modes for background: <mark>Fixed</mark>, <mark>Reverse</mark>, <mark>Free</mark>.
239
+ - Enable <mark>Scale-wise Masks</mark> for better object control.
240
+ - Option to enable <mark>Shared Warping Latents</mark> and set <mark>stop frequency</mark> for spatial (`ds`) and temporal (`dt`) dimensions. Larger stop frequency will lead to artifacts.
241
+ """
242
+ step5 = gr.Markdown(step5_dec)
243
+ generated_video = gr.Video(None, label='Generated Video')
244
+
245
+ with gr.Row():
246
+ seed = gr.Textbox(value = "42", label="Seed", interactive=True)
247
+ # num_inference_steps = gr.Slider(minimum=1, maximum=100, step=1, value=25, label="Number of Inference Steps", interactive=True)
248
+ bg_mode = gr.Radio(choices = ["Fixed", "Reverse", "Free"], label="Background Mode", value="Fixed", interactive=True)
249
+ # swl_mode = gr.Radio(choices = ["Enable SWL", "Disable SWL"], label="Shared Warping Latent", value="Disable SWL", interactive=True)
250
+ scale_wise_masks = gr.Checkbox(label="Enable Scale-wise Masks", interactive=True, value=True)
251
+ with gr.Row():
252
+ with gr.Column():
253
+ shared_wapring_latents = gr.Checkbox(label="Enable Shared Warping Latents", interactive=True)
254
+ with gr.Column():
255
+ ds = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="ds", interactive=True)
256
+ dt = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="dt", interactive=True)
257
+
258
+ generated_button = gr.Button("Generate")
259
+
260
+
261
+
262
+ # # event definition
263
+ process_button.click(
264
+ fn = process_image,
265
+ inputs = [raw_input],
266
+ outputs = [original_image, canvas]
267
+ )
268
+
269
+ select_button.click(
270
+ run_segment(segmentor),
271
+ [canvas, original_image, mask_logits],
272
+ [mask, mask_output, masked_original_image, mask_logits]
273
+ )
274
+
275
+ depth_button.click(
276
+ run_depth(d_model_NK),
277
+ [original_image, selected_points],
278
+ [depth, depth_image, org_depth_image]
279
+ )
280
+
281
+ depth_image.select(
282
+ get_points,
283
+ [depth_image, selected_points],
284
+ [depth_image, selected_points],
285
+ )
286
+ undo_button.click(
287
+ undo_points,
288
+ [org_depth_image],
289
+ [depth_image, selected_points]
290
+ )
291
+
292
+ get_camera_pose_button.click(
293
+ get_camera_pose(CAMERA_MODE),
294
+ [camera_option, selected_points, depth, mask, rescale, angle, Tx, Ty, Tz, speed],
295
+ [camera_pose, camera_pose_vis]
296
+ )
297
+
298
+ generated_button.click(
299
+ run(pipeline, device),
300
+ [
301
+ original_image,
302
+ mask,
303
+ depth,
304
+ camera_pose,
305
+ bg_mode,
306
+ shared_wapring_latents,
307
+ scale_wise_masks,
308
+ rescale,
309
+ seed,
310
+ ds,
311
+ dt,
312
+ # num_inference_steps
313
+ ],
314
+ [generated_video],
315
+ )
316
+
317
+ gr.Examples(
318
+ examples=examples,
319
+ inputs=[
320
+ raw_input,
321
+ rescale,
322
+ speed,
323
+ angle,
324
+ Tx,
325
+ Ty,
326
+ Tz,
327
+ camera_option,
328
+ bg_mode,
329
+ shared_wapring_latents,
330
+ scale_wise_masks,
331
+ ds,
332
+ dt,
333
+ seed,
334
+ selected_points_text # selected_points
335
+ ],
336
+ outputs=[generated_video],
337
+ examples_per_page=10
338
+ )
339
+
340
+ selected_points_text.change(
341
+ sync_points,
342
+ inputs=[selected_points_text],
343
+ outputs=[selected_points]
344
+ )
345
+
346
+
347
+ gr.Markdown(article)
348
+
349
+
350
+ demo.queue().launch(share=True)
objctrl-2.5d_gradio.py DELETED
@@ -1,350 +0,0 @@
1
- import os
2
- import gradio as gr
3
-
4
- import torch
5
- from gradio_image_prompter import ImagePrompter
6
- from sam2.sam2_image_predictor import SAM2ImagePredictor
7
- from omegaconf import OmegaConf
8
-
9
- from objctrl_2_5d.utils.ui_utils import process_image, get_camera_pose, run_segment, run_depth, get_points, undo_points
10
-
11
-
12
- from cameractrl.inference import get_pipeline
13
- from objctrl_2_5d.objctrl_2_5d import run
14
- from objctrl_2_5d.utils.examples import examples, sync_points
15
-
16
-
17
- ### Title and Description ###
18
- #### Description ####
19
- title = r"""<h1 align="center">ObjCtrl-2.5D: Training-free Object Control with Camera Poses</h1>"""
20
- # subtitle = r"""<h2 align="center">Deployed on SVD Generation</h2>"""
21
- important_link = r"""
22
- <div align='center'>
23
- <a href='https://wzhouxiff.github.io/projects/MotionCtrl/assets/paper/MotionCtrl.pdf'>[Paper]</a>
24
- &ensp; <a href='https://wzhouxiff.github.io/projects/MotionCtrl/'>[Project Page]</a>
25
- &ensp; <a href='https://github.com/TencentARC/MotionCtrl'>[Code]</a>
26
- </div>
27
- """
28
-
29
- authors = r"""
30
- <div align='center'>
31
- <a href='https://wzhouxiff.github.io/'>Zhouxia Wang</a>
32
- &ensp; <a href='https://nirvanalan.github.io/'>Yushi Lan</a>
33
- &ensp; <a href='https://shangchenzhou.com/'>Shanchen Zhou</a>
34
- &ensp; <a href='https://www.mmlab-ntu.com/person/ccloy/index.html'>Chen Change Loy</a>
35
- </div>
36
- """
37
-
38
- affiliation = r"""
39
- <div align='center'>
40
- <a href='https://www.mmlab-ntu.com/'>S-Lab, NTU Singapore</a>
41
- </div>
42
- """
43
-
44
- description = r"""
45
- <b>Official Gradio demo</b> for <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'><b>ObjCtrl-2.5D: Training-free Object Control with Camera Poses</b></a>.<br>
46
- 🔥 ObjCtrl2.5D enables object motion control in a I2V generated video via transforming 2D trajectories to 3D using depth, subsequently converting them into camera poses,
47
- thereby leveraging the exisitng camera motion control module for object motion control without requiring additional training.<br>
48
- """
49
-
50
- article = r"""
51
- If ObjCtrl2.5D is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'>Github Repo</a>. Thanks!
52
- [![GitHub Stars](https://img.shields.io/github/stars/TencentARC%2FMotionCtrl
53
- )](https://github.com/TencentARC/MotionCtrl)
54
-
55
- ---
56
-
57
- 📝 **Citation**
58
- <br>
59
- If our work is useful for your research, please consider citing:
60
- ```bibtex
61
- @inproceedings{wang2024motionctrl,
62
- title={Motionctrl: A unified and flexible motion controller for video generation},
63
- author={Wang, Zhouxia and Yuan, Ziyang and Wang, Xintao and Li, Yaowei and Chen, Tianshui and Xia, Menghan and Luo, Ping and Shan, Ying},
64
- booktitle={ACM SIGGRAPH 2024 Conference Papers},
65
- pages={1--11},
66
- year={2024}
67
- }
68
- ```
69
-
70
- 📧 **Contact**
71
- <br>
72
- If you have any questions, please feel free to reach me out at <b>zhouzi1212@gmail.com</b>.
73
-
74
- """
75
-
76
- # -------------- initialization --------------
77
-
78
- CAMERA_MODE = ["Traj2Cam", "Rotate", "Clockwise", "Translate"]
79
-
80
- # select the device for computation
81
- if torch.cuda.is_available():
82
- device = torch.device("cuda")
83
- elif torch.backends.mps.is_available():
84
- device = torch.device("mps")
85
- else:
86
- device = torch.device("cpu")
87
- print(f"using device: {device}")
88
-
89
- # segmentation model
90
- segmentor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny", cache_dir="ckpt", device=device)
91
-
92
- # depth model
93
- d_model_NK = torch.hub.load('./ZoeDepth', 'ZoeD_NK', source='local', pretrained=True).to(device)
94
-
95
- # cameractrl model
96
- config = "configs/svd_320_576_cameractrl.yaml"
97
- model_id = "stabilityai/stable-video-diffusion-img2vid"
98
- ckpt = "checkpoints/CameraCtrl_svd.ckpt"
99
- if not os.path.exists(ckpt):
100
- os.makedirs("checkpoints", exist_ok=True)
101
- os.system("wget -c https://huggingface.co/hehao13/CameraCtrl_SVD_ckpts/resolve/main/CameraCtrl_svd.ckpt?download=true")
102
- os.system("mv CameraCtrl_svd.ckpt?download=true checkpoints/CameraCtrl_svd.ckpt")
103
- model_config = OmegaConf.load(config)
104
-
105
-
106
- pipeline = get_pipeline(model_id, "unet", model_config['down_block_types'], model_config['up_block_types'],
107
- model_config['pose_encoder_kwargs'], model_config['attention_processor_kwargs'],
108
- ckpt, True, device)
109
-
110
- # segmentor = None
111
- # d_model_NK = None
112
- # pipeline = None
113
-
114
-
115
- # -------------- UI definition --------------
116
- with gr.Blocks() as demo:
117
- # layout definition
118
- gr.Markdown(title)
119
- gr.Markdown(authors)
120
- gr.Markdown(affiliation)
121
- gr.Markdown(important_link)
122
- gr.Markdown(description)
123
-
124
-
125
- # with gr.Row():
126
- # gr.Markdown("""# <center>Repositioning the Subject within Image </center>""")
127
- mask = gr.State(value=None) # store mask
128
- removal_mask = gr.State(value=None) # store removal mask
129
- selected_points = gr.State([]) # store points
130
- selected_points_text = gr.Textbox(label="Selected Points", visible=False)
131
-
132
- original_image = gr.State(value=None) # store original input image
133
- masked_original_image = gr.State(value=None) # store masked input image
134
- mask_logits = gr.State(value=None) # store mask logits
135
-
136
- depth = gr.State(value=None) # store depth
137
- org_depth_image = gr.State(value=None) # store original depth image
138
-
139
- camera_pose = gr.State(value=None) # store camera pose
140
-
141
- with gr.Column():
142
-
143
- outlines = """
144
- <font size="5"><b>There are total 5 steps to complete the task.</b></font>
145
- - Step 1: Input an image and Crop it to a suitable size;
146
- - Step 2: Attain the subject mask;
147
- - Step 3: Get depth and Draw Trajectory;
148
- - Step 4: Get camera pose from trajectory or customize it;
149
- - Step 5: Generate the final video.
150
- """
151
-
152
- gr.Markdown(outlines)
153
-
154
-
155
- with gr.Row():
156
- with gr.Column():
157
- # Step 1: Input Image
158
- step1_dec = """
159
- <font size="4"><b>Step 1: Input Image</b></font>
160
- - Select the region using a <mark>bounding box</mark>, aiming for a ratio close to </mark>320:576</mark> (height:width).
161
- - All provided images in `Examples` are in 320 x 576 resolution. Simply press `Process` to proceed.
162
- """
163
- step1 = gr.Markdown(step1_dec)
164
- raw_input = ImagePrompter(type="pil", label="Raw Image", show_label=True, interactive=True)
165
- # left_up_point = gr.Textbox(value = "-1 -1", label="Left Up Point", interactive=True)
166
- process_button = gr.Button("Process")
167
-
168
- with gr.Column():
169
- # Step 2: Get Subject Mask
170
- step2_dec = """
171
- <font size="4"><b>Step 2: Get Subject Mask</b></font>
172
- - Use the <mark>bounding boxes</mark> or <mark>paints</mark> to select the subject.
173
- - Press `Segment Subject` to get the mask. <mark>Can be refined iteratively by updating points<mark>.
174
- """
175
- step2 = gr.Markdown(step2_dec)
176
- canvas = ImagePrompter(type="pil", label="Input Image", show_label=True, interactive=True) # for mask painting
177
-
178
- select_button = gr.Button("Segment Subject")
179
-
180
- with gr.Row():
181
- with gr.Column():
182
- mask_dec = """
183
- <font size="4"><b>Mask Result</b></font>
184
- - Just for visualization purpose. No need to interact.
185
- """
186
- mask_vis = gr.Markdown(mask_dec)
187
- mask_output = gr.Image(type="pil", label="Mask", show_label=True, interactive=False)
188
- with gr.Column():
189
- # Step 3: Get Depth and Draw Trajectory
190
- step3_dec = """
191
- <font size="4"><b>Step 3: Get Depth and Draw Trajectory</b></font>
192
- - Press `Get Depth` to get the depth image.
193
- - Draw the trajectory by selecting points on the depth image. <mark>No more than 14 points</mark>.
194
- - Press `Undo point` to remove all points.
195
- """
196
- step3 = gr.Markdown(step3_dec)
197
- depth_image = gr.Image(type="pil", label="Depth Image", show_label=True, interactive=False)
198
- with gr.Row():
199
- depth_button = gr.Button("Get Depth")
200
- undo_button = gr.Button("Undo point")
201
-
202
- with gr.Row():
203
- with gr.Column():
204
- # Step 4: Trajectory to Camera Pose or Get Camera Pose
205
- step4_dec = """
206
- <font size="4"><b>Step 4: Get camera pose from trajectory or customize it</b></font>
207
- - Option 1: Transform the 2D trajectory to camera poses with depth. <mark>`Rescale` is used for depth alignment. Larger value can speed up the object motion.</mark>
208
- - Option 2: Rotate the camera with a specific `Angle`.
209
- - Option 3: Rotate the camera clockwise or counterclockwise with a specific `Angle`.
210
- - Option 4: Translate the camera with `Tx` (<mark>Pan Left/Right</mark>), `Ty` (<mark>Pan Up/Down</mark>), `Tz` (<mark>Zoom In/Out</mark>) and `Speed`.
211
- """
212
- step4 = gr.Markdown(step4_dec)
213
- camera_pose_vis = gr.Plot(None, label='Camera Pose')
214
- with gr.Row():
215
- with gr.Column():
216
- speed = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.0, label="Speed", interactive=True)
217
- rescale = gr.Slider(minimum=0.0, maximum=10, step=0.1, value=1.0, label="Rescale", interactive=True)
218
- # traj2pose_button = gr.Button("Option1: Trajectory to Camera Pose")
219
-
220
- angle = gr.Slider(minimum=-360, maximum=360, step=1, value=60, label="Angle", interactive=True)
221
- # rotation_button = gr.Button("Option2: Rotate")
222
- # clockwise_button = gr.Button("Option3: Clockwise")
223
- with gr.Column():
224
-
225
- Tx = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tx", interactive=True)
226
- Ty = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Ty", interactive=True)
227
- Tz = gr.Slider(minimum=-1, maximum=1, step=1, value=0, label="Tz", interactive=True)
228
- # translation_button = gr.Button("Option4: Translate")
229
- with gr.Row():
230
- camera_option = gr.Radio(choices = CAMERA_MODE, label='Camera Options', value=CAMERA_MODE[0], interactive=True)
231
- with gr.Row():
232
- get_camera_pose_button = gr.Button("Get Camera Pose")
233
-
234
- with gr.Column():
235
- # Step 5: Get the final generated video
236
- step5_dec = """
237
- <font size="4"><b>Step 5: Get the final generated video</b></font>
238
- - 3 modes for background: <mark>Fixed</mark>, <mark>Reverse</mark>, <mark>Free</mark>.
239
- - Enable <mark>Scale-wise Masks</mark> for better object control.
240
- - Option to enable <mark>Shared Warping Latents</mark> and set <mark>stop frequency</mark> for spatial (`ds`) and temporal (`dt`) dimensions. Larger stop frequency will lead to artifacts.
241
- """
242
- step5 = gr.Markdown(step5_dec)
243
- generated_video = gr.Video(None, label='Generated Video')
244
-
245
- with gr.Row():
246
- seed = gr.Textbox(value = "42", label="Seed", interactive=True)
247
- # num_inference_steps = gr.Slider(minimum=1, maximum=100, step=1, value=25, label="Number of Inference Steps", interactive=True)
248
- bg_mode = gr.Radio(choices = ["Fixed", "Reverse", "Free"], label="Background Mode", value="Fixed", interactive=True)
249
- # swl_mode = gr.Radio(choices = ["Enable SWL", "Disable SWL"], label="Shared Warping Latent", value="Disable SWL", interactive=True)
250
- scale_wise_masks = gr.Checkbox(label="Enable Scale-wise Masks", interactive=True, value=True)
251
- with gr.Row():
252
- with gr.Column():
253
- shared_wapring_latents = gr.Checkbox(label="Enable Shared Warping Latents", interactive=True)
254
- with gr.Column():
255
- ds = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="ds", interactive=True)
256
- dt = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.5, label="dt", interactive=True)
257
-
258
- generated_button = gr.Button("Generate")
259
-
260
-
261
-
262
- # # event definition
263
- process_button.click(
264
- fn = process_image,
265
- inputs = [raw_input],
266
- outputs = [original_image, canvas]
267
- )
268
-
269
- select_button.click(
270
- run_segment(segmentor),
271
- [canvas, original_image, mask_logits],
272
- [mask, mask_output, masked_original_image, mask_logits]
273
- )
274
-
275
- depth_button.click(
276
- run_depth(d_model_NK),
277
- [original_image, selected_points],
278
- [depth, depth_image, org_depth_image]
279
- )
280
-
281
- depth_image.select(
282
- get_points,
283
- [depth_image, selected_points],
284
- [depth_image, selected_points],
285
- )
286
- undo_button.click(
287
- undo_points,
288
- [org_depth_image],
289
- [depth_image, selected_points]
290
- )
291
-
292
- get_camera_pose_button.click(
293
- get_camera_pose(CAMERA_MODE),
294
- [camera_option, selected_points, depth, mask, rescale, angle, Tx, Ty, Tz, speed],
295
- [camera_pose, camera_pose_vis]
296
- )
297
-
298
- generated_button.click(
299
- run(pipeline, device),
300
- [
301
- original_image,
302
- mask,
303
- depth,
304
- camera_pose,
305
- bg_mode,
306
- shared_wapring_latents,
307
- scale_wise_masks,
308
- rescale,
309
- seed,
310
- ds,
311
- dt,
312
- # num_inference_steps
313
- ],
314
- [generated_video],
315
- )
316
-
317
- gr.Examples(
318
- examples=examples,
319
- inputs=[
320
- raw_input,
321
- rescale,
322
- speed,
323
- angle,
324
- Tx,
325
- Ty,
326
- Tz,
327
- camera_option,
328
- bg_mode,
329
- shared_wapring_latents,
330
- scale_wise_masks,
331
- ds,
332
- dt,
333
- seed,
334
- selected_points_text # selected_points
335
- ],
336
- outputs=[generated_video],
337
- examples_per_page=10
338
- )
339
-
340
- selected_points_text.change(
341
- sync_points,
342
- inputs=[selected_points_text],
343
- outputs=[selected_points]
344
- )
345
-
346
-
347
- gr.Markdown(article)
348
-
349
-
350
- demo.queue().launch(share=True)