Spaces:

sczhou
/

ProPainter

Running on A10G

App Files Files Community

PeiqingYang committed on Nov 30, 2023

Commit

2f89268

1 Parent(s): 5c532b1

add highlight guidance, set memory limit

Browse files

Files changed (3) hide show

web-demos/hugging_face/app.py +56 -40
web-demos/hugging_face/inpainter/base_inpainter.py +1 -1
web-demos/hugging_face/track_anything.py +2 -2

web-demos/hugging_face/app.py CHANGED Viewed

@@ -71,56 +71,65 @@ def get_frames_from_video(video_input, video_state):
     video_path = video_input
     frames = []
     user_name = time.time()
-    operation_log = [("",""),("Video uploaded! Try to click the image shown in step2 to add masks.","Normal")]
     try:
         cap = cv2.VideoCapture(video_path)
         fps = cap.get(cv2.CAP_PROP_FPS)
-        while cap.isOpened():
             ret, frame = cap.read()
             if ret == True:
-                current_memory_usage = psutil.virtual_memory().percent
                 frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-                # if current_memory_usage > 90:
-                #     operation_log = [("Memory usage is too high (>90%). Stop the video extraction. Please reduce the video resolution or frame rate.", "Error")]
-                #     print("Memory usage is too high (>90%). Please reduce the video resolution or frame rate.")
-                #     break
-            else:
-                break
-        # TODO: hard code to avoid out of memory
-        t, h, w = len(frames), frames[0].shape[0], frames[0].shape[1]
-        print(f'Inp video shape: t_{t}, s_{h}x_{w}')
-        if len(frames) > 150 and max(frames[0].shape) > 1024:
-            raise ValueError('Due to GPU memory constraints, the current version of this demo supports videos \
-                             with a maximum length of 150 and a maximum resolution of 1024. \
-                             We will continue to optimize it after the CVPR 2024 deadline. \
-                             Please stay tuned!')
     except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
         print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
-    image_size = (frames[0].shape[0],frames[0].shape[1])
     # initialize video_state
     video_state = {
         "user_name": user_name,
         "video_name": os.path.split(video_path)[-1],
         "origin_images": frames,
         "painted_images": frames.copy(),
-        "masks": [np.zeros((frames[0].shape[0],frames[0].shape[1]), np.uint8)]*len(frames),
         "logits": [None]*len(frames),
         "select_frame_number": 0,
         "fps": fps
         }
-    video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), len(frames), image_size)
     model.samcontroler.sam_controler.reset_image()
     model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
-    return video_state, video_info, video_state["origin_images"][0], gr.update(visible=True, maximum=len(frames), value=1), gr.update(visible=True, maximum=len(frames), value=len(frames)), \
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True),\
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True, choices=[], value=[]), \
-                        gr.update(visible=True, value=operation_log), gr.update(visible=True, value=operation_log)
 # get the select frame from gradio slider
 def select_template(image_selection_slider, video_state, interactive_state, mask_dropdown):
@@ -175,7 +184,10 @@ def sam_refine(video_state, point_prompt, click_state, interactive_state, evt:gr
     video_state["logits"][video_state["select_frame_number"]] = logit
     video_state["painted_images"][video_state["select_frame_number"]] = painted_image
-    operation_log = [("",""), ("You can try to add positive or negative points by clicking, click Clear clicks button to refresh the image, click Add mask button when you are satisfied with the segment, or click Remove mask button to remove all added masks.","Normal")]
     return painted_image, video_state, interactive_state, operation_log, operation_log
 def add_multi_mask(video_state, interactive_state, mask_dropdown):
@@ -326,7 +338,7 @@ def generate_video_from_frames(frames, output_path, fps=30):
     return output_path
 def restart():
-    operation_log = [("",""), ("Try to upload your video and click the Get video info button to get started!", "Normal")]
     return {
             "user_name": "",
             "video_name": "",
@@ -423,6 +435,7 @@ span.svelte-s1r2yt {font-size: 17px !important; font-weight: bold !important; co
 button {border-radius: 8px !important;}
 .add_button {background-color: #4CAF50 !important;}
 .remove_button {background-color: #f44336 !important;}
 .mask_button_group {gap: 10px !important;}
 .video {height: 300px !important;}
 .image {height: 300px !important;}
@@ -512,7 +525,8 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=css) as iface:
                 video_input = gr.Video(elem_classes="video")
                 extract_frames_button = gr.Button(value="Get video info", interactive=True, variant="primary")
             with gr.Column(scale=2):
-                run_status = gr.HighlightedText(value=[("",""), ("Try to upload your video and click the Get svideo info button to get started!", "Normal")])
                 video_info = gr.Textbox(label="Video Info")
@@ -524,12 +538,10 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=css) as iface:
                 image_selection_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track start frame", visible=False)
                 track_pause_number_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track end frame", visible=False)
             with gr.Column(scale=2, elem_classes="jc_center"):
-                run_status2 = gr.HighlightedText(value=[("",""), ("Try to upload your video and click the Get svideo info button to get started!", "Normal")], visible=False)
-                with gr.Row():
-                    with gr.Column(scale=2, elem_classes="mask_button_group"):
-                        clear_button_click = gr.Button(value="Clear clicks", interactive=True, visible=False)
-                        remove_mask_button = gr.Button(value="Remove mask", interactive=True, visible=False, elem_classes="remove_button")
-                        Add_mask_button = gr.Button(value="Add mask", interactive=True, visible=False, elem_classes="add_button")
                     point_prompt = gr.Radio(
                         choices=["Positive", "Negative"],
                         value="Positive",
@@ -537,7 +549,11 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=css) as iface:
                         interactive=True,
                         visible=False,
                         min_width=100,
-                        scale=1)
                 mask_dropdown = gr.Dropdown(multiselect=True, value=[], label="Mask selection", info=".", visible=False)
         # output video

     video_path = video_input
     frames = []
     user_name = time.time()
+    status_ok = True
+    operation_log = [("[Must Do]", "Click image"), (": Video uploaded! Try to click the image shown in step2 to add masks.\n", None)]
     try:
         cap = cv2.VideoCapture(video_path)
         fps = cap.get(cv2.CAP_PROP_FPS)
+        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if length >= 500:
+            operation_log = [("You uploaded a video with more than 500 frames. Stop the video extraction. Kindly lower the video frame rate to a value below 500. We highly recommend deploying the demo locally for long video processing.", "Error")]
             ret, frame = cap.read()
             if ret == True:
+                original_h, original_w = frame.shape[:2]
+                scale_factor = min(1, 1280/max(original_h, original_w))
+                target_h, target_w = int(original_h*scale_factor), int(original_w*scale_factor)
                 frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            status_ok = False
+        else:
+            while cap.isOpened():
+                ret, frame = cap.read()
+                if ret == True:
+                    # resize input image
+                    original_h, original_w = frame.shape[:2]
+                    scale_factor = min(1, 1280/max(original_h, original_w))
+                    target_h, target_w = int(original_h*scale_factor), int(original_w*scale_factor)
+                    frame = cv2.resize(frame, (target_w, target_h))
+                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+                else:
+                    break
+            t = len(frames)
+            print(f'Inp video shape: t_{t}, s_{original_h}x{original_w} to s_{target_h}x{target_w}')
     except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
+        status_ok = False
         print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
     # initialize video_state
+    if frames[0].shape[0] > 720 or frames[0].shape[1] > 720:
+         operation_log = [(f"Video uploaded! Try to click the image shown in step2 to add masks. (You uploaded a video with a size of {original_w}x{original_h}, and the length of its longest edge exceeds 720 pixels. We may resize the input video during processing.)", "Normal")]
     video_state = {
         "user_name": user_name,
         "video_name": os.path.split(video_path)[-1],
         "origin_images": frames,
         "painted_images": frames.copy(),
+        "masks": [np.zeros((target_h, target_w), np.uint8)]*len(frames),
         "logits": [None]*len(frames),
         "select_frame_number": 0,
         "fps": fps
         }
+    video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), length, (original_w, original_h))
     model.samcontroler.sam_controler.reset_image()
     model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
+    return video_state, video_info, video_state["origin_images"][0], gr.update(visible=status_ok, maximum=len(frames), value=1), gr.update(visible=status_ok, maximum=len(frames), value=len(frames)), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok),\
+                        gr.update(visible=status_ok), gr.update(visible=status_ok), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok, choices=[], value=[]), \
+                        gr.update(visible=True, value=operation_log), gr.update(visible=status_ok, value=operation_log)
 # get the select frame from gradio slider
 def select_template(image_selection_slider, video_state, interactive_state, mask_dropdown):
     video_state["logits"][video_state["select_frame_number"]] = logit
     video_state["painted_images"][video_state["select_frame_number"]] = painted_image
+    operation_log = [("[Must Do]", "Add mask"), (": add the current displayed mask for video segmentation.\n", None),
+                     ("[Optional]", "Remove mask"), (": remove all added masks.\n", None),
+                     ("[Optional]", "Clear clicks"), (": clear current displayed mask.\n", None),
+                     ("[Optional]", "Click image"), (": Try to click the image shown in step2 if you want to generate more masks.\n", None)]
     return painted_image, video_state, interactive_state, operation_log, operation_log
 def add_multi_mask(video_state, interactive_state, mask_dropdown):
     return output_path
 def restart():
+    operation_log = [("",""), ("Try to upload your video and click the Get video info button to get started! (Kindly ensure that the uploaded video consists of fewer than 500 frames in total)", "Normal")]
     return {
             "user_name": "",
             "video_name": "",
 button {border-radius: 8px !important;}
 .add_button {background-color: #4CAF50 !important;}
 .remove_button {background-color: #f44336 !important;}
+.clear_button {background-color: gray !important;}
 .mask_button_group {gap: 10px !important;}
 .video {height: 300px !important;}
 .image {height: 300px !important;}
                 video_input = gr.Video(elem_classes="video")
                 extract_frames_button = gr.Button(value="Get video info", interactive=True, variant="primary")
             with gr.Column(scale=2):
+                run_status = gr.HighlightedText(value=[("",""), ("Try to upload your video and click the Get video info button to get started! (Kindly ensure that the uploaded video consists of fewer than 500 frames in total)", "Normal")],
+                                                color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"})
                 video_info = gr.Textbox(label="Video Info")
                 image_selection_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track start frame", visible=False)
                 track_pause_number_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track end frame", visible=False)
             with gr.Column(scale=2, elem_classes="jc_center"):
+                run_status2 = gr.HighlightedText(value=[("",""), ("Try to upload your video and click the Get video info button to get started! (Kindly ensure that the uploaded video consists of fewer than 500 frames in total)", "Normal")],
+                                                 color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"},
+                                                 visible=False)
+                with gr.Column():
                     point_prompt = gr.Radio(
                         choices=["Positive", "Negative"],
                         value="Positive",
                         interactive=True,
                         visible=False,
                         min_width=100,
+                        scale=1,)
+                    with gr.Row(scale=2, elem_classes="mask_button_group"):
+                        Add_mask_button = gr.Button(value="Add mask", interactive=True, visible=False, elem_classes="add_button")
+                        remove_mask_button = gr.Button(value="Remove mask", interactive=True, visible=False, elem_classes="remove_button")
+                        clear_button_click = gr.Button(value="Clear clicks", interactive=True, visible=False, elem_classes="clear_button")
                 mask_dropdown = gr.Dropdown(multiselect=True, value=[], label="Mask selection", info=".", visible=False)
         # output video

web-demos/hugging_face/inpainter/base_inpainter.py CHANGED Viewed

@@ -205,7 +205,7 @@ class ProInpainter:
 		# The output size should be divisible by 2 so that it can be encoded by libx264
 		size = (int(ratio*size[0])//2*2, int(ratio*size[1])//2*2)
-		# TODO: hard code to reduce memory
 		if max(size[0], size[1]) > 720:
 			scale = 720.0 / max(size[0], size[1])
 			# The output size should be divisible by 2 so that it can be encoded by libx264

 		# The output size should be divisible by 2 so that it can be encoded by libx264
 		size = (int(ratio*size[0])//2*2, int(ratio*size[1])//2*2)
+		# set propainter size limit to 720 to reduce memory usage
 		if max(size[0], size[1]) > 720:
 			scale = 720.0 / max(size[0], size[1])
 			# The output size should be divisible by 2 so that it can be encoded by libx264

web-demos/hugging_face/track_anything.py CHANGED Viewed

@@ -17,7 +17,7 @@ class TrackingAnything():
         mask, logit, painted_image = self.samcontroler.first_frame_click(image, points, labels, multimask)
         return mask, logit, painted_image
-    def generator(self, images: list, template_mask:np.ndarray):
         masks = []
         logits = []
         painted_images = []
@@ -31,7 +31,7 @@ class TrackingAnything():
                 mask, logit, painted_image = self.cutie.track(images[i])
                 masks.append(mask)
                 logits.append(logit)
-                painted_images.append(painted_image)
         return masks, logits, painted_images

         mask, logit, painted_image = self.samcontroler.first_frame_click(image, points, labels, multimask)
         return mask, logit, painted_image
+    def generator(self, images: list, template_mask:np.ndarray, size_limit=1024):
         masks = []
         logits = []
         painted_images = []
                 mask, logit, painted_image = self.cutie.track(images[i])
                 masks.append(mask)
                 logits.append(logit)
+                painted_images.append(painted_image)
         return masks, logits, painted_images