fix some bugs
- app/run_app.sh +5 -0
- app/src/brushedit_app.py +53 -42
- app/src/vlm_pipeline.py +24 -18
app/run_app.sh
ADDED
@@ -0,0 +1,5 @@
+export PYTHONPATH=.:$PYTHONPATH
+
+export CUDA_VISIBLE_DEVICES=0
+
+python app/src/brushedit_app.py
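The new launch script just prepares the environment and starts the Gradio app: the repo root goes on the import path, the app is pinned to GPU 0, and the entry point is run. A rough Python equivalent, for illustration only (the project itself uses the shell script above):

import os, subprocess

# Mirror app/run_app.sh: repo root on the import path, GPU 0 only.
os.environ["PYTHONPATH"] = "." + os.pathsep + os.environ.get("PYTHONPATH", "")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
subprocess.run(["python", "app/src/brushedit_app.py"], check=True)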
app/src/brushedit_app.py
CHANGED
@@ -337,7 +337,7 @@ vlm_type, vlm_local_path, vlm_processor, vlm_model = vlms_template[DEFAULT_VLM_M
 if vlm_processor != "" and vlm_model != "":
     vlm_model.to(device)
 else:
-    gr.Error("Please Download default VLM model "+ DEFAULT_VLM_MODEL_NAME +" first.")
+    raise gr.Error("Please Download default VLM model "+ DEFAULT_VLM_MODEL_NAME +" first.")


 ## init base model
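In Gradio, gr.Error is an exception class: merely constructing it, as the old line did, creates and discards the object without ever reaching the UI. Raising it is what makes Gradio display the message. A minimal sketch of the difference:

import gradio as gr

def check_model(vlm_model):
    if vlm_model == "":
        # gr.Error("model missing")  # old pattern: constructed, then silently discarded
        raise gr.Error("model missing")  # fixed pattern: surfaced in the UI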
@@ -504,7 +504,7 @@ def random_mask_func(mask, dilation_type='square', dilation_size=20):
     dilated_mask = np.zeros_like(binary_mask, dtype=bool)
     dilated_mask[ellipse_mask] = True
 else:
-
+    ValueError("dilation_type must be 'square' or 'ellipse'")

 # use binary dilation
 dilated_mask = np.uint8(dilated_mask[:,:,np.newaxis]) * 255
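Note that the replacement line constructs a ValueError but, unlike the gr.Error fix above, still does not raise it, so the else branch falls through to the dilation code below. Sketch only; making the check effective would need an explicit raise:

# Sketch only; the committed code omits the raise.
if dilation_type not in ("square", "ellipse"):
    raise ValueError("dilation_type must be 'square' or 'ellipse'")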
@@ -637,7 +637,8 @@ def process(input_image,
 image_pil = input_image["background"].convert("RGB")
 original_image = np.array(image_pil)
 if prompt is None or prompt == "":
-
+    if target_prompt is None or target_prompt == "":
+        raise gr.Error("Please input your instructions, e.g., remove the xxx")

 alpha_mask = input_image["layers"][0].split()[3]
 input_mask = np.asarray(alpha_mask)
@@ -687,17 +688,23 @@ def process(input_image,
 original_mask = input_mask


-
+## inpainting directly if target_prompt is not None
 if category is not None:
-    pass
+    pass
+elif target_prompt is not None and len(target_prompt) >= 1 and original_mask is not None:
+    pass
 else:
-
-
+    try:
+        category = vlm_response_editing_type(vlm_processor, vlm_model, original_image, prompt, device)
+    except Exception as e:
+        raise gr.Error("Please select the correct VLM model and input the correct API Key first!")

+
 if original_mask is not None:
     original_mask = np.clip(original_mask, 0, 255).astype(np.uint8)
 else:
-
+    try:
+        object_wait_for_edit = vlm_response_object_wait_for_edit(
             vlm_processor,
             vlm_model,
             original_image,
@@ -705,30 +712,37 @@ def process(input_image,
             prompt,
             device)

-
-
-
-
-
-
-
-
-
-
-
+        original_mask = vlm_response_mask(vlm_processor,
+                                          vlm_model,
+                                          category,
+                                          original_image,
+                                          prompt,
+                                          object_wait_for_edit,
+                                          sam,
+                                          sam_predictor,
+                                          sam_automask_generator,
+                                          groundingdino_model,
+                                          device)
+    except Exception as e:
+        raise gr.Error("Please select the correct VLM model and input the correct API Key first!")
+
 if original_mask.ndim == 2:
     original_mask = original_mask[:,:,None]


-if len(target_prompt)
-prompt_after_apply_instruction =
+if target_prompt is not None and len(target_prompt) >= 1:
+    prompt_after_apply_instruction = target_prompt
+
+else:
+    try:
+        prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(
             vlm_processor,
             vlm_model,
             original_image,
             prompt,
             device)
-
-
+    except Exception as e:
+        raise gr.Error("Please select the correct VLM model and input the correct API Key first!")

 generator = torch.Generator(device).manual_seed(random.randint(0, 2147483647) if randomize_seed else seed)
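The two hunks above apply the same fix repeatedly: each VLM call is wrapped in try/except so that a missing model or bad API key surfaces as a gr.Error instead of a raw traceback, and a non-empty target_prompt now short-circuits the VLM call entirely. A hypothetical helper condensing the repeated pattern (safe_vlm_call is not part of the codebase):

import gradio as gr

def safe_vlm_call(fn, *args, **kwargs):
    # Turn any backend failure into the user-facing message this commit uses.
    try:
        return fn(*args, **kwargs)
    except Exception:
        raise gr.Error("Please select the correct VLM model and input the correct API Key first!")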
@@ -758,7 +772,8 @@ def process(input_image,
 # image[3].save(f"outputs/image_edit_{uuid}_3.png")
 # mask_image.save(f"outputs/mask_{uuid}.png")
 # masked_image.save(f"outputs/masked_image_{uuid}.png")
-
+# gr.Info(f"Target Prompt: {prompt_after_apply_instruction}", duration=16)
+return image, [mask_image], [masked_image], prompt, '', False


 def generate_target_prompt(input_image,
@@ -774,7 +789,7 @@ def generate_target_prompt(input_image,
                             original_image,
                             prompt,
                             device)
-return prompt_after_apply_instruction
+    return prompt_after_apply_instruction


 def process_mask(input_image,
@@ -1415,7 +1430,7 @@ def init_img(base,
     original_mask = np.array(mask_gallery[0]).astype(np.uint8)[:,:,None] # h,w,1
     return base, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, "", "", "", "Custom resolution", False, False, example_change_times
 else:
-    return base, original_image, None, "", None, None, None, "", "",
+    return base, original_image, None, "", None, None, None, "", "", aspect_ratio, True, False, 0


 def reset_func(input_image,
@@ -1423,7 +1438,7 @@ def reset_func(input_image,
                original_mask,
                prompt,
                target_prompt,
-
+               ):
     input_image = None
     original_image = None
     original_mask = None
@@ -1432,10 +1447,9 @@ def reset_func(input_image,
     masked_gallery = []
     result_gallery = []
     target_prompt = ''
-    target_prompt_output = ''
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-    return input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt,
+    return input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, True, False


 def update_example(example_type,
@@ -1458,7 +1472,8 @@ def update_example(example_type,
     original_mask = np.array(mask_gallery[0]).astype(np.uint8)[:,:,None] # h,w,1
     aspect_ratio = "Custom resolution"
     example_change_times += 1
-    return input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, "",
+    return input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, "", False, example_change_times
+

 block = gr.Blocks(
     theme=gr.themes.Soft(
@@ -1498,6 +1513,8 @@ with block as demo:
         sources=["upload"],
     )

+    prompt = gr.Textbox(label="⌨️ Instruction", placeholder="Please input your instruction.", value="",lines=1)
+    run_button = gr.Button("💫 Run")

     vlm_model_dropdown = gr.Dropdown(label="VLM model", choices=VLM_MODEL_NAMES, value=DEFAULT_VLM_MODEL_NAME, interactive=True)
     with gr.Group():
@@ -1510,12 +1527,6 @@ with block as demo:
         aspect_ratio = gr.Dropdown(label="Output aspect ratio", choices=ASPECT_RATIO_LABELS, value=DEFAULT_ASPECT_RATIO)
         resize_default = gr.Checkbox(label="Short edge resize to 640px", value=True)

-
-    prompt = gr.Textbox(label="⌨️ Instruction", placeholder="Please input your instruction.", value="",lines=1)
-
-    run_button = gr.Button("💫 Run")
-
-
     with gr.Row():
         mask_button = gr.Button("Generate Mask")
         random_mask_button = gr.Button("Square/Circle Mask ")
@@ -1603,7 +1614,7 @@ with block as demo:
         with gr.Tab(elem_classes="feedback", label="Output"):
             result_gallery = gr.Gallery(label='Output', show_label=True, elem_id="gallery", preview=True, height=400)

-            target_prompt_output = gr.Text(label="Output Target Prompt", value="", lines=1, interactive=False)
+            # target_prompt_output = gr.Text(label="Output Target Prompt", value="", lines=1, interactive=False)

     reset_button = gr.Button("Reset")

@@ -1634,9 +1645,9 @@ with block as demo:
     input_image.upload(
         init_img,
         [input_image, init_type, prompt, aspect_ratio, example_change_times],
-        [input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt,
+        [input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, init_type, aspect_ratio, resize_default, invert_mask_state, example_change_times]
     )
-    example_type.change(fn=update_example, inputs=[example_type, prompt, example_change_times], outputs=[input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, target_prompt,
+    example_type.change(fn=update_example, inputs=[example_type, prompt, example_change_times], outputs=[input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, target_prompt, invert_mask_state, example_change_times])

     ## vlm and base model dropdown
     vlm_model_dropdown.change(fn=update_vlm_model, inputs=[vlm_model_dropdown], outputs=[status])
@@ -1666,7 +1677,7 @@ with block as demo:
            invert_mask_state]

     ## run brushedit
-    run_button.click(fn=process, inputs=ips, outputs=[result_gallery, mask_gallery, masked_gallery, prompt, target_prompt,
+    run_button.click(fn=process, inputs=ips, outputs=[result_gallery, mask_gallery, masked_gallery, prompt, target_prompt, invert_mask_state])

     ## mask func
     mask_button.click(fn=process_mask, inputs=[input_image, original_image, prompt, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask, category])
@@ -1681,10 +1692,10 @@ with block as demo:
     move_down_button.click(fn=move_mask_down, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])

     ## prompt func
-    generate_target_prompt_button.click(fn=generate_target_prompt, inputs=[input_image, original_image, prompt], outputs=[target_prompt
+    generate_target_prompt_button.click(fn=generate_target_prompt, inputs=[input_image, original_image, prompt], outputs=[target_prompt])

     ## reset func
-    reset_button.click(fn=reset_func, inputs=[input_image, original_image, original_mask, prompt, target_prompt
+    reset_button.click(fn=reset_func, inputs=[input_image, original_image, original_mask, prompt, target_prompt], outputs=[input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, resize_default, invert_mask_state])


 demo.launch()
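Several of the wiring fixes above complete truncated outputs lists. In Gradio, the values a handler returns are assigned positionally to its outputs components, so the two must match in count and order (process now returns six values for the six outputs of run_button.click). A self-contained sketch of the rule, with made-up components:

import gradio as gr

def reset():
    # Three return values for the three output components below.
    return None, "", []

with gr.Blocks() as demo:
    img = gr.Image()
    txt = gr.Textbox()
    gal = gr.Gallery()
    gr.Button("Reset").click(fn=reset, inputs=[], outputs=[img, txt, gal])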
app/src/vlm_pipeline.py
CHANGED
@@ -98,10 +98,12 @@ def vlm_response_editing_type(vlm_processor,
     messages = create_editing_category_messages_qwen2(editing_prompt)
     response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device=device)

-
-
-
-
+    try:
+        for category_name in ["Addition","Remove","Local","Global","Background"]:
+            if category_name.lower() in response_str.lower():
+                return category_name
+    except Exception as e:
+        raise gr.Error("Please input OpenAI API Key. Or please input correct commands, including add, delete, and modify commands. If it still does not work, please switch to a more powerful VLM.")


 ### response object to be edited
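The added loop does a case-insensitive substring match of each known category against the model's free-form reply; the except can only fire if the inference step failed to produce a string, and when no category matches, the function falls through and returns None. A small standalone illustration with an assumed reply:

response_str = "The editing category is: Remove."  # assumed VLM reply
for category_name in ["Addition", "Remove", "Local", "Global", "Background"]:
    if category_name.lower() in response_str.lower():
        print(category_name)  # -> Remove
        break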
@@ -206,17 +208,21 @@ def vlm_response_prompt_after_apply_instruction(vlm_processor,
                                                image,
                                                editing_prompt,
                                                device):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    try:
+        if isinstance(vlm_model, OpenAI):
+            base64_image = encode_image(image)
+            messages = create_apply_editing_messages_gpt4o(editing_prompt, base64_image)
+            response_str = run_gpt4o_vl_inference(vlm_model, messages)
+        elif isinstance(vlm_model, LlavaNextForConditionalGeneration):
+            messages = create_apply_editing_messages_llava(editing_prompt)
+            response_str = run_llava_next_inference(vlm_processor, vlm_model, messages, image, device)
+        elif isinstance(vlm_model, Qwen2VLForConditionalGeneration):
+            base64_image = encode_image(image)
+            messages = create_apply_editing_messages_qwen2(editing_prompt, base64_image)
+            response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device)
+        else:
+            raise gr.Error("Please select the correct VLM model and input the correct API Key first!")
+    except Exception as e:
+        raise gr.Error("Please select the correct VLM model and input the correct API Key first!")
+    return response_str
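vlm_response_prompt_after_apply_instruction now routes by the concrete client type: an OpenAI client gets the GPT-4o path, a LlavaNextForConditionalGeneration the LLaVA path, and a Qwen2VLForConditionalGeneration the Qwen2-VL path, with anything else rejected up front. A self-contained sketch of that dispatch shape, using stand-in classes rather than the real openai/transformers types:

class GPT4oClient: pass
class LlavaNext: pass
class Qwen2VL: pass

def build_response(vlm_model, prompt):
    # Pick the backend by concrete type; fail loudly on anything unknown.
    if isinstance(vlm_model, GPT4oClient):
        return f"gpt4o:{prompt}"
    elif isinstance(vlm_model, LlavaNext):
        return f"llava:{prompt}"
    elif isinstance(vlm_model, Qwen2VL):
        return f"qwen2:{prompt}"
    raise ValueError("unsupported VLM backend")

print(build_response(Qwen2VL(), "remove the dog"))  # -> qwen2:remove the dog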