jiuface committed on
Commit 6d7bbcd
1 Parent(s): 2a31f6e

add invert_mask

Files changed (1): app.py (+49 -9)
app.py CHANGED
@@ -29,6 +29,15 @@ if torch.cuda.get_device_properties(0).major >= 8:
 FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
 SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
 
+def fetch_image_from_url(image_url):
+    try:
+        response = requests.get(image_url)
+        response.raise_for_status()
+        img = Image.open(BytesIO(response.content))
+        return img
+    except Exception as e:
+        return None
+
 class calculateDuration:
     def __init__(self, activity_name=""):
         self.activity_name = activity_name
@@ -55,7 +64,7 @@ class calculateDuration:
 @spaces.GPU()
 @torch.inference_mode()
 @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
-def process_image(image_input, image_url, task_prompt, text_prompt=None, dilate=0, merge_masks=False, return_rectangles=False) -> Optional[Image.Image]:
+def process_image(image_input, image_url, task_prompt, text_prompt=None, dilate=0, merge_masks=False, return_rectangles=False, invert_mask=False) -> Optional[Image.Image]:
 
     if not image_input:
         gr.Info("Please upload an image.")
@@ -68,9 +77,7 @@ def process_image(image_input, image_url, task_prompt, text_prompt=None, dilate=
     if image_url:
         with calculateDuration("Download Image"):
             print("start to fetch image from url", image_url)
-            response = requests.get(image_url)
-            response.raise_for_status()
-            image_input = PIL.Image.open(BytesIO(response.content))
+            image_input = fetch_image_from_url(image_url)
             print("fetch image success")
 
     # start to parse prompt
@@ -131,10 +138,30 @@ def process_image(image_input, image_url, task_prompt, text_prompt=None, dilate=
         for mask in images:
             merged_mask = cv2.bitwise_or(merged_mask, mask)
         images = [merged_mask]
+    if invert_mask:
+        with calculateDuration("invert mask colors"):
+            images = [cv2.bitwise_not(mask) for mask in images]
 
     return images
 
 
+def update_task_info(task_prompt):
+    task_info = {
+        '<OD>': "Object Detection: Detect objects in the image.",
+        '<CAPTION_TO_PHRASE_GROUNDING>': "Phrase Grounding: Link phrases in captions to corresponding regions in the image.",
+        '<DENSE_REGION_CAPTION>': "Dense Region Captioning: Generate captions for different regions in the image.",
+        '<REGION_PROPOSAL>': "Region Proposal: Propose potential regions of interest in the image.",
+        '<OCR_WITH_REGION>': "OCR with Region: Extract text and its bounding regions from the image.",
+        '<REFERRING_EXPRESSION_SEGMENTATION>': "Referring Expression Segmentation: Segment the region referred to by a natural language expression.",
+        '<REGION_TO_SEGMENTATION>': "Region to Segmentation: Convert region proposals into detailed segmentations.",
+        '<OPEN_VOCABULARY_DETECTION>': "Open Vocabulary Detection: Detect objects based on open vocabulary concepts.",
+        '<REGION_TO_CATEGORY>': "Region to Category: Assign categories to proposed regions.",
+        '<REGION_TO_DESCRIPTION>': "Region to Description: Generate descriptive text for specified regions."
+    }
+    return task_info.get(task_prompt, "Select a task to see its description.")
+
+
+
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
@@ -143,18 +170,31 @@ with gr.Blocks() as demo:
             task_prompt = gr.Dropdown(
                 ['<OD>', '<CAPTION_TO_PHRASE_GROUNDING>', '<DENSE_REGION_CAPTION>', '<REGION_PROPOSAL>', '<OCR_WITH_REGION>', '<REFERRING_EXPRESSION_SEGMENTATION>', '<REGION_TO_SEGMENTATION>', '<OPEN_VOCABULARY_DETECTION>', '<REGION_TO_CATEGORY>', '<REGION_TO_DESCRIPTION>'], value="<CAPTION_TO_PHRASE_GROUNDING>", label="Task Prompt", info="task prompts"
             )
-            dilate = gr.Slider(label="dilate mask", minimum=0, maximum=50, value=10, step=1)
-            merge_masks = gr.Checkbox(label="Merge masks", value=False)
-            return_rectangles = gr.Checkbox(label="Return Rectangles", value=False)
+            task_info = gr.Textbox(label='Task Info', value=update_task_info("<CAPTION_TO_PHRASE_GROUNDING>"), interactive=False)
+            dilate = gr.Slider(label="dilate mask", minimum=0, maximum=50, value=10, step=1, info="The dilate parameter controls the expansion of the mask's white areas by a specified number of pixels. Increasing this value will enlarge the white regions, which can help in smoothing out the mask's edges or covering more area in the segmentation.")
+            merge_masks = gr.Checkbox(label="Merge masks", value=False, info="The merge_masks parameter combines all the individual masks into a single mask. When enabled, the separate masks generated for different objects or regions will be merged into one unified mask, which can simplify further processing or visualization.")
+            return_rectangles = gr.Checkbox(label="Return Rectangles", value=False, info="The return_rectangles parameter, when enabled, generates masks as filled white rectangles corresponding to the bounding boxes of detected objects, rather than detailed contours or segments. This option is useful for simpler, box-based visualizations.")
+            invert_mask = gr.Checkbox(label="invert mask", value=False, info="The invert_mask option allows you to reverse the colors of the generated mask, changing black areas to white and white areas to black. This can be useful for visualizing or processing the mask in a different context.")
             text_prompt = gr.Textbox(label='Text prompt', placeholder='Enter text prompts')
             submit_button = gr.Button(value='Submit', variant='primary')
         with gr.Column():
             image_gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery", columns=[3], rows=[1], object_fit="contain", height="auto")
             # json_result = gr.Code(label="JSON Result", language="json")
-
+
+    task_prompt.change(
+        fn=update_task_info,
+        inputs=[task_prompt],
+        outputs=[task_info]
+    )
+    image_url.change(
+        fn=fetch_image_from_url,
+        inputs=[image_url],
+        outputs=[image]
+    )
+
     submit_button.click(
         fn=process_image,
-        inputs=[image, image_url, task_prompt, text_prompt, dilate, merge_masks, return_rectangles],
+        inputs=[image, image_url, task_prompt, text_prompt, dilate, merge_masks, return_rectangles, invert_mask],
        outputs=[image_gallery],
        show_api=False
    )
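
For context, a minimal, self-contained sketch (not part of the commit) of the two mask operations the new UI options describe: the invert step uses cv2.bitwise_not exactly as in the hunk above, while the dilate lines are only an assumed illustration of what the "dilate mask" slider controls, since that part of process_image lies outside this diff.

import cv2
import numpy as np

# A toy single-channel mask: 0 = background, 255 = foreground.
mask = np.zeros((8, 8), dtype=np.uint8)
mask[3:5, 3:5] = 255

# invert_mask: flip foreground and background, as done in process_image above.
inverted = cv2.bitwise_not(mask)
assert inverted[0, 0] == 255 and inverted[3, 3] == 0

# dilate (assumed illustration, not shown in this diff): grow the white region
# by a few pixels, roughly what the "dilate mask" slider is described as doing.
kernel = np.ones((3, 3), np.uint8)
dilated = cv2.dilate(mask, kernel, iterations=2)
assert dilated.sum() > mask.sum()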