OmniParser

Paused

App Files Files Community

adamlu1 committed 2 days ago

Commit

b35e1d0

•

1 Parent(s): 39f8e6b

auto adj bbox width

Browse files

Files changed (2) hide show

app.py +14 -29
imgs/saved_image_demo.png +0 -0

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from typing import Optional
-import spaces
 import gradio as gr
 import numpy as np
@@ -25,31 +25,6 @@ caption_model_processor = {'processor': processor, 'model': model}
 print('finish loading model!!!')
-platform = 'pc'
-if platform == 'pc':
-    draw_bbox_config = {
-        'text_scale': 0.8,
-        'text_thickness': 2,
-        'text_padding': 2,
-        'thickness': 2,
-    }
-elif platform == 'web':
-    draw_bbox_config = {
-        'text_scale': 0.8,
-        'text_thickness': 2,
-        'text_padding': 3,
-        'thickness': 3,
-    }
-elif platform == 'mobile':
-    draw_bbox_config = {
-        'text_scale': 0.8,
-        'text_thickness': 2,
-        'text_padding': 3,
-        'thickness': 3,
-    }
 MARKDOWN = """
 # OmniParser for Pure Vision Based General GUI Agent 🔥
 <div>
@@ -59,6 +34,8 @@ MARKDOWN = """
 </div>
 OmniParser is a screen parsing tool to convert general GUI screen to structured elements.
 """
 # DEVICE = torch.device('cuda')
@@ -66,7 +43,7 @@ OmniParser is a screen parsing tool to convert general GUI screen to structured
 # @spaces.GPU
 @torch.inference_mode()
 # @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
-@spaces.GPU(duration=65)
 def process(
     image_input,
     box_threshold,
@@ -76,6 +53,14 @@ def process(
     image_save_path = 'imgs/saved_image_demo.png'
     image_input.save(image_save_path)
     # import pdb; pdb.set_trace()
     ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=True)
     text, ocr_bbox = ocr_bbox_rslt
@@ -117,5 +102,5 @@ with gr.Blocks() as demo:
     )
 # demo.launch(debug=False, show_error=True, share=True)
-# demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
-demo.queue().launch(share=False)

 from typing import Optional
+# import spaces
 import gradio as gr
 import numpy as np
 print('finish loading model!!!')
 MARKDOWN = """
 # OmniParser for Pure Vision Based General GUI Agent 🔥
 <div>
 </div>
 OmniParser is a screen parsing tool to convert general GUI screen to structured elements.
+📢 [[Project Page](https://microsoft.github.io/OmniParser/)] [[Blog Post](https://www.microsoft.com/en-us/research/articles/omniparser-for-pure-vision-based-gui-agent/)] [[Models](https://huggingface.co/microsoft/OmniParser)]
 """
 # DEVICE = torch.device('cuda')
 # @spaces.GPU
 @torch.inference_mode()
 # @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+# @spaces.GPU(duration=65)
 def process(
     image_input,
     box_threshold,
     image_save_path = 'imgs/saved_image_demo.png'
     image_input.save(image_save_path)
     # import pdb; pdb.set_trace()
+    image = Image.open(image_save_path)
+    box_overlay_ratio = image.size[0] / 3200
+    draw_bbox_config = {
+        'text_scale': 0.8 * box_overlay_ratio,
+        'text_thickness': max(int(2 * box_overlay_ratio), 1),
+        'text_padding': max(int(3 * box_overlay_ratio), 1),
+        'thickness': max(int(3 * box_overlay_ratio), 1),
+    }
     ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=True)
     text, ocr_bbox = ocr_bbox_rslt
     )
 # demo.launch(debug=False, show_error=True, share=True)
+demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
+# demo.queue().launch(share=False)

imgs/saved_image_demo.png CHANGED Viewed