Update app.py
app.py CHANGED
@@ -823,10 +823,10 @@ def compress_image(image):
     return compressed_img
 
 @spaces.GPU(duration=60)
-@torch.inference_mode
+@torch.inference_mode
 def process_image(input_image, input_text):
     """Main processing function for the Gradio interface"""
-
+
     # Initialize configs
     API_TOKEN = "9c8c865e10ec1821bea79d9fa9dc8720"
     SAM2_CHECKPOINT = "./checkpoints/sam2_hiera_large.pt"
@@ -835,6 +835,8 @@ def process_image(input_image, input_text):
     OUTPUT_DIR = Path("outputs/grounded_sam2_dinox_demo")
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 
+
+
     # Initialize DDS client
     config = Config(API_TOKEN)
     client = Client(config)
@@ -850,102 +852,215 @@ def process_image(input_image, input_text):
     image_url = client.upload_file(tmpfile.name)
     os.remove(tmpfile.name)
 
-    # Run DINO-X detection
-    task = DinoxTask(
-        image_url=image_url,
-        prompts=[TextPrompt(text=input_text)]
-    )
-    client.run_task(task)
-    result = task.result
-    objects = result.objects
-
     # Process detection results
     input_boxes = []
+    masks = []
     confidences = []
     class_names = []
     class_ids = []
 
-    [... further removed lines are not legible in the rendered diff ...]
+    if len(input_text) == 0:
+        task = DinoxTask(
+            image_url=image_url,
+            prompts=[TextPrompt(text="<prompt_free>")],
+            # targets=[DetectionTarget.BBox, DetectionTarget.Mask]
+        )
+
+        client.run_task(task)
+        predictions = task.result.objects
+        classes = [pred.category for pred in predictions]
+        classes = list(set(classes))
+        class_name_to_id = {name: id for id, name in enumerate(classes)}
+        class_id_to_name = {id: name for name, id in class_name_to_id.items()}
+
+        for idx, obj in enumerate(predictions):
+            input_boxes.append(obj.bbox)
+            masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size))  # convert mask to np.array using DDS API
+            confidences.append(obj.score)
+            cls_name = obj.category.lower().strip()
+            class_names.append(cls_name)
+            class_ids.append(class_name_to_id[cls_name])
+
+        boxes = np.array(input_boxes)
+        masks = np.array(masks)
+        class_ids = np.array(class_ids)
+        labels = [
+            f"{class_name} {confidence:.2f}"
+            for class_name, confidence
+            in zip(class_names, confidences)
+        ]
+        detections = sv.Detections(
+            xyxy=boxes,
+            mask=masks.astype(bool),
+            class_id=class_ids
+        )
+
+        box_annotator = sv.BoxAnnotator()
+        label_annotator = sv.LabelAnnotator()
+        mask_annotator = sv.MaskAnnotator()
+
+        annotated_frame = input_image.copy()
+        annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections)
+        annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+        annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
+
+        # Create transparent mask for first detected object
+        if len(detections) > 0:
+            # Get first mask
+            first_mask = detections.mask[0]
+
+            # Get original RGB image
+            img = input_image.copy()
+            H, W, C = img.shape
+
+            # Create RGBA image
+            alpha = np.zeros((H, W, 1), dtype=np.uint8)
+            alpha[first_mask] = 255
+            rgba = np.dstack((img, alpha)).astype(np.uint8)
+
+            # Crop to mask bounds to minimize image size
+            y_indices, x_indices = np.where(first_mask)
+            y_min, y_max = y_indices.min(), y_indices.max()
+            x_min, x_max = x_indices.min(), x_indices.max()
+
+            # Crop the RGBA image
+            cropped_rgba = rgba[y_min:y_max+1, x_min:x_max+1]
+
+            # Set extracted foreground for mask mover
+            mask_mover.set_extracted_fg(cropped_rgba)
+
+            return annotated_frame, cropped_rgba, gr.update(visible=True), gr.update(visible=True)
+
+    else:
+        # Run DINO-X detection
+        task = DinoxTask(
+            image_url=image_url,
+            prompts=[TextPrompt(text=input_text)],
+            targets=[DetectionTarget.BBox, DetectionTarget.Mask]
+        )
+
+        client.run_task(task)
+        result = task.result
+        objects = result.objects
+
+        # for obj in objects:
+        #     input_boxes.append(obj.bbox)
+        #     confidences.append(obj.score)
+        #     cls_name = obj.category.lower().strip()
+        #     class_names.append(cls_name)
+        #     class_ids.append(class_name_to_id[cls_name])
+
+        # input_boxes = np.array(input_boxes)
+        # class_ids = np.array(class_ids)
+
+        predictions = task.result.objects
+        classes = [x.strip().lower() for x in input_text.split('.') if x]
+        class_name_to_id = {name: id for id, name in enumerate(classes)}
+        class_id_to_name = {id: name for name, id in class_name_to_id.items()}
+
+        boxes = []
+        masks = []
+        confidences = []
+        class_names = []
+        class_ids = []
+
+        for idx, obj in enumerate(predictions):
+            boxes.append(obj.bbox)
+            masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size))  # convert mask to np.array using DDS API
+            confidences.append(obj.score)
+            cls_name = obj.category.lower().strip()
+            class_names.append(cls_name)
+            class_ids.append(class_name_to_id[cls_name])
+
+        boxes = np.array(boxes)
+        masks = np.array(masks)
+        class_ids = np.array(class_ids)
+        labels = [
+            f"{class_name} {confidence:.2f}"
+            for class_name, confidence
+            in zip(class_names, confidences)
+        ]
+
+        # Initialize SAM2
+        # torch.autocast(device_type=DEVICE, dtype=torch.bfloat16).__enter__()
+        # if torch.cuda.get_device_properties(0).major >= 8:
+        #     torch.backends.cuda.matmul.allow_tf32 = True
+        #     torch.backends.cudnn.allow_tf32 = True
+
+        # sam2_model = build_sam2(SAM2_MODEL_CONFIG, SAM2_CHECKPOINT, device=DEVICE)
+        # sam2_predictor = SAM2ImagePredictor(sam2_model)
+        # sam2_predictor.set_image(input_image)
+
+        # sam2_predictor = run_sam_inference(SAM_IMAGE_MODEL, input_image, detections)
+
+        # Get masks from SAM2
+        # masks, scores, logits = sam2_predictor.predict(
+        #     point_coords=None,
+        #     point_labels=None,
+        #     box=input_boxes,
+        #     multimask_output=False,
+        # )
+
+        if masks.ndim == 4:
+            masks = masks.squeeze(1)
+
+        # Create visualization
+        # labels = [f"{class_name} {confidence:.2f}"
+        #           for class_name, confidence in zip(class_names, confidences)]
+
+        # detections = sv.Detections(
+        #     xyxy=input_boxes,
+        #     mask=masks.astype(bool),
+        #     class_id=class_ids
+        # )
+
+        detections = sv.Detections(
+            xyxy = boxes,
+            mask = masks.astype(bool),
+            class_id = class_ids,
+        )
+
+        box_annotator = sv.BoxAnnotator()
+        label_annotator = sv.LabelAnnotator()
+        mask_annotator = sv.MaskAnnotator()
+
+        annotated_frame = input_image.copy()
+        annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections)
+        annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+        annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
+
+        # Create transparent mask for first detected object
+        if len(detections) > 0:
+            # Get first mask
+            first_mask = detections.mask[0]
+
+            # Get original RGB image
+            img = input_image.copy()
+            H, W, C = img.shape
+
+            # Create RGBA image
+            alpha = np.zeros((H, W, 1), dtype=np.uint8)
+            alpha[first_mask] = 255
+            rgba = np.dstack((img, alpha)).astype(np.uint8)
+
+            # Crop to mask bounds to minimize image size
+            y_indices, x_indices = np.where(first_mask)
+            y_min, y_max = y_indices.min(), y_indices.max()
+            x_min, x_max = x_indices.min(), x_indices.max()
+
+            # Crop the RGBA image
+            cropped_rgba = rgba[y_min:y_max+1, x_min:x_max+1]
+
+            # Set extracted foreground for mask mover
+            mask_mover.set_extracted_fg(cropped_rgba)
+
+            return annotated_frame, cropped_rgba, gr.update(visible=True), gr.update(visible=True)
+        return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)
+
 
 block = gr.Blocks().queue()
 with block:
@@ -958,16 +1073,15 @@ with block:
         input_fg = gr.Image(type="numpy", label="Image", height=480)
         with gr.Row():
             with gr.Group():
-                [... removed lines are not legible in the rendered diff ...]
+                find_objects_button = gr.Button(value="(Option 1) Segment Object from text")
+                text_prompt = gr.Textbox(
+                    label="Text Prompt",
+                    placeholder="Enter object classes separated by periods (e.g. 'car . person .'), leave empty to get all objects",
+                    value=""
+                )
                 extract_button = gr.Button(value="Remove Background")
         with gr.Row():
-            [... removed line is not legible in the rendered diff ...]
+            extracted_objects = gr.Image(type="numpy", label="Extracted Foreground", height=480)
             extracted_fg = gr.Image(type="numpy", label="Extracted Foreground", height=480)
 
     # output_bg = gr.Image(type="numpy", label="Preprocessed Foreground", height=480)
@@ -1028,11 +1142,11 @@ with block:
     relight_button.click(fn=process_relight, inputs=ips, outputs=[extracted_fg, result_gallery])
     example_quick_prompts.click(lambda x, y: ', '.join(y.split(', ')[:2] + [x[0]]), inputs=[example_quick_prompts, prompt], outputs=prompt, show_progress=False, queue=False)
     example_quick_subjects.click(lambda x: x[0], inputs=example_quick_subjects, outputs=prompt, show_progress=False, queue=False)
-    [... removed lines are not legible in the rendered diff ...]
+    find_objects_button.click(
+        fn=process_image,
+        inputs=[input_fg, text_prompt],
+        outputs=[extracted_objects, extracted_fg]
+    )
     extract_button.click(
         fn=extract_foreground,
         inputs=[input_fg],
@@ -1169,11 +1283,11 @@ with block:
         outputs=[extracted_fg, x_slider, y_slider]
     )
 
-    [... removed lines are not legible in the rendered diff ...]
+    find_objects_button.click(
+        fn=process_image,
+        inputs=[input_image, text_prompt],
+        outputs=[extracted_objects, extracted_fg, x_slider, y_slider]
+    )
 
     get_depth_button.click(
         fn=get_depth,
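
Note on the new process_image path: after DINO-X returns boxes and RLE masks, the added code converts the first detection's boolean mask into a tightly cropped, transparent RGBA cutout and hands it to the relighting UI. The standalone sketch below isolates just that masking-and-cropping step using only NumPy; the function name mask_to_rgba_crop and the dummy image/mask are illustrative assumptions, not part of the Space's code, which additionally registers the crop via mask_mover.set_extracted_fg(...).

import numpy as np

def mask_to_rgba_crop(image: np.ndarray, mask: np.ndarray) -> np.ndarray:
    """Turn an HxWx3 uint8 image and an HxW boolean mask into a tightly
    cropped RGBA cutout, mirroring the crop performed in process_image."""
    h, w, _ = image.shape
    alpha = np.zeros((h, w, 1), dtype=np.uint8)
    alpha[mask] = 255                        # opaque only where the object is
    rgba = np.dstack((image, alpha)).astype(np.uint8)

    ys, xs = np.where(mask)                  # bounding box of the mask
    y0, y1 = ys.min(), ys.max()
    x0, x1 = xs.min(), xs.max()
    return rgba[y0:y1 + 1, x0:x1 + 1]        # crop to the mask bounds

# Hypothetical usage with a dummy image and a square mask:
img = np.zeros((64, 64, 3), dtype=np.uint8)
m = np.zeros((64, 64), dtype=bool)
m[16:48, 16:48] = True
print(mask_to_rgba_crop(img, m).shape)       # (32, 32, 4)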