LuJingyi-John committed
Commit 6678b47
1 Parent(s): 75fac02

Add Inpaint4Drag application with all components

Files changed (6)
  1. .gitignore +28 -0
  2. app.py +201 -0
  3. requirements.txt +21 -0
  4. utils/drag.py +297 -0
  5. utils/refine_mask.py +168 -0
  6. utils/ui_utils.py +271 -0
.gitignore ADDED
@@ -0,0 +1,28 @@
output/
checkpoints/
drag_data/
webpage/

play.py

__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
app.py ADDED
@@ -0,0 +1,201 @@
import gradio as gr
from utils.ui_utils import *

CANVAS_SIZE = 400
DEFAULT_GEN_SIZE = 512

def create_interface():
    with gr.Blocks() as app:
        # State variables
        state = {
            'canvas_size': gr.Number(value=CANVAS_SIZE, visible=False, precision=0),
            'gen_size': gr.Number(value=DEFAULT_GEN_SIZE, visible=False, precision=0),
            'points_list': gr.State(value=[]),
            'inpaint_mask': gr.State(value=None)
        }

        with gr.Tab(label='Inpaint4Drag'):
            with gr.Row():
                # Draw Region Column
                with gr.Column():
                    gr.Markdown("""<p style="text-align: center; font-size: 20px">1. Draw Regions</p>""")
                    canvas = gr.Image(type="numpy", tool="sketch", label=" ", height=CANVAS_SIZE, width=CANVAS_SIZE)
                    with gr.Row():
                        fit_btn = gr.Button("Resize Image")
                        if_sam_box = gr.Checkbox(label='Refine mask (SAM)')

                # Control Points Column
                with gr.Column():
                    gr.Markdown("""<p style="text-align: center; font-size: 20px">2. Control Points</p>""")
                    input_img = gr.Image(type="numpy", label=" ", height=CANVAS_SIZE, width=CANVAS_SIZE, interactive=True)
                    with gr.Row():
                        undo_btn = gr.Button("Undo Point")
                        clear_btn = gr.Button("Clear Points")

                # Results Column
                with gr.Column():
                    gr.Markdown("""<p style="text-align: center; font-size: 20px">Results</p>""")
                    output_img = gr.Image(type="numpy", label=" ", height=CANVAS_SIZE, width=CANVAS_SIZE, interactive=False)
                    with gr.Row():
                        run_btn = gr.Button("Inpaint")
                        reset_btn = gr.Button("Reset All")

            # Output Settings
            with gr.Row():  # "Generation Parameters" (gr.Row takes no positional title in Gradio 3.x)
                sam_ks = gr.Slider(minimum=11, maximum=51, value=21, step=2, label='How much to refine mask with SAM', interactive=True)
                inpaint_ks = gr.Slider(minimum=0, maximum=25, value=5, step=1, label='How much to expand inpainting mask', interactive=True)
                output_path = gr.Textbox(value='output/app', label="Output path")

        setup_events(
            components={
                'canvas': canvas,
                'input_img': input_img,
                'output_img': output_img,
                'output_path': output_path,
                'if_sam_box': if_sam_box,
                'sam_ks': sam_ks,
                'inpaint_ks': inpaint_ks,
            },
            state=state,
            buttons={
                'fit': fit_btn,
                'undo': undo_btn,
                'clear': clear_btn,
                'run': run_btn,
                'reset': reset_btn
            }
        )

    return app

def setup_events(components, state, buttons):
    # Reset and clear events
    def setup_reset_events():
        buttons['reset'].click(
            clear_all,
            [state['canvas_size']],
            [components['canvas'], components['input_img'], components['output_img'],
             state['points_list'], components['sam_ks'], components['inpaint_ks'], components['output_path'], state['inpaint_mask']]
        )

        components['canvas'].clear(
            clear_all,
            [state['canvas_size']],
            [components['canvas'], components['input_img'], components['output_img'],
             state['points_list'], components['sam_ks'], components['inpaint_ks'], components['output_path'], state['inpaint_mask']]
        )

    # Image manipulation events
    def setup_image_events():
        buttons['fit'].click(
            clear_point,
            [components['canvas'], state['points_list'], components['sam_ks'], components['if_sam_box'], components['output_path']],
            [components['input_img']]
        ).then(
            resize,
            [components['canvas'], state['gen_size'], state['canvas_size']],
            [components['canvas'], components['input_img'], components['output_img']]
        )

    # Canvas interaction events
    def setup_canvas_events():
        components['canvas'].edit(
            visualize_user_drag,
            [components['canvas'], state['points_list'], components['sam_ks'], components['if_sam_box'], components['output_path']],
            [components['input_img']]
        ).then(
            preview_out_image,
            [components['canvas'], state['points_list'], components['sam_ks'], components['inpaint_ks'], components['if_sam_box'], components['output_path']],
            [components['output_img'], state['inpaint_mask']]
        )

        components['if_sam_box'].change(
            visualize_user_drag,
            [components['canvas'], state['points_list'], components['sam_ks'], components['if_sam_box']],
            [components['input_img']]
        ).then(
            preview_out_image,
            [components['canvas'], state['points_list'], components['sam_ks'], components['inpaint_ks'], components['if_sam_box'], components['output_path']],
            [components['output_img'], state['inpaint_mask']]
        )

        components['sam_ks'].change(
            visualize_user_drag,
            [components['canvas'], state['points_list'], components['sam_ks'], components['if_sam_box']],
            [components['input_img']]
        ).then(
            preview_out_image,
            [components['canvas'], state['points_list'], components['sam_ks'], components['inpaint_ks'], components['if_sam_box'], components['output_path']],
            [components['output_img'], state['inpaint_mask']]
        )

        components['inpaint_ks'].change(
            visualize_user_drag,
            [components['canvas'], state['points_list'], components['sam_ks'], components['if_sam_box']],
            [components['input_img']]
        ).then(
            preview_out_image,
            [components['canvas'], state['points_list'], components['sam_ks'], components['inpaint_ks'], components['if_sam_box'], components['output_path']],
            [components['output_img'], state['inpaint_mask']]
        )

    # Input image events
    def setup_input_events():
        components['input_img'].select(
            add_point,
            [components['canvas'], state['points_list'], components['sam_ks'], components['if_sam_box'], components['output_path']],
            [components['input_img']]
        ).then(
            preview_out_image,
            [components['canvas'], state['points_list'], components['sam_ks'], components['inpaint_ks'], components['if_sam_box'], components['output_path']],
            [components['output_img'], state['inpaint_mask']]
        )

    # Point manipulation events
    def setup_point_events():
        buttons['undo'].click(
            undo_point,
            [components['canvas'], state['points_list'], components['sam_ks'], components['if_sam_box'], components['output_path']],
            [components['input_img']]
        ).then(
            preview_out_image,
            [components['canvas'], state['points_list'], components['sam_ks'], components['inpaint_ks'], components['if_sam_box'], components['output_path']],
            [components['output_img'], state['inpaint_mask']]
        )

        buttons['clear'].click(
            clear_point,
            [components['canvas'], state['points_list'], components['sam_ks'], components['if_sam_box'], components['output_path']],
            [components['input_img']]
        ).then(
            preview_out_image,
            [components['canvas'], state['points_list'], components['sam_ks'], components['inpaint_ks'], components['if_sam_box'], components['output_path']],
            [components['output_img'], state['inpaint_mask']]
        )

    # Processing events
    def setup_processing_events():
        buttons['run'].click(
            preview_out_image,
            [components['canvas'], state['points_list'], components['sam_ks'], components['inpaint_ks'], components['if_sam_box'], components['output_path']],
            [components['output_img'], state['inpaint_mask']]
        ).then(
            inpaint,
            [components['output_img'], state['inpaint_mask']],
            [components['output_img']]
        )

    # Setup all events
    setup_reset_events()
    setup_image_events()
    setup_canvas_events()
    setup_input_events()
    setup_point_events()
    setup_processing_events()

def main():
    app = create_interface()
    app.queue().launch(share=True, debug=True)

if __name__ == '__main__':
    main()
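Every interaction in setup_events is wired as a `.click(...).then(...)` (or `.edit(...).then(...)`) chain: the first handler redraws the drag visualization, then the second refreshes the warp preview once the first completes. A minimal, self-contained sketch of that chaining pattern (hypothetical two-step handlers, not part of this commit; same Gradio 3.x API):

import gradio as gr

def step_one(count):
    # Stands in for visualize_user_drag: runs first on each click
    return count + 1

def step_two(count):
    # Stands in for preview_out_image: fires only after step_one completes
    return f"preview #{count}"

with gr.Blocks() as demo:
    clicks = gr.Number(value=0, visible=False, precision=0)  # hidden state, like app.py's state dict
    label = gr.Textbox(label="Preview")
    btn = gr.Button("Update")
    # Each event returns a dependency object whose .then() queues a follow-up handler
    btn.click(step_one, [clicks], [clicks]).then(step_two, [clicks], [label])

if __name__ == "__main__":
    demo.launch()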
requirements.txt ADDED
@@ -0,0 +1,21 @@
# Core ML Libraries
torch
torchvision
transformers
diffusers
accelerate
peft
xformers

# UI and Image Processing
gradio==3.47.1
opencv-python==4.8.0.76
Pillow
numpy

# Evaluation (Optional)
lpips
gdown

# EfficientViT-SAM (Optional)
git+https://github.com/mit-han-lab/efficientvit.git
utils/drag.py ADDED
@@ -0,0 +1,297 @@
import numpy as np
import cv2
import torch
from typing import Union

def contour_to_points_and_mask(contour: np.ndarray, image_shape: tuple) -> tuple[np.ndarray, np.ndarray]:
    """Convert a contour to a set of points and binary mask.

    This function takes a contour and creates both a binary mask and a list of points
    that lie within the contour. The points are represented in (x, y) coordinates.

    Args:
        contour (np.ndarray): Input contour of shape (N, 2) or (N, 1, 2) where N is
            the number of points. Each point should be in (x, y) format.
        image_shape (tuple): Shape of the output mask as (height, width).

    Returns:
        tuple:
            - np.ndarray: Array of points in (x, y) format with shape (M, 2),
              where M is the number of points inside the contour.
              Returns empty array of shape (0, 2) if contour is empty.
            - np.ndarray: Binary mask of shape image_shape where pixels inside
              the contour are 255 and outside are 0.
    """
    if len(contour) == 0:
        return np.zeros((0, 2), dtype=np.int32), np.zeros(image_shape, dtype=np.uint8)

    # Create empty mask and fill the contour in the mask
    mask = np.zeros(image_shape, dtype=np.uint8)
    cv2.drawContours(mask, [contour.reshape(-1, 1, 2)], -1, 255, cv2.FILLED)

    # Get points inside contour (y, x) and convert to (x, y)
    points = np.column_stack(np.where(mask)).astype(np.int32)[:, [1, 0]]

    # Return empty array if no points found
    if len(points) == 0:
        points = np.zeros((0, 2), dtype=np.int32)

    return points, mask

def find_control_points(
    region_points: torch.Tensor,
    source_control_points: torch.Tensor,
    target_control_points: torch.Tensor,
    distance_threshold: float = 1e-6
) -> tuple[torch.Tensor, torch.Tensor]:
    """Find control points that match points within a region.

    This function identifies which control points lie within or very close to
    the specified region points. It matches source control points to region points
    and returns both source and corresponding target control points that satisfy
    the distance threshold criterion.

    Args:
        region_points (torch.Tensor): Points defining a region, shape (N, 2).
            Each point is in (x, y) format.
        source_control_points (torch.Tensor): Source control points, shape (M, 2).
            Each point is in (x, y) format.
        target_control_points (torch.Tensor): Target control points, shape (M, 2).
            Must have same first dimension as source_control_points.
        distance_threshold (float, optional): Maximum distance for a point to be
            considered matching. Defaults to 1e-6.

    Returns:
        tuple[torch.Tensor, torch.Tensor]:
            - Matched source control points, shape (K, 2) where K ≤ M
            - Corresponding target control points, shape (K, 2)
            If no matches found or inputs empty, returns empty tensors of shape (0, 2)
    """
    # Handle empty input cases
    if len(region_points) == 0 or len(source_control_points) == 0:
        return (
            torch.zeros((0, 2), device=source_control_points.device),
            torch.zeros((0, 2), device=target_control_points.device)
        )

    # Calculate pairwise distances between source control points and region points
    distances = torch.cdist(source_control_points, region_points)

    # Find points that are within threshold distance of any region point
    min_distances = distances.min(dim=1)[0]
    matching_indices = min_distances < distance_threshold

    # Return matched pairs of control points
    return source_control_points[matching_indices], target_control_points[matching_indices]

def interpolate_points_with_weighted_directions(
    points: torch.Tensor,
    reference_points: torch.Tensor,
    direction_vectors: torch.Tensor,
    max_reference_points: int = 100,
    num_nearest_neighbors: int = 4,
    eps: float = 1e-6
) -> torch.Tensor:
    """Interpolate points based on weighted directions from nearest reference points.

    This function moves each point by a weighted combination of direction vectors.
    The weights are determined by the inverse distances to the nearest reference points.
    If there are too many reference points, they are subsampled for efficiency.

    Args:
        points (torch.Tensor): Points to interpolate, shape (N, 2) in (x, y) format
        reference_points (torch.Tensor): Reference point locations, shape (M, 2)
        direction_vectors (torch.Tensor): Direction vectors for each reference point,
            shape (M, 2), must match reference_points first dimension
        max_reference_points (int, optional): Maximum number of reference points to use.
            If exceeded, points are subsampled. Defaults to 100.
        num_nearest_neighbors (int, optional): Number of nearest neighbors to consider
            for interpolation. Defaults to 4.
        eps (float, optional): Small value to avoid division by zero. Defaults to 1e-6.

    Returns:
        torch.Tensor: Interpolated points with shape (N, 2). If input points or
            references are empty, returns the input points unchanged.
    """
    # Handle empty input cases
    if len(points) == 0 or len(reference_points) == 0:
        return points

    # Handle single reference point case
    if len(reference_points) == 1:
        return points + direction_vectors

    # Subsample reference points if too many
    if len(reference_points) > max_reference_points:
        indices = torch.linspace(0, len(reference_points)-1, max_reference_points).long()
        reference_points = reference_points[indices]
        direction_vectors = direction_vectors[indices]

    # Calculate distances to all reference points
    distances = torch.cdist(points, reference_points)

    # Find k nearest neighbors (k = min(num_nearest_neighbors, num_references))
    k = min(num_nearest_neighbors, len(reference_points))
    topk_distances, neighbor_indices = torch.topk(
        distances,
        k=k,
        dim=1,
        largest=False
    )

    # Calculate weights based on inverse distances
    weights = 1.0 / (topk_distances + eps)
    weights = weights / weights.sum(dim=1, keepdim=True)

    # Get directions for nearest neighbors and compute weighted average
    neighbor_directions = direction_vectors[neighbor_indices]
    weighted_directions = (weights.unsqueeze(-1) * neighbor_directions).sum(dim=1)

    # Apply weighted directions and round to nearest integer
    interpolated_points = (points + weighted_directions).round().float()

    return interpolated_points

def get_points_within_image_bounds(
    points: torch.Tensor,
    image_shape: tuple[int, int]
) -> torch.Tensor:
    """Create a boolean mask for points that lie within image boundaries.

    Identifies which points from the input tensor fall within valid image coordinates.
    Points are assumed to be in (x, y) format, while image_shape is in (height, width) format.

    Args:
        points (torch.Tensor): Points to check, shape (N, 2) in (x, y) format.
            x coordinates correspond to width/columns
            y coordinates correspond to height/rows
        image_shape (tuple[int, int]): Image dimensions as (height, width).

    Returns:
        torch.Tensor: Boolean mask of shape (N,) where True indicates the point
            is within bounds. Returns empty tensor of shape (0,) if input is empty.
    """
    # Handle empty input case
    if len(points) == 0:
        return torch.zeros(0, dtype=torch.bool, device=points.device)

    # Unpack image dimensions
    height, width = image_shape

    # Check both x and y coordinates are within bounds
    x_in_bounds = (points[:, 0] >= 0) & (points[:, 0] < width)
    y_in_bounds = (points[:, 1] >= 0) & (points[:, 1] < height)

    # Combine conditions
    valid_points_mask = x_in_bounds & y_in_bounds

    return valid_points_mask

def bi_warp(
    region_mask: np.ndarray,
    control_points: Union[np.ndarray, torch.Tensor],
    kernel_size: int = 5
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Generate corresponding source/target points and inpainting mask for masked regions.

    Args:
        region_mask: Binary mask defining regions of interest (2D array with 0s and 1s)
        control_points: Alternating source and target control points. Shape (N*2, 2)
        kernel_size: Controls dilation kernel size. Must be odd number or 0.
            Contour thickness will be (kernel_size-1)*2 (default: 5)
            Set to 0 for no contour drawing and no dilation.

    Returns:
        tuple containing:
            - Source points (M, 2)
            - Target points (M, 2)
            - Inpainting mask combined with target contour mask
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    image_shape = region_mask.shape

    # Ensure kernel_size is odd or 0
    kernel_size = max(0, kernel_size)
    if kernel_size > 0 and kernel_size % 2 == 0:
        kernel_size += 1

    # 1. Initialize tensors and masks
    control_points = torch.tensor(control_points, dtype=torch.float32, device=device) if not isinstance(control_points, torch.Tensor) else control_points
    source_control_points = control_points[0:-1:2]
    target_control_points = control_points[1::2]

    combined_source_mask = np.zeros(image_shape, dtype=np.uint8)
    combined_target_mask = np.zeros(image_shape, dtype=np.uint8)
    region_mask_binary = np.where(region_mask > 0, 1, 0).astype(np.uint8)
    contour_mask = np.zeros(image_shape, dtype=np.uint8)

    # 2. Process regions
    contours = cv2.findContours(region_mask_binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
    all_source_points = []
    all_target_points = []

    for contour in contours:
        if len(contour) == 0:
            continue

        # 3. Get source region points and mask
        source_contour = torch.from_numpy(contour[:, 0, :]).float().to(device)
        source_region_points, source_mask = contour_to_points_and_mask(contour[:, 0, :], image_shape)
        source_mask = (source_mask > 0).astype(np.uint8)

        if len(source_region_points) == 0:
            continue

        source_region_points = torch.from_numpy(source_region_points).float().to(device)

        # 4. Transform points
        source, target = find_control_points(source_region_points, source_control_points, target_control_points)
        if len(source) == 0:
            continue

        directions = target - source
        target_contour = interpolate_points_with_weighted_directions(source_contour, source, directions)
        interpolated_target = interpolate_points_with_weighted_directions(source_region_points, source, directions)

        # 5. Get target region points and mask
        target_region_points, target_mask = contour_to_points_and_mask(target_contour.cpu().int().numpy(), image_shape)
        target_mask = (target_mask > 0).astype(np.uint8)

        if len(target_region_points) == 0:
            continue

        # Draw target contour
        target_contour_np = target_contour.cpu().int().numpy()
        if kernel_size > 0:
            cv2.drawContours(contour_mask, [target_contour_np], -1, 1, kernel_size)

        target_region = torch.from_numpy(target_region_points).float().to(device)

        # 6. Apply reverse transformation
        back_directions = source_region_points - interpolated_target
        interpolated_source = interpolate_points_with_weighted_directions(target_region, interpolated_target, back_directions)

        # 7. Filter valid points
        valid_mask = get_points_within_image_bounds(interpolated_source, image_shape)
        if valid_mask.any():
            all_source_points.append(interpolated_source[valid_mask])
            all_target_points.append(target_region[valid_mask])
            combined_source_mask = np.logical_or(combined_source_mask, source_mask).astype(np.uint8)
            combined_target_mask = np.logical_or(combined_target_mask, target_mask).astype(np.uint8)

    # 8. Handle empty case
    if not all_source_points:
        return np.zeros((0, 2), dtype=np.int32), np.zeros((0, 2), dtype=np.int32), np.zeros(image_shape, dtype=np.uint8)

    # 9. Finalize outputs
    final_source = torch.cat(all_source_points).cpu().numpy().astype(np.int32)
    final_target = torch.cat(all_target_points).cpu().numpy().astype(np.int32)

    # Create and combine masks
    inpaint_mask = np.logical_and(combined_source_mask, np.logical_not(combined_target_mask)).astype(np.uint8)
    if kernel_size > 0:
        kernel = np.ones((kernel_size, kernel_size), dtype=np.uint8)
        inpaint_mask = cv2.dilate(inpaint_mask, kernel)
    final_mask = np.logical_or(inpaint_mask, contour_mask).astype(np.uint8)

    return final_source, final_target, final_mask
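bi_warp is the geometric core: for each masked region it finds the drags that touch it, warps the region contour and interior forward, then back-maps target pixels to their sources. A minimal sketch of calling it directly, with a synthetic mask and a single 50-pixel rightward drag (hypothetical inputs, not part of this commit):

import numpy as np
from utils.drag import bi_warp

# Synthetic 512x512 region mask containing one square region
mask = np.zeros((512, 512), dtype=np.uint8)
mask[100:200, 100:200] = 1

# Alternating (source, target) control points in (x, y): drag 50 px right
points = [[150, 150], [200, 150]]

src, dst, inpaint_mask = bi_warp(mask, points, kernel_size=5)
print(src.shape, dst.shape)   # matched (M, 2) pixel correspondences
print(inpaint_mask.sum())     # pixels uncovered by the move, flagged for inpainting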
utils/refine_mask.py ADDED
@@ -0,0 +1,168 @@
import os
import urllib.request
from typing import Optional

import cv2
import numpy as np
import torch
import torch.nn as nn


def download_model(checkpoint_path: str, model_name: str = "efficientvit_sam_l0.pt") -> str:
    """
    Download the model checkpoint if not found locally.

    Args:
        checkpoint_path: Local path where model should be saved
        model_name: Name of the model file to download

    Returns:
        str: Path to the downloaded checkpoint
    """
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    base_url = "https://huggingface.co/mit-han-lab/efficientvit-sam/resolve/main"
    model_url = f"{base_url}/{model_name}"

    try:
        print(f"Downloading model from {model_url}...")
        urllib.request.urlretrieve(model_url, checkpoint_path)
        print(f"Model successfully downloaded to {checkpoint_path}")
        return checkpoint_path
    except Exception as e:
        raise RuntimeError(f"Failed to download model: {str(e)}")


class SamMaskRefiner(nn.Module):
    CHECKPOINT_DIR = 'checkpoints'
    MODEL_CONFIGS = {
        'l0': 'efficientvit_sam_l0.pt',
        'l1': 'efficientvit_sam_l1.pt',
        'l2': 'efficientvit_sam_l2.pt'
    }

    def __init__(self, model_name: str = 'l0') -> None:
        """
        Initialize SAM predictor with specified model version.

        Args:
            model_name: Model version to use ('l0', 'l1', or 'l2'). Defaults to 'l0'.

        Raises:
            ValueError: If invalid model_name is provided
            RuntimeError: If model loading fails after download attempt
        """
        super().__init__()

        if model_name not in self.MODEL_CONFIGS:
            raise ValueError(f"Invalid model_name. Choose from: {list(self.MODEL_CONFIGS.keys())}")

        model_filename = self.MODEL_CONFIGS[model_name]
        checkpoint_path = os.path.join(self.CHECKPOINT_DIR, model_filename)

        try:
            from efficientvit.models.efficientvit.sam import EfficientViTSamPredictor
            from efficientvit.sam_model_zoo import create_efficientvit_sam_model
        except ImportError:
            raise ImportError(
                "Failed to import EfficientViT modules. Please ensure the package is installed:\n"
                "pip install git+https://github.com/mit-han-lab/efficientvit.git"
            )

        if not os.path.exists(checkpoint_path):
            print(f"Checkpoint not found at {checkpoint_path}. Attempting to download...")
            checkpoint_path = download_model(checkpoint_path, model_filename)

        try:
            model_type = f'efficientvit-sam-{model_name}'
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            self.model = create_efficientvit_sam_model(model_type, True, checkpoint_path).eval()
            self.model = self.model.requires_grad_(False).to(device)
            self.predictor = EfficientViTSamPredictor(self.model)
            print(f"\033[92mEfficientViT-SAM model loaded from: {checkpoint_path}\033[0m")
        except Exception as e:
            raise RuntimeError(f"Failed to load model: {str(e)}")

    def sample_points_from_mask(self, mask: np.ndarray, max_points: int = 128) -> np.ndarray:
        """
        Sample points uniformly from masked regions.

        Args:
            mask: Binary mask array of shape (H, W) with 0-1 values.
            max_points: Maximum number of points to sample.

        Returns:
            np.ndarray: Array of shape (N, 2) containing [x,y] coordinates.
        """
        y_indices, x_indices = np.where(mask > 0.5)
        total_points = len(y_indices)

        if total_points <= max_points:
            return np.stack([x_indices, y_indices], axis=1)

        y_min, y_max = y_indices.min(), y_indices.max()
        x_min, x_max = x_indices.min(), x_indices.max()

        aspect_ratio = (x_max - x_min) / max(y_max - y_min, 1)
        ny = int(np.sqrt(max_points / aspect_ratio))
        nx = int(ny * aspect_ratio)

        x_bins = np.linspace(x_min, x_max + 1, nx + 1, dtype=np.int32)
        y_bins = np.linspace(y_min, y_max + 1, ny + 1, dtype=np.int32)

        x_dig = np.digitize(x_indices, x_bins) - 1
        y_dig = np.digitize(y_indices, y_bins) - 1
        bin_indices = y_dig * nx + x_dig
        unique_bins = np.unique(bin_indices)

        points = []
        for idx in unique_bins:
            bin_y = idx // nx
            bin_x = idx % nx
            in_bin = (y_dig == bin_y) & (x_dig == bin_x)  # renamed from `mask` to avoid shadowing the argument

            if np.any(in_bin):
                px = int(np.mean(x_indices[in_bin]))
                py = int(np.mean(y_indices[in_bin]))
                points.append([px, py])

        points = np.array(points)

        if len(points) > max_points:
            indices = np.linspace(0, len(points) - 1, max_points, dtype=int)
            points = points[indices]

        return points

    def refine_mask(self, image: np.ndarray, input_mask: np.ndarray, kernel_size: int = 21) -> np.ndarray:
        """
        Refine an input mask using the SAM (Segment Anything Model) model.

        Args:
            image: RGB image, shape (H, W, 3), values in [0, 255]
            input_mask: Binary mask, shape (H, W), values in {0, 1}
            kernel_size: Size of morphological kernel (default: 21)

        Returns:
            Refined binary mask, shape (H, W), values in {0, 1}
        """
        points = self.sample_points_from_mask(input_mask, max_points=128)
        if len(points) == 0:
            return input_mask

        self.predictor.set_image(image)
        masks_pred, _, _ = self.predictor.predict(
            point_coords=points,
            point_labels=np.ones(len(points)),
            multimask_output=False
        )
        sam_mask = masks_pred[0]

        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        expanded_input = cv2.dilate(input_mask.astype(np.uint8), kernel)
        preserved_input = cv2.erode(input_mask.astype(np.uint8), kernel)

        sam_mask = np.logical_and(expanded_input, sam_mask).astype(np.uint8)
        sam_mask = np.logical_or(preserved_input, sam_mask).astype(np.uint8)

        return sam_mask
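A minimal standalone sketch of using the refiner, assuming the optional EfficientViT dependency from requirements.txt is installed (synthetic inputs, not part of this commit; the checkpoint is fetched automatically on first use):

import numpy as np
from utils.refine_mask import SamMaskRefiner

# Downloads checkpoints/efficientvit_sam_l0.pt if it is missing
refiner = SamMaskRefiner(model_name='l0')

image = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)  # placeholder RGB image
rough = np.zeros((512, 512), dtype=np.uint8)                  # rough binary {0, 1} mask
rough[100:200, 100:200] = 1

# Output stays binary; SAM's prediction is clipped to the dilated input
# and the eroded input is always preserved
refined = refiner.refine_mask(image, rough, kernel_size=21)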
utils/ui_utils.py ADDED
@@ -0,0 +1,271 @@
import os
import pickle
from time import perf_counter

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from diffusers import AutoPipelineForInpainting, AutoencoderTiny, LCMScheduler

from utils.drag import bi_warp
from utils.refine_mask import SamMaskRefiner


__all__ = [
    'clear_all', 'resize',
    'visualize_user_drag', 'preview_out_image', 'inpaint',
    'add_point', 'undo_point', 'clear_point',
]

# UI functions
def clear_all(length):
    """Reset UI by clearing all input images and parameters."""
    return (gr.Image(value=None, height=length, width=length),) * 3 + ([], 21, 2, "output/app", None)

def resize(canvas, gen_length, canvas_length):
    """Resize canvas while maintaining aspect ratio."""
    if not canvas:
        return (gr.Image(value=None, width=canvas_length, height=canvas_length),) * 3

    image = process_canvas(canvas)[0]
    aspect_ratio = image.shape[1] / image.shape[0]
    is_landscape = aspect_ratio >= 1

    new_dims = (
        (gen_length, round(gen_length / aspect_ratio / 8) * 8) if is_landscape
        else (round(gen_length * aspect_ratio / 8) * 8, gen_length)
    )
    canvas_dims = (
        (canvas_length, round(canvas_length / aspect_ratio)) if is_landscape
        else (round(canvas_length * aspect_ratio), canvas_length)
    )

    return (gr.Image(value=cv2.resize(image, new_dims), width=canvas_dims[0], height=canvas_dims[1]),) * 3

def process_canvas(canvas):
    """Extracts the image (H, W, 3) and the mask (H, W) from a Gradio canvas object."""
    image = canvas["image"].copy()
    mask = np.uint8(canvas["mask"][:, :, 0] > 0).copy()
    return image, mask

# Point manipulation functions
def add_point(canvas, points, sam_ks, if_sam, output_path, evt: gr.SelectData):
    """Add selected point to points list and update image."""
    if canvas is None:
        return None
    points.append(evt.index)
    return visualize_user_drag(canvas, points, sam_ks, if_sam, output_path)

def undo_point(canvas, points, sam_ks, if_sam, output_path):
    """Remove last point and update image."""
    if canvas is None:
        return None
    if len(points) > 0:
        points.pop()
    return visualize_user_drag(canvas, points, sam_ks, if_sam, output_path)

def clear_point(canvas, points, sam_ks, if_sam, output_path):
    """Clear all points and update image."""
    if canvas is None:
        return None
    points.clear()
    return visualize_user_drag(canvas, points, sam_ks, if_sam, output_path)

# Visualization tools
def refine_mask(image, mask, kernel_size):
    """Refine mask using SAM model if available."""
    global sam_refiner
    try:
        if 'sam_refiner' not in globals():
            sam_refiner = SamMaskRefiner()
        return sam_refiner.refine_mask(image, mask, kernel_size)
    except ImportError:
        gr.Warning("EfficientVit not installed. Please install with: pip install git+https://github.com/mit-han-lab/efficientvit.git")
        return mask
    except Exception as e:
        gr.Warning(f"Error refining mask: {str(e)}")
        return mask

def visualize_user_drag(canvas, points, sam_ks, if_sam=False, output_path=None):
    """Visualize control points and motion vectors on the input image.

    Args:
        canvas (dict): Gradio canvas containing image and mask
        points (list): List of (x,y) coordinate pairs for control points
        sam_ks (int): Kernel size for SAM mask refinement
        if_sam (bool): Whether to use SAM refinement on mask
    """
    if canvas is None:
        return None

    image, mask = process_canvas(canvas)
    mask = refine_mask(image, mask, sam_ks) if if_sam and mask.sum() > 0 else mask

    # Apply colored mask overlay
    result = image.copy()
    result[mask == 1] = [255, 0, 0]  # Red color
    image = cv2.addWeighted(result, 0.3, image, 0.7, 0)

    # Draw mask outline
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(image, contours, -1, (255, 255, 255), 2)

    # Draw control points and motion vectors
    for idx, point in enumerate(points, 1):
        if idx % 2 == 0:
            cv2.circle(image, tuple(point), 10, (0, 0, 255), -1)  # End point
            cv2.arrowedLine(image, tuple(prev_point), tuple(point), (255, 255, 255), 4, tipLength=0.5)
        else:
            cv2.circle(image, tuple(point), 10, (255, 0, 0), -1)  # Start point
            prev_point = point

    if output_path:
        os.makedirs(output_path, exist_ok=True)
        Image.fromarray(image).save(os.path.join(output_path, 'user_drag_i4p.png'))
    return image

def preview_out_image(canvas, points, sam_ks, inpaint_ks, if_sam=False, output_path=None):
    """Preview warped image result and generate inpainting mask.

    Args:
        canvas (dict): Gradio canvas containing the input image and mask
        points (list): List of (x,y) coordinate pairs defining source and target positions for warping
        sam_ks (int): Kernel size parameter for SAM mask refinement
        inpaint_ks (int): Kernel size parameter for inpainting mask generation
        if_sam (bool): Whether to use SAM model for mask refinement
        output_path (str, optional): Directory path to save original image and metadata

    Returns:
        tuple:
            - ndarray: Warped image with grid pattern overlay on regions needing inpainting
            - ndarray: Binary mask (255 for inpainting regions, 0 elsewhere)
            Returns (None, None) if the canvas is empty, and (image, None) if fewer
            than 2 control points are provided.
    """
    if canvas is None:
        return None, None

    image, mask = process_canvas(canvas)
    if len(points) < 2:
        return image, None

    # Ensure H, W are divisible by 8 and the longer edge equals 512
    shapes_valid = all(s % 8 == 0 for s in mask.shape + image.shape[:2])
    size_valid = all(max(x.shape[:2] if len(x.shape) > 2 else x.shape) == 512 for x in (image, mask))
    if not (shapes_valid and size_valid):
        gr.Warning('Click Resize Image Button first.')

    mask = refine_mask(image, mask, sam_ks) if if_sam and mask.sum() > 0 else mask

    if output_path:
        os.makedirs(output_path, exist_ok=True)
        Image.fromarray(image).save(os.path.join(output_path, 'original_image.png'))
        metadata = {'mask': mask, 'points': points}
        with open(os.path.join(output_path, 'meta_data_i4p.pkl'), 'wb') as f:
            pickle.dump(metadata, f)

    handle_pts, target_pts, inpaint_mask = bi_warp(mask, points, inpaint_ks)
    image[target_pts[:, 1], target_pts[:, 0]] = image[handle_pts[:, 1], handle_pts[:, 0]]

    # Add grid pattern to highlight inpainting regions
    background = np.ones_like(mask) * 255
    background[::10] = background[:, ::10] = 0
    image = np.where(inpaint_mask[..., np.newaxis] == 1, background[..., np.newaxis], image)

    if output_path:
        Image.fromarray(image).save(os.path.join(output_path, 'preview_image.png'))

    return image, (inpaint_mask * 255).astype(np.uint8)

# Inpaint tools
def setup_pipeline(device='cuda', model_version='v1-5'):
    """Initialize optimized inpainting pipeline with specified model configuration."""
    MODEL_CONFIGS = {
        'v1-5': ('runwayml/stable-diffusion-inpainting', 'latent-consistency/lcm-lora-sdv1-5', 'madebyollin/taesd'),
        'xl': ('diffusers/stable-diffusion-xl-1.0-inpainting-0.1', 'latent-consistency/lcm-lora-sdxl', 'madebyollin/taesdxl')
    }
    model_id, lora_id, vae_id = MODEL_CONFIGS[model_version]

    pipe = AutoPipelineForInpainting.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16", safety_checker=None)
    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
    pipe.load_lora_weights(lora_id)
    pipe.fuse_lora()
    pipe.vae = AutoencoderTiny.from_pretrained(vae_id, torch_dtype=torch.float16)
    pipe = pipe.to(device)

    # Pre-compute prompt embeddings during setup
    if model_version == 'v1-5':
        pipe.cached_prompt_embeds = pipe.encode_prompt(
            '', device=device, num_images_per_prompt=1,
            do_classifier_free_guidance=False)[0]
    else:
        pipe.cached_prompt_embeds, pipe.cached_pooled_prompt_embeds = pipe.encode_prompt(
            '', device=device, num_images_per_prompt=1,
            do_classifier_free_guidance=False)[0::2]

    return pipe

pipe = setup_pipeline(model_version='v1-5')
pipe.cached_prompt_embeds = pipe.encode_prompt('', 'cuda', 1, False)[0]  # re-caches the embeddings; redundant with the setup step above

def inpaint(image, inpaint_mask):
    """Perform efficient inpainting on masked regions using Stable Diffusion.

    Args:
        image (ndarray): Input RGB image array (warped preview image)
        inpaint_mask (ndarray): Binary mask array where 255 indicates regions to inpaint

    Returns:
        ndarray: Inpainted image with masked regions filled in
    """
    if image is None:
        return None

    if inpaint_mask is None:
        return image

    start = perf_counter()
    pipe_id = 'xl' if 'xl' in pipe.config._name_or_path else 'v1-5'
    inpaint_strength = 0.99 if pipe_id == 'xl' else 1.0

    # Convert inputs to PIL
    image_pil = Image.fromarray(image)
    inpaint_mask_pil = Image.fromarray(inpaint_mask)

    width, height = inpaint_mask_pil.size
    if width % 8 != 0 or height % 8 != 0:
        width, height = round(width / 8) * 8, round(height / 8) * 8
        image_pil = image_pil.resize((width, height))
        image = np.array(image_pil)
        inpaint_mask_pil = inpaint_mask_pil.resize((width, height), Image.NEAREST)
        inpaint_mask = np.array(inpaint_mask_pil)

    # Common pipeline parameters
    common_params = {
        'image': image_pil,
        'mask_image': inpaint_mask_pil,
        'height': height,
        'width': width,
        'guidance_scale': 1.0,
        'num_inference_steps': 8,
        'strength': inpaint_strength,
        'output_type': 'np'
    }

    # Run pipeline
    if pipe_id == 'v1-5':
        inpainted = pipe(
            prompt_embeds=pipe.cached_prompt_embeds,
            **common_params
        ).images[0]
    else:
        inpainted = pipe(
            prompt_embeds=pipe.cached_prompt_embeds,
            pooled_prompt_embeds=pipe.cached_pooled_prompt_embeds,
            **common_params
        ).images[0]

    # Post-process results
    inpaint_mask = (inpaint_mask[..., np.newaxis] / 255).astype(np.uint8)
    return (inpainted * 255).astype(np.uint8) * inpaint_mask + image * (1 - inpaint_mask)
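Together with utils/drag.py, the run button's pipeline reduces to warp-then-inpaint. A minimal headless sketch of that flow (hypothetical inputs, not part of this commit; note that importing utils.ui_utils builds the Stable Diffusion pipeline at module load, so a CUDA device and downloaded weights are assumed):

import numpy as np
from utils.drag import bi_warp
from utils.ui_utils import inpaint  # module import loads the SD inpainting pipe (CUDA assumed)

# Hypothetical 512x512 inputs: an RGB image, a drawn region mask, one drag pair
image = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)
mask = np.zeros((512, 512), dtype=np.uint8)
mask[100:200, 100:200] = 1
points = [[150, 150], [200, 150]]

# 1. Bidirectional warp: copy source pixels to their target positions,
#    exactly as preview_out_image does
src, dst, inpaint_mask = bi_warp(mask, points, kernel_size=5)
image[dst[:, 1], dst[:, 0]] = image[src[:, 1], src[:, 0]]

# 2. Inpaint the regions the warp left uncovered (mask scaled to 0/255)
result = inpaint(image, (inpaint_mask * 255).astype(np.uint8))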