# Page header captured together with this file (not Python code):
#   ketanmore's picture
#   Upload folder using huggingface_hub
#   2720487 verified
#   history blame
#   8.72 kB
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from typing import List, Optional
from PIL import Image
import numpy as np
from surya.detection import batch_detection
from surya.postprocessing.heatmap import keep_largest_boxes, get_and_clean_boxes, get_detected_boxes
from surya.schema import LayoutResult, LayoutBox, TextDetectionResult
from surya.settings import settings
def get_regions_from_detection_result(detection_result: TextDetectionResult, heatmaps: List[np.ndarray], orig_size, id2label, segment_assignment, vertical_line_width=20) -> List[LayoutBox]:
    """Turn per-class layout heatmaps into layout boxes snapped to detected text lines.

    Args:
        detection_result: Text detection output. Its ``vertical_lines`` and
            ``bboxes`` are rescaled to the heatmap size through their own
            ``rescale_bbox`` / ``rescale`` methods. The return values are
            ignored, so the objects are presumably modified in place and are
            NOT scaled back here -- confirm before reusing them afterwards.
        heatmaps: One score map per class, indexed (row, column); index 0 is
            the blank class.
        orig_size: Size of the original image, handed to the rescale methods
            together with the reversed heatmap shape (so presumably
            ``(width, height)`` -- confirm against the caller).
        id2label: Mapping from class index to label string.
        segment_assignment: Array with one class index per heatmap pixel; a
            class only keeps the pixels assigned to it.
        vertical_line_width: Number of pixels added to the right edge of each
            vertical (column) line before the area under it is zeroed out.

    Returns:
        Layout boxes in original-image coordinates. Text lines that were not
        claimed by any layout box are returned as separate "Text" boxes with
        confidence 0.5.
    """
    logits = np.stack(heatmaps, axis=0)  # stacked copy; the input heatmaps are not modified
    processor_size = list(reversed(heatmaps[0].shape))
    heatmap_width = heatmaps[0].shape[1]
    vertical_line_bboxes = list(detection_result.vertical_lines)
    line_bboxes = detection_result.bboxes

    # Scale back to processor size
    for line in vertical_line_bboxes:
        line.rescale_bbox(orig_size, processor_size)

    for line in line_bboxes:
        line.rescale(orig_size, processor_size)

    for column_line in vertical_line_bboxes:
        # Give some width to the vertical lines
        vert_bbox = list(column_line.bbox)
        # Index 2 is used as a column bound in the slice below, so it must be
        # clamped to the heatmap WIDTH (shape[1]). Clamping to the height, as
        # before, could yield an empty slice on landscape heatmaps.
        vert_bbox[2] = min(heatmap_width, vert_bbox[2] + vertical_line_width)

        logits[:, vert_bbox[1]:vert_bbox[3], vert_bbox[0]:vert_bbox[2]] = 0  # zero out where the column lines are

    logits[:, logits[0] >= .5] = 0  # zero out where blanks are

    # Zero out where other segments are
    for i in range(logits.shape[0]):
        logits[i, segment_assignment != i] = 0

    detected_boxes = []
    for heatmap_idx in range(1, len(id2label)):  # Skip the blank class
        heatmap = logits[heatmap_idx]
        for candidate in get_detected_boxes(heatmap):
            if candidate.area <= 25:  # ignore tiny detections
                continue
            candidate.fit_to_bounds([0, 0, heatmap.shape[1] - 1, heatmap.shape[0] - 1])
            detected_boxes.append(LayoutBox(polygon=candidate.polygon, label=id2label[heatmap_idx], confidence=1))

    detected_boxes = sorted(detected_boxes, key=lambda x: x.confidence, reverse=True)

    # Expand bbox to cover intersecting lines
    box_lines = defaultdict(list)
    used_lines = set()

    # We try 2 rounds of identifying the correct lines to snap to
    # First round is majority intersection, second lowers the threshold
    for thresh in (.5, .4):
        for bbox_idx, bbox in enumerate(detected_boxes):
            for line_idx, line_bbox in enumerate(line_bboxes):
                if line_idx in used_lines:  # every line belongs to at most one box
                    continue
                if line_bbox.intersection_pct(bbox) > thresh:
                    # NOTE(review): reconstructed statements -- the stored value is
                    # indexed [0]..[3] below, so it is taken to be the line's
                    # ``bbox`` (x1, y1, x2, y2); confirm against the schema.
                    box_lines[bbox_idx].append(line_bbox.bbox)
                    used_lines.add(line_idx)

    new_boxes = []
    for bbox_idx, bbox in enumerate(detected_boxes):
        if bbox.label == "Picture" and bbox.area < 200:  # Remove very small figures
            continue

        # Skip if we didn't find any lines to snap to, except for Pictures and Formulas
        if bbox_idx not in box_lines and bbox.label not in ["Picture", "Formula"]:
            continue

        # Use .get so the lookup does not insert an empty entry into the
        # defaultdict; otherwise the "has lines" test below is always true.
        covered_lines = box_lines.get(bbox_idx, [])

        # Snap non-picture layout boxes to correct text boundaries
        if len(covered_lines) > 0 and bbox.label not in ["Picture"]:
            min_x = min(line[0] for line in covered_lines)
            min_y = min(line[1] for line in covered_lines)
            max_x = max(line[2] for line in covered_lines)
            max_y = max(line[3] for line in covered_lines)

            # Tables and formulas can contain text, but text isn't the whole area
            if bbox.label in ["Table", "Formula"]:
                min_x = min(min_x, min(point[0] for point in bbox.polygon))
                min_y = min(min_y, min(point[1] for point in bbox.polygon))
                max_x = max(max_x, max(point[0] for point in bbox.polygon))
                max_y = max(max_y, max(point[1] for point in bbox.polygon))

            # Corner order: top-left, top-right, bottom-right, bottom-left
            bbox.polygon[0][0] = min_x
            bbox.polygon[0][1] = min_y
            bbox.polygon[1][0] = max_x
            bbox.polygon[1][1] = min_y
            bbox.polygon[2][0] = max_x
            bbox.polygon[2][1] = max_y
            bbox.polygon[3][0] = min_x
            bbox.polygon[3][1] = max_y

        # A picture that contains text lines is relabelled as a figure
        if covered_lines and bbox.label in ["Picture"]:
            bbox.label = "Figure"

        new_boxes.append(bbox)

    # Merge tables together (sometimes one column is detected as a separate table)
    for _ in range(5):  # Up to 5 rounds of merging
        to_remove = set()
        for bbox_idx, bbox in enumerate(new_boxes):
            if bbox.label != "Table" or bbox_idx in to_remove:
                continue

            for bbox_idx2, bbox2 in enumerate(new_boxes):
                if bbox2.label != "Table" or bbox_idx2 in to_remove or bbox_idx == bbox_idx2:
                    continue

                if bbox.intersection_pct(bbox2) > 0:
                    # NOTE(review): reconstructed -- assumes the box type offers
                    # ``merge(other)`` that grows ``bbox`` to cover ``bbox2``.
                    bbox.merge(bbox2)
                    to_remove.add(bbox_idx2)

        if not to_remove:
            break  # nothing merged this round, so later rounds cannot change anything
        new_boxes = [bbox for idx, bbox in enumerate(new_boxes) if idx not in to_remove]

    # Ensure we account for all text lines in the layout
    for line_idx, line_bbox in enumerate(line_bboxes):
        if line_idx not in used_lines:
            new_boxes.append(LayoutBox(polygon=line_bbox.polygon, label="Text", confidence=.5))

    # Back to original image coordinates
    for bbox in new_boxes:
        bbox.rescale(processor_size, orig_size)

    detected_boxes = [bbox for bbox in new_boxes if bbox.area > 16]

    # Remove bboxes contained inside others, unless they're captions
    contained_bbox = set()
    for i, bbox in enumerate(detected_boxes):
        for j, bbox2 in enumerate(detected_boxes):
            if i == j:
                continue

            # bbox2 lies (almost) entirely inside bbox -> drop bbox2
            if bbox2.intersection_pct(bbox) >= .95 and bbox2.label not in ["Caption"]:
                contained_bbox.add(j)

    detected_boxes = [bbox for idx, bbox in enumerate(detected_boxes) if idx not in contained_bbox]

    return detected_boxes
def get_regions(heatmaps: List[np.ndarray], orig_size, id2label, segment_assignment) -> List[LayoutBox]:
    """Extract layout boxes straight from the class heatmaps.

    Every non-blank class heatmap is masked to the pixels assigned to that
    class, boxes are extracted from it, and the combined list is filtered
    with ``keep_largest_boxes``.

    Note: the masking writes into ``heatmaps[i]`` itself, so the caller's
    arrays are modified.

    Args:
        heatmaps: One score map per class; index 0 (blank) is skipped.
        orig_size: Original image size, forwarded to ``get_and_clean_boxes``.
        id2label: Mapping from class index to label string.
        segment_assignment: Per-pixel class index; must have the same shape
            as each heatmap.

    Returns:
        The boxes kept by ``keep_largest_boxes``.
    """
    regions = []
    for class_idx in range(1, len(id2label)):  # class 0 is the blank class
        class_map = heatmaps[class_idx]
        assert class_map.shape == segment_assignment.shape
        belongs_elsewhere = segment_assignment != class_idx
        class_map[belongs_elsewhere] = 0  # pixels owned by another segment do not count
        processor_size = list(reversed(class_map.shape))
        found_boxes = get_and_clean_boxes(class_map, processor_size, orig_size)
        regions.extend(
            LayoutBox(polygon=found.polygon, label=id2label[class_idx])
            for found in found_boxes
        )

    return keep_largest_boxes(regions)
def parallel_get_regions(heatmaps: List[np.ndarray], orig_size, id2label, detection_results=None) -> LayoutResult:
    """Post-process the layout heatmaps of a single image into a ``LayoutResult``.

    Args:
        heatmaps: One score map per class for this image.
        orig_size: Original image size; indices 0 and 1 become the far corner
            of ``image_bbox``.
        id2label: Mapping from class index to label string.
        detection_results: Optional text detection result for this image
            (a single result, despite the plural name). When given, layout
            boxes are snapped to the detected text lines; otherwise boxes are
            taken from the heatmaps alone.

    Returns:
        The layout result holding the boxes, the per-pixel class map as an
        8-bit image, and the image bounding box.
    """
    logits = np.stack(heatmaps, axis=0)
    # Each pixel is assigned to the class with the highest score
    segment_assignment = logits.argmax(axis=0)

    if detection_results is not None:
        bboxes = get_regions_from_detection_result(
            detection_results, heatmaps, orig_size, id2label, segment_assignment
        )
    else:
        bboxes = get_regions(heatmaps, orig_size, id2label, segment_assignment)

    segmentation_img = Image.fromarray(segment_assignment.astype(np.uint8))

    # NOTE(review): the keyword names ``bboxes`` and ``segmentation_map`` are
    # reconstructed (only ``image_bbox`` was still visible in the call);
    # confirm them, and any further fields, against surya.schema.LayoutResult.
    result = LayoutResult(
        bboxes=bboxes,
        segmentation_map=segmentation_img,
        image_bbox=[0, 0, orig_size[0], orig_size[1]],
    )
    return result
def batch_layout_detection(images: List, model, processor, detection_results: Optional[List[TextDetectionResult]] = None, batch_size=None) -> List[LayoutResult]:
    """Run layout detection on a batch of images.

    The model predictions come from ``batch_detection``; each image is then
    post-processed with ``parallel_get_regions``, either in this process or
    in a process pool.

    Args:
        images: Input images.
        model: Layout model; ``model.config.id2label`` supplies the labels.
        processor: Processor handed to ``batch_detection``.
        detection_results: Optional text detection results, one per image and
            in the same order as ``images``.
        batch_size: Forwarded to ``batch_detection``.

    Returns:
        One ``LayoutResult`` per image, in input order.
    """
    preds, orig_sizes = batch_detection(images, model, processor, batch_size=batch_size)
    id2label = model.config.id2label
    num_images = len(images)

    def detection_for(idx):
        # A missing or empty detection list means "no text lines available"
        return detection_results[idx] if detection_results else None

    # Ensures we don't parallelize with streamlit or too few images. An empty
    # batch also stays here: a process pool cannot be created with 0 workers.
    run_inline = (
        settings.IN_STREAMLIT
        or num_images == 0
        or num_images < settings.DETECTOR_MIN_PARALLEL_THRESH
    )
    if run_inline:
        return [
            parallel_get_regions(preds[i], orig_sizes[i], id2label, detection_for(i))
            for i in range(num_images)
        ]

    max_workers = min(settings.DETECTOR_POSTPROCESSING_CPU_WORKERS, num_images)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(parallel_get_regions, preds[i], orig_sizes[i], id2label, detection_for(i))
            for i in range(num_images)
        ]
        # Collect in submission order so results line up with the input images
        results = [future.result() for future in futures]

    return results