# NOTE: scraped page banner, not part of the module: "Spaces: Runtime error / Runtime error"
# Copyright (C) 2021, Mindee.
# This program is licensed under the Apache License version 2.
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
from collections import defaultdict
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fclusterdata

from doctr.utils.geometry import estimate_page_angle, resolve_enclosing_bbox, resolve_enclosing_rbbox, rotate_boxes
from doctr.utils.repr import NestedObject

# Public API of this module.
__all__ = ['DocumentBuilder']
class DocumentBuilder(NestedObject):
    """Implements a document builder

    Words detected on a page are grouped into lines, lines into blocks, and the result is
    exported as one table (``pandas.DataFrame``) per page.

    Args:
        resolve_lines: whether words should be automatically grouped into lines
        resolve_blocks: whether lines should be automatically grouped into blocks
        paragraph_break: relative length of the minimum space separating paragraphs
        export_as_straight_boxes: if True, force straight boxes in the export (fit a rectangle
            box to all rotated boxes). Else, keep the boxes format unchanged, no matter what it is.
    """

    def __init__(
        self,
        resolve_lines: bool = True,
        resolve_blocks: bool = True,
        paragraph_break: float = 0.035,
        export_as_straight_boxes: bool = False,
    ) -> None:
        self.resolve_lines = resolve_lines
        self.resolve_blocks = resolve_blocks
        self.paragraph_break = paragraph_break
        self.export_as_straight_boxes = export_as_straight_boxes

    @staticmethod
    def _sort_boxes(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Sort bounding boxes from top to bottom, left to right

        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) (in case of rotated bbox)

        Returns:
            tuple: indices of ordered boxes of shape (N,), boxes
                If straight boxes are passed to the function, boxes are unchanged
                else: boxes returned are straight boxes fitted to the straightened rotated boxes
                so that we fit the lines afterwards to the straightened page
        """
        if boxes.ndim == 3:
            # Undo the estimated page rotation, then keep the axis-aligned extent of each polygon
            boxes = rotate_boxes(
                loc_preds=boxes,
                angle=-estimate_page_angle(boxes),
                orig_shape=(1024, 1024),
                min_angle=5.,
            )
            boxes = np.concatenate((boxes.min(1), boxes.max(1)), -1)
        # Sort key: xmin + 2 * ymax / median box height, so the vertical position dominates
        median_height = np.median(boxes[:, 3] - boxes[:, 1])
        order = (boxes[:, 0] + 2 * boxes[:, 3] / median_height).argsort()
        return order, boxes

    def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: List[int]) -> List[List[int]]:
        """Split a line in sub_lines

        Args:
            boxes: bounding boxes of shape (N, 4)
            word_idcs: list of indexes for the words of the line

        Returns:
            A list of (sub-)lines computed from the original line (words)
        """
        # Sort words horizontally (by xmin)
        word_idcs = [word_idcs[idx] for idx in boxes[word_idcs, 0].argsort().tolist()]

        # Fewer than two words: nothing to split
        if len(word_idcs) < 2:
            return [word_idcs]

        lines = []
        sub_line = [word_idcs[0]]
        for i in word_idcs[1:]:
            # Horizontal gap between this word's left edge and the previous word's right edge
            gap = boxes[i, 0] - boxes[sub_line[-1]][2]
            same_sub_line = gap < self.paragraph_break
            if not same_sub_line:
                # Gap is at least a paragraph break: close the current sub-line
                lines.append(sub_line)
                sub_line = []
            sub_line.append(i)
        lines.append(sub_line)
        return lines

    def _resolve_lines(self, boxes: np.ndarray) -> List[List[int]]:
        """Order boxes to group them in lines

        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) in case of rotated bbox

        Returns:
            nested list of box indices
        """
        # Sort boxes, and straighten the boxes if they are rotated
        idxs, boxes = self._sort_boxes(boxes)

        # Compute median for boxes heights
        y_med = np.median(boxes[:, 3] - boxes[:, 1])

        lines = []
        words = [idxs[0]]  # Assign the top-left word to the first line
        # Running sum of the y-centers of the current line (divided by len(words) to get the mean)
        y_center_sum = boxes[idxs[0]][[1, 3]].mean()
        for idx in idxs[1:]:
            y_center = boxes[idx][[1, 3]].mean()
            y_dist = abs(y_center - y_center_sum / len(words))
            # If y-center of the box is close enough to mean y-center of the line, same line
            same_line = y_dist < y_med / 2
            if not same_line:
                # Close the current line, splitting it horizontally into sub-lines
                lines.extend(self._resolve_sub_lines(boxes, words))
                words = []
                y_center_sum = 0
            words.append(idx)
            y_center_sum += y_center

        # Use the remaining words to form the last line(s)
        if words:
            lines.extend(self._resolve_sub_lines(boxes, words))

        return lines

    @staticmethod
    def _resolve_blocks(boxes: np.ndarray, lines: List[List[int]]) -> List[List[List[int]]]:
        """Order lines to group them in blocks

        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2)
            lines: list of lines, each line is a list of idx

        Returns:
            nested list of box indices
        """
        rotated = boxes.ndim == 3

        # Resolve enclosing boxes of lines
        if rotated:
            box_lines = np.asarray([
                resolve_enclosing_rbbox([tuple(boxes[idx, :, :]) for idx in line])
                for line in lines  # type: ignore[misc]
            ])
        else:
            _box_lines = [
                resolve_enclosing_bbox([
                    (tuple(boxes[idx, :2]), tuple(boxes[idx, 2:])) for idx in line  # type: ignore[misc]
                ])
                for line in lines
            ]
            box_lines = np.asarray([(x1, y1, x2, y2) for ((x1, y1), (x2, y2)) in _box_lines])

        # Compute geometrical features of lines to clusterize
        # Clusterizing only with box centers yield to poor results for complex documents
        if rotated:
            box_features = np.stack(
                (
                    (box_lines[:, 0, 0] + box_lines[:, 0, 1]) / 2,
                    (box_lines[:, 0, 0] + box_lines[:, 2, 0]) / 2,
                    (box_lines[:, 0, 0] + box_lines[:, 2, 1]) / 2,
                    (box_lines[:, 0, 1] + box_lines[:, 2, 1]) / 2,
                    (box_lines[:, 0, 1] + box_lines[:, 2, 0]) / 2,
                    (box_lines[:, 2, 0] + box_lines[:, 2, 1]) / 2,
                ), axis=-1
            )
        else:
            box_features = np.stack(
                (
                    (box_lines[:, 0] + box_lines[:, 3]) / 2,
                    (box_lines[:, 1] + box_lines[:, 2]) / 2,
                    (box_lines[:, 0] + box_lines[:, 2]) / 2,
                    (box_lines[:, 1] + box_lines[:, 3]) / 2,
                    box_lines[:, 0],
                    box_lines[:, 1],
                ), axis=-1
            )

        # Compute clusters: one cluster label per line
        clusters = fclusterdata(box_features, t=0.1, depth=4, criterion='distance', metric='euclidean')

        # Group line indices by cluster label, keeping first-seen order of the labels
        _blocks: Dict[int, List[int]] = defaultdict(list)
        for line_idx, cluster_idx in enumerate(clusters):
            _blocks[cluster_idx].append(line_idx)

        # Retrieve word-box level to return a fully nested structure
        return [[lines[idx] for idx in block] for block in _blocks.values()]

    def _build_blocks(
        self,
        boxes: np.ndarray,
        word_preds: List[Tuple[str, float]],
        page_shapes: Tuple[int, int],
    ) -> Any:
        """Gather independent words in structured blocks

        Args:
            boxes: bounding boxes of all detected words of the page, of shape (N, 5) or (N, 4, 2)
            word_preds: list of all detected words of the page, of shape N
            page_shapes: (height, width) of the page, used to scale box coordinates

        Returns:
            list of rows, one per word:
            (block index, line index, word index in line, word prediction, xmin, ymin, xmax, ymax, confidence)
            where coordinates are box values multiplied by the page width/height and rounded, and
            confidence is column 4 of the box multiplied by 100 and rounded.

        Raises:
            ValueError: if the number of boxes differs from the number of word predictions
        """
        if boxes.shape[0] != len(word_preds):
            raise ValueError(f"Incompatible argument lengths: {boxes.shape[0]}, {len(word_preds)}")

        if boxes.shape[0] == 0:
            return []

        # Geometry used for grouping: rotated boxes as-is, straight boxes without the extra column
        geometry = boxes if boxes.ndim == 3 else boxes[:, :4]

        # Decide whether we try to form lines
        if self.resolve_lines:
            lines = self._resolve_lines(geometry)
            # Decide whether we try to form blocks
            if self.resolve_blocks and len(lines) > 1:
                _blocks = self._resolve_blocks(geometry, lines)
            else:
                _blocks = [lines]
        else:
            # Sort bounding boxes, one line for all boxes, one block for the line
            lines = [self._sort_boxes(geometry)[0]]
            _blocks = [lines]

        # NOTE(review): the export below indexes boxes[idx, 0..4] as scalars, so it only works
        # for straight boxes of shape (N, 5); rotated (N, 4, 2) input is not handled here.
        h, w = page_shapes
        rows = []
        for block_idx, block_lines in enumerate(_blocks):
            for line_idx, line in enumerate(block_lines):
                for i, idx in enumerate(line):
                    rows.append((
                        block_idx, line_idx, i, word_preds[idx],
                        int(round(boxes[idx, 0] * w)), int(round(boxes[idx, 1] * h)),
                        int(round(boxes[idx, 2] * w)), int(round(boxes[idx, 3] * h)),
                        int(round(boxes[idx, 4] * 100)),
                    ))

        return rows

    def extra_repr(self) -> str:
        """Return the builder's configuration as a string of ``name=value`` pairs."""
        return (f"resolve_lines={self.resolve_lines}, resolve_blocks={self.resolve_blocks}, "
                f"paragraph_break={self.paragraph_break}, "
                f"export_as_straight_boxes={self.export_as_straight_boxes}")

    def __call__(
        self,
        boxes: List[np.ndarray],
        text_preds: List[List[Tuple[str, float]]],
        page_shapes: List[Tuple[int, int]]
    ) -> List[pd.DataFrame]:
        """Re-arrange detected words into structured blocks

        Args:
            boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5)
                or (*, 6) for all words for a given page
            text_preds: list of N elements, where each element is the list of all word prediction (text + confidence)
            page_shapes: shape of each page, of size N

        Returns:
            list of N DataFrames (one per page) with columns
            block_num, line_num, word_num, word, xmin, ymin, xmax, ymax, confidence_score

        Raises:
            ValueError: if the three arguments do not have the same length
        """
        if len(boxes) != len(text_preds) or len(boxes) != len(page_shapes):
            raise ValueError("All arguments are expected to be lists of the same size")

        # If boxes are already straight OK, else fit a bounding rect to each rotated box
        if self.export_as_straight_boxes and len(boxes) > 0 and boxes[0].ndim == 3:
            # NOTE(review): the fitted boxes have 4 columns only (no confidence column), which
            # the row export in _build_blocks reads at index 4; confirm this path is supported.
            boxes = [np.concatenate((p_boxes.min(1), p_boxes.max(1)), 1) for p_boxes in boxes]

        columns = [
            "block_num", "line_num", "word_num", "word", "xmin", "ymin", "xmax", "ymax", "confidence_score"
        ]
        _pages = [
            pd.DataFrame.from_records(self._build_blocks(page_boxes, word_preds, shape), columns=columns)
            for shape, page_boxes, word_preds in zip(page_shapes, boxes, text_preds)
        ]

        return _pages