# Spaces: AlhitawiMohammed22 / HTD_HTR
# Runtime error
# File size: 12,068 Bytes
# ff135d3


# Copyright (C) 2021, Mindee.

# This program is licensed under the Apache License version 2.
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.


from typing import Any, Dict, List, Tuple
import pandas as pd

import numpy as np
from scipy.cluster.hierarchy import fclusterdata

from doctr.utils.geometry import estimate_page_angle, resolve_enclosing_bbox, resolve_enclosing_rbbox, rotate_boxes
from doctr.utils.repr import NestedObject

__all__ = ['DocumentBuilder']


class DocumentBuilder(NestedObject):
    """Implements a document builder
    Args:
        resolve_lines: whether words should be automatically grouped into lines
        resolve_blocks: whether lines should be automatically grouped into blocks
        paragraph_break: relative length of the minimum space separating paragraphs
        export_as_straight_boxes: if True, force straight boxes in the export (fit a rectangle
            box to all rotated boxes). Else, keep the boxes format unchanged, no matter what it is.
    """

    def __init__(
        self,
        resolve_lines: bool = True,
        resolve_blocks: bool = True,
        paragraph_break: float = 0.035,
        export_as_straight_boxes: bool = False,
    ) -> None:

        self.resolve_lines = resolve_lines
        self.resolve_blocks = resolve_blocks
        self.paragraph_break = paragraph_break
        self.export_as_straight_boxes = export_as_straight_boxes

    @staticmethod
    def _sort_boxes(boxes: np.ndarray) -> np.ndarray:
        """Sort bounding boxes from top to bottom, left to right
        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) (in case of rotated bbox)
        Returns:
            tuple: indices of ordered boxes of shape (N,), boxes
                If straight boxes are passed tpo the function, boxes are unchanged
                else: boxes returned are straight boxes fitted to the straightened rotated boxes
                so that we fit the lines afterwards to the straigthened page
        """
        if boxes.ndim == 3:
            boxes = rotate_boxes(
                loc_preds=boxes,
                angle=-estimate_page_angle(boxes),
                orig_shape=(1024, 1024),
                min_angle=5.,
            )
            boxes = np.concatenate((boxes.min(1), boxes.max(1)), -1)
        return (boxes[:, 0] + 2 * boxes[:, 3] / np.median(boxes[:, 3] - boxes[:, 1])).argsort(), boxes

    def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: List[int]) -> List[List[int]]:
        """Split a line in sub_lines
        Args:
            boxes: bounding boxes of shape (N, 4)
            word_idcs: list of indexes for the words of the line
        Returns:
            A list of (sub-)lines computed from the original line (words)
        """
        lines = []
        # Sort words horizontally
        word_idcs = [word_idcs[idx]
                     for idx in boxes[word_idcs, 0].argsort().tolist()]

        # Eventually split line horizontally
        if len(word_idcs) < 2:
            lines.append(word_idcs)
        else:
            sub_line = [word_idcs[0]]
            for i in word_idcs[1:]:
                horiz_break = True

                prev_box = boxes[sub_line[-1]]
                # Compute distance between boxes
                dist = boxes[i, 0] - prev_box[2]
                # If distance between boxes is lower than paragraph break, same sub-line
                if dist < self.paragraph_break:
                    horiz_break = False

                if horiz_break:
                    lines.append(sub_line)
                    sub_line = []

                sub_line.append(i)
            lines.append(sub_line)

        return lines

    def _resolve_lines(self, boxes: np.ndarray) -> List[List[int]]:
        """Order boxes to group them in lines
        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) in case of rotated bbox
        Returns:
            nested list of box indices
        """

        # Sort boxes, and straighten the boxes if they are rotated
        idxs, boxes = self._sort_boxes(boxes)

        # Compute median for boxes heights
        y_med = np.median(boxes[:, 3] - boxes[:, 1])

        lines = []
        words = [idxs[0]]  # Assign the top-left word to the first line
        # Define a mean y-center for the line
        y_center_sum = boxes[idxs[0]][[1, 3]].mean()

        for idx in idxs[1:]:
            vert_break = True

            # Compute y_dist
            y_dist = abs(boxes[idx][[1, 3]].mean() - y_center_sum / len(words))
            # If y-center of the box is close enough to mean y-center of the line, same line
            if y_dist < y_med / 2:
                vert_break = False

            if vert_break:
                # Compute sub-lines (horizontal split)
                lines.extend(self._resolve_sub_lines(boxes, words))
                words = []
                y_center_sum = 0

            words.append(idx)
            y_center_sum += boxes[idx][[1, 3]].mean()

        # Use the remaining words to form the last(s) line(s)
        if len(words) > 0:
            # Compute sub-lines (horizontal split)
            lines.extend(self._resolve_sub_lines(boxes, words))

        return lines

    @staticmethod
    def _resolve_blocks(boxes: np.ndarray, lines: List[List[int]]) -> List[List[List[int]]]:
        """Cluster lines into blocks based on geometrical features of their enclosing boxes
        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2)
            lines: list of lines, each line is a list of idx
        Returns:
            nested list of box indices
        """
        # For each line, resolve the box enclosing all of its words, then derive the features
        # to clusterize (clusterizing only with box centers yields poor results for complex documents)
        if boxes.ndim == 3:
            box_lines = np.asarray([
                resolve_enclosing_rbbox([tuple(boxes[word, :, :]) for word in line])
                for line in lines
            ])
            box_features = np.stack(
                (
                    (box_lines[:, 0, 0] + box_lines[:, 0, 1]) / 2,
                    (box_lines[:, 0, 0] + box_lines[:, 2, 0]) / 2,
                    (box_lines[:, 0, 0] + box_lines[:, 2, 1]) / 2,
                    (box_lines[:, 0, 1] + box_lines[:, 2, 1]) / 2,
                    (box_lines[:, 0, 1] + box_lines[:, 2, 0]) / 2,
                    (box_lines[:, 2, 0] + box_lines[:, 2, 1]) / 2,
                ), axis=-1
            )
        else:
            corner_pairs = [
                resolve_enclosing_bbox([
                    (tuple(boxes[word, :2]), tuple(boxes[word, 2:])) for word in line
                ])
                for line in lines
            ]
            # Flatten ((x1, y1), (x2, y2)) into (x1, y1, x2, y2)
            box_lines = np.asarray([
                (x1, y1, x2, y2) for ((x1, y1), (x2, y2)) in corner_pairs
            ])
            box_features = np.stack(
                (
                    (box_lines[:, 0] + box_lines[:, 3]) / 2,
                    (box_lines[:, 1] + box_lines[:, 2]) / 2,
                    (box_lines[:, 0] + box_lines[:, 2]) / 2,
                    (box_lines[:, 1] + box_lines[:, 3]) / 2,
                    box_lines[:, 0],
                    box_lines[:, 1],
                ), axis=-1
            )

        # One cluster label per line
        labels = fclusterdata(
            box_features, t=0.1, depth=4, criterion='distance', metric='euclidean')

        # Gather line positions per cluster label, in order of first appearance
        members_by_label: Dict[int, List[int]] = {}
        for line_pos, label in enumerate(labels):
            members_by_label.setdefault(label, []).append(line_pos)

        # Swap line positions for the lines themselves to return a fully nested structure
        return [[lines[line_pos] for line_pos in members] for members in members_by_label.values()]

    def _build_blocks(self, boxes: np.ndarray, word_preds: List[Tuple[str, float]], page_shapes: List[Tuple[int, int]]) -> Any:
        """Gather independent words in structured blocks
        Args:
            boxes: bounding boxes of all detected words of the page, of shape (N, 5) or (N, 4, 2)
            word_preds: list of all detected words of the page, of shape N
        Returns:
            list of block elements
        """

        if boxes.shape[0] != len(word_preds):
            raise ValueError(
                f"Incompatible argument lengths: {boxes.shape[0]}, {len(word_preds)}")

        if boxes.shape[0] == 0:
            return []

        # Decide whether we try to form lines
        _boxes = boxes
        if self.resolve_lines:
            lines = self._resolve_lines(
                _boxes if _boxes.ndim == 3 else _boxes[:, :4])
            # Decide whether we try to form blocks
            if self.resolve_blocks and len(lines) > 1:
                _blocks = self._resolve_blocks(
                    _boxes if _boxes.ndim == 3 else _boxes[:, :4], lines)
            else:
                _blocks = [lines]
        else:
            # Sort bounding boxes, one line for all boxes, one block for the line
            lines = [self._sort_boxes(
                _boxes if _boxes.ndim == 3 else _boxes[:, :4])[0]]
            _blocks = [lines]

        rows = []
        for block_idx, lines in enumerate(_blocks):
            for line_idx, line in enumerate(lines):
                for i,idx in enumerate(line):
                    h, w = page_shapes
                    row = (
                        block_idx, line_idx, i, word_preds[idx],
                        int(round(boxes[idx, 0]*w)
                            ), int(round(boxes[idx, 1]*h)),
                        int(round(boxes[idx, 2]*w)
                            ), int(round(boxes[idx, 3]*h)),
                        int(round(boxes[idx, 4]*100))
                    )
                    rows.append(row)

        return rows

    def extra_repr(self) -> str:
        return (f"resolve_lines={self.resolve_lines}, resolve_blocks={self.resolve_blocks}, "
                f"paragraph_break={self.paragraph_break}, "
                f"export_as_straight_boxes={self.export_as_straight_boxes}")

    def __call__(
        self,
        boxes: List[np.ndarray],
        text_preds: List[List[Tuple[str, float]]],
        page_shapes: List[Tuple[int, int]]
    ) -> pd.DataFrame:
        """Re-arrange detected words into structured blocks
        Args:
            boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5)
                or (*, 6) for all words for a given page
            text_preds: list of N elements, where each element is the list of all word prediction (text + confidence)
            page_shape: shape of each page, of size N
        Returns:
            document object
        """
        if len(boxes) != len(text_preds) or len(boxes) != len(page_shapes):
            raise ValueError(
                "All arguments are expected to be lists of the same size")

        if self.export_as_straight_boxes and len(boxes) > 0:
            # If boxes are already straight OK, else fit a bounding rect
            if boxes[0].ndim == 3:
                straight_boxes = []
                # Iterate over pages
                for p_boxes in boxes:
                    # Iterate over boxes of the pages
                    straight_boxes.append(np.concatenate(
                        (p_boxes.min(1), p_boxes.max(1)), 1))
                boxes = straight_boxes

        _pages = [
            pd.DataFrame.from_records(self._build_blocks(page_boxes, word_preds, shape), columns=[
                "block_num", "line_num", "word_num" ,"word", "xmin", "ymin", "xmax", "ymax", "confidence_score"
            ])
            for _idx, shape, page_boxes, word_preds in zip(range(len(boxes)), page_shapes, boxes, text_preds)
        ]

        return _pages