File size: 827 Bytes
2720487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import fitz as pymupdf
from surya.postprocessing.util import rescale_bbox


def get_pdf_lines(pdf_path, img_sizes):
    """Collect the text-line bounding boxes of a PDF, rescaled per page.

    For each entry in ``img_sizes`` the page with the same index is read,
    the bounding box of every text line on it is gathered, and each box is
    passed through ``rescale_bbox`` together with the page size and that
    entry of ``img_sizes``.

    Args:
        pdf_path: Path of the PDF, handed directly to ``pymupdf.open``.
        img_sizes: One target size per page to process, in page order.
            Presumably ``(width, height)`` of the rendered page image --
            confirm against ``rescale_bbox``.

    Returns:
        A list with one element per entry of ``img_sizes``; each element is
        the list of rescaled line boxes for that page, in the order the
        sorted text extraction yields them.

    Raises:
        IndexError: presumably, when ``img_sizes`` has more entries than the
            document has pages (raised by the page lookup) -- TODO confirm.
    """
    # Loop-invariant: the default "dict" extraction flags with ligature and
    # image preservation switched off.
    text_flags = (
        pymupdf.TEXTFLAGS_DICT
        & ~pymupdf.TEXT_PRESERVE_LIGATURES
        & ~pymupdf.TEXT_PRESERVE_IMAGES
    )

    page_lines = []
    # The context manager closes the document on every exit path, including
    # when a page lookup or text extraction raises.
    with pymupdf.open(pdf_path) as doc:
        for idx, img_size in enumerate(img_sizes):
            page = doc[idx]
            blocks = page.get_text("dict", sort=True, flags=text_flags)["blocks"]

            # Page extent in PDF units, derived from the page's bounding box.
            page_box = page.bound()
            page_size = (page_box[2] - page_box[0], page_box[3] - page_box[1])

            # NOTE(review): every block is expected to carry a "lines" key
            # because image preservation is disabled above -- confirm if the
            # flags ever change.
            page_lines.append([
                rescale_bbox(list(line["bbox"]), page_size, img_size)
                for block in blocks
                for line in block["lines"]
            ])

    return page_lines