File size: 827 Bytes
2720487 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
import fitz as pymupdf
from surya.postprocessing.util import rescale_bbox
def get_pdf_lines(pdf_path, img_sizes):
    """Collect the text-line bounding boxes of a PDF, rescaled per page.

    Page ``idx`` of the document is paired with ``img_sizes[idx]``; only as
    many pages as there are entries in ``img_sizes`` are processed.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pymupdf.open``.
        img_sizes: One target size per page, handed to ``rescale_bbox`` as
            the destination size (presumably ``(width, height)`` of the
            rendered page image -- confirm against ``rescale_bbox``).

    Returns:
        A list with one entry per processed page. Each entry is the list of
        line bounding boxes found on that page, in the order reported by
        ``page.get_text(..., sort=True)``, after being passed through
        ``rescale_bbox`` from the page's own dimensions to the matching
        ``img_sizes`` entry.
    """
    # Loop-invariant, so build it once: the default "dict" extraction flags
    # with ligature and image preservation cleared. Only the "lines" of each
    # block are read below, so image content is not needed.
    text_flags = (
        pymupdf.TEXTFLAGS_DICT
        & ~pymupdf.TEXT_PRESERVE_LIGATURES
        & ~pymupdf.TEXT_PRESERVE_IMAGES
    )

    page_lines = []
    # Open the document as a context manager so it is closed (and its file
    # handle released) even when extraction raises part-way through.
    with pymupdf.open(pdf_path) as doc:
        for idx, img_size in enumerate(img_sizes):
            page = doc[idx]
            blocks = page.get_text("dict", sort=True, flags=text_flags)["blocks"]

            # Page dimensions in the same coordinate system as the extracted
            # bboxes; used as the source size for rescaling.
            page_box = page.bound()
            page_size = (page_box[2] - page_box[0], page_box[3] - page_box[1])

            # Each bbox is copied into a fresh list before being handed to
            # rescale_bbox, as the original code did.
            page_lines.append([
                rescale_bbox(list(line["bbox"]), page_size, img_size)
                for block in blocks
                for line in block["lines"]
            ])

    return page_lines