import fitz as pymupdf

from surya.postprocessing.util import rescale_bbox


def get_pdf_lines(pdf_path, img_sizes):
    """Collect the text-line bounding boxes of a PDF, rescaled per page.

    Page ``i`` of the document is paired with ``img_sizes[i]``; only the
    first ``len(img_sizes)`` pages are read.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pymupdf.open``.
        img_sizes: One target size per page to process. Each entry is
            forwarded unchanged to ``rescale_bbox`` as the new size
            (presumably ``(width, height)`` of the rendered page image --
            confirm against ``rescale_bbox``).

    Returns:
        A list with one entry per element of ``img_sizes``. Each entry is
        the list of values returned by ``rescale_bbox`` for every text line
        on that page, in the order PyMuPDF reports them with ``sort=True``.

    Raises:
        Whatever ``pymupdf.open`` raises for an unreadable file, and whatever
        ``doc[idx]`` raises when ``img_sizes`` has more entries than the
        document has pages.
    """
    # Loop-invariant: default "dict" extraction flags with ligature and image
    # preservation switched off.
    text_flags = (
        pymupdf.TEXTFLAGS_DICT
        & ~pymupdf.TEXT_PRESERVE_LIGATURES
        & ~pymupdf.TEXT_PRESERVE_IMAGES
    )

    page_lines = []
    # The context manager closes the document even if a page fails to parse;
    # only plain lists computed inside the block leave this function.
    with pymupdf.open(pdf_path) as doc:
        for idx, img_size in enumerate(img_sizes):
            page = doc[idx]
            blocks = page.get_text("dict", sort=True, flags=text_flags)["blocks"]

            # Page extent in PDF units, used as the source size for rescaling.
            page_box = page.bound()
            page_size = (page_box[2] - page_box[0], page_box[3] - page_box[1])

            page_lines.append(
                [
                    rescale_bbox(list(line["bbox"]), page_size, img_size)
                    for block in blocks
                    for line in block["lines"]
                ]
            )
    return page_lines