|
from collections import defaultdict |
|
|
|
from magic_pdf.libs.boxbase import calculate_iou |
|
|
|
|
|
def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
    """
    Report whether ``bbox`` approximately matches any entry of ``bbox_list``.

    Two boxes match when every pair of corresponding coordinates differs by
    strictly less than ``tolerance``.

    Parameters
    ----------
    bbox : sequence of numbers
        Box to look up.
    bbox_list : iterable of sequences of numbers
        Reference boxes to compare against.
    tolerance : number, optional
        Exclusive upper bound on the per-coordinate absolute difference,
        by default 1.

    Returns
    -------
    bool
        True if at least one reference box matches, False otherwise
        (including when ``bbox_list`` is empty).
    """
    for candidate in bbox_list:
        for own, other in zip(bbox, candidate):
            if not abs(own - other) < tolerance:
                # One coordinate is too far off; move on to the next candidate.
                break
        else:
            # The inner loop never broke: every coordinate pair is within tolerance.
            return True
    return False
|
|
|
def is_single_line_block(block):
    """
    Heuristically decide whether ``block`` holds a single line of text.

    The block counts as a single line when it is at most three average
    character heights tall and more than three average character widths wide.

    Parameters
    ----------
    block : dict
        Must provide "X0", "X1", "bbox" and "avg_char_height";
        "avg_char_width" is only read when the height test passes.

    Returns
    -------
    bool
        Result of the height/width test described above.
    """
    # NOTE(review): the width comes from "X0"/"X1" while the height comes from
    # "bbox"; presumably both describe the same rectangle -- confirm with callers.
    horizontal_extent = block["X1"] - block["X0"]
    box = block["bbox"]
    vertical_extent = box[3] - box[1]

    short_enough = vertical_extent <= block["avg_char_height"] * 3
    if not short_enough:
        return short_enough
    return horizontal_extent > block["avg_char_width"] * 3
|
|
|
def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
    """
    Return the most frequently repeated bboxes near the top or bottom of a page.

    Parameters
    ----------
    bboxes : list
        Candidate bboxes; index 1 is compared against the top band and
        index 3 against the bottom band.
    page_height : float
        Height of the page.
    position : str, optional
        "top" selects the top band; any other value selects the bottom band,
        by default "top".
    threshold : float, optional
        Fraction of ``page_height`` that defines the band, by default 0.25.
    num_bboxes : int, optional
        Maximum number of bboxes to return, by default 3.
    min_frequency : int, optional
        Minimum number of exact repetitions a bbox needs, by default 2.

    Returns
    -------
    common_bboxes : list
        Up to ``num_bboxes`` bboxes (as tuples), most frequent first; ties keep
        the order in which the bboxes were first seen.
    """
    # Keep only the boxes that fall inside the requested band of the page.
    if position == "top":
        candidates = [box for box in bboxes if box[1] < page_height * threshold]
    else:
        candidates = [box for box in bboxes if box[3] > page_height * (1 - threshold)]

    # Count exact repetitions; tuples make the boxes usable as dict keys.
    occurrences = {}
    for box in candidates:
        key = tuple(box)
        occurrences[key] = occurrences.get(key, 0) + 1

    # The sort is stable, so equally frequent boxes stay in first-seen order.
    ranked = sorted(occurrences.items(), key=lambda entry: entry[1], reverse=True)
    frequent = [box for box, seen in ranked if seen >= min_frequency]
    return frequent[:num_bboxes]
|
|
|
def detect_footer_header2(result_dict, similarity_threshold=0.5):
    """
    Flag blocks that repeat at the top or bottom of pages as headers/footers.

    Only entries whose page key starts with "page_" and whose block key starts
    with "block_" are considered. Each such block receives integer
    ``is_header`` and ``is_footer`` flags (0 or 1), unless detection is skipped.

    Detection is skipped, and ``result_dict`` is returned untouched, when no
    blocks are found or when more than half of the blocks are single-line
    blocks according to ``is_single_line_block``.

    Parameters
    ----------
    result_dict : dict
        Mapping of page keys to dicts of blocks. Every considered block must
        provide "bbox" plus whatever ``is_single_line_block`` reads.
    similarity_threshold : float, optional
        Currently unused; kept so existing callers keep working, by default 0.5.

    Returns
    -------
    result_dict : dict
        The same object that was passed in, modified in place.
    """
    # Collect the relevant blocks once instead of re-walking the nested dicts
    # for every stage of the detection.
    blocks = []
    for page_id, page_blocks in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_key, block in page_blocks.items():
            if block_key.startswith("block_"):
                blocks.append(block)

    if not blocks:
        print("No blocks found. Skipping header/footer detection.")
        return result_dict

    # If there are too many single-line blocks, skip the header and footer detection.
    single_line_blocks = sum(1 for block in blocks if is_single_line_block(block))
    if single_line_blocks / len(blocks) > 0.5:
        return result_dict

    all_bboxes = [block["bbox"] for block in blocks]

    # The page height is approximated by the largest bottom edge of any block.
    page_height = max(bbox[3] for bbox in all_bboxes)

    common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top")
    common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom")

    # Mark every block whose bbox (within tolerance) matches a recurring one.
    for block in blocks:
        bbox = block["bbox"]
        block["is_header"] = int(compare_bbox_with_list(bbox, common_header_bboxes))
        block["is_footer"] = int(compare_bbox_with_list(bbox, common_footer_bboxes))

    return result_dict
|
|
|
|
|
def __get_page_size(page_sizes:list):
    """
    Return the mean (width, height) over all pages, since page sizes may differ.

    ``page_sizes`` is a list of (width, height) pairs; an empty list raises
    ZeroDivisionError.
    """
    page_total = len(page_sizes)
    mean_width = sum(width for width, _ in page_sizes) / page_total
    mean_height = sum(height for _, height in page_sizes) / page_total
    return mean_width, mean_height
|
|
|
def __calculate_iou(bbox1, bbox2):
    """
    Delegate to the imported ``calculate_iou`` helper and return its result
    (presumably the intersection-over-union of the two bboxes -- see that helper).
    """
    return calculate_iou(bbox1, bbox2)
|
|
|
def __is_same_pos(box1, box2, iou_threshold):
    """
    Tell whether two boxes count as being at the same position.

    Returns True when the value from ``__calculate_iou`` for the two boxes is
    at least ``iou_threshold``.
    """
    return __calculate_iou(box1, box2) >= iou_threshold
|
|
|
|
|
def _recurring_bboxes(candidates, min_count, iou_threshold):
    """Return every bbox that belongs to a same-position group of at least ``min_count`` members."""
    recurring = []
    for idx, anchor in enumerate(candidates):
        # A group is the anchor plus every later candidate at the same position.
        group = [anchor]
        group.extend(
            other for other in candidates[idx + 1:]
            if __is_same_pos(anchor, other, iou_threshold)
        )
        if len(group) >= min_count:
            recurring.extend(group)
    return recurring


def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int, page_range_threshold=0.2, iou_threshold=0.9):
    """
    Estimate the header and footer bands from bboxes that recur across pages.

    A bbox is considered recurring when at least ``max(3, page_cnt // 4)``
    bboxes share its position (as judged by ``__is_same_pos`` with
    ``iou_threshold``). Only bboxes lying entirely inside the top band
    (bottom edge above ``page_h * page_range_threshold``) or the bottom band
    (top edge below ``page_h * (1 - page_range_threshold)``) are examined.
    Index 1 of a bbox is treated as its top edge and index 3 as its bottom
    edge, with y growing downward.

    Parameters
    ----------
    bboxes : list
        All candidate bboxes from every page.
    page_size : list
        (width, height) pairs, one per page; they are averaged.
    page_cnt : int
        Number of pages, used to derive the minimum occurrence count.
    page_range_threshold : float, optional
        Fraction of the page height that forms each band, by default 0.2.
    iou_threshold : float, optional
        Threshold passed to ``__is_same_pos``, by default 0.9.

    Returns
    -------
    tuple
        ``(header_det_bbox, footer_det_bbox, page_w, page_h)`` where the header
        is ``[0, 0, page_w, hdr_y]`` and the footer is
        ``[0, btn_y, page_w, page_h]``. Without recurring bboxes ``hdr_y`` is 0
        and ``btn_y`` is ``page_h``.
    """
    min_occurance_cnt = max(3, page_cnt//4)

    page_w, page_h = __get_page_size(page_size)
    # Both limits are vertical positions, so both are derived from the page
    # height (the header limit was previously derived from the page width).
    top_y = page_h * page_range_threshold
    bottom_y = page_h * (1 - page_range_threshold)

    top_bbox = [b for b in bboxes if b[3] < top_y]
    bottom_bbox = [b for b in bboxes if b[1] > bottom_y]

    hdr_same_pos_group = _recurring_bboxes(top_bbox, min_occurance_cnt, iou_threshold)
    btn_same_pos_group = _recurring_bboxes(bottom_bbox, min_occurance_cnt, iou_threshold)

    # The header ends at the lowest recurring top-band bbox; the footer starts
    # at the highest recurring bottom-band bbox.
    hdr_y = max((b[3] for b in hdr_same_pos_group), default=0)
    btn_y = min((b[1] for b in btn_same_pos_group), default=page_h)

    header_det_bbox = [0, 0, page_w, hdr_y]
    footer_det_bbox = [0, btn_y, page_w, page_h]

    return header_det_bbox, footer_det_bbox, page_w, page_h
|
|
|
|
|
def drop_footer_header(pdf_info_dict:dict):
    """
    Rule-based header/footer removal using document-wide statistics.

    The header and footer bands come from ``get_most_common_bbox`` applied to
    the bboxes of all text blocks, images and backup images. On every page,
    items lying inside a band are tagged "header" or "footer", appended to the
    page's dropped list and removed from their original list, all in place.

    Each page dict must provide 'preproc_blocks', 'images', 'image_backup',
    'page_size', 'droped_text_block' and 'droped_image_block'.

    Returns the ``(header, footer)`` bboxes that were applied.
    """
    pages = pdf_info_dict.values()

    text_boxes = [blk['bbox'] for page in pages for blk in page['preproc_blocks']]
    image_boxes = [img['bbox'] for page in pages for img in page['images']]
    backup_boxes = [img['bbox'] for page in pages for img in page['image_backup']]
    page_size = [page['page_size'] for page in pages]
    page_cnt = len(pdf_info_dict)

    header, footer, page_w, page_h = get_most_common_bbox(
        text_boxes + image_boxes + backup_boxes, page_size, page_cnt
    )

    # Extend both bands across the full horizontal extent of the page, with one
    # unit of slack on the inner edge.
    if header:
        header = [0, 0, page_w, header[3]+1]
    if footer:
        footer = [0, footer[1]-1, page_w, page_h]

    def _move_out(page_info, source_key, dropped_key):
        """Tag items of ``page_info[source_key]`` inside a band and move them to ``page_info[dropped_key]``."""
        in_header = []
        in_footer = []
        for item in page_info[source_key]:
            box = item['bbox']
            if header and box[3]<=header[3]:
                item['tag'] = "header"
                in_header.append(item)
            elif footer and box[1]>=footer[1]:
                item['tag'] = "footer"
                in_footer.append(item)

        page_info[dropped_key].extend(in_header)
        page_info[dropped_key].extend(in_footer)

        # Remove only after the scan so the list is not mutated while iterating.
        for item in in_header + in_footer:
            page_info[source_key].remove(item)

    for page_info in pdf_info_dict.values():
        _move_out(page_info, 'preproc_blocks', 'droped_text_block')
        # Images on the header/footer are dropped too: first the regular ones,
        # then the backup ones.
        _move_out(page_info, 'images', 'droped_image_block')
        _move_out(page_info, 'image_backup', 'droped_image_block')

    return header, footer
|
|