""" |
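Take the bboxes that the API reports for a PDF and decide which ones to keep based on how they overlap.
1. First drop the bboxes that lie on top of images; "images" here covers both tables and pictures.
2. Then drop the image bboxes that lie on top of text blocks.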
""" |
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap |
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT |
def _drop_equations_overlapping(boxes: list, interline_equations: list, inline_equations: list) -> None:
    """Remove, in place, every equation that touches one of ``boxes``.

    An equation is dropped when ``_is_in_or_part_overlap(box, eq[:4])`` holds
    for any box; only the first four items of an equation are used as its bbox.
    """
    hits = []
    for box in boxes:
        for eq in interline_equations:
            if _is_in_or_part_overlap(box, eq[:4]):
                hits.append(eq)
        for eq in inline_equations:
            if _is_in_or_part_overlap(box, eq[:4]):
                hits.append(eq)

    # Remove only after the scan so neither list is mutated while it is iterated.
    for eq in hits:
        if eq in interline_equations:
            interline_equations.remove(eq)
        if eq in inline_equations:
            inline_equations.remove(eq)


def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list) -> tuple:
    """
    Resolve overlaps between image, table, equation and text bboxes of a page.

    text_raw_blocks has the structure taken directly from pymupdf; see
    test/assets/papre/pymu_textblocks.json for a sample.

    A deliberately blunt strategy is used, applied in this order:
    1. Text blocks for which ``_is_in(text_bbox, box)`` holds against an image
       or a table (the text must lie completely inside it) are tagged with
       ON_IMAGE_TEXT / ON_TABLE_TEXT and removed from text_raw_blocks.
    2. Equations that overlap an image are removed, then those that overlap a table.
    3. An image that overlaps any remaining text block is dropped.
    4. Images that overlap each other are all dropped (adjusting their bboxes
       so they no longer overlap is not done yet).

    All input lists are edited in place and returned as the same objects.

    Returns an 8-tuple:
    (images, tables, interline_equations, inline_equations, text_raw_blocks,
     text_block_removed, images_backup, text_block_removed_2)
    where text_block_removed holds the tagged text blocks taken out in step 1,
    images_backup holds the images dropped in steps 3 and 4, and
    text_block_removed_2 is always an empty list.
    """
    text_block_removed = []
    images_backup = []

    # Step 1a: text lying on an image.
    for image_box in images:
        for text_block in text_raw_blocks:
            if _is_in(text_block["bbox"], image_box):
                text_block['tag'] = ON_IMAGE_TEXT
                text_block_removed.append(text_block)
    # Step 1b: text lying on a table. A block matched here after an image match
    # is appended a second time and its tag is overwritten.
    for table_box in tables:
        for text_block in text_raw_blocks:
            if _is_in(text_block["bbox"], table_box):
                text_block['tag'] = ON_TABLE_TEXT
                text_block_removed.append(text_block)

    # The membership test guards against blocks that were collected more than once.
    for text_block in text_block_removed:
        if text_block in text_raw_blocks:
            text_raw_blocks.remove(text_block)

    # Step 2: equations on images, then equations on tables.
    _drop_equations_overlapping(images, interline_equations, inline_equations)
    _drop_equations_overlapping(tables, interline_equations, inline_equations)

    # Step 3: when an image and a text block overlap, the image loses.
    text_overlapped = [
        image_box
        for image_box in images
        if any(_is_in_or_part_overlap(image_box, text_block["bbox"]) for text_block in text_raw_blocks)
    ]
    images_backup.extend(text_overlapped)
    for image_box in text_overlapped:
        images.remove(image_box)

    # Step 4: images overlapping each other are all kept out of layout computation.
    overlapping_idx = set()
    for i in range(len(images)):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(images[i], images[j]):
                overlapping_idx.update((i, j))

    # Back the dropped images up in index order.
    images_backup.extend(images[idx] for idx in sorted(overlapping_idx))
    # Slice-assign instead of writing None placeholders and rebinding the name:
    # the caller's list then matches the returned one and never contains None.
    images[:] = [img for idx, img in enumerate(images) if idx not in overlapping_idx]

    # Placeholder kept so the returned tuple keeps its eight positions.
    text_block_removed_2 = []

    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """
    Report whether any two text blocks in the page body overlap horizontally,
    as judged by ``_is_left_overlap`` applied in both directions.

    When this happens the PDF is not processed any further, because it most
    likely means a formula went undetected.

    Only blocks whose bbox lies inside the vertical window [clip_y0, clip_y1]
    are compared. clip_y0 is the largest index-1 value among the header
    entries (0 when there is no header); clip_y1 is the smallest index-3 value
    among the footer entries (the lowest text bottom when there is no footer).

    Returns True on the first overlapping pair, otherwise False (also for an
    empty text_blocks).
    """
    if len(text_blocks) == 0:
        return False

    lowest_text_bottom = max(blk['bbox'][3] for blk in text_blocks)

    # NOTE(review): the header bound reads entry[1] and the footer bound reads
    # entry[3]; confirm this matches the layout of header/footer entries.
    clip_y0 = max((entry[1] for entry in header), default=0)
    clip_y1 = min((entry[3] for entry in footer), default=lowest_text_bottom)

    body_boxes = [
        blk["bbox"]
        for blk in text_blocks
        if blk["bbox"][1] >= clip_y0 and blk["bbox"][3] <= clip_y1
    ]

    # Visit every unordered pair once, testing the overlap in both directions.
    for idx, first in enumerate(body_boxes):
        for second in body_boxes[idx + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True
    return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """
    Look for a pair of blocks that overlap horizontally, as judged by
    ``_is_left_overlap`` applied in both directions.

    When this happens the PDF is not processed any further, because it most
    likely means a formula went undetected.

    Each item of useful_blocks must provide a "bbox" entry indexed as
    [x0, y0, x1, y1] (indices 0-3 are read).

    Returns a 3-tuple ``(found, smaller_bbox, larger_bbox)``:
    - ``(True, a, b)`` for the first overlapping pair, where ``a`` is the bbox
      whose area is not larger than that of ``b``;
    - ``(False, None, None)`` when no pair overlaps or useful_blocks is empty.
    """
    if not useful_blocks:
        # Same shape as the "no overlap" result so callers can always unpack
        # three values; this path used to return a bare False.
        return False, None, None

    page_min_y = 0
    page_max_y = max(block['bbox'][3] for block in useful_blocks)

    useful_bboxes = [
        block["bbox"]
        for block in useful_blocks
        if block["bbox"][1] >= page_min_y and block["bbox"][3] <= page_max_y
    ]

    def _area(bbox) -> float:
        return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])

    for i, box_i in enumerate(useful_bboxes):
        for box_j in useful_bboxes[i + 1:]:
            if _is_left_overlap(box_i, box_j) or _is_left_overlap(box_j, box_i):
                # Areas are only needed once an overlapping pair is found.
                if _area(box_i) > _area(box_j):
                    return True, box_j, box_i
                return True, box_i, box_j

    return False, None, None