|
""" |
|
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍 |
|
1. 首先去掉出现在图片上的bbox,图片包括表格和图片 |
|
2. 然后去掉出现在文字block上的图片bbox
|
""" |
|
|
|
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap |
|
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT |
|
|
|
|
|
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list, |
|
text_raw_blocks: list): |
|
""" |
|
text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json |
|
当下采用一种粗暴的方式: |
|
1. 去掉图片上的公式 |
|
2. 去掉table上的公式 |
|
2. 图片和文字block部分重叠,首先丢弃图片 |
|
3. 图片和图片重叠,修改图片的bbox,使得图片不重叠(暂时没这么做,先把图片都扔掉) |
|
4. 去掉文字bbox里位于图片、表格上的文字(一定要完全在图、表内部) |
|
5. 去掉表格上的文字 |
|
""" |
|
text_block_removed = [] |
|
images_backup = [] |
|
|
|
|
|
for image_box in images: |
|
for text_block in text_raw_blocks: |
|
text_bbox = text_block["bbox"] |
|
if _is_in(text_bbox, image_box): |
|
text_block['tag'] = ON_IMAGE_TEXT |
|
text_block_removed.append(text_block) |
|
|
|
for table_box in tables: |
|
for text_block in text_raw_blocks: |
|
text_bbox = text_block["bbox"] |
|
if _is_in(text_bbox, table_box): |
|
text_block['tag'] = ON_TABLE_TEXT |
|
text_block_removed.append(text_block) |
|
|
|
for text_block in text_block_removed: |
|
if text_block in text_raw_blocks: |
|
text_raw_blocks.remove(text_block) |
|
|
|
|
|
temp = [] |
|
for image_box in images: |
|
for eq1 in interline_equations: |
|
if _is_in_or_part_overlap(image_box, eq1[:4]): |
|
temp.append(eq1) |
|
for eq2 in inline_equations: |
|
if _is_in_or_part_overlap(image_box, eq2[:4]): |
|
temp.append(eq2) |
|
|
|
for eq in temp: |
|
if eq in interline_equations: |
|
interline_equations.remove(eq) |
|
if eq in inline_equations: |
|
inline_equations.remove(eq) |
|
|
|
|
|
temp = [] |
|
for table_box in tables: |
|
for eq1 in interline_equations: |
|
if _is_in_or_part_overlap(table_box, eq1[:4]): |
|
temp.append(eq1) |
|
for eq2 in inline_equations: |
|
if _is_in_or_part_overlap(table_box, eq2[:4]): |
|
temp.append(eq2) |
|
|
|
for eq in temp: |
|
if eq in interline_equations: |
|
interline_equations.remove(eq) |
|
if eq in inline_equations: |
|
inline_equations.remove(eq) |
|
|
|
|
|
for image_box in images: |
|
for text_block in text_raw_blocks: |
|
text_bbox = text_block["bbox"] |
|
if _is_in_or_part_overlap(image_box, text_bbox): |
|
images_backup.append(image_box) |
|
break |
|
for image_box in images_backup: |
|
images.remove(image_box) |
|
|
|
|
|
images_dup_index = [] |
|
for i in range(len(images)): |
|
for j in range(i + 1, len(images)): |
|
if _is_in_or_part_overlap(images[i], images[j]): |
|
images_dup_index.append(i) |
|
images_dup_index.append(j) |
|
|
|
dup_idx = set(images_dup_index) |
|
for img_id in dup_idx: |
|
images_backup.append(images[img_id]) |
|
images[img_id] = None |
|
|
|
images = [img for img in images if img is not None] |
|
|
|
|
|
|
|
|
|
text_block_removed_2 = [] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2 |
|
|
|
|
|
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """
    Report whether any two text blocks overlap horizontally.

    When this happens the PDF is not processed any further, because it most
    likely means that an equation went undetected.

    Only blocks lying inside the vertical window ``[clip_y0, clip_y1]`` are
    compared, where ``clip_y0`` is the largest ``item[1]`` found in ``header``
    (0 when ``header`` is empty) and ``clip_y1`` is the smallest ``item[3]``
    found in ``footer`` (the largest block ``bbox[3]`` when ``footer`` is empty).

    Args:
        text_blocks: dicts carrying a ``"bbox"`` entry.
        header: sized collection of header bboxes.
        footer: sized collection of footer bboxes.

    Returns:
        True as soon as ``_is_left_overlap`` holds for a pair of blocks in
        either argument order, otherwise False.
    """
    if len(text_blocks) == 0:
        return False

    lowest_bottom = max(block['bbox'][3] for block in text_blocks)

    # NOTE(review): the window starts at the header's item[1] and ends at the
    # footer's item[3], so header/footer blocks themselves still pass the
    # filter below -- confirm this is intended.
    clip_y0 = max([box[1] for box in header]) if len(header) > 0 else 0
    clip_y1 = min([box[3] for box in footer]) if len(footer) > 0 else lowest_bottom

    candidates = [
        bbox
        for bbox in (block["bbox"] for block in text_blocks)
        if bbox[1] >= clip_y0 and bbox[3] <= clip_y1
    ]

    # Compare every unordered pair once, testing both argument orders.
    for position, first in enumerate(candidates):
        for second in candidates[position + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True

    return False
|
|
|
|
|
def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """
    Look for the first pair of useful blocks that overlap horizontally.

    When such an overlap exists the PDF is not processed any further, because
    it most likely means that an equation went undetected.

    Args:
        useful_blocks: dicts carrying a ``"bbox"`` entry ``[x0, y0, x1, y1]``.

    Returns:
        A 3-tuple on every path, so callers can always unpack it:

        * ``(True, smaller_bbox, larger_bbox)`` for the first pair for which
          ``_is_left_overlap`` holds in either argument order; on equal areas
          the earlier block is reported first;
        * ``(False, None, None)`` when no pair overlaps or the input is empty.
    """
    # Previously the empty case returned a bare False, which broke callers that
    # unpack three values.
    if len(useful_blocks) == 0:
        return False, None, None

    page_min_y = 0
    page_max_y = max(block['bbox'][3] for block in useful_blocks)

    # page_max_y is the maximum bbox[3], so in practice only blocks starting
    # above y = 0 are filtered out here.
    useful_bboxes = [
        bbox
        for bbox in (block["bbox"] for block in useful_blocks)
        if bbox[1] >= page_min_y and bbox[3] <= page_max_y
    ]

    for position, first in enumerate(useful_bboxes):
        for second in useful_bboxes[position + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                # Areas are only needed once an overlap has been found.
                first_area = (first[2] - first[0]) * (first[3] - first[1])
                second_area = (second[2] - second[0]) * (second[3] - second[1])
                if first_area > second_area:
                    return True, second, first
                return True, first, second

    return False, None, None
|
|