from collections import defaultdict
from magic_pdf.libs.boxbase import calculate_iou
def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
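    """
    Return True if `bbox` matches any bbox in `bbox_list`, i.e. if every
    coordinate differs from a candidate's by less than `tolerance`.
    """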
return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
def is_single_line_block(block):
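    """
    Heuristic sketch: a block counts as "single line" when its height is at most
    about three average character heights while its width spans more than three
    average character widths. Assumes the block dict carries "X0"/"X1", "bbox",
    "avg_char_height" and "avg_char_width" fields.
    """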
# Determine based on the width and height of the block
block_width = block["X1"] - block["X0"]
block_height = block["bbox"][3] - block["bbox"][1]
# If the height of the block is close to the average character height and the width is large, it is considered a single line
return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
"""
This function gets the most common bboxes from the bboxes
Parameters
----------
bboxes : list
bboxes
page_height : float
height of the page
position : str, optional
"top" or "bottom", by default "top"
threshold : float, optional
threshold, by default 0.25
num_bboxes : int, optional
number of bboxes to return, by default 3
min_frequency : int, optional
minimum frequency of the bbox, by default 2
Returns
-------
common_bboxes : list
common bboxes
"""
# Filter bbox by position
if position == "top":
filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
else:
filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
# Find the most common bbox
bbox_count = defaultdict(int)
for bbox in filtered_bboxes:
bbox_count[tuple(bbox)] += 1
# Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
common_bboxes = [
bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
][:num_bboxes]
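    # Example (hypothetical coordinates): if the same running header occupies
    # roughly (72, 40, 523, 55) on most pages, that tuple dominates bbox_count
    # and is returned first for position="top".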
return common_bboxes
def detect_footer_header2(result_dict, similarity_threshold=0.5):
"""
This function detects the header and footer of the document.
Parameters
----------
result_dict : dict
result dictionary
Returns
-------
result_dict : dict
result dictionary
"""
    # Traverse all blocks in the document and count how many are single-line
    total_blocks = 0
    single_line_blocks = 0
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
total_blocks += 1
if is_single_line_block(block):
single_line_blocks += 1
# If there are no blocks, skip the header and footer detection
if total_blocks == 0:
print("No blocks found. Skipping header/footer detection.")
return result_dict
# If most of the blocks are single-line, skip the header and footer detection
    if single_line_blocks / total_blocks > 0.5:  # more than half of the blocks are single-line
# print("Skipping header/footer detection for text-dense document.")
return result_dict
# Collect the bounding boxes of all blocks
all_bboxes = []
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
all_bboxes.append(block["bbox"])
# Get the height of the page
page_height = max(bbox[3] for bbox in all_bboxes)
# Get the most common bbox lists for headers and footers
common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
# Detect and mark headers and footers
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
bbox = block["bbox"]
text = block["text"]
is_header = compare_bbox_with_list(bbox, common_header_bboxes)
is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
block["is_header"] = int(is_header)
block["is_footer"] = int(is_footer)
return result_dict
def __get_page_size(page_sizes: list):
    """
    Page sizes may differ from page to page; use the average width and height.
    """
    w = sum(w for w, h in page_sizes) / len(page_sizes)
    h = sum(h for w, h in page_sizes) / len(page_sizes)
    return w, h
def __calculate_iou(bbox1, bbox2):
iou = calculate_iou(bbox1, bbox2)
return iou
def __is_same_pos(box1, box2, iou_threshold):
iou = __calculate_iou(box1, box2)
return iou >= iou_threshold
def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int, page_range_threshold=0.2, iou_threshold=0.9):
"""
common bbox必须大于page_cnt的1/3
"""
min_occurance_cnt = max(3, page_cnt//4)
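    # e.g. for a 20-page document a bbox must recur on at least max(3, 20 // 4) = 5
    # pages (at IOU >= iou_threshold) before it is treated as a header/footer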
header_det_bbox = []
footer_det_bbox = []
hdr_same_pos_group = []
btn_same_pos_group = []
page_w, page_h = __get_page_size(page_size)
    # Both boundaries are y coordinates expressed as fractions of the page height
    top_y, bottom_y = page_h*page_range_threshold, page_h*(1-page_range_threshold)
top_bbox = [b for b in bboxes if b[3]<top_y]
bottom_bbox = [b for b in bboxes if b[1]>bottom_y]
    # Group the bboxes by position: two bboxes count as the same position when their IOU exceeds iou_threshold
for i in range(0, len(top_bbox)):
hdr_same_pos_group.append([top_bbox[i]])
for j in range(i+1, len(top_bbox)):
if __is_same_pos(top_bbox[i], top_bbox[j], iou_threshold):
#header_det_bbox = [min(top_bbox[i][0], top_bbox[j][0]), min(top_bbox[i][1], top_bbox[j][1]), max(top_bbox[i][2], top_bbox[j][2]), max(top_bbox[i][3],top_bbox[j][3])]
hdr_same_pos_group[i].append(top_bbox[j])
for i in range(0, len(bottom_bbox)):
btn_same_pos_group.append([bottom_bbox[i]])
for j in range(i+1, len(bottom_bbox)):
if __is_same_pos(bottom_bbox[i], bottom_bbox[j], iou_threshold):
#footer_det_bbox = [min(bottom_bbox[i][0], bottom_bbox[j][0]), min(bottom_bbox[i][1], bottom_bbox[j][1]), max(bottom_bbox[i][2], bottom_bbox[j][2]), max(bottom_bbox[i][3],bottom_bbox[j][3])]
btn_same_pos_group[i].append(bottom_bbox[j])
    # Keep only the groups whose bbox occurs on at least min_occurance_cnt pages
hdr_same_pos_group = [g for g in hdr_same_pos_group if len(g)>=min_occurance_cnt]
btn_same_pos_group = [g for g in btn_same_pos_group if len(g)>=min_occurance_cnt]
    # Flatten the two list-of-lists
hdr_same_pos_group = [bbox for g in hdr_same_pos_group for bbox in g]
btn_same_pos_group = [bbox for g in btn_same_pos_group for bbox in g]
    # Take the largest box[3] (lowest header edge) in hdr_same_pos_group and the smallest box[1] (highest footer edge) in btn_same_pos_group
hdr_same_pos_group.sort(key=lambda b:b[3])
btn_same_pos_group.sort(key=lambda b:b[1])
hdr_y = hdr_same_pos_group[-1][3] if hdr_same_pos_group else 0
btn_y = btn_same_pos_group[0][1] if btn_same_pos_group else page_h
header_det_bbox = [0, 0, page_w, hdr_y]
footer_det_bbox = [0, btn_y, page_w, page_h]
# logger.warning(f"header: {header_det_bbox}, footer: {footer_det_bbox}")
return header_det_bbox, footer_det_bbox, page_w, page_h
def drop_footer_header(pdf_info_dict:dict):
"""
启用规则探测,在全局的视角上通过统计的方法。
"""
header = []
footer = []
all_text_bboxes = [blk['bbox'] for _, val in pdf_info_dict.items() for blk in val['preproc_blocks']]
image_bboxes = [img['bbox'] for _, val in pdf_info_dict.items() for img in val['images']] + [img['bbox'] for _, val in pdf_info_dict.items() for img in val['image_backup']]
page_size = [val['page_size'] for _, val in pdf_info_dict.items()]
    page_cnt = len(pdf_info_dict.keys())  # total number of pages
header, footer, page_w, page_h = get_most_common_bbox(all_text_bboxes+image_bboxes, page_size, page_cnt)
""""
把范围扩展到页面水平的整个方向上
"""
if header:
header = [0, 0, page_w, header[3]+1]
if footer:
footer = [0, footer[1]-1, page_w, page_h]
    # After locating the header and footer regions, remove the text and image content inside them on every page
    # Remove text blocks first
for _, page_info in pdf_info_dict.items():
header_text_blk = []
footer_text_blk = []
for blk in page_info['preproc_blocks']:
blk_bbox = blk['bbox']
if header and blk_bbox[3]<=header[3]:
blk['tag'] = "header"
header_text_blk.append(blk)
elif footer and blk_bbox[1]>=footer[1]:
blk['tag'] = "footer"
footer_text_blk.append(blk)
        # Move them into droped_text_block
page_info['droped_text_block'].extend(header_text_blk)
page_info['droped_text_block'].extend(footer_text_blk)
for blk in header_text_blk:
page_info['preproc_blocks'].remove(blk)
for blk in footer_text_blk:
page_info['preproc_blocks'].remove(blk)
"""接下来把footer、header上的图片也删除掉。图片包括正常的和backup的"""
header_image = []
footer_image = []
for image_info in page_info['images']:
img_bbox = image_info['bbox']
if header and img_bbox[3]<=header[3]:
image_info['tag'] = "header"
header_image.append(image_info)
elif footer and img_bbox[1]>=footer[1]:
image_info['tag'] = "footer"
footer_image.append(image_info)
page_info['droped_image_block'].extend(header_image)
page_info['droped_image_block'].extend(footer_image)
for img in header_image:
page_info['images'].remove(img)
for img in footer_image:
page_info['images'].remove(img)
"""接下来吧backup的图片也删除掉"""
header_image = []
footer_image = []
for image_info in page_info['image_backup']:
img_bbox = image_info['bbox']
if header and img_bbox[3]<=header[3]:
image_info['tag'] = "header"
header_image.append(image_info)
elif footer and img_bbox[1]>=footer[1]:
image_info['tag'] = "footer"
footer_image.append(image_info)
page_info['droped_image_block'].extend(header_image)
page_info['droped_image_block'].extend(footer_image)
for img in header_image:
page_info['image_backup'].remove(img)
for img in footer_image:
page_info['image_backup'].remove(img)
return header, footer
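

# Minimal usage sketch (hypothetical data): builds a tiny pdf_info_dict with the
# fields this module reads (preproc_blocks, images, image_backup, page_size,
# droped_text_block, droped_image_block) and runs drop_footer_header on it.
# Real dictionaries come from the magic_pdf preprocessing pipeline; the page
# count and coordinates below are invented for illustration only.
if __name__ == "__main__":
    demo_pdf_info = {}
    for page_idx in range(12):
        demo_pdf_info[f"page_{page_idx}"] = {
            "preproc_blocks": [
                {"bbox": [72, 30, 520, 45]},    # repeating header-like block near the top
                {"bbox": [72, 100, 520, 700]},  # body block
            ],
            "images": [],
            "image_backup": [],
            "page_size": [595, 842],
            "droped_text_block": [],
            "droped_image_block": [],
        }
    header, footer = drop_footer_header(demo_pdf_info)
    print("header region:", header)
    print("footer region:", footer)
    print("blocks kept on page_0:", demo_pdf_info["page_0"]["preproc_blocks"])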