Spaces:
Runtime error
Runtime error
Upload 5 files
Browse files- codes/data_extraction.py +62 -0
- codes/image_processing.py +55 -0
- codes/table_detection.py +28 -0
- codes/table_preprocessing.py +35 -0
- codes/table_recognition.py +33 -0
codes/data_extraction.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import pytesseract
|
4 |
+
from pytesseract import Output
|
5 |
+
from datatypes.datatypes import Row, Cell
|
6 |
+
from codes.image_processing import ImageProcessor
|
7 |
+
from datatypes.config import Config
|
8 |
+
|
9 |
+
class TextDataExtraction():
    """Extract the text of every table cell with Tesseract OCR.

    Cells are obtained by intersecting the recognized rows and columns of
    each detected table; the OCR text of each cell is cleaned and stored on
    the table's ``extracted_rows``.
    """

    def __init__(self):
        pass

    def clean_ocr_data(self, value):
        """Return ``value`` keeping only alphanumerics, spaces and dots.

        Leading and trailing whitespace is removed from the result.

        Args:
            value: Raw OCR text of a single cell.

        Returns:
            The filtered and trimmed string.
        """
        kept = ''.join(ch for ch in value if ch == ' ' or ch == '.' or ch.isalnum())
        # str.strip() returns a new string, so its result has to be the
        # returned value (it was previously computed and thrown away).
        return kept.strip()

    def pytess(self, cell_pil_img):
        """Run Tesseract on one cell image and return its words joined by spaces.

        Args:
            cell_pil_img: Image of a single table cell.

        Returns:
            The recognized text with surrounding whitespace removed.
        """
        # NOTE(review): the trailing bare ``preserve_interword_spaces`` token
        # is not in ``-c name=value`` form — confirm it has the intended
        # effect. The config string is kept unchanged here.
        ocr_result = pytesseract.image_to_data(
            cell_pil_img,
            output_type=Output.DICT,
            config='-c tessedit_char_blacklist=œ˜â€œï¬â™Ã©œ¢!|”?«“¥ --psm 6 preserve_interword_spaces',
        )
        return ' '.join(ocr_result['text']).strip()

    def cell_data_extraction(self, image, table_data):
        """OCR every cell of every table and attach the text to ``table_data``.

        For each table, the detection box is cropped from ``image`` and
        padded with ``Config['table_padd']``. Each recognized row is cropped
        from that padded table, then split along the recognized columns; the
        first cell is extended to the left edge of the row and the last cell
        to the right edge.

        Args:
            image: Page image exposing ``crop``.
            table_data: Object whose ``tables`` each provide ``detection_box``,
                ``ordered_recognitiondata`` and ``extracted_rows``.

        Returns:
            The same ``table_data`` with one ``Row`` appended to
            ``extracted_rows`` per recognized row.
        """
        padding = Config['table_padd']
        tableimg_processor = ImageProcessor()

        for table in table_data.tables:
            table_image = image.crop(table.detection_box)
            table_image = tableimg_processor.image_padding(table_image, padd=padding)

            ordered = table.ordered_recognitiondata[0]
            columns = ordered.recognized_column
            last_column = len(columns) - 1

            for row_idx, table_row in enumerate(ordered.recognized_row):
                row_obj = Row([])
                xmin_row, ymin_row, xmax_row, ymax_row, _, _ = table_row

                row_image = table_image.crop((xmin_row, ymin_row, xmax_row, ymax_row))
                row_width, row_height = row_image.size
                row_obj.rowindex = row_idx

                for indx, table_column in enumerate(columns):
                    cell_obj = Cell()
                    xmin_col, _, xmax_col, _, _, _ = table_column

                    # Column x-coordinates are shifted by the padding before
                    # being used inside the row crop — presumably to map them
                    # from padded-table to row coordinates; TODO confirm.
                    xa = 0 if indx == 0 else xmin_col - padding
                    xb = row_width if indx == last_column else xmax_col - padding

                    # Each cell spans the full height of its row.
                    cell_img = row_image.crop((xa, 0, xb, row_height))

                    cell_obj.cellindex = indx
                    cell_obj.value = self.clean_ocr_data(self.pytess(cell_img))

                    row_obj.extracted_cells.append(cell_obj)
                table.extracted_rows.append(row_obj)

        return table_data
|
codes/image_processing.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import cv2
|
3 |
+
import numpy as np
|
4 |
+
from PIL import Image
|
5 |
+
|
6 |
+
# Image-processing techniques used to improve the images.
|
7 |
+
class ImageProcessor():
    """Helpers to convert, pad, sharpen and binarize images."""

    def __init__(self):
        pass

    def PIL_to_cv2(self, pil_img):
        """Return ``pil_img`` as an OpenCV array, converted from RGB to BGR."""
        rgb_pixels = np.array(pil_img)
        return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

    def cv2_to_PIL(self, cv_img):
        """Return ``cv_img`` as a PIL image, converted from BGR to RGB."""
        rgb_pixels = cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)
        return Image.fromarray(rgb_pixels)

    def image_padding(self, image, padd):
        """Surround ``image`` with a white border to avoid losing table content.

        Args:
            image: PIL image to pad.
            padd: Border width in pixels, added on each of the four sides.

        Returns:
            A new image of the same mode, ``2 * padd`` larger in both
            dimensions, with ``image`` pasted at offset ``(padd, padd)``.
        """
        width, height = image.size
        padded_size = (width + (2 * padd), height + (2 * padd))
        white = (255, 255, 255)
        canvas = Image.new(image.mode, padded_size, white)
        canvas.paste(image, (padd, padd))
        return canvas

    def sharpen_image(self, pil_img):
        """Sharpen ``pil_img`` with a 3x3 sharpening kernel and return a PIL image."""
        cv_img = self.PIL_to_cv2(pil_img)
        # Image sharpening kernel: centre weight 9, all neighbours -1.
        kernel = np.array([[-1, -1, -1],
                           [-1, 9, -1],
                           [-1, -1, -1]])
        # ddepth=-1 keeps the output depth equal to the input depth.
        sharpened = cv2.filter2D(cv_img, -1, kernel)
        return self.cv2_to_PIL(sharpened)

    def binarizeBlur_image(self, pil_img):
        """Inverse-threshold, blur, then invert ``pil_img``; return a PIL image."""
        cv_img = self.PIL_to_cv2(pil_img)
        # NOTE(review): with a threshold of 0, THRESH_BINARY_INV maps only
        # zero-valued channel samples to 255 — confirm this is intended.
        _, inverted = cv2.threshold(cv_img, 0, 255, cv2.THRESH_BINARY_INV)

        blurred = cv2.GaussianBlur(inverted, (3, 3), 0)
        # Undo the inversion introduced by the thresholding step.
        restored = 255 - blurred
        return self.cv2_to_PIL(restored)

    def whole_image_processing(self, pil_img):
        """Apply sharpening followed by binarize-and-blur to ``pil_img``."""
        return self.binarizeBlur_image(self.sharpen_image(pil_img))
|
codes/table_detection.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from datatypes.datatypes import ImageData
|
4 |
+
from datatypes.datatypes import TableDetectionData
|
5 |
+
|
6 |
+
class TableDetection():
    """Detect tables in an image with an object-detection model.

    Args:
        feature_extractor: Callable that encodes an image (called with
            ``return_tensors='pt'``) and exposes
            ``post_process_object_detection``.
        detection_model: Callable model that accepts the encoding as
            keyword arguments.
        threshold: Minimum score for a detection to be kept. ``None``
            selects ``DEFAULT_THRESHOLD``.
    """

    # Score cut-off that used to be hard-coded in the post-processing call.
    DEFAULT_THRESHOLD = 0.3

    def __init__(self, feature_extractor, detection_model, threshold):
        self.feature_extractor = feature_extractor
        self.detection_model = detection_model
        self.threshold = threshold

    def table_detection_from_image(self, detection_image):
        """Run table detection on ``detection_image``.

        Args:
            detection_image: Image exposing ``size`` as ``(width, height)``.

        Returns:
            An ``ImageData`` whose ``tables`` holds one ``TableDetectionData``
            (score, label, box) per detection above the threshold.
        """
        table_data_extraction = ImageData([])
        image_width, image_height = detection_image.size

        detection_encoding = self.feature_extractor(detection_image, return_tensors='pt')
        detection_output = self.detection_model(**detection_encoding)

        # Honour the threshold given to the constructor instead of a fixed
        # literal; fall back to the former constant when none was supplied.
        threshold = self.DEFAULT_THRESHOLD if self.threshold is None else self.threshold

        # target_sizes is given as (height, width) so boxes are scaled back
        # to the input image's pixel coordinates.
        detection_results = self.feature_extractor.post_process_object_detection(
            detection_output,
            threshold=threshold,
            target_sizes=[(image_height, image_width)],
        )[0]

        # Copy the detections into plain-Python containers.
        detections = zip(
            detection_results['scores'].tolist(),
            detection_results['labels'].tolist(),
            detection_results['boxes'].tolist(),
        )
        for score, label, bbox in detections:
            detection_table_results = TableDetectionData()
            detection_table_results.detection_score = score
            detection_table_results.detection_label = label
            detection_table_results.detection_box = bbox
            table_data_extraction.tables.append(detection_table_results)
        return table_data_extraction
|
28 |
+
|
codes/table_preprocessing.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from datatypes.datatypes import DetectionLabels, TableRecognitionOrdered
|
3 |
+
|
4 |
+
class TablePreprocessor():
    """Split recognized table structure into ordered rows and columns."""

    def __init__(self):
        pass

    def table_structure_sorting(self, table_data):
        """Build sorted row and column lists for every table in ``table_data``.

        For each table, the first entry of ``recognitiondata`` is read. Boxes
        labelled ``DetectionLabels.table_row`` are collected as rows, and
        boxes labelled ``DetectionLabels.table_column`` as columns. Each
        collected entry is a new list ``[*box, label, score]``. Rows are
        sorted by element 1 of the box and columns by element 0.

        The input boxes are left untouched, so calling this method more than
        once does not grow the entries of ``recognitiondata[0].boxes``.

        Args:
            table_data: Object whose ``tables`` each provide
                ``recognitiondata`` and ``ordered_recognitiondata``.

        Returns:
            The same ``table_data`` with one ``TableRecognitionOrdered``
            appended to each table's ``ordered_recognitiondata``.
        """
        row_label = DetectionLabels.table_row.value
        column_label = DetectionLabels.table_column.value

        for table in table_data.tables:
            recognition = table.recognitiondata[0]
            recognized_row = []
            recognized_column = []
            recognized_ord_obj = TableRecognitionOrdered([])

            for score, label, box in zip(recognition.scores, recognition.labels, recognition.boxes):
                # Build a fresh list rather than appending to ``box`` itself,
                # which would mutate the recognition data in place.
                if label == row_label:
                    recognized_row.append([*box, label, score])
                elif label == column_label:
                    recognized_column.append([*box, label, score])

            recognized_row.sort(key=lambda entry: entry[1])
            recognized_column.sort(key=lambda entry: entry[0])

            recognized_ord_obj.recognized_row = recognized_row
            recognized_ord_obj.recognized_column = recognized_column
            table.ordered_recognitiondata.append(recognized_ord_obj)

        return table_data
|
codes/table_recognition.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from datatypes.datatypes import TableRecognitionData, TableDetectionData
|
4 |
+
from codes.image_processing import ImageProcessor
|
5 |
+
from datatypes.config import Config
|
6 |
+
|
7 |
+
class TableRecognition:
    """Recognize the structure of previously detected tables.

    Args:
        feature_extractor: Callable that encodes an image (called with
            ``return_tensors='pt'``) and exposes
            ``post_process_object_detection``.
        recognition_model: Callable model that accepts the encoding as
            keyword arguments.
        threshold: Minimum score for a recognized element to be kept.
            ``None`` selects ``DEFAULT_THRESHOLD``.
    """

    # Score cut-off that used to be hard-coded in the post-processing call.
    DEFAULT_THRESHOLD = 0.7

    def __init__(self, feature_extractor, recognition_model, threshold):
        self.feature_extractor = feature_extractor
        self.recognition_model = recognition_model
        self.threshold = threshold

    def table_recognition_from_detection(self, recognition_image, detection_results):
        """Run structure recognition on every table in ``detection_results``.

        Each table's ``detection_box`` is cropped from ``recognition_image``,
        padded with ``Config['table_padd']`` and passed through the model.
        The resulting scores, labels and boxes are stored as plain lists.

        Args:
            recognition_image: Image exposing ``crop``.
            detection_results: Object whose ``tables`` each provide
                ``detection_box`` and ``recognitiondata``.

        Returns:
            The same ``detection_results`` with one ``TableRecognitionData``
            appended to each table's ``recognitiondata``.
        """
        img_processor = ImageProcessor()
        padding = Config['table_padd']
        # Honour the threshold given to the constructor instead of a fixed
        # literal; fall back to the former constant when none was supplied.
        threshold = self.DEFAULT_THRESHOLD if self.threshold is None else self.threshold

        for table in detection_results.tables:
            recognised_table_results = TableRecognitionData()
            detected_tbl = recognition_image.crop(table.detection_box)
            padded_table = img_processor.image_padding(image=detected_tbl, padd=padding)
            width, height = padded_table.size

            recognition_encoding = self.feature_extractor(padded_table, return_tensors='pt')
            recognition_output = self.recognition_model(**recognition_encoding)
            # target_sizes is (height, width) of the padded crop, so the
            # returned boxes are in the padded table's pixel coordinates.
            recognition_results = self.feature_extractor.post_process_object_detection(
                recognition_output,
                threshold=threshold,
                target_sizes=[(height, width)],
            )[0]

            recognised_table_results.scores = recognition_results['scores'].tolist()
            recognised_table_results.labels = recognition_results['labels'].tolist()
            recognised_table_results.boxes = recognition_results['boxes'].tolist()

            table.recognitiondata.append(recognised_table_results)
        return detection_results
|