vteam27
committed on
Commit
•
1cfd79c
1
Parent(s):
dc813d0
added searchable pdf
Browse files- app.py +19 -2
- requirements.txt +3 -1
- utils.py +163 -0
app.py
CHANGED
@@ -5,6 +5,8 @@ from doctr.io import DocumentFile
|
|
5 |
from doctr.models import ocr_predictor
|
6 |
import gradio as gr
|
7 |
from PIL import Image
|
|
|
|
|
8 |
|
9 |
predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)
|
10 |
|
@@ -15,6 +17,10 @@ def greet(img):
|
|
15 |
img.save("out.jpg")
|
16 |
doc = DocumentFile.from_images("out.jpg")
|
17 |
output=predictor(doc)
|
|
|
|
|
|
|
|
|
18 |
res=""
|
19 |
for obj in output.pages:
|
20 |
for obj1 in obj.blocks:
|
@@ -23,16 +29,27 @@ def greet(img):
|
|
23 |
res=res + " " + obj3.value
|
24 |
res=res + "\n"
|
25 |
res=res + "\n"
|
|
|
26 |
_output_name = "RESULT_OCR.txt"
|
|
|
|
|
27 |
open(_output_name, 'w').close() # clear file
|
28 |
with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
|
29 |
f.write(res)
|
30 |
print("Writing into file")
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
demo = gr.Interface(fn=greet,
|
34 |
inputs=gr.Image(type="pil"),
|
35 |
-
outputs=["text", "file"],
|
36 |
title=title,
|
37 |
description=description,
|
38 |
examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]
|
|
|
5 |
from doctr.models import ocr_predictor
|
6 |
import gradio as gr
|
7 |
from PIL import Image
|
8 |
+
import base64
|
9 |
+
from utils import HocrParser
|
10 |
|
11 |
predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)
|
12 |
|
|
|
17 |
img.save("out.jpg")
|
18 |
doc = DocumentFile.from_images("out.jpg")
|
19 |
output=predictor(doc)
|
20 |
+
|
21 |
+
xml_outputs = output.export_as_xml()
|
22 |
+
parser = HocrParser()
|
23 |
+
|
24 |
res=""
|
25 |
for obj in output.pages:
|
26 |
for obj1 in obj.blocks:
|
|
|
29 |
res=res + " " + obj3.value
|
30 |
res=res + "\n"
|
31 |
res=res + "\n"
|
32 |
+
|
33 |
_output_name = "RESULT_OCR.txt"
|
34 |
+
_output_name_pdf="RESULT_OCR.pdf"
|
35 |
+
|
36 |
open(_output_name, 'w').close() # clear file
|
37 |
with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
|
38 |
f.write(res)
|
39 |
print("Writing into file")
|
40 |
+
|
41 |
+
base64_encoded_pdfs = list()
|
42 |
+
for i, (xml, img) in enumerate(zip(xml_outputs, doc)):
|
43 |
+
xml_element_tree = xml[1]
|
44 |
+
parser.export_pdfa(_output_name_pdf,
|
45 |
+
hocr=xml_element_tree, image=img)
|
46 |
+
with open(_output_name_pdf, 'rb') as f:
|
47 |
+
base64_encoded_pdfs.append(base64.b64encode(f.read()))
|
48 |
+
return res, _output_name, _output_name_pdf
|
49 |
|
50 |
demo = gr.Interface(fn=greet,
|
51 |
inputs=gr.Image(type="pil"),
|
52 |
+
outputs=["text", "file","file"],
|
53 |
title=title,
|
54 |
description=description,
|
55 |
examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]
|
requirements.txt
CHANGED
@@ -1,3 +1,5 @@
|
|
1 |
pycairo
|
2 |
python-doctr[torch]@git+https://github.com/mindee/doctr.git
|
3 |
-
gradio
|
|
|
|
|
|
1 |
pycairo
|
2 |
python-doctr[torch]@git+https://github.com/mindee/doctr.git
|
3 |
+
gradio
|
4 |
+
reportlab>=3.6.2
|
5 |
+
PyPDF2==1.26.0
|
utils.py
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import re
|
3 |
+
from tempfile import TemporaryDirectory
|
4 |
+
from math import atan, cos, sin
|
5 |
+
from typing import Dict, Optional, Tuple
|
6 |
+
from xml.etree import ElementTree as ET
|
7 |
+
from xml.etree.ElementTree import Element
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import PyPDF2
|
11 |
+
from PyPDF2 import PdfFileMerger
|
12 |
+
from doctr.io import DocumentFile
|
13 |
+
from doctr.models import ocr_predictor
|
14 |
+
from PIL import Image
|
15 |
+
from reportlab.lib.colors import black
|
16 |
+
from reportlab.lib.units import inch
|
17 |
+
from reportlab.lib.utils import ImageReader
|
18 |
+
from reportlab.pdfgen.canvas import Canvas
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
|
23 |
+
class HocrParser():
    """Read geometry and text out of hOCR elements and render a text-layer PDF.

    The helpers parse the ``bbox`` and ``baseline`` entries found in the
    ``title`` attribute of hOCR elements; ``export_pdfa`` uses them to place
    each recognised word on a reportlab canvas, optionally together with the
    page image.
    """

    # Single-code-point typographic ligatures mapped to their plain-letter
    # spelling. str.maketrans() only accepts length-1 keys in a dict, so the
    # ligatures are written as explicit escapes to keep them from being
    # silently flattened to multi-character ASCII keys.
    _LIGATURE_TABLE = str.maketrans({
        '\ufb00': 'ff',
        '\ufb03': 'ffi',
        '\ufb04': 'ffl',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
    })

    def __init__(self):
        # "bbox x1 y1 x2 y2" with non-negative integer pixel coordinates.
        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
        # "baseline slope intercept" (signed decimals).
        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')

    def _element_coordinates(self, element: Element) -> Dict:
        """
        Return the bounding box of an element as a dict with the integer
        keys 'x1', 'y1', 'x2', 'y2' (pixels).

        All four values are 0 when the element has no ``title`` attribute or
        the attribute holds no ``bbox`` entry.
        """
        title = element.attrib.get('title')
        if title is not None:
            matches = self.box_pattern.search(title)
            if matches:
                x1, y1, x2, y2 = (int(c) for c in matches.group(1).split())
                return {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
        return {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}

    def _get_baseline(self, element: Element) -> Tuple[float, float]:
        """
        Return a tuple containing the baseline slope and intercept.

        Falls back to ``(0.0, 0.0)`` when the element has no ``title``
        attribute or the attribute holds no ``baseline`` entry.
        """
        title = element.attrib.get('title')
        if title is not None:
            # search() returns None when there is no baseline entry, so it
            # must be checked before reading the captured group.
            matches = self.baseline_pattern.search(title)
            if matches:
                slope, intercept = matches.group(1).split()
                return float(slope), float(intercept)
        return (0.0, 0.0)

    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
        """
        Convert a pixel bounding box to PDF units (pt).

        ``pxl`` must carry the keys 'x1', 'y1', 'x2', 'y2'; the result has the
        same keys. Values are looked up by key so the conversion does not
        depend on the insertion order of ``pxl``.
        """
        return {key: pxl[key] / dpi * inch for key in ('x1', 'y1', 'x2', 'y2')}

    def _get_element_text(self, element: Element) -> str:
        """
        Return the textual content of the element and its children,
        including the element's own tail text.
        """
        parts = []
        if element.text is not None:
            parts.append(element.text)
        for child in element:
            parts.append(self._get_element_text(child))
        if element.tail is not None:
            parts.append(element.tail)
        return ''.join(parts)

    def _normalize_word_text(self, text: str) -> str:
        """
        Strip surrounding whitespace and expand ligature characters
        (see ``_LIGATURE_TABLE``) into plain letters.
        """
        return text.strip().translate(self._LIGATURE_TABLE)

    def export_pdfa(self,
                    out_filename: str,
                    hocr: ET.ElementTree,
                    image: Optional[np.ndarray] = None,
                    fontname: str = "Times-Roman",
                    fontsize: int = 12,
                    invisible_text: bool = True,
                    add_spaces: bool = True,
                    dpi: int = 300):
        """
        Write a PDF to ``out_filename`` from one hOCR page.

        NOTE(review): the name says PDF/A, but nothing in this method sets
        PDF/A-specific metadata; confirm conformance before relying on it.

        Args:
            out_filename: path handed to the reportlab ``Canvas``.
            hocr: hOCR tree; the first ``div`` with class ``ocr_page``
                supplies the page size.
            image: optional page image array, drawn over the full page.
            fontname: font used for the text layer.
            fontsize: font size used for the text layer.
            invisible_text: when True the text uses render mode 3 (invisible).
            add_spaces: when True a trailing space is appended to every word.
            dpi: resolution used to convert pixel coordinates to points.

        Raises:
            ValueError: if the tree contains no ``ocr_page`` div.
        """
        # The first ocr_page div defines the page dimensions.
        page = hocr.find(".//div[@class='ocr_page']")
        if page is None:
            raise ValueError("Could not determine page size")
        page_box = self._pt_from_pixel(self._element_coordinates(page), dpi)
        width = page_box['x2'] - page_box['x1']
        height = page_box['y2'] - page_box['y1']

        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)

        for line in hocr.iterfind(".//span[@class='ocr_line']"):
            # get information from xml
            line_box = self._pt_from_pixel(self._element_coordinates(line), dpi)

            # compute baseline; near-horizontal slopes are snapped to 0
            slope, pxl_intercept = self._get_baseline(line)
            if abs(slope) < 0.005:
                slope = 0.0
            angle = atan(slope)
            cos_a, sin_a = cos(angle), sin(angle)
            intercept = pxl_intercept / dpi * inch
            # hOCR y grows downwards, PDF y grows upwards: flip around height
            baseline_y2 = height - (line_box['y2'] + intercept)

            # configure options
            text = pdf.beginText()
            text.setFont(fontname, fontsize)
            pdf.setFillColor(black)
            if invisible_text:
                text.setTextRenderMode(3)  # invisible text

            # transform overlayed text
            text.setTextTransform(
                cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)

            for elem in line.findall(".//span[@class='ocrx_word']"):
                # replace unsupported characters
                elemtxt = self._normalize_word_text(self._get_element_text(elem))
                if not elemtxt:
                    continue

                # compute string width
                box = self._pt_from_pixel(self._element_coordinates(elem), dpi)
                if add_spaces:
                    elemtxt += ' '
                    box_width = (box['x2']
                                 + pdf.stringWidth(elemtxt, fontname, fontsize)
                                 - box['x1'])
                else:
                    box_width = box['x2'] - box['x1']
                font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

                # Adjust relative position of cursor
                cursor = text.getStartOfLine()
                dx = box['x1'] - cursor[0]
                dy = baseline_y2 - cursor[1]
                text.moveCursor(dx, dy)

                # suppress text if it is 0 units wide
                if font_width > 0:
                    # stretch the word horizontally so it spans its box
                    text.setHorizScale(100 * box_width / font_width)
                    text.textOut(elemtxt)
            pdf.drawText(text)

        # overlay image if provided
        if image is not None:
            pdf.drawImage(ImageReader(Image.fromarray(image)),
                          0, 0, width=width, height=height)
        pdf.save()
|