vteam27 committed on
Commit
1cfd79c
1 Parent(s): dc813d0

added searchable pdf

Browse files
Files changed (3) hide show
  1. app.py +19 -2
  2. requirements.txt +3 -1
  3. utils.py +163 -0
app.py CHANGED
@@ -5,6 +5,8 @@ from doctr.io import DocumentFile
5
  from doctr.models import ocr_predictor
6
  import gradio as gr
7
  from PIL import Image
 
 
8
 
9
  predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)
10
 
@@ -15,6 +17,10 @@ def greet(img):
15
  img.save("out.jpg")
16
  doc = DocumentFile.from_images("out.jpg")
17
  output=predictor(doc)
 
 
 
 
18
  res=""
19
  for obj in output.pages:
20
  for obj1 in obj.blocks:
@@ -23,16 +29,27 @@ def greet(img):
23
  res=res + " " + obj3.value
24
  res=res + "\n"
25
  res=res + "\n"
 
26
  _output_name = "RESULT_OCR.txt"
 
 
27
  open(_output_name, 'w').close() # clear file
28
  with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
29
  f.write(res)
30
  print("Writing into file")
31
- return res, _output_name
 
 
 
 
 
 
 
 
32
 
33
  demo = gr.Interface(fn=greet,
34
  inputs=gr.Image(type="pil"),
35
- outputs=["text", "file"],
36
  title=title,
37
  description=description,
38
  examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]
 
5
  from doctr.models import ocr_predictor
6
  import gradio as gr
7
  from PIL import Image
8
+ import base64
9
+ from utils import HocrParser
10
 
11
  predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)
12
 
 
17
  img.save("out.jpg")
18
  doc = DocumentFile.from_images("out.jpg")
19
  output=predictor(doc)
20
+
21
+ xml_outputs = output.export_as_xml()
22
+ parser = HocrParser()
23
+
24
  res=""
25
  for obj in output.pages:
26
  for obj1 in obj.blocks:
 
29
  res=res + " " + obj3.value
30
  res=res + "\n"
31
  res=res + "\n"
32
+
33
  _output_name = "RESULT_OCR.txt"
34
+ _output_name_pdf="RESULT_OCR.pdf"
35
+
36
  open(_output_name, 'w').close() # clear file
37
  with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
38
  f.write(res)
39
  print("Writing into file")
40
+
41
+ base64_encoded_pdfs = list()
42
+ for i, (xml, img) in enumerate(zip(xml_outputs, doc)):
43
+ xml_element_tree = xml[1]
44
+ parser.export_pdfa(_output_name_pdf,
45
+ hocr=xml_element_tree, image=img)
46
+ with open(_output_name_pdf, 'rb') as f:
47
+ base64_encoded_pdfs.append(base64.b64encode(f.read()))
48
+ return res, _output_name, _output_name_pdf
49
 
50
  demo = gr.Interface(fn=greet,
51
  inputs=gr.Image(type="pil"),
52
+ outputs=["text", "file","file"],
53
  title=title,
54
  description=description,
55
  examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
  pycairo
2
  python-doctr[torch]@git+https://github.com/mindee/doctr.git
3
- gradio
 
 
 
1
  pycairo
2
  python-doctr[torch]@git+https://github.com/mindee/doctr.git
3
+ gradio
4
+ reportlab>=3.6.2
5
+ PyPDF2==1.26.0
utils.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import re
3
+ from tempfile import TemporaryDirectory
4
+ from math import atan, cos, sin
5
+ from typing import Dict, Optional, Tuple
6
+ from xml.etree import ElementTree as ET
7
+ from xml.etree.ElementTree import Element
8
+
9
+ import numpy as np
10
+ import PyPDF2
11
+ from PyPDF2 import PdfFileMerger
12
+ from doctr.io import DocumentFile
13
+ from doctr.models import ocr_predictor
14
+ from PIL import Image
15
+ from reportlab.lib.colors import black
16
+ from reportlab.lib.units import inch
17
+ from reportlab.lib.utils import ImageReader
18
+ from reportlab.pdfgen.canvas import Canvas
19
+
20
+
21
+
22
+
23
class HocrParser:
    """Render a hOCR document as a PDF carrying a positioned text layer.

    The parser reads ``bbox`` and ``baseline`` properties from the ``title``
    attribute of hOCR elements, converts pixel coordinates to PDF points and
    writes one text run per ``ocr_line`` span with ReportLab. An optional
    page image can be drawn on the same page.
    """

    # Single-code-point Unicode ligatures mapped to plain letters.
    # ``str.maketrans`` only accepts string keys of length 1, so the keys
    # must be the ligature characters themselves (U+FB00..U+FB04), not the
    # letter sequences they expand to. The ffi/ffl replacements keep a
    # zero-width non-joiner (U+200C) between the letters.
    _LIGATURE_TABLE = str.maketrans({
        '\ufb00': 'ff',
        '\ufb03': 'f\u200cf\u200ci',
        '\ufb04': 'f\u200cf\u200cl',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
    })

    def __init__(self):
        # "bbox x1 y1 x2 y2" -- four whitespace-separated integers.
        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
        # "baseline slope intercept" -- two signed decimal numbers.
        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')

    def _element_coordinates(self, element: Element) -> Dict:
        """
        Return the bounding box of an element as a dict of pixel values.

        The dict has the keys ``x1``, ``y1``, ``x2`` and ``y2``. All four are
        0 when the element has no ``title`` attribute or the title carries no
        ``bbox`` entry.
        """
        match = self.box_pattern.search(element.attrib.get('title', ''))
        if match is None:
            return {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
        x1, y1, x2, y2 = (int(value) for value in match.group(1).split())
        return {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}

    def _get_baseline(self, element: Element) -> Tuple[float, float]:
        """
        Return the baseline ``(slope, intercept)`` of an element.

        Falls back to ``(0.0, 0.0)`` when the element has no ``title``
        attribute or the title carries no ``baseline`` entry.
        """
        # ``search`` returns None when there is no baseline entry; guard it
        # instead of calling ``.group`` on None.
        match = self.baseline_pattern.search(element.attrib.get('title', ''))
        if match is None:
            return (0.0, 0.0)
        slope, intercept = match.group(1).split()
        return float(slope), float(intercept)

    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
        """
        Convert a bounding-box dict from pixels to PDF units (pt).

        ``pxl`` must provide the keys ``x1``, ``y1``, ``x2`` and ``y2``;
        ``dpi`` is the resolution used to map pixels to inches.
        """
        # Look values up by key rather than relying on dict ordering.
        return {key: pxl[key] / dpi * inch for key in ('x1', 'y1', 'x2', 'y2')}

    def _get_element_text(self, element: Element) -> str:
        """
        Return the text of the element, its descendants and its tail.
        """
        parts = [element.text or '']
        parts.extend(self._get_element_text(child) for child in element)
        parts.append(element.tail or '')
        return ''.join(parts)

    def _replace_ligatures(self, text: str) -> str:
        """
        Return ``text`` with Unicode ligature characters expanded to letters.
        """
        return text.translate(self._LIGATURE_TABLE)

    def export_pdfa(self,
                    out_filename: str,
                    hocr: ET.ElementTree,
                    image: Optional[np.ndarray] = None,
                    fontname: str = "Times-Roman",
                    fontsize: int = 12,
                    invisible_text: bool = True,
                    add_spaces: bool = True,
                    dpi: int = 300):
        """
        Generate a PDF document from a hOCR document.

        Args:
            out_filename: path of the PDF file to write.
            hocr: parsed hOCR tree; the first ``div`` with class ``ocr_page``
                defines the page size.
            image: optional page image, drawn over the full page.
            fontname: font used for the text layer.
            fontsize: font size used for the text layer.
            invisible_text: when true, text is written with render mode 3
                (invisible).
            add_spaces: when true, a trailing space is appended to each word.
            dpi: resolution used to convert hOCR pixels to PDF points.

        Raises:
            ValueError: if the hOCR tree contains no ``ocr_page`` div.
        """
        # The page size comes from the first ocr_page element only.
        page = hocr.find(".//div[@class='ocr_page']")
        if page is None:
            raise ValueError("Could not determine page size")
        page_box = self._pt_from_pixel(self._element_coordinates(page), dpi)
        width = page_box['x2'] - page_box['x1']
        height = page_box['y2'] - page_box['y1']

        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)

        for line in hocr.iterfind(".//span[@class='ocr_line']"):
            line_box = self._pt_from_pixel(
                self._element_coordinates(line), dpi)

            # Compute the baseline; near-zero slopes are treated as flat.
            slope, pxl_intercept = self._get_baseline(line)
            if abs(slope) < 0.005:
                slope = 0.0
            angle = atan(slope)
            cos_a, sin_a = cos(angle), sin(angle)
            intercept = pxl_intercept / dpi * inch
            # hOCR measures y downwards from the top of the page while the
            # PDF canvas measures upwards from the bottom, hence the flip.
            baseline_y2 = height - (line_box['y2'] + intercept)

            # Configure the text object for this line.
            text = pdf.beginText()
            text.setFont(fontname, fontsize)
            pdf.setFillColor(black)
            if invisible_text:
                text.setTextRenderMode(3)  # invisible text

            # Rotate the text run to follow the baseline and move it to the
            # start of the line.
            text.setTextTransform(
                cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)

            for elem in line.findall(".//span[@class='ocrx_word']"):
                elemtxt = self._replace_ligatures(
                    self._get_element_text(elem).strip())
                if not elemtxt:
                    continue

                # Width the word must occupy on the page.
                box = self._pt_from_pixel(
                    self._element_coordinates(elem), dpi)
                if add_spaces:
                    elemtxt += ' '
                    # NOTE(review): this widens the box by the width of the
                    # whole word plus the space, not just the space --
                    # confirm that is intended. Behavior kept as is.
                    box_width = (box['x2']
                                 + pdf.stringWidth(elemtxt, fontname, fontsize)
                                 - box['x1'])
                else:
                    box_width = box['x2'] - box['x1']
                font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

                # Move the cursor to the start of the word's box.
                cursor = text.getStartOfLine()
                text.moveCursor(box['x1'] - cursor[0],
                                baseline_y2 - cursor[1])

                # Suppress text that is 0 units wide (avoids dividing by 0).
                if font_width > 0:
                    # Stretch or squeeze the glyphs to span box_width.
                    text.setHorizScale(100 * box_width / font_width)
                    text.textOut(elemtxt)
            pdf.drawText(text)

        # Overlay the page image if provided.
        if image is not None:
            pdf.drawImage(ImageReader(Image.fromarray(image)),
                          0, 0, width=width, height=height)
        pdf.save()