import os
import importlib.metadata
import importlib.util  # required for importlib.util.find_spec below
from os import getcwd, path, environ
from dotenv import load_dotenv
import json


# Work around for pinning a specific gradio version on Spaces:
# https://discuss.huggingface.co/t/how-to-install-a-specific-version-of-gradio-in-spaces/13552
def check_additional_requirements():
    """Install detectron2 if it is missing and pin gradio to 3.44.3."""
    if importlib.util.find_spec("detectron2") is None:
        os.system("pip install detectron2@git+https://github.com/deepdoctection/detectron2.git")
    if importlib.util.find_spec("gradio") is not None:
        if importlib.metadata.version("gradio") != "3.44.3":
            os.system("pip uninstall -y gradio")
            os.system("pip install gradio==3.44.3")
    else:
        os.system("pip install gradio==3.44.3")


load_dotenv()
check_additional_requirements()

import deepdoctection as dd
from deepdoctection.dataflow.serialize import DataFromList
import time
import gradio as gr
from botocore.config import Config


# name of the pipeline configuration file, expected next to this script
_DD_ONE = "conf_dd_one.yaml"

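# Register the three custom Detectron2 model profiles (layout, cell and row/column
# detection). Weights and configs are hosted in Hugging Face repos whose ids are read
# from the environment (HF_REPO_LAYOUT, HF_REPO_CELL, HF_REPO_ITEM) and downloaded on
# demand via the ModelDownloadManager below.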
dd.ModelCatalog.register("xrf_layout/model_final_inf_only.pt",dd.ModelProfile(
            name="xrf_layout/model_final_inf_only.pt",
            description="layout_detection/morning-dragon-114",
            config="xrf_dd/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
            size=[274632215],
            tp_model=False,
            hf_repo_id=environ.get("HF_REPO_LAYOUT"),
            hf_model_name="model_final_inf_only.pt",
            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
            categories={"1": dd.LayoutType.text,
                        "2": dd.LayoutType.title,
                        "3": dd.LayoutType.list,
                        "4": dd.LayoutType.table,
                        "5": dd.LayoutType.figure},
            model_wrapper="D2FrcnnDetector",
        ))

dd.ModelCatalog.register("xrf_cell/model_final_inf_only.pt", dd.ModelProfile(
            name="xrf_cell/model_final_inf_only.pt",
            description="cell_detection/restful-eon-6",
            config="xrf_dd/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
            size=[274583063],
            tp_model=False,
            hf_repo_id=environ.get("HF_REPO_CELL"),
            hf_model_name="model_final_inf_only.pt",
            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
            categories={"1": dd.LayoutType.cell},
            model_wrapper="D2FrcnnDetector",
        ))

dd.ModelCatalog.register("xrf_item/model_final_inf_only.pt", dd.ModelProfile(
            name="xrf_item/model_final_inf_only.pt",
            description="item_detection/firm_plasma_14",
            config="xrf_dd/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
            size=[274595351],
            tp_model=False,
            hf_repo_id=environ.get("HF_REPO_ITEM"),
            hf_model_name="model_final_inf_only.pt",
            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
            categories={"1": dd.LayoutType.row, "2": dd.LayoutType.column},
            model_wrapper="D2FrcnnDetector",
        ))

# Set up configuration and logging. The models are defined globally so that they are
# not re-loaded whenever the input updates.
cfg = dd.set_config_by_yaml(path.join(getcwd(),_DD_ONE))
cfg.freeze(freezed=False)
cfg.DEVICE = "cpu"
cfg.freeze()

# layout detector
layout_config_path = dd.ModelCatalog.get_full_path_configs(cfg.CONFIG.D2LAYOUT)
layout_weights_path = dd.ModelDownloadManager.maybe_download_weights_and_configs(cfg.WEIGHTS.D2LAYOUT)
categories_layout = dd.ModelCatalog.get_profile(cfg.WEIGHTS.D2LAYOUT).categories
assert categories_layout is not None
assert layout_weights_path is not None
d_layout = dd.D2FrcnnDetector(layout_config_path, layout_weights_path, categories_layout, device=cfg.DEVICE)

# cell detector
cell_config_path = dd.ModelCatalog.get_full_path_configs(cfg.CONFIG.D2CELL)
cell_weights_path = dd.ModelDownloadManager.maybe_download_weights_and_configs(cfg.WEIGHTS.D2CELL)
categories_cell = dd.ModelCatalog.get_profile(cfg.WEIGHTS.D2CELL).categories
assert categories_cell is not None
d_cell = dd.D2FrcnnDetector(cell_config_path, cell_weights_path, categories_cell, device=cfg.DEVICE)

# row/column detector
item_config_path = dd.ModelCatalog.get_full_path_configs(cfg.CONFIG.D2ITEM)
item_weights_path = dd.ModelDownloadManager.maybe_download_weights_and_configs(cfg.WEIGHTS.D2ITEM)
categories_item = dd.ModelCatalog.get_profile(cfg.WEIGHTS.D2ITEM).categories
assert categories_item is not None
d_item = dd.D2FrcnnDetector(item_config_path, item_weights_path, categories_item, device=cfg.DEVICE)


# text detector
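# Textract credentials and region are read from the environment (ACCESS_KEY, SECRET_KEY,
# REGION), e.g. via the .env file loaded above.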
credentials_kwargs={"aws_access_key_id": os.environ["ACCESS_KEY"],
                    "aws_secret_access_key": os.environ["SECRET_KEY"],
                    "config": Config(region_name=os.environ["REGION"])}
tex_text = dd.TextractOcrDetector(**credentials_kwargs)


def build_gradio_analyzer():
    """Building the Detectron2/DocTr analyzer based on the given config"""

    cfg.freeze(freezed=False)
    cfg.TAB = True
    cfg.TAB_REF = True
    cfg.OCR = True
    cfg.freeze()

    pipe_component_list = []
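    # Assemble the pipeline as an ordered list of services: layout detection, NMS,
    # table recognition (cells, rows/columns, segmentation, refinement), OCR,
    # word matching and reading order.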
    layout = dd.ImageLayoutService(d_layout, to_image=True, crop_image=True)
    pipe_component_list.append(layout)

    nms_service = dd.AnnotationNmsService(nms_pairs=cfg.LAYOUT_NMS_PAIRS.COMBINATIONS,
                                          thresholds=cfg.LAYOUT_NMS_PAIRS.THRESHOLDS)
    pipe_component_list.append(nms_service)

    if cfg.TAB:
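        # Table recognition: run the cell and the row/column detector on each detected
        # table crop. The integer dicts re-map the sub-detectors' category ids to
        # pipeline-wide ids (here cells -> 6, rows -> 7, columns -> 8).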

        detect_result_generator = dd.DetectResultGenerator(categories_cell)
        cell = dd.SubImageLayoutService(d_cell, dd.LayoutType.table, {1: 6}, detect_result_generator)
        pipe_component_list.append(cell)

        detect_result_generator = dd.DetectResultGenerator(categories_item)
        item = dd.SubImageLayoutService(d_item, dd.LayoutType.table, {1: 7, 2: 8}, detect_result_generator)
        pipe_component_list.append(item)

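        # Assign the detected cells to rows and columns to recover the table structure.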
        table_segmentation = dd.TableSegmentationService(
            cfg.SEGMENTATION.ASSIGNMENT_RULE,
            cfg.SEGMENTATION.THRESHOLD_ROWS,
            cfg.SEGMENTATION.THRESHOLD_COLS,
            cfg.SEGMENTATION.FULL_TABLE_TILING,
            cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
            cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
            dd.LayoutType.table,
            [dd.CellType.header, dd.CellType.body, dd.LayoutType.cell],
            [dd.LayoutType.row, dd.LayoutType.column],
            [dd.CellType.row_number, dd.CellType.column_number],
            cfg.SEGMENTATION.STRETCH_RULE
        )
        pipe_component_list.append(table_segmentation)

        if cfg.TAB_REF:
            table_segmentation_refinement = dd.TableSegmentationRefinementService()
            pipe_component_list.append(table_segmentation_refinement)

    if cfg.OCR:
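        # OCR branch: extract words with Textract, match them to layout blocks and
        # derive a reading order over the matched text.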

        t_text = dd.TextExtractionService(tex_text)
        pipe_component_list.append(t_text)

        match_words = dd.MatchingService(
            parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
            child_categories=cfg.WORD_MATCHING.CHILD_CATEGORIES,
            matching_rule=cfg.WORD_MATCHING.RULE,
            threshold=cfg.WORD_MATCHING.THRESHOLD,
            max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY
        )
        pipe_component_list.append(match_words)

        order = dd.TextOrderService(
            text_container=cfg.TEXT_ORDERING.TEXT_CONTAINER,
            floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK,
            text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK,
            include_residual_text_container=cfg.TEXT_ORDERING.TEXT_CONTAINER_TO_TEXT_BLOCK)
        pipe_component_list.append(order)

    pipe = dd.DoctectionPipe(pipeline_component_list=pipe_component_list)

    return pipe


def analyze_image(img, pdf, max_datapoints):
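    """Run the pipeline on an uploaded image or on up to `max_datapoints` pages of a PDF
    and return the visualized pages, the contiguous text, tables as HTML and the raw JSON."""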

    # create an image object and pass it to the analyzer via a dataflow
    analyzer = build_gradio_analyzer()

    if img is not None:
        image = dd.Image(file_name=str(time.time()).replace(".","") + ".png", location="")
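        # gradio delivers RGB arrays; flip the channels to BGR, which the cv2-based pipeline expects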
        image.image = img[:, :, ::-1]

        df = DataFromList(lst=[image])
        df = analyzer.analyze(dataset_dataflow=df)
    elif pdf:
        df = analyzer.analyze(path=pdf.name, max_datapoints=max_datapoints)
    else:
        raise ValueError("Please upload an image or a PDF document")

    df.reset_state()

    layout_items_str = ""
    jsonl_out = []
    dpts = []
    html_list = []

    for dp in df:
        dpts.append(dp)
        out = dp.as_dict()
        jsonl_out.append(out)
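        # strip the raw image payload from the dict (and thereby from jsonl_out) to keep the JSON output small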
        out.pop("_image")
        layout_items = [layout for layout in dp.layouts if layout.reading_order is not None]
        layout_items.sort(key=lambda x: x.reading_order)
        layout_items_str += f"\n\n -------- PAGE NUMBER: {dp.page_number+1} ------------- \n"
        for item in layout_items:
            layout_items_str += f"\n {item.category_name}: {item.text}"
        html_list.extend([table.html for table in dp.tables])
    if html_list:
        html = ("<br /><br /><br />").join(html_list)
    else:
        html = None
    json_object = json.dumps(jsonl_out, indent=4)
    return [dp.viz(show_cells=False) for dp in dpts], layout_items_str, html, json_object


demo = gr.Blocks(css="scrollbar.css")
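# UI layout: upload widgets and examples on top, outputs (image gallery, contiguous text,
# HTML tables and JSON) below; the "Run model" button is wired to analyze_image.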


with demo:
    with gr.Box():
        gr.Markdown("<h1><center>deepdoctection - A Document AI Package</center></h1>")
        gr.Markdown("<strong>deep</strong>doctection is a Python library that orchestrates document extraction"
                    " and document layout analysis tasks using deep learning models. It does not implement models"
                    " but enables you to build pipelines using highly acknowledged libraries for object detection,"
                    " OCR and selected NLP tasks and provides an integrated frameworks for fine-tuning, evaluating"
                    " and running models.<br />"
                    "This pipeline consists of a stack of models powered by <strong>Detectron2"
                    "</strong> for layout analysis and table recognition. OCR will be provided as well. You can process"
                    "an image or even a PDF-document. Up to nine pages can be processed. <br />")
        gr.Markdown("<center><strong>Please note:</strong> The models for layout detection and table recognition are not open sourced. "
        "When you start using deepdoctection you will get models that have been trained on less diversified data and that will perform worse. "
        "OCR isn't open sourced either: It uses AWS Textract, which is a commercial service. Keep this in mind, before you get started with "
        "your installation and observe dissapointing results. Thanks. </center>")
        gr.Markdown("[https://github.com/deepdoctection/deepdoctection](https://github.com/deepdoctection/deepdoctection)")
    with gr.Box():
        gr.Markdown("<h2><center>Upload a document and choose setting</center></h2>")
        with gr.Row():
            with gr.Column():
                with gr.Tab("Image upload"):
                    with gr.Column():
                        inputs = gr.Image(type='numpy', label="Original Image")
                with gr.Tab("PDF upload (only first image will be processed) *"):
                    with gr.Column():
                        inputs_pdf = gr.File(label="PDF")
                    gr.Markdown("<sup>* If an image is cached in tab, remove it first</sup>")
            with gr.Column():
                gr.Examples(
                    examples=[path.join(getcwd(), "sample_1.jpg"), path.join(getcwd(), "sample_2.png")],
                    inputs = inputs)
                gr.Examples(examples=[path.join(getcwd(), "sample_3.pdf")], inputs = inputs_pdf)

        with gr.Row():
            max_imgs = gr.Slider(1, 8, value=2, step=1, label="Number of pages in a multi-page PDF",
                                 info="Will stop after 9 pages")

        with gr.Row():
            btn = gr.Button("Run model", variant="primary")

    with gr.Box():
        gr.Markdown("<h2><center>Outputs</center></h2>")
        with gr.Row():
            with gr.Column():
                with gr.Box():
                    gr.Markdown("<center><strong>Contiguous text</strong></center>")
                    image_text = gr.Textbox()
            with gr.Column():
                with gr.Box():
                    gr.Markdown("<center><strong>Layout detection</strong></center>")
                    gallery = gr.Gallery(
                        label="Output images", show_label=False, elem_id="gallery"
                    ).style(grid=2)
        with gr.Row():
            with gr.Box():
                gr.Markdown("<center><strong>Table</strong></center>")
                html = gr.HTML()

        with gr.Row():
            with gr.Box():
                gr.Markdown("<center><strong>JSON</strong></center>")
                json_output = gr.JSON()

    btn.click(fn=analyze_image, inputs=[inputs, inputs_pdf,  max_imgs],
              outputs=[gallery, image_text, html, json_output])

demo.launch()