pierreguillou
committed on
Commit
•
e9594d5
1
Parent(s):
a6fbb01
Update app.py
Browse files
app.py
CHANGED
@@ -36,18 +36,26 @@ from functions import *
|
|
36 |
# update pip
|
37 |
os.system('python -m pip install --upgrade pip')
|
38 |
|
39 |
-
|
40 |
-
|
|
|
41 |
|
42 |
import torch
|
43 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
model = AutoModelForTokenClassification.from_pretrained(model_id);
|
49 |
model.to(device);
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
# APP outputs
|
52 |
def app_outputs(uploaded_pdf):
|
53 |
filename, msg, images = pdf_to_images(uploaded_pdf)
|
@@ -104,17 +112,16 @@ def app_outputs(uploaded_pdf):
|
|
104 |
return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
|
105 |
|
106 |
# gradio APP
|
107 |
-
with gr.Blocks(title="Inference APP for Document Understanding at line level (
|
108 |
gr.HTML("""
|
109 |
-
<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at line level (
|
110 |
-
<div style="margin-top: 40px"><p
|
111 |
-
<div><p
|
112 |
-
<div><p
|
113 |
-
<div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, let's run in this APP an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then run LiLT (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens and then, visualize the result at line level!</p></div>
|
114 |
<div><p><b>It allows to get all pages of any PDF (of any language) with bounding boxes labeled at line level and the associated dataframes with labeled data (bounding boxes, texts, labels) :-)</b></p></div>
|
115 |
-
<div><p>However, the inference time per page can be high when running the model on CPU due to the number of line predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP in Hugging Face Space (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/
|
116 |
<div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p>
|
117 |
-
<ul><li>(
|
118 |
""")
|
119 |
with gr.Row():
|
120 |
pdf_file = gr.File(label="PDF")
|
|
|
36 |
# update pip
|
37 |
os.system('python -m pip install --upgrade pip')
|
38 |
|
39 |
+
## model / feature extractor / tokenizer
|
40 |
+
|
41 |
+
from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast,
|
42 |
|
43 |
import torch
|
44 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
45 |
|
46 |
+
# model
|
47 |
+
# tokenizer = LayoutXLMTokenizerFast.from_pretrained(model_id)
|
48 |
+
model = LayoutLMv2ForTokenClassification.from_pretrained(model_id);
|
|
|
49 |
model.to(device);
|
50 |
|
51 |
+
# feature extractor
|
52 |
+
from transformers import LayoutLMv2FeatureExtractor
|
53 |
+
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
54 |
+
|
55 |
+
# tokenizer
|
56 |
+
from transformers import AutoTokenizer
|
57 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
|
58 |
+
|
59 |
# APP outputs
|
60 |
def app_outputs(uploaded_pdf):
|
61 |
filename, msg, images = pdf_to_images(uploaded_pdf)
|
|
|
112 |
return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
|
113 |
|
114 |
# gradio APP
|
115 |
+
with gr.Blocks(title="Inference APP for Document Understanding at line level (v2 - LayoutXLM base)", css=".gradio-container") as demo:
|
116 |
gr.HTML("""
|
117 |
+
<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at line level (v2 - LayoutXLM base)</h1></div>
|
118 |
+
<div style="margin-top: 40px"><p>(03/05/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">model Layout XLM base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at line level</a> (chunk size of 384 tokens).</p></div>
|
119 |
+
<div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2104.08836" target="_blank">LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa base</a>, this finetuned model has the capacity to <b>understand any language</b>. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can <b>classify any bounding box (and its OCR text) to 11 labels</b> (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
|
120 |
+
<div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, let's run in this APP an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then run Layout XLM base (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens and then, visualize the result at line level!</p></div>
|
|
|
121 |
<div><p><b>It allows to get all pages of any PDF (of any language) with bounding boxes labeled at line level and the associated dataframes with labeled data (bounding boxes, texts, labels) :-)</b></p></div>
|
122 |
+
<div><p>However, the inference time per page can be high when running the model on CPU due to the number of line predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP in Hugging Face Space (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">notebook</a> on your own platform) and change the value of the parameter <code>max_imgboxes</code>, or run the inference notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">Document AI | Inference at line level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet dataset)</a>" on your own platform as it does not have this limit.</p></div>
|
123 |
<div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p>
|
124 |
+
<ul><li>(03/05/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at line level with LayoutXLM base</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
|
125 |
""")
|
126 |
with gr.Row():
|
127 |
pdf_file = gr.File(label="PDF")
|