Spaces:
Runtime error
Runtime error
#-*- coding: UTF-8 -*- | |
# Copyright 2022 the HuggingFace Team. | |
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import base64 | |
import traceback | |
import gradio as gr | |
import cv2 | |
from paddlenlp import Taskflow | |
from paddlenlp.utils.doc_parser import DocParser | |
# Shared document helper: used below to read input images and to draw
# extraction results onto them.
doc_parser = DocParser()

# Shared extraction pipeline. The empty schema is only a placeholder;
# run_taskflow() installs the real schema on every request.
task_instance = Taskflow(
    "information_extraction",
    model="uie-x-base",
    schema="",
)
# Bundled demo inputs shown in the UI: [image file, schema string] pairs.
examples = [
    ["business_card.png", "Name;Title;Web Link;Email;Address"],
    ["license.jpeg", "Name;DOB;ISS;EXP"],
    ["invoice.jpeg", "名称;纳税人识别号;开票日期"],
    [
        "custom.jpeg",
        "收发货人;进口口岸;进口日期;运输方式;征免性质;境内目的地;运输工具名称;包装种类;件数;合同协议号",
    ],
    ["resume.png", "职位;年龄;学校|时间;学校|专业"],
]

# Reverse lookup: schema string -> example image file.
example_files = {schema: filename for filename, schema in examples}

# OCR language associated with each example image.
lang_map = {
    "resume.png": "ch",
    "custom.jpeg": "ch",
    "business_card.png": "en",
    "invoice.jpeg": "ch",
    "license.jpeg": "en",
}
def dbc2sbc(s):
    """Convert full-width (DBC) characters in ``s`` to half-width (SBC).

    * U+3000 (ideographic space) becomes an ASCII space.
    * U+FF01..U+FF5E (full-width ASCII variants) are shifted down by
      0xFEE0 onto U+0021..U+007E.
    * Every other character is kept unchanged.

    Args:
        s: Input text.

    Returns:
        The converted string.
    """
    converted = []
    for char in s:
        code = ord(char)
        if code == 0x3000:
            # Handled on its own branch: 0x20 lies outside the
            # 0x21..0x7E range used for the shifted characters, so
            # routing it through that check would leave the
            # ideographic space unconverted.
            converted.append(" ")
        elif 0xFF01 <= code <= 0xFF5E:
            converted.append(chr(code - 0xFEE0))
        else:
            converted.append(char)
    return "".join(converted)
def process_path(path):
    """Load the document at ``path`` and build the matching UI updates.

    The returned tuple has exactly one entry per output component wired
    to this handler, in this order:
    ``(document, image, img_clear_button, output, url_error)``.

    Args:
        path: Path or URL of the document to load; may be empty/None.

    Returns:
        A 5-tuple. On success the document path is stored, the preview
        gallery and the clear button are shown, and the result/error
        widgets are hidden. On failure, or when ``path`` is empty,
        everything is reset and, if loading raised, the error text is
        shown in the ``url_error`` slot.
    """
    error = None
    if path:
        try:
            images_list = [doc_parser.read_image(path)]
        except Exception as e:
            # Best-effort UI handler: log the traceback and surface the
            # message to the user instead of crashing the event.
            traceback.print_exc()
            error = str(e)
        else:
            return (
                path,
                gr.update(visible=True, value=images_list),
                gr.update(visible=True),
                gr.update(visible=False, value=None),
                gr.update(visible=False, value=None),
            )

    if error is not None:
        url_error_update = gr.update(visible=True, value=error)
    else:
        url_error_update = gr.update(visible=False, value=None)
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        url_error_update,
    )
def process_upload(file):
    """Handle a change of the upload widget.

    Args:
        file: The uploaded file object (its ``name`` attribute is used
            as the path to load), or a falsy value when the upload was
            cleared.

    Returns:
        One value per output wired to this handler, in the order
        ``(document, image, img_clear_button, output, url_error)``:
        the result of ``process_path`` for a real upload, otherwise a
        5-tuple that clears the document and hides every widget.
    """
    if file:
        return process_path(file.name)
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        gr.update(visible=False, value=None),
    )
def BGR2RGB(img):
    """Return a copy of ``img`` with channels 0 and 2 exchanged.

    The input array is left untouched; any further channels keep their
    position.
    """
    swapped = img.copy()
    # The right-hand side reads from the untouched input, so both
    # assignments see the original channel data.
    swapped[:, :, 0], swapped[:, :, 2] = img[:, :, 2], img[:, :, 0]
    return swapped
def np2base64(image_np):
    """Encode an image array as a base64 JPEG string.

    Channels 0 and 2 are swapped first, then the image is JPEG-encoded
    with OpenCV and the encoded buffer is returned as base64 text.
    """
    swapped = BGR2RGB(image_np)
    _, encoded = cv2.imencode('.jpg', swapped)
    # base64 output is pure ASCII, so decoding yields the bare text.
    return base64.b64encode(encoded).decode("ascii")
def get_schema(schema_str):
    """Parse a schema string into the list form and detect its language.

    Items are separated by ``;``. A plain item is an entity label. An
    item of the form ``subject|relation[|relation...]`` declares
    relations; relations sharing a subject are merged into a single
    ``{subject: [relations...]}`` entry placed where the subject first
    appeared as a relation.

    Surrounding whitespace is stripped from every label. Empty items
    (e.g. from a trailing ``;``) and empty relation names are ignored;
    an item whose relations are all empty is treated as a plain entity.

    Args:
        schema_str: Schema text such as ``"Name;Person|Email"``.

    Returns:
        ``(schema_list, schema_lang)`` where ``schema_lang`` is ``"ch"``
        if the text contains a character in U+4E00..U+9FFF, else
        ``"en"``.
    """
    def _is_ch(s):
        return any("\u4e00" <= ch <= "\u9fff" for ch in s)

    schema_lang = "ch" if _is_ch(schema_str) else "en"

    schema_list = []
    # subject -> the relation list stored inside schema_list, so later
    # items with the same subject can extend it without rescanning.
    relation_groups = {}
    for item in schema_str.split(";"):
        parts = [part.strip() for part in item.split("|")]
        subject = parts[0]
        relations = [rel for rel in parts[1:] if rel]
        if not subject:
            continue
        if not relations:
            schema_list.append(subject)
        elif subject in relation_groups:
            relation_groups[subject].extend(relations)
        else:
            relation_groups[subject] = relations
            schema_list.append({subject: relations})
    return schema_list, schema_lang
def run_taskflow(document, schema, argument):
    """Run the shared extraction task on ``document`` with ``schema``.

    ``argument`` is accepted but currently not applied: the call that
    would forward it to the task is disabled below.
    """
    task_instance.set_schema(schema)
    # task_instance.set_argument(argument)
    payload = {'doc': document}
    return task_instance(payload)
def process_doc(document, schema, ocr_lang, layout_analysis):
    """Extract information from ``document`` and build the UI updates.

    Args:
        document: Document to process; ``None`` means nothing is loaded.
        schema: Schema text typed by the user; a built-in default is
            used when it is empty.
        ocr_lang: OCR language choice from the UI.
        layout_analysis: Layout-analysis choice from the UI.

    Returns:
        ``(None, None)`` when there is no document, otherwise a pair of
        updates: the annotated preview image list and the prediction.
    """
    if document is None:
        return None, None

    schema_text = schema if schema else '时间;组织机构;人物'
    # Normalise full-width punctuation before splitting on ';' and '|'.
    schema, schema_lang = get_schema(dbc2sbc(schema_text))

    argument = dict(
        ocr_lang=ocr_lang,
        schema_lang=schema_lang,
        layout_analysis=layout_analysis,
    )
    prediction = run_taskflow(document, schema, argument)[0]

    img_show = doc_parser.write_image_with_results(
        document, result=prediction, return_image=True
    )
    return (
        gr.update(visible=True, value=[img_show]),
        gr.update(visible=True, value=prediction),
    )
def load_example_document(img, schema, ocr_lang, layout_analysis):
    """Load the bundled example matching ``schema`` and run extraction.

    The incoming ``ocr_lang`` and ``layout_analysis`` values are
    overridden by the settings registered for the example file.

    Returns:
        ``(document, schema, preview, clear_button_update, answer)``;
        when ``img`` is ``None`` everything is reset instead.
    """
    if img is None:
        return None, None, None, gr.update(visible=False), None

    document = example_files[schema]
    # A lang_map value may carry a "-suffix"; its presence switches
    # layout analysis on.
    lang_parts = lang_map[document].split("-")
    ocr_lang = lang_parts[0]
    layout_analysis = len(lang_parts) != 1

    preview, answer = process_doc(document, schema, ocr_lang, layout_analysis)
    return document, schema, preview, gr.update(visible=True), answer
def read_content(file_path: str) -> str:
    """Return the full text of ``file_path``, decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
# Custom stylesheet passed to gr.Blocks(css=...). Selectors target the
# elem_id values assigned to components in the UI definition.
CSS = """
#prompt input {
    font-size: 16px;
}
#url-textbox {
    padding: 0 !important;
}
#short-upload-box .w-full {
    min-height: 10rem !important;
}
/* I think something like this can be used to re-shape
 * the table
 */
/*
.gr-samples-table tr {
    display: inline;
}
.gr-samples-table .p-2 {
    width: 100px;
}
*/
#select-a-file {
    width: 100%;
}
#file-clear {
    padding-top: 2px !important;
    padding-bottom: 2px !important;
    padding-left: 8px !important;
    padding-right: 8px !important;
    margin-top: 10px;
}
.gradio-container .gr-button-primary {
    background: linear-gradient(180deg, #CDF9BE 0%, #AFF497 100%);
    border: 1px solid #B0DCCC;
    border-radius: 8px;
    color: #1B8700;
}
.gradio-container.dark button#submit-button {
    background: linear-gradient(180deg, #CDF9BE 0%, #AFF497 100%);
    border: 1px solid #B0DCCC;
    border-radius: 8px;
    color: #1B8700
}
table.gr-samples-table tr td {
    border: none;
    outline: none;
}
table.gr-samples-table tr td:first-of-type {
    width: 0%;
}
div#short-upload-box div.absolute {
    display: none !important;
}
gradio-app > div > div > div > div.w-full > div, .gradio-app > div > div > div > div.w-full > div {
    gap: 0px 2%;
}
gradio-app div div div div.w-full, .gradio-app div div div div.w-full {
    gap: 0px;
}
gradio-app h2, .gradio-app h2 {
    padding-top: 10px;
}
#answer {
    overflow-y: scroll;
    color: white;
    background: #666;
    border-color: #666;
    font-size: 20px;
    font-weight: bold;
}
#answer span {
    color: white;
}
#answer textarea {
    color:white;
    background: #777;
    border-color: #777;
    font-size: 18px;
}
#url-error input {
    color: red;
}
"""
# Build the Gradio Blocks UI: page header and intro text, file-selection
# widgets (URL box, upload box, examples), schema/option controls, the
# JSON output panel, and the event wiring between them.
# NOTE(review): the indentation of this section is not visible here, so
# the nesting of the Row/Column containers cannot be confirmed from this
# view; the statements are kept exactly as found.
with gr.Blocks(css=CSS) as demo: | |
gr.HTML(read_content("header.html")) | |
# NOTE(review): the adjacent string literals below are concatenated with
# no separator, so "customization workflow." runs straight into
# "For more details" - a space appears to be missing.
gr.Markdown( | |
"**UIE-X 🧾 🎓** is a universal information extraction engine which supports both document and text inputs. It is powered by BAIDU and released on PaddleNLP. " | |
"Our extraction target(schema) can be set in natural language without limitation, and it also supports most extraction tasks. " | |
"The model performs well on zero-shot and few-shot settings. Moreover, on PaddleNLP, we provide a comprehensive and easy-to-use fine-tuning customization workflow." | |
"For more details, please visit the [GitHub](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction)" | |
) | |
# Holder for the currently loaded document; it is the first output of the
# upload / URL / example handlers and the first input of process_doc.
document = gr.Variable() | |
# NOTE(review): is_text is not referenced anywhere else in the visible code.
is_text = gr.Variable() | |
# Hidden components that gr.Examples fills in; a change of example_image
# triggers load_example_document (wired near the end of this section).
example_schema = gr.Textbox(visible=False) | |
example_image = gr.Image(visible=False) | |
with gr.Row(equal_height=True): | |
with gr.Column(): | |
with gr.Row(): | |
gr.Markdown("## 1. 选择文件 / Select a file 📄", elem_id="select-a-file") | |
img_clear_button = gr.Button( | |
"Clear", variant="secondary", elem_id="file-clear", visible=False | |
) | |
# Preview gallery; hidden until a document has been loaded.
image = gr.Gallery(visible=False) | |
with gr.Row(equal_height=True): | |
with gr.Column(): | |
with gr.Row(): | |
url = gr.Textbox( | |
show_label=False, | |
placeholder="URL", | |
lines=1, | |
max_lines=1, | |
elem_id="url-textbox", | |
) | |
submit = gr.Button("Get") | |
# Read-only box for load errors; hidden by default.
url_error = gr.Textbox( | |
visible=False, | |
elem_id="url-error", | |
max_lines=1, | |
interactive=False, | |
label="Error", | |
) | |
gr.Markdown("— or —") | |
upload = gr.File(label=None, interactive=True, elem_id="short-upload-box") | |
gr.Examples( | |
examples=examples, | |
inputs=[example_image, example_schema], | |
) | |
with gr.Column(): | |
gr.Markdown("## 2. 信息抽取 / Information extraction ℹ️ ") | |
gr.Markdown("### 👉 设置schema") | |
gr.Markdown("实体抽取:实体类别之间以';'分割,例如 **人物;组织机构**") | |
gr.Markdown("关系抽取:需配置主体和关系类别,中间以'|'分割,例如 **人物|出生时间;人物|邮箱**") | |
gr.Markdown("### 👉 Set a schema") | |
gr.Markdown("Entity extraction: entity label should be separated by ';', e.g. **Person;Organization**") | |
gr.Markdown("Relation extraction: set the subject and relation type, separated by '|', e.g. **Person|Date;Person|Email**") | |
gr.Markdown("### 💪 模型定制 / Model customization") | |
gr.Markdown("我们建议通过[数据标注+微调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document)的流程进一步增强模型在特定场景的效果") | |
gr.Markdown("We recommend to further improve the extraction performance in specific domain through the process of [data annotation & fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document)") | |
schema = gr.Textbox( | |
label="Schema", | |
placeholder="e.g. Name|Company;Name|Position;Email;Phone Number", | |
lines=1, | |
max_lines=1, | |
) | |
ocr_lang = gr.Radio( | |
choices=["ch", "en"], | |
value="en", | |
label="OCR语言 / OCR Language (Please choose ch for Chinese images.)", | |
) | |
# This radio yields the strings "yes"/"no" rather than booleans.
layout_analysis = gr.Radio( | |
choices=["yes", "no"], | |
value="no", | |
label="版面分析 / Layout analysis (Better extraction for multi-line text)", | |
) | |
with gr.Row(): | |
clear_button = gr.Button("Clear", variant="secondary") | |
submit_button = gr.Button( | |
"Submit", variant="primary", elem_id="submit-button" | |
) | |
with gr.Column(): | |
output = gr.JSON(label="Output", visible=False) | |
# Both Clear buttons share one handler: it returns nine values, one per
# entry in `outputs`, resetting or hiding each of those components.
for cb in [img_clear_button, clear_button]: | |
cb.click( | |
lambda _: ( | |
gr.update(visible=False, value=None), | |
None, | |
gr.update(visible=False, value=None), | |
gr.update(visible=False), | |
None, | |
None, | |
None, | |
gr.update(visible=False, value=None), | |
None, | |
), | |
inputs=clear_button, | |
outputs=[ | |
image, | |
document, | |
output, | |
img_clear_button, | |
example_image, | |
upload, | |
url, | |
url_error, | |
schema, | |
], | |
) | |
# Loading a document, either from an upload or from the URL box. Five
# output components are listed, so a handler wired here has to supply one
# value for each of them, in this order.
upload.change( | |
fn=process_upload, | |
inputs=[upload], | |
outputs=[document, image, img_clear_button, output, url_error], | |
) | |
submit.click( | |
fn=process_path, | |
inputs=[url], | |
outputs=[document, image, img_clear_button, output, url_error], | |
) | |
# Pressing Enter in the schema box and clicking Submit run the same
# extraction handler with the same inputs and outputs.
schema.submit( | |
fn=process_doc, | |
inputs=[document, schema, ocr_lang, layout_analysis], | |
outputs=[image, output], | |
) | |
submit_button.click( | |
fn=process_doc, | |
inputs=[document, schema, ocr_lang, layout_analysis], | |
outputs=[image, output], | |
) | |
# Picking an example changes the hidden example_image, which loads the
# example document and runs extraction on it.
example_image.change( | |
fn=load_example_document, | |
inputs=[example_image, example_schema, ocr_lang, layout_analysis], | |
outputs=[document, schema, image, img_clear_button, output], | |
) | |
gr.Markdown("[![Stargazers repo roster for @PaddlePaddle/PaddleNLP](https://reporoster.com/stars/PaddlePaddle/PaddleNLP)](https://github.com/PaddlePaddle/PaddleNLP)") | |
gr.HTML(read_content("footer.html")) | |
# Start the demo server only when this file is executed as a script.
if __name__ == "__main__": | |
demo.launch(enable_queue=False) | |