xiaoyao9184 committed on
Commit
c9634a7
1 Parent(s): 5fd3ae2

Synced repo using 'sync_with_huggingface' Github Action

Browse files
Files changed (2) hide show
  1. gradio_app.py +207 -0
  2. requirements.txt +4 -0
gradio_app.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
# When the APP_PATH environment variable is set, make it the working
# directory before the marker imports that follow are executed.
if "APP_PATH" in os.environ:
    os.chdir(os.environ["APP_PATH"])
    # fix sys.path for import
    # NOTE(review): indentation was lost in this view; the append is assumed
    # to belong to the `if` body (only runs when APP_PATH is set) - confirm.
    sys.path.append(os.getcwd())
8
+
9
+ import gradio as gr
10
+
11
+ from marker.settings import settings
12
+
13
+ import base64
14
+ import io
15
+ import re
16
+ from typing import Any, Dict
17
+
18
+ import pypdfium2
19
+ from PIL import Image
20
+
21
+ from marker.converters.pdf import PdfConverter
22
+ from marker.models import create_model_dict
23
+ from marker.config.parser import ConfigParser
24
+ from marker.output import text_from_rendered
25
+
26
+
27
def load_models():
    """Create marker's model dictionary and return it.

    Thin wrapper around ``create_model_dict`` so the app has a single place
    where models are instantiated.
    """
    models = create_model_dict()
    return models
29
+
30
def convert_pdf(fname: str, **kwargs) -> Any:
    """Convert the document at ``fname`` with marker's ``PdfConverter``.

    Args:
        fname: Path of the file to convert.
        **kwargs: Options handed to ``ConfigParser`` (the caller in this file
            passes ``page_range``, ``force_ocr``, ``output_format``,
            ``output_dir`` and ``debug``).

    Returns:
        Whatever the converter returns for ``fname``. Callers in this file
        pass it to ``text_from_rendered`` and read its ``metadata`` mapping,
        so it is a rendered-output object rather than a tuple.
    """
    config_parser = ConfigParser(kwargs)
    config_dict = config_parser.generate_config_dict()
    # Force a single pdftext worker.
    # NOTE(review): presumably to avoid spawning worker processes inside the
    # web app - confirm.
    config_dict["pdftext_workers"] = 1
    converter = PdfConverter(
        config=config_dict,
        # Module-level model dictionary created once at startup.
        artifact_dict=model_dict,
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
    )
    return converter(fname)
41
+
42
def open_pdf(pdf_file):
    """Open ``pdf_file`` with pypdfium2 and return the ``PdfDocument``."""
    document = pypdfium2.PdfDocument(pdf_file)
    return document
44
+
45
def count_pdf(pdf_file):
    """Return the number of pages in ``pdf_file``.

    The document is closed before returning so repeated calls (one per UI
    event) do not accumulate open PDF handles.
    """
    doc = open_pdf(pdf_file)
    try:
        return len(doc)
    finally:
        doc.close()
48
+
49
def get_page_image(pdf_file, page_num, dpi=96):
    """Render one page of ``pdf_file`` and return it as an RGB PIL image.

    Args:
        pdf_file: PDF input accepted by ``open_pdf``.
        page_num: 1-based page number; converted to a 0-based index for
            pypdfium2.
        dpi: Target resolution. PDF units are 1/72 inch, hence the
            ``dpi / 72`` scale factor.

    Returns:
        The rendered page converted to RGB mode.
    """
    doc = open_pdf(pdf_file)
    try:
        renderer = doc.render(
            pypdfium2.PdfBitmap.to_pil,
            page_indices=[page_num - 1],
            scale=dpi / 72,
        )
        # Only one index was requested, so the first result is the page.
        png = list(renderer)[0]
        # convert() yields a new image, so it stays valid after the close.
        return png.convert("RGB")
    finally:
        # Release the document handle; the original never closed it.
        doc.close()
59
+
60
def get_uploaded_image(in_file):
    """Open ``in_file`` with PIL and return it converted to RGB mode."""
    uploaded = Image.open(in_file)
    return uploaded.convert("RGB")
62
+
63
+
64
def img_to_html(img, img_alt):
    """Return an ``<img>`` tag embedding ``img`` as a base64 PNG data URI.

    Args:
        img: Object with a PIL-style ``save(fp, format=...)`` method.
        img_alt: Alternative text for the tag. It is HTML-escaped (including
            quotes) so arbitrary alt text cannot break out of the attribute.

    Returns:
        The HTML string for the inline image.
    """
    import html  # local import: only needed for escaping the alt text

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()
    safe_alt = html.escape(str(img_alt), quote=True)
    return f'<img src="data:image/png;base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
71
+
72
def markdown_insert_images(markdown, images):
    """Inline known images into ``markdown`` as HTML ``<img>`` tags.

    Every markdown image tag whose path is a key of ``images`` is replaced
    by the output of ``img_to_html`` for that image, using the tag's alt
    text. Tags with unknown paths are left untouched.
    """
    tag_pattern = re.compile(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))')

    def _inline(match):
        path = match.group("image_path")
        if path not in images:
            # Unknown image: keep the original markdown tag.
            return match.group(0)
        return img_to_html(images[path], match.group("image_title"))

    return tag_pattern.sub(_inline, markdown)
82
+
83
+
84
# Load the models once at startup; convert_pdf reads this module-level dict.
model_dict = load_models()

with gr.Blocks(title="Marker") as demo:
    gr.Markdown("""
    # Marker Demo

    This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.

    Find the project [here](https://github.com/VikParuchuri/marker).
    """)

    with gr.Row():
        # Left column: inputs and options.
        with gr.Column():
            in_file = gr.File(label="PDF file or image:", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".gif", ".webp"])
            in_num = gr.Slider(label="Page number", minimum=1, maximum=100, value=1, step=1)
            in_img = gr.Image(label="Select page of Image", type="pil", sources=None)

            page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value="0-0")
            output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html"], value="markdown")

            force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
            debug_ckb = gr.Checkbox(label="Debug", value=False, info="Show debug information")
            trun_marker_btn = gr.Button("Run Marker")
        # Right column: one result widget per output format plus debug images.
        with gr.Column():
            result_md = gr.Markdown(label="Result markdown")
            result_json = gr.JSON(label="Result json")
            result_html = gr.Markdown(label="Result html")
            debug_img_pdf = gr.Image(label="PDF debug image", visible=False)
            debug_img_layout = gr.Image(label="Layout debug image", visible=False)

    def _is_pdf(path):
        """Return True when ``path`` names a PDF (case-insensitive suffix)."""
        return path is not None and str(path).lower().endswith(".pdf")

    def show_image(file, num=1):
        """Preview the uploaded file and update the page slider.

        Returns updates for ``[in_num, in_img]``: for PDFs the slider is
        shown with its maximum set to the page count and the requested page
        is rendered; for images the slider is hidden and the image is shown.
        """
        if file is None:
            # Slider moved (or file cleared) with nothing uploaded: leave the
            # slider alone and clear the preview instead of crashing.
            return [gr.update(), gr.update(value=None)]
        if _is_pdf(file):
            count = count_pdf(file)
            # Keep the requested page inside the document.
            page = min(max(int(num), 1), count)
            img = get_page_image(file, page)
            return [
                gr.update(visible=True, maximum=count),
                gr.update(value=img)]
        img = get_uploaded_image(file)
        return [
            gr.update(visible=False),
            gr.update(value=img)]

    in_file.upload(
        fn=show_image,
        inputs=[in_file],
        outputs=[in_num, in_img],
    )
    in_num.change(
        fn=show_image,
        inputs=[in_file, in_num],
        outputs=[in_num, in_img],
    )

    # Empty, a single page/range, or a comma separated list of them, matching
    # the "0,5-10,20" form advertised by the textbox label.
    page_range_re = re.compile(r"(\d+(-\d+)?(,\d+(-\d+)?)*)?")

    def check_page_range(page_range, file):
        """Validate the page-range text and enable/disable the run button.

        Returns updates for ``[page_range_txt, trun_marker_btn]``.
        """
        # Only PDFs can be counted; images are treated as a single page.
        count = count_pdf(file) if _is_pdf(file) else 1
        hint = gr.update(info=f"format 0-{count-1}")
        if not page_range_re.fullmatch(page_range or ""):
            gr.Warning(f"Invalid format. Please use 0-{count-1}", duration=0)
            return hint, gr.update(interactive=False)
        return hint, gr.update(interactive=True)

    page_range_txt.change(
        fn=check_page_range,
        inputs=[page_range_txt, in_file],
        outputs=[page_range_txt, trun_marker_btn]
    )

    def _debug_image_update(path):
        """Return a visible image update for ``path``, or hidden if missing."""
        if not os.path.exists(path):
            return gr.update(visible=False)
        with Image.open(path) as img:
            # copy() loads the pixels so the file can be closed right away.
            return gr.update(visible=True, value=img.copy())

    # Run Marker
    def run_marker_img(filename, page_range, force_ocr, output_format, debug):
        """Convert the uploaded file and route the result to its widget.

        Returns updates for ``[result_md, result_json, result_html,
        debug_img_pdf, debug_img_layout]``; only the widget matching
        ``output_format`` is made visible.
        """
        formats = ("markdown", "json", "html")
        if filename is None:
            raise gr.Error("Please upload a PDF file or image first.")
        if output_format not in formats:
            # Previously fell through and returned None for unknown formats.
            raise gr.Error(f"Unsupported output format: {output_format}")

        rendered = convert_pdf(
            filename,
            page_range=page_range,
            force_ocr=force_ocr,
            output_format=output_format,
            output_dir=settings.DEBUG_DATA_FOLDER if debug else None,
            debug=debug
        )
        text, _, images = text_from_rendered(rendered)

        gr_debug_pdf = gr.update(visible=False)
        gr_debug_lay = gr.update(visible=False)
        if debug:
            debug_data_path = rendered.metadata.get("debug_data_path")
            if debug_data_path:
                gr_debug_pdf = _debug_image_update(os.path.join(debug_data_path, "pdf_page_0.png"))
                gr_debug_lay = _debug_image_update(os.path.join(debug_data_path, "layout_page_0.png"))

        if output_format == "markdown":
            text = markdown_insert_images(text, images)

        results = [
            gr.update(visible=True, value=text) if output_format == fmt else gr.update(visible=False)
            for fmt in formats
        ]
        return [*results, gr_debug_pdf, gr_debug_lay]

    trun_marker_btn.click(
        fn=run_marker_img,
        inputs=[in_file, page_range_txt, force_ocr_ckb, output_format_dd, debug_ckb],
        outputs=[result_md, result_json, result_html, debug_img_pdf, debug_img_layout],
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch==2.5.1
2
+ marker-pdf==1.0.0
3
+ gradio==5.8.0
4
+ huggingface-hub==0.26.3