xiaoyao9184 committed on
Commit
9db3d20
1 Parent(s): 955b2d1

Synced repo using 'sync_with_huggingface' Github Action

Browse files
Files changed (2) hide show
  1. gradio_app.py +218 -0
  2. requirements.txt +4 -0
gradio_app.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ if "APP_PATH" in os.environ:
5
+ os.chdir(os.environ["APP_PATH"])
6
+ # fix sys.path for import
7
+ sys.path.append(os.getcwd())
8
+
9
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
10
+
11
+ import gradio as gr
12
+
13
+ import pypdfium2
14
+
15
+ from texify.inference import batch_inference
16
+ from texify.model.model import load_model
17
+ from texify.model.processor import load_processor
18
+ from texify.output import replace_katex_invalid
19
+ from PIL import Image
20
+
21
# Upper bounds, in pixels, handed to PIL's thumbnail() when a loaded image
# or rendered PDF page is shrunk for display.
MAX_WIDTH = 800
MAX_HEIGHT = 1000
23
+
24
def load_model_cached():
    """Return the texify model, loading it only on the first call.

    The loaded model is memoized on the function object, so later calls
    reuse the same instance instead of loading it again.

    Returns:
        Whatever ``load_model()`` returned on the first call.
    """
    try:
        return load_model_cached._instance
    except AttributeError:
        # First call: nothing memoized yet.
        load_model_cached._instance = load_model()
        return load_model_cached._instance
26
+
27
def load_processor_cached():
    """Return the texify processor, loading it only on the first call.

    The loaded processor is memoized on the function object, so later calls
    reuse the same instance instead of loading it again.

    Returns:
        Whatever ``load_processor()`` returned on the first call.
    """
    try:
        return load_processor_cached._instance
    except AttributeError:
        # First call: nothing memoized yet.
        load_processor_cached._instance = load_processor()
        return load_processor_cached._instance
29
+
30
def infer_image(pil_image, bbox, temperature, model, processor):
    """Run texify on a single rectangular region of an image.

    Args:
        pil_image: PIL image to crop the region from.
        bbox: ``(left, top, right, bottom)`` box passed to ``crop``.
        temperature: Generation temperature forwarded to ``batch_inference``.
        model: Model forwarded to ``batch_inference``.
        processor: Processor forwarded to ``batch_inference``.

    Returns:
        The first element of the batch result, i.e. the output for the
        one cropped region.
    """
    region = pil_image.crop(bbox)
    results = batch_inference(
        [region],
        model,
        processor,
        temperature=temperature,
    )
    return results[0]
34
+
35
def open_pdf(pdf_file):
    """Open ``pdf_file`` as a ``pypdfium2.PdfDocument`` and return it."""
    document = pypdfium2.PdfDocument(pdf_file)
    return document
37
+
38
def count_pdf(pdf_file):
    """Return the number of pages in a PDF.

    Args:
        pdf_file: PDF input accepted by ``open_pdf``.

    Returns:
        The page count (``len`` of the opened document).
    """
    doc = open_pdf(pdf_file)
    try:
        return len(doc)
    finally:
        # Release the document deterministically instead of waiting for
        # garbage collection.
        doc.close()
41
+
42
def get_page_image(pdf_file, page_num, dpi=96):
    """Render one page of a PDF to an RGB PIL image.

    Args:
        pdf_file: PDF input accepted by ``open_pdf``.
        page_num: 1-based page number; converted to a 0-based index.
        dpi: Target resolution. The render scale is ``dpi / 72``.

    Returns:
        The rendered page converted to mode ``"RGB"``.
    """
    doc = open_pdf(pdf_file)
    try:
        renderer = doc.render(
            pypdfium2.PdfBitmap.to_pil,
            page_indices=[page_num - 1],
            scale=dpi / 72,
        )
        # Exhaust the renderer before closing the document; only one page
        # index was requested, so the list has a single entry.
        png = list(renderer)[0]
        return png.convert("RGB")
    finally:
        # Release the document deterministically instead of waiting for
        # garbage collection.
        doc.close()
52
+
53
def get_uploaded_image(in_file):
    """Load an uploaded image file as an RGB PIL image.

    Args:
        in_file: Path or file object accepted by ``Image.open``.

    Returns:
        A new image in mode ``"RGB"``.
    """
    # Image.open is lazy and may keep the file open (notably for
    # multi-frame formats such as GIF). convert() yields an independent
    # image, so the source can be closed as soon as it returns.
    with Image.open(in_file) as source:
        return source.convert("RGB")
55
+
56
def resize_image(pil_image):
    """Shrink ``pil_image`` in place to fit MAX_WIDTH x MAX_HEIGHT.

    Uses ``thumbnail`` with LANCZOS resampling. ``None`` is accepted and
    ignored. The function always returns ``None``; the image is modified
    in place.
    """
    if pil_image is not None:
        bounds = (MAX_WIDTH, MAX_HEIGHT)
        pil_image.thumbnail(bounds, Image.Resampling.LANCZOS)
60
+
61
def texify(img, box, temperature):
    """OCR the selected regions of an image and return the result as Markdown.

    Args:
        img: Image as a numpy array, or ``None`` when no image is loaded.
        box: Value of the annotated-image component. It is only used to
            check that at least one annotation is present; the actual
            coordinates come from the module-level ``sections`` list.
        temperature: Generation temperature passed to ``infer_image``.

    Returns:
        Markdown text with one ``### N`` heading per OCR'd region, listed
        from the highest number down to 1. When no region is selected the
        whole image is OCR'd as region 1. An empty string is returned when
        ``img`` is ``None``.
    """
    if img is None:
        # Nothing has been loaded yet, so there is nothing to OCR.
        return ""

    img_pil = Image.fromarray(img).convert("RGB")

    if box is not None and len(box[1]) > 0 and len(sections) > 0:
        # Normalise each selection so that left <= right and top <= bottom.
        bbox_list = [
            (
                min(x_start, x_end),
                min(y_start, y_end),
                max(x_start, x_end),
                max(y_start, y_end),
            )
            for (x_start, y_start, x_end, y_end), _ in sections
        ]
    else:
        # No selection: fall back to the full image.
        bbox_list = [(0, 0, img_pil.width, img_pil.height)]

    inferences = [
        infer_image(img_pil, bbox, temperature, model, processor)
        for bbox in bbox_list
    ]

    # Number headings from the results actually produced, so the
    # whole-image fallback is labelled 1 rather than 0 or a stale count.
    total = len(inferences)
    parts = []
    for idx, inference in enumerate(reversed(inferences)):
        parts.append(f"### {total - idx}\n")
        parts.append(replace_katex_invalid(inference) + "\n")
        parts.append("\n")
    return "".join(parts)
83
+
84
# ROI means Region Of Interest: the rectangle the user marks on the image,
# by clicking two opposite corners, to select what should be OCR'd.
ROI_coordinates = {
    # Position of the previous click.
    'x_temp': 0,
    'y_temp': 0,
    # Position of the most recent click.
    'x_new': 0,
    'y_new': 0,
    # Running click counter; an odd value means a region is half-defined.
    'clicks': 0,
}

# Selected regions as ((x_start, y_start, x_end, y_end), label) tuples.
sections = []
95
+
96
def get_select_coordinates(img, evt: gr.SelectData):
    """Record a click on the image and update the list of selected regions.

    Clicks come in pairs. An odd-numbered click appends a small square
    placeholder at the clicked point; the following even-numbered click
    replaces that placeholder with the rectangle spanned by the two points.

    Args:
        img: Image as a numpy array (its ``shape[0]`` sizes the placeholder).
        evt: Select event; ``evt.index`` holds the clicked ``(x, y)``.

    Returns:
        ``(img, sections)`` for the annotated-image component.
    """
    roi = ROI_coordinates
    roi['clicks'] += 1

    # The previous click becomes the "temp" point, the new one the "new" point.
    roi['x_temp'], roi['y_temp'] = roi['x_new'], roi['y_new']
    roi['x_new'], roi['y_new'] = evt.index[0], evt.index[1]

    if roi['clicks'] % 2 != 0:
        # First point of a pair: show a marker sized at 5% of shape[0].
        marker = int(img.shape[0] * 0.05)
        x, y = roi['x_new'], roi['y_new']
        sections.append((
            (x, y, x + marker, y + marker),
            f"Click second point for Mask {len(sections) + 1}",
        ))
        return (img, sections)

    # Second point of a pair: order the corners and finalise the region.
    x_start = min(roi['x_new'], roi['x_temp'])
    y_start = min(roi['y_new'], roi['y_temp'])
    x_end = max(roi['x_new'], roi['x_temp'])
    y_end = max(roi['y_new'], roi['y_temp'])
    sections[len(sections) - 1] = (
        (x_start, y_start, x_end, y_end),
        f"Mask {len(sections)}",
    )
    return (img, sections)
118
+
119
def del_select_coordinates(img, evt: gr.SelectData):
    """Delete the clicked region and renumber the remaining ones.

    Args:
        img: Value of the annotated-image component; ``img[0]`` is returned
            as the base image.
        evt: Select event; ``evt.index`` is the position of the clicked
            region in ``sections``.

    Returns:
        ``(img[0], sections)`` for the annotated-image component.
    """
    del sections[evt.index]

    # Renumber what is left so labels stay contiguous.
    for i, (coords, _) in enumerate(sections):
        sections[i] = (coords, f"Mask {i + 1}")

    if ROI_coordinates['clicks'] % 2 != 0:
        # A region is half-defined: its placeholder was the last entry.
        if len(sections) == evt.index:
            # The placeholder itself was deleted: undo its single click.
            ROI_coordinates['clicks'] -= 1
        else:
            # A finished region (two clicks) was deleted. The placeholder is
            # still the last entry, so restore its prompt label using its
            # own 1-based position, which is len(sections).
            ROI_coordinates['clicks'] -= 2
            sections[-1] = (
                sections[-1][0],
                f"Click second point for Mask {len(sections)}",
            )
    else:
        # A finished region (two clicks) was deleted.
        ROI_coordinates['clicks'] -= 2

    return (img[0], sections)
138
+
139
+
140
# Load the model and processor once at start-up; texify() reads these globals.
model = load_model_cached()
processor = load_processor_cached()

with gr.Blocks(title="Texify") as demo:
    gr.Markdown("""
    After the model loads, upload an image or a pdf, then draw a box around the equation or text you want to OCR by clicking and dragging.
    Texify will convert it to Markdown with LaTeX math on the right.
    If you have already cropped your image, select "OCR image" in the sidebar instead.
    """)

    with gr.Row():
        with gr.Column():
            in_file = gr.File(label="PDF file or image:", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".gif", ".webp"])

            in_num = gr.Slider(label="Page number", minimum=1, maximum=100, value=1, step=1)
            in_img = gr.Image(label="Select ROI of Image", type="numpy", sources=None)
            in_temperature = gr.Slider(label="Generation temperature", minimum=0.0, maximum=1.0, value=0.0, step=0.05)
            in_btn = gr.Button("OCR ROI")
        with gr.Column():
            gr.Markdown("""
            ### Usage tips
            - Don't make your boxes too small or too large. See the examples and the video in the [README](https://github.com/vikParuchuri/texify) for more info.
            - Texify is sensitive to how you draw the box around the text you want to OCR. If you get bad results, try selecting a slightly different box, or splitting the box into multiple.
            - You can try changing the temperature value on the left if you don't get good results. This controls how "creative" the model is.
            - Sometimes KaTeX won't be able to render an equation (red error text), but it will still be valid LaTeX. You can copy the LaTeX and render it elsewhere.
            """)
            # NOTE(review): the region labels produced by the click handlers
            # are "Mask N" / "Click second point for Mask N", which do not
            # match these color_map keys - confirm the intended colours.
            in_box = gr.AnnotatedImage(
                label="ROI",
                color_map={
                    "ROI of OCR": "#9987FF",
                    "Click second point for ROI": "#f44336"}
            )
            markdown_result = gr.Markdown(label="Markdown of results")

    def show_image(file, num=1):
        """Load the uploaded file (or one PDF page) into the image component.

        Args:
            file: Path of the uploaded file, or None when no file is set.
            num: 1-based PDF page number; ignored for plain images.

        Returns:
            Two updates: one for the page slider, one for the image.
        """
        # A different image is about to be shown, so the old selections no
        # longer apply. Mutate the module-level state in place: assigning to
        # a local name here would leave the shared list untouched.
        sections.clear()
        ROI_coordinates['clicks'] = 0

        if file is None:
            # File was cleared: leave the slider alone and empty the image.
            return [
                gr.update(),
                gr.update(value=None)]

        if file.lower().endswith('.pdf'):
            count = count_pdf(file)
            img = get_page_image(file, num)
            # Resize to max bounds
            resize_image(img)
            return [
                gr.update(visible=True, maximum=count),
                gr.update(value=img)]
        else:
            img = get_uploaded_image(file)
            # Resize to max bounds
            resize_image(img)
            return [
                gr.update(visible=False),
                gr.update(value=img)]

    in_file.upload(
        fn=show_image,
        inputs=[in_file],
        outputs=[in_num, in_img],
    )
    in_num.change(
        fn=show_image,
        inputs=[in_file, in_num],
        outputs=[in_num, in_img],
    )
    in_img.select(
        fn=get_select_coordinates,
        inputs=[in_img],
        outputs=in_box
    )
    in_box.select(
        fn=del_select_coordinates,
        inputs=in_box,
        outputs=in_box
    )
    in_btn.click(
        fn=texify,
        inputs=[in_img, in_box, in_temperature],
        outputs=[markdown_result]
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch==2.5.1
2
+ texify==0.2.1
3
+ gradio==5.8.0
4
+ huggingface-hub==0.26.3