Spaces:
Sleeping
Sleeping
neerajkalyank
committed on
Commit
•
9d104f1
1
Parent(s):
01fd285
Update app.py
Browse files
app.py
CHANGED
@@ -1,38 +1,28 @@
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
import pandas as pd
|
4 |
from io import BytesIO
|
5 |
import fitz # PyMuPDF
|
6 |
import re
|
7 |
from PIL import Image
|
8 |
-
|
9 |
-
|
10 |
-
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
|
11 |
-
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
|
12 |
|
13 |
def extract_data_from_pdf(pdf_file):
|
14 |
# Open the PDF file using the path provided by gr.File
|
15 |
doc = fitz.open(pdf_file.name)
|
16 |
text_data = []
|
17 |
|
18 |
-
#
|
19 |
-
|
20 |
-
|
21 |
-
for page_num in range(max_pages):
|
22 |
page = doc[page_num]
|
23 |
pix = page.get_pixmap() # Render page to a Pixmap image
|
24 |
|
25 |
-
# Convert Pixmap to PIL Image
|
26 |
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
27 |
-
image = image.resize((image.width // 2, image.height // 2)) # Resize to 50% for faster processing
|
28 |
-
|
29 |
-
# Preprocess image for the Donut model
|
30 |
-
processed_image = processor(image, return_tensors="pt").pixel_values
|
31 |
-
# Generate text with controlled length using `max_new_tokens`
|
32 |
-
outputs = model.generate(processed_image, max_new_tokens=50) # Reduced length for faster output
|
33 |
|
34 |
-
#
|
35 |
-
text =
|
36 |
text_data.append(text)
|
37 |
|
38 |
# Initialize list for parsed data
|
@@ -73,21 +63,19 @@ def extract_data_from_pdf(pdf_file):
|
|
73 |
"Discount", "Currency", "Amount", "Central GST", "State GST", "Sub Total"
|
74 |
]
|
75 |
|
76 |
-
#
|
77 |
-
|
78 |
-
with pd.ExcelWriter(
|
79 |
df.to_excel(writer, index=False, sheet_name="Extracted Data")
|
80 |
-
|
81 |
-
|
82 |
-
return output
|
83 |
else:
|
84 |
# If no data was found, create a blank Excel file
|
85 |
-
|
86 |
-
with pd.ExcelWriter(
|
87 |
pd.DataFrame([["No structured data found. Please check the PDF structure."]], columns=["Error"]).to_excel(writer, index=False, sheet_name="Error")
|
88 |
-
output.seek(0)
|
89 |
|
90 |
-
return
|
91 |
|
92 |
# Define Gradio Interface with updated components
|
93 |
iface = gr.Interface(
|
@@ -97,6 +85,8 @@ iface = gr.Interface(
|
|
97 |
title="Advanced Document Data Extractor",
|
98 |
description=(
|
99 |
"Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
|
|
|
|
|
100 |
),
|
101 |
)
|
102 |
|
|
|
1 |
import gradio as gr
|
2 |
+
import pytesseract
|
3 |
import pandas as pd
|
4 |
from io import BytesIO
|
5 |
import fitz # PyMuPDF
|
6 |
import re
|
7 |
from PIL import Image
|
8 |
+
import tempfile
|
9 |
+
import os
|
|
|
|
|
10 |
|
11 |
def extract_data_from_pdf(pdf_file):
|
12 |
# Open the PDF file using the path provided by gr.File
|
13 |
doc = fitz.open(pdf_file.name)
|
14 |
text_data = []
|
15 |
|
16 |
+
# Process each page in the PDF using Tesseract OCR
|
17 |
+
for page_num in range(doc.page_count):
|
|
|
|
|
18 |
page = doc[page_num]
|
19 |
pix = page.get_pixmap() # Render page to a Pixmap image
|
20 |
|
21 |
+
# Convert Pixmap to PIL Image
|
22 |
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
+
# Use Tesseract to extract text from the image
|
25 |
+
text = pytesseract.image_to_string(image)
|
26 |
text_data.append(text)
|
27 |
|
28 |
# Initialize list for parsed data
|
|
|
63 |
"Discount", "Currency", "Amount", "Central GST", "State GST", "Sub Total"
|
64 |
]
|
65 |
|
66 |
+
# Save the DataFrame to a temporary Excel file
|
67 |
+
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
68 |
+
with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
|
69 |
df.to_excel(writer, index=False, sheet_name="Extracted Data")
|
70 |
+
|
71 |
+
return temp_file.name
|
|
|
72 |
else:
|
73 |
# If no data was found, create a blank Excel file
|
74 |
+
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
75 |
+
with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
|
76 |
pd.DataFrame([["No structured data found. Please check the PDF structure."]], columns=["Error"]).to_excel(writer, index=False, sheet_name="Error")
|
|
|
77 |
|
78 |
+
return temp_file.name
|
79 |
|
80 |
# Define Gradio Interface with updated components
|
81 |
iface = gr.Interface(
|
|
|
85 |
title="Advanced Document Data Extractor",
|
86 |
description=(
|
87 |
"Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
|
88 |
+
"The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
|
89 |
+
"Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
|
90 |
),
|
91 |
)
|
92 |
|