Spaces:
Sleeping
Sleeping
neerajkalyank
committed on
Commit
•
8b139bf
1
Parent(s):
65c3f55
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from transformers import DonutProcessor, VisionEncoderDecoderModel
|
3 |
+
import pandas as pd
|
4 |
+
from io import BytesIO
|
5 |
+
import fitz # PyMuPDF
|
6 |
+
import re
|
7 |
+
from PIL import Image
|
8 |
+
|
9 |
+
# Hugging Face checkpoint id shared by the Donut processor and model, kept in
# one place so the two can never point at different checkpoints.
MODEL_NAME = "naver-clova-ix/donut-base-finetuned-cord-v2"

# Load the Donut processor and model once at import time; the extraction
# function below reuses these module-level objects for every page.
processor = DonutProcessor.from_pretrained(MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
|
12 |
+
|
13 |
+
# Row layout expected in the decoded text: position, item code, description,
# unit, ISO date, then quantity / price / discount as decimals, a currency
# token and the decimal amount. Compiled once instead of on every call.
_ROW_PATTERN = re.compile(
    r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
)

# Assumed tax rate applied separately for Central GST and State GST (9% each).
_GST_RATE = 0.09


def _read_pdf_bytes(pdf_file):
    """Return the raw PDF bytes for a path string or a binary file-like object."""
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(pdf_file, str):
        # NOTE(review): newer Gradio releases appear to hand `gr.File` uploads
        # to the callback as a path string rather than a file object - confirm
        # against the installed Gradio version.
        with open(pdf_file, "rb") as handle:
            return handle.read()
    return pdf_file.read()


def _page_texts(pdf_bytes):
    """Render every PDF page to an image and return the model's decoded text per page."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        texts = []
        for page in doc:
            # alpha=False keeps the samples at 3 bytes per pixel, which is
            # what the "RGB" mode passed to Image.frombytes requires.
            pix = page.get_pixmap(alpha=False)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # NOTE(review): generate() is called without a task prompt; the
            # Donut CORD checkpoint is presumably meant to be primed with its
            # task start token via decoder_input_ids - confirm against the
            # model card before relying on the decoded text.
            pixel_values = processor(image, return_tensors="pt").pixel_values
            outputs = model.generate(pixel_values)
            texts.append(processor.batch_decode(outputs, skip_special_tokens=True)[0])
        return texts
    finally:
        # Always release the document, even if rendering or inference fails.
        doc.close()


def _parse_rows(text):
    """Return one dict per purchase-order row found in *text*, keyed by output column name."""
    rows = []
    for line in text.split('\n'):
        # finditer (not search) so that several rows sharing one line are all
        # captured; decoded model text often contains no line breaks at all.
        for match in _ROW_PATTERN.finditer(line):
            fields = match.groupdict()
            discount = float(fields["discount"])
            amount = float(fields["amount"])

            central_gst = amount * _GST_RATE
            state_gst = amount * _GST_RATE

            rows.append({
                "Pos": fields["pos"],
                "Item Code": fields["item_code"],
                "Description": fields["description"].strip(),
                "Unit": fields["unit"],
                "Delivery Date": fields["date"],
                "Quantity": float(fields["quantity"]),
                "Basic Price": float(fields["price"]),
                "Discount": discount,
                "Currency": fields["currency"],
                "Amount": amount,
                "Central GST": round(central_gst, 2),
                "State GST": round(state_gst, 2),
                # Sub total uses the unrounded GST values and subtracts the discount.
                "Sub Total": round(amount + central_gst + state_gst - discount, 2),
            })
    return rows


def _to_workbook(frame, sheet_name):
    """Write *frame* to a single-sheet in-memory .xlsx and return the rewound buffer."""
    output = BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    output.seek(0)
    return output


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order rows from a PDF and return them as an Excel workbook.

    Every page is rendered to an image, passed through the module-level Donut
    ``processor`` and ``model``, and the decoded text is scanned for rows that
    match the purchase-order pattern. Central GST, State GST and Sub Total are
    computed for each matched row.

    Args:
        pdf_file: A binary file-like object exposing ``read()``, or a path
            string pointing at the PDF.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding an .xlsx workbook. It has
        an "Extracted Data" sheet when rows were found, otherwise an "Error"
        sheet with a single explanatory message.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    rows = []
    for text in _page_texts(_read_pdf_bytes(pdf_file)):
        rows.extend(_parse_rows(text))

    if not rows:
        # Nothing matched: still hand back a workbook so the caller gets a file.
        message = pd.DataFrame(
            [["No structured data found. Please check the PDF structure."]],
            columns=["Error"],
        )
        return _to_workbook(message, "Error")

    # NOTE(review): the Gradio `gr.File` output component may expect a file
    # path rather than an in-memory buffer - confirm against the installed
    # Gradio version; the return type is kept as BytesIO for compatibility.
    return _to_workbook(pd.DataFrame(rows), "Extracted Data")
|
86 |
+
|
87 |
+
# Gradio UI: a single PDF upload feeds extract_data_from_pdf, and the
# resulting Excel workbook is offered as a download.
pdf_input = gr.File(label="Upload PDF")
excel_output = gr.File(label="Download Excel")

app_description = (
    "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
    "The model will identify and parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
    "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
)

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=pdf_input,
    outputs=excel_output,
    title="Advanced Document Data Extractor",
    description=app_description,
)

# Start the web app as soon as the module runs.
iface.launch()
|