neerajkalyank committed on
Commit
8b139bf
1 Parent(s): 65c3f55

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import DonutProcessor, VisionEncoderDecoderModel
3
+ import pandas as pd
4
+ from io import BytesIO
5
+ import fitz # PyMuPDF
6
+ import re
7
+ from PIL import Image
8
+
9
# Load the Donut processor and model once, at import time. Both are fetched
# from the same checkpoint id, so it is named once here.
_DONUT_CHECKPOINT = "naver-clova-ix/donut-base-finetuned-cord-v2"
processor = DonutProcessor.from_pretrained(_DONUT_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_DONUT_CHECKPOINT)
12
+
13
# One purchase-order row: position, item code, description, unit, ISO date,
# then quantity / price / discount / currency / amount. Compiled once at import
# time instead of on every call.
_ROW_PATTERN = re.compile(
    r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
)

# Rate applied separately for Central GST and for State GST (9% each).
_GST_RATE = 0.09

# Parsed/computed field name -> column header in the exported sheet.
# The dict order is also the column order of the sheet.
_COLUMN_LABELS = {
    "pos": "Pos",
    "item_code": "Item Code",
    "description": "Description",
    "unit": "Unit",
    "date": "Delivery Date",
    "quantity": "Quantity",
    "price": "Basic Price",
    "discount": "Discount",
    "currency": "Currency",
    "amount": "Amount",
    "Central GST": "Central GST",
    "State GST": "State GST",
    "Sub Total": "Sub Total",
}


def _read_pdf_bytes(pdf_file):
    """Return the raw PDF bytes from a path, a bytes object, or a file-like object."""
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    # Path-like input (plain str or anything implementing __fspath__).
    if isinstance(pdf_file, str) or hasattr(pdf_file, "__fspath__"):
        with open(pdf_file, "rb") as handle:
            return handle.read()
    # Anything else is treated as an open file-like object, as before.
    return pdf_file.read()


def _extract_page_texts(pdf_bytes):
    """Render every page to an image and return the model-decoded text per page."""
    page_texts = []
    # The context manager closes the document even if rendering or inference
    # raises part-way through.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page_num in range(doc.page_count):
            # alpha=False keeps three samples per pixel, matching the "RGB"
            # mode passed to PIL below.
            pix = doc[page_num].get_pixmap(alpha=False)
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            pixel_values = processor(image, return_tensors="pt").pixel_values
            # NOTE(review): generate() is called without a task prompt or
            # length settings; confirm against the Donut usage documentation.
            outputs = model.generate(pixel_values)
            page_texts.append(
                processor.batch_decode(outputs, skip_special_tokens=True)[0]
            )
    return page_texts


def _parse_rows(page_texts):
    """Turn decoded page texts into row dicts, adding the computed GST fields."""
    rows = []
    for text in page_texts:
        for line in text.split('\n'):
            match = _ROW_PATTERN.search(line)
            if not match:
                continue

            row = match.groupdict()
            row["description"] = row["description"].strip()
            for field in ("quantity", "price", "discount", "amount"):
                row[field] = float(row[field])

            central_gst = row["amount"] * _GST_RATE
            state_gst = row["amount"] * _GST_RATE
            row["Central GST"] = round(central_gst, 2)
            row["State GST"] = round(state_gst, 2)
            # The sub total uses the unrounded tax amounts and is rounded once.
            row["Sub Total"] = round(
                row["amount"] + central_gst + state_gst - row["discount"], 2
            )
            rows.append(row)
    return rows


def _to_excel_buffer(frame, sheet_name):
    """Write *frame* to a single-sheet workbook and return it as a rewound BytesIO."""
    buffer = BytesIO()
    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    buffer.seek(0)
    return buffer


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order rows from a PDF and return them as an Excel workbook.

    Each page is rendered to an image, passed through the module-level
    ``processor``/``model`` pair, and the decoded text is scanned line by line
    for rows matching ``_ROW_PATTERN``. Matching rows get three computed
    columns: Central GST and State GST (9% of the amount each) and Sub Total
    (amount + both taxes - discount).

    Args:
        pdf_file: The uploaded PDF, given as a file path (``str`` or
            path-like), raw ``bytes``, or an object with a ``read()`` method.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding an ``.xlsx`` workbook.
        With at least one parsed row it has an "Extracted Data" sheet;
        otherwise it has an "Error" sheet with a single explanatory message.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.

    NOTE(review): the return value is an in-memory buffer. Confirm that the
    Gradio ``File`` output component in the installed version accepts this;
    it may require a file path instead.
    """
    rows = _parse_rows(_extract_page_texts(_read_pdf_bytes(pdf_file)))

    if not rows:
        error_frame = pd.DataFrame(
            [["No structured data found. Please check the PDF structure."]],
            columns=["Error"],
        )
        return _to_excel_buffer(error_frame, "Error")

    # Select and rename columns by key rather than by position, so the headers
    # cannot drift out of step with the parsed fields.
    frame = pd.DataFrame(rows, columns=list(_COLUMN_LABELS)).rename(
        columns=_COLUMN_LABELS
    )
    return _to_excel_buffer(frame, "Extracted Data")
86
+
87
# Gradio UI: a single PDF upload goes in, a single Excel download comes out.
_APP_DESCRIPTION = (
    "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
    "The model will identify and parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
    "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
)
_pdf_upload = gr.File(label="Upload PDF")
_excel_download = gr.File(label="Download Excel")

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=_pdf_upload,
    outputs=_excel_download,
    title="Advanced Document Data Extractor",
    description=_APP_DESCRIPTION,
)

# Start the web server as soon as this module is executed.
iface.launch()