File size: 4,105 Bytes
8b139bf
 
 
 
 
 
 
 
 
 
 
 
 
a8af579
 
8b139bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import re
import tempfile
from io import BytesIO

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Load the Donut processor and model once, at import time, from a single
# shared checkpoint identifier so the two can never drift apart.
_DONUT_CHECKPOINT = "naver-clova-ix/donut-base-finetuned-cord-v2"
processor = DonutProcessor.from_pretrained(_DONUT_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_DONUT_CHECKPOINT)

# Matches one purchase-order line item: position, item code, description,
# unit, ISO-formatted date, quantity, price, discount, currency and amount.
_ROW_PATTERN = re.compile(
    r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
)

# Assumed tax rate, applied once for Central GST and once for State GST.
_GST_RATE = 0.09

# Column headers for the exported sheet, in the same order as the keys of each
# parsed row (regex groups first, then the three calculated fields).
_OUTPUT_COLUMNS = [
    "Pos", "Item Code", "Description", "Unit", "Delivery Date", "Quantity", "Basic Price",
    "Discount", "Currency", "Amount", "Central GST", "State GST", "Sub Total",
]


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path of an upload given as a path string or file wrapper."""
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")
    if isinstance(pdf_file, str):
        return pdf_file
    return pdf_file.name


def _extract_page_texts(pdf_path):
    """Render every page of the PDF to an image and decode it with the Donut model."""
    page_texts = []
    # The context manager closes the document even if rendering or inference fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # alpha=False keeps the sample buffer at three bytes per pixel,
            # which is what the "RGB" mode passed to Image.frombytes requires.
            pix = page.get_pixmap(alpha=False)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            pixel_values = processor(image, return_tensors="pt").pixel_values
            # NOTE(review): generate() is called without a task prompt; confirm
            # against the Donut documentation whether this checkpoint expects
            # decoder_input_ids to be supplied.
            outputs = model.generate(pixel_values)
            page_texts.append(processor.batch_decode(outputs, skip_special_tokens=True)[0])
    return page_texts


def _parse_rows(page_texts):
    """Parse line-item rows out of the decoded page texts and add the GST fields."""
    rows = []
    for text in page_texts:
        for line in text.split('\n'):
            match = _ROW_PATTERN.search(line)
            if match is None:
                continue

            row = match.groupdict()
            row["description"] = row["description"].strip()
            for field in ("quantity", "price", "discount", "amount"):
                row[field] = float(row[field])

            central_gst = row["amount"] * _GST_RATE
            state_gst = row["amount"] * _GST_RATE
            row["Central GST"] = round(central_gst, 2)
            row["State GST"] = round(state_gst, 2)
            row["Sub Total"] = round(row["amount"] + central_gst + state_gst - row["discount"], 2)
            rows.append(row)
    return rows


def _write_workbook(frame, sheet_name):
    """Write the frame to a temporary .xlsx file and return that file's path."""
    # Reserve a unique path and close the handle first so the Excel writer can
    # reopen it; delete=False keeps the file available for download afterwards.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as handle:
        workbook_path = handle.name
    with pd.ExcelWriter(workbook_path, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    return workbook_path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order rows from a PDF and export them to an Excel file.

    Each page is rendered to an image, decoded to text with the Donut model,
    and scanned line by line for rows matching the line-item pattern. Every
    matched row gets Central GST, State GST and Sub Total columns.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path string or as
            an object exposing the path through its ``name`` attribute.

    Returns:
        Path to a temporary ``.xlsx`` file. It holds the extracted rows on an
        "Extracted Data" sheet, or a single message on an "Error" sheet when
        no row matched.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    page_texts = _extract_page_texts(_resolve_pdf_path(pdf_file))
    rows = _parse_rows(page_texts)

    if not rows:
        # Nothing matched: still hand back a workbook so the download works.
        notice = pd.DataFrame(
            [["No structured data found. Please check the PDF structure."]],
            columns=["Error"],
        )
        return _write_workbook(notice, "Error")

    df = pd.DataFrame(rows)
    df.columns = list(_OUTPUT_COLUMNS)
    return _write_workbook(df, "Extracted Data")

# Gradio front end: one PDF upload is mapped to one downloadable Excel file.
pdf_input = gr.File(label="Upload PDF")
excel_output = gr.File(label="Download Excel")
app_description = (
    "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
    "The model will identify and parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
    "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
)

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=pdf_input,
    outputs=excel_output,
    title="Advanced Document Data Extractor",
    description=app_description,
)

# Start the web server for the interface.
iface.launch()