File size: 2,149 Bytes
a72b612
8b139bf
d97cfeb
 
f6ae938
8b139bf
d97cfeb
 
8b139bf
8755e00
 
a72b612
 
8b139bf
7ebbb35
d97cfeb
 
bfda109
d97cfeb
7ebbb35
d97cfeb
7ebbb35
d97cfeb
 
7ebbb35
d97cfeb
7ebbb35
53debb2
7ebbb35
d97cfeb
bfda109
d97cfeb
7ebbb35
 
 
 
 
 
 
 
 
 
 
 
 
53debb2
 
d97cfeb
7ebbb35
 
 
d97cfeb
53debb2
8b139bf
 
ea64bb3
 
d97cfeb
53debb2
8b139bf
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pdfplumber
import pandas as pd
import re
import gradio as gr

def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    The text of every page is scanned for item rows shaped like
    ``<pos> <9-digit item code> <unit> <YYYY-MM-DD> <quantity> <basic price>
    <discount> INR <amount>``. Each matched row is stored together with the
    document's purchase-order number and the ``SUB TOTAL`` value found on the
    same page.

    Args:
        pdf_file: The uploaded PDF, given either as a path string or as an
            object whose ``name`` attribute holds the path.

    Returns:
        The path of the workbook that was written (``"output.xlsx"``).

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Depending on the Gradio version / ``gr.File`` ``type`` setting the
    # callback receives a plain path string or a file wrapper with ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Compiled once here instead of once per page.
    row_pattern = re.compile(
        r"(\d+)\s+(\d{9})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+INR\s+([\d.]+)"
    )
    columns = [
        "Purchase Order",
        "Pos.",
        "Item Code",
        "Unit",
        "Delivery Date",
        "Quantity",
        "Basic Price",
        "Discount",
        "Amount",
        "SUB TOTAL",
    ]

    with pdfplumber.open(pdf_path) as pdf:
        # A page without a text layer may yield no text; normalise to "" so
        # the regex searches below never receive None.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Use the first PO number found anywhere in the document, so a first page
    # without one (e.g. a cover sheet) does not force "N/A" onto every row.
    po_number = "N/A"
    for text in page_texts:
        po_match = re.search(r"Purchase Order : (\w+)", text)
        if po_match:
            po_number = po_match.group(1)
            break

    data = []
    for text in page_texts:
        # The subtotal is a per-page value, so look it up once per page
        # rather than once per matched row.
        sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
        sub_total = sub_total_match.group(1) if sub_total_match else "0.0"

        for match in row_pattern.finditer(text):
            pos, item_code, unit, delivery_date, quantity, basic_price, discount, amount = match.groups()
            data.append({
                "Purchase Order": po_number,
                "Pos.": pos,
                "Item Code": item_code,
                "Unit": unit,
                "Delivery Date": delivery_date,
                "Quantity": quantity,
                "Basic Price": basic_price,
                "Discount": discount,
                "Amount": amount,
                "SUB TOTAL": sub_total,
            })

    # Passing the column list keeps the header row even when no rows matched.
    df = pd.DataFrame(data, columns=columns)
    output_file = "output.xlsx"
    df.to_excel(output_file, index=False)
    return output_file

# Gradio Interface: a single PDF upload goes in, a single Excel download
# comes out, with extract_data_from_pdf doing the conversion.
pdf_upload = gr.File(label="Upload PDF")
excel_download = gr.File(label="Download Excel")

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=pdf_upload,
    outputs=excel_download,
    title="PDF Data Extractor",
    description="Extract structured data from a PDF and output it as an Excel file.",
)

# Launch the app as soon as this module is executed.
iface.launch()