Spaces:

neerajkalyank
/

toshiba_2.O

Sleeping

File size: 2,074 Bytes

a72b612
8b139bf
a72b612
d97cfeb
 
f6ae938
8b139bf
d97cfeb
 
8b139bf
a72b612
 
 
8b139bf
d97cfeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b139bf
 
 
 
d97cfeb
 
8b139bf

import pdfplumber
import pandas as pd
from io import BytesIO
import re
import gradio as gr

# Patterns are compiled once at import time rather than once per page.
_PO_NUMBER_RE = re.compile(r"Purchase Order : (\w+)")
_SUB_TOTAL_RE = re.compile(r"SUB TOTAL : ([\d.]+)")
# One line item: position, item code (10+ digits), unit, ISO-style date,
# followed by three numeric columns.
_ROW_RE = re.compile(
    r"(\d+)\s+(\d{10,})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
)

# Names for the seven capture groups of _ROW_RE, in group order.
_ROW_FIELDS = (
    "Pos.",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Amount",
)
# Full column layout of the output sheet.
_COLUMNS = ["Purchase Order", *_ROW_FIELDS, "SUB TOTAL"]


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Every page's text is scanned for line-item rows. Each row is tagged with
    the purchase-order number (the first "Purchase Order : ..." match seen so
    far, or None if none has been seen yet) and with the "SUB TOTAL : ..."
    value found on the same page (empty string when the page has none).

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (it is passed through
            unchanged).

    Returns:
        A ``BytesIO`` rewound to position 0 that holds an .xlsx workbook with
        a single sheet named "Extracted Data". All values are written as the
        strings captured from the text. When no rows match, the sheet still
        contains the header row.
    """
    data = []
    po_number = None

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # extract_text() may yield None (or "") for a page without a text
            # layer; normalise so the regex calls below never receive None.
            text = page.extract_text() or ""
            if not text:
                continue

            # Extract PO number once (if not already extracted)
            if po_number is None:
                po_match = _PO_NUMBER_RE.search(text)
                if po_match:
                    po_number = po_match.group(1)

            # The sub-total is a per-page value, so look it up once per page
            # instead of once per matched row.
            sub_total_match = _SUB_TOTAL_RE.search(text)
            sub_total = sub_total_match.group(1) if sub_total_match else ""

            for match in _ROW_RE.finditer(text):
                data.append({
                    "Purchase Order": po_number,
                    **dict(zip(_ROW_FIELDS, match.groups())),
                    "SUB TOTAL": sub_total,
                })

    # Passing the column list keeps the header row even when nothing matched.
    df = pd.DataFrame(data, columns=_COLUMNS)
    output = BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Extracted Data")
    output.seek(0)

    return output

def _extract_to_excel_file(pdf_file):
    """Adapt ``extract_data_from_pdf`` to what a ``gr.File`` output expects.

    ``extract_data_from_pdf`` returns an in-memory ``BytesIO``, whereas a
    ``gr.File`` output component is given a path on disk. This wrapper writes
    the workbook bytes to a temporary .xlsx file and returns that path.

    Args:
        pdf_file: The value Gradio passes for the uploaded file, or None when
            nothing was uploaded.

    Returns:
        Path of the temporary .xlsx file, or None when no file was uploaded.
    """
    import tempfile

    if pdf_file is None:
        # Nothing uploaded: return no file instead of failing inside pdfplumber.
        return None

    workbook = extract_data_from_pdf(pdf_file)
    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        tmp.write(workbook.getvalue())
    return tmp.name


# Gradio Interface
iface = gr.Interface(
    fn=_extract_to_excel_file,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
    title="PDF Data Extractor",
    description="Extract structured data from a PDF and output it as an Excel file."
)

iface.launch()