Spaces:
Sleeping
Sleeping
import os
import re

import gradio as gr
import pandas as pd
import pdfplumber
# Purchase-order identifier, printed as "Purchase Order : <id>".
_PO_NUMBER_RE = re.compile(r"Purchase Order : (\w+)")
# One line item: position, item code, unit, ISO delivery date, quantity,
# basic price and amount, separated by whitespace.
_ROW_RE = re.compile(
    r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
)
# Sub-total figure, printed as "SUB TOTAL : <number>".
_SUB_TOTAL_RE = re.compile(r"SUB TOTAL : ([\d.]+)")

# Column names for the seven capture groups of _ROW_RE, in group order.
_ROW_FIELDS = (
    "Pos.",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Amount",
)
# Full column order of the generated spreadsheet.
_COLUMNS = ("Purchase Order", *_ROW_FIELDS, "SUB TOTAL")
_OUTPUT_PATH = "output.xlsx"


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path behind *pdf_file*.

    Accepts a path (``str`` or ``os.PathLike``) or any object exposing the
    path as ``.name``. Path-like objects are checked first because
    ``pathlib.Path.name`` is only the final path component.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _find_po_number(page_texts):
    """Return the first purchase-order number found on any page, else "N/A"."""
    for text in page_texts:
        match = _PO_NUMBER_RE.search(text)
        if match:
            return match.group(1)
    return "N/A"


def _rows_from_text(text, po_number):
    """Return one dict per line item found in a single page's *text*.

    Every row carries *po_number* and the first sub-total found in the same
    text ("0.0" when there is none). All values stay as the strings that
    were captured; nothing is converted to a number.
    """
    # The sub-total is the same for every row of the page, so look it up once.
    sub_total_match = _SUB_TOTAL_RE.search(text)
    sub_total = sub_total_match.group(1) if sub_total_match else "0.0"
    return [
        {
            "Purchase Order": po_number,
            **dict(zip(_ROW_FIELDS, match.groups())),
            "SUB TOTAL": sub_total,
        }
        for match in _ROW_RE.finditer(text)
    ]


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``), or an
            uploaded-file object whose ``.name`` attribute is that path.

    Returns:
        The path of the written workbook, ``"output.xlsx"``, relative to the
        current working directory. The file is overwritten on every call.

    Raises:
        ValueError: If *pdf_file* is ``None``.
    """
    pdf_path = _resolve_pdf_path(pdf_file)

    # Open PDF file directly and pull the text of every page up front.
    with pdfplumber.open(pdf_path) as pdf:
        # Guard against a page yielding no text at all (None), which would
        # otherwise make the regex searches raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Search every page for the PO number, so a first page without it does
    # not pin the whole document to "N/A".
    po_number = _find_po_number(page_texts)

    data = [
        row
        for text in page_texts
        for row in _rows_from_text(text, po_number)
    ]

    # Convert data to DataFrame and save to Excel. Passing the columns
    # explicitly keeps the header row even when no line item matched.
    df = pd.DataFrame(data, columns=list(_COLUMNS))
    df.to_excel(_OUTPUT_PATH, index=False)
    return _OUTPUT_PATH
# Gradio Interface
# A single file upload feeds extract_data_from_pdf; the path it returns is
# offered back as a file download. The app is launched as soon as it is built.
iface = gr.Interface(
    extract_data_from_pdf,
    gr.File(label="Upload PDF"),
    gr.File(label="Download Excel"),
    description="Extract structured data from a PDF and output it as an Excel file.",
    title="PDF Data Extractor",
)
iface.launch()