# Spaces: Sleeping
import os
import re
import tempfile
from io import BytesIO

import gradio as gr
import pandas as pd
import pdfplumber
# Patterns are compiled once at import time instead of once per page/row.
_PO_NUMBER_RE = re.compile(r"Purchase Order : (\w+)")
_SUB_TOTAL_RE = re.compile(r"SUB TOTAL : ([\d.]+)")
_ROW_RE = re.compile(
    r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
)

# Column names for the seven capture groups of _ROW_RE, in group order.
_ROW_FIELDS = (
    "Pos.",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Amount",
)

# Column order of the generated worksheet.
_OUTPUT_COLUMNS = ["Purchase Order", *_ROW_FIELDS, "SUB TOTAL"]


def _resolve_pdf_source(pdf_file):
    """Turn the caller's input into a path or binary stream for pdfplumber.open."""
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(pdf_file, (str, os.PathLike)):
        return pdf_file
    if isinstance(pdf_file, (bytes, bytearray, memoryview)):
        return BytesIO(bytes(pdf_file))
    if hasattr(pdf_file, "getbuffer"):
        # BytesIO-like: copy the whole buffer regardless of the current read
        # position, which is what the previous getbuffer()-based code did.
        return BytesIO(bytes(pdf_file.getbuffer()))
    name = getattr(pdf_file, "name", None)
    if isinstance(name, str) and os.path.isfile(name):
        # File wrapper backed by a real file on disk: reopen it by path.
        return name
    # Otherwise hand the object over as-is and let pdfplumber read from it.
    return pdf_file


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order rows from a PDF and return them as an Excel file.

    Each page's text is scanned for rows made of seven whitespace-separated
    fields (position, item code, unit, ISO delivery date, quantity, basic
    price, amount). Every row is tagged with the "SUB TOTAL" value found on
    its own page ("0.0" if the page has none) and with the first
    "Purchase Order" number found anywhere in the document ("N/A" if none).
    All values are kept as the strings captured from the text.

    Args:
        pdf_file: The PDF to read. Accepts a ``BytesIO`` (or any object with
            ``getbuffer()``), raw bytes, a filesystem path, a file wrapper
            whose ``name`` is an existing file, or another binary file-like
            object.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing an .xlsx workbook with
        a single sheet named "Extracted Data".

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    # The PDF is read straight from the resolved source. No shared "temp.pdf"
    # is written, so concurrent calls cannot clobber each other and nothing is
    # left behind if parsing fails.
    source = _resolve_pdf_source(pdf_file)

    rows = []
    po_number = None
    with pdfplumber.open(source) as pdf:
        for page in pdf.pages:
            # Pages without a text layer may yield None or an empty string.
            text = page.extract_text() or ""

            # Keep looking on later pages until a PO number is actually found.
            if po_number is None:
                po_match = _PO_NUMBER_RE.search(text)
                if po_match:
                    po_number = po_match.group(1)

            # The sub total is a per-page value, so look it up once per page.
            sub_total_match = _SUB_TOTAL_RE.search(text)
            sub_total = sub_total_match.group(1) if sub_total_match else "0.0"

            for match in _ROW_RE.finditer(text):
                row = dict(zip(_ROW_FIELDS, match.groups()))
                row["SUB TOTAL"] = sub_total
                rows.append(row)

    # Explicit columns keep the header row even when no rows matched.
    df = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)
    # Filled in last so rows from pages before the PO line get the number too.
    df["Purchase Order"] = po_number if po_number is not None else "N/A"

    output = BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Extracted Data")
    output.seek(0)
    return output
# Gradio Interface
def _handle_upload(uploaded_file):
    """Bridge Gradio's file values to and from ``extract_data_from_pdf``.

    Gradio's File input delivers a path, a temp-file wrapper or raw bytes, and
    its File output expects a path on disk. ``extract_data_from_pdf`` works
    with in-memory ``BytesIO`` objects on both sides, so this adapter converts
    in each direction.

    Args:
        uploaded_file: The value Gradio passes for the uploaded PDF.

    Returns:
        Path of a temporary .xlsx file containing the extracted data.
    """
    if uploaded_file is None:
        raise gr.Error("Please upload a PDF file first.")

    if isinstance(uploaded_file, (bytes, bytearray)):
        pdf_bytes = bytes(uploaded_file)
    else:
        # Either a path string or a temp-file wrapper exposing its path as .name.
        pdf_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
        with open(pdf_path, "rb") as pdf_handle:
            pdf_bytes = pdf_handle.read()

    workbook = extract_data_from_pdf(BytesIO(pdf_bytes))

    # delete=False: the file must still exist after this function returns so
    # Gradio can serve it for download.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as excel_file:
        excel_file.write(workbook.getvalue())
    return excel_file.name


iface = gr.Interface(
    fn=_handle_upload,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
    title="PDF Data Extractor",
    description="Extract structured data from a PDF and output it as an Excel file.",
)
iface.launch()