Spaces:
Sleeping
Sleeping
import pdfplumber | |
import pandas as pd | |
from io import BytesIO | |
import tempfile | |
import re | |
import gradio as gr | |
def extract_data_from_pdf(pdf_file): | |
data = [] | |
po_number = None | |
# Save the uploaded file temporarily so pdfplumber can open it | |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf: | |
temp_pdf.write(pdf_file.read()) # Write the uploaded file content to a temporary file | |
temp_pdf_path = temp_pdf.name # Get the file path | |
# Now open the temporary file with pdfplumber | |
with pdfplumber.open(temp_pdf_path) as pdf: | |
for page in pdf.pages: | |
text = page.extract_text() | |
# Extract PO number once (if not already extracted) | |
if po_number is None: | |
po_match = re.search(r"Purchase Order : (\w+)", text) | |
if po_match: | |
po_number = po_match.group(1) | |
# Regex pattern to match the row data | |
row_pattern = re.compile( | |
r"(\d+)\s+(\d{10,})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" | |
) | |
# Find all rows matching the pattern | |
for match in row_pattern.finditer(text): | |
pos, item_code, unit, delivery_date, quantity, basic_price, amount = match.groups() | |
sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text) | |
sub_total = sub_total_match.group(1) if sub_total_match else "" | |
data.append({ | |
"Purchase Order": po_number, | |
"Pos.": pos, | |
"Item Code": item_code, | |
"Unit": unit, | |
"Delivery Date": delivery_date, | |
"Quantity": quantity, | |
"Basic Price": basic_price, | |
"Amount": amount, | |
"SUB TOTAL": sub_total | |
}) | |
# Convert the data to a DataFrame | |
df = pd.DataFrame(data) | |
output = BytesIO() | |
with pd.ExcelWriter(output, engine="xlsxwriter") as writer: | |
df.to_excel(writer, index=False, sheet_name="Extracted Data") | |
output.seek(0) | |
return output | |
# Gradio Interface | |
iface = gr.Interface( | |
fn=extract_data_from_pdf, | |
inputs=gr.File(label="Upload PDF"), | |
outputs=gr.File(label="Download Excel"), | |
title="PDF Data Extractor", | |
description="Extract structured data from a PDF and output it as an Excel file." | |
) | |
iface.launch() | |