# Hosting-page status captured with this file: "Spaces: Sleeping" (not part of the program).
import re
from io import BytesIO

import gradio as gr
import pandas as pd
import pdfplumber
# Compiled once at import time instead of once per page / per row.
_PO_PATTERN = re.compile(r"Purchase Order : (\w+)")
# One line item: position, item code, unit, ISO-formatted delivery date,
# then three numeric fields (quantity, basic price, amount).
_ROW_PATTERN = re.compile(
    r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
)
_SUB_TOTAL_PATTERN = re.compile(r"SUB TOTAL : ([\d.]+)")

# Fixed column order so the sheet has headers even when no row matched.
_COLUMNS = [
    "Purchase Order",
    "Pos.",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Amount",
    "SUB TOTAL",
]


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Every page's text is scanned for line-item rows. Each row is tagged with
    the document's purchase-order number (the first match found on any page,
    or "N/A" when none is found) and with the "SUB TOTAL" value found on the
    row's own page ("0.0" when that page has none).

    Args:
        pdf_file: The PDF to read. Accepts a binary file-like object exposing
            ``read()``, raw ``bytes``/``bytearray``, or a filesystem path.

    Returns:
        io.BytesIO: An in-memory ``.xlsx`` workbook, rewound to offset 0,
        with a single sheet named "Extracted Data". All values are written
        as the strings captured from the PDF text.
    """
    # Normalise the input into something pdfplumber.open() accepts.
    if hasattr(pdf_file, "read"):
        source = BytesIO(pdf_file.read())
    elif isinstance(pdf_file, (bytes, bytearray)):
        source = BytesIO(pdf_file)
    else:
        source = pdf_file  # treated as a path

    data = []
    po_number = None

    with pdfplumber.open(source) as pdf:
        for page in pdf.pages:
            # A page without a text layer may yield no text at all.
            text = page.extract_text() or ""

            # Keep looking on later pages until the PO number is found,
            # rather than giving up after the first page.
            if po_number is None:
                po_match = _PO_PATTERN.search(text)
                if po_match:
                    po_number = po_match.group(1)

            # The sub total is a per-page value, so look it up once per page.
            sub_total_match = _SUB_TOTAL_PATTERN.search(text)
            sub_total = sub_total_match.group(1) if sub_total_match else "0.0"

            for match in _ROW_PATTERN.finditer(text):
                (
                    pos,
                    item_code,
                    unit,
                    delivery_date,
                    quantity,
                    basic_price,
                    amount,
                ) = match.groups()
                data.append({
                    # Filled in after the scan, once the PO number is known.
                    "Purchase Order": None,
                    "Pos.": pos,
                    "Item Code": item_code,
                    "Unit": unit,
                    "Delivery Date": delivery_date,
                    "Quantity": quantity,
                    "Basic Price": basic_price,
                    "Amount": amount,
                    "SUB TOTAL": sub_total,
                })

    po_label = po_number if po_number is not None else "N/A"
    for row in data:
        row["Purchase Order"] = po_label

    # Convert the rows to a DataFrame and serialise it as an Excel workbook.
    df = pd.DataFrame(data, columns=_COLUMNS)
    output = BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Extracted Data")
    output.seek(0)

    # NOTE(review): this result is wired to a gr.File output, which is
    # normally given a filesystem path -- confirm the installed Gradio
    # version accepts an in-memory BytesIO before relying on it.
    return output
# Gradio front end: one PDF upload goes in, and the workbook produced by
# extract_data_from_pdf is offered back as a download.
pdf_upload = gr.File(label="Upload PDF")
excel_download = gr.File(label="Download Excel")

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=pdf_upload,
    outputs=excel_download,
    title="PDF Data Extractor",
    description="Extract structured data from a PDF and output it as an Excel file.",
)

# Start the web server for the interface.
iface.launch()