# Spaces: Sleeping
import os
import re

import gradio as gr
import pandas as pd
import pdfplumber
def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Every page's text is scanned for the purchase order number, the page's
    "SUB TOTAL" value and the item rows. One spreadsheet row is written per
    matched item row.

    Args:
        pdf_file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object whose ``name`` attribute holds the path of the PDF.

    Returns:
        The path of the written workbook, ``"output.xlsx"``.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Accept a plain path as well as an upload wrapper exposing ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Compile the patterns once instead of once per page / per row.
    po_pattern = re.compile(r"Purchase Order : (\w+)")
    row_pattern = re.compile(
        r"(\d+)\s+(\d{9})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+INR\s+([\d.]+)"
    )
    sub_total_pattern = re.compile(r"SUB TOTAL : ([\d.]+)")

    # Explicit column order so the sheet has headers even when no row matches.
    columns = [
        "Purchase Order",
        "Pos.",
        "Item Code",
        "Unit",
        "Delivery Date",
        "Quantity",
        "Basic Price",
        "Discount",
        "Amount",
        "SUB TOTAL",
    ]

    data = []
    po_number = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page may yield no text; fall back to an empty string so the
            # regex searches below do not fail on a non-string value.
            text = page.extract_text() or ""

            # Keep looking for the PO number until one page provides it.
            # Rows found before that point are labelled "N/A".
            if po_number is None:
                po_match = po_pattern.search(text)
                if po_match:
                    po_number = po_match.group(1)

            # The subtotal is looked up per page, so search once per page
            # rather than once per matched row.
            sub_total_match = sub_total_pattern.search(text)
            sub_total = sub_total_match.group(1) if sub_total_match else "0.0"

            for match in row_pattern.finditer(text):
                (
                    pos,
                    item_code,
                    unit,
                    delivery_date,
                    quantity,
                    basic_price,
                    discount,
                    amount,
                ) = match.groups()
                data.append({
                    "Purchase Order": po_number if po_number is not None else "N/A",
                    "Pos.": pos,
                    "Item Code": item_code,
                    "Unit": unit,
                    "Delivery Date": delivery_date,
                    "Quantity": quantity,
                    "Basic Price": basic_price,
                    "Discount": discount,
                    "Amount": amount,
                    "SUB TOTAL": sub_total,
                })

    # NOTE(review): the output path is fixed, so concurrent calls overwrite
    # each other's workbook; kept as-is to preserve the existing behaviour.
    df = pd.DataFrame(data, columns=columns)
    output_file = "output.xlsx"
    df.to_excel(output_file, index=False)
    return output_file
# Gradio user interface: one PDF upload in, one Excel download out.
pdf_upload = gr.File(label="Upload PDF")
excel_download = gr.File(label="Download Excel")

iface = gr.Interface(
    extract_data_from_pdf,
    pdf_upload,
    excel_download,
    title="PDF Data Extractor",
    description="Extract structured data from a PDF and output it as an Excel file.",
)

# Start the web app.
iface.launch()