# Spaces: Sleeping
import pdfplumber | |
import re | |
import pandas as pd | |
import gradio as gr | |
# One line-item row: S.No, material no, description, qty, unit, price,
# delivery date, total value, VAT % and amount incl. VAT, separated by spaces.
_PO_ROW_RE = re.compile(
    r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
)
_PO_NUMBER_RE = re.compile(r"Purchase Order No[:\s]+(\S+)")
_PO_DATE_RE = re.compile(r"Purchase Order Date[:\s]+(\S+)")

# Column order of the resulting DataFrame (before the PO header columns).
_PO_COLUMNS = [
    "S. No",
    "Material No",
    "Material Description",
    "Qty",
    "Unit",
    "Price",
    "Delivery Date",
    "Total Value",
    "Vat%",
    "Amount Incl. VAT",
]

# Text that leaks into descriptions because every non-row line following a
# row is treated as a continuation of that row's description.
# Applied in this order; the trailing-period rule must run last.
_UNWANTED_DESCRIPTION_RES = tuple(
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in (
        r"This document is electronically approved",  # Matches exact phrase
        r"does not require any signature or stamp",  # Matches approval notes
        r"Total Amount Excl\. VAT.*",  # Matches totals
        r"TWO THOUSAND.*ONLY",  # Matches written totals
        r"&",  # Removes stray symbols like `&`
        r"\.+$",  # Removes trailing periods
    )
)


def _clean_description(description):
    """Strip the known boilerplate fragments from one Material Description."""
    for unwanted in _UNWANTED_DESCRIPTION_RES:
        description = unwanted.sub("", description).strip()
    return description


def _row_from_match(match):
    """Convert a successful ``_PO_ROW_RE`` match into a row dict."""
    return {
        "S. No": match[1],
        "Material No": match[2],
        "Material Description": match[3].strip(),
        "Qty": int(match[4]),
        "Unit": match[5],
        "Price": float(match[6]),
        "Delivery Date": match[7],
        "Total Value": float(match[8]),
        "Vat%": float(match[9]),
        "Amount Incl. VAT": float(match[10]),
    }


def extract_po_data(pdf_file) -> "pd.DataFrame":
    """
    Extract Purchase Order line items from a PDF into a DataFrame.

    Each page's text is scanned line by line. A line matching the row
    pattern starts a new item; any following non-matching line on the same
    page is appended to that item's Material Description (multi-line
    descriptions). Known boilerplate is then removed from the descriptions.

    Args:
        pdf_file: Path or file object accepted by ``pdfplumber.open``.

    Returns:
        A DataFrame with one row per line item. When both the Purchase Order
        number and date were found, they are inserted as the first two
        columns. If no line item is found, the DataFrame is empty but still
        has the item columns.

    Raises:
        ValueError: If a matched numeric field cannot be converted
            (e.g. a price such as ``1.2.3``).
    """
    data = []
    purchase_order_no = None
    purchase_order_date = None

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Pages without a text layer may yield no text at all; treat
            # them as empty instead of crashing on ``None.split``.
            page_text = page.extract_text() or ""

            # Look for each header field until it has been found once.
            # They are searched independently so a date that appears on a
            # later page than the number is still picked up.
            if purchase_order_no is None:
                po_no_match = _PO_NUMBER_RE.search(page_text)
                if po_no_match:
                    purchase_order_no = po_no_match.group(1)
            if purchase_order_date is None:
                po_date_match = _PO_DATE_RE.search(page_text)
                if po_date_match:
                    purchase_order_date = po_date_match.group(1)

            temp_row = None  # Row currently collecting continuation lines
            for line in page_text.split("\n"):
                match = _PO_ROW_RE.match(line)
                if match:
                    # A new row begins: flush the one collected so far.
                    if temp_row:
                        data.append(temp_row)
                    temp_row = _row_from_match(match)
                elif temp_row:
                    # Not a row: continuation of the Material Description.
                    temp_row["Material Description"] += f" {line.strip()}"

            # Flush the last row of this page before moving on.
            if temp_row:
                data.append(temp_row)

    # Passing explicit columns keeps the schema stable even when no row was
    # found (otherwise the description column would not exist).
    df = pd.DataFrame(data, columns=_PO_COLUMNS)

    # Insert Purchase Order No and Purchase Order Date at the beginning
    if purchase_order_no and purchase_order_date:
        df.insert(0, "Purchase Order No", purchase_order_no)
        df.insert(1, "Purchase Order Date", purchase_order_date)

    if not df.empty:
        df["Material Description"] = df["Material Description"].apply(
            _clean_description
        )
    return df
def process_pdf(file):
    """
    Process the uploaded PDF and save the extracted data as an Excel file.

    Args:
        file: The upload handed over by the UI. Either an object exposing
            the file path as ``.name`` or a plain path string; ``None`` when
            nothing was uploaded.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the
        path of the written ``.xlsx`` file, or ``None`` on failure, in which
        case the message starts with ``"Error during processing:"``.
    """
    if file is None:
        return None, "Error during processing: no file was uploaded"

    # Depending on the Gradio version / File ``type`` setting the upload is
    # a tempfile-like object (path in ``.name``) or already a path string.
    pdf_path = getattr(file, "name", file)

    output_path = "federal_electric_extracted_data.xlsx"
    try:
        # Process the extracted text into a DataFrame
        df = extract_po_data(pdf_path)
        # Save the DataFrame to an Excel file
        df.to_excel(output_path, index=False, engine="openpyxl")
    except Exception as e:
        # UI boundary: report any failure in the status box instead of
        # letting the exception escape into the web framework.
        return None, f"Error during processing: {e}"
    return output_path, "Data extraction successful!"
# Web UI definition: a single PDF upload goes in; the generated Excel file
# and a status message come out.
iface = gr.Interface(
    title="Enhanced PO Data Extractor",
    description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols.",
    fn=process_pdf,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ],
)

# Launch the app only when this file is executed directly.
if __name__ == "__main__":
    iface.launch()