import pdfplumber import re import pandas as pd import gradio as gr def extract_po_data(pdf_file): """ Extracts Purchase Order data with enhanced multi-line Material Description handling, and cleans unwanted text or symbols. """ data = [] purchase_order_no = None purchase_order_date = None with pdfplumber.open(pdf_file) as pdf: for page in pdf.pages: # Extract text from page lines = page.extract_text().split("\n") temp_row = None # Temporary row to handle multi-line descriptions # Extract Purchase Order Number and Date (Assume it's on the first page) if purchase_order_no is None: # Only extract once po_no_match = re.search(r"Purchase Order No[:\s]+(\S+)", "\n".join(lines)) po_date_match = re.search(r"Purchase Order Date[:\s]+(\S+)", "\n".join(lines)) if po_no_match: purchase_order_no = po_no_match.group(1) if po_date_match: purchase_order_date = po_date_match.group(1) # Process each line to extract data for line in lines: # Regex pattern for rows (excluding multi-line descriptions) pattern = r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$" match = re.match(pattern, line) if match: # If there's a match, capture the full row if temp_row: # Append the previous temp_row if it exists data.append(temp_row) temp_row = None temp_row = { "S. No": match[1], "Material No": match[2], "Material Description": match[3].strip(), "Qty": int(match[4]), "Unit": match[5], "Price": float(match[6]), "Delivery Date": match[7], "Total Value": float(match[8]), "Vat%": float(match[9]), "Amount Incl. VAT": float(match[10]), } elif temp_row: # If no match, treat it as a continuation of Material Description temp_row["Material Description"] += f" {line.strip()}" # Append the last row if temp_row: data.append(temp_row) # Create DataFrame df = pd.DataFrame(data) # Insert Purchase Order No and Purchase Order Date at the beginning if purchase_order_no and purchase_order_date: df.insert(0, "Purchase Order No", purchase_order_no) df.insert(1, "Purchase Order Date", purchase_order_date) # Filter unwanted text from Material Description def clean_description(description): # Define unwanted patterns unwanted_patterns = [ r"This document is electronically approved", # Matches exact phrase r"does not require any signature or stamp", # Matches approval notes r"Total Amount Excl\. VAT.*", # Matches totals r"TWO THOUSAND.*ONLY", # Matches written totals r"&", # Removes stray symbols like `&` r"\.+$", # Removes trailing periods ] for pattern in unwanted_patterns: description = re.sub(pattern, "", description, flags=re.IGNORECASE).strip() return description df["Material Description"] = df["Material Description"].apply(clean_description) # Strip extra spaces df["Material Description"] = df["Material Description"].str.strip() return df def process_and_save(pdf_file, output_format): """ Processes the uploaded PDF and saves the extracted data as an Excel or CSV file. """ df = extract_po_data(pdf_file.name) # Save the file in the desired format output_file = f"output.{output_format}" if output_format == "csv": df.to_csv(output_file, index=False) elif output_format == "xlsx": df.to_excel(output_file, index=False, engine="openpyxl") return output_file # Gradio interface function def gradio_interface(pdf_file, output_format): output_file = process_and_save(pdf_file, output_format) return output_file # Gradio app interface iface = gr.Interface( fn=gradio_interface, inputs=[gr.File(label="Upload PDF"), gr.Radio(["csv", "xlsx"], label="Output Format")], outputs=gr.File(label="Download Output"), title="Enhanced PO Data Extractor", description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols. Download as CSV or Excel." ) if __name__ == "__main__": iface.launch()