Spaces:

DSatishchandra
/

PMP_PO_Extraction

Sleeping

App Files Files Community

DSatishchandra committed on Dec 2, 2024

Commit

9d15142

verified ·

1 Parent(s): 2e50318

Update federal_electric.py

Browse files

Files changed (1) hide show

federal_electric.py +110 -71

federal_electric.py CHANGED Viewed

@@ -1,83 +1,122 @@
 import pdfplumber
-import pandas as pd
 import re
def extract_text_from_pdf(pdf_file):
    """
    Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Uploaded file object exposing the PDF path as ``.name``;
            a plain path string is accepted as well.

    Returns:
        str: The page texts joined with newlines, in page order.
    """
    # Upload wrappers carry the path in ``.name``; fall back to the value itself.
    path = getattr(pdf_file, "name", pdf_file)
    with pdfplumber.open(path) as pdf:
        # A page may yield no text (None in some pdfplumber versions), so
        # substitute "". Joining with "\n" keeps the last line of one page from
        # fusing with the first line of the next, which would break the
        # line-based item parsing done downstream.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
# A line that starts with an item number opens a new item row.
_ITEM_ROW_RE = re.compile(r"^(?P<Item>\d+)\s+(?P<Description>.+)")
# Quantity followed by one of the known units.
_QTY_UNIT_RE = re.compile(r"(?P<Qty>\d+)\s+(Nos\.|Set)")
# Two trailing numbers: unit price then total price.
_PRICE_PAIR_RE = re.compile(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$")


def parse_po_items_with_filters(text):
    """
    Parse purchase-order items from extracted PDF text.

    A line beginning with a number starts a new item; every following line
    that does not start a new item is appended to that item's description.
    Quantity/unit and unit/total price are picked up from any line belonging
    to the current item (a later match overwrites an earlier one).

    Args:
        text: Full text of the purchase order.

    Returns:
        tuple: ``(DataFrame, status)`` on success, or ``(None, status)`` when
        no item rows were found.
    """
    data = []
    current_item = None  # None until the first item row has been seen
    description_parts = []

    for line in text.splitlines():
        item_match = _ITEM_ROW_RE.match(line)
        if item_match:
            # Close the previous item before opening a new one.
            if current_item is not None:
                current_item["Description"] = " ".join(description_parts).strip()
                data.append(current_item)
            current_item = {
                "Item": item_match.group("Item"),
                "Description": "",
                "Qty": "",
                "Unit": "",
                "Unit Price": "",
                "Total Price": "",
            }
            description_parts = [item_match.group("Description")]
        elif current_item is not None:
            # Continuation of a description split across lines.
            description_parts.append(line.strip())
        else:
            # Preamble before the first item row. Recording qty/price from it
            # would create a phantom row that has no "Item" value.
            continue

        qty_match = _QTY_UNIT_RE.search(line)
        if qty_match:
            current_item["Qty"] = qty_match.group("Qty")
            current_item["Unit"] = qty_match.group(2)

        price_match = _PRICE_PAIR_RE.search(line)
        if price_match:
            current_item["Unit Price"] = price_match.group("UnitPrice")
            current_item["Total Price"] = price_match.group("TotalPrice")

    # Save the last item.
    if current_item is not None:
        current_item["Description"] = " ".join(description_parts).strip()
        data.append(current_item)

    if not data:
        return None, "No items found. Please check the PDF file format."
    return pd.DataFrame(data), "Data extracted successfully."
def save_to_excel(df, output_path="federal_electric_extracted_data.xlsx"):
    """
    Write *df* to an Excel workbook and hand back where it was written.

    Args:
        df: Object providing ``to_excel`` (a DataFrame in this file's usage).
        output_path: Destination path of the workbook.

    Returns:
        The same ``output_path`` that was passed in.
    """
    destination = output_path
    # The row index is not written to the sheet.
    df.to_excel(destination, index=False)
    return destination
def process_pdf(file):
    """
    Run the full pipeline: extract text, parse items, save them to Excel.

    Args:
        file: Uploaded PDF, passed through to ``extract_text_from_pdf``.

    Returns:
        tuple: ``(output_path, status)`` when items were parsed, otherwise
        ``(None, status)``. Any exception is turned into a status message.
    """
    try:
        extracted = extract_text_from_pdf(file)
        frame, message = parse_po_items_with_filters(extracted)
        # Nothing parsed: report the parser's status without writing a file.
        if frame is None:
            return None, message
        return save_to_excel(frame), message
    except Exception as exc:
        return None, f"Error during processing: {str(exc)}"

 import pdfplumber
 import re
+import pandas as pd
+import gradio as gr
# One complete item row on a single text line: S. No, material no,
# description, qty, unit, price, delivery date, total value, VAT %, amount.
_PO_ROW_RE = re.compile(
    r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
)
_PO_NO_RE = re.compile(r"Purchase Order No[:\s]+(\S+)")
_PO_DATE_RE = re.compile(r"Purchase Order Date[:\s]+(\S+)")

# Column order of the item rows; also gives an empty result its schema.
_PO_ROW_COLUMNS = [
    "S. No",
    "Material No",
    "Material Description",
    "Qty",
    "Unit",
    "Price",
    "Delivery Date",
    "Total Value",
    "Vat%",
    "Amount Incl. VAT",
]

# Text that leaks into descriptions from footers/totals, removed in this order.
_PO_UNWANTED_RES = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"This document is electronically approved",  # approval notice
        r"does not require any signature or stamp",   # approval notice
        r"Total Amount Excl\. VAT.*",                 # totals line
        r"TWO THOUSAND.*ONLY",                        # amount in words
        r"&",                                         # stray symbol
        r"\.+$",                                      # trailing periods
    )
]


def extract_po_data(pdf_file):
    """
    Extract purchase-order line items from a PDF into a DataFrame.

    Each text line matching the item-row pattern starts a new row; following
    non-matching lines on the same page are appended to that row's
    "Material Description". Descriptions are then cleaned of known footer
    text and stray symbols.

    Args:
        pdf_file: Path (or file-like object) accepted by ``pdfplumber.open``.

    Returns:
        pandas.DataFrame: One row per item. "Purchase Order No" and
        "Purchase Order Date" are inserted as the first two columns only when
        both were found. The frame is empty (but has the item columns) when
        no item row matched.

    Raises:
        ValueError: If a matched numeric field cannot be converted
            (e.g. a price such as "1.2.3").
    """
    data = []
    purchase_order_no = None
    purchase_order_date = None

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page may yield no text (None in some pdfplumber versions).
            text = page.extract_text() or ""

            # Header fields: keep looking until the PO number has been found.
            if purchase_order_no is None:
                po_no_match = _PO_NO_RE.search(text)
                po_date_match = _PO_DATE_RE.search(text)
                if po_no_match:
                    purchase_order_no = po_no_match.group(1)
                if po_date_match:
                    purchase_order_date = po_date_match.group(1)

            temp_row = None  # row currently collecting continuation lines
            for line in text.split("\n"):
                match = _PO_ROW_RE.match(line)
                if match:
                    # A new row closes the previous one.
                    if temp_row:
                        data.append(temp_row)
                    temp_row = {
                        "S. No": match[1],
                        "Material No": match[2],
                        "Material Description": match[3].strip(),
                        "Qty": int(match[4]),
                        "Unit": match[5],
                        "Price": float(match[6]),
                        "Delivery Date": match[7],
                        "Total Value": float(match[8]),
                        "Vat%": float(match[9]),
                        "Amount Incl. VAT": float(match[10]),
                    }
                elif temp_row:
                    # Non-matching line: continuation of the description.
                    temp_row["Material Description"] += f" {line.strip()}"

            # Rows do not carry over to the next page.
            if temp_row:
                data.append(temp_row)

    # Remove footer/total text that was swept into descriptions.
    for row in data:
        description = row["Material Description"]
        for unwanted in _PO_UNWANTED_RES:
            description = unwanted.sub("", description).strip()
        row["Material Description"] = description

    # Explicit columns keep the schema stable even when nothing matched.
    df = pd.DataFrame(data, columns=_PO_ROW_COLUMNS)

    if purchase_order_no and purchase_order_date:
        df.insert(0, "Purchase Order No", purchase_order_no)
        df.insert(1, "Purchase Order Date", purchase_order_date)
    return df
def process_and_save(pdf_file, output_format):
    """
    Process the uploaded PDF and save the extracted data as CSV or Excel.

    Args:
        pdf_file: Uploaded file object exposing its path as ``.name``;
            a plain path string is accepted as well.
        output_format: Either ``"csv"`` or ``"xlsx"``.

    Returns:
        str: Path of the written file, ``output.<output_format>``.

    Raises:
        ValueError: If ``output_format`` is not supported or no file was given.
    """
    # Validate first: otherwise an unselected/unknown format would write
    # nothing and still return a path to a file that does not exist.
    if output_format not in ("csv", "xlsx"):
        raise ValueError(
            f"Unsupported output format: {output_format!r} (expected 'csv' or 'xlsx')"
        )
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    df = extract_po_data(getattr(pdf_file, "name", pdf_file))

    output_file = f"output.{output_format}"
    if output_format == "csv":
        df.to_csv(output_file, index=False)
    else:
        df.to_excel(output_file, index=False, engine="openpyxl")
    return output_file
def gradio_interface(pdf_file, output_format):
    """
    Gradio callback: convert the uploaded PDF and return the output file path.

    Delegates directly to ``process_and_save`` with the same arguments.
    """
    return process_and_save(pdf_file, output_format)
# Gradio app: a PDF upload plus an output-format choice in, a downloadable
# file out.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Radio(["csv", "xlsx"], label="Output Format"),
    ],
    outputs=gr.File(label="Download Output"),
    title="Enhanced PO Data Extractor",
    description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols. Download as CSV or Excel.",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()