Spaces:

DSatishchandra
/

POExtraction_UC3

Runtime error

App Files Files Community

neerajkalyank commited on Nov 12

Commit

af70b59

•

1 Parent(s): 229b210

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -96

app.py CHANGED Viewed

@@ -1,108 +1,53 @@
-import pdfplumber
 import pandas as pd
 import gradio as gr
 import re
-import tempfile
-# Define function to extract data
-def extract_data(pdf_file):
-    data = []
-    columns = ["Purchase Order No", "Date", "SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
-    # Set default values
-    purchase_order_no = "Not Found"
-    purchase_order_date = "Not Found"
-    try:
-        with pdfplumber.open(pdf_file) as pdf:
-            for page in pdf.pages:
-                text = page.extract_text()
-                if not text:
-                    continue  # Skip pages without text
-                lines = text.splitlines()
-                # Attempt to dynamically extract Purchase Order No and Date from the first page
-                for line in lines:
-                    # Search for Purchase Order No
-                    po_match = re.search(r'Purchase Order No[:\s]+(\d+)', line, re.IGNORECASE)
-                    if po_match:
-                        purchase_order_no = po_match.group(1)
-                    # Search for Date
-                    date_match = re.search(r'Date[:\s]+(\d{2}\.\d{2}\.\d{4})', line, re.IGNORECASE)
-                    if date_match:
-                        purchase_order_date = date_match.group(1)
-                    # Stop if both values are found
-                    if purchase_order_no != "Not Found" and purchase_order_date != "Not Found":
-                        break
-                # Process lines to extract row data, looking for rows that start with SI No
-                for line in lines:
-                    try:
-                        # Match lines that start with an SI number (e.g., "10", "20")
-                        si_no_match = re.match(r'^(\d+)\s', line)
-                        if si_no_match:
-                            parts = line.split()
-                            # Extract SI No
-                            si_no = parts[0]
-                            # Extract Material Number and format the Material Description
-                            material_number = parts[2] if len(parts) > 2 else "Unknown"
-                            material_desc = f"BPS 017507\nMaterial Number: {material_number}\nHSN Code: 8310\nIGST: 18%"
-                            # Extract Unit, Quantity, Dely Qty, Dely Date, Unit Rate, and Value
-                            unit = parts[3] if len(parts) > 3 else "NO"  # Default to "NO" if not found
-                            quantity = int(parts[4]) if len(parts) > 4 else 0
-                            dely_qty = int(parts[5]) if len(parts) > 5 else 0
-                            dely_date = parts[6] if len(parts) > 6 else "Unknown"
-                            unit_rate = float(parts[7]) if len(parts) > 7 else 0.0
-                            value = float(parts[8]) if len(parts) > 8 else 0.0
-                            # Append extracted data in the specified order
-                            data.append([
-                                purchase_order_no,
-                                purchase_order_date,
-                                si_no,
-                                material_desc,
-                                unit,
-                                quantity,
-                                dely_qty,
-                                dely_date,
-                                unit_rate,
-                                value
-                            ])
-                    except (ValueError, IndexError) as e:
-                        print(f"Error processing line: {line} - {e}")
-                        continue  # Skip lines that do not match the expected format
-        # Convert data to DataFrame and save as Excel
-        df = pd.DataFrame(data, columns=columns)
-        # Generate a temporary file path for the Excel file
-        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file:
-            excel_path = tmp_file.name
-            df.to_excel(excel_path, index=False)
-    except Exception as e:
-        print(f"An error occurred while processing the PDF: {e}")
-        return None
-    # Log warning if data was not found for Purchase Order No or Date
-    if purchase_order_no == "Not Found" or purchase_order_date == "Not Found":
-        print("Warning: 'Purchase Order No' or 'Date' was not found in the PDF.")
-    return excel_path
 # Set up Gradio interface
 iface = gr.Interface(
     fn=extract_data,
-    inputs=gr.File(label="Upload PDF"),
-    outputs=gr.File(label="Download Excel"),
-    title="PDF Data Extractor"
 )
 # Launch the app

 import pandas as pd
 import gradio as gr
 import re
+# Define function to extract data from Excel file
+def extract_data(excel_file):
+    # Load the Excel file
+    df = pd.read_excel(excel_file)
+    # Attempt to extract 'Purchase Order No' and 'Date' from the first few rows
+    for _, row in df.iterrows():
+        # Search for Purchase Order No pattern in the row data
+        po_match = re.search(r'Purchase Order No[:\s]+(\w+)', str(row), re.IGNORECASE)
+        if po_match:
+            purchase_order_no = po_match.group(1)
+        # Search for Date pattern in the row data (e.g., "Date: 10.10.2023" or "10/10/2023")
+        date_match = re.search(r'Date[:\s]+(\d{2}[\./-]\d{2}[\./-]\d{4})', str(row), re.IGNORECASE)
+        if date_match:
+            purchase_order_date = date_match.group(1)
+        # Stop if both values are found
+        if purchase_order_no != "Not Found" and purchase_order_date != "Not Found":
+            break
+    # Required columns to keep
+    columns_to_keep = ["Purchase Order No", "Date", "SI No", "Material Description",
+                       "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
+    # Add Purchase Order No and Date columns to the DataFrame if they are missing
+    if "Purchase Order No" not in df.columns:
+        df["Purchase Order No"] = purchase_order_no
+    if "Date" not in df.columns:
+        df["Date"] = purchase_order_date
+    # Filter the DataFrame to only include relevant columns
+    df_filtered = df[columns_to_keep]
+    # Save the filtered data to a new Excel file
+    output_path = "/tmp/Filtered_Purchase_Order_Data.xlsx"
+    df_filtered.to_excel(output_path, index=False)
+    return output_path
 # Set up Gradio interface
 iface = gr.Interface(
     fn=extract_data,
+    inputs=gr.File(label="Upload Excel File"),
+    outputs=gr.File(label="Download Filtered Excel"),
+    title="Excel Data Extractor"
 )
 # Launch the app