Spaces:

DSatishchandra
/

POExtraction_UC3

Runtime error

App Files Files Community

DSatishchandra committed on Nov 12

Commit

7be7132

•

1 Parent(s): e16fdc8

Update parse_bhel.py

Browse files

Files changed (1) hide show

parse_bhel.py +45 -37

parse_bhel.py CHANGED Viewed

@@ -1,44 +1,52 @@
-import gradio as gr
 import pdfplumber
 import pandas as pd
def parse_bhel_pdf(pdf_file):
    """Extract line-item rows from a PDF by splitting its text on whitespace.

    Every text line with at least eight whitespace-separated tokens is
    treated as one item: the first token is the serial number, the last six
    tokens are the trailing fields, and whatever lies between them is joined
    back together as the material description.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts.

    Returns:
        A ``pandas.DataFrame`` with one row per matching text line.
    """
    # Names of the six trailing tokens, in the order they appear on a line.
    trailing_fields = (
        'Unit', 'Quantity', 'Dely Qty', 'Dely Date', 'Unit Rate', 'Value',
    )
    records = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                # Nothing extractable on this page.
                continue
            for raw_line in page_text.split('\n'):
                tokens = raw_line.split()
                if len(tokens) < 8:
                    # Too short to hold every field of an item row.
                    continue
                record = {
                    'Sl No': tokens[0],
                    'Material Description': " ".join(tokens[1:-6]),
                }
                record.update(zip(trailing_fields, tokens[-6:]))
                records.append(record)
    # Build the frame only after the PDF has been closed.
    frame = pd.DataFrame(records)
    return frame
def gradio_interface(pdf_file):
    """Parse the uploaded PDF and return the extracted table as HTML.

    Args:
        pdf_file: Uploaded-file object; its ``name`` attribute is passed to
            ``parse_bhel_pdf`` as the path to open.

    Returns:
        The HTML produced by ``DataFrame.to_html()`` for the parsed rows.
    """
    extracted = parse_bhel_pdf(pdf_file.name)
    html_table = extracted.to_html()
    return html_table
# Gradio interface: a single file upload in, a rendered HTML table out.
# NOTE(review): the values gr.File accepts for `type` differ between Gradio
# releases -- confirm that "file" is valid for the installed version.
_demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(type="file", label="Upload PDF File"),
    outputs="html",
    title="BHEL PDF Data Extractor",
    description="Upload a BHEL PDF file to extract structured data in a tabular format.",
)
# Start serving as soon as this module is executed.
_demo.launch()

 import pdfplumber
 import pandas as pd
def _merge_cell(buffered, incoming):
    """Combine one buffered cell with the matching continuation cell."""
    if not incoming:
        # Nothing new in this column: keep whatever was buffered.
        return buffered
    if buffered is None or buffered == "":
        # Avoid producing "None text" or a leading-space " text".
        return incoming
    return f"{buffered} {incoming}"


def preprocess_rows(rows, expected_columns):
    """Align table rows by merging runs of short (continuation) rows.

    Rows whose length equals ``expected_columns`` are passed through
    unchanged. Consecutive rows of any other length are merged column by
    column into a single pending row, which is emitted just before the next
    full-width row (or at the end of the input). Pending rows are merged
    with each other only, never into a neighbouring full-width row.

    When merging, empty cells (``None`` or ``""``) are treated as absent, and
    both rows are padded to the wider of the two so no cell is dropped.

    Args:
        rows: Iterable of rows, each a sequence of cell values
            (strings or ``None``).
        expected_columns: Number of cells a complete row must have.

    Returns:
        A list of rows in input order. Merged rows may still have a length
        other than ``expected_columns``; callers decide whether to keep them.
    """
    aligned_rows = []
    pending = []
    for row in rows:
        if len(row) == expected_columns:
            # Flush any merged continuation rows before the complete row so
            # the original ordering is preserved.
            if pending:
                aligned_rows.append(pending)
                pending = []
            aligned_rows.append(row)
        elif pending:
            # Pad both sides to the wider row; a plain zip() would silently
            # truncate a continuation row longer than the pending one.
            width = max(len(pending), len(row))
            left = list(pending) + [""] * (width - len(pending))
            right = list(row) + [""] * (width - len(row))
            pending = [_merge_cell(b, r) for b, r in zip(left, right)]
        else:
            # First short row of a run starts a new pending row.
            pending = list(row)
    if pending:
        aligned_rows.append(pending)
    return aligned_rows
def parse_bhel_pdf(pdf_path):
    """Read the table on each PDF page into a fixed-column DataFrame.

    For every page, the table returned by ``page.extract_table()`` is taken,
    its first row is dropped as the header, and the remaining rows are passed
    through ``preprocess_rows``. Rows that still do not have exactly one cell
    per column are reported on stdout and left out.

    Args:
        pdf_path: Anything ``pdfplumber.open`` accepts.

    Returns:
        A ``pandas.DataFrame`` with the ten purchase-order columns listed
        below and one row per kept table row.
    """
    header = [
        "Purchase Order No", "Date", "Sl No", "Material Description",
        "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
    ]
    width = len(header)
    kept_rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if not table:
                # No table found on this page.
                continue
            # table[0] is treated as the header row on every page.
            for row in preprocess_rows(table[1:], width):
                if len(row) != width:
                    print(f"Skipping unalignable row: {row}")
                    continue
                kept_rows.append(row)
    # Build the frame only after the PDF has been closed.
    frame = pd.DataFrame(kept_rows, columns=header)
    return frame