Spaces:

DSatishchandra
/

POExtraction_UC3

Runtime error

App Files Files Community

DSatishchandra committed on Nov 16

Commit

7cbc3d2

•

1 Parent(s): 7be7132

Update parse_bhel.py

Browse files

Files changed (1) hide show

parse_bhel.py +47 -45

parse_bhel.py CHANGED Viewed

@@ -1,52 +1,54 @@
 import pdfplumber
 import pandas as pd
-def preprocess_rows(rows, expected_columns):
-    aligned_rows = []
-    buffer = []
-    for row in rows:
-        # If the row has the correct number of columns, add it as-is
-        if len(row) == expected_columns:
-            # If there's buffered content from previous rows, add it before this row
-            if buffer:
-                aligned_rows.append(buffer)
-                buffer = []  # Clear the buffer
-            aligned_rows.append(row)
-        else:
-            # If row has fewer columns, treat it as a continuation and add to the buffer
-            if buffer:
-                buffer = [f"{b} {r}" if r else b for b, r in zip(buffer, row + [""] * (len(buffer) - len(row)))]
-            else:
-                buffer = row  # Initialize the buffer with the row
-    # If there's any remaining buffered row, add it to aligned rows
-    if buffer:
-        aligned_rows.append(buffer)
-    return aligned_rows
-def parse_bhel_pdf(pdf_path):
-    columns = [
-        "Purchase Order No", "Date", "Sl No", "Material Description",
-        "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
-    ]
-    expected_columns = len(columns)
     data = []
-    with pdfplumber.open(pdf_path) as pdf:
         for page in pdf.pages:
-            table = page.extract_table()
-            if table:
-                # Skip the header row and preprocess rows to align data
-                rows = preprocess_rows(table[1:], expected_columns)
-                for row in rows:
-                    # Only add rows that match the expected number of columns after preprocessing
-                    if len(row) == expected_columns:
-                        data.append(row)
-                    else:
-                        print(f"Skipping unalignable row: {row}")
-    # Create a DataFrame with the specified columns
     df = pd.DataFrame(data, columns=columns)
-    return df

 import pdfplumber
 import pandas as pd
+import tempfile
def format_material_description(description_series, si_no):
    """Return a material description tagged with its SI number.

    This is a placeholder formatter; the result has the form
    ``"<description> (SI No: <si_no>)"``.

    Args:
        description_series: Either an object exposing ``.iloc`` (for example
            a pandas Series), in which case its first element is used, or a
            plain scalar such as a string, which is used as-is.
        si_no: Serial number to embed in the result.

    Returns:
        The formatted description string.

    Raises:
        IndexError: If ``description_series`` exposes ``.iloc`` but is empty.
    """
    # Accepting a scalar lets callers format one row at a time without
    # wrapping each value in a Series first.
    if hasattr(description_series, "iloc"):
        description = description_series.iloc[0]
    else:
        description = description_series
    return f"{description} (SI No: {si_no})"
def _parse_bhel_line(line, start_si, end_si):
    """Parse one text line into an item row, or return None if it is not one."""
    parts = line.split()
    try:
        si_no = int(parts[0])
        if not start_si <= si_no <= end_si:
            return None
        return [
            si_no,
            " ".join(parts[1:3]),  # description is taken from tokens 1 and 2 only
            parts[3],              # unit
            int(parts[4]),         # quantity
            int(parts[5]),         # delivery quantity
            parts[6],              # delivery date (kept as text)
            float(parts[7]),       # unit rate
            float(parts[8]),       # value
            # The trailing three fields are optional on a line.
            parts[9] if len(parts) > 9 else "",
            parts[10] if len(parts) > 10 else "",
            parts[11] if len(parts) > 11 else "",
        ]
    except (ValueError, IndexError):
        # Blank lines, non-numeric leading tokens, or too few / malformed
        # fields: the line is not an item row.
        return None


def extract_bhel_data(pdf_file, *, start_si=10, end_si=1150):
    """Extract item rows from a PDF and write them to a temporary .xlsx file.

    Each page's text is split into lines; a line is kept when its first
    whitespace-separated token is an integer within ``[start_si, end_si]``
    and the following tokens convert to the expected numeric fields. After
    extraction the ``SI No`` column is renumbered 10, 20, 30, ... in row
    order, and every description is passed through
    ``format_material_description`` with its new SI number.

    Args:
        pdf_file: Path or file object accepted by ``pdfplumber.open``.
        start_si: Smallest leading integer treated as an item row.
        end_si: Largest leading integer treated as an item row.

    Returns:
        Path of the temporary ``.xlsx`` file holding the extracted table.
        The file is created with ``delete=False``; removing it is left to
        the caller.
    """
    columns = [
        "SI No", "Material Description", "Unit", "Quantity", "Dely Qty",
        "Dely Date", "Unit Rate", "Value", "Material Number", "HSN Code",
        "IGST",
    ]
    data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Guard against a page with no extractable text yielding None.
            text = page.extract_text() or ""
            for line in text.splitlines():
                row = _parse_bhel_line(line, start_si, end_si)
                if row is not None:
                    data.append(row)

    df = pd.DataFrame(data, columns=columns)

    # Correct the SI No column to follow increments of 10.
    df["SI No"] = range(10, 10 + len(df) * 10, 10)

    # Reformat each description with its own corrected SI No. A one-element
    # slice is passed so the formatter sees this row's description rather
    # than always the first row of the column.
    descriptions = df["Material Description"]
    df["Material Description"] = [
        format_material_description(descriptions.iloc[[pos]], si_no)
        for pos, si_no in enumerate(df["SI No"])
    ]

    # Reserve a temp path and close the handle before pandas writes to it,
    # so the descriptor is not leaked and the file is not held open twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        output_path = temp_file.name
    df.to_excel(output_path, index=False)

    # Displaying the table is best-effort.
    # NOTE(review): ace_tools is not a dependency visible in this file and
    # is presumably unavailable in most deployments - confirm, or replace
    # with your preferred display method.
    try:
        import ace_tools as tools
    except ImportError:
        tools = None
    if tools is not None:
        tools.display_dataframe_to_user(
            name="Corrected Data with Updated SI No", dataframe=df
        )

    return output_path