"""Extract purchase-order table rows from a PDF into a pandas DataFrame."""

import pandas as pd

try:
    import pdfplumber
except ImportError:
    # Row alignment is plain list processing, so keep the module importable
    # without the PDF backend; parse_bhel_pdf reports the missing dependency.
    pdfplumber = None

# Rows whose first cell contains one of these are dropped outright.
_METADATA_KEYWORDS = (
    "GSTIN",
    "Currency",
    "Payment Terms",
    "General Terms",
    "Delivery Schedule",
)

# Rows whose first cell contains one of these are treated as fragments of an
# entry and merged into the pending buffer.
_CONTINUATION_KEYWORDS = ("Material Number", "HSN Code", "IGST")


def _first_cell_text(row):
    """Return the first cell of *row* as text, or "" if it is missing/None."""
    if not row or row[0] is None:
        return ""
    return str(row[0])


def preprocess_rows(rows, expected_columns: int) -> list:
    """Filter and group raw table rows ahead of DataFrame conversion.

    Each row is classified by its first cell and its length:

    * rows whose first cell contains a metadata keyword are discarded;
    * rows with exactly ``expected_columns`` cells are kept as-is (any
      pending buffered fragment is flushed just before them);
    * rows whose first cell contains a continuation keyword are buffered:
      the first such row starts the buffer, and the first cell of each
      following one is appended to the buffer's last cell, separated by a
      space;
    * everything else is collected and printed as "Unalignable row".

    A ``None`` or missing first cell is treated as empty text, so such rows
    are classified by length alone instead of raising. Input rows are never
    modified; the buffer works on a copy.

    NOTE: a buffered fragment keeps the length of the row that started it,
    which is never ``expected_columns``, so callers that filter on length
    (as ``parse_bhel_pdf`` does) will drop it.

    Args:
        rows: Iterable of rows, each a sequence of cell values.
        expected_columns: Number of cells a complete row must have.

    Returns:
        A list of rows in input order: complete rows plus any buffered
        fragments.
    """
    aligned_rows = []
    buffer = []
    unalignable_rows = []  # Captured so they can be printed for inspection.

    for row in rows:
        first_cell = _first_cell_text(row)

        # Irrelevant metadata / header lines are skipped regardless of length.
        if any(keyword in first_cell for keyword in _METADATA_KEYWORDS):
            continue

        if len(row) == expected_columns:
            # Flush the pending fragment first so output keeps input order.
            if buffer:
                aligned_rows.append(buffer)
                buffer = []
            aligned_rows.append(row)
        elif any(keyword in first_cell for keyword in _CONTINUATION_KEYWORDS):
            if buffer:
                last = buffer[-1]
                # A None last cell has nothing to prefix the fragment with.
                buffer[-1] = (
                    first_cell if last is None else f"{last} {first_cell}"
                )
            else:
                # Copy so later appends do not mutate the caller's row.
                buffer = list(row)
        else:
            unalignable_rows.append(row)

    # Keep whatever fragment is still pending at the end of the table.
    if buffer:
        aligned_rows.append(buffer)

    for row in unalignable_rows:
        print(f"Unalignable row: {row}")

    return aligned_rows


def parse_bhel_pdf(pdf_path) -> pd.DataFrame:
    """Read the table on every page of a PDF into one DataFrame.

    For each page, ``page.extract_table()`` is called; the first row of the
    table is dropped (presumably a header row -- confirm against the source
    documents) and the rest is passed through ``preprocess_rows``. Only rows
    with exactly as many cells as there are column names are kept; others
    are printed as "Skipping unalignable row".

    Args:
        pdf_path: Path (or other input accepted by ``pdfplumber.open``) of
            the PDF to parse.

    Returns:
        A DataFrame with the fixed purchase-order columns; it has no rows
        if no page yields a complete row.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    if pdfplumber is None:
        raise ImportError("pdfplumber is required to parse PDF files")

    columns = [
        "Purchase Order No",
        "Date",
        "Sl No",
        "Material Description",
        "Unit",
        "Quantity",
        "Dely Qty",
        "Dely Date",
        "Unit Rate",
        "Value",
    ]
    expected_columns = len(columns)
    data = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if not table:
                # No table detected on this page.
                continue
            # Preprocess and align rows before DataFrame conversion.
            for row in preprocess_rows(table[1:], expected_columns):
                if len(row) == expected_columns:
                    data.append(row)
                else:
                    print(f"Skipping unalignable row: {row}")

    return pd.DataFrame(data, columns=columns)