# POExtraction_UC3 / parse_bhel.py
# Last commit: "Update parse_bhel.py" (da34543, verified) by DSatishchandra
import pdfplumber
import pandas as pd
def preprocess_rows(rows, expected_columns):
aligned_rows = []
buffer = []
unalignable_rows = [] # Capture unaligned rows for inspection
for row in rows:
# Check if the row contains irrelevant metadata or headers
if any(keyword in row[0] for keyword in ["GSTIN", "Currency", "Payment Terms", "General Terms", "Delivery Schedule"]):
continue
# If row matches expected length, add directly
if len(row) == expected_columns:
if buffer:
aligned_rows.append(buffer) # Add any buffered row first
buffer = [] # Reset buffer
aligned_rows.append(row)
# If row contains part of an entry (such as "Material Number" or "HSN Code")
elif "Material Number" in row[0] or "HSN Code" in row[0] or "IGST" in row[0]:
if buffer:
buffer[-1] += " " + row[0] # Append to last column in buffer
else:
buffer = row # Initialize buffer with this part-row
else:
# If unalignable, add to unalignable_rows for debugging
unalignable_rows.append(row)
# Log any remaining buffered content
if buffer:
aligned_rows.append(buffer)
# Print unalignable rows for analysis
for row in unalignable_rows:
print(f"Unalignable row: {row}")
return aligned_rows
def parse_bhel_pdf(pdf_path):
columns = [
"Purchase Order No", "Date", "Sl No", "Material Description",
"Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
]
expected_columns = len(columns)
data = []
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
table = page.extract_table()
if table:
# Preprocess and align rows before DataFrame conversion
rows = preprocess_rows(table[1:], expected_columns)
for row in rows:
if len(row) == expected_columns:
data.append(row)
else:
print(f"Skipping unalignable row: {row}")
# Convert aligned rows into a DataFrame
df = pd.DataFrame(data, columns=columns)
return df