# Spaces: Runtime error (hosting-page status text captured with the source; not part of the program)
import pdfplumber | |
import pandas as pd | |
def preprocess_rows(rows, expected_columns):
    """Filter and align raw table rows before DataFrame conversion.

    Each row is classified by its first cell and its length:

    * rows whose first cell mentions a metadata keyword are dropped;
    * rows with exactly ``expected_columns`` cells are kept as-is (any
      pending part-row buffer is flushed to the output first);
    * rows whose first cell mentions "Material Number", "HSN Code" or
      "IGST" are treated as fragments of an entry and merged into a buffer;
    * everything else is reported on stdout as unalignable and discarded.

    Args:
        rows: Iterable of rows, each a list of cells. Cells may be ``None``
            (presumably how the table extractor reports empty cells) and
            rows may be empty; both are tolerated.
        expected_columns: Number of cells a complete row must have.

    Returns:
        A list of rows. Complete rows are the original row objects; a
        buffered part-row is a new list, so the input rows are never
        mutated. Buffered part-rows may have a length other than
        ``expected_columns``; callers are expected to check.
    """
    metadata_keywords = (
        "GSTIN", "Currency", "Payment Terms", "General Terms", "Delivery Schedule",
    )
    continuation_keywords = ("Material Number", "HSN Code", "IGST")

    aligned_rows = []
    buffer = []
    unalignable_rows = []  # Captured only so they can be printed for inspection

    for row in rows:
        # A missing, None or non-text first cell is treated as empty text so
        # the keyword checks below cannot raise TypeError / IndexError.
        first_cell = row[0] if row and isinstance(row[0], str) else ""

        # Skip irrelevant metadata or header rows.
        if any(keyword in first_cell for keyword in metadata_keywords):
            continue

        if len(row) == expected_columns:
            # A complete row ends whatever part-row was being accumulated.
            if buffer:
                aligned_rows.append(buffer)
                buffer = []
            aligned_rows.append(row)
        elif any(keyword in first_cell for keyword in continuation_keywords):
            if buffer:
                # Append the fragment to the last cell of the buffered row;
                # a None last cell is simply replaced by the fragment.
                last_cell = buffer[-1]
                buffer[-1] = (
                    first_cell if last_cell is None else f"{last_cell} {first_cell}"
                )
            else:
                # Copy so later in-place merges do not modify the caller's row.
                buffer = list(row)
        else:
            unalignable_rows.append(row)

    # Flush any part-row still pending at the end of the input.
    if buffer:
        aligned_rows.append(buffer)

    for row in unalignable_rows:
        print(f"Unalignable row: {row}")

    return aligned_rows
def parse_bhel_pdf(pdf_path):
    """Extract the table rows of a PDF into a pandas DataFrame.

    For every page, the first extracted table (if any) is taken, its first
    row is dropped, and the remaining rows are passed through
    ``preprocess_rows``. Only rows with exactly as many cells as there are
    column names are kept; the others are reported on stdout and skipped.

    Args:
        pdf_path: Path (or whatever ``pdfplumber.open`` accepts) of the PDF.

    Returns:
        A DataFrame with one row per kept table row and the fixed set of
        ten column names defined below.
    """
    header = [
        "Purchase Order No", "Date", "Sl No", "Material Description",
        "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
    ]
    width = len(header)
    records = []

    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            extracted = page.extract_table()
            if not extracted:
                # Page without a detectable table: nothing to collect.
                continue

            # Drop the table's first row, then align what is left.
            for candidate in preprocess_rows(extracted[1:], width):
                if len(candidate) != width:
                    print(f"Skipping unalignable row: {candidate}")
                    continue
                records.append(candidate)

    # Build the DataFrame from the rows that matched the expected width.
    return pd.DataFrame(records, columns=header)