Spaces:
Runtime error
Runtime error
File size: 2,268 Bytes
ac7dc42 8a41643 da34543 8a41643 da34543 8a41643 da34543 8a41643 da34543 8a41643 da34543 8a41643 da34543 8a41643 da34543 8a41643 da34543 8a41643 ac7dc42 8a41643 ac7dc42 da34543 8a41643 c4a3be0 8a41643 ac7dc42 da34543 ac7dc42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import pdfplumber
import pandas as pd
def preprocess_rows(rows, expected_columns):
    """Align raw table rows to a fixed column count.

    Rows are handled in order:

    * Rows whose first cell contains a metadata/header keyword are dropped.
    * Rows with exactly ``expected_columns`` cells are kept as-is. Any
      pending buffered partial row is flushed to the output first.
    * Rows whose first cell contains "Material Number", "HSN Code" or
      "IGST" are treated as fragments: the first fragment starts a buffer,
      and each later fragment's first cell is appended (space-separated,
      without a leading space) to the buffer's last cell.
    * Everything else, including empty rows, is collected and printed as
      unalignable.

    A missing (``None``) first cell is treated as an empty string, so such
    rows never match a keyword but are still kept when their length matches.
    Input rows are never mutated; the buffer is built from a copy.

    Args:
        rows: Iterable of rows, each a sequence of cell values
            (strings or ``None``).
        expected_columns: Number of cells a fully aligned row must have.

    Returns:
        A list of rows. Buffered fragment rows are included even when their
        length differs from ``expected_columns``; the caller is expected to
        filter by length.
    """
    skip_keywords = (
        "GSTIN", "Currency", "Payment Terms", "General Terms", "Delivery Schedule",
    )
    continuation_keywords = ("Material Number", "HSN Code", "IGST")

    aligned_rows = []
    buffer = []
    unalignable_rows = []  # Captured for inspection, printed at the end

    for row in rows:
        # An empty (or missing) row has no first cell to inspect.
        if not row:
            unalignable_rows.append(row)
            continue

        # Empty cells may arrive as None; normalise before substring tests.
        first_cell = row[0] if row[0] is not None else ""

        # Drop irrelevant metadata or header rows.
        if any(keyword in first_cell for keyword in skip_keywords):
            continue

        if len(row) == expected_columns:
            if buffer:
                aligned_rows.append(buffer)  # Flush the pending partial row first
                buffer = []
            aligned_rows.append(row)
        elif any(keyword in first_cell for keyword in continuation_keywords):
            if buffer:
                last_cell = buffer[-1]
                # Guard against a None/empty last cell instead of raising
                # TypeError or producing a leading space.
                buffer[-1] = f"{last_cell} {first_cell}" if last_cell else first_cell
            else:
                # Copy so later appends do not mutate the caller's row.
                buffer = list(row)
        else:
            unalignable_rows.append(row)

    # Keep any fragment row still pending at the end of the input.
    if buffer:
        aligned_rows.append(buffer)

    for row in unalignable_rows:
        print(f"Unalignable row: {row}")

    return aligned_rows
def parse_bhel_pdf(pdf_path):
    """Extract purchase-order line items from a PDF into a DataFrame.

    For every page, the table returned by ``page.extract_table()`` is taken,
    its first row is discarded, and the remaining rows are passed through
    ``preprocess_rows``. Only rows whose length equals the number of output
    columns are kept; the rest are reported on stdout and skipped.

    Args:
        pdf_path: Path (or other source accepted by ``pdfplumber.open``)
            of the PDF to parse.

    Returns:
        A ``pandas.DataFrame`` with one row per kept table row and the ten
        columns listed in ``headers`` below.
    """
    headers = [
        "Purchase Order No", "Date", "Sl No", "Material Description",
        "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
    ]
    width = len(headers)
    records = []

    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            extracted = page.extract_table()
            if not extracted:
                # Nothing table-like was found on this page.
                continue

            # Align the rows (minus the first one) before filtering by width.
            for candidate in preprocess_rows(extracted[1:], width):
                if len(candidate) != width:
                    print(f"Skipping unalignable row: {candidate}")
                    continue
                records.append(candidate)

    # Build the frame from the aligned rows only.
    return pd.DataFrame(records, columns=headers)
|