Spaces:
Runtime error
Runtime error
DSatishchandra
committed on
Update parse_bhel.py
Browse files- parse_bhel.py +24 -13
parse_bhel.py
CHANGED
@@ -4,26 +4,38 @@ import pandas as pd
|
|
4 |
def preprocess_rows(rows, expected_columns):
|
5 |
aligned_rows = []
|
6 |
buffer = []
|
7 |
-
|
|
|
8 |
for row in rows:
|
9 |
-
#
|
|
|
|
|
|
|
|
|
10 |
if len(row) == expected_columns:
|
11 |
-
# If there's buffered content from previous rows, add it before this row
|
12 |
if buffer:
|
13 |
-
aligned_rows.append(buffer)
|
14 |
-
buffer = [] #
|
15 |
aligned_rows.append(row)
|
16 |
-
|
17 |
-
|
|
|
18 |
if buffer:
|
19 |
-
buffer
|
20 |
else:
|
21 |
-
buffer = row # Initialize
|
|
|
|
|
|
|
22 |
|
23 |
-
#
|
24 |
if buffer:
|
25 |
aligned_rows.append(buffer)
|
26 |
|
|
|
|
|
|
|
|
|
27 |
return aligned_rows
|
28 |
|
29 |
def parse_bhel_pdf(pdf_path):
|
@@ -38,15 +50,14 @@ def parse_bhel_pdf(pdf_path):
|
|
38 |
for page in pdf.pages:
|
39 |
table = page.extract_table()
|
40 |
if table:
|
41 |
-
#
|
42 |
rows = preprocess_rows(table[1:], expected_columns)
|
43 |
for row in rows:
|
44 |
-
# Only add rows that match the expected number of columns after preprocessing
|
45 |
if len(row) == expected_columns:
|
46 |
data.append(row)
|
47 |
else:
|
48 |
print(f"Skipping unalignable row: {row}")
|
49 |
|
50 |
-
#
|
51 |
df = pd.DataFrame(data, columns=columns)
|
52 |
return df
|
|
|
4 |
def preprocess_rows(rows, expected_columns):
    """Align raw table rows before they are turned into a DataFrame.

    Rows are handled in order:

    * rows whose first cell contains a metadata keyword are dropped;
    * rows with exactly ``expected_columns`` cells are kept as-is (any
      pending buffered part-row is emitted just before them);
    * shorter/longer rows whose first cell mentions "Material Number",
      "HSN Code" or "IGST" are treated as continuation fragments and merged
      into a buffer;
    * everything else is collected and printed as "unalignable".

    Args:
        rows: Iterable of rows, each a list of cell values. Cells may be
            ``None`` and rows may be empty; both are tolerated.
        expected_columns: Number of cells a fully aligned row must have.

    Returns:
        A list of rows. Buffered fragments are returned too, so callers that
        need strictly aligned output must still filter on length.
    """
    metadata_keywords = (
        "GSTIN",
        "Currency",
        "Payment Terms",
        "General Terms",
        "Delivery Schedule",
    )
    continuation_keywords = ("Material Number", "HSN Code", "IGST")

    aligned_rows = []
    buffer = []
    unalignable_rows = []  # Capture unaligned rows for inspection

    for row in rows:
        # An empty row or a missing/non-text first cell cannot be searched
        # for keywords, so treat it as empty text rather than crashing.
        first_cell = row[0] if row and isinstance(row[0], str) else ""

        # Skip irrelevant metadata or header rows.
        if any(keyword in first_cell for keyword in metadata_keywords):
            continue

        if len(row) == expected_columns:
            if buffer:
                aligned_rows.append(buffer)  # Add any buffered row first
                buffer = []  # Reset buffer
            aligned_rows.append(row)
        elif any(keyword in first_cell for keyword in continuation_keywords):
            if not buffer:
                # Copy so later in-place merging never mutates the caller's row.
                buffer = list(row)
            elif buffer[-1] is None:
                # Nothing to append to yet; the fragment becomes the cell text.
                buffer[-1] = first_cell
            else:
                buffer[-1] += " " + first_cell  # Append to last buffered cell
        else:
            # Keep rows that fit no rule so they can be reported below.
            unalignable_rows.append(row)

    # Flush whatever part-row is still pending at the end of the table.
    if buffer:
        aligned_rows.append(buffer)

    # Print unalignable rows for analysis
    for row in unalignable_rows:
        print(f"Unalignable row: {row}")

    return aligned_rows
|
40 |
|
41 |
def parse_bhel_pdf(pdf_path):
|
|
|
50 |
for page in pdf.pages:
|
51 |
table = page.extract_table()
|
52 |
if table:
|
53 |
+
# Preprocess and align rows before DataFrame conversion
|
54 |
rows = preprocess_rows(table[1:], expected_columns)
|
55 |
for row in rows:
|
|
|
56 |
if len(row) == expected_columns:
|
57 |
data.append(row)
|
58 |
else:
|
59 |
print(f"Skipping unalignable row: {row}")
|
60 |
|
61 |
+
# Convert aligned rows into a DataFrame
|
62 |
df = pd.DataFrame(data, columns=columns)
|
63 |
return df
|