Spaces:
Runtime error
Runtime error
DSatishchandra
committed on
Update parse_bhel.py
Browse files- parse_bhel.py +32 -4
parse_bhel.py
CHANGED
@@ -1,23 +1,51 @@
|
|
1 |
import pdfplumber
|
2 |
import pandas as pd
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
def parse_bhel_pdf(pdf_path):
|
5 |
columns = [
|
6 |
"Purchase Order No", "Date", "Sl No", "Material Description",
|
7 |
"Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
|
8 |
]
|
|
|
9 |
data = []
|
10 |
|
11 |
with pdfplumber.open(pdf_path) as pdf:
|
12 |
for page in pdf.pages:
|
13 |
table = page.extract_table()
|
14 |
if table:
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
18 |
data.append(row)
|
19 |
else:
|
20 |
-
print(f"Skipping row
|
21 |
|
22 |
# Create a DataFrame with the specified columns
|
23 |
df = pd.DataFrame(data, columns=columns)
|
|
|
1 |
import pdfplumber
|
2 |
import pandas as pd
|
3 |
|
4 |
+
def preprocess_rows(rows, expected_columns):
    """Separate full-width rows from runs of partial rows, merging each run.

    Rows whose length equals ``expected_columns`` are passed through
    unchanged.  Any other row is treated as a fragment: consecutive
    fragments are merged cell-by-cell (joined with a single space) into one
    buffered row, which is emitted just before the next full-width row, or
    at the end of the input.

    Args:
        rows: Iterable of table rows; each row is a sequence of cell values
            (text, empty string, or ``None`` for a missing cell).
        expected_columns: Number of cells a complete row must have.

    Returns:
        A list of rows in input order.  Merged fragment rows are not
        guaranteed to have ``expected_columns`` cells; the caller is
        responsible for checking their length.
    """
    aligned_rows = []
    buffer = []

    for row in rows:
        if len(row) == expected_columns:
            # Flush pending fragments first so output order follows input order.
            if buffer:
                aligned_rows.append(buffer)
                buffer = []
            aligned_rows.append(row)
        elif buffer:
            # Pad both sides to the same width so that no cell is dropped when
            # the new fragment is wider than what has been buffered so far.
            width = max(len(buffer), len(row))
            heads = buffer + [""] * (width - len(buffer))
            tails = list(row) + [""] * (width - len(row))
            buffer = [_merge_cells(head, tail) for head, tail in zip(heads, tails)]
        else:
            # Copy so later merging never aliases the caller's row object.
            buffer = list(row)

    # Emit whatever fragments are still pending at the end of the input.
    if buffer:
        aligned_rows.append(buffer)

    return aligned_rows


def _merge_cells(head, tail):
    """Join two cell values with a space, ignoring empty/None sides."""
    if not tail:
        return head
    if not head:
        # Avoids producing "None text" or a leading space for an empty cell.
        return tail
    return f"{head} {tail}"
|
28 |
+
|
29 |
def parse_bhel_pdf(pdf_path):
|
30 |
columns = [
|
31 |
"Purchase Order No", "Date", "Sl No", "Material Description",
|
32 |
"Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
|
33 |
]
|
34 |
+
expected_columns = len(columns)
|
35 |
data = []
|
36 |
|
37 |
with pdfplumber.open(pdf_path) as pdf:
|
38 |
for page in pdf.pages:
|
39 |
table = page.extract_table()
|
40 |
if table:
|
41 |
+
# Skip the header row and preprocess rows to align data
|
42 |
+
rows = preprocess_rows(table[1:], expected_columns)
|
43 |
+
for row in rows:
|
44 |
+
# Only add rows that match the expected number of columns after preprocessing
|
45 |
+
if len(row) == expected_columns:
|
46 |
data.append(row)
|
47 |
else:
|
48 |
+
print(f"Skipping unalignable row: {row}")
|
49 |
|
50 |
# Create a DataFrame with the specified columns
|
51 |
df = pd.DataFrame(data, columns=columns)
|