DSatishchandra committed on
Commit
8a41643
·
verified ·
1 Parent(s): c4a3be0

Update parse_bhel.py

Browse files
Files changed (1) hide show
  1. parse_bhel.py +32 -4
parse_bhel.py CHANGED
@@ -1,23 +1,51 @@
1
  import pdfplumber
2
  import pandas as pd
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def parse_bhel_pdf(pdf_path):
5
  columns = [
6
  "Purchase Order No", "Date", "Sl No", "Material Description",
7
  "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
8
  ]
 
9
  data = []
10
 
11
  with pdfplumber.open(pdf_path) as pdf:
12
  for page in pdf.pages:
13
  table = page.extract_table()
14
  if table:
15
- for row in table[1:]: # Skip header row
16
- # Only add rows that have exactly 10 columns
17
- if len(row) == 10:
 
 
18
  data.append(row)
19
  else:
20
- print(f"Skipping row due to column mismatch: {row}")
21
 
22
  # Create a DataFrame with the specified columns
23
  df = pd.DataFrame(data, columns=columns)
 
1
  import pdfplumber
2
  import pandas as pd
3
 
4
def preprocess_rows(rows, expected_columns):
    """Align extracted table rows, folding runs of malformed rows together.

    Rows whose length equals ``expected_columns`` pass through unchanged.
    Any other row is treated as a fragment: consecutive fragments are merged
    cell-by-cell (joined with a single space) into one buffered row, which is
    emitted just before the next well-formed row, or at the end of the input.

    Args:
        rows: Iterable of rows, each a sequence of cell values. Cells that
            are ``None`` or empty contribute nothing to a merge.
        expected_columns: Number of cells a well-formed row has.

    Returns:
        A new list of rows in input order. Merged fragment rows are not
        guaranteed to have ``expected_columns`` cells, so the caller must
        still check their length.
    """
    # Local import keeps this function self-contained within the module.
    from itertools import zip_longest

    aligned_rows = []
    buffer = []

    for row in rows:
        if len(row) == expected_columns:
            # Flush any pending fragment so output order follows input order.
            if buffer:
                aligned_rows.append(buffer)
                buffer = []
            aligned_rows.append(row)
        elif buffer:
            # zip_longest (rather than zip) keeps the extra cells when the
            # new fragment is longer than what has been buffered so far.
            merged = []
            for held, extra in zip_longest(buffer, row):
                if not extra:
                    merged.append(held)
                elif not held:
                    # Avoid producing "None text" or a stray leading space.
                    merged.append(extra)
                else:
                    merged.append(f"{held} {extra}")
            buffer = merged
        else:
            # Copy so the buffer never aliases the caller's row.
            buffer = list(row)

    # Emit a fragment left over after the last well-formed row.
    if buffer:
        aligned_rows.append(buffer)

    return aligned_rows
28
+
29
  def parse_bhel_pdf(pdf_path):
30
  columns = [
31
  "Purchase Order No", "Date", "Sl No", "Material Description",
32
  "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
33
  ]
34
+ expected_columns = len(columns)
35
  data = []
36
 
37
  with pdfplumber.open(pdf_path) as pdf:
38
  for page in pdf.pages:
39
  table = page.extract_table()
40
  if table:
41
+ # Skip the header row and preprocess rows to align data
42
+ rows = preprocess_rows(table[1:], expected_columns)
43
+ for row in rows:
44
+ # Only add rows that match the expected number of columns after preprocessing
45
+ if len(row) == expected_columns:
46
  data.append(row)
47
  else:
48
+ print(f"Skipping unalignable row: {row}")
49
 
50
  # Create a DataFrame with the specified columns
51
  df = pd.DataFrame(data, columns=columns)