DSatishchandra committed on
Commit
da34543
·
verified ·
1 Parent(s): 8a41643

Update parse_bhel.py

Browse files
Files changed (1) hide show
  1. parse_bhel.py +24 -13
parse_bhel.py CHANGED
@@ -4,26 +4,38 @@ import pandas as pd
4
  def preprocess_rows(rows, expected_columns):
5
  aligned_rows = []
6
  buffer = []
7
-
 
8
  for row in rows:
9
- # If the row has the correct number of columns, add it as-is
 
 
 
 
10
  if len(row) == expected_columns:
11
- # If there's buffered content from previous rows, add it before this row
12
  if buffer:
13
- aligned_rows.append(buffer)
14
- buffer = [] # Clear the buffer
15
  aligned_rows.append(row)
16
- else:
17
- # If row has fewer columns, treat it as a continuation and add to the buffer
 
18
  if buffer:
19
- buffer = [f"{b} {r}" if r else b for b, r in zip(buffer, row + [""] * (len(buffer) - len(row)))]
20
  else:
21
- buffer = row # Initialize the buffer with the row
 
 
 
22
 
23
- # If there's any remaining buffered row, add it to aligned rows
24
  if buffer:
25
  aligned_rows.append(buffer)
26
 
 
 
 
 
27
  return aligned_rows
28
 
29
  def parse_bhel_pdf(pdf_path):
@@ -38,15 +50,14 @@ def parse_bhel_pdf(pdf_path):
38
  for page in pdf.pages:
39
  table = page.extract_table()
40
  if table:
41
- # Skip the header row and preprocess rows to align data
42
  rows = preprocess_rows(table[1:], expected_columns)
43
  for row in rows:
44
- # Only add rows that match the expected number of columns after preprocessing
45
  if len(row) == expected_columns:
46
  data.append(row)
47
  else:
48
  print(f"Skipping unalignable row: {row}")
49
 
50
- # Create a DataFrame with the specified columns
51
  df = pd.DataFrame(data, columns=columns)
52
  return df
 
4
  def preprocess_rows(rows, expected_columns):
5
  aligned_rows = []
6
  buffer = []
7
+ unalignable_rows = [] # Capture unaligned rows for inspection
8
+
9
  for row in rows:
10
+ # Check if the row contains irrelevant metadata or headers
11
+ if any(keyword in row[0] for keyword in ["GSTIN", "Currency", "Payment Terms", "General Terms", "Delivery Schedule"]):
12
+ continue
13
+
14
+ # If row matches expected length, add directly
15
  if len(row) == expected_columns:
 
16
  if buffer:
17
+ aligned_rows.append(buffer) # Add any buffered row first
18
+ buffer = [] # Reset buffer
19
  aligned_rows.append(row)
20
+
21
+ # If row contains part of an entry (such as "Material Number" or "HSN Code")
22
+ elif "Material Number" in row[0] or "HSN Code" in row[0] or "IGST" in row[0]:
23
  if buffer:
24
+ buffer[-1] += " " + row[0] # Append to last column in buffer
25
  else:
26
+ buffer = row # Initialize buffer with this part-row
27
+ else:
28
+ # If unalignable, add to unalignable_rows for debugging
29
+ unalignable_rows.append(row)
30
 
31
+ # Log any remaining buffered content
32
  if buffer:
33
  aligned_rows.append(buffer)
34
 
35
+ # Print unalignable rows for analysis
36
+ for row in unalignable_rows:
37
+ print(f"Unalignable row: {row}")
38
+
39
  return aligned_rows
40
 
41
  def parse_bhel_pdf(pdf_path):
 
50
  for page in pdf.pages:
51
  table = page.extract_table()
52
  if table:
53
+ # Preprocess and align rows before DataFrame conversion
54
  rows = preprocess_rows(table[1:], expected_columns)
55
  for row in rows:
 
56
  if len(row) == expected_columns:
57
  data.append(row)
58
  else:
59
  print(f"Skipping unalignable row: {row}")
60
 
61
+ # Convert aligned rows into a DataFrame
62
  df = pd.DataFrame(data, columns=columns)
63
  return df