DSatishchandra committed on
Commit
b051e96
1 Parent(s): da34543

Update parse_bhel.py

Browse files
Files changed (1) hide show
  1. parse_bhel.py +54 -59
parse_bhel.py CHANGED
@@ -1,63 +1,58 @@
1
- import pdfplumber
2
  import pandas as pd
 
3
 
4
def preprocess_rows(rows, expected_columns):
    """Align raw table rows extracted from the BHEL purchase-order PDF.

    Rows that already have ``expected_columns`` cells pass through
    unchanged.  Continuation fragments (lines starting with
    "Material Number", "HSN Code" or "IGST") are merged into a buffer so
    multi-line entries become one row; anything else is reported as
    unalignable and dropped.

    Parameters
    ----------
    rows : list[list]
        Raw rows from pdfplumber's ``extract_table`` — cells may be
        ``None`` and rows may be empty, so the first cell is guarded.
    expected_columns : int
        Number of cells a complete row must have.

    Returns
    -------
    list[list]
        The aligned rows; a trailing buffered fragment is appended as-is.
    """
    aligned_rows = []
    buffer = []
    unalignable_rows = []  # kept so problem rows can be printed below

    skip_keywords = ("GSTIN", "Currency", "Payment Terms",
                     "General Terms", "Delivery Schedule")
    fragment_keywords = ("Material Number", "HSN Code", "IGST")

    for row in rows:
        # pdfplumber can emit empty rows or None cells; the original
        # code did `keyword in row[0]` directly, which raises TypeError
        # on None and IndexError on an empty row.
        first_cell = (row[0] or "") if row else ""

        # Drop metadata/header rows entirely.
        if any(keyword in first_cell for keyword in skip_keywords):
            continue

        if len(row) == expected_columns:
            if buffer:
                aligned_rows.append(buffer)  # flush buffered fragment first
                buffer = []
            aligned_rows.append(row)
        elif any(keyword in first_cell for keyword in fragment_keywords):
            if buffer:
                buffer[-1] += " " + first_cell  # extend last buffered cell
            else:
                buffer = row  # start a new fragment buffer with this row
        else:
            unalignable_rows.append(row)

    # Flush any fragment still buffered at end of input.
    if buffer:
        aligned_rows.append(buffer)

    # Surface rows we could not align so the PDF layout can be inspected.
    for row in unalignable_rows:
        print(f"Unalignable row: {row}")

    return aligned_rows
40
-
41
def parse_bhel_pdf(pdf_path):
    """Parse a BHEL purchase-order PDF into a pandas DataFrame.

    Opens *pdf_path* with pdfplumber, extracts a table from every page,
    realigns fragmented rows via ``preprocess_rows``, and keeps only the
    rows that match the expected column count.

    Parameters
    ----------
    pdf_path : str
        Path to the purchase-order PDF.

    Returns
    -------
    pandas.DataFrame
        One row per aligned table row, with the fixed PO header columns.
    """
    header = [
        "Purchase Order No", "Date", "Sl No", "Material Description",
        "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
    ]
    width = len(header)
    collected = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if not table:
                continue
            # Skip the header row, realign fragments, then filter by width.
            for candidate in preprocess_rows(table[1:], width):
                if len(candidate) == width:
                    collected.append(candidate)
                else:
                    print(f"Skipping unalignable row: {candidate}")

    # Convert aligned rows into a DataFrame.
    return pd.DataFrame(collected, columns=header)
 
1
+ import re
2
  import pandas as pd
3
+ import pdfplumber
4
 
5
# Target columns for the extracted table.  The first two come from the
# purchase-order header line; the remaining eight from each material line.
columns = [
    "Purchase Order No", "Date", "Sl No", "Material Description",
    "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
]

# Accumulator DataFrame for the parsed rows (filled by the main loop).
data = pd.DataFrame(columns=columns)

# Regex patterns used to classify lines of the extracted page text.
po_pattern = re.compile(r'^\d{10} / \d{2}\.\d{2}\.\d{4}')  # "PO-no / date" header
material_pattern = re.compile(r'^\d{1,3} ')  # material lines start with a Sl No


def clean_and_split_line(line, expected_fields=len(columns) - 2):
    """Split a material line into its per-item fields.

    The line is split on runs of two or more spaces.  A material line
    carries only the 8 per-item fields (Sl No through Value); the PO
    number and date are taken from the separate header line, so the
    original check against ``len(columns)`` (10) could never match and
    every row was silently dropped.
    NOTE(review): assumes material lines hold exactly 8 fields — confirm
    against a sample PDF.

    Parameters
    ----------
    line : str
        One line of text extracted from the PDF page.
    expected_fields : int, optional
        Number of fields a valid material line must yield.

    Returns
    -------
    list[str] | None
        The field values, or ``None`` if the line does not split cleanly.
    """
    parts = re.split(r'\s{2,}', line.strip())  # split by two or more spaces
    return parts if len(parts) == expected_fields else None
23
+
24
# Process the PDF, collecting one record per material line.  Records are
# accumulated in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 (AttributeError at
# runtime) and rebuilt the frame on every call anyway.
records = []
po_no = None
po_date = None

with pdfplumber.open('your_pdf_file.pdf') as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            # extract_text() returns None for image-only pages; the
            # original called .splitlines() on it and crashed.
            continue

        for line in text.splitlines():
            # Header line: remember the PO number/date for the material
            # lines that follow it.
            if po_pattern.match(line):
                po_no, po_date = line.split(' / ')[:2]
            # Material line: only usable once a PO header has been seen
            # (the original raised NameError on po_no otherwise).
            elif material_pattern.match(line) and po_no is not None:
                cleaned_data = clean_and_split_line(line)
                if cleaned_data:
                    records.append({
                        "Purchase Order No": po_no,
                        "Date": po_date,
                        "Sl No": cleaned_data[0],
                        "Material Description": cleaned_data[1],
                        "Unit": cleaned_data[2],
                        "Quantity": cleaned_data[3],
                        "Dely Qty": cleaned_data[4],
                        "Dely Date": cleaned_data[5],
                        "Unit Rate": cleaned_data[6],
                        "Value": cleaned_data[7],
                    })
            # All other lines are irrelevant and skipped.

# Build the DataFrame once from the collected records.
data = pd.DataFrame(records, columns=columns)

# Save extracted data to an Excel file.
data.to_excel("extracted_data.xlsx", index=False)