DSatishchandra committed on
Commit
7cbc3d2
1 Parent(s): 7be7132

Update parse_bhel.py

Browse files
Files changed (1) hide show
  1. parse_bhel.py +47 -45
parse_bhel.py CHANGED
@@ -1,52 +1,54 @@
1
  import pdfplumber
2
  import pandas as pd
 
3
 
4
def _merge_cells(current, addition):
    """Join two cell values with a space, ignoring empty/None sides."""
    if not addition:
        return current
    if not current:
        # Avoid producing "None <text>" or a leading space when the buffered
        # cell is empty.
        return addition
    return f"{current} {addition}"


def preprocess_rows(rows, expected_columns):
    """Group table rows into full rows and merged continuation fragments.

    Rows whose length equals ``expected_columns`` are passed through
    unchanged. Consecutive rows of any other length are treated as fragments
    of one record and merged cell-by-cell (joined with a single space) into a
    buffer, which is emitted when the next full row arrives or the input ends.

    Args:
        rows: Iterable of rows, each a list of cell values (strings or None).
        expected_columns: Number of cells a complete row must have.

    Returns:
        A list of rows: full rows as given, plus one merged row per run of
        fragments. A merged row is not guaranteed to have
        ``expected_columns`` cells, so callers must check its length.
    """
    aligned_rows = []
    buffer = []

    for row in rows:
        if len(row) == expected_columns:
            # Flush any pending fragments before the complete row.
            if buffer:
                aligned_rows.append(buffer)
                buffer = []
            aligned_rows.append(row)
        elif buffer:
            # Pad both sides to the same width so no cell is dropped when the
            # continuation row is longer than what has been buffered so far.
            width = max(len(buffer), len(row))
            padded_buffer = buffer + [""] * (width - len(buffer))
            padded_row = list(row) + [""] * (width - len(row))
            buffer = [
                _merge_cells(current, addition)
                for current, addition in zip(padded_buffer, padded_row)
            ]
        else:
            # Copy so the caller's row is never shared with the buffer.
            buffer = list(row)

    if buffer:
        aligned_rows.append(buffer)

    return aligned_rows
28
-
29
def parse_bhel_pdf(pdf_path):
    """Read the tables of a PDF into a ten-column purchase-order DataFrame.

    For every page, the first extracted table (if any) is taken, its first
    row is skipped as a header, and the remaining rows are passed through
    ``preprocess_rows``. Rows that still do not have the expected number of
    cells are reported on stdout and left out.

    Args:
        pdf_path: Path (or file object) handed to ``pdfplumber.open``.

    Returns:
        A pandas DataFrame with one row per accepted table row.
    """
    columns = [
        "Purchase Order No", "Date", "Sl No", "Material Description",
        "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
    ]
    expected_columns = len(columns)
    data = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if not table:
                # Nothing tabular on this page.
                continue

            # table[0] is the header row; only the body is aligned.
            for row in preprocess_rows(table[1:], expected_columns):
                if len(row) != expected_columns:
                    print(f"Skipping unalignable row: {row}")
                    continue
                data.append(row)

    return pd.DataFrame(data, columns=columns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pdfplumber
2
  import pandas as pd
3
+ import tempfile
4
 
5
def format_material_description(description_series, si_no):
    """Return the first description in the series tagged with its SI number.

    Placeholder formatting function; update with your logic.

    Args:
        description_series: pandas Series of descriptions; only the element
            at position 0 is used.
        si_no: Serial number to embed in the result.

    Returns:
        A string of the form ``"<first description> (SI No: <si_no>)"``.
    """
    first_entry = description_series.iloc[0]
    return f"{first_entry} (SI No: {si_no})"
8
+
9
def extract_bhel_data(pdf_file):
    """Extract line items from a PDF's text and save them as an Excel file.

    Each text line is split on whitespace. A line is accepted when its first
    token is an integer within the SI-number range and the following tokens
    parse as: two description words, unit, integer quantity, integer delivery
    quantity, delivery date, float unit rate and float value. Up to three
    further tokens are stored as material number, HSN code and IGST. Lines
    that do not fit this shape are skipped.

    After parsing, the "SI No" column is overwritten with 10, 20, 30, ... and
    each description is passed through ``format_material_description``
    together with its new SI number.

    Args:
        pdf_file: Path (or file object) handed to ``pdfplumber.open``.

    Returns:
        Path of a temporary ``.xlsx`` file containing the extracted table.
        The file is not deleted automatically; the caller owns it.
    """
    columns = [
        "SI No", "Material Description", "Unit", "Quantity", "Dely Qty",
        "Dely Date", "Unit Rate", "Value", "Material Number", "HSN Code",
        "IGST",
    ]
    start_si, end_si = 10, 1150
    data = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Guard against a None/empty result for pages without text.
            page_text = page.extract_text() or ""
            for line in page_text.splitlines():
                parts = line.split()
                try:
                    si_no = int(parts[0])
                    if not start_si <= si_no <= end_si:
                        continue
                    row = [
                        si_no,
                        " ".join(parts[1:3]),
                        parts[3],
                        int(parts[4]),
                        int(parts[5]),
                        parts[6],
                        float(parts[7]),
                        float(parts[8]),
                        parts[9] if len(parts) > 9 else "",
                        parts[10] if len(parts) > 10 else "",
                        parts[11] if len(parts) > 11 else "",
                    ]
                except (ValueError, IndexError):
                    # Not a line-item row (header, footer, wrapped text, ...).
                    continue
                data.append(row)

    df = pd.DataFrame(data, columns=columns)

    # Correct the SI No column to follow increments of 10
    df['SI No'] = range(10, 10 + len(df) * 10, 10)

    # Reapply the Material Description formatting based on the corrected SI
    # No. Each row is handed a one-element slice holding its own description;
    # passing the whole column made every row receive the first description.
    descriptions = df['Material Description']
    df['Material Description'] = [
        format_material_description(descriptions.iloc[[position]], si_no)
        for position, si_no in enumerate(df['SI No'])
    ]

    # Reserve a temporary file name for download. The handle is closed before
    # writing so it is not leaked and the path can be reopened on platforms
    # that lock open files.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        output_path = temp_file.name
    df.to_excel(output_path, index=False)

    # Display the corrected data to the user when the optional display helper
    # is installed; its absence must not prevent the export.
    try:
        import ace_tools as tools  # Replace with your preferred display method
    except ImportError:
        pass
    else:
        tools.display_dataframe_to_user(name="Corrected Data with Updated SI No", dataframe=df)

    return output_path