neerajkalyank committed on
Commit
183531d
·
verified ·
1 Parent(s): f7f8ecb

Create bhel.py

Browse files
Files changed (1) hide show
  1. bhel.py +35 -0
bhel.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import pandas as pd
3
+ import tempfile
4
+
5
def extract_bhel_data(pdf_file, start_si=10, end_si=1150):
    """Extract line-item rows from a PDF and write them to an Excel file.

    Every text line of every page is split on whitespace. A line becomes a
    row when its first token is an integer serial number within
    ``[start_si, end_si]`` and the following tokens parse into the expected
    column types; any line that does not fit is skipped.

    Args:
        pdf_file: Value passed straight to ``pdfplumber.open``.
        start_si: Smallest serial number to keep (inclusive).
        end_si: Largest serial number to keep (inclusive).

    Returns:
        The path of a temporary ``.xlsx`` file containing the extracted rows.
        The file is created with ``delete=False``, so it is not removed
        automatically; cleaning it up is left to the caller.
    """
    columns = [
        "Sl No",
        "Material Description",
        "Unit",
        "Quantity",
        "Dely Qty",
        "Dely Date",
        "Unit Rate",
        "Value",
        "Material Number",
        "HSN Code",
        "IGST",
    ]
    data = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without extractable text may give None rather than "";
            # fall back to an empty string so such pages are simply skipped.
            page_text = page.extract_text() or ""
            for line in page_text.splitlines():
                parts = line.split()
                try:
                    si_no = int(parts[0])
                    if not start_si <= si_no <= end_si:
                        continue
                    # NOTE: the description is taken as exactly two tokens
                    # (parts[1:3]); a description with a different word
                    # count shifts every later column, so such a line is
                    # either rejected below or parsed with misaligned values.
                    row = [
                        si_no,
                        " ".join(parts[1:3]),
                        parts[3],
                        int(parts[4]),
                        int(parts[5]),
                        parts[6],
                        float(parts[7]),
                        float(parts[8]),
                        # The last three columns are optional on a line.
                        parts[9] if len(parts) > 9 else "",
                        parts[10] if len(parts) > 10 else "",
                        parts[11] if len(parts) > 11 else "",
                    ]
                except (ValueError, IndexError):
                    # Not a data row (header, footer, wrapped text, ...).
                    continue
                data.append(row)

    df = pd.DataFrame(data, columns=columns)

    # Reserve a unique path, then close the handle before pandas writes to it:
    # an open NamedTemporaryFile cannot be reopened by name on every platform,
    # and leaving it open would leak the file descriptor.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        output_path = temp_file.name
    df.to_excel(output_path, index=False)
    return output_path