neerajkalyank committed on
Commit
69d8542
1 Parent(s): ff7685a

Create bhel.py

Browse files
Files changed (1) hide show
  1. bhel.py +76 -0
bhel.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+
3
+ import pandas as pd
4
+
5
+ import gradio as gr
6
+
7
+ # Define function to extract data
8
+
9
def extract_data(pdf_file, start_si=10, end_si=1150):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Each text line of each page is split on whitespace and read by position:

        SI-No, two description words, Unit, Quantity, Dely Qty,
        Dely Date, Unit Rate, Value

    A line is kept only when its first token is an integer inside
    ``[start_si, end_si]`` and all numeric fields parse. Any other line
    (headers, footers, wrapped text, short lines) is skipped silently.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts for the uploaded PDF.
        start_si: Lowest SI number to keep (inclusive). Defaults to 10.
        end_si: Highest SI number to keep (inclusive). Defaults to 1150.

    Returns:
        The path of the ``.xlsx`` file that was written.
    """
    columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
    rows = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without a text layer (e.g. a scanned image) can give
            # back None or "" from extract_text(); treat it as having no
            # lines instead of crashing on None.splitlines().
            page_text = page.extract_text() or ""

            for line in page_text.splitlines():
                parts = line.split()
                try:
                    si_no = int(parts[0])
                    if not start_si <= si_no <= end_si:
                        continue
                    # NOTE(review): the description is assumed to be exactly
                    # two tokens; rows with longer descriptions shift the
                    # remaining columns and are normally dropped by the
                    # int()/float() conversions below.
                    row = [
                        si_no,
                        " ".join(parts[1:3]),  # material description
                        parts[3],              # unit
                        int(parts[4]),         # quantity
                        int(parts[5]),         # delivery quantity
                        parts[6],              # delivery date (kept as text)
                        float(parts[7]),       # unit rate
                        float(parts[8]),       # value
                    ]
                except (ValueError, IndexError):
                    # Not a well-formed item row: best-effort skip.
                    continue
                else:
                    rows.append(row)

    df = pd.DataFrame(rows, columns=columns)
    excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
    df.to_excel(excel_path, index=False)
    return excel_path
60
+
61
+ # Set up Gradio interface
62
+
63
# One file in (the PDF), one file out (the generated workbook).
iface = gr.Interface(
    extract_data,
    gr.File(label="Upload PDF"),
    gr.File(label="Download Excel"),
    title="PDF Data Extractor",
)

# Start serving the UI.
iface.launch()