DSatishchandra committed on
Commit
d2251a9
1 Parent(s): 4a02ad0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -0
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import pandas as pd
3
+ import gradio as gr
4
+
5
+ # Define function to extract data
6
def extract_data(pdf_file):
    """Extract purchase-order line items from a PDF and save them to Excel.

    Every text line of every page is split on whitespace and interpreted
    positionally as::

        SI-No  desc-word-1  desc-word-2  Unit  Quantity  Dely-Qty  Dely-Date  Unit-Rate  Value

    Lines that do not fit this shape (too few tokens, or non-numeric text
    where a number is expected) are skipped silently, as are lines whose
    serial number lies outside the inclusive range 10..1150.

    Args:
        pdf_file: The PDF to read; passed unchanged to ``pdfplumber.open``.

    Returns:
        The path of the written workbook,
        ``"/tmp/Extracted_Purchase_Order_Data.xlsx"``.
    """
    columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]

    # Only rows whose serial number falls inside this inclusive range are kept.
    start_si, end_si = 10, 1150

    data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Depending on the pdfplumber version, extract_text() may return
            # None for a page without extractable text (e.g. a scanned image).
            # Treat that as "no lines" instead of crashing on .splitlines().
            page_text = page.extract_text() or ""
            for line in page_text.splitlines():
                parts = line.split()
                try:
                    si_no = int(parts[0])
                    if not start_si <= si_no <= end_si:
                        continue
                    row = [
                        si_no,
                        # NOTE(review): the description is assumed to be
                        # exactly two words; longer descriptions shift the
                        # remaining columns and the line is usually rejected
                        # by the numeric conversions below.
                        " ".join(parts[1:3]),
                        parts[3],          # unit
                        int(parts[4]),     # quantity
                        int(parts[5]),     # delivered quantity
                        parts[6],          # delivery date, kept as text
                        float(parts[7]),   # unit rate
                        float(parts[8]),   # value
                    ]
                except (ValueError, IndexError):
                    # Best-effort parsing: headers, footers and free text
                    # are expected to fail here and are simply ignored.
                    continue
                data.append(row)

    df = pd.DataFrame(data, columns=columns)
    # NOTE(review): the output path is fixed, so each call overwrites the
    # file produced by the previous one.
    excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
    df.to_excel(excel_path, index=False)
    return excel_path
35
+
36
# Build the web UI: a single file upload wired to extract_data, with the
# function's returned path offered back as a file download.
iface = gr.Interface(
    title="PDF Data Extractor",
    fn=extract_data,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
)

# Start serving the interface.
iface.launch()