neerajkalyank committed on
Commit
6d9fb15
1 Parent(s): 6e8ab5b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -65
app.py CHANGED
@@ -2,6 +2,7 @@ import pdfplumber
2
  import pandas as pd
3
  import gradio as gr
4
  import re
 
5
 
6
  # Define function to extract data
7
  def extract_data(pdf_file):
@@ -12,71 +13,84 @@ def extract_data(pdf_file):
12
  purchase_order_no = "Not Found"
13
  purchase_order_date = "Not Found"
14
 
15
- with pdfplumber.open(pdf_file) as pdf:
16
- for page in pdf.pages:
17
- text = page.extract_text().splitlines()
18
-
19
- # Attempt to dynamically extract Purchase Order No and Date from the first page
20
- for line in text:
21
- # Search for Purchase Order No
22
- po_match = re.search(r'Purchase Order No[:\s]+(\d+)', line, re.IGNORECASE)
23
- if po_match:
24
- purchase_order_no = po_match.group(1)
25
-
26
- # Search for Date
27
- date_match = re.search(r'Date[:\s]+(\d{2}\.\d{2}\.\d{4})', line, re.IGNORECASE)
28
- if date_match:
29
- purchase_order_date = date_match.group(1)
30
-
31
- # Stop if both values are found
32
- if purchase_order_no != "Not Found" and purchase_order_date != "Not Found":
33
- break
34
-
35
- # Process lines to extract row data, looking for rows that start with SI No
36
- for line in text:
37
- try:
38
- # Match lines that start with an SI number (e.g., "10", "20")
39
- si_no_match = re.match(r'^(\d+)\s', line)
40
- if si_no_match:
41
- parts = line.split()
42
-
43
- # Extract SI No
44
- si_no = parts[0]
45
-
46
- # Extract Material Number and format the Material Description
47
- material_number = parts[2] if len(parts) > 2 else "Unknown"
48
- material_desc = f"BPS 017507\nMaterial Number: {material_number}\nHSN Code: 8310\nIGST: 18%"
49
-
50
- # Extract Unit, Quantity, Dely Qty, Dely Date, Unit Rate, and Value
51
- unit = parts[3] if len(parts) > 3 else "NO" # Default to "NO" if not found
52
- quantity = int(parts[4]) if len(parts) > 4 else 0
53
- dely_qty = int(parts[5]) if len(parts) > 5 else 0
54
- dely_date = parts[6] if len(parts) > 6 else "Unknown"
55
- unit_rate = float(parts[7]) if len(parts) > 7 else 0.0
56
- value = float(parts[8]) if len(parts) > 8 else 0.0
57
-
58
- # Append extracted data in the specified order
59
- data.append([
60
- purchase_order_no,
61
- purchase_order_date,
62
- si_no,
63
- material_desc,
64
- unit,
65
- quantity,
66
- dely_qty,
67
- dely_date,
68
- unit_rate,
69
- value
70
- ])
71
- except (ValueError, IndexError) as e:
72
- print(f"Error processing line: {line} - {e}")
73
- continue # Skip lines that do not match the expected format
74
-
75
- # Convert data to DataFrame and save as Excel
76
- df = pd.DataFrame(data, columns=columns)
77
- excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
78
- df.to_excel(excel_path, index=False)
79
-
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  # Log warning if data was not found for Purchase Order No or Date
81
  if purchase_order_no == "Not Found" or purchase_order_date == "Not Found":
82
  print("Warning: 'Purchase Order No' or 'Date' was not found in the PDF.")
 
2
  import pandas as pd
3
  import gradio as gr
4
  import re
5
+ import tempfile
6
 
7
  # Define function to extract data
8
  def extract_data(pdf_file):
 
13
  purchase_order_no = "Not Found"
14
  purchase_order_date = "Not Found"
15
 
16
+ try:
17
+ with pdfplumber.open(pdf_file) as pdf:
18
+ for page in pdf.pages:
19
+ text = page.extract_text()
20
+
21
+ if not text:
22
+ continue # Skip pages without text
23
+
24
+ lines = text.splitlines()
25
+
26
+ # Attempt to dynamically extract Purchase Order No and Date from the first page
27
+ for line in lines:
28
+ # Search for Purchase Order No
29
+ po_match = re.search(r'Purchase Order No[:\s]+(\d+)', line, re.IGNORECASE)
30
+ if po_match:
31
+ purchase_order_no = po_match.group(1)
32
+
33
+ # Search for Date
34
+ date_match = re.search(r'Date[:\s]+(\d{2}\.\d{2}\.\d{4})', line, re.IGNORECASE)
35
+ if date_match:
36
+ purchase_order_date = date_match.group(1)
37
+
38
+ # Stop if both values are found
39
+ if purchase_order_no != "Not Found" and purchase_order_date != "Not Found":
40
+ break
41
+
42
+ # Process lines to extract row data, looking for rows that start with SI No
43
+ for line in lines:
44
+ try:
45
+ # Match lines that start with an SI number (e.g., "10", "20")
46
+ si_no_match = re.match(r'^(\d+)\s', line)
47
+ if si_no_match:
48
+ parts = line.split()
49
+
50
+ # Extract SI No
51
+ si_no = parts[0]
52
+
53
+ # Extract Material Number and format the Material Description
54
+ material_number = parts[2] if len(parts) > 2 else "Unknown"
55
+ material_desc = f"BPS 017507\nMaterial Number: {material_number}\nHSN Code: 8310\nIGST: 18%"
56
+
57
+ # Extract Unit, Quantity, Dely Qty, Dely Date, Unit Rate, and Value
58
+ unit = parts[3] if len(parts) > 3 else "NO" # Default to "NO" if not found
59
+ quantity = int(parts[4]) if len(parts) > 4 else 0
60
+ dely_qty = int(parts[5]) if len(parts) > 5 else 0
61
+ dely_date = parts[6] if len(parts) > 6 else "Unknown"
62
+ unit_rate = float(parts[7]) if len(parts) > 7 else 0.0
63
+ value = float(parts[8]) if len(parts) > 8 else 0.0
64
+
65
+ # Append extracted data in the specified order
66
+ data.append([
67
+ purchase_order_no,
68
+ purchase_order_date,
69
+ si_no,
70
+ material_desc,
71
+ unit,
72
+ quantity,
73
+ dely_qty,
74
+ dely_date,
75
+ unit_rate,
76
+ value
77
+ ])
78
+ except (ValueError, IndexError) as e:
79
+ print(f"Error processing line: {line} - {e}")
80
+ continue # Skip lines that do not match the expected format
81
+
82
+ # Convert data to DataFrame and save as Excel
83
+ df = pd.DataFrame(data, columns=columns)
84
+
85
+ # Generate a temporary file path for the Excel file
86
+ with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file:
87
+ excel_path = tmp_file.name
88
+ df.to_excel(excel_path, index=False)
89
+
90
+ except Exception as e:
91
+ print(f"An error occurred while processing the PDF: {e}")
92
+ return None
93
+
94
  # Log warning if data was not found for Purchase Order No or Date
95
  if purchase_order_no == "Not Found" or purchase_order_date == "Not Found":
96
  print("Warning: 'Purchase Order No' or 'Date' was not found in the PDF.")