neerajkalyank committed on
Commit
906860c
1 Parent(s): e20c41e

Update toshiba.py

Browse files
Files changed (1) hide show
  1. toshiba.py +21 -21
toshiba.py CHANGED
@@ -1,15 +1,20 @@
1
  import pdfplumber
 
2
  import pandas as pd
3
  import re
4
  import tempfile
5
 
6
  def extract_toshiba_data(pdf_file):
 
 
 
 
 
7
  data = []
8
  purchase_order, order_date = None, None
9
 
10
  with pdfplumber.open(pdf_file) as pdf:
11
  for page_num, page in enumerate(pdf.pages):
12
- # Extract and print the raw text of each page for debugging
13
  text = page.extract_text()
14
  if text:
15
  print(f"Page {page_num + 1} Content:\n{text}\n{'-' * 40}\n")
@@ -17,48 +22,43 @@ def extract_toshiba_data(pdf_file):
17
  print(f"Page {page_num + 1} has no extractable text.\n{'-' * 40}\n")
18
  continue
19
 
20
- # Split text into lines to analyze line by line
21
  lines = text.splitlines()
22
 
23
- # Extract Purchase Order and Order Date if not already found
24
  if not purchase_order or not order_date:
25
  for line in lines:
26
  po_match = re.search(r'Purchase Order\s*:\s*(P\d+)', line)
27
  date_match = re.search(r'Order Date\s*:\s*([\d-]+)', line)
28
  if po_match:
29
  purchase_order = po_match.group(1)
30
- print(f"Found Purchase Order: {purchase_order}") # Debug
31
  if date_match:
32
  order_date = date_match.group(1)
33
- print(f"Found Order Date: {order_date}") # Debug
34
 
35
- # Attempt to match item details using a general regex pattern
36
  for line in lines:
37
  item_match = re.match(r'(\d+)\s+(\d+)\s+(.*?)\s+([\d-]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)', line)
38
  if item_match:
39
- pos = int(item_match.group(1)) # Position number
40
- item_code = item_match.group(2) # Item Code
41
- item_name = item_match.group(3).strip() # Item Name/Description
42
- delivery_date = item_match.group(4) # Delivery Date
43
- quantity = float(item_match.group(5)) # Quantity
44
- basic_price = float(item_match.group(6)) # Basic Price
45
- amount = float(item_match.group(7)) # Calculated Amount
46
- sub_total = float(item_match.group(8)) # Subtotal or final price
47
 
48
- # Append the extracted row to data list
49
  data.append([purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total])
50
- print(f"Matched Item Row: {[purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total]}") # Debug
51
 
52
- # Define DataFrame with the expected structure
53
  df = pd.DataFrame(data, columns=["Purchase Order", "Order Date", "Pos", "Item Code", "Item Name", "Delivery Date", "Quantity", "Basic Price", "Amount", "SUB TOTAL"])
54
 
55
- # Save to Excel file
56
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
57
  df.to_excel(temp_file.name, index=False)
58
  print(f"Data extracted to: {temp_file.name}")
59
  return temp_file.name
60
 
61
# Example run: extract the sample purchase order and report where the
# resulting workbook was written.
file_path = '/mnt/data/Toshiba PO.pdf'  # Point this at the PDF to process
output_file = extract_toshiba_data(file_path)
print(f"Extracted data saved to: {output_file}")
 
 
1
  import pdfplumber
2
+ import os
3
  import pandas as pd
4
  import re
5
  import tempfile
6
 
7
  def extract_toshiba_data(pdf_file):
8
+ # Check if the file exists
9
+ if not os.path.exists(pdf_file):
10
+ print(f"Error: The file '{pdf_file}' does not exist.")
11
+ return None
12
+
13
  data = []
14
  purchase_order, order_date = None, None
15
 
16
  with pdfplumber.open(pdf_file) as pdf:
17
  for page_num, page in enumerate(pdf.pages):
 
18
  text = page.extract_text()
19
  if text:
20
  print(f"Page {page_num + 1} Content:\n{text}\n{'-' * 40}\n")
 
22
  print(f"Page {page_num + 1} has no extractable text.\n{'-' * 40}\n")
23
  continue
24
 
 
25
  lines = text.splitlines()
26
 
 
27
  if not purchase_order or not order_date:
28
  for line in lines:
29
  po_match = re.search(r'Purchase Order\s*:\s*(P\d+)', line)
30
  date_match = re.search(r'Order Date\s*:\s*([\d-]+)', line)
31
  if po_match:
32
  purchase_order = po_match.group(1)
33
+ print(f"Found Purchase Order: {purchase_order}")
34
  if date_match:
35
  order_date = date_match.group(1)
36
+ print(f"Found Order Date: {order_date}")
37
 
 
38
  for line in lines:
39
  item_match = re.match(r'(\d+)\s+(\d+)\s+(.*?)\s+([\d-]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)', line)
40
  if item_match:
41
+ pos = int(item_match.group(1))
42
+ item_code = item_match.group(2)
43
+ item_name = item_match.group(3).strip()
44
+ delivery_date = item_match.group(4)
45
+ quantity = float(item_match.group(5))
46
+ basic_price = float(item_match.group(6))
47
+ amount = float(item_match.group(7))
48
+ sub_total = float(item_match.group(8))
49
 
 
50
  data.append([purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total])
51
+ print(f"Matched Item Row: {[purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total]}")
52
 
 
53
  df = pd.DataFrame(data, columns=["Purchase Order", "Order Date", "Pos", "Item Code", "Item Name", "Delivery Date", "Quantity", "Basic Price", "Amount", "SUB TOTAL"])
54
 
 
55
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
56
  df.to_excel(temp_file.name, index=False)
57
  print(f"Data extracted to: {temp_file.name}")
58
  return temp_file.name
59
 
60
# Example run: process the sample purchase order expected in the current
# working directory and report the output location when extraction succeeds.
file_path = 'Toshiba PO.pdf'  # Adjust if the PDF lives somewhere else
output_file = extract_toshiba_data(file_path)
if output_file:
    # A falsy result means the extractor reported a missing input file.
    print(f"Extracted data saved to: {output_file}")