neerajkalyank committed on
Commit
e5ed1d6
1 Parent(s): 906860c

Update toshiba.py

Browse files
Files changed (1) hide show
  1. toshiba.py +11 -56
toshiba.py CHANGED
@@ -1,64 +1,19 @@
1
  import pdfplumber
2
- import os
3
- import pandas as pd
4
- import re
5
- import tempfile
6
-
7
def extract_toshiba_data(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    The text of every page is read with pdfplumber. The purchase-order
    number and order date are taken from lines of the form
    ``Purchase Order : P<digits>`` and ``Order Date : <digits/dashes>``.
    Every line that starts with eight whitespace-separated fields matching
    the item-row pattern is recorded as one row.

    Args:
        pdf_file: Path to the PDF file to parse.

    Returns:
        The path of a newly created temporary ``.xlsx`` file holding the
        extracted rows, or ``None`` when ``pdf_file`` does not exist. The
        temporary file is not deleted automatically; the caller owns it.
    """
    # Check if the file exists
    if not os.path.exists(pdf_file):
        print(f"Error: The file '{pdf_file}' does not exist.")
        return None

    # Compile once instead of rebuilding the patterns for every line.
    po_pattern = re.compile(r'Purchase Order\s*:\s*(P\d+)')
    date_pattern = re.compile(r'Order Date\s*:\s*([\d-]+)')
    item_pattern = re.compile(
        r'(\d+)\s+(\d+)\s+(.*?)\s+([\d-]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)'
    )

    data = []
    purchase_order, order_date = None, None

    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text:
                print(f"Page {page_num + 1} Content:\n{text}\n{'-' * 40}\n")
            else:
                print(f"Page {page_num + 1} has no extractable text.\n{'-' * 40}\n")
                continue

            lines = text.splitlines()

            # Header fields are only searched for until both have been found.
            if not purchase_order or not order_date:
                for line in lines:
                    po_match = po_pattern.search(line)
                    date_match = date_pattern.search(line)
                    if po_match:
                        purchase_order = po_match.group(1)
                        print(f"Found Purchase Order: {purchase_order}")
                    if date_match:
                        order_date = date_match.group(1)
                        print(f"Found Order Date: {order_date}")

            for line in lines:
                # ``match`` anchors at the start of the line, so only lines
                # that begin with the position number are treated as items.
                item_match = item_pattern.match(line)
                if not item_match:
                    continue

                row = [
                    purchase_order,
                    order_date,
                    int(item_match.group(1)),        # Pos
                    item_match.group(2),             # Item Code (kept as text)
                    item_match.group(3).strip(),     # Item Name
                    item_match.group(4),             # Delivery Date
                    float(item_match.group(5)),      # Quantity
                    float(item_match.group(6)),      # Basic Price
                    float(item_match.group(7)),      # Amount
                    float(item_match.group(8)),      # SUB TOTAL
                ]
                data.append(row)
                print(f"Matched Item Row: {row}")

    df = pd.DataFrame(
        data,
        columns=[
            "Purchase Order", "Order Date", "Pos", "Item Code", "Item Name",
            "Delivery Date", "Quantity", "Basic Price", "Amount", "SUB TOTAL",
        ],
    )

    # Reserve a unique file name, then close the handle before pandas writes
    # to it: leaving it open leaks a descriptor and blocks the write on
    # platforms that lock open files.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        output_path = temp_file.name
    df.to_excel(output_path, index=False)
    print(f"Data extracted to: {output_path}")
    return output_path
59
 
60
  # Usage example
61
# Ensure this is the correct path to the PDF file.
file_path = 'Toshiba PO.pdf'

# Run the extraction; a falsy result means no workbook was produced.
output_file = extract_toshiba_data(file_path)
if output_file:
    print(f"Extracted data saved to: {output_file}")
 
1
  import pdfplumber
2
+ from PIL import Image
3
+ import pytesseract
 
 
 
 
 
 
 
 
 
 
 
4
 
5
def extract_text_with_ocr(pdf_file):
    """OCR every page of a PDF, print the recognised text, and return it.

    Each page is rendered to an image through pdfplumber at a resolution of
    300 and handed to ``pytesseract.image_to_string``.

    Args:
        pdf_file: Path to the PDF file, passed straight to
            ``pdfplumber.open``.

    Returns:
        A list with one string per page, in page order, holding the raw OCR
        output for that page (whitespace-only when nothing was recognised).
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Convert the page to an image
            image = page.to_image(resolution=300).original
            # Use OCR to extract text from the image
            text = pytesseract.image_to_string(image)
            page_texts.append(text)

            # OCR output for a blank page is usually whitespace or a trailing
            # form feed rather than an empty string, so test the stripped text.
            if text.strip():
                print(f"Page {page_num} OCR Content:\n{text}\n{'-' * 40}\n")
            else:
                print(f"Page {page_num} has no extractable text even with OCR.\n{'-' * 40}\n")
    return page_texts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Usage example
18
# Make sure this path points to your PDF file.
file_path = 'Toshiba PO.pdf'

# Print the OCR text of every page in the document.
extract_text_with_ocr(file_path)