Spaces:

neerajkalyank
/

pdf1excel

Runtime error

App Files Community

neerajkalyank committed on Nov 8

Commit

e5ed1d6

•

1 Parent(s): 906860c

Update toshiba.py

Browse files

Files changed (1) hide show

toshiba.py +11 -56

toshiba.py CHANGED Viewed

@@ -1,64 +1,19 @@
 import pdfplumber
-import os
-import pandas as pd
-import re
-import tempfile
-def extract_toshiba_data(pdf_file):
-    # Check if the file exists
-    if not os.path.exists(pdf_file):
-        print(f"Error: The file '{pdf_file}' does not exist.")
-        return None
-    data = []
-    purchase_order, order_date = None, None
     with pdfplumber.open(pdf_file) as pdf:
         for page_num, page in enumerate(pdf.pages):
-            text = page.extract_text()
             if text:
-                print(f"Page {page_num + 1} Content:\n{text}\n{'-' * 40}\n")
             else:
-                print(f"Page {page_num + 1} has no extractable text.\n{'-' * 40}\n")
-                continue
-            lines = text.splitlines()
-            if not purchase_order or not order_date:
-                for line in lines:
-                    po_match = re.search(r'Purchase Order\s*:\s*(P\d+)', line)
-                    date_match = re.search(r'Order Date\s*:\s*([\d-]+)', line)
-                    if po_match:
-                        purchase_order = po_match.group(1)
-                        print(f"Found Purchase Order: {purchase_order}")
-                    if date_match:
-                        order_date = date_match.group(1)
-                        print(f"Found Order Date: {order_date}")
-            for line in lines:
-                item_match = re.match(r'(\d+)\s+(\d+)\s+(.*?)\s+([\d-]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)', line)
-                if item_match:
-                    pos = int(item_match.group(1))
-                    item_code = item_match.group(2)
-                    item_name = item_match.group(3).strip()
-                    delivery_date = item_match.group(4)
-                    quantity = float(item_match.group(5))
-                    basic_price = float(item_match.group(6))
-                    amount = float(item_match.group(7))
-                    sub_total = float(item_match.group(8))
-                    data.append([purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total])
-                    print(f"Matched Item Row: {[purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total]}")
-    df = pd.DataFrame(data, columns=["Purchase Order", "Order Date", "Pos", "Item Code", "Item Name", "Delivery Date", "Quantity", "Basic Price", "Amount", "SUB TOTAL"])
-    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
-    df.to_excel(temp_file.name, index=False)
-    print(f"Data extracted to: {temp_file.name}")
-    return temp_file.name
 # Usage example
-file_path = 'Toshiba PO.pdf'  # Ensure this is the correct path to the PDF file
-output_file = extract_toshiba_data(file_path)
-if output_file:
-    print(f"Extracted data saved to: {output_file}")

 import pdfplumber
+from PIL import Image
+import pytesseract
+def extract_text_with_ocr(pdf_file):
     with pdfplumber.open(pdf_file) as pdf:
         for page_num, page in enumerate(pdf.pages):
+            # Convert the page to an image
+            image = page.to_image(resolution=300).original
+            # Use OCR to extract text from the image
+            text = pytesseract.image_to_string(image)
             if text:
+                print(f"Page {page_num + 1} OCR Content:\n{text}\n{'-' * 40}\n")
             else:
+                print(f"Page {page_num + 1} has no extractable text even with OCR.\n{'-' * 40}\n")
 # Usage example
+file_path = 'Toshiba PO.pdf'  # Make sure this path points to your PDF file
+extract_text_with_ocr(file_path)