import os

import pdfplumber
import pytesseract
from PIL import Image


def extract_text_with_ocr(pdf_file: str, *, resolution: int = 300) -> None:
    """Run OCR on every page of a PDF and print the recognised text.

    Each page is rendered to an image with pdfplumber, passed to
    ``pytesseract.image_to_string``, and the result is printed together
    with a 40-dash separator line.

    Args:
        pdf_file: Path of the PDF file to process.
        resolution: Rendering resolution passed to ``page.to_image``.
            Defaults to 300, the value previously hard-coded.

    Returns:
        None. All output goes to stdout. If ``pdf_file`` does not exist or
        is not a regular file, an error message is printed and the
        function returns without opening anything.
    """
    # Check the path before handing it to pdfplumber so the user gets a
    # readable message instead of a traceback.
    if not os.path.exists(pdf_file):
        print(f"Error: The file '{pdf_file}' does not exist.")
        return
    # A directory passes the existence check but cannot be opened as a PDF.
    if not os.path.isfile(pdf_file):
        print(f"Error: The path '{pdf_file}' is not a file.")
        return

    separator = "-" * 40

    with pdfplumber.open(pdf_file) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Render the page to an image; `.original` is the image object
            # that is handed to the OCR engine.
            image = page.to_image(resolution=resolution).original

            # Use OCR to extract text from the rendered image.
            text = pytesseract.image_to_string(image)

            # OCR output for a blank page is often whitespace only (e.g. a
            # lone form feed), so test the stripped text; a plain
            # truthiness check would report such pages as having content.
            if text and text.strip():
                print(f"Page {page_number} OCR Content:\n{text}\n{separator}\n")
            else:
                print(
                    f"Page {page_number} has no extractable text even with OCR."
                    f"\n{separator}\n"
                )


# Usage example
file_path = '/mnt/data/Toshiba PO.pdf'  # Make sure this is the correct path to your PDF file
extract_text_with_ocr(file_path)