import pdfplumber import pandas as pd import re import tempfile def extract_toshiba_data(pdf_file): data = [] purchase_order, order_date = None, None with pdfplumber.open(pdf_file) as pdf: for page in pdf.pages: text = page.extract_text().splitlines() # Extract Purchase Order and Order Date if not already found if not purchase_order or not order_date: for line in text: po_match = re.search(r'Purchase Order\s*:\s*(P\d+)', line) date_match = re.search(r'Order Date\s*:\s*([\d-]+)', line) if po_match: purchase_order = po_match.group(1) if date_match: order_date = date_match.group(1) # Extract item details using patterns for line in text: # Match each line with expected pattern for item rows item_match = re.match(r'(\d+)\s+(\d+)\s+(.*?)\s+([\d-]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)', line) if item_match: pos = int(item_match.group(1)) # Position number item_code = item_match.group(2) # Item Code item_name = item_match.group(3).strip() # Item Name/Description (if available) delivery_date = item_match.group(4) # Delivery Date quantity = float(item_match.group(5)) # Quantity basic_price = float(item_match.group(6)) # Basic Price amount = float(item_match.group(7)) # Calculated Amount sub_total = float(item_match.group(8)) # Subtotal or final price data.append([purchase_order, order_date, pos, item_code, item_name, delivery_date, quantity, basic_price, amount, sub_total]) # Define DataFrame with the corrected structure df = pd.DataFrame(data, columns=["Purchase Order", "Order Date", "Pos", "Item Code", "Item Name", "Delivery Date", "Quantity", "Basic Price", "Amount", "SUB TOTAL"]) # Save to Excel file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") df.to_excel(temp_file.name, index=False) return temp_file.name