import re

import pandas as pd

# Target columns matching the PDF table headers.
columns = [
    "Purchase Order No", "Date", "Sl No", "Material Description", "Unit",
    "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value",
]

# Header line: "<10-digit PO number> / <dd.mm.yyyy>".
po_pattern = re.compile(r'^\d{10} / \d{2}\.\d{2}\.\d{4}')
# Material line: starts with a 1-3 digit serial number followed by a space.
material_pattern = re.compile(r'^\d{1,3} ')


def clean_and_split_line(line):
    """Split a material row on runs of 2+ spaces.

    A material line carries every column EXCEPT the PO number and date
    (those come from the preceding purchase-order header line), so the
    expected field count is len(columns) - 2, not len(columns).
    Returns the list of fields, or None if the line doesn't align.
    """
    parts = re.split(r'\s{2,}', line.strip())
    return parts if len(parts) == len(columns) - 2 else None


def extract_rows(lines):
    """Parse text lines into row dicts, tracking the current PO header.

    po_no/po_date start as None so a stray material line before any PO
    header is skipped instead of raising NameError.
    """
    rows = []
    po_no = po_date = None
    for line in lines:
        if po_pattern.match(line):
            po_data = line.split(' / ')
            po_no = po_data[0]
            po_date = po_data[1]
        elif material_pattern.match(line):
            cleaned_data = clean_and_split_line(line)
            if cleaned_data and po_no is not None:
                rows.append(dict(zip(columns, [po_no, po_date] + cleaned_data)))
        # All other lines are irrelevant and skipped.
    return rows


def extract_pdf_to_excel(pdf_path='your_pdf_file.pdf',
                         out_path='extracted_data.xlsx'):
    """Extract PO/material rows from *pdf_path* and save them to *out_path*.

    Returns the resulting DataFrame. Rows are collected in a plain list and
    the frame is built once at the end (DataFrame.append was removed in
    pandas 2.0 and was quadratic anyway).
    """
    # Local import keeps the pure parsing helpers usable without pdfplumber.
    import pdfplumber

    rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # extract_text() returns None for pages with no extractable text.
            if text:
                rows.extend(extract_rows(text.splitlines()))

    data = pd.DataFrame(rows, columns=columns)
    data.to_excel(out_path, index=False)
    return data


if __name__ == "__main__":
    extract_pdf_to_excel()