import re

import pandas as pd

# Target columns matching the PDF table headers.
columns = [
    "Purchase Order No", "Date", "Sl No", "Material Description", "Unit",
    "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value",
]

# Header line: "<10-digit PO number> / <dd.mm.yyyy>".
po_pattern = re.compile(r'^\d{10} / \d{2}\.\d{2}\.\d{4}')
# Material line: starts with a 1-3 digit serial number followed by a space.
material_pattern = re.compile(r'^\d{1,3} ')


def clean_and_split_line(line):
    """Split a material row on runs of 2+ spaces.

    A material line carries every column EXCEPT the PO number and date
    (those come from the preceding purchase-order header line), so the
    expected field count is len(columns) - 2, not len(columns).
    Returns the list of fields, or None if the line doesn't align.
    """
    parts = re.split(r'\s{2,}', line.strip())
    return parts if len(parts) == len(columns) - 2 else None


def extract_rows(lines):
    """Parse text lines into row dicts, tracking the current PO header.

    po_no/po_date start as None so a stray material line before any PO
    header is skipped instead of raising NameError.
    """
    rows = []
    po_no = po_date = None
    for line in lines:
        if po_pattern.match(line):
            po_data = line.split(' / ')
            po_no = po_data[0]
            po_date = po_data[1]
        elif material_pattern.match(line):
            cleaned_data = clean_and_split_line(line)
            if cleaned_data and po_no is not None:
                rows.append(dict(zip(columns, [po_no, po_date] + cleaned_data)))
        # All other lines are irrelevant and skipped.
    return rows


def extract_pdf_to_excel(pdf_path='your_pdf_file.pdf',
                         out_path='extracted_data.xlsx'):
    """Extract PO/material rows from *pdf_path* and save them to *out_path*.

    Returns the resulting DataFrame. Rows are collected in a plain list and
    the frame is built once at the end (DataFrame.append was removed in
    pandas 2.0 and was quadratic anyway).
    """
    # Local import keeps the pure parsing helpers usable without pdfplumber.
    import pdfplumber

    rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # extract_text() returns None for pages with no extractable text.
            if text:
                rows.extend(extract_rows(text.splitlines()))

    data = pd.DataFrame(rows, columns=columns)
    data.to_excel(out_path, index=False)
    return data


if __name__ == "__main__":
    extract_pdf_to_excel()