Spaces:
Runtime error
Runtime error
import re | |
import pandas as pd | |
import pdfplumber | |
# Define the target columns based on your table headers | |
columns = [ | |
"Purchase Order No", "Date", "Sl No", "Material Description", | |
"Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value" | |
] | |
# Initialize an empty DataFrame with the defined columns | |
data = pd.DataFrame(columns=columns) | |
# Define regex patterns to identify and parse required lines | |
po_pattern = re.compile(r'^\d{10} / \d{2}\.\d{2}\.\d{4}') # Purchase Order pattern | |
material_pattern = re.compile(r'^\d{1,3} ') # Pattern for lines starting with Sl No | |
# Function to clean and split rows | |
def clean_and_split_line(line): | |
# Split line into components based on spaces and commas | |
parts = re.split(r'\s{2,}', line.strip()) # Split by two or more spaces | |
return parts if len(parts) == len(columns) else None | |
# Process the PDF and extract relevant lines | |
with pdfplumber.open('your_pdf_file.pdf') as pdf: | |
for page in pdf.pages: | |
text = page.extract_text().splitlines() | |
for line in text: | |
# Check for Purchase Order row | |
if po_pattern.match(line): | |
po_data = line.split(' / ') | |
po_no = po_data[0] | |
po_date = po_data[1] | |
# Check if the line contains material data | |
elif material_pattern.match(line): | |
cleaned_data = clean_and_split_line(line) | |
if cleaned_data: | |
row_data = { | |
"Purchase Order No": po_no, | |
"Date": po_date, | |
"Sl No": cleaned_data[0], | |
"Material Description": cleaned_data[1], | |
"Unit": cleaned_data[2], | |
"Quantity": cleaned_data[3], | |
"Dely Qty": cleaned_data[4], | |
"Dely Date": cleaned_data[5], | |
"Unit Rate": cleaned_data[6], | |
"Value": cleaned_data[7], | |
} | |
data = data.append(row_data, ignore_index=True) | |
# Skip irrelevant lines or unalignable rows | |
else: | |
continue | |
# Save extracted data to an Excel file | |
data.to_excel("extracted_data.xlsx", index=False) | |