# POExtraction_UC3 / parse_bhel.py
# Extracts purchase-order header data and material line items from a BHEL
# purchase-order PDF and writes the combined rows to an Excel file.
import re
import pandas as pd
import pdfplumber
# Define the target columns based on your table headers.
# The first two values come from the PO header line; the remaining
# eight come from each material line.
columns = [
"Purchase Order No", "Date", "Sl No", "Material Description",
"Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
]
# Initialize an empty DataFrame with the defined columns
data = pd.DataFrame(columns=columns)
# Define regex patterns to identify and parse required lines
# PO header line shape: ten digits, " / ", then a dd.mm.yyyy date.
po_pattern = re.compile(r'^\d{10} / \d{2}\.\d{2}\.\d{4}') # Purchase Order pattern
# Material line shape: a 1-3 digit serial number followed by a space.
material_pattern = re.compile(r'^\d{1,3} ') # Pattern for lines starting with Sl No
# Function to clean and split a material line into its column fields
def clean_and_split_line(line, expected_fields=8):
    """Split a material line into its per-item column fields.

    A material line carries only the 8 item columns (Sl No through
    Value); the PO number and date come from a separate header line.
    The original check compared against all 10 table headers
    (``len(columns)``), which rejected every valid row — only
    ``cleaned_data[0]`` .. ``cleaned_data[7]`` are ever consumed.

    Args:
        line: Raw text line taken from the PDF page.
        expected_fields: Number of fields a valid row must split into
            (default 8, i.e. the table headers minus the two PO-header
            columns).

    Returns:
        The list of fields when the line splits into exactly
        ``expected_fields`` parts, otherwise ``None``.
    """
    # Columns in the extracted PDF text are separated by runs of two or
    # more spaces; single spaces occur inside the description field.
    parts = re.split(r'\s{2,}', line.strip())
    return parts if len(parts) == expected_fields else None
# Process the PDF: pair each material line with the most recent PO
# header, then export everything to Excel.
rows = []       # collected material rows; building a list and creating the
                # DataFrame once replaces DataFrame.append, which was
                # removed in pandas 2.0 (and was quadratic besides)
po_no = None    # current PO number/date; material rows seen before any
po_date = None  # header are tagged None instead of raising NameError

with pdfplumber.open('your_pdf_file.pdf') as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        # extract_text() returns None for pages with no text layer
        # (e.g. scanned images) — skip them instead of crashing.
        if not text:
            continue
        for line in text.splitlines():
            # Check for Purchase Order header row
            if po_pattern.match(line):
                po_no, po_date = line.split(' / ')[:2]
            # Check if the line contains material data
            elif material_pattern.match(line):
                cleaned_data = clean_and_split_line(line)
                if cleaned_data:
                    rows.append({
                        "Purchase Order No": po_no,
                        "Date": po_date,
                        "Sl No": cleaned_data[0],
                        "Material Description": cleaned_data[1],
                        "Unit": cleaned_data[2],
                        "Quantity": cleaned_data[3],
                        "Dely Qty": cleaned_data[4],
                        "Dely Date": cleaned_data[5],
                        "Unit Rate": cleaned_data[6],
                        "Value": cleaned_data[7],
                    })
            # All other lines are irrelevant and skipped implicitly.

# Build the result DataFrame in one shot from the collected rows;
# passing columns= keeps the column order even when rows is empty.
data = pd.DataFrame(rows, columns=columns)
# Save extracted data to an Excel file
data.to_excel("extracted_data.xlsx", index=False)