# POExtraction_UC3 / parse_bhel.py
# Extracts purchase-order header data and material line items from a BHEL
# purchase-order PDF and writes the combined rows to an Excel file.
import re
import pandas as pd
import pdfplumber
# Define the target columns based on your table headers.
# The first two values come from the PO header line; the remaining
# eight come from each material line.
columns = [
"Purchase Order No", "Date", "Sl No", "Material Description",
"Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
]
# Initialize an empty DataFrame with the defined columns
data = pd.DataFrame(columns=columns)
# Define regex patterns to identify and parse required lines
# PO header line shape: ten digits, " / ", then a dd.mm.yyyy date.
po_pattern = re.compile(r'^\d{10} / \d{2}\.\d{2}\.\d{4}') # Purchase Order pattern
# Material line shape: a 1-3 digit serial number followed by a space.
material_pattern = re.compile(r'^\d{1,3} ') # Pattern for lines starting with Sl No
# Function to clean and split a material line into its column fields
def clean_and_split_line(line, expected_fields=8):
    """Split a material line into its per-item column fields.

    A material line carries only the 8 item columns (Sl No through
    Value); the PO number and date come from a separate header line.
    The original check compared against all 10 table headers
    (``len(columns)``), which rejected every valid row — only
    ``cleaned_data[0]`` .. ``cleaned_data[7]`` are ever consumed.

    Args:
        line: Raw text line taken from the PDF page.
        expected_fields: Number of fields a valid row must split into
            (default 8, i.e. the table headers minus the two PO-header
            columns).

    Returns:
        The list of fields when the line splits into exactly
        ``expected_fields`` parts, otherwise ``None``.
    """
    # Columns in the extracted PDF text are separated by runs of two or
    # more spaces; single spaces occur inside the description field.
    parts = re.split(r'\s{2,}', line.strip())
    return parts if len(parts) == expected_fields else None
# Process the PDF: pair each material line with the most recent PO
# header, then export everything to Excel.
rows = []       # collected material rows; building a list and creating the
                # DataFrame once replaces DataFrame.append, which was
                # removed in pandas 2.0 (and was quadratic besides)
po_no = None    # current PO number/date; material rows seen before any
po_date = None  # header are tagged None instead of raising NameError

with pdfplumber.open('your_pdf_file.pdf') as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        # extract_text() returns None for pages with no text layer
        # (e.g. scanned images) — skip them instead of crashing.
        if not text:
            continue
        for line in text.splitlines():
            # Check for Purchase Order header row
            if po_pattern.match(line):
                po_no, po_date = line.split(' / ')[:2]
            # Check if the line contains material data
            elif material_pattern.match(line):
                cleaned_data = clean_and_split_line(line)
                if cleaned_data:
                    rows.append({
                        "Purchase Order No": po_no,
                        "Date": po_date,
                        "Sl No": cleaned_data[0],
                        "Material Description": cleaned_data[1],
                        "Unit": cleaned_data[2],
                        "Quantity": cleaned_data[3],
                        "Dely Qty": cleaned_data[4],
                        "Dely Date": cleaned_data[5],
                        "Unit Rate": cleaned_data[6],
                        "Value": cleaned_data[7],
                    })
            # All other lines are irrelevant and skipped implicitly.

# Build the result DataFrame in one shot from the collected rows;
# passing columns= keeps the column order even when rows is empty.
data = pd.DataFrame(rows, columns=columns)
# Save extracted data to an Excel file
data.to_excel("extracted_data.xlsx", index=False)