import re

import pandas as pd
import pdfplumber

# Patterns are compiled once at import time because they are applied to every
# line of the extracted text.
# An item row starts with the item number followed by the description text.
_ITEM_ROW_RE = re.compile(r"^(?P<Item>\d+)\s+(?P<Description>.+)")
# Quantity immediately followed by one of the two recognised units.
_QTY_UNIT_RE = re.compile(r"(?P<Qty>\d+)\s+(?P<Unit>Nos\.|Set)")
# Unit price and total price are the last two numeric tokens on a line.
_PRICES_RE = re.compile(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$")


# Function: Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, one page after another.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF to
            open (for example an open file or an upload wrapper).

    Returns:
        The extracted text of all pages joined with newlines.
    """
    with pdfplumber.open(pdf_file.name) as pdf:
        # ``or ""`` guards against pages for which extract_text() yields
        # nothing (NOTE(review): some pdfplumber versions return None here).
        # Pages are joined with a newline so the last row of one page is not
        # fused with the first row of the next.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


# Function: Parse PO Items
def parse_po_items_with_filters(text):
    """Parse purchase-order item rows out of extracted PDF text.

    A line that starts with a number followed by text opens a new item. Every
    following line that does not open a new item is treated as a continuation
    of the current item's description. Quantity/unit and unit/total price are
    picked up from any line belonging to the current item (including the line
    that opened it). Lines that appear before the first item row are ignored.

    Known limitation: a continuation line that itself begins with a number
    followed by text is indistinguishable from a new item row and will start
    a new item.

    Args:
        text: Text extracted from the PDF; ``None`` or empty text yields no
            items.

    Returns:
        A ``(df, status)`` tuple. ``df`` is a ``pandas.DataFrame`` with the
        columns ``Item``, ``Description``, ``Qty``, ``Unit``, ``Unit Price``
        and ``Total Price`` (all values are strings), or ``None`` when no item
        row was found. ``status`` is a human-readable message.
    """
    data = []
    current_item = {}
    description_parts = []

    def _flush_current():
        # Finalise the description of the item being built and store it.
        current_item["Description"] = " ".join(description_parts).strip()
        data.append(current_item)

    for line in (text or "").splitlines():
        item_match = _ITEM_ROW_RE.match(line)
        if item_match:
            # A new item row closes the previous item, if any.
            if current_item:
                _flush_current()
            current_item = {
                "Item": item_match.group("Item"),
                "Description": "",
                "Qty": "",
                "Unit": "",
                "Unit Price": "",
                "Total Price": "",
            }
            description_parts = [item_match.group("Description")]
        elif current_item:
            # Continuation of a description split across lines; blank lines
            # carry no text and would only add stray spaces.
            continuation = line.strip()
            if continuation:
                description_parts.append(continuation)
        else:
            # Text before the first item row must not create a partial record.
            continue

        qty_match = _QTY_UNIT_RE.search(line)
        if qty_match:
            current_item["Qty"] = qty_match.group("Qty")
            current_item["Unit"] = qty_match.group("Unit")

        price_match = _PRICES_RE.search(line)
        if price_match:
            current_item["Unit Price"] = price_match.group("UnitPrice")
            current_item["Total Price"] = price_match.group("TotalPrice")

    # Save the last item
    if current_item:
        _flush_current()

    if not data:
        return None, "No items found. Please check the PDF file format."

    df = pd.DataFrame(data)
    return df, "Data extracted successfully."
# Function: Save to Excel
def save_to_excel(df, output_path="federal_electric_extracted_data.xlsx"):
    """Write *df* to an Excel file and return the path it was written to.

    Args:
        df: Object exposing ``to_excel`` (a DataFrame); written without its
            index column.
        output_path: Destination file path.

    Returns:
        ``output_path`` unchanged.
    """
    df.to_excel(output_path, index=False)
    return output_path


# Main function to process PDF
def process_pdf(file):
    """Run the full pipeline: extract text, parse items, save to Excel.

    Args:
        file: Passed straight through to ``extract_text_from_pdf``.

    Returns:
        A ``(path, status)`` tuple. ``path`` is the Excel file written by
        ``save_to_excel`` on success, or ``None`` when parsing produced no
        table or when any step raised; ``status`` is the parser's message or
        an ``"Error during processing: ..."`` description of the exception.
    """
    try:
        extracted = extract_text_from_pdf(file)
        table, message = parse_po_items_with_filters(extracted)
        # Nothing parsed: report the parser's message without writing a file.
        if table is None:
            return None, message
        return save_to_excel(table), message
    except Exception as e:
        # Top-level boundary: report the failure to the caller instead of
        # letting it propagate.
        return None, f"Error during processing: {str(e)}"