import pdfplumber import pandas as pd import re import gradio as gr def extract_data(pdf_file): """ Extract data from the uploaded PDF for dynamic ranges (e.g., 10 to n). """ data = [] columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"] start_si = 10 # Start from SI No 10 end_si = None # Dynamically detect the end SI No with pdfplumber.open(pdf_file) as pdf: for page in pdf.pages: full_text = page.extract_text() # Get the text content for the page lines = full_text.splitlines() if full_text else [] for line in lines: try: # Parse the first column for SI No si_no_match = re.match(r"^\s*(\d+)\s", line) if not si_no_match: continue si_no = int(si_no_match.group(1)) # Dynamically set the end SI No if higher SI Nos are found if end_si is None or si_no > end_si: end_si = si_no if si_no < start_si: continue # Skip rows below the start SI No # Extract Material Description and details dynamically material_desc = extract_material_description(full_text, si_no) # Extract remaining fields parts = line.split() unit = parts[3] quantity = int(parts[4]) dely_qty = int(parts[5]) dely_date = parts[6] unit_rate = float(parts[7]) value = float(parts[8]) # Append row data data.append([si_no, material_desc, unit, quantity, dely_qty, dely_date, unit_rate, value]) except (ValueError, IndexError): # Skip invalid rows or rows with missing data continue # Convert data to DataFrame and save as Excel df = pd.DataFrame(data, columns=columns) excel_path = "/tmp/Extracted_PO_Data_Dynamic.xlsx" df.to_excel(excel_path, index=False) return excel_path def extract_material_description(full_text, si_no): """ Extract Material Description, including Material Number, HSN Code, and IGST, using unique patterns. """ material_desc = "" # Match the specific SI No row to extract details si_no_pattern = rf"{si_no}\s+(BPS\s+\d+).*?Material\s+Number:\s+(\d+)" match = re.search(si_no_pattern, full_text, re.DOTALL) if match: bps_code = match.group(1) material_number = match.group(2) material_desc += f"{bps_code}\nMaterial Number: {material_number}\n" # Extract HSN Code hsn_code_match = re.search(r"HSN\s+Code:\s*(\d+)", full_text) if hsn_code_match: hsn_code = hsn_code_match.group(1) material_desc += f"HSN Code: {hsn_code}\n" else: material_desc += "HSN Code: Not Found\n" # Extract IGST igst_match = re.search(r"IGST\s*:\s*(\d+)\s*%", full_text) if igst_match: igst = igst_match.group(1) material_desc += f"IGST: {igst} %" else: material_desc += "IGST: Not Found" return material_desc.strip() # Gradio Interface def gradio_interface(pdf_file): """ Interface function for Gradio to process the PDF and return the Excel file. """ return extract_data(pdf_file.name) def process_pdf(file): try: # Extract text from the PDF text = extract_text_from_pdf(file) # Process the extracted text output_path = extract_data(file.name) return output_path, "Data extraction successful!" except Exception as e: return None, f"Error during processing: {str(e)}" # Define Gradio interface interface = gr.Interface( fn=gradio_interface, inputs=gr.File(label="Upload PDF"), outputs=gr.File(label="Download Extracted Excel"), title="Dynamic BHEL PO Data Extractor", description="Upload a PDF to extract accurate Material Numbers and related data dynamically into an Excel file." ) if __name__ == "__main__": interface.launch()