import re
import tempfile
from io import BytesIO

import gradio as gr
import pandas as pd
import pdfplumber

# Matches one purchase-order table row in the text extracted from a page.
# Fields are whitespace-separated; only the description may contain spaces,
# so it is matched lazily up to the unit token that precedes the ISO date.
ROW_PATTERN = re.compile(
    r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+'
    r'(?P<delivery_date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+'
    r'(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+'
    r'(?P<amount>\d+\.\d+)'
)

# Regex group name -> column header used in the exported spreadsheet.
# The mapping order is also the column order of the output sheet.
COLUMN_LABELS = {
    "pos": "Pos",
    "item_code": "Item Code",
    "description": "Description",
    "unit": "Unit",
    "delivery_date": "Delivery Date",
    "quantity": "Quantity",
    "price": "Basic Price",
    "discount": "Discount",
    "currency": "Currency",
    "amount": "Amount",
}

# Captured fields that are converted from text to float.
NUMERIC_FIELDS = ("quantity", "price", "discount", "amount")


def _parse_rows(pages):
    """Return one dict per line of the page texts that matches ROW_PATTERN."""
    rows = []
    for text in pages:
        for line in text.split('\n'):
            match = ROW_PATTERN.search(line)
            if not match:
                continue
            row = match.groupdict()
            row["description"] = row["description"].strip()
            for field in NUMERIC_FIELDS:
                row[field] = float(row[field])
            rows.append(row)
    return rows


def _write_workbook(frame, sheet_name):
    """Write *frame* to a new temporary .xlsx file and return its path."""
    # Only the unique name is needed. Close the handle right away so it is
    # not leaked and so the writer can reopen the path (required on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as handle:
        path = handle.name
    with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    return path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order rows from a PDF into an Excel workbook.

    Text is pulled from every page with pdfplumber, each line is matched
    against ROW_PATTERN, and the matching rows are written to an
    "Extracted Data" sheet. When nothing matches (or no file was supplied),
    a workbook with a single "Error" sheet is produced instead.

    Args:
        pdf_file: The uploaded PDF, passed straight to ``pdfplumber.open``.
            ``None`` is treated as a document with no extractable rows.

    Returns:
        Path of a temporary ``.xlsx`` file. The file is created with
        ``delete=False`` and is not removed by this function.
    """
    text_data = []
    if pdf_file is not None:
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    # Debugging aid: echo the raw text of each page.
                    print(f"Extracted text from page {page.page_number}:\n{text}\n")
                    text_data.append(text)

    data = _parse_rows(text_data)

    if data:
        # Select and rename columns by key rather than by position, so the
        # headers can never drift out of step with the captured fields.
        df = pd.DataFrame(data, columns=list(COLUMN_LABELS)).rename(columns=COLUMN_LABELS)
        return _write_workbook(df, "Extracted Data")

    # No structured rows were found: return a workbook explaining that.
    error_frame = pd.DataFrame(
        [["No structured data found. Please check the PDF structure."]],
        columns=["Error"],
    )
    return _write_workbook(error_frame, "Error")


# Define Gradio Interface with updated components
iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
    title="Advanced Document Data Extractor",
    description=(
        "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
        "The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
        "No additional calculations are performed; it simply extracts the data as it appears."
    ),
)

# Start the web server only when run as a script, so importing this module
# (for reuse or testing) does not launch the app as a side effect.
if __name__ == "__main__":
    iface.launch()