import gradio as gr
import pdfplumber
import pandas as pd
import re
from io import BytesIO
import tempfile

# Matches one purchase-order line item: position, item code, free-text
# description, unit, ISO-formatted date, then quantity / price / discount as
# decimals, a currency token and the decimal line amount.
_ROW_PATTERN = re.compile(
    r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
)

# Regex group name -> column header used in the exported workbook. The key
# order also fixes the column order of the sheet.
_COLUMN_HEADERS = {
    "pos": "Pos",
    "item_code": "Item Code",
    "description": "Description",
    "unit": "Unit",
    "date": "Delivery Date",
    "quantity": "Quantity",
    "price": "Basic Price",
    "discount": "Discount",
    "currency": "Currency",
    "amount": "Amount",
}

# Captured groups that are converted from text to float.
_NUMERIC_FIELDS = ("quantity", "price", "discount", "amount")


def _parse_rows(pages_text):
    """Return one dict per text line that matches ``_ROW_PATTERN``."""
    rows = []
    for text in pages_text:
        for line in text.split('\n'):
            match = _ROW_PATTERN.search(line)
            if not match:
                continue
            row = match.groupdict()
            row["description"] = row["description"].strip()
            for field in _NUMERIC_FIELDS:
                row[field] = float(row[field])
            rows.append(row)
    return rows


def _write_excel(df, sheet_name):
    """Write *df* to a fresh temporary ``.xlsx`` file and return its path."""
    # Only reserve a unique file name here and close the handle immediately:
    # leaving it open leaks a descriptor per call, and on some platforms
    # (notably Windows) the file cannot be reopened by name while it is open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as handle:
        path = handle.name
    with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name=sheet_name)
    return path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order rows from a PDF into a temporary Excel file.

    Every page's text is read with pdfplumber and each text line is matched
    against ``_ROW_PATTERN``. Matching rows are written to a sheet named
    "Extracted Data"; when no line matches, the workbook instead contains a
    single "Error" sheet with an explanatory message.

    Args:
        pdf_file: The uploaded PDF, passed unchanged to ``pdfplumber.open``.

    Returns:
        Path of the generated ``.xlsx`` file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Collect the text of every page that yields any.
    text_data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                print(f"Extracted text from page {page.page_number}:\n{text}\n")  # Debugging: Print extracted text
                text_data.append(text)

    data = _parse_rows(text_data)

    if data:
        # Select and rename columns through an explicit mapping instead of
        # relying on the positional order of the regex groups.
        df = pd.DataFrame(data, columns=list(_COLUMN_HEADERS)).rename(columns=_COLUMN_HEADERS)
        return _write_excel(df, "Extracted Data")

    # Nothing matched: still return a workbook so the caller gets a file back.
    error_df = pd.DataFrame(
        [["No structured data found. Please check the PDF structure."]],
        columns=["Error"],
    )
    return _write_excel(error_df, "Error")

# Build the Gradio UI: a single PDF upload as input and a single Excel
# download as output, both wired to extract_data_from_pdf.
_pdf_upload = gr.File(label="Upload PDF")
_excel_download = gr.File(label="Download Excel")

# Help text passed to the interface as its description.
_app_description = (
    "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
    "The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
    "No additional calculations are performed; it simply extracts the data as it appears."
)

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=_pdf_upload,
    outputs=_excel_download,
    title="Advanced Document Data Extractor",
    description=_app_description,
)

# Launch the interface as soon as this module is executed.
iface.launch()