Spaces:

neerajkalyank
/

toshiba_2.O

Sleeping

File size: 3,896 Bytes

8b139bf
9d104f1
8b139bf
 
 
 
 
9d104f1
f6ae938
d4f07d5
f6ae938
8b139bf
 
a8af579
 
8b139bf
 
9d104f1
 
8b139bf
 
 
9d104f1
8b139bf
 
9d104f1
 
8b139bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d104f1
 
 
8b139bf
9d104f1
 
8b139bf
 
9d104f1
 
8b139bf
 
9d104f1
8b139bf
 
 
 
 
 
 
 
 
9d104f1
 
8b139bf

import gradio as gr
import pytesseract
import pandas as pd
from io import BytesIO
import fitz  # PyMuPDF
import re
from PIL import Image
import tempfile

# Pin pytesseract to the Tesseract executable at a fixed absolute path.
_TESSERACT_CMD = "/usr/bin/tesseract"
pytesseract.pytesseract.tesseract_cmd = _TESSERACT_CMD

# Matches one purchase-order line item on a single OCR'd text line:
# position, item code, free-text description, unit, ISO date (YYYY-MM-DD),
# quantity, price, discount, currency and amount. Every numeric field except
# position and item code must contain a decimal point to match.
_ROW_PATTERN = re.compile(
    r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
)

# Regex group name -> spreadsheet column header. Renaming by key (rather than
# assigning headers positionally) keeps headers correct even if the order in
# which row fields are populated ever changes.
_COLUMN_HEADERS = {
    "pos": "Pos",
    "item_code": "Item Code",
    "description": "Description",
    "unit": "Unit",
    "date": "Delivery Date",
    "quantity": "Quantity",
    "price": "Basic Price",
    "discount": "Discount",
    "currency": "Currency",
    "amount": "Amount",
}

# Assumed tax rate, applied to the line amount once for Central GST and once
# for State GST.
_GST_RATE = 0.09


def _ocr_pdf_pages(pdf_path):
    """Return the Tesseract OCR text of each page of the PDF at *pdf_path*."""
    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for page_num in range(doc.page_count):
            # Render the page to a Pixmap and hand its raw samples to PIL.
            # NOTE(review): get_pixmap() is called with library defaults; the
            # "RGB" mode below assumes a 3-channel pixmap without alpha, and
            # the default render resolution may be low for OCR - confirm.
            pix = doc[page_num].get_pixmap()
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            page_texts.append(pytesseract.image_to_string(image))
        return page_texts
    finally:
        # Always release the document, even if rendering or OCR fails.
        doc.close()


def _parse_rows(page_texts):
    """Parse line-item rows (as dicts) out of the OCR text of each page."""
    rows = []
    for text in page_texts:
        for line in text.split("\n"):
            match = _ROW_PATTERN.search(line)
            if match is None:
                continue

            row = match.groupdict()
            row["description"] = row["description"].strip()
            for field in ("quantity", "price", "discount", "amount"):
                row[field] = float(row[field])

            # The sub total uses the unrounded tax values; only the stored
            # tax columns and the final sum are rounded to 2 decimals.
            # NOTE(review): the discount is subtracted as an absolute value
            # after tax - confirm that matches the source documents.
            central_gst = row["amount"] * _GST_RATE
            state_gst = row["amount"] * _GST_RATE
            row["Central GST"] = round(central_gst, 2)
            row["State GST"] = round(state_gst, 2)
            row["Sub Total"] = round(
                row["amount"] + central_gst + state_gst - row["discount"], 2
            )
            rows.append(row)
    return rows


def _write_workbook(frame, sheet_name):
    """Write *frame* to a new temporary .xlsx file and return its path."""
    # Reserve a unique path, then close the handle before the Excel writer
    # opens the file itself: keeping it open leaks a descriptor and blocks the
    # second open on platforms that forbid it. delete=False keeps the file on
    # disk so the caller can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        workbook_path = tmp.name
    with pd.ExcelWriter(workbook_path, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    return workbook_path


def extract_data_from_pdf(pdf_file):
    """OCR a PDF, extract purchase-order line items and export them to Excel.

    Each page is rendered to an image and run through Tesseract. Every OCR'd
    line that matches the line-item pattern becomes one spreadsheet row, with
    Central GST, State GST and Sub Total columns computed from the amount.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path (``str``) or
            as an object whose ``name`` attribute holds the path.

    Returns:
        Path of a temporary ``.xlsx`` file. It contains an "Extracted Data"
        sheet with the parsed rows, or an "Error" sheet with a single message
        when no line matched.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    rows = _parse_rows(_ocr_pdf_pages(pdf_path))

    if not rows:
        # Still hand back a downloadable workbook so the user sees the reason.
        error_frame = pd.DataFrame(
            [["No structured data found. Please check the PDF structure."]],
            columns=["Error"],
        )
        return _write_workbook(error_frame, "Error")

    frame = pd.DataFrame(rows).rename(columns=_COLUMN_HEADERS)
    return _write_workbook(frame, "Extracted Data")

# Assemble the web UI: a single PDF upload in, a single Excel download out.
pdf_input = gr.File(label="Upload PDF")
excel_output = gr.File(label="Download Excel")
app_description = (
    "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
    "The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
    "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
)

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=pdf_input,
    outputs=excel_output,
    title="Advanced Document Data Extractor",
    description=app_description,
)

# Start serving the interface as soon as this file is executed.
iface.launch()