# toshiba_2.O / app.py
# Hugging Face Space file by neerajkalyank; commit f6ae938 ("Update app.py", verified); 3.94 kB.
import re
import shutil
import tempfile
from io import BytesIO

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
import pytesseract
from PIL import Image
# Point pytesseract at the Tesseract executable. Prefer whatever binary is on
# PATH so the app also runs where Tesseract is installed elsewhere; fall back
# to the conventional Linux location that was previously hard-coded.
pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or "/usr/bin/tesseract"
# Matches one OCR'd purchase-order line: position, item code, description,
# unit, ISO-formatted date, quantity, price, discount, currency and amount.
_ROW_PATTERN = re.compile(
    r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
)

# Assumed tax rates applied to each row's amount (9% central + 9% state GST).
_CENTRAL_GST_RATE = 0.09
_STATE_GST_RATE = 0.09

# Row keys (regex group names and computed fields) mapped to the Excel column
# headers, listed in output order. Renaming by key instead of by position
# keeps the headers correct even if the row dicts are ever built differently.
_COLUMN_LABELS = {
    "pos": "Pos",
    "item_code": "Item Code",
    "description": "Description",
    "unit": "Unit",
    "date": "Delivery Date",
    "quantity": "Quantity",
    "price": "Basic Price",
    "discount": "Discount",
    "currency": "Currency",
    "amount": "Amount",
    "Central GST": "Central GST",
    "State GST": "State GST",
    "Sub Total": "Sub Total",
}


def _ocr_pdf_pages(pdf_path):
    """Return the Tesseract OCR text of every page of the PDF, in page order."""
    page_texts = []
    # The context manager closes the document even if rendering or OCR fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # alpha=False keeps the samples as 3-channel RGB, which is what
            # Image.frombytes("RGB", ...) below expects.
            # NOTE(review): rendering uses PyMuPDF's default resolution; a
            # higher DPI may improve OCR accuracy -- confirm before changing.
            pix = page.get_pixmap(alpha=False)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            page_texts.append(pytesseract.image_to_string(image))
    return page_texts


def _parse_order_rows(page_texts):
    """Parse OCR text into row dicts, adding Central GST, State GST and Sub Total."""
    rows = []
    for text in page_texts:
        for line in text.split('\n'):
            match = _ROW_PATTERN.search(line)
            if match is None:
                continue
            row = match.groupdict()
            row["description"] = row["description"].strip()
            for field in ("quantity", "price", "discount", "amount"):
                row[field] = float(row[field])
            central_gst = row["amount"] * _CENTRAL_GST_RATE
            state_gst = row["amount"] * _STATE_GST_RATE
            row["Central GST"] = round(central_gst, 2)
            row["State GST"] = round(state_gst, 2)
            # NOTE(review): the discount is subtracted as an absolute value
            # after tax; confirm this matches the purchase-order semantics.
            row["Sub Total"] = round(
                row["amount"] + central_gst + state_gst - row["discount"], 2
            )
            rows.append(row)
    return rows


def _save_workbook(frame, sheet_name):
    """Write *frame* to a new temporary .xlsx file and return the file's path."""
    # Reserve a unique path, then close the handle right away: leaving it open
    # leaks a descriptor and prevents reopening the file by name on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        workbook_path = tmp.name
    with pd.ExcelWriter(workbook_path, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    return workbook_path


def extract_data_from_pdf(pdf_file):
    """OCR a purchase-order PDF and export the parsed rows to an Excel file.

    Every page is rendered to an image and read with Tesseract. Each text line
    that matches the row pattern becomes one spreadsheet row, extended with
    Central GST, State GST and Sub Total columns.

    Args:
        pdf_file: The uploaded PDF, either a filesystem path string or an
            object exposing the path as ``.name`` (presumably what
            ``gr.File`` supplies -- verify against the Gradio version in use).

    Returns:
        Path of a temporary ``.xlsx`` file. It holds an "Extracted Data" sheet
        when rows were found, otherwise an "Error" sheet with a single notice.
        The file is not deleted by this function.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (no file was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Accept both a plain path and a file-like wrapper carrying the path in .name.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    rows = _parse_order_rows(_ocr_pdf_pages(pdf_path))
    if not rows:
        notice = pd.DataFrame(
            [["No structured data found. Please check the PDF structure."]],
            columns=["Error"],
        )
        return _save_workbook(notice, "Error")

    table = pd.DataFrame(rows, columns=list(_COLUMN_LABELS)).rename(
        columns=_COLUMN_LABELS
    )
    return _save_workbook(table, "Extracted Data")
# Gradio front end: one PDF upload goes into extract_data_from_pdf and the
# workbook path it returns is offered back to the user as a download.
pdf_upload = gr.File(label="Upload PDF")
excel_download = gr.File(label="Download Excel")
app_description = "".join(
    [
        "Upload a PDF file to extract structured purchase order data and download it as an Excel file. ",
        "The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. ",
        "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included.",
    ]
)
iface = gr.Interface(
    title="Advanced Document Data Extractor",
    description=app_description,
    fn=extract_data_from_pdf,
    inputs=pdf_upload,
    outputs=excel_download,
)
# Start the web server for the interface.
iface.launch()