Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pytesseract | |
import pandas as pd | |
from io import BytesIO | |
import fitz # PyMuPDF | |
import re | |
from PIL import Image | |
import tempfile | |
# pytesseract shells out to the Tesseract executable; pin it to a fixed
# absolute location rather than relying on a PATH lookup.
_TESSERACT_BINARY = "/usr/bin/tesseract"
pytesseract.pytesseract.tesseract_cmd = _TESSERACT_BINARY
# One purchase-order line item per OCR text line. Compiled once at import
# time instead of on every request.
_ROW_PATTERN = re.compile(
    r'(?P<pos>\d+)\s+'
    r'(?P<item_code>\d+)\s+'
    r'(?P<description>.+?)\s+'
    r'(?P<unit>\S+)\s+'
    r'(?P<date>\d{4}-\d{2}-\d{2})\s+'
    r'(?P<quantity>\d+\.\d+)\s+'
    r'(?P<price>\d+\.\d+)\s+'
    r'(?P<discount>\d+\.\d+)\s+'
    r'(?P<currency>\S+)\s+'
    r'(?P<amount>\d+\.\d+)'
)

# Row-dict key -> spreadsheet column header. The insertion order of this
# mapping is also the column order of the generated sheet.
_COLUMN_LABELS = {
    "pos": "Pos",
    "item_code": "Item Code",
    "description": "Description",
    "unit": "Unit",
    "date": "Delivery Date",
    "quantity": "Quantity",
    "price": "Basic Price",
    "discount": "Discount",
    "currency": "Currency",
    "amount": "Amount",
    "Central GST": "Central GST",
    "State GST": "State GST",
    "Sub Total": "Sub Total",
}

# Assumed tax rates (9% each); they are not read from the document.
_CENTRAL_GST_RATE = 0.09
_STATE_GST_RATE = 0.09


def _ocr_pages(pdf_path):
    """Render every page of the PDF at *pdf_path* and return its OCR text, one string per page."""
    page_texts = []
    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap()  # Render the page to a Pixmap image
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            page_texts.append(pytesseract.image_to_string(image))
    return page_texts


def _parse_rows(page_texts):
    """Return one dict per text line that matches the line-item pattern, with GST fields added."""
    rows = []
    for text in page_texts:
        for line in text.split('\n'):
            match = _ROW_PATTERN.search(line)
            if not match:
                continue
            row = match.groupdict()
            row["description"] = row["description"].strip()
            for numeric_key in ("quantity", "price", "discount", "amount"):
                row[numeric_key] = float(row[numeric_key])
            central_gst = row["amount"] * _CENTRAL_GST_RATE
            state_gst = row["amount"] * _STATE_GST_RATE
            row["Central GST"] = round(central_gst, 2)
            row["State GST"] = round(state_gst, 2)
            # The sub total uses the unrounded tax values and subtracts the
            # discount column as an absolute amount.
            row["Sub Total"] = round(
                row["amount"] + central_gst + state_gst - row["discount"], 2
            )
            rows.append(row)
    return rows


def _write_workbook(frame, sheet_name):
    """Write *frame* to a new temporary .xlsx file and return that file's path."""
    # Only the reserved name is needed; close the handle straight away so no
    # descriptor is leaked and the writer below is the sole owner of the file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as handle:
        workbook_path = handle.name
    with pd.ExcelWriter(workbook_path, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    return workbook_path


def extract_data_from_pdf(pdf_file):
    """OCR a purchase-order PDF and export its line items to an Excel file.

    Each page is rendered to an image and passed through Tesseract. Every
    OCR text line that matches the line-item pattern becomes one spreadsheet
    row, extended with "Central GST", "State GST" and "Sub Total" columns.

    Args:
        pdf_file: The uploaded file, either an object whose ``name``
            attribute is the path of the PDF on disk, or that path as a
            plain string.

    Returns:
        Path of a temporary ``.xlsx`` file. It holds an "Extracted Data"
        sheet when at least one row was parsed; otherwise an "Error" sheet
        containing a single explanatory message. The file is not deleted
        by this function.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided. Please upload a PDF.")

    # NOTE(review): depending on the Gradio version/configuration the upload
    # may arrive as a path string instead of a file-like wrapper; accept both.
    pdf_path = str(pdf_file) if isinstance(pdf_file, str) else pdf_file.name

    rows = _parse_rows(_ocr_pages(pdf_path))

    if rows:
        frame = pd.DataFrame(rows, columns=list(_COLUMN_LABELS)).rename(
            columns=_COLUMN_LABELS
        )
        return _write_workbook(frame, "Extracted Data")

    # Nothing matched: still hand back a workbook so the download succeeds.
    message_frame = pd.DataFrame(
        [["No structured data found. Please check the PDF structure."]],
        columns=["Error"],
    )
    return _write_workbook(message_frame, "Error")
# Assemble the web UI: a single PDF upload wired to extract_data_from_pdf,
# whose returned file path is offered back as an Excel download.
_APP_DESCRIPTION = (
    "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
    "The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
    "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
)
_pdf_upload = gr.File(label="Upload PDF")
_excel_download = gr.File(label="Download Excel")

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=_pdf_upload,
    outputs=_excel_download,
    title="Advanced Document Data Extractor",
    description=_APP_DESCRIPTION,
)

# Start the app as soon as this module runs.
iface.launch()