DSatishchandra's picture
Update app.py
e94122a verified
raw
history blame
3.83 kB
import re
from typing import Dict, List, Union
import gradio as gr
import fitz # PyMuPDF
import pandas as pd
from io import BytesIO
import tempfile
import os
# One line-item row: item number, description, quantity, unit price, total price.
_ITEM_PATTERN = re.compile(r"(\d+)\s+([\w\s]+)\s+(\d+)\s+([\d.]+)\s+([\d.]+)")


def _first_group(pattern: str, text: str, flags: int = 0) -> str:
    """Return the stripped first capture group of *pattern* in *text*, or "" if it does not match."""
    match = re.search(pattern, text, flags)
    return match.group(1).strip() if match else ""


def parse_federal_transformers(file_text: str) -> Dict[str, Union[str, List[Dict[str, Union[str, int, float]]]]]:
    """Extract purchase-order fields and line items from raw PO text.

    Each header field is looked up independently, so a field that is missing
    from the text is left as an empty string without preventing the remaining
    fields or the line items from being extracted.

    Args:
        file_text: Plain text of the purchase order.

    Returns:
        A dict with the keys "Purchase Order No", "Date", "Supplier",
        "Invoice Address", "Delivery Address", "Currency", "Payment Terms"
        and "Items". "Supplier" is a fixed value; "Currency" and
        "Payment Terms" are not extracted by this function and stay empty.
        "Items" is a list of dicts with the keys "Item No", "Description",
        "Quantity", "Unit Price" and "Total Price".
    """
    parsed_data = {
        "Purchase Order No": "",
        "Date": "",
        "Supplier": "Federal Transformers Co. LLC",
        "Invoice Address": "",
        "Delivery Address": "",
        "Currency": "",
        "Payment Terms": "",
        "Items": [],
    }

    # Nothing to search in: hand back the defaults instead of raising.
    if not isinstance(file_text, str) or not file_text:
        return parsed_data

    # Header fields; an unmatched pattern simply leaves the default "".
    parsed_data["Purchase Order No"] = _first_group(r"Purchase Order No\.\s(\d+)", file_text)
    parsed_data["Date"] = _first_group(r"Date:\s+(\d{2}-\w{3}-\d{2})", file_text)

    # Addresses may span several lines, hence DOTALL; each one ends where the
    # next labelled section begins.
    parsed_data["Invoice Address"] = _first_group(
        r"Invoice Address\s*:\s*(.*?)(?=\sDelivery Address)", file_text, re.DOTALL
    )
    parsed_data["Delivery Address"] = _first_group(
        r"Delivery Address\s*:\s*(.*?)(?=\sNote)", file_text, re.DOTALL
    )

    for match in _ITEM_PATTERN.finditer(file_text):
        item_no, description, quantity, unit_price, total_price = match.groups()
        try:
            item = {
                "Item No": item_no,
                "Description": description.strip(),
                "Quantity": int(quantity),
                "Unit Price": float(unit_price),
                "Total Price": float(total_price),
            }
        except ValueError as e:
            # "[\d.]+" also accepts tokens such as "1.2.3"; drop only that row
            # rather than abandoning every remaining item.
            print(f"Error parsing Federal Transformers PO: {e}")
            continue
        parsed_data["Items"].append(item)

    return parsed_data
def read_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated in page order."""
    with fitz.open(file_path) as document:
        # Collect the per-page text while the document is still open.
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)
def process_pdf(file):
    """Convert an uploaded purchase-order PDF into an Excel workbook.

    The PDF text is extracted with ``read_pdf``, parsed with
    ``parse_federal_transformers`` and written to a temporary ``.xlsx`` file
    with two sheets: "Purchase Order Details" (field/value pairs) and
    "Items" (one row per line item).

    Args:
        file: The uploaded PDF, either as a path (``str`` / ``os.PathLike``)
            or as an object exposing the path through a ``name`` attribute.

    Returns:
        Path of the generated Excel file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        raise ValueError("No PDF file was provided.")

    # The upload may arrive as a plain path or as a file wrapper with ``.name``
    # (which form Gradio passes depends on its version/configuration).
    pdf_path = file if isinstance(file, (str, os.PathLike)) else file.name

    file_text = read_pdf(pdf_path)  # Read text from PDF
    parsed_data = parse_federal_transformers(file_text)  # Parse extracted text

    # First sheet: one row per header field, in a fixed order.
    header_fields = [
        "Purchase Order No",
        "Date",
        "Supplier",
        "Invoice Address",
        "Delivery Address",
        "Currency",
        "Payment Terms",
    ]
    main_df = pd.DataFrame(
        {
            "Field": header_fields,
            "Value": [parsed_data[field] for field in header_fields],
        }
    )

    # Second sheet: line items. When nothing was parsed, still emit the column
    # headers so the sheet is not completely blank.
    items = parsed_data["Items"]
    if items:
        items_df = pd.DataFrame(items)
    else:
        items_df = pd.DataFrame(
            columns=["Item No", "Description", "Quantity", "Unit Price", "Total Price"]
        )

    # Reserve a unique path and close the handle before pandas reopens it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name

    try:
        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            main_df.to_excel(writer, index=False, sheet_name="Purchase Order Details")
            items_df.to_excel(writer, index=False, sheet_name="Items")
    except Exception:
        # The file was created with delete=False; do not leave it behind when
        # the workbook could not be written.
        if os.path.exists(excel_path):
            os.remove(excel_path)
        raise

    # Return the temporary file path for download
    return excel_path
# Gradio UI: one PDF upload handled by process_pdf, producing an Excel download.
iface = gr.Interface(
    title="Federal Transformers PO Parser",
    description=(
        "Upload a PDF of a Federal Transformers purchase order "
        "to extract details and download as an Excel file."
    ),
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()