Spaces:
Runtime error
Runtime error
import re | |
from typing import Dict, List, Union | |
import gradio as gr | |
import fitz # PyMuPDF | |
import pandas as pd | |
from io import BytesIO | |
import tempfile | |
import os | |
def _search_field(pattern: str, text: str, flags: int = 0) -> str:
    """Return the stripped first capture group of *pattern* in *text*, or "" if absent."""
    match = re.search(pattern, text, flags)
    return match.group(1).strip() if match else ""


def parse_federal_transformers(file_text: str) -> Dict[str, Union[str, List[Dict[str, Union[str, int, float]]]]]:
    """Parse the text of a Federal Transformers purchase order.

    Each header field is looked up independently, so a field that is missing
    from the text is left as an empty string without preventing the remaining
    fields or the line items from being extracted.

    Args:
        file_text: Plain text extracted from the purchase-order PDF.

    Returns:
        A dict with the keys "Purchase Order No", "Date", "Supplier",
        "Invoice Address", "Delivery Address", "Currency", "Payment Terms"
        and "Items". "Items" is a list of dicts with the keys "Item No",
        "Description", "Quantity", "Unit Price" and "Total Price".
        "Currency" and "Payment Terms" are not extracted here and stay empty.
    """
    items: List[Dict[str, Union[str, int, float]]] = []
    parsed_data: Dict[str, Union[str, List[Dict[str, Union[str, int, float]]]]] = {
        "Purchase Order No": "",
        "Date": "",
        "Supplier": "Federal Transformers Co. LLC",
        "Invoice Address": "",
        "Delivery Address": "",
        "Currency": "",
        "Payment Terms": "",
        "Items": items,
    }
    if not file_text:
        return parsed_data

    # (field name, regex with one capture group, regex flags)
    field_patterns = (
        ("Purchase Order No", r"Purchase Order No\.\s(\d+)", 0),
        ("Date", r"Date:\s+(\d{2}-\w{3}-\d{2})", 0),
        # Addresses may span several lines, hence DOTALL; each one ends where
        # the next labelled section begins.
        ("Invoice Address", r"Invoice Address\s*:\s*(.*?)(?=\sDelivery Address)", re.DOTALL),
        ("Delivery Address", r"Delivery Address\s*:\s*(.*?)(?=\sNote)", re.DOTALL),
    )
    for field, pattern, flags in field_patterns:
        value = _search_field(pattern, file_text, flags)
        if value:
            parsed_data[field] = value
        else:
            print(f"Error parsing Federal Transformers PO: {field!r} not found")

    # Item rows: item number, description, quantity, unit price, total price.
    item_pattern = re.compile(r"(\d+)\s+([\w\s]+)\s+(\d+)\s+([\d.]+)\s+([\d.]+)")
    for match in item_pattern.finditer(file_text):
        item_no, description, quantity, unit_price, total_price = match.groups()
        try:
            unit_value = float(unit_price)
            total_value = float(total_price)
        except ValueError:
            # "[\d.]+" also accepts strings such as "1.2.3"; skip that row
            # instead of discarding every other item.
            print(f"Error parsing Federal Transformers PO: bad item row {match.group(0)!r}")
            continue
        items.append({
            "Item No": item_no,
            "Description": description.strip(),
            "Quantity": int(quantity),
            "Unit Price": unit_value,
            "Total Price": total_value,
        })
    return parsed_data
def read_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated in page order."""
    with fitz.open(file_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)
def process_pdf(file):
    """Convert an uploaded purchase-order PDF into an Excel workbook.

    The PDF text is read with ``read_pdf``, parsed with
    ``parse_federal_transformers`` and written to a temporary ``.xlsx`` file
    with two sheets: "Purchase Order Details" (field/value pairs) and
    "Items" (one row per parsed line item).

    Args:
        file: The uploaded file, either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            Which of the two Gradio passes is assumed to depend on the
            Gradio version and the ``gr.File`` configuration.

    Returns:
        Path of the generated temporary Excel file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        gr.Error: If no file was supplied.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file first.")

    pdf_path = file if isinstance(file, (str, os.PathLike)) else file.name
    file_text = read_pdf(pdf_path)  # Read text from PDF
    parsed_data = parse_federal_transformers(file_text)  # Parse extracted text

    # Sheet 1: header fields as "Field" / "Value" rows.
    detail_fields = [
        "Purchase Order No",
        "Date",
        "Supplier",
        "Invoice Address",
        "Delivery Address",
        "Currency",
        "Payment Terms",
    ]
    main_df = pd.DataFrame({
        "Field": detail_fields,
        "Value": [parsed_data[field] for field in detail_fields],
    })

    # Sheet 2: line items. Explicit columns keep the header row present even
    # when no items were parsed.
    items_df = pd.DataFrame(
        parsed_data["Items"],
        columns=["Item No", "Description", "Quantity", "Unit Price", "Total Price"],
    )

    # Reserve a unique path, then close the handle before writing: a file that
    # is still open cannot be reopened by name on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        output_path = temp_file.name

    try:
        with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
            main_df.to_excel(writer, index=False, sheet_name="Purchase Order Details")
            items_df.to_excel(writer, index=False, sheet_name="Items")
    except Exception:
        # Do not leave an empty or partial workbook behind when writing fails.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise

    # Return the temporary file path for download
    return output_path
# Gradio UI: a single PDF upload that is run through process_pdf and handed
# back as a downloadable Excel file.
_pdf_upload = gr.File(label="Upload PDF")
_excel_download = gr.File(label="Download Excel")

iface = gr.Interface(
    fn=process_pdf,
    inputs=_pdf_upload,
    outputs=_excel_download,
    title="Federal Transformers PO Parser",
    description="Upload a PDF of a Federal Transformers purchase order to extract details and download as an Excel file.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()