Spaces:

DSatishchandra
/

POExtraction_UC3

Runtime error

App Files Files Community

POExtraction_UC3 / app.py

DSatishchandra

Update app.py

e94122a verified 4 months ago

raw

history blame

3.83 kB

	import re
	from typing import Dict, List, Union
	import gradio as gr
	import fitz # PyMuPDF
	import pandas as pd
	from io import BytesIO
	import tempfile
	import os

	def parse_federal_transformers(file_text: str) -> Dict[str, Union[str, List[Dict[str, Union[str, int, float]]]]]:
	    """Extract purchase-order fields and line items from PO text.

	    Each header field is looked up independently, so a field that is
	    missing from ``file_text`` is left as ``""`` without preventing the
	    remaining fields or the line items from being extracted.

	    Args:
	        file_text: Plain text of the purchase order.

	    Returns:
	        A dict with the keys "Purchase Order No", "Date", "Supplier",
	        "Invoice Address", "Delivery Address", "Currency", "Payment Terms"
	        (all strings) and "Items" (a list of dicts with "Item No",
	        "Description", "Quantity", "Unit Price" and "Total Price").
	        "Supplier" is a fixed value; "Currency" and "Payment Terms" are
	        never populated by this function and always stay "".
	    """
	    parsed_data = {
	        "Purchase Order No": "",
	        "Date": "",
	        "Supplier": "Federal Transformers Co. LLC",
	        "Invoice Address": "",
	        "Delivery Address": "",
	        "Currency": "",
	        "Payment Terms": "",
	        "Items": [],
	    }

	    # Field name -> (regex with one capture group, regex flags).
	    # The address blocks may span several lines, hence re.DOTALL; each one
	    # ends where the next labelled section begins.
	    header_patterns = {
	        "Purchase Order No": (r"Purchase Order No\.\s(\d+)", 0),
	        "Date": (r"Date:\s+(\d{2}-\w{3}-\d{2})", 0),
	        "Invoice Address": (r"Invoice Address\s:\s(.*?)(?=\sDelivery Address)", re.DOTALL),
	        "Delivery Address": (r"Delivery Address\s:\s(.*?)(?=\sNote)", re.DOTALL),
	    }
	    for field, (pattern, flags) in header_patterns.items():
	        found = re.search(pattern, file_text, flags)
	        if found is None:
	            # Best effort: report the gap and keep the "" default instead
	            # of abandoning every field that comes after this one.
	            print(f"Error parsing Federal Transformers PO: could not find '{field}'")
	            continue
	        parsed_data[field] = found.group(1).strip()

	    # Item rows: item no, description, quantity, unit price, total price.
	    item_pattern = re.compile(r"(\d+)\s+([\w\s]+)\s+(\d+)\s+([\d.]+)\s+([\d.]+)")
	    for match in item_pattern.finditer(file_text):
	        item_no, description, quantity, unit_price, total_price = match.groups()
	        try:
	            item = {
	                "Item No": item_no,
	                "Description": description.strip(),
	                "Quantity": int(quantity),
	                "Unit Price": float(unit_price),
	                "Total Price": float(total_price),
	            }
	        except ValueError as e:
	            # [\d.]+ also matches tokens such as "1.2.3" that are not valid
	            # numbers; skip only that row rather than all remaining rows.
	            print(f"Error parsing Federal Transformers PO: {e}")
	            continue
	        parsed_data["Items"].append(item)

	    return parsed_data

	def read_pdf(file_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        file_path: Path of the PDF, passed straight to ``fitz.open``.

	    Returns:
	        A single string with each page's ``get_text()`` output appended
	        back to back (no separator is inserted between pages).
	    """
	    with fitz.open(file_path) as pdf:
	        # Iterate the pages directly and join once instead of indexing by
	        # page number and growing a string with "+=".
	        return "".join(page.get_text() for page in pdf)

	def process_pdf(file):
	    """Convert an uploaded purchase-order PDF into an Excel workbook.

	    The PDF text is read with ``read_pdf``, parsed with
	    ``parse_federal_transformers`` and written to a temporary ``.xlsx``
	    file with two sheets: "Purchase Order Details" (Field/Value pairs)
	    and "Items" (one row per parsed line item).

	    Args:
	        file: The upload, either as a filesystem path (``str`` or
	            ``os.PathLike``) or as an object whose ``name`` attribute
	            holds the path.

	    Returns:
	        Path of the generated temporary Excel file. The file is not
	        deleted by this function.

	    Raises:
	        ValueError: If ``file`` is ``None`` (nothing was uploaded).
	    """
	    if file is None:
	        raise ValueError("No PDF file was provided.")

	    # NOTE(review): the upload may arrive as a plain path or as a
	    # file-like wrapper depending on the Gradio version/configuration,
	    # so both shapes are accepted here.
	    pdf_path = file if isinstance(file, (str, os.PathLike)) else file.name

	    file_text = read_pdf(pdf_path)  # Read text from PDF
	    parsed_data = parse_federal_transformers(file_text)  # Parse extracted text

	    # Reserve a unique output path and close our handle before pandas
	    # opens it; keeping the handle open blocks the write on platforms
	    # that lock open files.
	    fd, excel_path = tempfile.mkstemp(suffix=".xlsx")
	    os.close(fd)

	    header_fields = [
	        "Purchase Order No",
	        "Date",
	        "Supplier",
	        "Invoice Address",
	        "Delivery Address",
	        "Currency",
	        "Payment Terms",
	    ]

	    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
	        # Write main PO details to the first sheet
	        main_df = pd.DataFrame({
	            "Field": header_fields,
	            "Value": [parsed_data[name] for name in header_fields],
	        })
	        main_df.to_excel(writer, index=False, sheet_name="Purchase Order Details")

	        # Write item details to a second sheet
	        items_df = pd.DataFrame(parsed_data["Items"])
	        items_df.to_excel(writer, index=False, sheet_name="Items")

	    # Return the temporary file path for download
	    return excel_path

	# Gradio UI: one file-upload input feeding process_pdf, one file output
	# offering the generated workbook for download.
	_pdf_input = gr.File(label="Upload PDF")
	_excel_output = gr.File(label="Download Excel")

	iface = gr.Interface(
	    fn=process_pdf,
	    inputs=_pdf_input,
	    outputs=_excel_output,
	    title="Federal Transformers PO Parser",
	    description="Upload a PDF of a Federal Transformers purchase order to extract details and download as an Excel file.",
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()