Spaces:

neerajkalyank
/

toshiba_2.O

Sleeping

App Files Files Community

toshiba_2.O / app.py

neerajkalyank

Create app.py

8b139bf verified about 2 months ago

raw

history blame

4.1 kB

	import gradio as gr
	from transformers import DonutProcessor, VisionEncoderDecoderModel
	import pandas as pd
	from io import BytesIO
	import fitz # PyMuPDF
	import re
	from PIL import Image

	# Load the Donut processor and model once, at import time, so every request
	# handled by this module reuses the same objects. Both halves come from the
	# same checkpoint, so its identifier is named once.
	_DONUT_CHECKPOINT = "naver-clova-ix/donut-base-finetuned-cord-v2"
	processor = DonutProcessor.from_pretrained(_DONUT_CHECKPOINT)
	model = VisionEncoderDecoderModel.from_pretrained(_DONUT_CHECKPOINT)

	# One purchase-order line item per text line, whitespace separated:
	# pos, item code, description, unit, ISO date, quantity, price, discount,
	# currency, amount. Compiled once instead of on every call.
	_ROW_PATTERN = re.compile(
	    r'(?P<pos>\d+)\s+'
	    r'(?P<item_code>\d+)\s+'
	    r'(?P<description>.+?)\s+'
	    r'(?P<unit>\S+)\s+'
	    r'(?P<date>\d{4}-\d{2}-\d{2})\s+'
	    r'(?P<quantity>\d+\.\d+)\s+'
	    r'(?P<price>\d+\.\d+)\s+'
	    r'(?P<discount>\d+\.\d+)\s+'
	    r'(?P<currency>\S+)\s+'
	    r'(?P<amount>\d+\.\d+)'
	)

	# Captured fields that are converted from text to float.
	_NUMERIC_FIELDS = ("quantity", "price", "discount", "amount")

	# Assumed tax rates applied to each row's amount.
	_CENTRAL_GST_RATE = 0.09
	_STATE_GST_RATE = 0.09

	# Regex group name -> spreadsheet column header. Renaming by name (rather than
	# assigning a positional list) keeps headers attached to the right data even
	# if a field is added or reordered later.
	_COLUMN_LABELS = {
	    "pos": "Pos",
	    "item_code": "Item Code",
	    "description": "Description",
	    "unit": "Unit",
	    "date": "Delivery Date",
	    "quantity": "Quantity",
	    "price": "Basic Price",
	    "discount": "Discount",
	    "currency": "Currency",
	    "amount": "Amount",
	}

	_NO_DATA_MESSAGE = "No structured data found. Please check the PDF structure."


	def _read_pdf_bytes(pdf_file):
	    """Return the raw PDF bytes from a file-like object or a filesystem path."""
	    if pdf_file is None:
	        raise ValueError("No PDF file was provided.")
	    reader = getattr(pdf_file, "read", None)
	    if callable(reader):
	        return reader()
	    # No .read(): treat the value as a path (str / os.PathLike).
	    with open(pdf_file, "rb") as handle:
	        return handle.read()


	def _recognise_pages(pdf_bytes):
	    """Render each PDF page to an image and return the model's text per page."""
	    page_texts = []
	    # The context manager closes the document even if rendering or
	    # generation raises part-way through.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        for page in doc:
	            # alpha=False keeps the sample buffer 3 bytes per pixel, which is
	            # what the "RGB" mode passed to Image.frombytes requires.
	            pix = page.get_pixmap(alpha=False)
	            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

	            pixel_values = processor(image, return_tensors="pt").pixel_values
	            # NOTE(review): generate() is called without decoder_input_ids;
	            # Donut examples usually seed decoding with the checkpoint's task
	            # prompt -- confirm whether that is needed here.
	            outputs = model.generate(pixel_values)
	            page_texts.append(
	                processor.batch_decode(outputs, skip_special_tokens=True)[0]
	            )
	    return page_texts


	def _parse_rows(page_texts):
	    """Turn recognised text into row dicts with tax and sub-total fields added."""
	    rows = []
	    for text in page_texts:
	        for line in text.split('\n'):
	            match = _ROW_PATTERN.search(line)
	            if match is None:
	                continue

	            row = match.groupdict()
	            row["description"] = row["description"].strip()
	            for field in _NUMERIC_FIELDS:
	                row[field] = float(row[field])

	            # Taxes are computed on the unrounded amount; only the stored
	            # values are rounded to 2 decimals.
	            central_gst = row["amount"] * _CENTRAL_GST_RATE
	            state_gst = row["amount"] * _STATE_GST_RATE
	            row["Central GST"] = round(central_gst, 2)
	            row["State GST"] = round(state_gst, 2)
	            row["Sub Total"] = round(
	                row["amount"] + central_gst + state_gst - row["discount"], 2
	            )
	            rows.append(row)
	    return rows


	def _to_excel_buffer(frame, sheet_name):
	    """Write *frame* to a single-sheet workbook held in a rewound BytesIO."""
	    buffer = BytesIO()
	    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
	        frame.to_excel(writer, index=False, sheet_name=sheet_name)
	    buffer.seek(0)
	    return buffer


	def extract_data_from_pdf(pdf_file):
	    """Extract purchase-order rows from a PDF into an in-memory Excel workbook.

	    Each page is rendered to an image, passed through the module-level Donut
	    ``processor``/``model``, and the decoded text is scanned line by line for
	    rows matching ``_ROW_PATTERN``. Matching rows get "Central GST",
	    "State GST" and "Sub Total" columns added.

	    Args:
	        pdf_file: An object with a ``read()`` method returning the PDF bytes,
	            or a filesystem path to the PDF.

	    Returns:
	        A ``BytesIO`` positioned at offset 0 containing an ``.xlsx`` workbook:
	        sheet "Extracted Data" when rows were found, otherwise sheet "Error"
	        with a single explanatory message.

	    Raises:
	        ValueError: If ``pdf_file`` is ``None``.

	    NOTE(review): Gradio's ``gr.File`` output expects a file path rather than
	    a buffer; callers wiring this into Gradio may need to persist the result.
	    """
	    rows = _parse_rows(_recognise_pages(_read_pdf_bytes(pdf_file)))

	    if not rows:
	        placeholder = pd.DataFrame([[_NO_DATA_MESSAGE]], columns=["Error"])
	        return _to_excel_buffer(placeholder, "Error")

	    frame = pd.DataFrame(rows).rename(columns=_COLUMN_LABELS)
	    return _to_excel_buffer(frame, "Extracted Data")

	def _extract_to_excel_file(pdf_file):
	    """Run the extraction and save the workbook to a temporary ``.xlsx`` file.

	    ``extract_data_from_pdf`` returns an in-memory buffer, whereas the
	    ``gr.File`` output component is given a filesystem path to serve, so this
	    adapter writes the buffer to disk and returns the resulting path.
	    """
	    import tempfile  # local import: only this adapter needs it

	    workbook = extract_data_from_pdf(pdf_file)
	    # delete=False: the file must outlive this call so it can be downloaded.
	    with tempfile.NamedTemporaryFile(
	        prefix="extracted_", suffix=".xlsx", delete=False
	    ) as handle:
	        handle.write(workbook.getvalue())
	    return handle.name


	# Define the Gradio interface: one PDF upload in, one Excel download out.
	iface = gr.Interface(
	    fn=_extract_to_excel_file,
	    inputs=gr.File(label="Upload PDF"),
	    outputs=gr.File(label="Download Excel"),
	    title="Advanced Document Data Extractor",
	    description=(
	        "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
	        "The model will identify and parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
	        "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
	    ),
	)

	# Start the Gradio app. There is no ``if __name__ == "__main__"`` guard, so
	# this runs whenever the file is executed or imported.
	iface.launch()