"""Gradio app: extract purchase-order rows from a PDF and export them to Excel.

Each PDF page is rendered to an image, run through the Donut model, and the
decoded text is scanned line by line with a row regex. Matching rows get
computed Central GST / State GST / Sub Total fields and are written to an
in-memory Excel workbook.
"""

import os
import re
import tempfile
from io import BytesIO

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

MODEL_NAME = "naver-clova-ix/donut-base-finetuned-cord-v2"
# Task start token for the CORD-v2 fine-tuned checkpoint; used to prime the
# decoder, following the Hugging Face Donut usage example for this checkpoint.
TASK_PROMPT = "<s_cord-v2>"

# Initialize the Hugging Face Donut model and processor
processor = DonutProcessor.from_pretrained(MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)

# Assumed tax rates applied to each row's amount.
CENTRAL_GST_RATE = 0.09  # Central GST 9%
STATE_GST_RATE = 0.09  # State GST 9%

# One table row. The group names had been lost from the pattern ("(?P\d+)" is
# not valid regex syntax); they are restored to match the keys read in
# _parse_rows and the column order used for the export.
ROW_PATTERN = re.compile(
    r"(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+"
    r"(?P<delivery_date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+"
    r"(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+"
    r"(?P<amount>\d+\.\d+)"
)

# Internal row key -> column header written to the spreadsheet.
COLUMN_HEADERS = {
    "pos": "Pos",
    "item_code": "Item Code",
    "description": "Description",
    "unit": "Unit",
    "delivery_date": "Delivery Date",
    "quantity": "Quantity",
    "price": "Basic Price",
    "discount": "Discount",
    "currency": "Currency",
    "amount": "Amount",
    "Central GST": "Central GST",
    "State GST": "State GST",
    "Sub Total": "Sub Total",
}


def _read_pdf_bytes(pdf_file):
    """Return the raw PDF bytes for any of the supported input forms.

    Accepts raw bytes, a filesystem path, a file-like object with ``read()``,
    or an object exposing the path as ``.name``. Raises ``ValueError`` when no
    file was supplied or the input form is not recognised.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as handle:
            return handle.read()
    if hasattr(pdf_file, "read"):
        return pdf_file.read()
    if hasattr(pdf_file, "name"):
        with open(pdf_file.name, "rb") as handle:
            return handle.read()
    raise ValueError(f"Unsupported PDF input type: {type(pdf_file).__name__}")


def _extract_page_texts(pdf_bytes):
    """Render every page of the PDF and return the Donut-decoded text per page."""
    decoder_input_ids = processor.tokenizer(
        TASK_PROMPT, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    page_texts = []
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for page in doc:
            pix = page.get_pixmap()  # Render page to a Pixmap image
            # Convert Pixmap to PIL Image
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Preprocess image for the Donut model
            pixel_values = processor(image, return_tensors="pt").pixel_values
            outputs = model.generate(
                pixel_values,
                decoder_input_ids=decoder_input_ids,
                max_length=model.decoder.config.max_position_embeddings,
            )

            # Decode generated text
            page_texts.append(
                processor.batch_decode(outputs, skip_special_tokens=True)[0]
            )
    finally:
        # The document was previously never closed.
        doc.close()
    return page_texts


def _parse_rows(page_texts):
    """Return a list of row dicts for every line that matches ROW_PATTERN."""
    rows = []
    for text in page_texts:
        for line in text.split("\n"):
            match = ROW_PATTERN.search(line)
            if not match:
                continue

            row = match.groupdict()
            row["description"] = row["description"].strip()  # Clean description
            for key in ("quantity", "price", "discount", "amount"):
                row[key] = float(row[key])

            # Calculate Sub Total with assumed tax rates
            central_gst = row["amount"] * CENTRAL_GST_RATE
            state_gst = row["amount"] * STATE_GST_RATE
            row["Central GST"] = round(central_gst, 2)
            row["State GST"] = round(state_gst, 2)
            row["Sub Total"] = round(
                row["amount"] + central_gst + state_gst - row["discount"], 2
            )
            rows.append(row)
    return rows


def _to_workbook(frame, sheet_name):
    """Write *frame* to a single-sheet in-memory .xlsx and rewind the buffer."""
    output = BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    output.seek(0)
    return output


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order rows from a PDF into an Excel workbook.

    Args:
        pdf_file: The PDF as a file-like object with ``read()``, raw bytes,
            a filesystem path, or an object exposing the path as ``.name``.

    Returns:
        A ``BytesIO`` positioned at 0 holding an .xlsx workbook. It contains an
        "Extracted Data" sheet when at least one row matched, otherwise an
        "Error" sheet with a single explanatory message.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` or of an unsupported type.
    """
    page_texts = _extract_page_texts(_read_pdf_bytes(pdf_file))
    rows = _parse_rows(page_texts)

    if rows:
        frame = pd.DataFrame(rows).rename(columns=COLUMN_HEADERS)
        return _to_workbook(frame, "Extracted Data")

    # If no data was found, create a workbook carrying only the error message
    error_frame = pd.DataFrame(
        [["No structured data found. Please check the PDF structure."]],
        columns=["Error"],
    )
    return _to_workbook(error_frame, "Error")


def extract_to_excel_file(pdf_file):
    """Gradio handler: run the extraction and return the path of an .xlsx file.

    The File output component is given a path on disk rather than the
    in-memory buffer, so the workbook is written to a temporary file that is
    deliberately not deleted on close.
    """
    workbook = extract_data_from_pdf(pdf_file)
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        tmp.write(workbook.getvalue())
    return tmp.name


# Define Gradio Interface with updated components
iface = gr.Interface(
    fn=extract_to_excel_file,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
    title="Advanced Document Data Extractor",
    description=(
        "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
        "The model will identify and parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
        "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
    ),
)

iface.launch()