# Hugging Face file-viewer header (page residue, not part of the program):
#   DSatishchandra's picture
#   Update app.py
#   71106bd verified
#   raw
#   history blame
#   3.27 kB
import os
import re
import tempfile

import gradio as gr
import pandas as pd
import pdfplumber
# Define function to extract data
def extract_data(pdf_file):
data = []
columns = ["Purchase Order No", "Date", "SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
purchase_order_no = None
purchase_order_date = None
with pdfplumber.open(pdf_file) as pdf:
for page in pdf.pages:
text = page.extract_text().splitlines()
# Extract Purchase Order No and Date dynamically from the first page
if not purchase_order_no or not purchase_order_date:
for line in text:
# Search for Purchase Order No
po_match = re.search(r'Purchase Order No[:\s]+(\d+)', line)
if po_match:
purchase_order_no = po_match.group(1)
# Search for Date
date_match = re.search(r'Date[:\s]+(\d{2}\.\d{2}\.\d{4})', line)
if date_match:
purchase_order_date = date_match.group(1)
# Stop searching if both fields are found
if purchase_order_no and purchase_order_date:
break
# Process each line to extract relevant data rows
for line in text:
# Using regex or keywords to identify each row
try:
# Example row pattern match for SI No (Assuming starts with numbers in multiples of 10)
if re.match(r'^\d+\s', line):
parts = line.split()
si_no = parts[0] # Extract SI No
material_desc = "BPS 017507\nMaterial Number: {}\nHSN Code: 8310\nIGST: 18%".format(parts[2]) # Example Material Description
unit = "NO"
quantity = parts[3]
dely_qty = parts[4]
dely_date = parts[5]
unit_rate = parts[6]
value = parts[7]
# Append data as a row in the correct order
data.append([
purchase_order_no,
purchase_order_date,
si_no,
material_desc,
unit,
quantity,
dely_qty,
dely_date,
unit_rate,
value
])
except (ValueError, IndexError):
continue # Skip lines that do not match the expected pattern
# Convert data to a DataFrame and save it as Excel
df = pd.DataFrame(data, columns=columns)
excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
df.to_excel(excel_path, index=False)
return excel_path
# Web UI: one PDF upload in, one Excel download out, backed by extract_data.
iface = gr.Interface(
    extract_data,
    gr.File(label="Upload PDF"),
    gr.File(label="Download Excel"),
    title="PDF Data Extractor",
)

# Start serving the interface.
iface.launch()