Spaces:
Sleeping
Sleeping
File size: 2,149 Bytes
a72b612 8b139bf d97cfeb f6ae938 8b139bf d97cfeb 8b139bf 8755e00 a72b612 8b139bf 7ebbb35 d97cfeb bfda109 d97cfeb 7ebbb35 d97cfeb 7ebbb35 d97cfeb 7ebbb35 d97cfeb 7ebbb35 53debb2 7ebbb35 d97cfeb bfda109 d97cfeb 7ebbb35 53debb2 d97cfeb 7ebbb35 d97cfeb 53debb2 8b139bf ea64bb3 d97cfeb 53debb2 8b139bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import pdfplumber
import pandas as pd
import re
import gradio as gr
# Patterns are compiled once at import time instead of once per page.
_PO_NUMBER_RE = re.compile(r"Purchase Order : (\w+)")
_SUB_TOTAL_RE = re.compile(r"SUB TOTAL : ([\d.]+)")
_ROW_RE = re.compile(
    r"(\d+)\s+(\d{9})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+"
    r"([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+INR\s+([\d.]+)"
)

# Column names for the eight capture groups of _ROW_RE, in group order.
_ROW_FIELDS = (
    "Pos.",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Discount",
    "Amount",
)

# Full column order of the generated spreadsheet.
_OUTPUT_COLUMNS = ("Purchase Order", *_ROW_FIELDS, "SUB TOTAL")


def _parse_page_text(text, po_number):
    """Return one dict per line-item row matched in a single page's text."""
    # The subtotal depends only on the page text, so look it up once per page
    # rather than once per matched row.
    sub_total_match = _SUB_TOTAL_RE.search(text)
    sub_total = sub_total_match.group(1) if sub_total_match else "0.0"

    return [
        {
            "Purchase Order": po_number,
            **dict(zip(_ROW_FIELDS, match.groups())),
            "SUB TOTAL": sub_total,
        }
        for match in _ROW_RE.finditer(text)
    ]


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Every page's text is scanned for rows matching the line-item pattern
    (position, 9-digit item code, unit, ISO date, quantity, basic price,
    discount, ``INR``, amount). Each row is tagged with the purchase order
    number and with the ``SUB TOTAL`` value found on the same page
    (``"0.0"`` when the page has none). All values are kept as the strings
    captured from the text.

    Args:
        pdf_file: Either a path string or an object whose ``name`` attribute
            is the path of the PDF to read.

    Returns:
        The path of the written workbook, always ``"output.xlsx"``.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Accept both upload objects exposing ``.name`` and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    data = []
    po_number = None
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without a text layer may yield no text at all; treat it
            # as empty instead of handing None to the regex engine.
            text = page.extract_text() or ""

            # The PO number is decided on the first page only; if it is not
            # found there, every row is labelled "N/A".
            if po_number is None:
                po_match = _PO_NUMBER_RE.search(text)
                po_number = po_match.group(1) if po_match else "N/A"

            data.extend(_parse_page_text(text, po_number))

    # Passing the columns explicitly keeps the header row even when no line
    # items were matched.
    df = pd.DataFrame(data, columns=list(_OUTPUT_COLUMNS))

    # NOTE(review): the output name is fixed and relative, so concurrent
    # requests overwrite each other's result.
    output_file = "output.xlsx"
    df.to_excel(output_file, index=False)
    return output_file
# Web front end: one uploaded PDF is handed to extract_data_from_pdf, and the
# path that function returns is offered back to the user as a download.
iface = gr.Interface(
    title="PDF Data Extractor",
    description="Extract structured data from a PDF and output it as an Excel file.",
    fn=extract_data_from_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
)

# Start serving the interface as soon as this module is executed.
iface.launch()
|