Spaces:
Sleeping
Sleeping
File size: 2,451 Bytes
a72b612 8b139bf a72b612 d97cfeb f6ae938 8b139bf d97cfeb 8b139bf 0bb856e a72b612 8b139bf 53debb2 d97cfeb bfda109 d97cfeb 53debb2 d97cfeb bfda109 d97cfeb 0135d09 d97cfeb 53debb2 d97cfeb bfda109 d97cfeb 53debb2 d97cfeb 0bb856e d97cfeb 53debb2 8b139bf ea64bb3 d97cfeb 53debb2 8b139bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import pdfplumber
import pandas as pd
from io import BytesIO
import re
import gradio as gr
# Patterns are compiled once at import time instead of once per page/row.
_PO_NUMBER_RE = re.compile(r"Purchase Order : (\w+)")
_SUB_TOTAL_RE = re.compile(r"SUB TOTAL : ([\d.]+)")
# One line item: position, item code, unit, ISO date, then three numeric
# fields (quantity, basic price, amount), all whitespace-separated.
_ROW_RE = re.compile(
    r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
)
# Column names for the capture groups of _ROW_RE, in group order.
_ROW_FIELDS = (
    "Pos.",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Amount",
)
# Final spreadsheet column order.
_COLUMNS = ("Purchase Order", *_ROW_FIELDS, "SUB TOTAL")


def _as_pdf_source(pdf_file):
    """Turn *pdf_file* into a path or binary stream for ``pdfplumber.open``."""
    if isinstance(pdf_file, (bytes, bytearray)):
        return BytesIO(pdf_file)
    if hasattr(pdf_file, "getbuffer"):
        # Work on a private copy so the caller's stream position is untouched
        # and the caller's object is never closed on our behalf.
        return BytesIO(pdf_file.getbuffer())
    # A str / Path is used as-is; an object exposing ``.name`` (for example a
    # temporary-file wrapper) is reopened from that path.
    return getattr(pdf_file, "name", pdf_file)


def _rows_from_page_text(text):
    """Return the line-item rows found in one page's text, without PO number."""
    # The sub-total is looked up once per page; every row on the page shares it.
    sub_total_match = _SUB_TOTAL_RE.search(text)
    sub_total = sub_total_match.group(1) if sub_total_match else "0.0"

    rows = []
    for match in _ROW_RE.finditer(text):
        row = dict(zip(_ROW_FIELDS, match.groups()))
        row["SUB TOTAL"] = sub_total
        rows.append(row)
    return rows


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Each page's text is scanned for a ``Purchase Order : <id>`` marker, a
    ``SUB TOTAL : <number>`` marker and whitespace-separated line-item rows.
    The first purchase-order number found anywhere in the document is applied
    to every row; ``"N/A"`` is used when none is found. A page without a
    sub-total marker reports ``"0.0"`` for its rows. All values are kept as
    the strings captured from the PDF text.

    Args:
        pdf_file: The PDF to read. A ``BytesIO``-like object (anything with
            ``getbuffer()``) is the original contract; raw ``bytes``, a
            filesystem path, or an object exposing the path as ``.name`` are
            accepted as well.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding an ``.xlsx`` workbook
        with one sheet named ``"Extracted Data"``. The header row is written
        even when no line items were found.
    """
    po_number = None
    rows = []

    # pdfplumber reads directly from a path or binary stream, so no scratch
    # file in the working directory is needed (the former fixed "temp.pdf"
    # collided across concurrent calls and leaked when parsing failed).
    with pdfplumber.open(_as_pdf_source(pdf_file)) as pdf:
        for page in pdf.pages:
            # NOTE(review): extract_text() may yield None for a page with no
            # text (version-dependent) -- treat that as an empty page.
            text = page.extract_text() or ""

            # Keep looking on later pages until a PO number is actually found.
            if po_number is None:
                po_match = _PO_NUMBER_RE.search(text)
                if po_match:
                    po_number = po_match.group(1)

            rows.extend(_rows_from_page_text(text))

    if po_number is None:
        po_number = "N/A"

    data = [{"Purchase Order": po_number, **row} for row in rows]
    # Explicit columns keep the header row present for an empty result.
    df = pd.DataFrame(data, columns=list(_COLUMNS))

    output = BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Extracted Data")
    output.seek(0)
    return output
# Gradio Interface
def _excel_path_from_upload(uploaded):
    """Adapt ``extract_data_from_pdf`` to Gradio's ``File`` component contract.

    ``gr.File`` passes the upload to the callback as a file path (or, in older
    releases, a temporary-file wrapper exposing the path as ``.name``) and
    expects a file path back. ``extract_data_from_pdf`` works with in-memory
    ``BytesIO`` objects, so this wrapper converts in both directions.

    Args:
        uploaded: The value Gradio passes for the uploaded file.

    Returns:
        Path of a temporary ``.xlsx`` file containing the extracted data.

    Raises:
        ValueError: If no file was uploaded.
    """
    import tempfile

    if uploaded is None:
        raise ValueError("No PDF file was uploaded.")

    if isinstance(uploaded, (bytes, bytearray)):
        pdf_stream = BytesIO(uploaded)
    elif hasattr(uploaded, "getbuffer"):
        # Already an in-memory stream; hand it over unchanged.
        pdf_stream = uploaded
    else:
        upload_path = getattr(uploaded, "name", uploaded)
        with open(upload_path, "rb") as fh:
            pdf_stream = BytesIO(fh.read())

    workbook = extract_data_from_pdf(pdf_stream)

    # delete=False: the file must outlive this call so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as out:
        out.write(workbook.getvalue())
    return out.name


iface = gr.Interface(
    fn=_excel_path_from_upload,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
    title="PDF Data Extractor",
    description="Extract structured data from a PDF and output it as an Excel file.",
)
iface.launch()
|