Spaces:
Sleeping
Sleeping
neerajkalyank
committed on
Commit
•
bfda109
1
Parent(s):
0135d09
Update app.py
Browse files
app.py
CHANGED
@@ -8,27 +8,26 @@ def extract_data_from_pdf(pdf_file):
|
|
8 |
data = []
|
9 |
po_number = None
|
10 |
|
11 |
-
# Use pdfplumber
|
12 |
-
with pdfplumber.open(pdf_file.
|
13 |
for page in pdf.pages:
|
14 |
text = page.extract_text()
|
15 |
|
16 |
# Extract PO number if available
|
17 |
if po_number is None:
|
18 |
po_match = re.search(r"Purchase Order : (\w+)", text)
|
19 |
-
if po_match
|
20 |
-
po_number = po_match.group(1)
|
21 |
|
22 |
# Regex pattern to match the row data
|
23 |
row_pattern = re.compile(
|
24 |
-
r"(\d+)\s+(\d
|
25 |
)
|
26 |
|
27 |
# Extract matching rows
|
28 |
for match in row_pattern.finditer(text):
|
29 |
pos, item_code, unit, delivery_date, quantity, basic_price, amount = match.groups()
|
30 |
sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
|
31 |
-
sub_total = sub_total_match.group(1) if sub_total_match else ""
|
32 |
|
33 |
data.append({
|
34 |
"Purchase Order": po_number,
|
@@ -54,8 +53,8 @@ def extract_data_from_pdf(pdf_file):
|
|
54 |
# Gradio Interface
|
55 |
iface = gr.Interface(
|
56 |
fn=extract_data_from_pdf,
|
57 |
-
inputs=gr.File(label="Upload PDF"),
|
58 |
-
outputs=gr.File(label="Download Excel"),
|
59 |
title="PDF Data Extractor",
|
60 |
description="Extract structured data from a PDF and output it as an Excel file."
|
61 |
)
|
|
|
8 |
data = []
|
9 |
po_number = None
|
10 |
|
11 |
+
# Use pdfplumber with BytesIO for Gradio compatibility
|
12 |
+
with pdfplumber.open(BytesIO(pdf_file.read())) as pdf:
|
13 |
for page in pdf.pages:
|
14 |
text = page.extract_text()
|
15 |
|
16 |
# Extract PO number if available
|
17 |
if po_number is None:
|
18 |
po_match = re.search(r"Purchase Order : (\w+)", text)
|
19 |
+
po_number = po_match.group(1) if po_match else "N/A"
|
|
|
20 |
|
21 |
# Regex pattern to match the row data
|
22 |
row_pattern = re.compile(
|
23 |
+
r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
|
24 |
)
|
25 |
|
26 |
# Extract matching rows
|
27 |
for match in row_pattern.finditer(text):
|
28 |
pos, item_code, unit, delivery_date, quantity, basic_price, amount = match.groups()
|
29 |
sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
|
30 |
+
sub_total = sub_total_match.group(1) if sub_total_match else "0.0"
|
31 |
|
32 |
data.append({
|
33 |
"Purchase Order": po_number,
|
|
|
53 |
# Gradio Interface
|
54 |
iface = gr.Interface(
|
55 |
fn=extract_data_from_pdf,
|
56 |
+
inputs=gr.inputs.File(label="Upload PDF"),
|
57 |
+
outputs=gr.outputs.File(label="Download Excel"),
|
58 |
title="PDF Data Extractor",
|
59 |
description="Extract structured data from a PDF and output it as an Excel file."
|
60 |
)
|