Spaces:
Runtime error
Runtime error
DSatishchandra
committed on
Commit
•
7be7132
1
Parent(s):
e16fdc8
Update parse_bhel.py
Browse files- parse_bhel.py +45 -37
parse_bhel.py
CHANGED
@@ -1,44 +1,52 @@
|
|
1 |
-
import gradio as gr
|
2 |
import pdfplumber
|
3 |
import pandas as pd
|
4 |
|
5 |
-
def
|
6 |
-
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
for page in pdf.pages:
|
10 |
-
|
11 |
-
if
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
'Sl No': parts[0],
|
18 |
-
'Material Description': " ".join(parts[1:-6]),
|
19 |
-
'Unit': parts[-6],
|
20 |
-
'Quantity': parts[-5],
|
21 |
-
'Dely Qty': parts[-4],
|
22 |
-
'Dely Date': parts[-3],
|
23 |
-
'Unit Rate': parts[-2],
|
24 |
-
'Value': parts[-1]
|
25 |
-
}
|
26 |
data.append(row)
|
|
|
|
|
27 |
|
28 |
-
#
|
29 |
-
df = pd.DataFrame(data)
|
30 |
return df
|
31 |
-
|
32 |
-
def gradio_interface(pdf_file):
    """Gradio callback: parse the uploaded PDF and render the result as HTML.

    Args:
        pdf_file: The value Gradio passes for the file input. It is expected
            to expose the path of the uploaded file as ``.name``; it is
            ``None`` when the user submits without uploading anything.

    Returns:
        An HTML string: the extracted table produced by ``DataFrame.to_html``,
        or a short notice when no file was provided.
    """
    # Without this guard a submit with no upload fails with AttributeError
    # on ``None.name`` instead of showing something useful to the user.
    if pdf_file is None:
        return "<p>No PDF file was uploaded.</p>"

    # Parse the PDF file and return the extracted table as an HTML table
    df = parse_bhel_pdf(pdf_file.name)
    return df.to_html()
|
36 |
-
|
37 |
-
# Gradio interface: a single PDF upload input whose parsed table is shown
# as HTML. The app is started as soon as this module is executed.
_pdf_input = gr.File(type="file", label="Upload PDF File")
_app = gr.Interface(
    fn=gradio_interface,
    inputs=_pdf_input,
    outputs="html",
    title="BHEL PDF Data Extractor",
    description="Upload a BHEL PDF file to extract structured data in a tabular format.",
)
_app.launch()
|
|
|
|
|
1 |
import pdfplumber
|
2 |
import pandas as pd
|
3 |
|
4 |
+
def _merge_cells(existing, extra):
    """Join two cell values with a single space, ignoring empty/None sides."""
    if not extra:
        return existing
    if not existing:
        return extra
    return f"{existing} {extra}"


def preprocess_rows(rows, expected_columns):
    """Separate full-width rows from continuation fragments, merging the latter.

    Rows whose length equals ``expected_columns`` are passed through
    unchanged. Any other row is treated as a fragment: consecutive fragments
    are merged cell-by-cell (position-wise) into one buffered row, which is
    emitted just before the next full-width row, or at the end of the input.

    Args:
        rows: Iterable of rows, each a sequence of cell values (strings or
            ``None`` for empty cells).
        expected_columns: Number of cells a complete row must have.

    Returns:
        A list of rows in input order. Merged fragment rows are emitted even
        if their length differs from ``expected_columns``; the caller decides
        what to do with them.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from itertools import zip_longest

    aligned_rows = []
    buffer = []

    for row in rows:
        if len(row) == expected_columns:
            # Flush pending fragments first so output keeps the input order.
            if buffer:
                aligned_rows.append(buffer)
                buffer = []
            aligned_rows.append(row)
        elif buffer:
            # zip_longest (rather than zip) keeps the trailing cells when the
            # fragment is wider than the buffer; _merge_cells avoids turning
            # None cells into the literal text "None".
            buffer = [
                _merge_cells(existing, extra)
                for existing, extra in zip_longest(buffer, row)
            ]
        else:
            # Copy so the buffer never aliases the caller's row object.
            buffer = list(row)

    # Emit whatever fragments are still pending at the end of the input.
    if buffer:
        aligned_rows.append(buffer)

    return aligned_rows
|
28 |
+
|
29 |
+
def parse_bhel_pdf(pdf_path):
    """Extract purchase-order line items from a PDF into a DataFrame.

    For every page, the table returned by ``page.extract_table()`` is taken,
    its first row is dropped as the header, and the remaining rows are passed
    through ``preprocess_rows``. Rows that still do not have one cell per
    output column are reported on stdout and left out.

    Args:
        pdf_path: Path (or other value accepted by ``pdfplumber.open``) of
            the PDF to read.

    Returns:
        A ``pandas.DataFrame`` with the ten purchase-order columns listed
        below; it has no rows when nothing could be extracted.
    """
    header_names = [
        "Purchase Order No", "Date", "Sl No", "Material Description",
        "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
    ]
    width = len(header_names)
    records = []

    with pdfplumber.open(pdf_path) as document:
        for pdf_page in document.pages:
            extracted = pdf_page.extract_table()
            if not extracted:
                # Nothing table-like on this page.
                continue

            # extracted[0] is treated as the header row and skipped.
            for row in preprocess_rows(extracted[1:], width):
                if len(row) != width:
                    print(f"Skipping unalignable row: {row}")
                    continue
                records.append(row)

    # Build the frame with the fixed column labels.
    return pd.DataFrame(records, columns=header_names)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|