Spaces:
Runtime error
Runtime error
DSatishchandra
committed on
Commit
•
b051e96
1
Parent(s):
da34543
Update parse_bhel.py
Browse files- parse_bhel.py +54 -59
parse_bhel.py
CHANGED
@@ -1,63 +1,58 @@
|
|
1 |
-
import
|
2 |
import pandas as pd
|
|
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
else:
|
26 |
-
|
27 |
-
else:
|
28 |
-
# If unalignable, add to unalignable_rows for debugging
|
29 |
-
unalignable_rows.append(row)
|
30 |
-
|
31 |
-
# Log any remaining buffered content
|
32 |
-
if buffer:
|
33 |
-
aligned_rows.append(buffer)
|
34 |
-
|
35 |
-
# Print unalignable rows for analysis
|
36 |
-
for row in unalignable_rows:
|
37 |
-
print(f"Unalignable row: {row}")
|
38 |
-
|
39 |
-
return aligned_rows
|
40 |
-
|
41 |
-
def parse_bhel_pdf(pdf_path):
|
42 |
-
columns = [
|
43 |
-
"Purchase Order No", "Date", "Sl No", "Material Description",
|
44 |
-
"Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
|
45 |
-
]
|
46 |
-
expected_columns = len(columns)
|
47 |
-
data = []
|
48 |
-
|
49 |
-
with pdfplumber.open(pdf_path) as pdf:
|
50 |
-
for page in pdf.pages:
|
51 |
-
table = page.extract_table()
|
52 |
-
if table:
|
53 |
-
# Preprocess and align rows before DataFrame conversion
|
54 |
-
rows = preprocess_rows(table[1:], expected_columns)
|
55 |
-
for row in rows:
|
56 |
-
if len(row) == expected_columns:
|
57 |
-
data.append(row)
|
58 |
-
else:
|
59 |
-
print(f"Skipping unalignable row: {row}")
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
return df
|
|
|
1 |
+
import re
|
2 |
import pandas as pd
|
3 |
+
import pdfplumber
|
4 |
|
5 |
+
# Output schema: one spreadsheet column per entry, in this order.
columns = [
    "Purchase Order No",
    "Date",
    "Sl No",
    "Material Description",
    "Unit",
    "Quantity",
    "Dely Qty",
    "Dely Date",
    "Unit Rate",
    "Value",
]

# Empty frame that carries the schema until extracted rows are added.
data = pd.DataFrame(columns=columns)

# Line classifiers, compiled once because they are applied to every text line.
# PO header: ten digits, " / ", then a dd.dd.dddd date-shaped token at line start.
po_pattern = re.compile(r'^\d{10} / \d{2}\.\d{2}\.\d{4}')
# Material line: one to three digits followed by a space at line start (the Sl No).
material_pattern = re.compile(r'^\d{1,3} ')
|
17 |
+
|
18 |
+
# Function to clean and split rows
|
19 |
+
def clean_and_split_line(line, expected_fields=None):
    """Split a material line into its fields.

    The line is stripped and split on runs of two or more whitespace
    characters, so single spaces inside a field (e.g. a multi-word
    description) are kept intact.

    Args:
        line: One text line taken from the PDF page.
        expected_fields: Number of fields a valid line must contain. When
            omitted, it is the number of output columns minus the two
            ("Purchase Order No" and "Date") that are filled from the PO
            header line rather than from the material line.

    Returns:
        The list of field strings, or None when the line does not split
        into exactly ``expected_fields`` parts.
    """
    if expected_fields is None:
        # A material line carries Sl No .. Value only; comparing against the
        # full column count would reject every correctly split line.
        expected_fields = len(columns) - 2

    parts = re.split(r'\s{2,}', line.strip())  # Split by two or more spaces
    if len(parts) != expected_fields:
        return None
    return parts
|
23 |
+
|
24 |
+
# Process the PDF and extract relevant lines.
# Rows are gathered in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and it copied the whole
# frame on every call.
rows = []

# The most recent PO number/date apply to the material lines that follow.
# They start as None so a material line seen before any PO header yields a
# row with missing PO fields instead of raising NameError.
po_no = None
po_date = None

with pdfplumber.open('your_pdf_file.pdf') as pdf:
    for page in pdf.pages:
        # NOTE(review): extract_text() may give None for a page without any
        # text (depends on the pdfplumber version), so fall back to "".
        text = (page.extract_text() or "").splitlines()

        for line in text:
            # Check for Purchase Order row
            po_match = po_pattern.match(line)
            if po_match:
                # Split only the matched "<number> / <date>" prefix so any
                # trailing text on the line does not end up in the date.
                po_no, po_date = po_match.group(0).split(' / ')
                continue

            # Skip lines that do not start like a material entry
            if not material_pattern.match(line):
                continue

            # Skip material-looking lines that do not split into the
            # expected number of fields
            cleaned_data = clean_and_split_line(line)
            if not cleaned_data:
                continue

            rows.append({
                "Purchase Order No": po_no,
                "Date": po_date,
                "Sl No": cleaned_data[0],
                "Material Description": cleaned_data[1],
                "Unit": cleaned_data[2],
                "Quantity": cleaned_data[3],
                "Dely Qty": cleaned_data[4],
                "Dely Date": cleaned_data[5],
                "Unit Rate": cleaned_data[6],
                "Value": cleaned_data[7],
            })

# Build the frame in one step; columns= fixes the column order and keeps the
# headers even when no row was extracted.
data = pd.DataFrame(rows, columns=columns)

# Save extracted data to an Excel file
data.to_excel("extracted_data.xlsx", index=False)
|
|