Spaces:
Runtime error
Runtime error
DSatishchandra
committed on
Commit
•
7cbc3d2
1
Parent(s):
7be7132
Update parse_bhel.py
Browse files- parse_bhel.py +47 -45
parse_bhel.py
CHANGED
@@ -1,52 +1,54 @@
|
|
1 |
import pdfplumber
|
2 |
import pandas as pd
|
|
|
3 |
|
4 |
-
def
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
# If the row has the correct number of columns, add it as-is
|
10 |
-
if len(row) == expected_columns:
|
11 |
-
# If there's buffered content from previous rows, add it before this row
|
12 |
-
if buffer:
|
13 |
-
aligned_rows.append(buffer)
|
14 |
-
buffer = [] # Clear the buffer
|
15 |
-
aligned_rows.append(row)
|
16 |
-
else:
|
17 |
-
# If row has fewer columns, treat it as a continuation and add to the buffer
|
18 |
-
if buffer:
|
19 |
-
buffer = [f"{b} {r}" if r else b for b, r in zip(buffer, row + [""] * (len(buffer) - len(row)))]
|
20 |
-
else:
|
21 |
-
buffer = row # Initialize the buffer with the row
|
22 |
-
|
23 |
-
# If there's any remaining buffered row, add it to aligned rows
|
24 |
-
if buffer:
|
25 |
-
aligned_rows.append(buffer)
|
26 |
-
|
27 |
-
return aligned_rows
|
28 |
-
|
29 |
-
def parse_bhel_pdf(pdf_path):
|
30 |
-
columns = [
|
31 |
-
"Purchase Order No", "Date", "Sl No", "Material Description",
|
32 |
-
"Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
|
33 |
-
]
|
34 |
-
expected_columns = len(columns)
|
35 |
data = []
|
|
|
|
|
36 |
|
37 |
-
with pdfplumber.open(
|
38 |
for page in pdf.pages:
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
df = pd.DataFrame(data, columns=columns)
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import pdfplumber
|
2 |
import pandas as pd
|
3 |
+
import tempfile
|
4 |
|
5 |
+
def format_material_description(description_series, si_no):
    """Build a description label tagged with a serial number.

    This is a placeholder formatter meant to be replaced with project-specific
    logic. It reads only the first element of ``description_series`` (via
    ``.iloc[0]``) and appends the given serial number.

    Args:
        description_series: Object exposing positional access through
            ``.iloc`` (e.g. a pandas Series); only position 0 is read.
        si_no: Serial number to embed in the label.

    Returns:
        A string of the form ``"<first description> (SI No: <si_no>)"``.
    """
    first_description = description_series.iloc[0]
    return f"{first_description} (SI No: {si_no})"
|
8 |
+
|
9 |
+
def extract_bhel_data(pdf_file):
    """Extract BHEL line items from a PDF and export them to an Excel file.

    Every text line of every page is split on whitespace. A line is kept when
    its first token is an integer between 10 and 1150 (inclusive) and the
    following tokens parse as: two description words, unit, integer quantity,
    integer delivery quantity, delivery date, float unit rate and float value.
    Up to three optional trailing tokens (material number, HSN code, IGST) are
    captured when present. Lines that do not fit this layout are skipped.

    After parsing, the "SI No" column is renumbered 10, 20, 30, ... in row
    order, and each row's description is passed through
    ``format_material_description`` together with its renumbered SI No.

    Args:
        pdf_file: Value handed directly to ``pdfplumber.open``.

    Returns:
        Path of the ``.xlsx`` file the data was written to. The file is
        created with ``delete=False``, so it is not removed automatically.
    """
    columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value", "Material Number", "HSN Code", "IGST"]
    start_si, end_si = 10, 1150
    data = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # extract_text() can yield None for a page without extractable
            # text (behaviour depends on the pdfplumber version); treat that
            # as an empty page instead of crashing on .splitlines().
            page_text = page.extract_text() or ""
            for line in page_text.splitlines():
                parts = line.split()
                try:
                    si_no = int(parts[0])
                    if not start_si <= si_no <= end_si:
                        continue
                    row = [
                        si_no,
                        " ".join(parts[1:3]),  # material description
                        parts[3],              # unit
                        int(parts[4]),         # quantity
                        int(parts[5]),         # delivery quantity
                        parts[6],              # delivery date
                        float(parts[7]),       # unit rate
                        float(parts[8]),       # value
                        parts[9] if len(parts) > 9 else "",    # material number
                        parts[10] if len(parts) > 10 else "",  # HSN code
                        parts[11] if len(parts) > 11 else "",  # IGST
                    ]
                except (ValueError, IndexError):
                    # Not a line-item row (too few tokens or non-numeric
                    # fields); skip it.
                    continue
                data.append(row)

    df = pd.DataFrame(data, columns=columns)

    # Correct the SI No column to follow increments of 10
    df['SI No'] = range(10, 10 + len(df) * 10, 10)

    # Reapply the Material Description formatting based on the corrected SI No.
    # The formatter reads only position 0 of what it is given, so hand it a
    # one-row slice per row; passing the whole column would stamp the first
    # row's description onto every row.
    descriptions = df['Material Description']
    df['Material Description'] = [
        format_material_description(descriptions.iloc[[position]], si_no)
        for position, si_no in enumerate(df['SI No'])
    ]

    # Save to temporary file for download. Close the handle before pandas
    # writes to the path so the descriptor is not leaked and the file can be
    # reopened on platforms that forbid a second open of a held temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        output_path = temp_file.name
    df.to_excel(output_path, index=False)

    # Display the corrected data to the user when the optional display helper
    # is installed; its absence must not prevent returning the exported file.
    try:
        import ace_tools as tools  # Replace with your preferred display method
    except ImportError:
        tools = None
    if tools is not None:
        tools.display_dataframe_to_user(name="Corrected Data with Updated SI No", dataframe=df)

    return output_path
|