DSatishchandra committed on
Commit
7be7132
1 Parent(s): e16fdc8

Update parse_bhel.py

Browse files
Files changed (1) hide show
  1. parse_bhel.py +45 -37
parse_bhel.py CHANGED
@@ -1,44 +1,52 @@
1
- import gradio as gr
2
  import pdfplumber
3
  import pandas as pd
4
 
5
def parse_bhel_pdf(pdf_file):
    """Extract line items from a BHEL PDF by splitting each text line on whitespace.

    Every text line that yields at least eight whitespace-separated tokens is
    treated as one item: the first token is the serial number, the last six
    tokens are the trailing fields, and everything in between is joined back
    into the material description.

    Args:
        pdf_file: Anything accepted by ``pdfplumber.open`` (a path or file object).

    Returns:
        A pandas DataFrame with one row per accepted text line.
    """
    # Names of the six trailing fields, in the order they appear on a line.
    trailing_fields = (
        'Unit', 'Quantity', 'Dely Qty', 'Dely Date', 'Unit Rate', 'Value',
    )
    records = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue
            for line in text.split('\n'):
                tokens = line.split()
                if len(tokens) < 8:
                    # Too few tokens to fill every field.
                    continue
                record = {
                    'Sl No': tokens[0],
                    'Material Description': " ".join(tokens[1:-6]),
                }
                record.update(zip(trailing_fields, tokens[-6:]))
                records.append(record)

    return pd.DataFrame(records)
31
-
32
def gradio_interface(pdf_file):
    """Render the items parsed from an uploaded PDF as an HTML table.

    Args:
        pdf_file: Upload object whose ``name`` attribute is handed to
            ``parse_bhel_pdf`` (presumably the path of the uploaded file).

    Returns:
        The parsed DataFrame serialised with ``DataFrame.to_html()``.
    """
    return parse_bhel_pdf(pdf_file.name).to_html()
36
-
37
# Gradio interface: wire gradio_interface to a file-upload input and an HTML
# output, then launch it. This runs at module level (no __main__ guard), so
# the UI starts as soon as the module is executed.
gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(type="file", label="Upload PDF File"),
    outputs="html",
    title="BHEL PDF Data Extractor",
    description="Upload a BHEL PDF file to extract structured data in a tabular format."
).launch()
 
 
1
  import pdfplumber
2
  import pandas as pd
3
 
4
def preprocess_rows(rows, expected_columns):
    """Group misaligned table rows so that only complete rows pass through as-is.

    Rows with exactly ``expected_columns`` cells are copied to the output
    unchanged. Consecutive rows of any other width are merged cell by cell
    into one buffered row. That buffered row is emitted just before the next
    complete row (or at the end of the input); it is NOT merged into a
    neighbouring complete row.

    Empty cells (``None`` or ``""``) are skipped when merging, so the merged
    text never contains the literal string ``"None"`` or a stray leading
    space. When a continuation row is wider than the buffer, its extra cells
    are kept rather than dropped.

    Args:
        rows: Iterable of rows, each a sequence of cell values (strings or None).
        expected_columns: Number of cells a complete row must have.

    Returns:
        A list of rows: complete rows in their original order, interleaved
        with merged rows built from the incomplete ones.
    """
    from itertools import zip_longest

    def _merge_cell(current, extra):
        # Nothing new to add: keep the buffered cell exactly as it is.
        if not extra:
            return current
        # Buffered cell is empty: take the new text without a separator.
        if current is None or current == "":
            return extra
        return f"{current} {extra}"

    aligned_rows = []
    buffer = []

    for row in rows:
        if len(row) == expected_columns:
            # Flush pending fragments first so output order follows input order.
            if buffer:
                aligned_rows.append(buffer)
                buffer = []
            aligned_rows.append(row)
        elif buffer:
            # zip_longest pads the shorter side with None, which _merge_cell
            # treats as "no text"; no cell from either side is lost.
            buffer = [_merge_cell(b, r) for b, r in zip_longest(buffer, row)]
        else:
            # Copy so later merges never alias the caller's row.
            buffer = list(row)

    # Emit whatever is still pending after the last row.
    if buffer:
        aligned_rows.append(buffer)

    return aligned_rows
28
+
29
def parse_bhel_pdf(pdf_path):
    """Read the table on each page of a BHEL PDF into one DataFrame.

    For every page, the table returned by ``page.extract_table()`` is taken,
    its first row is dropped as the header, and the remaining rows are passed
    through ``preprocess_rows``. Rows that still do not have one cell per
    column afterwards are reported on stdout and left out.

    Args:
        pdf_path: Anything accepted by ``pdfplumber.open`` (a path or file object).

    Returns:
        A pandas DataFrame with the ten purchase-order columns listed below.
    """
    columns = [
        "Purchase Order No", "Date", "Sl No", "Material Description",
        "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
    ]
    width = len(columns)
    records = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if not table:
                # No table detected on this page.
                continue
            # table[0] is treated as the header row on every page.
            for candidate in preprocess_rows(table[1:], width):
                if len(candidate) != width:
                    print(f"Skipping unalignable row: {candidate}")
                    continue
                records.append(candidate)

    # Build the frame with explicit column labels.
    return pd.DataFrame(records, columns=columns)