DSatishchandra committed on
Commit
ab2a518
1 Parent(s): b051e96

Update parse_bhel.py

Browse files
Files changed (1) hide show
  1. parse_bhel.py +39 -53
parse_bhel.py CHANGED
@@ -1,58 +1,44 @@
1
- import re
2
- import pandas as pd
3
  import pdfplumber
 
4
 
5
- # Define the target columns based on your table headers
6
- columns = [
7
- "Purchase Order No", "Date", "Sl No", "Material Description",
8
- "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"
9
- ]
10
-
11
- # Initialize an empty DataFrame with the defined columns
12
- data = pd.DataFrame(columns=columns)
13
-
14
- # Define regex patterns to identify and parse required lines
15
- po_pattern = re.compile(r'^\d{10} / \d{2}\.\d{2}\.\d{4}') # Purchase Order pattern
16
- material_pattern = re.compile(r'^\d{1,3} ') # Pattern for lines starting with Sl No
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Function to clean and split rows
19
- def clean_and_split_line(line):
20
- # Split line into components based on spaces and commas
21
- parts = re.split(r'\s{2,}', line.strip()) # Split by two or more spaces
22
- return parts if len(parts) == len(columns) else None
23
 
24
- # Process the PDF and extract relevant lines
25
- with pdfplumber.open('your_pdf_file.pdf') as pdf:
26
- for page in pdf.pages:
27
- text = page.extract_text().splitlines()
28
-
29
- for line in text:
30
- # Check for Purchase Order row
31
- if po_pattern.match(line):
32
- po_data = line.split(' / ')
33
- po_no = po_data[0]
34
- po_date = po_data[1]
35
-
36
- # Check if the line contains material data
37
- elif material_pattern.match(line):
38
- cleaned_data = clean_and_split_line(line)
39
- if cleaned_data:
40
- row_data = {
41
- "Purchase Order No": po_no,
42
- "Date": po_date,
43
- "Sl No": cleaned_data[0],
44
- "Material Description": cleaned_data[1],
45
- "Unit": cleaned_data[2],
46
- "Quantity": cleaned_data[3],
47
- "Dely Qty": cleaned_data[4],
48
- "Dely Date": cleaned_data[5],
49
- "Unit Rate": cleaned_data[6],
50
- "Value": cleaned_data[7],
51
- }
52
- data = data.append(row_data, ignore_index=True)
53
- # Skip irrelevant lines or unalignable rows
54
- else:
55
- continue
56
 
57
- # Save extracted data to an Excel file
58
- data.to_excel("extracted_data.xlsx", index=False)
 
 
 
 
 
 
 
1
+ import gradio as gr
 
2
  import pdfplumber
3
+ import pandas as pd
4
 
5
def parse_bhel_pdf(pdf_file):
    """Extract line-item rows from a BHEL purchase-order PDF.

    Parameters
    ----------
    pdf_file : str or file-like
        Path (or open binary file object) accepted by ``pdfplumber.open``.

    Returns
    -------
    pandas.DataFrame
        One row per detected material line with columns: 'Sl No',
        'Material Description', 'Unit', 'Quantity', 'Dely Qty',
        'Dely Date', 'Unit Rate', 'Value'. Empty (no columns) if no
        material rows were found.
    """
    data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Page yielded no extractable text (e.g. a scanned image).
                continue
            for line in text.split('\n'):
                parts = line.split()
                # A material row has at least 8 whitespace-separated fields
                # AND starts with a numeric serial number. Without the digit
                # check, any header/footer/address line with >= 8 tokens was
                # misparsed as a data row.
                if len(parts) >= 8 and parts[0].isdigit():
                    data.append({
                        'Sl No': parts[0],
                        # Description may contain spaces: everything between
                        # the serial number and the last six fixed columns.
                        'Material Description': " ".join(parts[1:-6]),
                        'Unit': parts[-6],
                        'Quantity': parts[-5],
                        'Dely Qty': parts[-4],
                        'Dely Date': parts[-3],
                        'Unit Rate': parts[-2],
                        'Value': parts[-1],
                    })
    # Convert extracted rows to a DataFrame (column order follows insertion).
    return pd.DataFrame(data)
 
 
31
 
32
def gradio_interface(pdf_file):
    """Gradio handler: parse the uploaded PDF and render it as HTML.

    Parameters
    ----------
    pdf_file : file-like or str
        Upload from ``gr.File``. Older Gradio versions pass a tempfile
        wrapper exposing ``.name``; newer versions pass the path string
        directly.

    Returns
    -------
    str
        The extracted table rendered via ``DataFrame.to_html``.
    """
    # Accept both a tempfile wrapper (has .name) and a plain path string,
    # so the handler survives Gradio version changes; the original
    # unconditionally read .name and crashed on a str input.
    path = getattr(pdf_file, "name", pdf_file)
    df = parse_bhel_pdf(path)
    return df.to_html()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
# Build and launch the Gradio UI. Guarded so that importing this module
# (e.g. from a test or another app) does not start a web server as a
# module-level side effect.
# NOTE(review): gr.File(type="file") is deprecated in Gradio >= 3.x —
# switch to type="filepath" when upgrading; left unchanged here to match
# the pinned Gradio version.
if __name__ == "__main__":
    gr.Interface(
        fn=gradio_interface,
        inputs=gr.File(type="file", label="Upload PDF File"),
        outputs="html",
        title="BHEL PDF Data Extractor",
        description="Upload a BHEL PDF file to extract structured data in a tabular format.",
    ).launch()