File size: 4,227 Bytes
8236ea1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a7c854
8236ea1
8a7c854
8236ea1
 
 
 
 
 
 
 
8a7c854
 
 
 
 
 
8236ea1
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import pdfplumber
import pandas as pd
import re
import gradio as gr

def extract_data(pdf_file, start_si=10, output_path="/tmp/Extracted_PO_Data_Dynamic.xlsx"):
    """
    Extract purchase-order item rows from a PDF and save them as an Excel file.

    Each page is read as text. A line is treated as an item row when it starts
    with an integer SI No that is at least ``start_si`` and its
    whitespace-separated tokens at positions 3 to 8 parse as unit, quantity,
    delivery quantity, delivery date, unit rate and value. Lines that do not
    fit that shape are skipped silently.

    Args:
        pdf_file: PDF to read; handed straight to ``pdfplumber.open`` (the
            callers in this file pass a filesystem path).
        start_si: Lowest SI No to keep. Rows with a smaller SI No are ignored.
        output_path: Location the Excel workbook is written to.

    Returns:
        ``output_path``, the path of the workbook that was written.
    """
    columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
    # Compiled once because it is applied to every line of every page
    si_no_pattern = re.compile(r"^\s*(\d+)\s")
    rows = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            full_text = page.extract_text()
            if not full_text:
                continue  # Nothing to parse on a page without extractable text

            for line in full_text.splitlines():
                si_no_match = si_no_pattern.match(line)
                if not si_no_match:
                    continue

                try:
                    si_no = int(si_no_match.group(1))
                    if si_no < start_si:
                        continue  # Skip rows below the start SI No

                    # Tokens 1-2 are taken to be the description code on the
                    # row itself, so the numeric columns begin at token 3
                    parts = line.split()
                    unit = parts[3]
                    quantity = int(parts[4])
                    dely_qty = int(parts[5])
                    dely_date = parts[6]
                    unit_rate = float(parts[7])
                    value = float(parts[8])
                except (ValueError, IndexError):
                    # Skip invalid rows or rows with missing data
                    continue

                # Only look up the description once the row is known to be valid
                material_desc = extract_material_description(full_text, si_no)
                rows.append([si_no, material_desc, unit, quantity, dely_qty, dely_date, unit_rate, value])

    # Convert data to DataFrame and save as Excel
    df = pd.DataFrame(rows, columns=columns)
    df.to_excel(output_path, index=False)
    return output_path

def extract_material_description(full_text, si_no):
    """
    Build the multi-line Material Description for one SI No row.

    The description has up to four newline-separated lines: the "BPS <digits>"
    code that follows the SI No, the Material Number, the HSN Code and the
    IGST percentage. The first two lines are omitted when the row cannot be
    located in the text; HSN Code and IGST report "Not Found" when absent.

    Args:
        full_text: Text of the page that contains the row.
        si_no: SI No of the row to describe.

    Returns:
        The assembled description with surrounding whitespace removed.
    """
    description_lines = []

    # The lookbehind stops SI No 10 from matching inside a longer number such
    # as 110; DOTALL lets the Material Number sit on a later line than the row.
    row_pattern = rf"(?<!\d){re.escape(str(si_no))}\s+(BPS\s+\d+).*?Material\s+Number:\s+(\d+)"
    row_match = re.search(row_pattern, full_text, re.DOTALL)
    if row_match:
        description_lines.append(row_match.group(1))
        description_lines.append(f"Material Number: {row_match.group(2)}")
        # Search the remaining fields from this row onward so they belong to
        # this item rather than to the first item printed on the page.
        details_text = full_text[row_match.start():]
    else:
        # Row not located: fall back to the first occurrence on the page
        details_text = full_text

    # Extract HSN Code
    hsn_code_match = re.search(r"HSN\s+Code:\s*(\d+)", details_text)
    if hsn_code_match:
        description_lines.append(f"HSN Code: {hsn_code_match.group(1)}")
    else:
        description_lines.append("HSN Code: Not Found")

    # Extract IGST
    igst_match = re.search(r"IGST\s*:\s*(\d+)\s*%", details_text)
    if igst_match:
        description_lines.append(f"IGST: {igst_match.group(1)} %")
    else:
        description_lines.append("IGST: Not Found")

    return "\n".join(description_lines).strip()

def process_pdf(file):
    """
    Run the extraction for an uploaded file and report the outcome.

    Returns a ``(path, message)`` pair: the Excel path produced by
    ``extract_data`` together with a success message, or ``None`` together
    with the error text when any exception is raised along the way.
    """
    try:
        excel_path = extract_data(file.name)
    except Exception as exc:
        # Failures are handed back as a message instead of being raised
        return None, f"Error during processing: {exc!s}"
    return excel_path, "Data extraction successful!"

# Gradio Interface
def gradio_interface(pdf_file):
    """
    Gradio callback: turn the uploaded PDF into an Excel file.

    Reads the upload's ``name`` attribute as the PDF path, passes it to
    ``extract_data`` and returns the resulting Excel path.
    """
    uploaded_path = pdf_file.name
    excel_path = extract_data(uploaded_path)
    return excel_path

# Gradio app: one PDF upload in, one downloadable Excel file out
interface = gr.Interface(
    title="Dynamic BHEL PO Data Extractor",
    description="Upload a PDF to extract accurate Material Numbers and related data dynamically into an Excel file.",
    fn=gradio_interface,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Extracted Excel"),
)

# Start the app only when this file is run as a script
if __name__ == "__main__":
    interface.launch()