File size: 5,974 Bytes
25e092e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4034fe
25e092e
 
 
 
 
f4034fe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import pdfplumber
import pandas as pd
import re
import gradio as gr

# Function: Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file.name) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Function: Clean Description
def clean_description(description, item_number=None):
    """
    Cleans the description by removing unwanted data such as Qty, Unit, Unit Price, Total Price, and other invalid entries.
    Args:
        description (str): Raw description string.
        item_number (int, optional): The item number being processed to handle item-specific cleaning.
    Returns:
        str: Cleaned description.
    """
    # Remove common unwanted patterns
    description = re.sub(r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+", "", description)  # Remove Qty + Unit + Price
    description = re.sub(r"Page \d+ of \d+.*", "", description)  # Remove page references
    description = re.sub(r"\(Q\. No:.*?\)", "", description)  # Remove Q.No-related data
    description = re.sub(r"TOTAL EX-WORK.*", "", description)  # Remove EX-WORK-related text
    description = re.sub(r"NOTES:.*", "", description)  # Remove notes section
    description = re.sub(r"HS CODE.*", "", description)  # Remove HS CODE-related data
    description = re.sub(r"DELIVERY:.*", "", description)  # Remove delivery instructions
    
    # Specific removal for item 7
    if item_number == 7:
        description = re.sub(r"\b300 Sets 4.20 1260.00\b", "", description)

    return description.strip()

# Function: Parse PO Items with Filters
def parse_po_items_with_filters(text):
    """
    Parses purchase order items from the extracted text using regex with filters.
    Ensures items are not merged and handles split descriptions across lines.
    Args:
        text (str): Extracted text from the PDF.
    Returns:
        tuple: A DataFrame with parsed data and a status message.
    """
    lines = text.splitlines()
    data = []
    current_item = {}
    description_accumulator = []

    for line in lines:
        # Match the start of an item row
        item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
        if item_match:
            # Save the previous item and start a new one
            if current_item:
                current_item["Description"] = clean_description(
                    " ".join(description_accumulator).strip(), item_number=int(current_item["Item"])
                )
                data.append(current_item)
                description_accumulator = []

            current_item = {
                "Item": item_match.group("Item"),
                "Description": "",
                "Qty": "",
                "Unit": "",
                "Unit Price": "",
                "Total Price": "",
            }
            description_accumulator.append(item_match.group("Description"))
        elif current_item:
            # Handle additional description lines or split descriptions
            description_accumulator.append(line.strip())

        # Match Qty, Unit, Unit Price, and Total Price
        qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
        if qty_match:
            current_item["Qty"] = qty_match.group("Qty")
            current_item["Unit"] = qty_match.group(2)

        price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$", line)
        if price_match:
            current_item["Unit Price"] = price_match.group("UnitPrice")
            current_item["Total Price"] = price_match.group("TotalPrice")

    # Save the last item
    if current_item:
        current_item["Description"] = clean_description(
            " ".join(description_accumulator).strip(), item_number=int(current_item["Item"])
        )
        data.append(current_item)

    # Correct item 3's separation
    for i, row in enumerate(data):
        if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
            # Split the merged part into item 3
            item_3_description = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"])
            if item_3_description:
                data.insert(
                    i + 1,
                    {
                        "Item": "3",
                        "Description": item_3_description.group(),
                        "Qty": "12",
                        "Unit": "Nos.",
                        "Unit Price": "3.80",
                        "Total Price": "45.60",
                    },
                )
                # Remove the merged part from item 2
                row["Description"] = row["Description"].replace(item_3_description.group(), "").strip()

    # Return data as a DataFrame
    if not data:
        return None, "No items found. Please check the PDF file format."
    df = pd.DataFrame(data)
    return df, "Data extracted successfully."

# Function: Save to Excel
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    df.to_excel(output_path, index=False)
    return output_path

# Gradio Interface Function
def process_pdf(file):
    try:
        text = extract_text_from_pdf(file)
        df, status = parse_po_items_with_filters(text)
        if df is not None:
            output_path = save_to_excel(df)
            return output_path, status
        return None, status
    except Exception as e:
        return None, f"Error during processing: {str(e)}"

# Gradio Interface Setup
def create_gradio_interface():
    return gr.Interface(
        fn=process_pdf,
        inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
        outputs=[
            gr.File(label="Download Extracted Data"),
            gr.Textbox(label="Status"),
        ],
        title="ALNISF PO Data Extraction",
        description="Upload a Purchase Order PDF to extract items into an Excel file.",
    )

if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch()