Spaces:
Sleeping
Sleeping
import pdfplumber | |
import pandas as pd | |
import re | |
import gradio as gr | |
# Function: Extract Text from PDF | |
def extract_text_from_pdf(pdf_file): | |
with pdfplumber.open(pdf_file.name) as pdf: | |
text = "" | |
for page in pdf.pages: | |
text += page.extract_text() | |
return text | |
# Function: Clean Description | |
def clean_description(description, item_number=None): | |
""" | |
Cleans the description by removing unwanted data such as Qty, Unit, Unit Price, Total Price, and other invalid entries. | |
Args: | |
description (str): Raw description string. | |
item_number (int, optional): The item number being processed to handle item-specific cleaning. | |
Returns: | |
str: Cleaned description. | |
""" | |
# Remove common unwanted patterns | |
description = re.sub(r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+", "", description) # Remove Qty + Unit + Price | |
description = re.sub(r"Page \d+ of \d+.*", "", description) # Remove page references | |
description = re.sub(r"\(Q\. No:.*?\)", "", description) # Remove Q.No-related data | |
description = re.sub(r"TOTAL EX-WORK.*", "", description) # Remove EX-WORK-related text | |
description = re.sub(r"NOTES:.*", "", description) # Remove notes section | |
description = re.sub(r"HS CODE.*", "", description) # Remove HS CODE-related data | |
description = re.sub(r"DELIVERY:.*", "", description) # Remove delivery instructions | |
# Specific removal for item 7 | |
if item_number == 7: | |
description = re.sub(r"\b300 Sets 4.20 1260.00\b", "", description) | |
return description.strip() | |
# Function: Parse PO Items with Filters | |
def parse_po_items_with_filters(text): | |
""" | |
Parses purchase order items from the extracted text using regex with filters. | |
Ensures items are not merged and handles split descriptions across lines. | |
Args: | |
text (str): Extracted text from the PDF. | |
Returns: | |
tuple: A DataFrame with parsed data and a status message. | |
""" | |
lines = text.splitlines() | |
data = [] | |
current_item = {} | |
description_accumulator = [] | |
for line in lines: | |
# Match the start of an item row | |
item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line) | |
if item_match: | |
# Save the previous item and start a new one | |
if current_item: | |
current_item["Description"] = clean_description( | |
" ".join(description_accumulator).strip(), item_number=int(current_item["Item"]) | |
) | |
data.append(current_item) | |
description_accumulator = [] | |
current_item = { | |
"Item": item_match.group("Item"), | |
"Description": "", | |
"Qty": "", | |
"Unit": "", | |
"Unit Price": "", | |
"Total Price": "", | |
} | |
description_accumulator.append(item_match.group("Description")) | |
elif current_item: | |
# Handle additional description lines or split descriptions | |
description_accumulator.append(line.strip()) | |
# Match Qty, Unit, Unit Price, and Total Price | |
qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line) | |
if qty_match: | |
current_item["Qty"] = qty_match.group("Qty") | |
current_item["Unit"] = qty_match.group(2) | |
price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$", line) | |
if price_match: | |
current_item["Unit Price"] = price_match.group("UnitPrice") | |
current_item["Total Price"] = price_match.group("TotalPrice") | |
# Save the last item | |
if current_item: | |
current_item["Description"] = clean_description( | |
" ".join(description_accumulator).strip(), item_number=int(current_item["Item"]) | |
) | |
data.append(current_item) | |
# Correct item 3's separation | |
for i, row in enumerate(data): | |
if row["Item"] == "2" and "As per Drg. to." in row["Description"]: | |
# Split the merged part into item 3 | |
item_3_description = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"]) | |
if item_3_description: | |
data.insert( | |
i + 1, | |
{ | |
"Item": "3", | |
"Description": item_3_description.group(), | |
"Qty": "12", | |
"Unit": "Nos.", | |
"Unit Price": "3.80", | |
"Total Price": "45.60", | |
}, | |
) | |
# Remove the merged part from item 2 | |
row["Description"] = row["Description"].replace(item_3_description.group(), "").strip() | |
# Return data as a DataFrame | |
if not data: | |
return None, "No items found. Please check the PDF file format." | |
df = pd.DataFrame(data) | |
return df, "Data extracted successfully." | |
# Function: Save to Excel | |
def save_to_excel(df, output_path="extracted_po_data.xlsx"): | |
df.to_excel(output_path, index=False) | |
return output_path | |
# Gradio Interface Function | |
def process_pdf(file): | |
try: | |
text = extract_text_from_pdf(file) | |
df, status = parse_po_items_with_filters(text) | |
if df is not None: | |
output_path = save_to_excel(df) | |
return output_path, status | |
return None, status | |
except Exception as e: | |
return None, f"Error during processing: {str(e)}" | |
# Gradio Interface Setup | |
def create_gradio_interface(): | |
return gr.Interface( | |
fn=process_pdf, | |
inputs=gr.File(label="Upload PDF", file_types=[".pdf"]), | |
outputs=[ | |
gr.File(label="Download Extracted Data"), | |
gr.Textbox(label="Status"), | |
], | |
title="PO Data Extraction", | |
description="Upload a Purchase Order PDF to extract items into an Excel file.", | |
) | |
if __name__ == "__main__": | |
interface = create_gradio_interface() | |
interface.launch() |