"""Gradio app that extracts tabular rows from the text layer of an uploaded PDF."""

import gradio as gr
import pdfplumber
import pandas as pd

# The last six whitespace-separated tokens of a row map to these columns, in order.
_TRAILING_COLUMNS = (
    "Unit",
    "Quantity",
    "Dely Qty",
    "Dely Date",
    "Unit Rate",
    "Value",
)
# Full output schema, in display order.
_COLUMNS = ("Sl No", "Material Description", *_TRAILING_COLUMNS)
# A line needs at least one token per column to be treated as a table row.
_MIN_TOKENS = len(_COLUMNS)


def _parse_row(line):
    """Split one text line into a row dict, or return None if it has too few tokens."""
    tokens = line.split()
    if len(tokens) < _MIN_TOKENS:
        return None

    tail_len = len(_TRAILING_COLUMNS)
    row = {
        "Sl No": tokens[0],
        # The description may contain spaces, so it takes every token between
        # the serial number and the six fixed trailing fields.
        "Material Description": " ".join(tokens[1:-tail_len]),
    }
    row.update(zip(_TRAILING_COLUMNS, tokens[-tail_len:]))
    return row


def parse_bhel_pdf(pdf_file):
    """Extract table rows from the text of every page in a PDF.

    Each page's text is split into lines and each line into whitespace-separated
    tokens. Any line with at least eight tokens is taken as a row: the first
    token is ``Sl No``, the last six are ``Unit``, ``Quantity``, ``Dely Qty``,
    ``Dely Date``, ``Unit Rate`` and ``Value``, and everything in between is
    joined into ``Material Description``.

    NOTE: this is a purely positional heuristic -- any line with eight or more
    tokens (headers or free text included) is accepted as a row.

    Args:
        pdf_file: Path or file object accepted by ``pdfplumber.open``.

    Returns:
        A ``pandas.DataFrame`` of string values with the eight columns above.
        If no line qualifies, the frame is empty but still has those columns.
    """
    rows = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Pages without an extractable text layer contribute nothing.
                continue
            for line in text.split("\n"):
                row = _parse_row(line)
                if row is not None:
                    rows.append(row)

    # Passing the column list keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=list(_COLUMNS))


def gradio_interface(pdf_file):
    """Gradio callback: parse the uploaded PDF and return the rows as an HTML table.

    Args:
        pdf_file: The value Gradio passes for the file input. This may be a
            temp-file wrapper exposing ``.name``, a plain path string, or
            ``None`` when nothing was uploaded.

    Returns:
        An HTML string: the extracted table, or a short notice if no file was
        provided.
    """
    if pdf_file is None:
        return "<p>No file uploaded.</p>"

    # Accept both a file-like object with a `.name` path and a bare path string.
    path = getattr(pdf_file, "name", pdf_file)
    return parse_bhel_pdf(path).to_html()


# Gradio interface. The File component's `type` is left at its default because
# the accepted values differ between Gradio major versions; `gradio_interface`
# copes with either form of the uploaded-file value.
demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload PDF File"),
    outputs="html",
    title="BHEL PDF Data Extractor",
    description="Upload a BHEL PDF file to extract structured data in a tabular format.",
)
demo.launch()