"""Gradio app: pull selected table columns out of a PDF into an Excel file."""

import io
import tempfile

import gradio as gr
import pandas as pd
import pdfplumber  # Or any other PDF extraction library


def _read_pdf_tables(pdf_path):
    """Return one DataFrame per non-empty table found in the PDF.

    The first row of each extracted table is used as its header row.
    """
    frames = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                # An empty table has no header row to index; skip it.
                if not table:
                    continue
                frames.append(pd.DataFrame(table[1:], columns=table[0]))
    return frames


def extract_columns_from_pdf(file, columns):
    """Extract the requested columns from every table in a PDF.

    Args:
        file: The uploaded PDF as delivered by the ``gr.File`` input: either
            a filesystem path or a file-like object exposing ``.name``.
        columns: Comma-separated column names to keep, in output order.

    Returns:
        Path to a temporary ``.xlsx`` file holding the filtered rows. A path
        (rather than an in-memory buffer) is returned because the ``gr.File``
        output component serves files from disk.

    Raises:
        gr.Error: If no file was uploaded, no column names were given, the
            PDF contains no tables, a requested column is missing, or the
            PDF/Excel processing fails. Gradio shows the message in the UI.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file.")

    required_columns = [
        name.strip() for name in (columns or "").split(",") if name.strip()
    ]
    if not required_columns:
        raise gr.Error("Please enter at least one column name.")

    # Accept both a plain path and a tempfile-like wrapper with ``.name``.
    pdf_path = getattr(file, "name", file)

    try:
        frames = _read_pdf_tables(pdf_path)
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise gr.Error("Error: no tables were found in the PDF.")

    try:
        # Combine all tables into one DataFrame
        full_data = pd.concat(frames, ignore_index=True)
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e

    missing = [name for name in required_columns if name not in full_data.columns]
    if missing:
        available = ", ".join(str(name) for name in full_data.columns)
        raise gr.Error(
            f"Error: column(s) not found: {', '.join(missing)}. "
            f"Available columns: {available}"
        )

    filtered_data = full_data[required_columns]

    try:
        # Reserve a named temp file, then close it before writing so the
        # Excel writer can reopen the path on every platform.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            excel_path = tmp.name
        with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
            filtered_data.to_excel(writer, index=False)
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e

    return excel_path


def main():
    """Build the Gradio interface and launch the app."""
    # Define Gradio interface components
    column_input = gr.Textbox(label="Enter Column Names (comma-separated)")
    file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    output_file = gr.File(label="Download Extracted Excel File")

    # Create Gradio interface
    interface = gr.Interface(
        fn=extract_columns_from_pdf,
        inputs=[file_input, column_input],
        outputs=output_file,
        live=False,
    )

    # Launch app
    interface.launch()


if __name__ == "__main__":
    main()