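# NOTE: the lines below are file-viewer page text captured with the source;
# they are kept as comments so the module remains valid Python.
# pdf-to-excel / app.py
# neerajkalyank's picture
# Create app.py
# 415afc1 verified
# raw
# history blame
# 1.72 kB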
import io
import tempfile

import gradio as gr
import pandas as pd
import pdfplumber  # Or any other PDF extraction library
def _clean_header(cell):
    """Return a header cell as a string with runs of whitespace collapsed."""
    # Guard against None cells and embedded line breaks, neither of which
    # can be matched by a name typed into a single-line text box.
    return "" if cell is None else " ".join(str(cell).split())


def extract_columns_from_pdf(file, columns):
    """Copy the requested columns from every table in a PDF into an Excel file.

    The first row of each extracted table is used as its header. All tables
    are stacked into one sheet; a table lacking one of the requested columns
    contributes empty cells for it.

    Args:
        file: The PDF to read, as a filesystem path or a file-like object.
            A file-like object exposing a string ``name`` is opened by that
            path rather than read through the (possibly exhausted) stream.
        columns: Comma-separated column names to keep, in output order.
            Surrounding whitespace and empty entries are ignored.

    Returns:
        Path of a temporary ``.xlsx`` file holding the selected columns.
        A path is returned (rather than an in-memory buffer) because the
        ``gr.File`` output component serves files from disk.

    Raises:
        gr.Error: If no file or column names are given, the PDF contains no
            tables, a requested column is absent, or extraction/writing
            fails. The message is shown to the user by Gradio.
    """
    try:
        wanted = [_clean_header(name) for name in (columns or "").split(",")]
        wanted = [name for name in wanted if name]
        if not wanted:
            raise ValueError("No column names were provided.")
        if file is None:
            raise ValueError("No PDF file was provided.")

        # Uploads may arrive as a path or as a temp-file object. Only trust
        # ``name`` on real file objects: pathlib.Path also has a ``name``,
        # but it is just the final path component.
        path_attr = getattr(file, "name", None)
        if hasattr(file, "read") and isinstance(path_attr, str):
            source = path_attr
        else:
            source = file

        frames = []
        with pdfplumber.open(source) as pdf:
            for page in pdf.pages:
                for table in page.extract_tables():
                    if not table:
                        continue  # no header row to build columns from
                    header = [_clean_header(cell) for cell in table[0]]
                    frame = pd.DataFrame(table[1:], columns=header)
                    # Repeated header names make pd.concat fail when tables
                    # differ in layout; keep the first occurrence of each.
                    frames.append(frame.loc[:, ~frame.columns.duplicated()])

        if not frames:
            raise ValueError("No tables were found in the PDF.")
        full_data = pd.concat(frames, ignore_index=True)

        missing = [name for name in wanted if name not in full_data.columns]
        if missing:
            available = ", ".join(map(str, full_data.columns))
            raise ValueError(
                f"Column(s) not found: {', '.join(missing)}. "
                f"Available columns: {available}"
            )

        # Reserve a uniquely named file, close it, then let pandas reopen it
        # by path (an open handle cannot be reopened on every platform).
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as handle:
            output_path = handle.name
        with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
            full_data[wanted].to_excel(writer, index=False)
        return output_path
    except Exception as exc:
        # Returning the message as a value would be treated as a file path by
        # the File output; raising gr.Error displays it in the UI instead.
        raise gr.Error(f"Error: {exc}") from exc
def main():
    """Build the PDF-to-Excel Gradio interface and start serving it."""
    # Components are created in the same order as before: text box first,
    # then the upload widget, then the download widget.
    columns_box = gr.Textbox(label="Enter Column Names (comma-separated)")
    pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
    excel_download = gr.File(label="Download Extracted Excel File")

    # The handler receives (file, columns), matching the order of `inputs`.
    gr.Interface(
        fn=extract_columns_from_pdf,
        inputs=[pdf_upload, columns_box],
        outputs=excel_download,
        live=False,
    ).launch()
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()