Spaces:
Build error
Build error
import io
import tempfile

import gradio as gr
import pandas as pd
import pdfplumber  # Or any other PDF extraction library
def extract_columns_from_pdf(file, columns):
    """Extract selected columns from every table in a PDF into an Excel file.

    Each table found on each page is turned into a DataFrame whose header is
    the table's first row; all tables are concatenated, reduced to the
    requested columns, and written to a temporary ``.xlsx`` file.

    Args:
        file: The uploaded PDF. Either a filesystem path, or an object that
            exposes the path as a string ``.name`` attribute; anything else
            is handed to ``pdfplumber.open`` unchanged.
            NOTE(review): which of these Gradio passes depends on the
            installed Gradio version -- confirm against the deployed one.
        columns: Comma-separated column names to keep. Surrounding whitespace
            is stripped and empty entries are ignored.

    Returns:
        The path (``str``) of the generated ``.xlsx`` file, which is the form
        a ``gr.File`` output component can serve for download.

    Raises:
        gr.Error: If no file or no column name is given, the PDF contains no
            tables, a requested column is absent, or reading/writing fails.
            The message is shown to the user in the Gradio UI.
    """
    if file is None:
        raise gr.Error("Error: no PDF file was uploaded")

    # Drop blank entries so input such as "a, ,b" or a trailing comma does
    # not turn into a lookup of an empty column name.
    required_columns = [
        name.strip() for name in (columns or "").split(",") if name.strip()
    ]
    if not required_columns:
        raise gr.Error("Error: enter at least one column name")

    # Prefer the on-disk path when the upload object carries one.
    upload_path = getattr(file, "name", None)
    source = upload_path if isinstance(upload_path, str) else file

    try:
        pdf_data = []
        with pdfplumber.open(source) as pdf:
            for page in pdf.pages:
                for table in page.extract_tables():
                    if not table:
                        # Nothing to use as a header row.
                        continue
                    # The first row is treated as the header of the table.
                    pdf_data.append(
                        pd.DataFrame(table[1:], columns=table[0])
                    )

        # pd.concat raises an opaque ValueError on an empty list; report the
        # real cause instead.
        if not pdf_data:
            raise gr.Error("Error: no tables were found in the PDF")

        full_data = pd.concat(pdf_data, ignore_index=True)

        missing = [
            name for name in required_columns if name not in full_data.columns
        ]
        if missing:
            available = ", ".join(str(col) for col in full_data.columns)
            raise gr.Error(
                f"Error: column(s) not found: {', '.join(missing)}. "
                f"Available columns: {available}"
            )

        filtered_data = full_data[required_columns]

        # A gr.File output needs a path on disk, not an in-memory buffer.
        # Reserve a name, close the handle, then let pandas write to it.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            output_path = tmp.name
        with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
            filtered_data.to_excel(writer, index=False)
        return output_path
    except gr.Error:
        # Already a user-facing message; do not re-wrap it.
        raise
    except Exception as e:
        # Returning the message as a string would be mistaken for a file
        # path by the File output, so raise it for the UI to display.
        raise gr.Error(f"Error: {str(e)}") from e
def main():
    """Build the Gradio UI around ``extract_columns_from_pdf`` and launch it.

    The interface takes a PDF upload and a comma-separated list of column
    names, and offers the function's result as a downloadable file.
    """
    # Components are created in the same order as before: text box, upload,
    # download.
    columns_box = gr.Textbox(label="Enter Column Names (comma-separated)")
    pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
    excel_download = gr.File(label="Download Extracted Excel File")

    # Input order must match the function signature: (file, columns).
    gr.Interface(
        fn=extract_columns_from_pdf,
        inputs=[pdf_upload, columns_box],
        outputs=excel_download,
        live=False,
    ).launch()
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()