"""Extract purchase-order tables from a PDF and save them to an Excel file."""

import pandas as pd
import tabula

# Column names applied, by position, to the combined table.
EXPECTED_COLUMNS = [
    'Purchase Order No',
    'Date',
    'Material Description',
    'Unit',
    'Quantity',
    'Dely Qty',
    'Dely Date',
    'Unit Rate',
    'Value',
]


def extract_data(pdf_file) -> pd.DataFrame:
    """Read every table in *pdf_file* and return them as one DataFrame.

    Tables from all pages are read with ``tabula.read_pdf``, concatenated
    into a single frame, relabelled with ``EXPECTED_COLUMNS`` and stripped
    of rows that are entirely empty.

    Args:
        pdf_file: The PDF input, passed unchanged to ``tabula.read_pdf``.

    Returns:
        A DataFrame whose columns are ``EXPECTED_COLUMNS``.

    Raises:
        ValueError: If no tables are found in the PDF, or if the combined
            table does not have exactly ``len(EXPECTED_COLUMNS)`` columns.
    """
    tables = tabula.read_pdf(pdf_file, pages='all')

    # pd.concat raises an opaque "No objects to concatenate" on an empty
    # list; fail with a message that names the input instead.
    if not tables:
        raise ValueError(f"No tables found in {pdf_file!r}")

    data = pd.concat(tables, ignore_index=True)

    # Columns are renamed by position, so the width must match exactly.
    # Checking here gives a readable error rather than pandas' generic
    # "Length mismatch" message.
    if len(data.columns) != len(EXPECTED_COLUMNS):
        raise ValueError(
            f"Expected {len(EXPECTED_COLUMNS)} columns but the tables "
            f"extracted from {pdf_file!r} have {len(data.columns)}: "
            f"{list(data.columns)}"
        )
    data.columns = EXPECTED_COLUMNS

    # Drop rows in which every cell is missing.
    data = data.dropna(how='all')

    return data


def main() -> None:
    """Extract the tables from the input PDF and write them to Excel."""
    pdf_file = 'your_pdf_file.pdf'
    data = extract_data(pdf_file)

    # Save the extracted data to an Excel file, without the index column.
    data.to_excel('output.xlsx', index=False)


if __name__ == "__main__":
    main()