Spaces:

DSatishchandra
/

POExtraction_UC3

Runtime error

POExtraction_UC3 / app.py

Update app.py

80b61aa verified about 1 month ago

759 Bytes

	import pandas as pd
	import tabula

	def extract_data(pdf_file, *, columns=None):
	    """Extract the tables found in a PDF into a single DataFrame.

	    All tables that tabula detects on any page are concatenated row-wise,
	    the resulting columns are renamed positionally to ``columns``, and
	    rows that contain no values at all are dropped.

	    Args:
	        pdf_file: The PDF to read; passed unchanged to ``tabula.read_pdf``.
	        columns: Optional sequence of output column names, applied by
	            position. Defaults to the nine purchase-order headers.

	    Returns:
	        A pandas DataFrame whose columns are named after ``columns``.

	    Raises:
	        ValueError: If no tables are detected in the PDF, or if the
	            combined table does not have exactly ``len(columns)`` columns.
	    """
	    if columns is None:
	        columns = [
	            'Purchase Order No',
	            'Date',
	            'Material Description',
	            'Unit',
	            'Quantity',
	            'Dely Qty',
	            'Dely Date',
	            'Unit Rate',
	            'Value',
	        ]
	    else:
	        columns = list(columns)

	    # Extract data from the PDF file using tabula
	    tables = tabula.read_pdf(pdf_file, pages='all')

	    # pd.concat fails with an opaque "No objects to concatenate" on an
	    # empty list, so report the actual problem instead.
	    if not tables:
	        raise ValueError(f"No tables were detected in {pdf_file!r}")

	    # Combine the extracted tables into a single DataFrame
	    data = pd.concat(tables, ignore_index=True)

	    # The rename below is positional, so the column count must match;
	    # check up front to give a message that names both sides.
	    if len(data.columns) != len(columns):
	        raise ValueError(
	            f"Expected {len(columns)} columns but the tables extracted from "
	            f"{pdf_file!r} have {len(data.columns)}: {list(data.columns)}"
	        )

	    # Rename columns to match the expected output format
	    data.columns = columns

	    # Remove rows in which every cell is empty
	    return data.dropna(how='all')

	if __name__ == "__main__":
	    # Imported here because only the script entry point needs it.
	    import sys

	    # Optional command-line arguments: [input PDF] [output workbook].
	    # When omitted, the original hard-coded file names are used.
	    cli_args = sys.argv[1:]
	    pdf_file = cli_args[0] if len(cli_args) > 0 else 'your_pdf_file.pdf'
	    output_file = cli_args[1] if len(cli_args) > 1 else 'output.xlsx'

	    data = extract_data(pdf_file)

	    # Save the extracted data to an Excel file, omitting the index column
	    data.to_excel(output_file, index=False)