File size: 759 Bytes
d2251a9
80b61aa
d2251a9
80b61aa
 
 
2659c0a
80b61aa
 
2659c0a
80b61aa
 
2659c0a
80b61aa
 
2659c0a
80b61aa
2659c0a
80b61aa
 
 
2659c0a
80b61aa
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import pandas as pd
import tabula

def extract_data(pdf_file):
    """Extract purchase-order tables from a PDF into a single DataFrame.

    Every table tabula detects on any page is stacked vertically, the columns
    are renamed to the fixed output schema below, and rows that are entirely
    empty are dropped.

    Args:
        pdf_file: The PDF to read; passed straight to ``tabula.read_pdf``.

    Returns:
        A ``pandas.DataFrame`` with the nine expected columns. If tabula
        finds no tables, an empty DataFrame with those columns is returned.

    Raises:
        ValueError: If the combined tables do not have exactly as many
            columns as the expected schema, so they cannot be renamed.
    """
    expected_columns = [
        'Purchase Order No',
        'Date',
        'Material Description',
        'Unit',
        'Quantity',
        'Dely Qty',
        'Dely Date',
        'Unit Rate',
        'Value',
    ]

    # Extract data from the PDF file using tabula (all pages).
    tables = tabula.read_pdf(pdf_file, pages='all')

    # pd.concat raises "No objects to concatenate" on an empty list, so a PDF
    # with no detectable tables yields an empty, correctly-labelled frame.
    if not tables:
        return pd.DataFrame(columns=expected_columns)

    # Combine the extracted tables into a single DataFrame.
    data = pd.concat(tables, ignore_index=True)

    # Fail with an actionable message instead of pandas' generic
    # "Length mismatch" when the extracted layout differs from the schema.
    if len(data.columns) != len(expected_columns):
        raise ValueError(
            f"Expected {len(expected_columns)} columns in the tables "
            f"extracted from {pdf_file!r}, found {len(data.columns)}: "
            f"{list(data.columns)}"
        )

    # Rename columns to match the expected output format.
    data.columns = expected_columns

    # Remove rows in which every cell is empty.
    data = data.dropna(how='all')

    return data

if __name__ == "__main__":
    import sys

    # Usage: script.py [input.pdf [output.xlsx]]
    # Both arguments are optional; when omitted, the original hard-coded
    # file names are used so existing invocations behave exactly as before.
    cli_args = sys.argv[1:]
    pdf_file = cli_args[0] if len(cli_args) > 0 else 'your_pdf_file.pdf'
    output_file = cli_args[1] if len(cli_args) > 1 else 'output.xlsx'

    data = extract_data(pdf_file)

    # Save the extracted data to an Excel file (without the index column).
    data.to_excel(output_file, index=False)