DSatishchandra committed on
Commit
80b61aa
1 Parent(s): 2659c0a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -46
app.py CHANGED
@@ -1,54 +1,24 @@
1
  import pandas as pd
2
- import gradio as gr
3
- import re
4
 
5
# Define function to extract data from Excel file
def extract_data(excel_file):
    """Filter a purchase-order workbook down to the required columns.

    The sheet is loaded with pandas and the printed form of each row is
    scanned for a "Purchase Order No" label and a "Date" label. The values
    found fill the ``Purchase Order No`` / ``Date`` columns when the sheet
    has no columns with those names. The required columns are then written
    to a new workbook.

    Args:
        excel_file: Input handed straight to ``pd.read_excel``.

    Returns:
        The path of the written workbook,
        ``/tmp/Filtered_Purchase_Order_Data.xlsx``.

    Raises:
        KeyError: If a required column other than the two filled in here is
            absent from the sheet.
    """
    # Load the Excel file
    df = pd.read_excel(excel_file)

    # Start from the sentinel: the loop below compares against it and the
    # columns are filled from these names, so they must exist even when no
    # row matches (previously this raised UnboundLocalError).
    purchase_order_no = "Not Found"
    purchase_order_date = "Not Found"

    # Attempt to extract 'Purchase Order No' and 'Date' from the rows
    for _, row in df.iterrows():
        row_text = str(row)

        # Search for Purchase Order No pattern in the row data
        po_match = re.search(r'Purchase Order No[:\s]+(\w+)', row_text, re.IGNORECASE)
        if po_match:
            purchase_order_no = po_match.group(1)

        # Search for Date pattern in the row data (e.g., "Date: 10.10.2023" or "10/10/2023")
        date_match = re.search(r'Date[:\s]+(\d{2}[\./-]\d{2}[\./-]\d{4})', row_text, re.IGNORECASE)
        if date_match:
            purchase_order_date = date_match.group(1)

        # Stop if both values are found
        if purchase_order_no != "Not Found" and purchase_order_date != "Not Found":
            break

    # Required columns to keep
    columns_to_keep = ["Purchase Order No", "Date", "SI No", "Material Description",
                       "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]

    # Add Purchase Order No and Date columns to the DataFrame if they are missing
    if "Purchase Order No" not in df.columns:
        df["Purchase Order No"] = purchase_order_no
    if "Date" not in df.columns:
        df["Date"] = purchase_order_date

    # Filter the DataFrame to only include relevant columns
    df_filtered = df[columns_to_keep]

    # Save the filtered data to a new Excel file
    output_path = "/tmp/Filtered_Purchase_Order_Data.xlsx"
    df_filtered.to_excel(output_path, index=False)

    return output_path
44
-
45
# Set up Gradio interface: one file upload in, one downloadable file out,
# with extract_data doing the conversion in between.
_upload_widget = gr.File(label="Upload Excel File")
_download_widget = gr.File(label="Download Filtered Excel")
iface = gr.Interface(
    fn=extract_data,
    inputs=_upload_widget,
    outputs=_download_widget,
    title="Excel Data Extractor",
)

# Launch the app
iface.launch()
 
1
  import pandas as pd
2
+ import tabula
 
3
 
4
def extract_data(pdf_file):
    """Extract the purchase-order table(s) from a PDF into one DataFrame.

    Each table tabula finds is relabelled positionally with the expected
    column names, and the tables are then stacked row-wise. Relabelling
    before concatenation keeps tables whose inferred headers differ aligned
    instead of spreading them over extra columns.

    Args:
        pdf_file: PDF location handed straight to ``tabula.read_pdf``.

    Returns:
        A DataFrame with the nine expected columns, with rows that are
        entirely empty removed. It has zero rows when tabula finds no
        non-empty table.

    Raises:
        ValueError: If a non-empty table does not have exactly as many
            columns as the expected layout.
    """
    expected_columns = ['Purchase Order No', 'Date', 'Material Description', 'Unit', 'Quantity', 'Dely Qty', 'Dely Date', 'Unit Rate', 'Value']

    # Extract data from the PDF file using tabula
    tables = tabula.read_pdf(pdf_file, pages='all')

    # Rename columns to match the expected output format, table by table.
    # NOTE(review): tabula appears to use each table's first row as its
    # header, so a continuation table may lose that row here - confirm
    # against real PDFs.
    relabelled = []
    for position, table in enumerate(tables, start=1):
        if table.empty:
            continue
        if table.shape[1] != len(expected_columns):
            raise ValueError(
                f"Table {position} has {table.shape[1]} columns; "
                f"expected {len(expected_columns)}"
            )
        renamed = table.copy()
        renamed.columns = expected_columns
        relabelled.append(renamed)

    # pd.concat rejects an empty list, so return an empty frame in the
    # expected layout when nothing usable was extracted.
    if not relabelled:
        return pd.DataFrame(columns=expected_columns)

    # Combine the extracted tables into a single DataFrame
    data = pd.concat(relabelled, ignore_index=True)

    # Remove rows that carry no values at all
    data = data.dropna(how='all')

    return data
 
 
 
 
 
 
 
 
18
 
19
if __name__ == "__main__":
    # Script entry point: convert the placeholder PDF path below and
    # save the extracted data to an Excel file without the index column.
    pdf_file = 'your_pdf_file.pdf'
    extract_data(pdf_file).to_excel('output.xlsx', index=False)