DSatishchandra committed on
Commit
c051cf9
1 Parent(s): e86b321

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -17
app.py CHANGED
@@ -1,24 +1,51 @@
1
- import pandas as pd
2
- import tabula
 
3
 
4
# Column names applied to the combined table, in the order the code assigns them.
_EXPECTED_COLUMNS = [
    'Purchase Order No', 'Date', 'Material Description', 'Unit', 'Quantity',
    'Dely Qty', 'Dely Date', 'Unit Rate', 'Value',
]


def extract_data(pdf_file):
    """Extract every table from a PDF and return them as one DataFrame.

    Args:
        pdf_file: Path (or other input accepted by ``tabula.read_pdf``) of
            the PDF to read.

    Returns:
        A DataFrame whose columns are renamed to the expected purchase-order
        layout, with rows that are entirely empty removed. When no table is
        found, an empty DataFrame carrying the expected columns is returned
        instead of letting ``pd.concat`` fail on an empty list.

    Raises:
        ValueError: If the combined table does not have exactly as many
            columns as the expected layout.
    """
    # Extract data from the PDF file using tabula
    tables = tabula.read_pdf(pdf_file, pages='all')

    # pd.concat raises ValueError on an empty sequence, so handle "no tables"
    # explicitly and still hand callers a frame with the expected shape.
    if not tables:
        return pd.DataFrame(columns=list(_EXPECTED_COLUMNS))

    # Combine the extracted tables into a single DataFrame
    data = pd.concat(tables, ignore_index=True)

    # Fail with a readable message rather than pandas' generic length-mismatch
    # error when the PDF layout differs from what this script expects.
    if len(data.columns) != len(_EXPECTED_COLUMNS):
        raise ValueError(
            f"Expected {len(_EXPECTED_COLUMNS)} columns but the PDF tables "
            f"yielded {len(data.columns)}: {list(data.columns)}"
        )

    # Rename columns to match the expected output format
    data.columns = list(_EXPECTED_COLUMNS)

    # Remove rows in which every cell is empty
    return data.dropna(how='all')


if __name__ == "__main__":
    pdf_file = 'your_pdf_file.pdf'
    data = extract_data(pdf_file)

    # Save the extracted data to an Excel file
    data.to_excel('output.xlsx', index=False)
 
1
+ import re
2
+ from typing import Dict, List, Union
3
+ import gradio as gr
4
 
5
# Header fields, each captured by group 1 of its pattern: (result key, pattern, flags).
_HEADER_FIELDS = (
    ("Purchase Order No", r"Purchase Order No\.\s(\d+)", 0),
    ("Date", r"Date:\s+(\d{2}-\w{3}-\d{2})", 0),
    ("Invoice Address", r"Invoice Address\s*:\s*(.*?)(?=\sDelivery Address)", re.DOTALL),
    ("Delivery Address", r"Delivery Address\s*:\s*(.*?)(?=\sNote)", re.DOTALL),
)

# Item rows: item number, description, integer quantity, unit price, total price.
_ITEM_PATTERN = re.compile(r"(\d+)\s+([\w\s]+)\s+(\d+)\s+([\d.]+)\s+([\d.]+)")


def parse_federal_transformers(file_text: str) -> Dict[str, Union[str, List[Dict[str, Union[str, int, float]]]]]:
    """Parse the text of a Federal Transformers purchase order.

    Each header field is extracted independently, so a field that is missing
    from the text is left as an empty string without preventing the other
    fields or the item rows from being parsed.

    Args:
        file_text: Full text of the purchase order.

    Returns:
        A dict with the keys "Purchase Order No", "Date", "Supplier",
        "Invoice Address", "Delivery Address", "Currency", "Payment Terms"
        and "Items". "Supplier" is a fixed value; "Currency" and
        "Payment Terms" are not extracted here and stay empty. "Items" is a
        list of dicts with "Item No", "Description", "Quantity",
        "Unit Price" and "Total Price". Empty or non-string input yields the
        defaults.
    """
    items: List[Dict[str, Union[str, int, float]]] = []
    parsed_data: Dict[str, Union[str, List[Dict[str, Union[str, int, float]]]]] = {
        "Purchase Order No": "",
        "Date": "",
        "Supplier": "Federal Transformers Co. LLC",
        "Invoice Address": "",
        "Delivery Address": "",
        "Currency": "",
        "Payment Terms": "",
        "Items": items,
    }

    # Nothing to search: return the defaults rather than raising.
    if not isinstance(file_text, str) or not file_text:
        return parsed_data

    for key, pattern, flags in _HEADER_FIELDS:
        found = re.search(pattern, file_text, flags)
        if found is None:
            # A missing field must not abort the remaining extraction.
            print(f"Federal Transformers PO: field not found: {key}")
            continue
        parsed_data[key] = found.group(1).strip()

    for match in _ITEM_PATTERN.finditer(file_text):
        try:
            item: Dict[str, Union[str, int, float]] = {
                "Item No": match.group(1),
                "Description": match.group(2).strip(),
                "Quantity": int(match.group(3)),
                "Unit Price": float(match.group(4)),
                "Total Price": float(match.group(5)),
            }
        except ValueError:
            # [\d.]+ also matches strings such as "1.2.3"; skip just that row.
            print(f"Federal Transformers PO: skipping malformed item row: {match.group(0)!r}")
            continue
        items.append(item)

    return parsed_data
40
 
41
# Gradio front end for the parser: the function receives the submitted text
# and its returned dict is rendered as JSON.
iface = gr.Interface(
    title="Federal Transformers PO Parser",
    description="Upload the text of a Federal Transformers purchase order to extract details.",
    fn=parse_federal_transformers,
    inputs="text",
    outputs="json",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()