neerajkalyank committed on
Commit
359e981
1 Parent(s): 7ebbb35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -27
app.py CHANGED
@@ -7,50 +7,60 @@ def extract_data_from_pdf(pdf_file):
7
  data = []
8
  po_number = None
9
 
10
- # Open PDF file directly
11
  with pdfplumber.open(pdf_file.name) as pdf:
12
  for page in pdf.pages:
13
  text = page.extract_text()
14
 
15
- # Extract PO number (only once at the start)
16
  if po_number is None:
17
  po_match = re.search(r"Purchase Order : (\w+)", text)
18
  po_number = po_match.group(1) if po_match else "N/A"
19
 
20
- # Regex pattern for extracting rows
21
  row_pattern = re.compile(
22
- r"(\d+)\s+(\d{9})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+INR\s+([\d.]+)"
23
  )
24
 
25
- # Extract each row using the pattern
26
  for match in row_pattern.finditer(text):
27
- pos, item_code, unit, delivery_date, quantity, basic_price, discount, amount = match.groups()
 
 
 
 
 
 
 
 
28
 
29
- # Extract subtotal if present
30
  sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
31
  sub_total = sub_total_match.group(1) if sub_total_match else "0.0"
32
 
33
- # Append data for each matched row
34
- data.append({
35
- "Purchase Order": po_number,
36
- "Pos.": pos,
37
- "Item Code": item_code,
38
- "Unit": unit,
39
- "Delivery Date": delivery_date,
40
- "Quantity": quantity,
41
- "Basic Price": basic_price,
42
- "Discount": discount,
43
- "Amount": amount,
44
- "SUB TOTAL": sub_total,
45
- })
46
-
47
- # Convert data to DataFrame and save to Excel
48
  df = pd.DataFrame(data)
49
- output_file = "output.xlsx"
50
- df.to_excel(output_file, index=False)
51
- return output_file
52
 
53
- # Gradio Interface
 
 
 
 
 
 
 
54
  iface = gr.Interface(
55
  fn=extract_data_from_pdf,
56
  inputs=gr.File(label="Upload PDF"),
@@ -58,4 +68,4 @@ iface = gr.Interface(
58
  title="PDF Data Extractor",
59
  description="Extract structured data from a PDF and output it as an Excel file.",
60
  )
61
- iface.launch()
 
7
  data = []
8
  po_number = None
9
 
 
10
  with pdfplumber.open(pdf_file.name) as pdf:
11
  for page in pdf.pages:
12
  text = page.extract_text()
13
 
14
+ # Extract PO number
15
  if po_number is None:
16
  po_match = re.search(r"Purchase Order : (\w+)", text)
17
  po_number = po_match.group(1) if po_match else "N/A"
18
 
19
+ # Regex pattern for row data
20
  row_pattern = re.compile(
21
+ r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
22
  )
23
 
24
+ # Extract matching rows
25
  for match in row_pattern.finditer(text):
26
+ (
27
+ pos,
28
+ item_code,
29
+ unit,
30
+ delivery_date,
31
+ quantity,
32
+ basic_price,
33
+ amount,
34
+ ) = match.groups()
35
 
 
36
  sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
37
  sub_total = sub_total_match.group(1) if sub_total_match else "0.0"
38
 
39
+ data.append(
40
+ {
41
+ "Purchase Order": po_number,
42
+ "Pos.": pos,
43
+ "Item Code": item_code,
44
+ "Unit": unit,
45
+ "Delivery Date": delivery_date,
46
+ "Quantity": quantity,
47
+ "Basic Price": basic_price,
48
+ "Amount": amount,
49
+ "SUB TOTAL": sub_total,
50
+ }
51
+ )
52
+
53
+ # Convert data to DataFrame
54
  df = pd.DataFrame(data)
 
 
 
55
 
56
+ # Print extracted data (debugging)
57
+ print(df)
58
+
59
+ # Save to Excel
60
+ df.to_excel("output.xlsx", index=False)
61
+
62
+ return "output.xlsx"
63
+
64
  iface = gr.Interface(
65
  fn=extract_data_from_pdf,
66
  inputs=gr.File(label="Upload PDF"),
 
68
  title="PDF Data Extractor",
69
  description="Extract structured data from a PDF and output it as an Excel file.",
70
  )
71
+ iface.launch()