neerajkalyank committed on
Commit
3469319
·
verified ·
1 Parent(s): 359e981

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -62
app.py CHANGED
@@ -1,71 +1,20 @@
1
- import pdfplumber
2
  import pandas as pd
3
- import re
4
  import gradio as gr
5
 
6
def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Each page's text is scanned for rows of the form
    ``pos item_code unit YYYY-MM-DD quantity basic_price amount``. Every row is
    tagged with the document's purchase-order number and the "SUB TOTAL" value
    found on the same page.

    Args:
        pdf_file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string is accepted.

    Returns:
        The path of the written workbook, ``"output.xlsx"``.
    """
    # Compile once instead of once per page / once per row.
    po_pattern = re.compile(r"Purchase Order : (\w+)")
    row_pattern = re.compile(
        r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
    )
    sub_total_pattern = re.compile(r"SUB TOTAL : ([\d.]+)")

    columns = [
        "Purchase Order",
        "Pos.",
        "Item Code",
        "Unit",
        "Delivery Date",
        "Quantity",
        "Basic Price",
        "Amount",
        "SUB TOTAL",
    ]

    pdf_path = getattr(pdf_file, "name", pdf_file)
    data = []
    po_number = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Pages without a text layer may yield None/empty text; treat
            # them as empty rather than crashing in the regex searches.
            text = page.extract_text() or ""

            # Keep looking for the PO number until a page actually has it,
            # instead of latching onto "N/A" after the first page.
            if po_number is None:
                po_match = po_pattern.search(text)
                if po_match:
                    po_number = po_match.group(1)

            # The sub-total is a per-page value, so look it up once per page.
            sub_total_match = sub_total_pattern.search(text)
            sub_total = sub_total_match.group(1) if sub_total_match else "0.0"

            for match in row_pattern.finditer(text):
                row = (po_number or "N/A", *match.groups(), sub_total)
                data.append(dict(zip(columns, row)))

    # Passing the columns explicitly keeps the header row even when no line
    # items were matched.
    df = pd.DataFrame(data, columns=columns)
    df.to_excel("output.xlsx", index=False)
    return "output.xlsx"
63
 
64
# Gradio front end: one PDF upload in, one Excel download out.
iface = gr.Interface(
    title="PDF Data Extractor",
    description="Extract structured data from a PDF and output it as an Excel file.",
    fn=extract_data_from_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
)

# Start the web app as soon as the module is executed.
iface.launch()
 
 
 
1
+ import camelot
2
  import pandas as pd
 
3
  import gradio as gr
4
 
5
def extract_tables(pdf_file):
    """Extract every table Camelot detects in a PDF into one Excel sheet.

    The tables from all pages are stacked vertically, in the order Camelot
    returns them, and written without an index column.

    Args:
        pdf_file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string is accepted.
            NOTE(review): which of the two Gradio passes depends on the
            installed version / ``gr.File`` type setting — confirm.

    Returns:
        The path of the written workbook, ``"output.xlsx"``.
    """
    pdf_path = getattr(pdf_file, "name", pdf_file)
    tables = camelot.read_pdf(pdf_path, pages="all")
    frames = [table.df for table in tables]

    # pd.concat raises ValueError on an empty list, so a PDF with no
    # detectable tables produces an empty workbook instead of a crash.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    df.to_excel("output.xlsx", index=False)
    return "output.xlsx"
10
 
11
# Gradio front end: one PDF upload in, one Excel download out.
interface = gr.Interface(
    title="PDF Table Extractor",
    description="Extract tables from PDF and output as Excel file.",
    fn=extract_tables,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Excel"),
)

# Only start the web app when run as a script, not when imported.
if __name__ == "__main__":
    interface.launch()