neerajkalyank committed on
Commit
d97cfeb
1 Parent(s): 938ff71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -63
app.py CHANGED
@@ -1,80 +1,62 @@
1
- import gradio as gr
2
  import pdfplumber
3
  import pandas as pd
4
- import re
5
  from io import BytesIO
6
- import tempfile
 
7
 
8
  def extract_data_from_pdf(pdf_file):
9
- # Initialize list to hold text from each page
10
- text_data = []
11
 
12
- # Open the PDF file with pdfplumber
13
  with pdfplumber.open(pdf_file) as pdf:
14
  for page in pdf.pages:
15
- # Extract text from each page
16
  text = page.extract_text()
17
- if text:
18
- print(f"Extracted text from page {page.page_number}:\n{text}\n") # Debugging: Print extracted text
19
- text_data.append(text)
20
-
21
- # Initialize list for parsed data
22
- data = []
23
-
24
- # Define regular expressions for parsing rows
25
- row_pattern = re.compile(
26
- r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
27
- )
28
-
29
- # Process and structure extracted text
30
- for text in text_data:
31
- for line in text.split('\n'):
32
- # Apply row pattern to each line
33
- match = row_pattern.search(line)
34
- if match:
35
- row = match.groupdict()
36
- row["description"] = row["description"].strip() # Clean description
37
- row["quantity"] = float(row["quantity"])
38
- row["price"] = float(row["price"])
39
- row["discount"] = float(row["discount"])
40
- row["amount"] = float(row["amount"])
41
-
42
- # Append extracted row to data
43
- data.append(row)
44
-
45
- # Create DataFrame if data was extracted
46
- if data:
47
- df = pd.DataFrame(data)
48
- df.columns = [
49
- "Pos", "Item Code", "Description", "Unit", "Delivery Date", "Quantity", "Basic Price",
50
- "Discount", "Currency", "Amount"
51
- ]
52
-
53
- # Save the DataFrame to a temporary Excel file
54
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
55
- with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
56
- df.to_excel(writer, index=False, sheet_name="Extracted Data")
57
-
58
- return temp_file.name
59
- else:
60
- # If no data was found, create a blank Excel file
61
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
62
- with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
63
- pd.DataFrame([["No structured data found. Please check the PDF structure."]], columns=["Error"]).to_excel(writer, index=False, sheet_name="Error")
64
-
65
- return temp_file.name
66
 
67
- # Define Gradio Interface with updated components
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  iface = gr.Interface(
69
  fn=extract_data_from_pdf,
70
  inputs=gr.File(label="Upload PDF"),
71
  outputs=gr.File(label="Download Excel"),
72
- title="Advanced Document Data Extractor",
73
- description=(
74
- "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
75
- "The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
76
- "No additional calculations are performed; it simply extracts the data as it appears."
77
- ),
78
  )
79
 
80
  iface.launch()
 
 
1
  import pdfplumber
2
  import pandas as pd
 
3
  from io import BytesIO
4
+ import re
5
+ import gradio as gr
6
 
7
  def extract_data_from_pdf(pdf_file):
8
+ data = []
9
+ po_number = None
10
 
 
11
  with pdfplumber.open(pdf_file) as pdf:
12
  for page in pdf.pages:
 
13
  text = page.extract_text()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ # Extract PO number once (if not already extracted)
16
+ if po_number is None:
17
+ po_match = re.search(r"Purchase Order : (\w+)", text)
18
+ if po_match:
19
+ po_number = po_match.group(1)
20
+
21
+ # Regex pattern to match the row data
22
+ row_pattern = re.compile(
23
+ r"(\d+)\s+(\d{10,})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
24
+ )
25
+
26
+ # Find all rows matching the pattern
27
+ for match in row_pattern.finditer(text):
28
+ pos, item_code, unit, delivery_date, quantity, basic_price, amount = match.groups()
29
+ sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
30
+ sub_total = sub_total_match.group(1) if sub_total_match else ""
31
+
32
+ data.append({
33
+ "Purchase Order": po_number,
34
+ "Pos.": pos,
35
+ "Item Code": item_code,
36
+ "Unit": unit,
37
+ "Delivery Date": delivery_date,
38
+ "Quantity": quantity,
39
+ "Basic Price": basic_price,
40
+ "Amount": amount,
41
+ "SUB TOTAL": sub_total
42
+ })
43
+
44
+ # Convert the data to a DataFrame
45
+ df = pd.DataFrame(data)
46
+ output = BytesIO()
47
+ with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
48
+ df.to_excel(writer, index=False, sheet_name="Extracted Data")
49
+ output.seek(0)
50
+
51
+ return output
52
+
53
+ # Gradio Interface
54
  iface = gr.Interface(
55
  fn=extract_data_from_pdf,
56
  inputs=gr.File(label="Upload PDF"),
57
  outputs=gr.File(label="Download Excel"),
58
+ title="PDF Data Extractor",
59
+ description="Extract structured data from a PDF and output it as an Excel file."
 
 
 
 
60
  )
61
 
62
  iface.launch()