neerajkalyank committed on
Commit
bfda109
1 Parent(s): 0135d09

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -8
app.py CHANGED
@@ -8,27 +8,26 @@ def extract_data_from_pdf(pdf_file):
8
  data = []
9
  po_number = None
10
 
11
- # Use pdfplumber to open the provided file path directly
12
- with pdfplumber.open(pdf_file.name) as pdf:
13
  for page in pdf.pages:
14
  text = page.extract_text()
15
 
16
  # Extract PO number if available
17
  if po_number is None:
18
  po_match = re.search(r"Purchase Order : (\w+)", text)
19
- if po_match:
20
- po_number = po_match.group(1)
21
 
22
  # Regex pattern to match the row data
23
  row_pattern = re.compile(
24
- r"(\d+)\s+(\d{10,})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
25
  )
26
 
27
  # Extract matching rows
28
  for match in row_pattern.finditer(text):
29
  pos, item_code, unit, delivery_date, quantity, basic_price, amount = match.groups()
30
  sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
31
- sub_total = sub_total_match.group(1) if sub_total_match else ""
32
 
33
  data.append({
34
  "Purchase Order": po_number,
@@ -54,8 +53,8 @@ def extract_data_from_pdf(pdf_file):
54
  # Gradio Interface
55
  iface = gr.Interface(
56
  fn=extract_data_from_pdf,
57
- inputs=gr.File(label="Upload PDF"),
58
- outputs=gr.File(label="Download Excel"),
59
  title="PDF Data Extractor",
60
  description="Extract structured data from a PDF and output it as an Excel file."
61
  )
 
8
  data = []
9
  po_number = None
10
 
11
+ # Use pdfplumber with BytesIO for Gradio compatibility
12
+ with pdfplumber.open(BytesIO(pdf_file.read())) as pdf:
13
  for page in pdf.pages:
14
  text = page.extract_text()
15
 
16
  # Extract PO number if available
17
  if po_number is None:
18
  po_match = re.search(r"Purchase Order : (\w+)", text)
19
+ po_number = po_match.group(1) if po_match else "N/A"
 
20
 
21
  # Regex pattern to match the row data
22
  row_pattern = re.compile(
23
+ r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
24
  )
25
 
26
  # Extract matching rows
27
  for match in row_pattern.finditer(text):
28
  pos, item_code, unit, delivery_date, quantity, basic_price, amount = match.groups()
29
  sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
30
+ sub_total = sub_total_match.group(1) if sub_total_match else "0.0"
31
 
32
  data.append({
33
  "Purchase Order": po_number,
 
53
  # Gradio Interface
54
  iface = gr.Interface(
55
  fn=extract_data_from_pdf,
56
+ inputs=gr.inputs.File(label="Upload PDF"),
57
+ outputs=gr.outputs.File(label="Download Excel"),
58
  title="PDF Data Extractor",
59
  description="Extract structured data from a PDF and output it as an Excel file."
60
  )