DSatishchandra committed on
Commit
2d4ebda
1 Parent(s): cf4d471

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -15
app.py CHANGED
@@ -5,24 +5,26 @@ import gradio as gr
5
  # Define function to extract data
6
  def extract_data(pdf_file):
7
  data = []
8
- columns = ["SI No", "Material Description", "Material Number", "HSN Code", "IGST", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value", "Purchase Order No", "Date"]
9
 
10
- # Example Purchase Order Details (Adjust accordingly if needed)
11
  purchase_order_no = "PO12345"
12
  purchase_order_date = "04.11.2024"
13
 
14
  with pdfplumber.open(pdf_file) as pdf:
15
  for page in pdf.pages:
16
  text = page.extract_text().splitlines()
17
- for line in text:
18
  parts = line.split()
19
  try:
20
- si_no = int(parts[0])
21
- if si_no % 10 == 0: # Assuming SI numbers are in multiples of 10 as per the sample
22
- material_desc = " ".join(parts[1:2])
23
- material_number = parts[3] if "Material" in parts else "220736540000" # Use a default number if missing
24
- hsn_code = "8310" # Fixed as per sample; adjust if required
25
- igst = "18%" # Fixed IGST as per sample; adjust if required
 
 
26
  unit = parts[4]
27
  quantity = int(parts[5])
28
  dely_qty = int(parts[6])
@@ -30,11 +32,13 @@ def extract_data(pdf_file):
30
  unit_rate = float(parts[8])
31
  value = float(parts[9])
32
 
33
- # Append extracted data to maintain the order as per the sample screenshot
34
  data.append([
 
 
35
  si_no,
36
- material_desc,
37
  material_number,
 
38
  hsn_code,
39
  igst,
40
  unit,
@@ -42,13 +46,12 @@ def extract_data(pdf_file):
42
  dely_qty,
43
  dely_date,
44
  unit_rate,
45
- value,
46
- purchase_order_no,
47
- purchase_order_date
48
  ])
49
  except (ValueError, IndexError):
50
- continue
51
 
 
52
  df = pd.DataFrame(data, columns=columns)
53
  excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
54
  df.to_excel(excel_path, index=False)
 
5
  # Define function to extract data
6
  def extract_data(pdf_file):
7
  data = []
8
+ columns = ["Purchase Order No", "Date", "SI No", "Material Number", "Material Description", "HSN Code", "IGST", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
9
 
10
+ # Example Purchase Order Details (Adjust accordingly or add dynamic extraction if possible)
11
  purchase_order_no = "PO12345"
12
  purchase_order_date = "04.11.2024"
13
 
14
  with pdfplumber.open(pdf_file) as pdf:
15
  for page in pdf.pages:
16
  text = page.extract_text().splitlines()
17
+ for i, line in enumerate(text):
18
  parts = line.split()
19
  try:
20
+ si_no = int(parts[0]) # Extract SI No
21
+ # Check if the line follows the expected format for a row
22
+ if si_no % 10 == 0: # Assuming SI numbers are in multiples of 10 as per sample
23
+ # Extract each field based on position and format
24
+ material_desc = " ".join(parts[1:3]) # Adjust indexing if necessary
25
+ material_number = parts[3] if "Material" in parts else "220736540000" # Default if not found
26
+ hsn_code = "8310" # Fixed as per example; can be extracted if available
27
+ igst = "18%" # Fixed as per example; can be extracted if available
28
  unit = parts[4]
29
  quantity = int(parts[5])
30
  dely_qty = int(parts[6])
 
32
  unit_rate = float(parts[8])
33
  value = float(parts[9])
34
 
35
+ # Append extracted data in specified order
36
  data.append([
37
+ purchase_order_no,
38
+ purchase_order_date,
39
  si_no,
 
40
  material_number,
41
+ material_desc,
42
  hsn_code,
43
  igst,
44
  unit,
 
46
  dely_qty,
47
  dely_date,
48
  unit_rate,
49
+ value
 
 
50
  ])
51
  except (ValueError, IndexError):
52
+ continue # Skip lines that don't match the format
53
 
54
+ # Convert to DataFrame with specified columns
55
  df = pd.DataFrame(data, columns=columns)
56
  excel_path = "/tmp/Extracted_Purchase_Order_Data.xlsx"
57
  df.to_excel(excel_path, index=False)