Spaces:

neerajkalyank
/

toshiba_2.O

Sleeping

neerajkalyank committed on Nov 12

Commit

bfda109

•

1 Parent(s): 0135d09

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,27 +8,26 @@ def extract_data_from_pdf(pdf_file):
     data = []
     po_number = None
-    # Use pdfplumber to open the provided file path directly
-    with pdfplumber.open(pdf_file.name) as pdf:
         for page in pdf.pages:
             text = page.extract_text()
             # Extract PO number if available
             if po_number is None:
                 po_match = re.search(r"Purchase Order : (\w+)", text)
-                if po_match:
-                    po_number = po_match.group(1)
             # Regex pattern to match the row data
             row_pattern = re.compile(
-                r"(\d+)\s+(\d{10,})\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
             )
             # Extract matching rows
             for match in row_pattern.finditer(text):
                 pos, item_code, unit, delivery_date, quantity, basic_price, amount = match.groups()
                 sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
-                sub_total = sub_total_match.group(1) if sub_total_match else ""
                 data.append({
                     "Purchase Order": po_number,
@@ -54,8 +53,8 @@ def extract_data_from_pdf(pdf_file):
 # Gradio Interface
 iface = gr.Interface(
     fn=extract_data_from_pdf,
-    inputs=gr.File(label="Upload PDF"),
-    outputs=gr.File(label="Download Excel"),
     title="PDF Data Extractor",
     description="Extract structured data from a PDF and output it as an Excel file."
 )

     data = []
     po_number = None
+    # Use pdfplumber with BytesIO for Gradio compatibility
+    with pdfplumber.open(BytesIO(pdf_file.read())) as pdf:
         for page in pdf.pages:
             text = page.extract_text()
             # Extract PO number if available
             if po_number is None:
                 po_match = re.search(r"Purchase Order : (\w+)", text)
+                po_number = po_match.group(1) if po_match else "N/A"
             # Regex pattern to match the row data
             row_pattern = re.compile(
+                r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
             )
             # Extract matching rows
             for match in row_pattern.finditer(text):
                 pos, item_code, unit, delivery_date, quantity, basic_price, amount = match.groups()
                 sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
+                sub_total = sub_total_match.group(1) if sub_total_match else "0.0"
                 data.append({
                     "Purchase Order": po_number,
 # Gradio Interface
 iface = gr.Interface(
     fn=extract_data_from_pdf,
+    inputs=gr.inputs.File(label="Upload PDF"),
+    outputs=gr.outputs.File(label="Download Excel"),
     title="PDF Data Extractor",
     description="Extract structured data from a PDF and output it as an Excel file."
 )