Spaces:

neerajkalyank
/

toshiba_2.O

Sleeping

App Files Community

neerajkalyank committed on Nov 12, 2024

Commit

3469319

verified ·

1 Parent(s): 359e981

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -62

app.py CHANGED Viewed

@@ -1,71 +1,20 @@
-import pdfplumber
 import pandas as pd
-import re
 import gradio as gr
-def extract_data_from_pdf(pdf_file):
-    data = []
-    po_number = None
-    with pdfplumber.open(pdf_file.name) as pdf:
-        for page in pdf.pages:
-            text = page.extract_text()
-            # Extract PO number
-            if po_number is None:
-                po_match = re.search(r"Purchase Order : (\w+)", text)
-                po_number = po_match.group(1) if po_match else "N/A"
-            # Regex pattern for row data
-            row_pattern = re.compile(
-                r"(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
-            )
-            # Extract matching rows
-            for match in row_pattern.finditer(text):
-                (
-                    pos,
-                    item_code,
-                    unit,
-                    delivery_date,
-                    quantity,
-                    basic_price,
-                    amount,
-                ) = match.groups()
-                sub_total_match = re.search(r"SUB TOTAL : ([\d.]+)", text)
-                sub_total = sub_total_match.group(1) if sub_total_match else "0.0"
-                data.append(
-                    {
-                        "Purchase Order": po_number,
-                        "Pos.": pos,
-                        "Item Code": item_code,
-                        "Unit": unit,
-                        "Delivery Date": delivery_date,
-                        "Quantity": quantity,
-                        "Basic Price": basic_price,
-                        "Amount": amount,
-                        "SUB TOTAL": sub_total,
-                    }
-                )
-    # Convert data to DataFrame
-    df = pd.DataFrame(data)
-    # Print extracted data (debugging)
-    print(df)
-    # Save to Excel
     df.to_excel("output.xlsx", index=False)
     return "output.xlsx"
-iface = gr.Interface(
-    fn=extract_data_from_pdf,
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.File(label="Download Excel"),
-    title="PDF Data Extractor",
-    description="Extract structured data from a PDF and output it as an Excel file.",
 )
-iface.launch()

+import camelot
 import pandas as pd
 import gradio as gr
+def extract_tables(pdf_file):
+    tables = camelot.read_pdf(pdf_file.name, pages="all")
+    df = pd.concat([table.df for table in tables], ignore_index=True)
     df.to_excel("output.xlsx", index=False)
     return "output.xlsx"
+interface = gr.Interface(
+    fn=extract_tables,
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.File(label="Download Excel"),
+    title="PDF Table Extractor",
+    description="Extract tables from PDF and output as Excel file.",
 )
+if __name__ == "__main__":
+    interface.launch()