Spaces:

DSatishchandra
/

POExtraction_UC3

Runtime error

App Files Community

DSatishchandra committed on Nov 12

Commit

80b61aa

•

1 Parent(s): 2659c0a

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -46

app.py CHANGED Viewed

@@ -1,54 +1,24 @@
 import pandas as pd
-import gradio as gr
-import re
-# Define function to extract data from Excel file
-def extract_data(excel_file):
-    # Load the Excel file
-    df = pd.read_excel(excel_file)
-    # Attempt to extract 'Purchase Order No' and 'Date' from the first few rows
-    for _, row in df.iterrows():
-        # Search for Purchase Order No pattern in the row data
-        po_match = re.search(r'Purchase Order No[:\s]+(\w+)', str(row), re.IGNORECASE)
-        if po_match:
-            purchase_order_no = po_match.group(1)
-        # Search for Date pattern in the row data (e.g., "Date: 10.10.2023" or "10/10/2023")
-        date_match = re.search(r'Date[:\s]+(\d{2}[\./-]\d{2}[\./-]\d{4})', str(row), re.IGNORECASE)
-        if date_match:
-            purchase_order_date = date_match.group(1)
-        # Stop if both values are found
-        if purchase_order_no != "Not Found" and purchase_order_date != "Not Found":
-            break
-    # Required columns to keep
-    columns_to_keep = ["Purchase Order No", "Date", "SI No", "Material Description",
-                       "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]
-    # Add Purchase Order No and Date columns to the DataFrame if they are missing
-    if "Purchase Order No" not in df.columns:
-        df["Purchase Order No"] = purchase_order_no
-    if "Date" not in df.columns:
-        df["Date"] = purchase_order_date
-    # Filter the DataFrame to only include relevant columns
-    df_filtered = df[columns_to_keep]
-    # Save the filtered data to a new Excel file
-    output_path = "/tmp/Filtered_Purchase_Order_Data.xlsx"
-    df_filtered.to_excel(output_path, index=False)
-    return output_path
-# Set up Gradio interface
-iface = gr.Interface(
-    fn=extract_data,
-    inputs=gr.File(label="Upload Excel File"),
-    outputs=gr.File(label="Download Filtered Excel"),
-    title="Excel Data Extractor"
-)
-# Launch the app
-iface.launch()

 import pandas as pd
+import tabula
+def extract_data(pdf_file):
+    # Extract data from the PDF file using tabula
+    tables = tabula.read_pdf(pdf_file, pages='all')
+    # Combine the extracted tables into a single DataFrame
+    data = pd.concat(tables, ignore_index=True)
+    # Rename columns to match the expected output format
+    data.columns = ['Purchase Order No', 'Date', 'Material Description', 'Unit', 'Quantity', 'Dely Qty', 'Dely Date', 'Unit Rate', 'Value']
+    # Remove any unnecessary rows and columns
+    data = data.dropna(how='all')
+    return data
+if __name__ == "__main__":
+    pdf_file = 'your_pdf_file.pdf'
+    data = extract_data(pdf_file)
+    # Save the extracted data to an Excel file
+    data.to_excel('output.xlsx', index=False)