Spaces:

neerajkalyank
/

toshiba_2.O

Sleeping

App Files Files Community

neerajkalyank committed on Nov 12

Commit

8b139bf

•

1 Parent(s): 65c3f55

Create app.py

Browse files

Files changed (1) hide show

app.py +100 -0

app.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import gradio as gr
+from transformers import DonutProcessor, VisionEncoderDecoderModel
+import pandas as pd
+from io import BytesIO
+import fitz  # PyMuPDF
+import re
+from PIL import Image
# Load the Donut processor and the vision encoder-decoder model once at import
# time. Both are fetched from the same Hugging Face Hub identifier so the
# preprocessing always matches the weights.
_DONUT_CHECKPOINT = "naver-clova-ix/donut-base-finetuned-cord-v2"

processor = DonutProcessor.from_pretrained(_DONUT_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_DONUT_CHECKPOINT)
# Tax rates applied to each row's amount when deriving the GST columns.
CENTRAL_GST_RATE = 0.09
STATE_GST_RATE = 0.09

# One purchase-order row: whitespace-separated position, item code, free-text
# description, unit, YYYY-MM-DD date, then decimal quantity / price / discount,
# a currency token and a decimal amount.
_ROW_PATTERN = re.compile(
    r'(?P<pos>\d+)\s+(?P<item_code>\d+)\s+(?P<description>.+?)\s+(?P<unit>\S+)\s+(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<quantity>\d+\.\d+)\s+(?P<price>\d+\.\d+)\s+(?P<discount>\d+\.\d+)\s+(?P<currency>\S+)\s+(?P<amount>\d+\.\d+)'
)

# Regex group name -> spreadsheet column header. The computed columns
# ("Central GST", "State GST", "Sub Total") already carry their final names.
_COLUMN_LABELS = {
    "pos": "Pos",
    "item_code": "Item Code",
    "description": "Description",
    "unit": "Unit",
    "date": "Delivery Date",
    "quantity": "Quantity",
    "price": "Basic Price",
    "discount": "Discount",
    "currency": "Currency",
    "amount": "Amount",
}


def _read_pdf_bytes(pdf_file):
    """Return the raw PDF bytes for any supported form of ``pdf_file``."""
    import os

    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)

    if isinstance(pdf_file, (str, os.PathLike)):
        path = pdf_file
    else:
        # Upload wrappers expose the on-disk location as ``.name``; prefer it
        # over ``.read()`` because the wrapper's own stream may be empty.
        name = getattr(pdf_file, "name", None)
        path = name if isinstance(name, str) and os.path.isfile(name) else None

    if path is not None:
        with open(path, "rb") as handle:
            return handle.read()
    # Plain in-memory file-like object (e.g. BytesIO).
    return pdf_file.read()


def _render_page_texts(pdf_bytes):
    """Rasterise every PDF page and return the model's decoded text per page."""
    page_texts = []
    # The context manager closes the document even if inference fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            # alpha=False keeps the samples 3 bytes per pixel, matching "RGB".
            pix = page.get_pixmap(alpha=False)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            pixel_values = processor(image, return_tensors="pt").pixel_values
            # NOTE(review): generate() is called without a task prompt; confirm
            # this checkpoint produces the expected text without one.
            generated = model.generate(pixel_values)
            page_texts.append(
                processor.batch_decode(generated, skip_special_tokens=True)[0]
            )
    return page_texts


def _parse_rows(page_texts):
    """Turn decoded page text into row dicts with numeric and GST fields."""
    rows = []
    for text in page_texts:
        for line in text.split('\n'):
            match = _ROW_PATTERN.search(line)
            if not match:
                continue
            row = match.groupdict()
            row["description"] = row["description"].strip()
            for field in ("quantity", "price", "discount", "amount"):
                row[field] = float(row[field])
            central_gst = row["amount"] * CENTRAL_GST_RATE
            state_gst = row["amount"] * STATE_GST_RATE
            row["Central GST"] = round(central_gst, 2)
            row["State GST"] = round(state_gst, 2)
            # The discount is subtracted as an absolute value here.
            # NOTE(review): confirm it is not meant to be a percentage.
            row["Sub Total"] = round(
                row["amount"] + central_gst + state_gst - row["discount"], 2
            )
            rows.append(row)
    return rows


def _write_workbook(frame, sheet_name):
    """Write ``frame`` to a new temporary .xlsx file and return its path."""
    import os
    import tempfile

    fd, path = tempfile.mkstemp(prefix="extracted_", suffix=".xlsx")
    os.close(fd)  # ExcelWriter reopens the file by path.
    with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    return path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order rows from a PDF and write them to an Excel file.

    Each page is rendered to an image, passed through the module-level Donut
    ``processor``/``model``, and the decoded text is scanned line by line for
    rows matching ``_ROW_PATTERN``. Matched rows gain "Central GST",
    "State GST" and "Sub Total" columns.

    Args:
        pdf_file: The uploaded PDF. May be a filesystem path, an object whose
            ``.name`` is the path of an existing file, raw ``bytes``, or a
            file-like object with ``.read()``.

    Returns:
        Path (``str``) of a temporary ``.xlsx`` file. It holds an
        "Extracted Data" sheet when rows were found, otherwise an "Error"
        sheet with a single explanatory message. A path is returned rather
        than an in-memory buffer because the ``gr.File`` output component
        this function feeds expects a file path.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    page_texts = _render_page_texts(_read_pdf_bytes(pdf_file))
    rows = _parse_rows(page_texts)

    if rows:
        # Rename by key, not position, so headers cannot drift from the data.
        frame = pd.DataFrame(rows).rename(columns=_COLUMN_LABELS)
        return _write_workbook(frame, "Extracted Data")

    notice = pd.DataFrame(
        [["No structured data found. Please check the PDF structure."]],
        columns=["Error"],
    )
    return _write_workbook(notice, "Error")
# Assemble the Gradio UI: one PDF upload in, one downloadable Excel file out.
_APP_DESCRIPTION = "".join(
    [
        "Upload a PDF file to extract structured purchase order data and download it as an Excel file. ",
        "The model will identify and parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. ",
        "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included.",
    ]
)

pdf_upload = gr.File(label="Upload PDF")
excel_download = gr.File(label="Download Excel")

iface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs=pdf_upload,
    outputs=excel_download,
    title="Advanced Document Data Extractor",
    description=_APP_DESCRIPTION,
)

# Start the web server as soon as the module is executed.
iface.launch()