neerajkalyank committed
Commit 9d104f1
Parent: 01fd285

Update app.py

Files changed (1): app.py (+18 -28)
app.py CHANGED
@@ -1,38 +1,28 @@
 import gradio as gr
-from transformers import DonutProcessor, VisionEncoderDecoderModel
+import pytesseract
 import pandas as pd
 from io import BytesIO
 import fitz  # PyMuPDF
 import re
 from PIL import Image
-
-# Initialize the Hugging Face Donut model and processor
-processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
-model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
+import tempfile
+import os
 
 def extract_data_from_pdf(pdf_file):
     # Open the PDF file using the path provided by gr.File
     doc = fitz.open(pdf_file.name)
     text_data = []
 
-    # Limit processing to the first 5 pages for faster results
-    max_pages = min(doc.page_count, 5)
-
-    for page_num in range(max_pages):
+    # Process each page in the PDF using Tesseract OCR
+    for page_num in range(doc.page_count):
         page = doc[page_num]
         pix = page.get_pixmap()  # Render page to a Pixmap image
 
-        # Convert Pixmap to PIL Image and resize for faster processing
+        # Convert Pixmap to PIL Image
         image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        image = image.resize((image.width // 2, image.height // 2))  # Resize to 50% for faster processing
-
-        # Preprocess image for the Donut model
-        processed_image = processor(image, return_tensors="pt").pixel_values
-        # Generate text with controlled length using `max_new_tokens`
-        outputs = model.generate(processed_image, max_new_tokens=50)  # Reduced length for faster output
 
-        # Decode generated text
-        text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        # Use Tesseract to extract text from the image
+        text = pytesseract.image_to_string(image)
         text_data.append(text)
 
     # Initialize list for parsed data
@@ -73,21 +63,19 @@ def extract_data_from_pdf(pdf_file):
             "Discount", "Currency", "Amount", "Central GST", "State GST", "Sub Total"
         ]
 
-        # Export DataFrame to Excel
-        output = BytesIO()
-        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
+        # Save the DataFrame to a temporary Excel file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
+        with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
             df.to_excel(writer, index=False, sheet_name="Extracted Data")
-        output.seek(0)
-
-        return output
+
+        return temp_file.name
     else:
         # If no data was found, create a blank Excel file
-        output = BytesIO()
-        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
+        with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
             pd.DataFrame([["No structured data found. Please check the PDF structure."]], columns=["Error"]).to_excel(writer, index=False, sheet_name="Error")
-        output.seek(0)
 
-        return output
+        return temp_file.name
 
 # Define Gradio Interface with updated components
 iface = gr.Interface(
@@ -97,6 +85,8 @@ iface = gr.Interface(
     title="Advanced Document Data Extractor",
     description=(
         "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
+        "The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
+        "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
     ),
 )
 
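
For reference, the new extraction path boils down to rendering each page with PyMuPDF and handing the image to Tesseract. A minimal standalone sketch of that flow follows; it assumes pytesseract, PyMuPDF, and Pillow are installed alongside the Tesseract binary, and the helper name ocr_pdf_pages is illustrative rather than part of app.py.

import fitz  # PyMuPDF
import pytesseract
from PIL import Image

def ocr_pdf_pages(pdf_path):
    """Render every page of a PDF and OCR it, returning one string per page."""
    doc = fitz.open(pdf_path)
    pages = []
    for page_num in range(doc.page_count):
        pix = doc[page_num].get_pixmap()  # rasterize the page
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        pages.append(pytesseract.image_to_string(image))  # OCR the rendered image
    return pages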
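
The download path also changed: instead of returning an in-memory BytesIO buffer, the handler now writes the DataFrame to a temporary .xlsx file and returns its path, which Gradio's File output can serve. A minimal sketch of that pattern, assuming xlsxwriter is installed (the helper name export_to_excel is illustrative):

import tempfile
import pandas as pd

def export_to_excel(df):
    """Write a DataFrame to a temporary .xlsx file and return the file path."""
    # delete=False keeps the file on disk after the handle is closed,
    # so Gradio can still read it when serving the download.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Extracted Data")
    return temp_file.name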