neerajkalyank committed
Commit 9d104f1
Parent: 01fd285

Update app.py

Files changed (1): app.py (+18 -28)
app.py CHANGED
@@ -1,38 +1,28 @@
 import gradio as gr
-from transformers import DonutProcessor, VisionEncoderDecoderModel
+import pytesseract
 import pandas as pd
 from io import BytesIO
 import fitz  # PyMuPDF
 import re
 from PIL import Image
-
-# Initialize the Hugging Face Donut model and processor
-processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
-model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
+import tempfile
+import os
 
 def extract_data_from_pdf(pdf_file):
     # Open the PDF file using the path provided by gr.File
     doc = fitz.open(pdf_file.name)
     text_data = []
 
-    # Limit processing to the first 5 pages for faster results
-    max_pages = min(doc.page_count, 5)
-
-    for page_num in range(max_pages):
+    # Process each page in the PDF using Tesseract OCR
+    for page_num in range(doc.page_count):
         page = doc[page_num]
         pix = page.get_pixmap()  # Render page to a Pixmap image
 
-        # Convert Pixmap to PIL Image and resize for faster processing
+        # Convert Pixmap to PIL Image
         image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        image = image.resize((image.width // 2, image.height // 2))  # Resize to 50% for faster processing
-
-        # Preprocess image for the Donut model
-        processed_image = processor(image, return_tensors="pt").pixel_values
-        # Generate text with controlled length using `max_new_tokens`
-        outputs = model.generate(processed_image, max_new_tokens=50)  # Reduced length for faster output
 
-        # Decode generated text
-        text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        # Use Tesseract to extract text from the image
+        text = pytesseract.image_to_string(image)
         text_data.append(text)
 
     # Initialize list for parsed data
@@ -73,21 +63,19 @@ def extract_data_from_pdf(pdf_file):
             "Discount", "Currency", "Amount", "Central GST", "State GST", "Sub Total"
         ]
 
-        # Export DataFrame to Excel
-        output = BytesIO()
-        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
+        # Save the DataFrame to a temporary Excel file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
+        with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
             df.to_excel(writer, index=False, sheet_name="Extracted Data")
-        output.seek(0)
-
-        return output
+
+        return temp_file.name
     else:
         # If no data was found, create a blank Excel file
-        output = BytesIO()
-        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
+        with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
             pd.DataFrame([["No structured data found. Please check the PDF structure."]], columns=["Error"]).to_excel(writer, index=False, sheet_name="Error")
-        output.seek(0)
 
-        return output
+        return temp_file.name
 
 # Define Gradio Interface with updated components
 iface = gr.Interface(
@@ -97,6 +85,8 @@ iface = gr.Interface(
     title="Advanced Document Data Extractor",
     description=(
         "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
+        "The app will parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
+        "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
     ),
 )
 
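
For reference, the new extraction path boils down to rendering each page with PyMuPDF and handing the image to Tesseract. A minimal standalone sketch of that flow follows; it assumes pytesseract, PyMuPDF, and Pillow are installed alongside the Tesseract binary, and the helper name ocr_pdf_pages is illustrative rather than part of app.py.

import fitz  # PyMuPDF
import pytesseract
from PIL import Image

def ocr_pdf_pages(pdf_path):
    """Render every page of a PDF and OCR it, returning one string per page."""
    doc = fitz.open(pdf_path)
    pages = []
    for page_num in range(doc.page_count):
        pix = doc[page_num].get_pixmap()  # rasterize the page
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        pages.append(pytesseract.image_to_string(image))  # OCR the rendered image
    return pages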
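
The download path also changed: instead of returning an in-memory BytesIO buffer, the handler now writes the DataFrame to a temporary .xlsx file and returns its path, which Gradio's File output can serve. A minimal sketch of that pattern, assuming xlsxwriter is installed (the helper name export_to_excel is illustrative):

import tempfile
import pandas as pd

def export_to_excel(df):
    """Write a DataFrame to a temporary .xlsx file and return the file path."""
    # delete=False keeps the file on disk after the handle is closed,
    # so Gradio can still read it when serving the download.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    with pd.ExcelWriter(temp_file.name, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Extracted Data")
    return temp_file.name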