neerajkalyank committed on
Commit
01fd285
1 Parent(s): 5b67e8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -15,17 +15,21 @@ def extract_data_from_pdf(pdf_file):
15
  doc = fitz.open(pdf_file.name)
16
  text_data = []
17
 
18
- for page_num in range(doc.page_count):
 
 
 
19
  page = doc[page_num]
20
  pix = page.get_pixmap() # Render page to a Pixmap image
21
 
22
- # Convert Pixmap to PIL Image
23
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
 
24
 
25
  # Preprocess image for the Donut model
26
  processed_image = processor(image, return_tensors="pt").pixel_values
27
  # Generate text with controlled length using `max_new_tokens`
28
- outputs = model.generate(processed_image, max_new_tokens=100)
29
 
30
  # Decode generated text
31
  text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
@@ -93,8 +97,6 @@ iface = gr.Interface(
93
  title="Advanced Document Data Extractor",
94
  description=(
95
  "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
96
- "The model will identify and parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
97
- "Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
98
  ),
99
  )
100
 
 
15
  doc = fitz.open(pdf_file.name)
16
  text_data = []
17
 
18
+ # Limit processing to the first 5 pages for faster results
19
+ max_pages = min(doc.page_count, 5)
20
+
21
+ for page_num in range(max_pages):
22
  page = doc[page_num]
23
  pix = page.get_pixmap() # Render page to a Pixmap image
24
 
25
+ # Convert Pixmap to PIL Image and resize for faster processing
26
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
27
+ image = image.resize((image.width // 2, image.height // 2)) # Resize to 50% for faster processing
28
 
29
  # Preprocess image for the Donut model
30
  processed_image = processor(image, return_tensors="pt").pixel_values
31
  # Generate text with controlled length using `max_new_tokens`
32
+ outputs = model.generate(processed_image, max_new_tokens=50) # Reduced length for faster output
33
 
34
  # Decode generated text
35
  text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
 
97
  title="Advanced Document Data Extractor",
98
  description=(
99
  "Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
 
 
100
  ),
101
  )
102