Spaces:
Sleeping
Sleeping
neerajkalyank
committed on
Commit
•
01fd285
1
Parent(s):
5b67e8d
Update app.py
Browse files
app.py
CHANGED
@@ -15,17 +15,21 @@ def extract_data_from_pdf(pdf_file):
|
|
15 |
doc = fitz.open(pdf_file.name)
|
16 |
text_data = []
|
17 |
|
18 |
-
for
|
|
|
|
|
|
|
19 |
page = doc[page_num]
|
20 |
pix = page.get_pixmap() # Render page to a Pixmap image
|
21 |
|
22 |
-
# Convert Pixmap to PIL Image
|
23 |
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
|
|
24 |
|
25 |
# Preprocess image for the Donut model
|
26 |
processed_image = processor(image, return_tensors="pt").pixel_values
|
27 |
# Generate text with controlled length using `max_new_tokens`
|
28 |
-
outputs = model.generate(processed_image, max_new_tokens=
|
29 |
|
30 |
# Decode generated text
|
31 |
text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
@@ -93,8 +97,6 @@ iface = gr.Interface(
|
|
93 |
title="Advanced Document Data Extractor",
|
94 |
description=(
|
95 |
"Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
|
96 |
-
"The model will identify and parse rows with fields like Position, Item Code, Description, Quantity, Price, etc. "
|
97 |
-
"Calculated fields (like Central GST, State GST, and Sub Total) are automatically included."
|
98 |
),
|
99 |
)
|
100 |
|
|
|
15 |
doc = fitz.open(pdf_file.name)
|
16 |
text_data = []
|
17 |
|
18 |
+
# Limit processing to the first 5 pages for faster results
|
19 |
+
max_pages = min(doc.page_count, 5)
|
20 |
+
|
21 |
+
for page_num in range(max_pages):
|
22 |
page = doc[page_num]
|
23 |
pix = page.get_pixmap() # Render page to a Pixmap image
|
24 |
|
25 |
+
# Convert Pixmap to PIL Image and resize for faster processing
|
26 |
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
27 |
+
image = image.resize((image.width // 2, image.height // 2)) # Resize to 50% for faster processing
|
28 |
|
29 |
# Preprocess image for the Donut model
|
30 |
processed_image = processor(image, return_tensors="pt").pixel_values
|
31 |
# Generate text with controlled length using `max_new_tokens`
|
32 |
+
outputs = model.generate(processed_image, max_new_tokens=50) # Reduced length for faster output
|
33 |
|
34 |
# Decode generated text
|
35 |
text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
|
|
97 |
title="Advanced Document Data Extractor",
|
98 |
description=(
|
99 |
"Upload a PDF file to extract structured purchase order data and download it as an Excel file. "
|
|
|
|
|
100 |
),
|
101 |
)
|
102 |
|