Spaces:
Running
Running
added docling
Browse files- app.py +2 -5
- requirements.txt +3 -1
- utils/pdf_processor.py +98 -69
app.py
CHANGED
@@ -355,7 +355,7 @@ def chat_response(
|
|
355 |
history.append((message, response))
|
356 |
|
357 |
# Generate PDF preview if source information is available
|
358 |
-
preview_image = None
|
359 |
if (
|
360 |
source_info
|
361 |
and source_info.get("source_file")
|
@@ -364,13 +364,10 @@ def chat_response(
|
|
364 |
try:
|
365 |
# Get the first page number from the source
|
366 |
page_num = source_info["page_numbers"][0]
|
367 |
-
preview_image = pdf_processor.render_page(
|
368 |
-
source_info["source_file"], int(page_num)
|
369 |
-
)
|
370 |
except Exception as e:
|
371 |
logger.error(f"Error generating PDF preview: {str(e)}")
|
372 |
|
373 |
-
return history
|
374 |
|
375 |
|
376 |
def create_gr_interface() -> gr.Blocks:
|
|
|
355 |
history.append((message, response))
|
356 |
|
357 |
# Generate PDF preview if source information is available
|
358 |
+
# preview_image = None
|
359 |
if (
|
360 |
source_info
|
361 |
and source_info.get("source_file")
|
|
|
364 |
try:
|
365 |
# Get the first page number from the source
|
366 |
page_num = source_info["page_numbers"][0]
|
|
|
|
|
|
|
367 |
except Exception as e:
|
368 |
logger.error(f"Error generating PDF preview: {str(e)}")
|
369 |
|
370 |
+
return history
|
371 |
|
372 |
|
373 |
def create_gr_interface() -> gr.Blocks:
|
requirements.txt
CHANGED
@@ -14,4 +14,6 @@ python-slugify
|
|
14 |
PyMuPDF==1.23.8
|
15 |
Pillow==10.2.0
|
16 |
sqlmodel==0.0.22
|
17 |
-
cachetools
|
|
|
|
|
|
14 |
PyMuPDF==1.23.8
|
15 |
Pillow==10.2.0
|
16 |
sqlmodel==0.0.22
|
17 |
+
cachetools
|
18 |
+
docling
|
19 |
+
llama-index-readers-docling
|
utils/pdf_processor.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
"""
|
2 |
PDF processing module for ACRES RAG Platform.
|
3 |
Handles PDF file processing, text extraction, and page rendering.
|
@@ -6,10 +8,11 @@ Handles PDF file processing, text extraction, and page rendering.
|
|
6 |
import datetime
|
7 |
import json
|
8 |
import logging
|
9 |
-
# utils/pdf_processor.py
|
10 |
import os
|
11 |
import re
|
12 |
from typing import Dict, List, Optional
|
|
|
|
|
13 |
|
14 |
import fitz
|
15 |
from PIL import Image
|
@@ -18,6 +21,9 @@ from slugify import slugify
|
|
18 |
logger = logging.getLogger(__name__)
|
19 |
|
20 |
|
|
|
|
|
|
|
21 |
class PDFProcessor:
|
22 |
def __init__(self, upload_dir: str = "data/uploads"):
|
23 |
"""Initialize PDFProcessor with upload directory."""
|
@@ -25,30 +31,6 @@ class PDFProcessor:
|
|
25 |
os.makedirs(upload_dir, exist_ok=True)
|
26 |
self.current_page = 0
|
27 |
|
28 |
-
def render_page(self, file_path: str, page_num: int) -> Optional[Image.Image]:
    """Render a specific page from a PDF as an image.

    Args:
        file_path: Path of the PDF file to open.
        page_num: 0-based index of the page to render; it must satisfy
            ``0 <= page_num < page count``.

    Returns:
        The rendered page as an RGB PIL image, or None when the page
        number is out of range or any error occurs (errors are logged,
        not raised).
    """
    doc = None
    try:
        logger.info(f"Attempting to render page {page_num} from {file_path}")
        doc = fitz.open(file_path)

        # Ensure page number is valid
        if page_num < 0 or page_num >= len(doc):
            logger.error(
                f"Invalid page number {page_num} for document with {len(doc)} pages"
            )
            return None

        page = doc[page_num]
        # Increase resolution for better quality (300 dpi instead of 72)
        pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        logger.info(f"Successfully rendered page {page_num}")
        return image
    except Exception as e:
        logger.error(f"Error rendering page {page_num} from {file_path}: {str(e)}")
        return None
    finally:
        # Close the document on every path: the early return for an invalid
        # page and the error path previously left the file handle open.
        if doc is not None:
            doc.close()
|
51 |
-
|
52 |
def is_references_page(self, text: str) -> bool:
|
53 |
"""
|
54 |
Check if the page appears to be a references/bibliography page.
|
@@ -134,60 +116,107 @@ class PDFProcessor:
|
|
134 |
return output_path
|
135 |
|
136 |
def extract_text_from_pdf(self, file_path: str) -> Dict:
|
137 |
-
"""
|
|
|
|
|
|
|
138 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
doc = fitz.open(file_path)
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
pages = {}
|
147 |
-
for page_num in range(
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
|
|
153 |
continue
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
)
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
# Extract metadata
|
168 |
-
metadata = doc.metadata
|
169 |
-
if not metadata.get("title"):
|
170 |
-
metadata["title"] = os.path.basename(file_path)
|
171 |
-
|
172 |
-
# Create structured document
|
173 |
document = {
|
174 |
-
"title":
|
175 |
-
"authors":
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
),
|
180 |
-
"date": metadata.get("creationDate", ""),
|
181 |
-
"abstract": text[:500] + "..." if len(text) > 500 else text,
|
182 |
-
"full_text": text,
|
183 |
"source_file": file_path,
|
184 |
"pages": pages,
|
185 |
-
"page_count":
|
186 |
-
"content_pages": len(pages)
|
187 |
}
|
188 |
-
|
189 |
doc.close()
|
190 |
return document
|
|
|
191 |
except Exception as e:
|
192 |
logger.error(f"Error processing PDF {file_path}: {str(e)}")
|
193 |
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# utils/pdf_processor.py
|
2 |
+
|
3 |
"""
|
4 |
PDF processing module for ACRES RAG Platform.
|
5 |
Handles PDF file processing, text extraction, and page rendering.
|
|
|
8 |
import datetime
|
9 |
import json
|
10 |
import logging
|
|
|
11 |
import os
|
12 |
import re
|
13 |
from typing import Dict, List, Optional
|
14 |
+
from llama_index.readers.docling import DoclingReader
|
15 |
+
|
16 |
|
17 |
import fitz
|
18 |
from PIL import Image
|
|
|
21 |
logger = logging.getLogger(__name__)
|
22 |
|
23 |
|
24 |
+
reader = DoclingReader()
|
25 |
+
|
26 |
+
|
27 |
class PDFProcessor:
|
28 |
def __init__(self, upload_dir: str = "data/uploads"):
|
29 |
"""Initialize PDFProcessor with upload directory."""
|
|
|
31 |
os.makedirs(upload_dir, exist_ok=True)
|
32 |
self.current_page = 0
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
def is_references_page(self, text: str) -> bool:
|
35 |
"""
|
36 |
Check if the page appears to be a references/bibliography page.
|
|
|
116 |
return output_path
|
117 |
|
118 |
def extract_text_from_pdf(self, file_path: str) -> Dict:
    """
    Extract text and metadata from a PDF file using DoclingReader.

    The body text comes from Docling; PyMuPDF supplies the page count and
    the per-page text that is used to map content back to page numbers
    for source citation.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        A dict with the keys "title", "authors", "date", "abstract",
        "full_text", "source_file", "pages", "page_count" and
        "content_pages". "pages" maps the 0-based page index (as a
        string) to a dict holding the matched "text" and a 1-based
        "page_number". Pages recognised as reference pages, and pages
        for which no matching content is found, are left out.

    Raises:
        Exception: Any error raised while reading or parsing the file is
            logged and re-raised unchanged.
    """
    doc = None
    try:
        # Use DoclingReader for main content extraction; only the first
        # returned document is used.
        reader = DoclingReader()
        documents = reader.load_data(file_path)
        text = documents[0].text if documents else ""

        # Use PyMuPDF to get accurate page count
        doc = fitz.open(file_path)
        total_pages = len(doc)

        # Extract title: first markdown-style heading, else the file name
        title = os.path.basename(file_path)
        title_match = re.search(r'#+ (.+?)\n', text)
        if title_match:
            title = title_match.group(1).strip()

        # Extract abstract. Whitespace after the heading is consumed and
        # the capture must be non-empty; otherwise a heading followed by a
        # blank line ("Abstract\n\n...") satisfies the lookahead at once
        # and the abstract comes back empty.
        abstract = ""
        abstract_match = re.search(
            r'Abstract:?\s*(.+?)(?=\n\n|Keywords:|$)',
            text,
            re.DOTALL | re.IGNORECASE,
        )
        if abstract_match:
            abstract = abstract_match.group(1).strip()

        # Extract authors: the comma-separated line that precedes the
        # first line mentioning "Department"
        authors = []
        author_section = re.search(r'\n(.*?)\n.*?Department', text)
        if author_section:
            author_text = author_section.group(1)
            authors = [a.strip() for a in author_text.split(',') if a.strip()]

        # Remove references section: keep only what comes before the
        # first heading pattern that occurs in the text
        content = text
        ref_patterns = [r'\nReferences\n', r'\nBibliography\n', r'\nWorks Cited\n']
        for pattern in ref_patterns:
            split_text = re.split(pattern, content, flags=re.IGNORECASE)
            if len(split_text) > 1:
                content = split_text[0]
                break

        # Map content to pages using PyMuPDF for accurate page numbers
        pages = {}
        for page_num in range(total_pages):
            page = doc[page_num]
            page_text = page.get_text()

            # Skip if this appears to be a references page
            if self.is_references_page(page_text):
                logger.info(f"Skipping references page {page_num}")
                continue

            # Look for this page's content in the Docling-extracted text
            # This is a heuristic approach - we look for unique phrases from the page
            key_phrases = self._get_key_phrases(page_text)
            page_content = self._find_matching_content(content, key_phrases)

            if page_content:
                pages[str(page_num)] = {
                    'text': page_content,
                    'page_number': page_num + 1  # 1-based page numbers for human readability
                }

        # Create structured document with page-aware content
        document = {
            "title": title,
            "authors": authors,
            "date": "",  # Could be extracted if needed
            "abstract": abstract,
            "full_text": content,
            "source_file": file_path,
            "pages": pages,
            "page_count": total_pages,
            "content_pages": len(pages)  # Pages that were mapped to content
        }

        return document

    except Exception as e:
        logger.error(f"Error processing PDF {file_path}: {str(e)}")
        raise
    finally:
        # Release the PyMuPDF handle on success and on failure alike; it
        # used to stay open whenever an exception interrupted processing.
        if doc is not None:
            doc.close()
|
202 |
+
|
203 |
+
def _get_key_phrases(self, text: str, phrase_length: int = 10) -> List[str]:
    """Split text into consecutive word chunks to use as matching phrases.

    The text is split on whitespace and cut into non-overlapping runs of
    ``phrase_length`` words, each joined with single spaces. Only chunks
    longer than 20 characters are returned, in their original order.
    """
    tokens = text.split()
    chunks = (
        ' '.join(tokens[start:start + phrase_length])
        for start in range(0, len(tokens), phrase_length)
    )
    # Only use substantial phrases
    return [chunk for chunk in chunks if len(chunk.strip()) > 20]
|
212 |
+
|
213 |
+
def _find_matching_content(self, docling_text: str, key_phrases: List[str]) -> Optional[str]:
    """Find the corresponding content in Docling text using key phrases.

    Args:
        docling_text: Text to search; paragraphs are taken to be separated
            by blank lines (``"\\n\\n"``).
        key_phrases: Candidate phrases, tried in order.

    Returns:
        The first paragraph that contains the earliest phrase found
        anywhere in the text, or None when no phrase occurs in any
        paragraph. Only that single paragraph is returned, even if other
        phrases match other paragraphs.
    """
    # Split once up front: the paragraph list does not depend on the phrase,
    # so there is no need to rebuild it for every candidate.
    paragraphs = docling_text.split('\n\n')
    for phrase in key_phrases:
        # A phrase found inside a paragraph is necessarily inside the full
        # text, so a separate whole-text scan would add nothing.
        for para in paragraphs:
            if phrase in para:
                return para
    return None
|