ak3ra committed on
Commit
e31c953
·
1 Parent(s): d0a03de

added docling

Browse files
Files changed (3) hide show
  1. app.py +2 -5
  2. requirements.txt +3 -1
  3. utils/pdf_processor.py +98 -69
app.py CHANGED
@@ -355,7 +355,7 @@ def chat_response(
355
  history.append((message, response))
356
 
357
  # Generate PDF preview if source information is available
358
- preview_image = None
359
  if (
360
  source_info
361
  and source_info.get("source_file")
@@ -364,13 +364,10 @@ def chat_response(
364
  try:
365
  # Get the first page number from the source
366
  page_num = source_info["page_numbers"][0]
367
- preview_image = pdf_processor.render_page(
368
- source_info["source_file"], int(page_num)
369
- )
370
  except Exception as e:
371
  logger.error(f"Error generating PDF preview: {str(e)}")
372
 
373
- return history, preview_image
374
 
375
 
376
  def create_gr_interface() -> gr.Blocks:
 
355
  history.append((message, response))
356
 
357
  # Generate PDF preview if source information is available
358
+ # preview_image = None
359
  if (
360
  source_info
361
  and source_info.get("source_file")
 
364
  try:
365
  # Get the first page number from the source
366
  page_num = source_info["page_numbers"][0]
 
 
 
367
  except Exception as e:
368
  logger.error(f"Error generating PDF preview: {str(e)}")
369
 
370
+ return history
371
 
372
 
373
  def create_gr_interface() -> gr.Blocks:
requirements.txt CHANGED
@@ -14,4 +14,6 @@ python-slugify
14
  PyMuPDF==1.23.8
15
  Pillow==10.2.0
16
  sqlmodel==0.0.22
17
- cachetools
 
 
 
14
  PyMuPDF==1.23.8
15
  Pillow==10.2.0
16
  sqlmodel==0.0.22
17
+ cachetools
18
+ docling
19
+ llama-index-readers-docling
utils/pdf_processor.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  """
2
  PDF processing module for ACRES RAG Platform.
3
  Handles PDF file processing, text extraction, and page rendering.
@@ -6,10 +8,11 @@ Handles PDF file processing, text extraction, and page rendering.
6
  import datetime
7
  import json
8
  import logging
9
- # utils/pdf_processor.py
10
  import os
11
  import re
12
  from typing import Dict, List, Optional
 
 
13
 
14
  import fitz
15
  from PIL import Image
@@ -18,6 +21,9 @@ from slugify import slugify
18
  logger = logging.getLogger(__name__)
19
 
20
 
 
 
 
21
  class PDFProcessor:
22
  def __init__(self, upload_dir: str = "data/uploads"):
23
  """Initialize PDFProcessor with upload directory."""
@@ -25,30 +31,6 @@ class PDFProcessor:
25
  os.makedirs(upload_dir, exist_ok=True)
26
  self.current_page = 0
27
 
28
- def render_page(self, file_path: str, page_num: int) -> Optional[Image.Image]:
29
- """Render a specific page from a PDF as an image."""
30
- try:
31
- logger.info(f"Attempting to render page {page_num} from {file_path}")
32
- doc = fitz.open(file_path)
33
-
34
- # Ensure page number is valid
35
- if page_num < 0 or page_num >= len(doc):
36
- logger.error(
37
- f"Invalid page number {page_num} for document with {len(doc)} pages"
38
- )
39
- return None
40
-
41
- page = doc[page_num]
42
- # Increase resolution for better quality
43
- pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
44
- image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
45
- doc.close()
46
- logger.info(f"Successfully rendered page {page_num}")
47
- return image
48
- except Exception as e:
49
- logger.error(f"Error rendering page {page_num} from {file_path}: {str(e)}")
50
- return None
51
-
52
  def is_references_page(self, text: str) -> bool:
53
  """
54
  Check if the page appears to be a references/bibliography page.
@@ -134,60 +116,107 @@ class PDFProcessor:
134
  return output_path
135
 
136
  def extract_text_from_pdf(self, file_path: str) -> Dict:
137
- """Extract text and metadata from a PDF file."""
 
 
 
138
  try:
 
 
 
 
 
 
139
  doc = fitz.open(file_path)
140
-
141
- # Find references section start
142
- refs_start = self.detect_references_start(doc)
143
-
144
- # Extract text from all pages with page tracking
145
- text = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  pages = {}
147
- for page_num in range(len(doc)):
148
- # Skip if this is after references section starts
149
- if refs_start is not None and page_num >= refs_start:
150
- logger.info(
151
- f"Skipping page {page_num} as it appears to be part of references"
152
- )
 
153
  continue
154
-
155
- page_text = doc[page_num].get_text()
156
-
157
- # Extra check to catch references if they weren't caught by the initial scan
158
- if page_num > 0 and self.is_references_page(page_text):
159
- logger.info(
160
- f"Detected references content on page {page_num}, skipping"
161
- )
162
- continue
163
-
164
- pages[str(page_num)] = page_text
165
- text += page_text + "\n"
166
-
167
- # Extract metadata
168
- metadata = doc.metadata
169
- if not metadata.get("title"):
170
- metadata["title"] = os.path.basename(file_path)
171
-
172
- # Create structured document
173
  document = {
174
- "title": metadata.get("title", ""),
175
- "authors": (
176
- metadata.get("author", "").split(";")
177
- if metadata.get("author")
178
- else []
179
- ),
180
- "date": metadata.get("creationDate", ""),
181
- "abstract": text[:500] + "..." if len(text) > 500 else text,
182
- "full_text": text,
183
  "source_file": file_path,
184
  "pages": pages,
185
- "page_count": len(doc),
186
- "content_pages": len(pages), # Number of pages excluding references
187
  }
188
-
189
  doc.close()
190
  return document
 
191
  except Exception as e:
192
  logger.error(f"Error processing PDF {file_path}: {str(e)}")
193
  raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils/pdf_processor.py
2
+
3
  """
4
  PDF processing module for ACRES RAG Platform.
5
  Handles PDF file processing, text extraction, and page rendering.
 
8
  import datetime
9
  import json
10
  import logging
 
11
  import os
12
  import re
13
  from typing import Dict, List, Optional
14
+ from llama_index.readers.docling import DoclingReader
15
+
16
 
17
  import fitz
18
  from PIL import Image
 
21
  logger = logging.getLogger(__name__)
22
 
23
 
24
+ reader = DoclingReader()
25
+
26
+
27
  class PDFProcessor:
28
  def __init__(self, upload_dir: str = "data/uploads"):
29
  """Initialize PDFProcessor with upload directory."""
 
31
  os.makedirs(upload_dir, exist_ok=True)
32
  self.current_page = 0
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def is_references_page(self, text: str) -> bool:
35
  """
36
  Check if the page appears to be a references/bibliography page.
 
116
  return output_path
117
 
118
  def extract_text_from_pdf(self, file_path: str) -> Dict:
119
+ """
120
+ Extract text and metadata from a PDF file using DoclingReader.
121
+ Maintains accurate page numbers for source citation.
122
+ """
123
  try:
124
+ # Use DoclingReader for main content extraction
125
+ reader = DoclingReader()
126
+ documents = reader.load_data(file_path)
127
+ text = documents[0].text if documents else ""
128
+
129
+ # Use PyMuPDF to get accurate page count
130
  doc = fitz.open(file_path)
131
+ total_pages = len(doc)
132
+
133
+ # Extract title from document
134
+ title = os.path.basename(file_path)
135
+ title_match = re.search(r'#+ (.+?)\n', text)
136
+ if title_match:
137
+ title = title_match.group(1).strip()
138
+
139
+ # Extract abstract
140
+ abstract = ""
141
+ abstract_match = re.search(r'Abstract:?(.*?)(?=\n\n|Keywords:|$)', text, re.DOTALL | re.IGNORECASE)
142
+ if abstract_match:
143
+ abstract = abstract_match.group(1).strip()
144
+
145
+ # Extract authors
146
+ authors = []
147
+ author_section = re.search(r'\n(.*?)\n.*?Department', text)
148
+ if author_section:
149
+ author_text = author_section.group(1)
150
+ authors = [a.strip() for a in author_text.split(',') if a.strip()]
151
+
152
+ # Remove references section
153
+ content = text
154
+ ref_patterns = [r'\nReferences\n', r'\nBibliography\n', r'\nWorks Cited\n']
155
+ for pattern in ref_patterns:
156
+ split_text = re.split(pattern, content, flags=re.IGNORECASE)
157
+ if len(split_text) > 1:
158
+ content = split_text[0]
159
+ break
160
+
161
+ # Map content to pages using PyMuPDF for accurate page numbers
162
  pages = {}
163
+ for page_num in range(total_pages):
164
+ page = doc[page_num]
165
+ page_text = page.get_text()
166
+
167
+ # Skip if this appears to be a references page
168
+ if self.is_references_page(page_text):
169
+ logger.info(f"Skipping references page {page_num}")
170
  continue
171
+
172
+ # Look for this page's content in the Docling-extracted text
173
+ # This is a heuristic approach - we look for unique phrases from the page
174
+ key_phrases = self._get_key_phrases(page_text)
175
+ page_content = self._find_matching_content(content, key_phrases)
176
+
177
+ if page_content:
178
+ pages[str(page_num)] = {
179
+ 'text': page_content,
180
+ 'page_number': page_num + 1 # 1-based page numbers for human readability
181
+ }
182
+
183
+ # Create structured document with page-aware content
 
 
 
 
 
 
184
  document = {
185
+ "title": title,
186
+ "authors": authors,
187
+ "date": "", # Could be extracted if needed
188
+ "abstract": abstract,
189
+ "full_text": content,
 
 
 
 
190
  "source_file": file_path,
191
  "pages": pages,
192
+ "page_count": total_pages,
193
+ "content_pages": len(pages) # Number of non-reference pages
194
  }
195
+
196
  doc.close()
197
  return document
198
+
199
  except Exception as e:
200
  logger.error(f"Error processing PDF {file_path}: {str(e)}")
201
  raise
202
+
203
+ def _get_key_phrases(self, text: str, phrase_length: int = 10) -> List[str]:
204
+ """Extract key phrases from text for matching."""
205
+ words = text.split()
206
+ phrases = []
207
+ for i in range(0, len(words), phrase_length):
208
+ phrase = ' '.join(words[i:i + phrase_length])
209
+ if len(phrase.strip()) > 20: # Only use substantial phrases
210
+ phrases.append(phrase)
211
+ return phrases
212
+
213
+ def _find_matching_content(self, docling_text: str, key_phrases: List[str]) -> Optional[str]:
214
+ """Find the corresponding content in Docling text using key phrases."""
215
+ for phrase in key_phrases:
216
+ if phrase in docling_text:
217
+ # Find the paragraph or section containing this phrase
218
+ paragraphs = docling_text.split('\n\n')
219
+ for para in paragraphs:
220
+ if phrase in para:
221
+ return para
222
+ return None