ak3ra committed
Commit 286d467 · 1 Parent(s): b8698fb

Refactor chat_function to unpack response tuple and return only the response

Files changed (2)
  1. app.py +2 -2
  2. rag/rag_pipeline.py +75 -36
app.py CHANGED
@@ -134,8 +134,8 @@ def chat_function(message: str, study_name: str, prompt_type: str) -> str:
         "Evidence-based": evidence_based_prompt,
     }.get(prompt_type)

-    response = rag.query(message, prompt_template=prompt)
-    return response.response
+    response, _ = rag.query(message, prompt_template=prompt)  # Unpack the tuple
+    return response


 def process_zotero_library_items(
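
Note: rag.query now returns a (response_text, source_info) tuple instead of a Response object, and chat_function keeps only the text. A caller that also wants the citation details could keep both elements; the sketch below is illustrative only (the helper name and formatting are assumptions, not part of this commit):

# Hypothetical caller that surfaces the source metadata returned by
# RAGPipeline.query; source_info is None for non-PDF (Zotero) collections.
def chat_with_citation(message: str, prompt) -> str:
    answer, source_info = rag.query(message, prompt_template=prompt)
    if source_info:  # populated only for PDF collections
        return (
            f"{answer}\n\nSource: {source_info.get('source_file')}, "
            f"p. {source_info.get('page_number')}"
        )
    return answer
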
rag/rag_pipeline.py CHANGED
@@ -10,7 +10,7 @@ from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.llms.openai import OpenAI
 from llama_index.vector_stores.chroma import ChromaVectorStore
 import chromadb
-from typing import Dict, Any, List, Tuple
+from typing import Dict, Any, List, Tuple, Optional
 import re
 import logging

@@ -33,9 +33,23 @@ class RAGPipeline:
         self.client = chromadb.Client()
         self.collection = self.client.get_or_create_collection(self.collection_name)
         self.embedding_model = OpenAIEmbedding(model_name="text-embedding-ada-002")
+        self.is_pdf = self._check_if_pdf_collection()
         self.load_documents()
         self.build_index()

+    def _check_if_pdf_collection(self) -> bool:
+        """Check if this is a PDF collection based on the JSON structure."""
+        try:
+            with open(self.study_json, "r") as f:
+                data = json.load(f)
+            # Check first document for PDF-specific fields
+            if data and isinstance(data, list) and len(data) > 0:
+                return "pages" in data[0] and "source_file" in data[0]
+            return False
+        except Exception as e:
+            logger.error(f"Error checking collection type: {str(e)}")
+            return False
+
     def extract_page_number_from_query(self, query: str) -> int:
         """Extract page number from query text."""
         # Look for patterns like "page 3", "p3", "p. 3", etc.
@@ -59,14 +73,45 @@ class RAGPipeline:
             self.data = json.load(f)

         self.documents = []
-        for index, doc_data in enumerate(self.data):
-            # Process each page's content separately
-            pages = doc_data.get("pages", {})
-            for page_num, page_content in pages.items():
+        if self.is_pdf:
+            # Handle PDF documents
+            for index, doc_data in enumerate(self.data):
+                pages = doc_data.get("pages", {})
+                for page_num, page_content in pages.items():
+                    if isinstance(page_content, dict):
+                        content = page_content.get("text", "")
+                    else:
+                        content = page_content
+
+                    doc_content = (
+                        f"Title: {doc_data['title']}\n"
+                        f"Page {page_num} Content:\n{content}\n"
+                        f"Authors: {', '.join(doc_data['authors'])}\n"
+                    )
+
+                    metadata = {
+                        "title": doc_data.get("title"),
+                        "authors": ", ".join(doc_data.get("authors", [])),
+                        "year": doc_data.get("date"),
+                        "source_file": doc_data.get("source_file"),
+                        "page_number": int(page_num),
+                        "total_pages": doc_data.get("page_count"),
+                    }
+
+                    self.documents.append(
+                        Document(
+                            text=doc_content,
+                            id_=f"doc_{index}_page_{page_num}",
+                            metadata=metadata,
+                        )
+                    )
+        else:
+            # Handle Zotero documents
+            for index, doc_data in enumerate(self.data):
                 doc_content = (
-                    f"Title: {doc_data['title']}\n"
-                    f"Page {page_num} Content:\n{page_content}\n"
-                    f"Authors: {', '.join(doc_data['authors'])}\n"
+                    f"Title: {doc_data.get('title', '')}\n"
+                    f"Abstract: {doc_data.get('abstract', '')}\n"
+                    f"Authors: {', '.join(doc_data.get('authors', []))}\n"
                 )

                 metadata = {
@@ -74,16 +119,11 @@
                     "authors": ", ".join(doc_data.get("authors", [])),
                     "year": doc_data.get("date"),
                     "doi": doc_data.get("doi"),
-                    "source_file": doc_data.get("source_file"),
-                    "page_number": int(page_num),  # Store as integer
-                    "total_pages": len(pages),
                 }

                 self.documents.append(
                     Document(
-                        text=doc_content,
-                        id_=f"doc_{index}_page_{page_num}",
-                        metadata=metadata,
+                        text=doc_content, id_=f"doc_{index}", metadata=metadata
                     )
                 )

@@ -113,7 +153,7 @@

     def query(
         self, context: str, prompt_template: PromptTemplate = None
-    ) -> Tuple[str, Dict[str, Any]]:
+    ) -> Tuple[str, Optional[Dict[str, Any]]]:
         if prompt_template is None:
             prompt_template = PromptTemplate(
                 "Context information is below.\n"
@@ -123,13 +163,14 @@
                 "Given this information, please answer the question: {query_str}\n"
                 "Provide a detailed answer using the content from the context above. "
                 "If the question asks about specific page content, make sure to include that information. "
-                "Cite sources using square brackets for EVERY piece of information, e.g. [1, p.3], [2, p.5], etc. "
+                "Cite sources using square brackets for EVERY piece of information, e.g. [1], [2], etc. "
                 "If you're unsure about something, say so rather than making assumptions."
             )

-        # Extract page number from query if present
-        requested_page = self.extract_page_number_from_query(context)
-        logger.info(f"Requested page number: {requested_page}")
+        # Extract page number for PDF documents
+        requested_page = (
+            self.extract_page_number_from_query(context) if self.is_pdf else None
+        )

         query_engine = self.index.as_query_engine(
             text_qa_template=prompt_template,
@@ -140,26 +181,24 @@

         response = query_engine.query(context)

-        # Extract source information from the response nodes
-        source_info = {}
+        # Handle source information based on document type
+        source_info = None
         if hasattr(response, "source_nodes") and response.source_nodes:
             source_node = response.source_nodes[0]
             metadata = source_node.metadata

-            # Use requested page number if available, otherwise use the page from metadata
-            page_number = (
-                requested_page
-                if requested_page is not None
-                else metadata.get("page_number", 0)
-            )
-
-            source_info = {
-                "source_file": metadata.get("source_file"),
-                "page_number": page_number,
-                "title": metadata.get("title"),
-                "authors": metadata.get("authors"),
-                "content": source_node.text,
-            }
-            logger.info(f"Source info page number: {page_number}")
+            if self.is_pdf:
+                page_number = (
+                    requested_page
+                    if requested_page is not None
+                    else metadata.get("page_number", 0)
+                )
+                source_info = {
+                    "source_file": metadata.get("source_file"),
+                    "page_number": page_number,
+                    "title": metadata.get("title"),
+                    "authors": metadata.get("authors"),
+                    "content": source_node.text,
+                }

         return response.response, source_info
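
Note: _check_if_pdf_collection only inspects the first entry of the study JSON. A minimal sketch of the two shapes it distinguishes (field values are illustrative; only keys actually read in this diff are shown):

# PDF-style entry: has "pages" and "source_file", so is_pdf is True and
# one Document is built per page.
pdf_entry = {
    "title": "Example study",
    "authors": ["A. Author"],
    "date": "2023",
    "source_file": "example.pdf",
    "page_count": 12,
    "pages": {"1": "Plain text of page 1", "2": {"text": "Dict-style page content"}},
}

# Zotero-style entry: no "pages"/"source_file", so is_pdf is False and a single
# Document is built from the title, abstract, and authors.
zotero_entry = {
    "title": "Example study",
    "abstract": "Short abstract text",
    "authors": ["A. Author"],
    "date": "2023",
    "doi": "10.1234/example",
}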