ak3ra committed
Commit d0a03de · 1 Parent: c55ca9a

added source citations
Files changed (4):
  1. .gitignore +5 -0
  2. app.py +75 -32
  3. rag/rag_pipeline.py +27 -34
  4. study_files.json +9 -1
.gitignore CHANGED
@@ -131,6 +131,11 @@ ENV/
 env.bak/
 venv.bak/
 
+
+yes
+*.pub
+
+
 # Spyder project settings
 .spyderproject
 .spyproject
app.py CHANGED
@@ -494,7 +494,11 @@ def create_gr_interface() -> gr.Blocks:
 
         # Right column: PDF Preview and Upload
         with gr.Column(scale=3):
-            pdf_preview = gr.Image(label="Source Page", height=600)
+            # pdf_preview = gr.Image(label="Source Page", height=600)
+            source_info = gr.Markdown(
+                label="Sources",
+                value="No sources available yet."
+            )
             with gr.Row():
                 pdf_files = gr.File(
                     file_count="multiple",
@@ -572,6 +576,31 @@ def create_gr_interface() -> gr.Blocks:
         history = history + [(message, None)]
         return history, "", None
 
+    def format_source_info(source_nodes) -> str:
+        """Format source information into a markdown string."""
+        if not source_nodes:
+            return "No source information available"
+
+        sources_md = "### Sources\n\n"
+        seen_sources = set()  # To track unique sources
+
+        for idx, node in enumerate(source_nodes, 1):
+            metadata = node.metadata
+            if not metadata:
+                continue
+
+            source_key = (metadata.get('source_file', ''), metadata.get('page_number', 0))
+            if source_key in seen_sources:
+                continue
+
+            seen_sources.add(source_key)
+            title = metadata.get('title', os.path.basename(metadata.get('source_file', 'Unknown')))
+            page = metadata.get('page_number', 'N/A')
+
+            sources_md += f"{idx}. **{title}** - Page {page}\n"
+
+        return sources_md
+
     def generate_chat_response(history, collection_id, pdf_processor):
         """Generate response for the last message in history."""
         if not collection_id:
@@ -583,41 +612,55 @@ def create_gr_interface() -> gr.Blocks:
         try:
            # Get response and source info
            rag = get_rag_pipeline(collection_id)
-            response, source_info = rag.query(last_message)
-
-            # Generate preview if source information is available
-            preview_image = None
-            if (
-                source_info
-                and source_info.get("source_file")
-                and source_info.get("page_number") is not None
-            ):
-                try:
-                    page_num = source_info["page_number"]
-                    logger.info(f"Attempting to render page {page_num}")
-                    preview_image = pdf_processor.render_page(
-                        source_info["source_file"], page_num
+            response_text, source_nodes = rag.query(last_message)
+
+            # Format sources info
+            sources_md = "### Top Sources\n\n"
+            if source_nodes and len(source_nodes) > 0:
+                seen_sources = set()
+                source_count = 0
+
+                # Only process up to 3 sources
+                for node in source_nodes:
+                    if source_count >= 3:  # Stop after 3 sources
+                        break
+
+                    if not hasattr(node, 'metadata'):
+                        continue
+
+                    metadata = node.metadata
+                    source_key = (
+                        metadata.get('source_file', ''),
+                        metadata.get('page_number', 0)
                     )
-                    if preview_image:
-                        logger.info(
-                            f"Successfully generated preview for page {page_num}"
-                        )
-                    else:
-                        logger.warning(
-                            f"Failed to generate preview for page {page_num}"
-                        )
-                except Exception as e:
-                    logger.error(f"Error generating PDF preview: {str(e)}")
-                    preview_image = None
+
+                    if source_key in seen_sources:
+                        continue
+
+                    seen_sources.add(source_key)
+                    source_count += 1
+
+                    title = metadata.get('title', 'Unknown')
+                    if not title or title == 'Unknown':
+                        title = os.path.basename(metadata.get('source_file', 'Unknown Document'))
+
+                    page = metadata.get('page_number', 'N/A')
+                    sources_md += f"{source_count}. **{title}** - Page {page}\n"
+
+                if source_count == 0:
+                    sources_md = "No source information available"
+            else:
+                sources_md = "No source information available"
 
             # Update history with response
-            history[-1] = (last_message, response)
-            return history, preview_image
+            history[-1] = (last_message, response_text)
+            return history, sources_md
 
         except Exception as e:
             logger.error(f"Error in generate_chat_response: {str(e)}")
             history[-1] = (last_message, f"Error: {str(e)}")
-            return history, None
+            return history, "Error retrieving sources"
+
 
         # Update PDF event handlers
         upload_btn.click(  # Change from pdf_files.upload to upload_btn.click
@@ -630,11 +673,11 @@ def create_gr_interface() -> gr.Blocks:
         chat_submit_btn.click(
             add_message,
             inputs=[chat_history, query_input],
-            outputs=[chat_history, query_input, pdf_preview],
+            outputs=[chat_history, query_input],
         ).success(
-            lambda h, c: generate_chat_response(h, c, pdf_processor),
+            generate_chat_response,
             inputs=[chat_history, current_collection],
-            outputs=[chat_history, pdf_preview],
+            outputs=[chat_history, source_info],
         )
 
     return demo
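
The net effect of the app.py changes is that the chat callback now hands the UI a markdown string of up to three deduplicated sources instead of a rendered PDF page. A minimal standalone sketch of that formatting logic, using a hypothetical `SimpleNamespace` stand-in for the LlamaIndex source nodes that `response.source_nodes` would normally provide (the helper name `format_top_sources` is illustrative, not from the commit):

```python
import os
from types import SimpleNamespace


def format_top_sources(source_nodes, limit=3) -> str:
    """Deduplicate (source_file, page_number) pairs and render up to `limit` of them as markdown."""
    sources_md = "### Top Sources\n\n"
    seen, count = set(), 0
    for node in source_nodes:
        metadata = getattr(node, "metadata", None) or {}
        key = (metadata.get("source_file", ""), metadata.get("page_number", 0))
        if key in seen:
            continue
        seen.add(key)
        count += 1
        # Fall back to the file name when the title is missing or empty.
        title = metadata.get("title") or os.path.basename(metadata.get("source_file", "Unknown Document"))
        sources_md += f"{count}. **{title}** - Page {metadata.get('page_number', 'N/A')}\n"
        if count >= limit:
            break
    return sources_md if count else "No source information available"


# Hypothetical stand-ins for LlamaIndex source nodes (the real objects expose .metadata the same way).
nodes = [
    SimpleNamespace(metadata={"source_file": "data/a.pdf", "page_number": 3, "title": "Paper A"}),
    SimpleNamespace(metadata={"source_file": "data/a.pdf", "page_number": 3, "title": "Paper A"}),  # duplicate
    SimpleNamespace(metadata={"source_file": "data/b.pdf", "page_number": 7, "title": ""}),
]
print(format_top_sources(nodes))
```

Running the sketch prints a numbered markdown list in which the duplicate (source_file, page_number) pair is collapsed and the missing title falls back to the file name, which is the same behaviour the new `source_info` Markdown panel displays.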
rag/rag_pipeline.py CHANGED
@@ -152,29 +152,36 @@ class RAGPipeline:
         self.index = VectorStoreIndex(
             nodes, vector_store=vector_store, embed_model=self.embedding_model
         )
+
 
     def query(
         self, context: str, prompt_template: PromptTemplate = None
-    ) -> Tuple[str, Optional[Dict[str, Any]]]:
+    ) -> Tuple[str, List[Any]]:
         if prompt_template is None:
             prompt_template = PromptTemplate(
-                "Context information is below.\n"
-                "---------------------\n"
-                "{context_str}\n"
-                "---------------------\n"
-                "Given this information, please answer the question: {query_str}\n"
-                "Provide a detailed answer using the content from the context above. "
-                "If the question asks about specific page content, make sure to include that information. "
-                "Cite sources using square brackets for EVERY piece of information, e.g. [1], [2], etc. "
-                "If you're unsure about something, say so rather than making assumptions."
-            )
+                "Context information is below.\n"
+                "---------------------\n"
+                "{context_str}\n"
+                "---------------------\n"
+                "Given this information, please answer the question: {query_str}\n"
+                "Follow these guidelines for your response:\n"
+                "1. If the answer contains multiple pieces of information (e.g., author names, dates, statistics), "
+                "present it in a markdown table format.\n"
+                "2. For single piece information or simple answers, respond in a clear sentence.\n"
+                "3. Always cite sources using square brackets for EVERY piece of information, e.g. [1], [2], etc.\n"
+                "4. If the information spans multiple documents or pages, organize it by source.\n"
+                "5. If you're unsure about something, say so rather than making assumptions.\n"
+                "\nFormat tables like this:\n"
+                "| Field | Information | Source |\n"
+                "|-------|-------------|--------|\n"
+                "| Title | Example Title | [1] |\n"
+            )
 
         # Extract page number for PDF documents
         requested_page = (
             self.extract_page_number_from_query(context) if self.is_pdf else None
         )
 
-        # This is a hack to index all the documents in the store :)
         n_documents = len(self.index.docstore.docs)
         print(f"n_documents: {n_documents}")
         query_engine = self.index.as_query_engine(
@@ -185,25 +192,11 @@ class RAGPipeline:
         )
 
         response = query_engine.query(context)
-
-        # Handle source information based on document type
-        source_info = None
-        if hasattr(response, "source_nodes") and response.source_nodes:
-            source_node = response.source_nodes[0]
-            metadata = source_node.metadata
-
-            if self.is_pdf:
-                page_number = (
-                    requested_page
-                    if requested_page is not None
-                    else metadata.get("page_number", 0)
-                )
-                source_info = {
-                    "source_file": metadata.get("source_file"),
-                    "page_number": page_number,
-                    "title": metadata.get("title"),
-                    "authors": metadata.get("authors"),
-                    "content": source_node.text,
-                }
-
-        return response.response, source_info
+
+        # Debug logging
+        print(f"Response type: {type(response)}")
+        print(f"Has source_nodes: {hasattr(response, 'source_nodes')}")
+        if hasattr(response, 'source_nodes'):
+            print(f"Number of source nodes: {len(response.source_nodes)}")
+
+        return response.response, getattr(response, 'source_nodes', [])
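
With the new signature, `query()` returns the raw LlamaIndex source nodes instead of a single pre-built `source_info` dict, so callers pull metadata themselves. A rough usage sketch, assuming `get_rag_pipeline` (the helper app.py already uses) is importable from `app` and "Ebola Virus" is one of the configured collections; both are assumptions, not part of the commit:

```python
# Rough caller sketch for the new (response_text, source_nodes) return value.
from app import get_rag_pipeline  # assumed import path

rag = get_rag_pipeline("Ebola Virus")  # example collection id
response_text, source_nodes = rag.query("Summarise the main findings.")

print(response_text)
for node in source_nodes:  # LlamaIndex source nodes; may be an empty list
    meta = getattr(node, "metadata", {}) or {}
    print(meta.get("title"), meta.get("page_number"), meta.get("source_file"))
```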
study_files.json CHANGED
@@ -1,5 +1,13 @@
 {
     "Ebola Virus": "data/ebola-virus_zotero_items.json",
     "GeneXpert": "data/genexpert_zotero_items.json",
-    "Vaccine coverage": "data/vaccine-coverage_zotero_items.json"
+    "Vaccine coverage": "data/vaccine-coverage_zotero_items.json",
+    "Concept": "data/concept_zotero_items.json",
+    "Zotero Collection Pastan": "data/zotero-collection-pastan_zotero_items.json",
+    "pdf_thequickone": "data/thequickone_20250108_111913_documents.json",
+    "pdf_aforapples": "data/aforapples_20250108_113044_documents.json",
+    "pdf_bforbinance": "data/bforbinance_20250108_114459_documents.json",
+    "pdf_cforcongo": "data/cforcongo_20250108_115233_documents.json",
+    "pdf_hjhj": "data/hjhj_20250108_115714_documents.json",
+    "pdf_schooldropouts": "data/schooldropouts_20250108_140257_documents.json"
 }
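
study_files.json is a simple name-to-path registry mapping each collection to its Zotero export or processed PDF document set. A small sketch of how such a mapping can be read; the loading code itself is illustrative and not taken from the repo:

```python
import json

# Illustrative only: load the collection-name -> data-file registry.
with open("study_files.json") as f:
    study_files = json.load(f)

print(study_files["Vaccine coverage"])  # data/vaccine-coverage_zotero_items.json
for name, path in study_files.items():
    print(f"{name}: {path}")
```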