Spaces:

Sunbird
/

acres

Sleeping

App Files Files Community

ak3ra commited on Sep 4, 2024

Commit

bc5a5b2

1 Parent(s): daee42b

added rag pipelin

Browse files

Files changed (1) hide show

rag/rag_pipeline.py +53 -39

rag/rag_pipeline.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import gradio as gr
 import json
 import os
 from typing import Dict, Any
@@ -6,40 +5,42 @@ from llama_index.core import (
     SimpleDirectoryReader,
     VectorStoreIndex,
     Document,
-    Response,
-    PromptTemplate
 )
-from llama_index.core.node_parser import SentenceSplitter
 from llama_index.embeddings.openai import OpenAIEmbedding
-# Make sure to set your OpenAI API key in the Hugging Face Spaces secrets
-import openai
-openai.api_key = os.environ.get('OPENAI_API_KEY')
 class RAGPipeline:
-    def __init__(self, metadata_file, pdf_dir, use_semantic_splitter=False):
         self.metadata_file = metadata_file
         self.pdf_dir = pdf_dir
-        self.index = None
         self.use_semantic_splitter = use_semantic_splitter
         self.load_documents()
         self.build_index()
     def load_documents(self):
-        with open(self.metadata_file, 'r') as f:
             self.metadata = json.load(f)
         self.documents = []
         for item_key, item_data in self.metadata.items():
-            metadata = item_data['metadata']
-            pdf_path = item_data.get('pdf_path')
             if pdf_path:
                 full_pdf_path = os.path.join(self.pdf_dir, os.path.basename(pdf_path))
                 if os.path.exists(full_pdf_path):
-                    pdf_content = SimpleDirectoryReader(input_files=[full_pdf_path]).load_data()[0].text
                 else:
                     pdf_content = "PDF file not found"
             else:
@@ -54,18 +55,9 @@ class RAGPipeline:
                 f"Full Text: {pdf_content}"
             )
-            self.documents.append(Document(
-                text=doc_content,
-                id_=item_key,
-                metadata={
-                    "title": metadata['title'],
-                    "abstract": metadata['abstract'],
-                    "authors": metadata['authors'],
-                    "year": metadata['year'],
-                    "doi": metadata['doi']
-                }
-            ))
     def build_index(self):
         if self.use_semantic_splitter:
@@ -73,7 +65,7 @@ class RAGPipeline:
             splitter = SemanticSplitterNodeParser(
                 buffer_size=1,
                 breakpoint_percentile_threshold=95,
-                embed_model=embed_model
             )
         else:
             splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
@@ -81,10 +73,40 @@ class RAGPipeline:
         nodes = splitter.get_nodes_from_documents(self.documents)
         self.index = VectorStoreIndex(nodes)
-    def query(self, question, prompt_template=None):
-        if prompt_template is None:
-            prompt_template = PromptTemplate(
                 "Context information is below.\n"
                 "---------------------\n"
                 "{context_str}\n"
@@ -95,11 +117,3 @@ class RAGPipeline:
                 "If the information is not available in the context, please state that clearly. "
                 "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
             )
-        query_engine = self.index.as_query_engine(
-            text_qa_template=prompt_template,
-            similarity_top_k=5
-        )
-        response = query_engine.query(question)
-        return response

 import json
 import os
 from typing import Dict, Any
     SimpleDirectoryReader,
     VectorStoreIndex,
     Document,
+    StorageContext,
+    load_index_from_storage,
 )
+from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
 from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.core import PromptTemplate
 class RAGPipeline:
+    def __init__(
+        self, metadata_file: str, pdf_dir: str, use_semantic_splitter: bool = False
+    ):
         self.metadata_file = metadata_file
         self.pdf_dir = pdf_dir
         self.use_semantic_splitter = use_semantic_splitter
+        self.index = None
         self.load_documents()
         self.build_index()
     def load_documents(self):
+        with open(self.metadata_file, "r") as f:
             self.metadata = json.load(f)
         self.documents = []
         for item_key, item_data in self.metadata.items():
+            metadata = item_data["metadata"]
+            pdf_path = item_data.get("pdf_path")
             if pdf_path:
                 full_pdf_path = os.path.join(self.pdf_dir, os.path.basename(pdf_path))
                 if os.path.exists(full_pdf_path):
+                    pdf_content = (
+                        SimpleDirectoryReader(input_files=[full_pdf_path])
+                        .load_data()[0]
+                        .text
+                    )
                 else:
                     pdf_content = "PDF file not found"
             else:
                 f"Full Text: {pdf_content}"
             )
+            self.documents.append(
+                Document(text=doc_content, id_=item_key, metadata=metadata)
+            )
     def build_index(self):
         if self.use_semantic_splitter:
             splitter = SemanticSplitterNodeParser(
                 buffer_size=1,
                 breakpoint_percentile_threshold=95,
+                embed_model=embed_model,
             )
         else:
             splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
         nodes = splitter.get_nodes_from_documents(self.documents)
         self.index = VectorStoreIndex(nodes)
+    def query(self, question: str, prompt_type: str = "default") -> Dict[str, Any]:
+        prompt_template = self._get_prompt_template(prompt_type)
+        query_engine = self.index.as_query_engine(
+            text_qa_template=prompt_template, similarity_top_k=5
+        )
+        response = query_engine.query(question)
+        return response
+    def _get_prompt_template(self, prompt_type: str) -> PromptTemplate:
+        if prompt_type == "highlight":
+            return PromptTemplate(
+                "Context information is below.\n"
+                "---------------------\n"
+                "{context_str}\n"
+                "---------------------\n"
+                "Given this information, please answer the question: {query_str}\n"
+                "Include all relevant information from the provided context. "
+                "Highlight key information by enclosing it in **asterisks**. "
+                "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
+            )
+        elif prompt_type == "evidence_based":
+            return PromptTemplate(
+                "Context information is below.\n"
+                "---------------------\n"
+                "{context_str}\n"
+                "---------------------\n"
+                "Given this information, please answer the question: {query_str}\n"
+                "Provide an answer to the question using evidence from the context above. "
+                "Cite sources using square brackets."
+            )
+        else:
+            return PromptTemplate(
                 "Context information is below.\n"
                 "---------------------\n"
                 "{context_str}\n"
                 "If the information is not available in the context, please state that clearly. "
                 "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
             )