Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

TheoLvs committed on Sep 9

Commit

99e91d8

•

1 Parent(s): fd67e15

agents mode

Browse files

Files changed (12) hide show

climateqa/engine/chains/keywords_extraction.py +40 -0
climateqa/engine/chains/query_transformation.py +45 -4
climateqa/engine/chains/{retriever.py → retrieve_documents.py} +90 -55
climateqa/engine/graph.py +23 -12
climateqa/engine/llm/__init__.py +3 -0
climateqa/engine/llm/ollama.py +6 -0
climateqa/engine/utils.py +17 -0
climateqa/knowledge/__init__.py +0 -0
climateqa/{papers → knowledge}/openalex.py +61 -12
climateqa/{engine → knowledge}/retriever.py +1 -83
climateqa/papers/__init__.py +0 -43
sandbox/20240310 - CQA - Semantic Routing 1.ipynb +0 -0

climateqa/engine/chains/keywords_extraction.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+class KeywordExtraction(BaseModel):
     # Schema with a single field, `keywords` (a list of strings).
     # make_keywords_extraction_chain converts this class with
     # convert_to_openai_function and forces the LLM to call it.
     # NOTE(review): the class docstring and the field description below are
     # presumably sent to the model as part of the function schema, so treat
     # their wording as prompt text — confirm before editing them.
+    """
+    Analyzing the user query to extract keywords to feed a search engine
+    """
+    keywords: List[str] = Field(
+        description="""
+        Extract the keywords from the user query to feed a search engine as a list
+        Avoid adding super specific keywords to prefer general keywords
+        Maximum 3 keywords
+        Examples:
+        - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
+        - "How will El Nino be impacted by climate change" -> ["el nino","climate change"]
+        - "Is climate change a hoax" -> ["climate change","hoax"]
+        """
+    )
+def make_keywords_extraction_chain(llm):
     # Build the keyword-extraction chain for the given LLM.
     #
     # llm: object exposing `.bind(functions=..., function_call=...)`.
     # Returns: the composed runnable `prompt | bound llm | JsonOutputFunctionsParser()`.
     #
     # The KeywordExtraction schema is converted to an OpenAI function and the
     # call is pinned to that function name, so the model is asked to answer
     # through the schema rather than with free text.
+    openai_functions = [convert_to_openai_function(KeywordExtraction)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"KeywordExtraction"})
     # The prompt template has one variable, `input`.
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant"),
+        ("user", "input: {input}")
+    ])
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain

climateqa/engine/chains/query_transformation.py CHANGED Viewed

@@ -8,6 +8,13 @@ from langchain_core.utils.function_calling import convert_to_openai_function
 from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
 # Prompt from the original paper https://arxiv.org/pdf/2305.14283
 # Query Rewriting for Retrieval-Augmented Large Language Models
 class QueryDecomposition(BaseModel):
@@ -20,8 +27,8 @@ class QueryDecomposition(BaseModel):
         description="""
         Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
         Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find the IPCC reports and scientific literature
-        - If it's already a standalone question, you don't need to provide more questions, just reformulate it if relevant as a better question for a search engine
-        - If you need to decompose the question, output a list of maximum 3 questions
     """
     )
@@ -125,12 +132,20 @@ def make_query_rewriter_chain(llm):
     return chain
-def make_query_transform_node(llm):
     decomposition_chain = make_query_decomposition_chain(llm)
     rewriter_chain = make_query_rewriter_chain(llm)
     def transform_query(state):
         new_state = {}
@@ -145,7 +160,33 @@ def make_query_transform_node(llm):
             analysis_output = rewriter_chain.invoke({"input":question})
             question_state.update(analysis_output)
             questions.append(question_state)
-        new_state["questions"] = questions
         return new_state

 from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+ROUTING_INDEX = {
     # Maps a retrieval index name to the list of source labels served by that index.
+    "Vector":["IPCC","IPBES","IPOS"],
+    "OpenAlex":["OpenAlex"],
+}
 # Flat list of every source label across all indexes, in ROUTING_INDEX order.
+POSSIBLE_SOURCES = [y for values in ROUTING_INDEX.values() for y in values]
 # Prompt from the original paper https://arxiv.org/pdf/2305.14283
 # Query Rewriting for Retrieval-Augmented Large Language Models
 class QueryDecomposition(BaseModel):
         description="""
         Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
         Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find the IPCC reports and scientific literature
+        - If it's already a standalone and explicit question, just return the reformulated question for the search engine
+        - If you need to decompose the question, output a list of maximum 2 to 3 questions
     """
     )
     return chain
+def make_query_transform_node(llm,k_final=15):
     decomposition_chain = make_query_decomposition_chain(llm)
     rewriter_chain = make_query_rewriter_chain(llm)
     def transform_query(state):
+        if "sources_auto" not in state or state["sources_auto"] is None or state["sources_auto"] is False:
+            auto_mode = False
+        else:
+            auto_mode = True
+        sources_input = state.get("sources_input")
+        if sources_input is None: sources_input = ROUTING_INDEX["Vector"]
         new_state = {}
             analysis_output = rewriter_chain.invoke({"input":question})
             question_state.update(analysis_output)
             questions.append(question_state)
+        # Explode the questions into multiple questions with different sources
+        new_questions = []
+        for q in questions:
+            question,sources = q["question"],q["sources"]
+            # If not auto mode we take the configuration
+            if not auto_mode:
+                sources = sources_input
+            for index,index_sources in ROUTING_INDEX.items():
+                selected_sources = list(set(sources).intersection(index_sources))
+                if len(selected_sources) > 0:
+                    new_questions.append({"question":question,"sources":selected_sources,"index":index})
+        # # Add the number of questions to search
+        # k_by_question = k_final // len(new_questions)
+        # for q in new_questions:
+        #     q["k"] = k_by_question
+        # new_state["questions"] = new_questions
+        # new_state["remaining_questions"] = new_questions
+        new_state = {
+            "remaining_questions":new_questions,
+            "n_questions":len(new_questions),
+        }
         return new_state

climateqa/engine/chains/{retriever.py → retrieve_documents.py} RENAMED Viewed

@@ -2,8 +2,16 @@ import sys
 import os
 from contextlib import contextmanager
 from ..reranker import rerank_docs
-from ..retriever import ClimateQARetriever
@@ -44,80 +52,107 @@ def suppress_output():
             sys.stderr = old_stderr
-def make_retriever_node(vectorstore,reranker,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
-    def retrieve_documents(state):
-        POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS","OpenAlex"]
-        questions = state["questions"]
-        # Use sources from the user input or from the LLM detection
-        if "sources_input" not in state or state["sources_input"] is None:
-            sources_input = ["auto"]
-        else:
-            sources_input = state["sources_input"]
-        auto_mode = "auto" in sources_input
-        # There are several options to get the final top k
-        # Option 1 - Get 100 documents by question and rerank by question
-        # Option 2 - Get 100/n documents by question and rerank the total
-        if rerank_by_question:
-            k_by_question = divide_into_parts(k_final,len(questions))
         docs = []
-        for i,q in enumerate(questions):
-            sources = q["sources"]
-            question = q["question"]
-            # If auto mode, we use the sources detected by the LLM
-            if auto_mode:
-                sources = [x for x in sources if x in POSSIBLE_SOURCES]
-            # Otherwise, we use the config
-            else:
-                sources = sources_input
             # Search the document store using the retriever
             # Configure high top k for further reranking step
             retriever = ClimateQARetriever(
                 vectorstore=vectorstore,
                 sources = sources,
-                # reports = ias_reports,
-            min_size = 200,
-            k_summary = k_summary,k_total = k_before_reranking,
-            threshold = 0.5,
             )
-            docs_question = retriever.get_relevant_documents(question)
-            # Rerank
-            if reranker is not None:
-                with suppress_output():
-                    docs_question = rerank_docs(reranker,docs_question,question)
-            else:
-                # Add a default reranking score
-                for doc in docs_question:
-                    doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
-            # If rerank by question we select the top documents for each question
-            if rerank_by_question:
-                docs_question = docs_question[:k_by_question[i]]
-            # Add sources used in the metadata
             for doc in docs_question:
-                doc.metadata["sources_used"] = sources
-            # Add to the list of docs
-            docs.extend(docs_question)
         # Sorting the list in descending order by rerank_score
-        # Then select the top k
         docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
-        docs = docs[:k_final]
-        new_state = {"documents":docs}
         return new_state
     return retrieve_documents

 import os
 from contextlib import contextmanager
+from langchain_core.tools import tool
+from langchain_core.runnables import chain
+from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+from langchain_core.runnables import RunnableLambda
 from ..reranker import rerank_docs
+from ...knowledge.retriever import ClimateQARetriever
+from ...knowledge.openalex import OpenAlexRetriever
+from .keywords_extraction import make_keywords_extraction_chain
+from ..utils import log_event
             sys.stderr = old_stderr
+@tool
+def query_retriever(question):
     # Identity function: returns `question` unchanged.
     # It is not referenced anywhere else in the visible part of this diff.
     # NOTE(review): the docstring below is presumably used by @tool as the
     # tool description, so its wording is left as-is — confirm.
+    """Just a dummy tool to simulate the retriever query"""
+    return question
+def make_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
     # Factory for the "retrieve_documents" graph node.
     #
     # Parameters:
     #   vectorstore         - passed to ClimateQARetriever for the "Vector" index.
     #   reranker            - passed to rerank_docs; when None, the similarity score
     #                         is copied into "reranking_score" instead.
     #   llm                 - used to build the keyword-extraction chain (OpenAlex path).
     #   rerank_by_question  - when true, keep only the first k_by_question documents.
     #   k_final             - total document budget, split evenly across questions.
     #   k_before_reranking  - number of candidates requested from either retriever.
     #   k_summary           - forwarded to ClimateQARetriever.
     #
     # Returns the inner async function `retrieve_documents`, wrapped with @chain.
     # Each call handles only the FIRST entry of state["remaining_questions"] and
     # returns the rest, so the graph is expected to call the node repeatedly
     # until that list is empty.
+    # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
+    @chain
+    async def retrieve_documents(state,config):
+        keywords_extraction = make_keywords_extraction_chain(llm)
         # NOTE(review): indexing [0] raises IndexError when "remaining_questions"
         # is empty — confirm the upstream node always produces at least one entry.
+        current_question = state["remaining_questions"][0]
+        remaining_questions = state["remaining_questions"][1:]
+        # ToolMessage(f"Retrieving documents for question: {current_question['question']}",tool_call_id = "retriever")
+        # # There are several options to get the final top k
+        # # Option 1 - Get 100 documents by question and rerank by question
+        # # Option 2 - Get 100/n documents by question and rerank the total
+        # if rerank_by_question:
+        #     k_by_question = divide_into_parts(k_final,len(questions))
+        # docs = state["documents"]
+        # if docs is None: docs = []
         # NOTE(review): `docs` starts empty on every call, so the returned
         # "documents" value only holds this question's results. Whether results
         # from earlier calls survive depends on how the graph state merges the
         # "documents" key — confirm.
         docs = []
         # Floor division: any remainder of k_final is dropped.
         # NOTE(review): raises ZeroDivisionError when state["n_questions"] is 0.
+        k_by_question = k_final // state["n_questions"]
+        sources = current_question["sources"]
+        question = current_question["question"]
+        index = current_question["index"]
+        await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
+        if index == "Vector":
             # Search the document store using the retriever
             # Configure high top k for further reranking step
             retriever = ClimateQARetriever(
                 vectorstore=vectorstore,
                 sources = sources,
+                min_size = 200,
+                k_summary = k_summary,
+                k_total = k_before_reranking,
+                threshold = 0.5,
             )
+            docs_question = await retriever.ainvoke(question,config)
+        elif index == "OpenAlex":
             # Synchronous `.invoke` call made from inside this async function.
             # The extracted keywords are joined with " AND " to form the query.
+            keywords = keywords_extraction.invoke(question)["keywords"]
+            openalex_query = " AND ".join(keywords)
+            print(f"... OpenAlex query: {openalex_query}")
+            retriever_openalex = OpenAlexRetriever(
+                min_year = state.get("min_year",1960),
+                max_year = state.get("max_year",None),
+                k = k_before_reranking
+            )
+            docs_question = await retriever_openalex.ainvoke(openalex_query,config)
+        else:
+            raise Exception(f"Index {index} not found in the routing index")
+        # Rerank
         # Reranking uses the original question text, not the OpenAlex keyword query.
+        if reranker is not None:
+            with suppress_output():
+                docs_question = rerank_docs(reranker,docs_question,question)
+        else:
+            # Add a default reranking score
             # assumes every document carries metadata["similarity_score"] —
             # TODO confirm for documents coming from the OpenAlex path.
             for doc in docs_question:
+                doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
+        # If rerank by question we select the top documents for each question
+        if rerank_by_question:
+            docs_question = docs_question[:k_by_question]
+        # Add sources used in the metadata
+        for doc in docs_question:
+            doc.metadata["sources_used"] = sources
+            doc.metadata["question_used"] = question
+            doc.metadata["index_used"] = index
+        # Add to the list of docs
+        docs.extend(docs_question)
         # Sorting the list in descending order by rerank_score
         docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
+        new_state = {"documents":docs,"remaining_questions":remaining_questions}
         return new_state
     return retrieve_documents

climateqa/engine/graph.py CHANGED Viewed

@@ -16,10 +16,9 @@ from .chains.answer_ai_impact import make_ai_impact_node
 from .chains.query_transformation import make_query_transform_node
 from .chains.translation import make_translation_node
 from .chains.intent_categorization import make_intent_categorization_node
-from .chains.retriever import make_retriever_node
 from .chains.answer_rag import make_rag_node
 class GraphState(TypedDict):
     """
     Represents the state of our graph.
@@ -28,21 +27,28 @@ class GraphState(TypedDict):
     language : str
     intent : str
     query: str
-    questions : List[dict]
     answer: str
     audience: str = "experts"
-    sources_input: List[str] = ["auto"]
     documents: List[Document]
 def search(state):
     return {}
 def route_intent(state):
     intent = state["intent"]
     if intent in ["chitchat","esg"]:
         return "answer_chitchat"
-    elif intent == "ai_impact":
-        return "answer_ai_impact"
     else:
         # Search route
         return "search"
@@ -74,17 +80,18 @@ def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
     translate_query = make_translation_node(llm)
     answer_chitchat = make_chitchat_node(llm)
     answer_ai_impact = make_ai_impact_node(llm)
-    retrieve_documents = make_retriever_node(vectorstore,reranker)
     answer_rag = make_rag_node(llm,with_docs=True)
     answer_rag_no_docs = make_rag_node(llm,with_docs=False)
     # Define the nodes
     workflow.add_node("categorize_intent", categorize_intent)
     workflow.add_node("search", search)
     workflow.add_node("transform_query", transform_query)
     workflow.add_node("translate_query", translate_query)
     workflow.add_node("answer_chitchat", answer_chitchat)
-    workflow.add_node("answer_ai_impact", answer_ai_impact)
     workflow.add_node("retrieve_documents",retrieve_documents)
     workflow.add_node("answer_rag",answer_rag)
     workflow.add_node("answer_rag_no_docs",answer_rag_no_docs)
@@ -96,7 +103,7 @@ def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
     workflow.add_conditional_edges(
         "categorize_intent",
         route_intent,
-        make_id_dict(["answer_chitchat","answer_ai_impact","search"])
     )
     workflow.add_conditional_edges(
@@ -104,9 +111,14 @@ def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
         route_translation,
         make_id_dict(["translate_query","transform_query"])
     )
     workflow.add_conditional_edges(
         "retrieve_documents",
         lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
         make_id_dict(["answer_rag","answer_rag_no_docs"])
     )
@@ -114,11 +126,10 @@ def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
     # Define the edges
     workflow.add_edge("translate_query", "transform_query")
     workflow.add_edge("transform_query", "retrieve_documents")
-    workflow.add_edge("retrieve_documents", "answer_rag")
     workflow.add_edge("answer_rag", END)
     workflow.add_edge("answer_rag_no_docs", END)
     workflow.add_edge("answer_chitchat", END)
-    workflow.add_edge("answer_ai_impact", END)
     # Compile
     app = workflow.compile()

 from .chains.query_transformation import make_query_transform_node
 from .chains.translation import make_translation_node
 from .chains.intent_categorization import make_intent_categorization_node
+from .chains.retrieve_documents import make_retriever_node
 from .chains.answer_rag import make_rag_node
 class GraphState(TypedDict):
     """
     Represents the state of our graph.
     language : str
     intent : str
     query: str
+    remaining_questions : List[dict]
+    n_questions : int
     answer: str
     audience: str = "experts"
+    sources_input: List[str] = ["IPCC","IPBES"]
+    sources_auto: bool = True
+    min_year: int = 1960
+    max_year: int = None
     documents: List[Document]
 def search(state):
     # No-op node: ignores `state` and returns an empty dict.
     return {}
+def answer_search(state):
     # No-op node: ignores `state` and returns an empty dict.
     # It is registered in the graph as "answer_search" and is the node the
     # retrieval loop routes to once "remaining_questions" is empty.
+    return {}
 def route_intent(state):
     # Return the name of the next node based on state["intent"]:
     # "chitchat" and "esg" go to "answer_chitchat"; every other intent
     # (including "ai_impact", whose branch is commented out below) goes to "search".
     intent = state["intent"]
     if intent in ["chitchat","esg"]:
         return "answer_chitchat"
+    # elif intent == "ai_impact":
+    #     return "answer_ai_impact"
     else:
         # Search route
         return "search"
     translate_query = make_translation_node(llm)
     answer_chitchat = make_chitchat_node(llm)
     answer_ai_impact = make_ai_impact_node(llm)
+    retrieve_documents = make_retriever_node(vectorstore,reranker,llm)
     answer_rag = make_rag_node(llm,with_docs=True)
     answer_rag_no_docs = make_rag_node(llm,with_docs=False)
     # Define the nodes
     workflow.add_node("categorize_intent", categorize_intent)
     workflow.add_node("search", search)
+    workflow.add_node("answer_search", answer_search)
     workflow.add_node("transform_query", transform_query)
     workflow.add_node("translate_query", translate_query)
     workflow.add_node("answer_chitchat", answer_chitchat)
+    # workflow.add_node("answer_ai_impact", answer_ai_impact)
     workflow.add_node("retrieve_documents",retrieve_documents)
     workflow.add_node("answer_rag",answer_rag)
     workflow.add_node("answer_rag_no_docs",answer_rag_no_docs)
     workflow.add_conditional_edges(
         "categorize_intent",
         route_intent,
+        make_id_dict(["answer_chitchat","search"])
     )
     workflow.add_conditional_edges(
         route_translation,
         make_id_dict(["translate_query","transform_query"])
     )
     workflow.add_conditional_edges(
         "retrieve_documents",
+        lambda state : "retrieve_documents" if len(state["remaining_questions"]) > 0 else "answer_search",
+        make_id_dict(["retrieve_documents","answer_search"])
+    )
+    workflow.add_conditional_edges(
+        "answer_search",
         lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
         make_id_dict(["answer_rag","answer_rag_no_docs"])
     )
     # Define the edges
     workflow.add_edge("translate_query", "transform_query")
     workflow.add_edge("transform_query", "retrieve_documents")
     workflow.add_edge("answer_rag", END)
     workflow.add_edge("answer_rag_no_docs", END)
     workflow.add_edge("answer_chitchat", END)
+    # workflow.add_edge("answer_ai_impact", END)
     # Compile
     app = workflow.compile()

climateqa/engine/llm/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from climateqa.engine.llm.openai import get_llm as get_openai_llm
 from climateqa.engine.llm.azure import get_llm as get_azure_llm
 def get_llm(provider="openai",**kwargs):
@@ -8,6 +9,8 @@ def get_llm(provider="openai",**kwargs):
         return get_openai_llm(**kwargs)
     elif provider == "azure":
         return get_azure_llm(**kwargs)
     else:
         raise ValueError(f"Unknown provider: {provider}")

 from climateqa.engine.llm.openai import get_llm as get_openai_llm
 from climateqa.engine.llm.azure import get_llm as get_azure_llm
+from climateqa.engine.llm.ollama import get_llm as get_ollama_llm
 def get_llm(provider="openai",**kwargs):
         return get_openai_llm(**kwargs)
     elif provider == "azure":
         return get_azure_llm(**kwargs)
+    elif provider == "ollama":
+        return  get_ollama_llm(**kwargs)
     else:
         raise ValueError(f"Unknown provider: {provider}")

climateqa/engine/llm/ollama.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from langchain_community.llms import Ollama
+def get_llm(model="llama3", **kwargs):
     # Construct and return an `Ollama` LLM object.
     # model: model name passed to Ollama; defaults to "llama3".
     # **kwargs: forwarded unchanged to the Ollama constructor.
+    return Ollama(model=model, **kwargs)

climateqa/engine/utils.py CHANGED Viewed

@@ -1,8 +1,15 @@
 from operator import itemgetter
 from typing import Any, Dict, Iterable, Tuple
 from langchain_core.runnables import RunnablePassthrough
 def pass_values(x):
     if not isinstance(x, list):
         x = [x]
@@ -67,3 +74,13 @@ def flatten_dict(
     """
     flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
     return flat_dict

 from operator import itemgetter
 from typing import Any, Dict, Iterable, Tuple
+import tiktoken
 from langchain_core.runnables import RunnablePassthrough
+def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
     # Return the number of tokens produced when `string` is encoded with the
     # tiktoken encoding named `encoding_name` (default "cl100k_base").
     # The encoding is looked up through tiktoken.get_encoding on every call.
+    encoding = tiktoken.get_encoding(encoding_name)
+    num_tokens = len(encoding.encode(string))
+    return num_tokens
 def pass_values(x):
     if not isinstance(x, list):
         x = [x]
     """
     flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
     return flat_dict
+async def log_event(info,name,config):
     # info:   payload passed as the input of the pass-through chain.
     # name:   used as the chain's run_name.
     # config: forwarded to `ainvoke`.
     # Returns None: the result of `ainvoke` is discarded.
+    """Helper function that will run a dummy chain with the given info
+    The astream_event function will catch this chain and stream the dict info to the logger
+    """
+    chain = RunnablePassthrough().with_config(run_name=name)
+    _ = await chain.ainvoke(info,config)

climateqa/knowledge/__init__.py ADDED Viewed

File without changes

climateqa/{papers → knowledge}/openalex.py RENAMED Viewed

@@ -3,18 +3,32 @@ import networkx as nx
 import matplotlib.pyplot as plt
 from pyvis.network import Network
 from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
 import pyalex
 pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
 class OpenAlex():
     def __init__(self):
         pass
-    def search(self,keywords,n_results = 100,after = None,before = None):
         if isinstance(keywords,str):
             works = Works().search(keywords)
@@ -27,18 +41,21 @@ class OpenAlex():
                 break
             df_works = pd.DataFrame(page)
-            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x))
             df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
             df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
-            df_works["content"] = df_works["title"] + "\n" + df_works["abstract"]
         else:
-            df_works = []
-            for keyword in keywords:
-                df_keyword = self.search(keyword,n_results = n_results,after = after,before = before)
-                df_works.append(df_keyword)
-            df_works = pd.concat(df_works,ignore_index=True,axis = 0)
-        return df_works
     def rerank(self,query,df,reranker):
@@ -139,4 +156,36 @@ class OpenAlex():
                     reconstructed[position] = token
             # Join the tokens to form the reconstructed sentence(s)
-            return ' '.join(reconstructed)

 import matplotlib.pyplot as plt
 from pyvis.network import Network
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_core.documents.base import Document
+from langchain_core.vectorstores import VectorStore
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+from ..engine.utils import num_tokens_from_string
+from typing import List
+from pydantic import Field
 from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
 import pyalex
 pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
+def replace_nan_with_empty_dict(x):
     # Return `x` unchanged when pd.notna(x) is truthy, otherwise an empty dict.
     # OpenAlex.search maps this over the "primary_location" column so that the
     # later `.get("pdf_url", None)` lookup has a dict to work with.
     # assumes x is a scalar or a dict (a list-like x would make pd.notna
     # return an array) — TODO confirm against the OpenAlex payload.
+    return x if pd.notna(x) else {}
 class OpenAlex():
     def __init__(self):
         pass
+    def search(self,keywords:str,n_results = 100,after = None,before = None):
         if isinstance(keywords,str):
             works = Works().search(keywords)
                 break
             df_works = pd.DataFrame(page)
+            df_works = df_works.dropna(subset = ["title"])
+            df_works["primary_location"] = df_works["primary_location"].map(replace_nan_with_empty_dict)
+            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x)).fillna("")
             df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
             df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
+            df_works["url"] = df_works["id"]
+            df_works["content"] = (df_works["title"] + "\n" + df_works["abstract"]).map(lambda x : x.strip())
+            df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
+            df_works = df_works.drop(columns = ["abstract_inverted_index"])
+            # df_works["subtitle"] = df_works["title"] + " - " + df_works["primary_location"]["source"]["display_name"] + " - " + df_works["publication_year"]
+            return df_works
         else:
+           raise Exception("Keywords must be a string")
     def rerank(self,query,df,reranker):
                     reconstructed[position] = token
             # Join the tokens to form the reconstructed sentence(s)
+            return ' '.join(reconstructed)
+class OpenAlexRetriever(BaseRetriever):
     # Retriever that runs an OpenAlex search and wraps the result rows as Documents.
     #   min_year - passed to OpenAlex.search as `after` (default 1960)
     #   max_year - passed to OpenAlex.search as `before` (default None)
     #   k        - passed to OpenAlex.search as `n_results` (default 100)
     # NOTE(review): `k` has no type annotation while the other two fields do,
     # and `max_year` is annotated `int` with a None default — confirm both are
     # accepted by the model class BaseRetriever is built on.
+    min_year:int = 1960
+    max_year:int = None
+    k = 100
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
         # `query` is forwarded as-is to OpenAlex.search, which raises if it is
         # not a string. `run_manager` is not used in this method.
+        openalex = OpenAlex()
+        # Search for documents
+        df_docs = openalex.search(query,n_results=self.k,after = self.min_year,before = self.max_year)
+        docs = []
+        for i,row in df_docs.iterrows():
             # Skip rows whose content has fewer than 50 or more than 1000 tokens.
+            num_tokens = row["num_tokens"]
+            if num_tokens < 50 or num_tokens > 1000:
+                continue
             # The whole row (every DataFrame column) becomes the document metadata.
+            doc = Document(
+                page_content = row["content"],
+                metadata = row.to_dict()
+            )
+            docs.append(doc)
+        return docs

climateqa/{engine → knowledge}/retriever.py RENAMED Viewed

@@ -66,6 +66,7 @@ class ClimateQARetriever(BaseRetriever):
         # Add score to metadata
         results = []
         for i,(doc,score) in enumerate(docs):
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
@@ -78,86 +79,3 @@ class ClimateQARetriever(BaseRetriever):
         return results
-# def filter_summaries(df,k_summary = 3,k_total = 10):
-#     # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
-#     # # Filter by source
-#     # if source == "IPCC":
-#     #     df = df.loc[df["source"]=="IPCC"]
-#     # elif source == "IPBES":
-#     #     df = df.loc[df["source"]=="IPBES"]
-#     # else:
-#     #     pass
-#     # Separate summaries and full reports
-#     df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
-#     df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
-#     # Find passages from summaries dataset
-#     passages_summaries = df_summaries.head(k_summary)
-#     # Find passages from full reports dataset
-#     passages_fullreports = df_full.head(k_total - len(passages_summaries))
-#     # Concatenate passages
-#     passages = pd.concat([passages_summaries,passages_fullreports],axis = 0,ignore_index = True)
-#     return passages
-# def retrieve_with_summaries(query,retriever,k_summary = 3,k_total = 10,sources = ["IPCC","IPBES"],max_k = 100,threshold = 0.555,as_dict = True,min_length = 300):
-#     assert max_k > k_total
-#     validated_sources = ["IPCC","IPBES"]
-#     sources = [x for x in sources if x in validated_sources]
-#     filters = {
-#         "source": { "$in": sources },
-#     }
-#     print(filters)
-#     # Retrieve documents
-#     docs = retriever.retrieve(query,top_k = max_k,filters = filters)
-#     # Filter by score
-#     docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs if x.score > threshold]
-#     if len(docs) == 0:
-#         return []
-#     res = pd.DataFrame(docs)
-#     passages_df = filter_summaries(res,k_summary,k_total)
-#     if as_dict:
-#         contents = passages_df["content"].tolist()
-#         meta = passages_df.drop(columns = ["content"]).to_dict(orient = "records")
-#         passages = []
-#         for i in range(len(contents)):
-#             passages.append({"content":contents[i],"meta":meta[i]})
-#         return passages
-#     else:
-#         return passages_df
-# def retrieve(query,sources = ["IPCC"],threshold = 0.555,k = 10):
-#     print("hellooooo")
-#     # Reformulate queries
-#     reformulated_query,language = reformulate(query)
-#     print(reformulated_query)
-#     # Retrieve documents
-#     passages = retrieve_with_summaries(reformulated_query,retriever,k_total = k,k_summary = 3,as_dict = True,sources = sources,threshold = threshold)
-#     response = {
-#       "query":query,
-#       "reformulated_query":reformulated_query,
-#       "language":language,
-#       "sources":passages,
-#       "prompts":{"init_prompt":init_prompt,"sources_prompt":sources_prompt},
-#     }
-#     return response

         # Add score to metadata
         results = []
         for i,(doc,score) in enumerate(docs):
+            doc.page_content = doc.page_content.replace("\r\n"," ")
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
         return results

climateqa/papers/__init__.py DELETED Viewed

@@ -1,43 +0,0 @@
-import pandas as pd
-from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
-import pyalex
-pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
-class OpenAlex():
-    def __init__(self):
-        pass
-    def search(self,keywords,n_results = 100,after = None,before = None):
-        works = Works().search(keywords).get()
-        for page in works.paginate(per_page=n_results):
-            break
-        df_works = pd.DataFrame(page)
-        return works
-    def make_network(self):
-        pass
-    def get_abstract_from_inverted_index(self,index):
-        # Determine the maximum index to know the length of the reconstructed array
-        max_index = max([max(positions) for positions in index.values()])
-        # Initialize a list with placeholders for all positions
-        reconstructed = [''] * (max_index + 1)
-        # Iterate through the inverted index and place each token at its respective position(s)
-        for token, positions in index.items():
-            for position in positions:
-                reconstructed[position] = token
-        # Join the tokens to form the reconstructed sentence(s)
-        return ' '.join(reconstructed)

sandbox/20240310 - CQA - Semantic Routing 1.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff