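# NOTE(review): the six lines below look like repository web-page residue
# (avatar caption, commit message, commit hash, view links, file size) rather
# than Python source; they are kept verbatim but commented out so the file parses.
# timeki's picture
# Rerank documents and force summary for policy makers
# d562d38
# raw
# history blame
# 4.33 kB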
# import sys
# import os
# from contextlib import contextmanager
# from ..reranker import rerank_docs
# from ...knowledge.retriever import ClimateQARetriever
# NOTE: this definition is disabled (commented out) and kept for reference only.
# NOTE(review): the indentation of the disabled code was reconstructed during
# review - confirm it before re-enabling.
#
# divide_into_parts(target, parts)
#   Split the integer `target` into a list of `parts` integers whose sum is
#   `target`. The first `target % parts` entries receive one extra unit, so
#   entries differ by at most 1 (e.g. target=15, parts=4 -> [4, 4, 4, 3]).
#   NOTE(review): `parts == 0` would raise ZeroDivisionError on the `//` and `%`
#   operations; there is no guard for it here.
#
# def divide_into_parts(target, parts):
#     # Base value for each part
#     base = target // parts
#     # Remainder to distribute
#     remainder = target % parts
#     # List to hold the result
#     result = []
#     for i in range(parts):
#         if i < remainder:
#             # These parts get base value + 1
#             result.append(base + 1)
#         else:
#             # The rest get the base value
#             result.append(base)
#     return result
# NOTE: this definition is disabled (commented out) and kept for reference only.
# NOTE(review): the indentation of the disabled code was reconstructed during
# review - confirm it before re-enabling. Re-enabling also requires the `sys`,
# `os` and `contextmanager` imports, which are commented out at the top of the file.
#
# suppress_output()
#   Context manager that, for the duration of the `with` body, rebinds
#   `sys.stdout` and `sys.stderr` to a handle opened on `os.devnull`, then
#   restores the previous objects in a `finally` clause, so they are restored
#   even if the body raises.
#   NOTE(review): only the `sys.stdout` / `sys.stderr` attributes are swapped;
#   output that bypasses those objects is presumably not silenced - verify if
#   that matters for the reranker call.
#
# @contextmanager
# def suppress_output():
#     # Open a null device
#     with open(os.devnull, 'w') as devnull:
#         # Store the original stdout and stderr
#         old_stdout = sys.stdout
#         old_stderr = sys.stderr
#         # Redirect stdout and stderr to the null device
#         sys.stdout = devnull
#         sys.stderr = devnull
#         try:
#             yield
#         finally:
#             # Restore stdout and stderr
#             sys.stdout = old_stdout
#             sys.stderr = old_stderr
# NOTE: this definition is disabled (commented out) and kept for reference only.
# NOTE(review): the indentation of the disabled code was reconstructed during
# review - confirm it before re-enabling. Re-enabling also requires the
# `rerank_docs` and `ClimateQARetriever` imports, which are commented out at the
# top of the file.
#
# make_retriever_node(vectorstore, reranker, rerank_by_question=True,
#                     k_final=15, k_before_reranking=100, k_summary=5)
#   Factory that returns a `retrieve_documents(state)` closure.
#
#   Parameters:
#     vectorstore          - passed unchanged to `ClimateQARetriever`.
#     reranker             - if not None, each question's documents go through
#                            `rerank_docs(reranker, docs, question)`; if None, each
#                            document's "reranking_score" metadata is copied from its
#                            "similarity_score" metadata.
#     rerank_by_question   - if True, each question keeps only its share of
#                            `k_final`, as computed by `divide_into_parts`.
#     k_final              - maximum number of documents in the returned list.
#     k_before_reranking   - passed to `ClimateQARetriever` as `k_total`.
#     k_summary            - passed to `ClimateQARetriever` as `k_summary`.
#
#   The closure reads `state["questions"]` (items indexed with "sources" and
#   "question") and the optional `state["sources_input"]`; it returns
#   `{"documents": docs}` where `docs` is sorted by descending "reranking_score"
#   metadata and truncated to `k_final`.
#
#   NOTE(review): with `rerank_by_question=True` and an empty `questions` list,
#   `divide_into_parts(k_final, 0)` divides by zero - confirm callers never pass
#   an empty list.
#   NOTE(review): the no-reranker branch assumes every retrieved document already
#   carries a "similarity_score" metadata key, presumably set by
#   `ClimateQARetriever` - verify.
#
# def make_retriever_node(vectorstore,reranker,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
#     def retrieve_documents(state):
#         POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS"] # ,"OpenAlex"]
#         questions = state["questions"]
#         # Use sources from the user input or from the LLM detection
#         if "sources_input" not in state or state["sources_input"] is None:
#             sources_input = ["auto"]
#         else:
#             sources_input = state["sources_input"]
#         # "auto" anywhere in the list switches to the per-question detected sources
#         auto_mode = "auto" in sources_input
#         # There are several options to get the final top k
#         # Option 1 - Get 100 documents by question and rerank by question
#         # Option 2 - Get 100/n documents by question and rerank the total
#         if rerank_by_question:
#             k_by_question = divide_into_parts(k_final,len(questions))
#         docs = []
#         for i,q in enumerate(questions):
#             sources = q["sources"]
#             question = q["question"]
#             # If auto mode, we use the sources detected by the LLM
#             # (restricted to the names listed in POSSIBLE_SOURCES)
#             if auto_mode:
#                 sources = [x for x in sources if x in POSSIBLE_SOURCES]
#             # Otherwise, we use the config
#             else:
#                 sources = sources_input
#             # Search the document store using the retriever
#             # Configure high top k for further reranking step
#             retriever = ClimateQARetriever(
#                 vectorstore=vectorstore,
#                 sources = sources,
#                 # reports = ias_reports,
#                 min_size = 200,
#                 k_summary = k_summary,
#                 k_total = k_before_reranking,
#                 threshold = 0.5,
#             )
#             docs_question = retriever.get_relevant_documents(question)
#             # Rerank
#             if reranker is not None:
#                 with suppress_output():
#                     docs_question = rerank_docs(reranker,docs_question,question)
#             else:
#                 # Add a default reranking score
#                 for doc in docs_question:
#                     doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
#             # If rerank by question we select the top documents for each question
#             if rerank_by_question:
#                 docs_question = docs_question[:k_by_question[i]]
#             # Add sources used in the metadata
#             for doc in docs_question:
#                 doc.metadata["sources_used"] = sources
#             # Add to the list of docs
#             docs.extend(docs_question)
#         # Sorting the list in descending order by rerank_score
#         # Then select the top k
#         docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
#         docs = docs[:k_final]
#         new_state = {"documents":docs}
#         return new_state
#     return retrieve_documents