Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

climate-question-answering / climateqa /engine /chains /retriever.py

timeki

Rerank documents and force summary for policy makers

d562d38 about 1 month ago

raw

history blame

4.33 kB

	# import sys
	# import os
	# from contextlib import contextmanager

	# from ..reranker import rerank_docs
	# from ...knowledge.retriever import ClimateQARetriever




	# def divide_into_parts(target, parts):
	# # Base value for each part
	# base = target // parts
	# # Remainder to distribute
	# remainder = target % parts
	# # List to hold the result
	# result = []

	# for i in range(parts):
	# if i < remainder:
	# # These parts get base value + 1
	# result.append(base + 1)
	# else:
	# # The rest get the base value
	# result.append(base)

	# return result


	# @contextmanager
	# def suppress_output():
	# # Open a null device
	# with open(os.devnull, 'w') as devnull:
	# # Store the original stdout and stderr
	# old_stdout = sys.stdout
	# old_stderr = sys.stderr
	# # Redirect stdout and stderr to the null device
	# sys.stdout = devnull
	# sys.stderr = devnull
	# try:
	# yield
	# finally:
	# # Restore stdout and stderr
	# sys.stdout = old_stdout
	# sys.stderr = old_stderr



	# def make_retriever_node(vectorstore,reranker,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):

	# def retrieve_documents(state):

	# POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS"] # ,"OpenAlex"]
	# questions = state["questions"]

	# # Use sources from the user input or from the LLM detection
	# if "sources_input" not in state or state["sources_input"] is None:
	# sources_input = ["auto"]
	# else:
	# sources_input = state["sources_input"]
	# auto_mode = "auto" in sources_input

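	# # There are two ways to reach the final top k (k_final); both retrieve k_before_reranking documents per question
	# # Option 1 (rerank_by_question=True) - Keep only each question's share of k_final (see divide_into_parts) after its own reranking
	# # Option 2 (rerank_by_question=False) - Keep every reranked document per question, then select the top k_final from the pooled list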
	# if rerank_by_question:
	# k_by_question = divide_into_parts(k_final,len(questions))

	# docs = []

	# for i,q in enumerate(questions):

	# sources = q["sources"]
	# question = q["question"]

	# # If auto mode, we use the sources detected by the LLM
	# if auto_mode:
	# sources = [x for x in sources if x in POSSIBLE_SOURCES]

	# # Otherwise, we use the sources explicitly provided in sources_input
	# else:
	# sources = sources_input

	# # Search the document store using the retriever
	# # Configure high top k for further reranking step
	# retriever = ClimateQARetriever(
	# vectorstore=vectorstore,
	# sources = sources,
	# # reports = ias_reports,
	# min_size = 200,
	# k_summary = k_summary,
	# k_total = k_before_reranking,
	# threshold = 0.5,
	# )
	# docs_question = retriever.get_relevant_documents(question)

	# # Rerank
	# if reranker is not None:
	# with suppress_output():
	# docs_question = rerank_docs(reranker,docs_question,question)
	# else:
	# # Add a default reranking score
	# for doc in docs_question:
	# doc.metadata["reranking_score"] = doc.metadata["similarity_score"]

	# # If rerank by question we select the top documents for each question
	# if rerank_by_question:
	# docs_question = docs_question[:k_by_question[i]]

	# # Add sources used in the metadata
	# for doc in docs_question:
	# doc.metadata["sources_used"] = sources

	# # Add to the list of docs
	# docs.extend(docs_question)

	# # Sort the pooled documents in descending order of reranking_score
	# # Then select the top k_final
	# docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
	# docs = docs[:k_final]

	# new_state = {"documents":docs}
	# return new_state

	# return retrieve_documents