# # https://github.com/langchain-ai/langchain/issues/8623
# import pandas as pd
# from langchain_core.retrievers import BaseRetriever
# from langchain_core.vectorstores import VectorStoreRetriever
# from langchain_core.documents.base import Document
# from langchain_core.vectorstores import VectorStore
# from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
# from typing import List
# from pydantic import Field
# def _add_metadata_and_score(docs: List) -> List[Document]:
#     # Add score to metadata
#     docs_with_metadata = []
#     for i,(doc,score) in enumerate(docs):
#         doc.page_content = doc.page_content.replace("\r\n"," ")
#         doc.metadata["similarity_score"] = score
#         doc.metadata["content"] = doc.page_content
#         doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
#         # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
#         docs_with_metadata.append(doc)
#     return docs_with_metadata
# class ClimateQARetriever(BaseRetriever):
#     vectorstore:VectorStore
#     sources:list = ["IPCC","IPBES","IPOS"]
#     reports:list = []
#     threshold:float = 0.6
#     k_summary:int = 3
#     k_total:int = 10
#     namespace:str = "vectors"
#     min_size:int = 200
#     def _get_relevant_documents(
#         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
#     ) -> List[Document]:
#         # NOTE(review): despite the -> List[Document] annotation, this method
#         # returns a dict of three document lists; confirm the intended
#         # contract with callers before re-enabling this code.
#         # Check if all elements in the list are either IPCC or IPBES
#         assert isinstance(self.sources,list)
#         assert self.sources
#         assert all([x in ["IPCC","IPBES","IPOS"] for x in self.sources])
#         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
#         # Prepare base search kwargs
#         filters = {}
#         if len(self.reports) > 0:
#             filters["short_name"] = {"$in":self.reports}
#         else:
#             filters["source"] = { "$in":self.sources}
#         # Search for k_summary documents in the summaries dataset
#         filters_summaries = {
#             **filters,
#             "chunk_type":"text",
#             "report_type": { "$in":["SPM"]},
#         }
#         docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
#         docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
#         # docs_summaries = []
#         # Search for k_total - k_summary documents in the full reports dataset
#         filters_full = {
#             **filters,
#             "chunk_type":"text",
#             "report_type": { "$nin":["SPM"]},
#         }
#         k_full = self.k_total - len(docs_summaries)
#         docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
#         # Images
#         filters_image = {
#             **filters,
#             "chunk_type":"image"
#         }
#         docs_images = self.vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_full)
#         # docs_images = []
#         # Concatenate documents
#         # docs = docs_summaries + docs_full + docs_images
#         # Filter if scores are below threshold
#         # docs = [x for x in docs if x[1] > self.threshold]
#         docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
#         # Filter if length are below threshold
#         docs_summaries = [x for x in docs_summaries if len(x.page_content) > self.min_size]
#         docs_full = [x for x in docs_full if len(x.page_content) > self.min_size]
#         return {
#             "docs_summaries" : docs_summaries,
#             "docs_full" : docs_full,
#             "docs_images" : docs_images,
#         }