"""Streamlit hybrid (dense + BM25) information-retrieval app.

Lessons-learned sources:
https://sites.google.com/airliquide.com/sis-processsafety/knowledge/lessons-learned-sources
"""
import pickle
import re

import faiss
import streamlit as st
from sentence_transformers import SentenceTransformer  # kept: used by commented-out model line
from transformers import AutoTokenizer, AutoModel  # kept: imported by the original file
from llama_index.core import QueryBundle, StorageContext, VectorStoreIndex
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore  # kept: imported by the original file
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.vector_stores.faiss import FaissVectorStore

# model = SentenceTransformer("all-mpnet-base-v2")


class HybridRetriever(BaseRetriever):
    """Union of a dense (vector) retriever and a BM25 retriever.

    Results are concatenated (BM25 first) and deduplicated by node id,
    keeping the first occurrence of each node.
    """

    def __init__(self, vector_retriever, bm25_retriever):
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        super().__init__()

    def _retrieve(self, query, **kwargs):
        """Return the deduplicated union of BM25 and dense retrieval results."""
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
        vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
        all_nodes = []
        seen_ids = set()
        for n in bm25_nodes + vector_nodes:
            if n.node.node_id not in seen_ids:
                all_nodes.append(n)
                seen_ids.add(n.node.node_id)
        return all_nodes


@st.cache_resource(show_spinner=False)
class models:
    """Cached container for the embedding model and the cross-encoder reranker."""

    def __init__(self):
        EMBEDDING_MODEL = "BAAI/llm-embedder"
        self.embed_model = HuggingFaceEmbedding(EMBEDDING_MODEL, device="cpu")
        self.reranker = SentenceTransformerRerank(
            top_n=25,
            model="BAAI/bge-reranker-base",
            device="cpu",
        )


mod = models()
embed_model = mod.embed_model
reranker = mod.reranker


@st.cache_resource(show_spinner=False)
def load_data():
    """Load pre-pickled nodes and build the cached hybrid (dense + BM25) retriever.

    Returns:
        HybridRetriever over a FAISS-backed dense index and a BM25 index,
        both built from the same node list.
    """
    # NOTE(review): pickle.load is unsafe on untrusted data -- make sure
    # 'nodes_clean.pkl' comes from a trusted pipeline.
    with open("nodes_clean.pkl", "rb") as file:
        nodes = pickle.load(file)
    d = 768  # embedding dimension of BAAI/llm-embedder
    faiss_index = faiss.IndexFlatL2(d)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex(nodes, embed_model=embed_model, storage_context=storage_context)
    retriever_dense = index.as_retriever(similarity_top_k=25, embedding=True)
    retriever_bm25 = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=25)
    return HybridRetriever(retriever_dense, retriever_bm25)


hybrid_retriever = load_data()


def clean_whitespace(text, k=5):
    """Normalize extracted document text for display.

    Drops short (likely header/footer) lines among the first *k* lines,
    removes the '.EU' extraction artifact, collapses all whitespace runs
    to single spaces, and lowercases the result.
    """
    text = text.strip()
    lines = text.split("\n")
    # Keep only substantive (>25 char) lines in the first k lines; keep the rest verbatim.
    text = " ".join([line for line in lines[:k] if len(line.strip()) > 25] + lines[k:])
    text = re.sub(r"\.EU", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.lower()


def stream(reranked_nodes, text_size=700):
    """Render reranked nodes in the Streamlit UI, grouped by source document.

    Each document gets its rank, a linked title, a summary expander, and up
    to 5 extra text snippets (truncated to *text_size* characters each).
    """
    # Group nodes by source file, preserving rerank order of first appearance.
    nodes_dict = {}
    for nod in reranked_nodes:
        nodes_dict.setdefault(nod.metadata["file_name"], []).append(nod)
    for rank, (file_name, doc_nodes) in enumerate(nodes_dict.items()):
        meta = doc_nodes[0].metadata
        st.write(f"**Rank: {rank+1}** {file_name} ")
        st.write(f"- Title: [{meta['title']}]({meta['doc_url']})")
        with st.expander("Summary"):
            st.write(f"{meta['text']}")
        with st.expander("Extra Text(s) "):
            for n_extra, t in enumerate(doc_nodes[:5]):
                st.write(
                    f"Found in **page n°{t.metadata['page_label']}** - "
                    f"Extra text **n°{n_extra+1}:**"
                )
                st.write(f"\t {clean_whitespace(t.text[:text_size])}...")
        st.markdown("""---""")


def perform_search(query):
    """Run hybrid retrieval then cross-encoder reranking for *query*.

    Returns a list of reranked nodes, or an empty list for an empty query.
    """
    if not query:
        return []
    retrieved_nodes = hybrid_retriever.retrieve(query)
    return reranker.postprocess_nodes(
        retrieved_nodes,
        query_bundle=QueryBundle(query),
    )


def main():
    """Streamlit entry point: query box, search button, result rendering."""
    st.title("Information Retrieval System")
    query = st.text_input("Enter your question:")
    if st.button("Search") or query:
        sorted_docs = perform_search(query)
        # Persist results so they survive Streamlit reruns (fixes the dead
        # session_state read below, which previously always returned []).
        st.session_state.sorted_docs = sorted_docs
    else:
        sorted_docs = st.session_state.get("sorted_docs", [])
    if sorted_docs:
        stream(sorted_docs, 200)


if __name__ == "__main__":
    main()