from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
import streamlit as st

# load every .txt file in src_info_hf, auto-detecting the file encoding
text_loader_kwargs = {'autodetect_encoding': True}
loader = DirectoryLoader("src_info_hf", glob="./*.txt",
                         loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
docs = loader.load()

# optionally split the documents into chunks
#text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
#docs = text_splitter.split_documents(docs)

# create the open-source embedding function
#embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# embed the documents and load them into Chroma, using cosine distance
# and persisting the index to disk
chdb = Chroma.from_documents(docs, embedding_function,
                             collection_metadata={"hnsw:space": "cosine"},
                             persist_directory='chroma_db_info')

# simple Streamlit UI: take a query and show the top-3 matches with their scores
text = st.text_area("enter text")
if text:
    results = chdb.similarity_search_with_score(text, k=3)
    ret = ''
    for index, (doc, score) in enumerate(results):
        ret += f"Return {index} ({score:.4f}) :\n{doc.page_content}\n"
    st.text(ret)
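
# -----------------------------------------------------------------------------
# Note: Streamlit reruns this entire script on every interaction, so the code
# above re-embeds the whole corpus each time a query is submitted. A minimal
# sketch of one way to avoid that (an assumption, not part of the original:
# it relies on the index already having been persisted to chroma_db_info by an
# earlier run) is to reopen the persisted collection and cache it across reruns:

@st.cache_resource
def load_vectorstore():
    # reopen the persisted Chroma index with the same embedding model;
    # st.cache_resource keeps it in memory across Streamlit reruns
    embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
    return Chroma(persist_directory='chroma_db_info', embedding_function=embeddings)

#chdb = load_vectorstore()   # would replace the Chroma.from_documents(...) call above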