askeco / app.py
gofeco's picture
Update app.py
18d4c13 verified
raw
history blame
1.38 kB
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
import streamlit as st
@st.cache_resource
def _build_vector_store():
    """Load the source texts and index them in a Chroma collection.

    Streamlit re-executes this script from top to bottom on every user
    interaction. Building the store inside a cached function means the
    documents are embedded and inserted once per server process, instead of
    being re-added to the same persisted collection on every rerun (which
    would make duplicate hits show up in the search results).

    Returns:
        The Chroma vector store holding one entry per loaded ``.txt`` file.
    """
    # Let TextLoader detect each file's encoding rather than assuming one.
    text_loader_kwargs = {"autodetect_encoding": True}
    loader = DirectoryLoader(
        "src_info_hf",
        glob="./*.txt",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    # Files are indexed whole; no chunking/splitting step is applied.
    source_docs = loader.load()

    # Open-source sentence-transformers embedding model.
    embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

    # Index with cosine space and persist the collection on disk.
    # NOTE(review): the persisted directory is not cleared here, so a process
    # restart presumably appends to whatever is already stored - confirm.
    return Chroma.from_documents(
        source_docs,
        embedding_function,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory="chroma_db_info",
    )


chdb = _build_vector_store()

text = st.text_area("enter text")
if text:
    # Query with the text the user entered (previously an undefined name).
    results = chdb.similarity_search_with_score(text, k=3)

    # Each hit is a (document, score) pair; number the hits from 0.
    # NOTE(review): the score is presumably a cosine distance (lower means
    # closer) given the "hnsw:space" setting - confirm against Chroma docs.
    ret = "".join(
        f"Return {index} ({score:.4f}) :\n{doc.page_content}\n"
        for index, (doc, score) in enumerate(results)
    )

    # st.text renders the string as-is, so the line breaks are preserved.
    st.text(ret)