from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
import streamlit as st

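# load all .txt files from the source directory, auto-detecting their encodings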
text_loader_kwargs = {'autodetect_encoding': True}
loader = DirectoryLoader("src_info_hf", glob="./*.txt", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
docs = loader.load()

# optionally split the loaded documents into chunks
#text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
#docs = text_splitter.split_documents(docs)

# create the open-source embedding function
#embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# load it into Chroma
chdb = Chroma.from_documents(docs, embedding_function, collection_metadata={"hnsw:space": "cosine"}, persist_directory='chroma_db_info')


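# minimal Streamlit UI: take a query, retrieve the closest chunks, and display them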
text = st.text_area("enter text")
if text:
    # retrieve the three closest chunks along with their distance scores
    results = chdb.similarity_search_with_score(text, k=3)
    ret = ''
    for index, (doc, score) in enumerate(results):
        ret += f"Return {index} ({score:.4f}) :\n{doc.page_content}\n"
    st.text(ret)