Brocxi / inject.py
Esrath's picture
Upload 3 files
96d5d14 verified
raw
history blame
1.48 kB
import os
from dotenv import load_dotenv
from transformers import AutoModel
from langchain.storage import LocalFileStore
from langchain.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain.embeddings import CacheBackedEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# cache_store = LocalFileStore("./mxbai_cache_v2/")
# Load txt files from dir
loader = DirectoryLoader('../extracted_files', glob="*.txt", loader_cls=TextLoader, show_progress=True)
docs = loader.load()
# Chunking
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=256,
chunk_overlap=64,
)
chunked = text_splitter.split_documents(docs)
# model = AutoModel.from_pretrained('mixedbread-ai/mxbai-embed-large-v1', trust_remote_code=True)
model_name = "mixedbread-ai/mxbai-embed-large-v1"
model_kwargs = {'device': 'cpu'}
embeddings_model = HuggingFaceEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
)
# embeddings_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
embeddings_model, cache_store, namespace="mixedbread-ai/mxbai-embed-large-v1")
db = FAISS.from_documents(chunked, cached_embedder)
db.save_local("mxbai_faiss_index_v2")
print("Embeddings saved ...")