# mh-explo / build_db.py
import requests
import xml.etree.ElementTree as ET
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import Chroma, FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb
import logging
import os
# Works with a sitemap.xml URL only.
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Params to edit
Chunk_size = 2000
Chunk_overlap = 100
Sitemap_url = "https://www.malakoffhumanis.com/sitemap.xml"
def langchain_web_scraper(sitemap_url, chunk_size=1000, chunk_overlap=100):
"""
"""
# Fetch the sitemap.xml file
response = requests.get(sitemap_url)
tree = ET.fromstring(response.content)
    # Extract page URLs from the sitemap
    urls = []
    for url in tree.findall("{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
        loc = url.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
        urls.append(loc)
    print("Found {} URLs in sitemap".format(len(urls)))
    # Scrape every URL, then split the resulting documents into overlapping chunks
    loader = UnstructuredURLLoader(urls=urls)
    data = loader.load()
    # separators=[" ", "\n"] can be passed to control where splits happen
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
documents = text_splitter.split_documents(data)
return documents
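
# Quick sanity check (illustrative sketch, not run by this script; the URL is
# a placeholder): scrape a small sitemap and inspect the first chunk.
# docs = langchain_web_scraper("https://example.com/sitemap.xml",
#                              chunk_size=1000, chunk_overlap=100)
# print(docs[0].page_content[:200], docs[0].metadata)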
def store_vdb_faiss(documents, hf_embs=None, save_path="faiss_MHCOM"):
    """Embed the documents and persist them as a local FAISS index."""
db = FAISS.from_documents(documents, hf_embs)
db.save_local(save_path)
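
# Illustrative helper (a sketch, not part of the original script): reload the
# index written by store_vdb_faiss and run a sample similarity search. Assumes
# the same embedding model that was used at build time.
def load_vdb_faiss(hf_embs, save_path="faiss_MHCOM", query=None, k=4):
    """Load a persisted FAISS index; optionally return the top-k hits for a query."""
    db = FAISS.load_local(save_path, hf_embs)
    if query is not None:
        return db.similarity_search(query, k=k)
    return db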
def store_vdb_chroma(documents, hf_embs=None, save_path="chroma_MHCOM"):
    """Embed the documents and persist them in a local Chroma collection."""
    # Persist the collection in a directory next to this file
    ABS_PATH = os.path.dirname(os.path.abspath(__file__))
    DB_DIR = os.path.join(ABS_PATH, save_path)
client_settings = chromadb.config.Settings(
chroma_db_impl="duckdb+parquet",
persist_directory=DB_DIR,
anonymized_telemetry=False
)
vectorstore = Chroma(
collection_name="langchain_store",
embedding_function=hf_embs,
client_settings=client_settings,
persist_directory=DB_DIR,
)
    # The embedding_function set on the store handles embedding the documents
    vectorstore.add_documents(documents=documents)
    vectorstore.persist()
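
# Illustrative helper (a sketch, not part of the original script): reopen the
# persisted Chroma collection for querying. Assumes the same collection_name
# and save_path used in store_vdb_chroma.
def load_vdb_chroma(hf_embs, save_path="chroma_MHCOM"):
    """Reload the persisted Chroma collection as a queryable vector store."""
    db_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), save_path)
    return Chroma(
        collection_name="langchain_store",
        embedding_function=hf_embs,
        persist_directory=db_dir,
    )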
def main():
print("scrapping website")
documents = langchain_web_scraper(sitemap_url=Sitemap_url,
chunk_size=Chunk_size,
chunk_overlap=Chunk_overlap)
    # Load the HuggingFace embedding model (assumes a CUDA GPU; set "cpu" otherwise)
    print("loading embeddings")
    embeddings_model_name = "sentence-transformers/all-mpnet-base-v2"
    hf_embs = HuggingFaceEmbeddings(model_name=embeddings_model_name,
                                    model_kwargs={"device": "cuda"})
print("storing chunks in vector db")
store_vdb_faiss(documents=documents,
hf_embs=hf_embs,
save_path="faiss_MH_c{}_o{}".format(str(Chunk_size),
str(Chunk_overlap)))
    # store_vdb_chroma(documents=documents,
    #                  hf_embs=hf_embs,
    #                  save_path="chroma_MH_c{}_o{}".format(Chunk_size, Chunk_overlap))
if __name__ == '__main__':
main()
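
# Usage (sketch): running `python build_db.py` scrapes the sitemap and writes
# the FAISS index directory (e.g. faiss_MH_c2000_o100) relative to the current
# working directory.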