import os
import logging
import xml.etree.ElementTree as ET

import requests
import chromadb
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import Chroma, FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

# NOTE: this script works only with a sitemap.xml URL.
# Uncomment to enable logging:
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Parameters to edit
Chunk_size = 2000
Chunk_overlap = 100
Sitemap_url = "https://www.malakoffhumanis.com/sitemap.xml"

SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"


def langchain_web_scraper(sitemap_url, chunk_size=1000, chunk_overlap=100):
    """Fetch a sitemap.xml, scrape every listed URL and split the pages into chunks.

    Returns a list of LangChain Documents ready to be embedded.
    """
    # Fetch and parse the sitemap.xml file
    response = requests.get(sitemap_url)
    tree = ET.fromstring(response.content)

    # Extract page URLs from the sitemap
    urls = []
    for url in tree.findall(SITEMAP_NS + "url"):
        loc = url.find(SITEMAP_NS + "loc")
        if loc is not None and loc.text:
            urls.append(loc.text)
    print("len(urls)", len(urls))

    # Scrape every page with Unstructured
    loader = UnstructuredURLLoader(urls=urls)
    data = loader.load()

    # Split the raw pages into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # separators=[" ", "\n"]
    )
    documents = text_splitter.split_documents(data)
    return documents


def store_vdb_faiss(documents, hf_embs, save_path="faiss_MHCOM"):
    """Embed the documents and persist them in a local FAISS index."""
    db = FAISS.from_documents(documents, hf_embs)
    db.save_local(save_path)


def store_vdb_chroma(documents, hf_embs, save_path="chroma_MHCOM"):
    """Embed the documents and persist them in a local Chroma collection."""
    abs_path = os.path.dirname(os.path.abspath(__file__))
    db_dir = os.path.join(abs_path, save_path)

    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=db_dir,
        anonymized_telemetry=False,
    )
    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=hf_embs,
        client_settings=client_settings,
        persist_directory=db_dir,
    )
    vectorstore.add_documents(documents=documents)
    vectorstore.persist()


def main():
    print("scraping website")
    documents = langchain_web_scraper(sitemap_url=Sitemap_url,
                                      chunk_size=Chunk_size,
                                      chunk_overlap=Chunk_overlap)

    print("loading embeddings")
    embeddings_model_name = "sentence-transformers/all-mpnet-base-v2"
    hf_embs = HuggingFaceEmbeddings(model_name=embeddings_model_name,
                                    model_kwargs={"device": "cuda"})

    print("storing chunks in vector db")
    store_vdb_faiss(documents=documents,
                    hf_embs=hf_embs,
                    save_path="faiss_MH_c{}_o{}".format(Chunk_size, Chunk_overlap))
    # store_vdb_chroma(documents=documents,
    #                  hf_embs=hf_embs,
    #                  save_path="chroma_MH_c{}_o{}".format(Chunk_size, Chunk_overlap))


if __name__ == '__main__':
    main()
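

# --- Example (sketch): querying the persisted FAISS index ---
# Illustrative only, not part of the pipeline above. It assumes the index was
# built by store_vdb_faiss() with the default parameters, so the directory name
# "faiss_MH_c2000_o100" and the French query string are placeholders; the same
# embedding model must be used for loading as for indexing.
#
# hf_embs = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# db = FAISS.load_local("faiss_MH_c2000_o100", hf_embs)
# hits = db.similarity_search("garanties de la mutuelle santé", k=4)
# for doc in hits:
#     print(doc.metadata.get("source"), doc.page_content[:200])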