import os
import xml.etree.ElementTree as ET

import chromadb
import requests
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS

# Chunking parameters and sitemap source, shared by main().
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 100
SITEMAP_URL = "https://www.malakoffhumanis.com/sitemap.xml"

def langchain_web_scraper(sitemap_url, chunk_size=1000, chunk_overlap=100):
    """Scrape every page listed in a sitemap and split the text into chunks.

    Downloads the sitemap, collects the <loc> URL of each <url> entry,
    loads the pages with UnstructuredURLLoader, and splits the loaded
    documents with RecursiveCharacterTextSplitter.
    """
    response = requests.get(sitemap_url, timeout=30)
    response.raise_for_status()
    tree = ET.fromstring(response.content)

    # Sitemap entries live in the sitemaps.org 0.9 XML namespace.
    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    urls = []
    for url in tree.findall(ns + "url"):
        loc = url.find(ns + "loc")
        if loc is not None and loc.text:
            urls.append(loc.text)
    print("len(urls)", len(urls))

    loader = UnstructuredURLLoader(urls=urls)
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(data)
    return documents
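
# A minimal pre-filtering sketch, not part of the original pipeline: drop
# sitemap entries that point at binary assets before loading them. The
# `allowed_prefix` and `excluded_suffixes` defaults are assumptions for
# illustration only.
def filter_sitemap_urls(urls,
                        allowed_prefix="https://www.malakoffhumanis.com",
                        excluded_suffixes=(".pdf", ".jpg", ".png")):
    """Keep only URLs under allowed_prefix that are not binary assets."""
    return [u for u in urls
            if u.startswith(allowed_prefix)
            and not u.lower().endswith(excluded_suffixes)]
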
def store_vdb_faiss(documents, hf_embs, save_path="faiss_MHCOM"):
    """Embed the documents with hf_embs and persist a local FAISS index."""
    db = FAISS.from_documents(documents, hf_embs)
    db.save_local(save_path)
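
# A minimal usage sketch, assuming an index saved by store_vdb_faiss and
# the same embedding model: reload the index and run a similarity search.
# The default query string is illustrative only.
def query_faiss(save_path, hf_embs, query="garanties sante", k=3):
    db = FAISS.load_local(save_path, hf_embs)
    return db.similarity_search(query, k=k)
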
def store_vdb_chroma(documents, hf_embs, save_path="chroma_MHCOM"):
    """Embed the documents with hf_embs and persist a local Chroma collection."""
    ABS_PATH = os.path.dirname(os.path.abspath(__file__))
    DB_DIR = os.path.join(ABS_PATH, save_path)

    # Legacy Chroma configuration: the duckdb+parquet backend persists the
    # collection as parquet files under DB_DIR.
    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=hf_embs,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )
    # The embedding function is already bound to the store, so passing it
    # again to add_documents is unnecessary.
    vectorstore.add_documents(documents=documents)
    vectorstore.persist()
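
# A minimal usage sketch, assuming a collection persisted by
# store_vdb_chroma and the same embedding model: reopen the store and
# query it. The default query string is illustrative only.
def query_chroma(persist_dir, hf_embs, query="garanties sante", k=3):
    db = Chroma(collection_name="langchain_store",
                embedding_function=hf_embs,
                persist_directory=persist_dir)
    return db.similarity_search(query, k=k)
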
def main():
    print("scraping website")
    documents = langchain_web_scraper(sitemap_url=SITEMAP_URL,
                                      chunk_size=CHUNK_SIZE,
                                      chunk_overlap=CHUNK_OVERLAP)

    print("loading embeddings")
    embeddings_model_name = "sentence-transformers/all-mpnet-base-v2"
    hf_embs = HuggingFaceEmbeddings(model_name=embeddings_model_name,
                                    model_kwargs={"device": "cuda"})

    print("storing chunks in vector db")
    store_vdb_faiss(documents=documents,
                    hf_embs=hf_embs,
                    save_path="faiss_MH_c{}_o{}".format(CHUNK_SIZE,
                                                        CHUNK_OVERLAP))
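    # Optional step, not in the original flow: the Chroma variant defined
    # above could be used the same way, e.g.
    # store_vdb_chroma(documents=documents, hf_embs=hf_embs,
    #                  save_path="chroma_MH_c{}_o{}".format(CHUNK_SIZE,
    #                                                       CHUNK_OVERLAP))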

if __name__ == '__main__':
    main()