import requests
import xml.etree.ElementTree as ET
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import Chroma, FAISS
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb
import logging
import os

# NOTE: this script only works with a sitemap.xml URL, not with arbitrary page URLs

# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Parameters to edit (module-level, so no `global` statement is needed here)
Chunk_size = 2000
Chunk_overlap = 100
Sitemap_url = "https://www.malakoffhumanis.com/sitemap.xml"

def langchain_web_scraper(sitemap_url, chunk_size=1000, chunk_overlap=100):
    """
    Fetch a sitemap.xml, scrape every URL it lists with UnstructuredURLLoader,
    and split the pages into chunks with RecursiveCharacterTextSplitter.
    Returns the list of chunked langchain Documents.
    """
    # Fetch and parse the sitemap.xml file
    response = requests.get(sitemap_url)
    response.raise_for_status()
    tree = ET.fromstring(response.content)
    # Extract page URLs from the sitemap
    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    urls = []
    for url in tree.findall(ns + "url"):
        loc = url.find(ns + "loc")
        if loc is not None and loc.text:
            urls.append(loc.text)
    print("len(urls)", len(urls))
    # scrape every page listed in the sitemap
    loader = UnstructuredURLLoader(urls=urls)
    data = loader.load()

    # split the scraped pages into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # separators=[" ", "\n"]
    )

    documents = text_splitter.split_documents(data)
    return documents

def store_vdb_faiss(documents=[], hf_embs=None, save_path="faiss_MHCOM"):
    """
    Embed the documents with hf_embs and save them in a local FAISS index at save_path.
    """
    db = FAISS.from_documents(documents, hf_embs)
    db.save_local(save_path)

def store_vdb_chroma(documents=[], hf_embs=None, save_path="chroma_MHCOM"):
    """
    Embed the documents with hf_embs and persist them in a local Chroma collection
    stored under save_path, next to this script.
    """
    ABS_PATH = os.path.dirname(os.path.abspath(__file__))
    DB_DIR = os.path.join(ABS_PATH, save_path)

    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False
    )
    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=hf_embs,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )
    # the embedding function is already bound via embedding_function above
    vectorstore.add_documents(documents=documents)
    vectorstore.persist()
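
# Not part of the original pipeline: a minimal sketch of how the persisted FAISS
# index could be loaded back and queried. The default save_path, query string and k
# below are illustrative assumptions, not values used elsewhere in this script.
def load_and_query_faiss(save_path="faiss_MHCOM", hf_embs=None, query="mutuelle santé", k=4):
    """
    Load a FAISS index saved by store_vdb_faiss and return the k most similar chunks.
    """
    db = FAISS.load_local(save_path, hf_embs)
    return db.similarity_search(query, k=k)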


def main():
    print("scraping website")
    documents = langchain_web_scraper(sitemap_url=Sitemap_url,
                                      chunk_size=Chunk_size,
                                      chunk_overlap=Chunk_overlap)
    # store the chunks in a FAISS vector DB
    print("load embeddings")
    embeddings_model_name = "sentence-transformers/all-mpnet-base-v2"
    # "device": "cuda" assumes a GPU is available; switch to "cpu" otherwise
    hf_embs = HuggingFaceEmbeddings(model_name=embeddings_model_name,
                                    model_kwargs={"device": "cuda"})

    print("storing chunks in vector db")
    store_vdb_faiss(documents=documents,
                    hf_embs=hf_embs,
                    save_path="faiss_MH_c{}_o{}".format(str(Chunk_size),
                                                        str(Chunk_overlap)))
    
    # store_vdb_chroma(documents=documents,
    #                 hf_embs=hf_embs,
    #                 save_path="chroma_MH_c{}_o{}".format(str(Chunk_size),
    #                                                     str(Chunk_overlap)))

if __name__ == '__main__':
    main()