heikowagner committed on
Commit
40c3ade
0 Parent(s):

Duplicate from heikowagner/GPT-Docker

.dockerignore ADDED
@@ -0,0 +1,12 @@
+ ./docker/zeppelin/logs/*
+ *.openaiapikey*
+ *.log
+ *.log.*
+ *__pycache__*
+ root
+ *.ipynb_checkpoints*
+ .vscode
+ /app/mymodels
+ /app/.cache
+ /app/VectorStore
+ *chroma-embeddings.parquet*
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,12 @@
+ ./docker/zeppelin/logs/*
+ *.openaiapikey*
+ *.log
+ *.log.*
+ *__pycache__*
+ root
+ *.ipynb_checkpoints*
+ .vscode
+ /app/mymodels
+ /app/.cache
+ /app/VectorStore
+ *chroma-embeddings.parquet*
Dockerfile ADDED
@@ -0,0 +1,52 @@
+ # Navigate to your user folder: cd $env:USERPROFILE\AppData\Local\Docker\wsl\data
+ # Enter the following command: resize-vhd -Path .\ext4.vhdx -SizeBytes 300GB; after that I was able to continue building with docker-compose.
+
+ FROM python:latest AS builder
+ RUN apt update -y
+ RUN apt install -y git git-lfs make gcc g++ libgmp-dev libmpfr-dev libmpc-dev
+ RUN git lfs install
+ RUN git clone https://github.com/ggerganov/llama.cpp
+ RUN cd llama.cpp && make
+ RUN git clone https://huggingface.co/nyanko7/LLaMA-7B
+ RUN ls -la
+ RUN cp -r ./LLaMA-7B ./llama.cpp/models
+ RUN ls -la ./llama.cpp/models/LLaMA-7B
+ # convert the 7B model to ggml FP16 format
+ WORKDIR llama.cpp
+ RUN python3 -m pip install -r requirements.txt
+ RUN python3 convert.py ./models/LLaMA-7B
+ # quantize the model to 4 bits (using the q4_0 method)
+ RUN mkdir ./models/7B/
+ RUN ./quantize ./models/LLaMA-7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+
+ FROM tensorflow/tensorflow:latest-gpu
+ WORKDIR /app
+ COPY --from=builder /llama.cpp/models/7B/ ./mymodels/LLaMA-7B
+ # RUN apt-get upgrade -y
+ RUN apt update -y
+ RUN apt install -y git git-lfs
+ RUN apt install -y make wget git gcc g++ lhasa libgmp-dev libmpfr-dev libmpc-dev flex bison gettext texinfo ncurses-dev autoconf rsync
+ COPY ./requirements.txt requirements.txt
+ RUN pip install -r requirements.txt
+ COPY ./app .
+ #RUN python load_docs.py
+ #RUN --mount=type=secret,id=OPENAI_API_KEY \
+ #    cat /run/secrets/OPENAI_API_KEY > .openaiapikey
+ RUN echo "" > .openaiapikey
+ RUN mkdir /.cache
+ RUN mkdir /nltk_data
+ RUN mkdir /VectorStore
+ RUN mkdir /app/.cache
+ RUN mkdir /mymodels
+ RUN ls -la
+ RUN python run.py
+ RUN chmod 777 /VectorStore
+ RUN chmod 777 /mymodels
+ RUN chmod 777 /nltk_data
+ RUN chmod 777 /.cache
+ RUN chmod 777 /app/.cache
+ RUN chmod 777 /app/mymodels
+ RUN chmod 777 /app/VectorStore/
+ CMD ["streamlit", "run", "app.py", "--server.port=7860"]
+ #CMD ls -la
+ EXPOSE 7860
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: myRetrievalGPT
+ emoji: 🔥
+ colorFrom: green
+ colorTo: purple
+ sdk: docker
+ pinned: true
+ duplicated_from: heikowagner/GPT-Docker
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app/app.py ADDED
@@ -0,0 +1,65 @@
+ import streamlit as st
+ import load_model
+ import utils as ut
+ import elements as el
+ import os
+ import torch
+ import psutil
+
+ persist_directory = load_model.persist_directory
+ st.title('myRetrievalGPT')
+ st.header('A GPT retrieval example brought to you by Heiko Wagner')
+
+ st.markdown(r'*Let $\phi$ be a word embedding mapping $W$ → $\mathbb{R}^n$, where $W$ is the word space and $\mathbb{R}^n$ is an $n$-dimensional vector space; then $\phi(king)-\phi(man)+\phi(woman)=\phi(queen)$.*')
+
+ agree = st.checkbox('Load new Documents')
+ if agree:
+     el.load_files()
+ else:
+
+     import torch
+     torch.cuda.empty_cache()
+
+     st.write(str(torch.cuda.is_available()) + str(psutil.virtual_memory()))
+     model_type = st.selectbox(
+         'Select the model to be used to answer your question',
+         ('OpenAI', 'decapoda-research/llama-7b-hf (gpu+cpu)', 'llama-7b 4bit (cpu only)',))
+
+     if model_type == 'OpenAI':
+         if 'openai_key' not in st.session_state:
+             openai_key = st.text_area('OpenAI Key:', '')
+             if len(openai_key) > 0:
+                 st.session_state['openai_key'] = openai_key
+                 os.environ["OPENAI_API_KEY"] = openai_key
+         else:
+             os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
+         llm = load_model.load_openai_model()
+     elif model_type == 'decapoda-research/llama-7b-hf (gpu+cpu)':
+         # Add more models here
+         if not torch.cuda.is_available() and psutil.virtual_memory().available < 18254768640:
+             st.write('You do not have enough memory to use this model: ' + str(psutil.virtual_memory().available))
+         else:
+             llm = load_model.load_gpu_model("decapoda-research/llama-7b-hf")
+     else:
+         llm = load_model.load_cpu_model()
+
+
+     collections = ut.retrieve_collections()
+     option = st.selectbox(
+         'Select the Documents to be used to answer your question',
+         collections)
+
+     st.write('You selected:', option['name'])
+
+     chain = load_model.create_chain(llm, collection=option['name'], model_name=option['model_name'], metadata=option['metadata'])
+     query = st.text_area('Ask a question:', 'Hello, how are you today?')
+     result = chain({"query": query + " Add a score of the probability that your answer is correct to your answer"})
+     ut.format_result_set(result)
+
+     #from langchain.chains import ConversationChain
+     #from langchain.memory import ConversationBufferMemory
+
+     #conversation = ConversationChain(
+     #    llm=chat,
+     #    memory=ConversationBufferMemory()
+     #)
app/elements.py ADDED
@@ -0,0 +1,67 @@
+
+ import streamlit as st
+ from langchain.docstore.document import Document
+ from chromadb.config import Settings
+ from load_model import load_embedding
+ from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
+ from utils import retrieve_collections, get_chroma_client
+
+ def llm_module():
+     pass
+
+ def load_files():
+
+     client = get_chroma_client()
+
+     option = st.radio(
+         "",
+         options=["Add Documents", "Start new collection"],
+     )
+
+     if option == "Add Documents":
+         collections = retrieve_collections()
+         selected_collection = st.selectbox(
+             'Add to an existing collection or create a new one',
+             collections)
+         if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
+             client.delete_collection(name=selected_collection["name"])
+             #retrieve_collections.clear()
+             collections = retrieve_collections()
+
+         if selected_collection:
+             st.write("Selected Vectorstore:", selected_collection)
+             option = st.radio(
+                 "",
+                 options=["Upload Files from Local", "Upload Files from Web"],
+             )
+             if option == "Upload Files from Local":
+                 st.write('Source Documents:')
+                 uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
+                 chunk_size = st.text_area('Chunk size:', 1000)
+
+                 if st.button('Upload'):
+                     docs = load_from_file(uploaded_files)
+                     sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
+                     vec1 = create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
+                     st.write("Upload successful")
+             else:
+                 st.write('URLs of Source Documents (comma separated):')
+                 urls = st.text_area('Urls:', '')
+                 chunk_size = st.text_area('Chunk size:', 1000)
+                 urls = urls.replace(" ", "").replace('"', "").split(',')  # strip whitespace and quotes, then split on commas
+
+                 if st.button('Upload'):
+                     docs = load_from_web(urls)
+                     sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
+                     vec2 = create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
+                     st.write("Upload successful")
+     else:
+         collection = st.text_area('Name of your new collection:', '')
+         model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
+         if st.button('Create'):
+             if len(collection) > 3:
+                 ef = load_embedding(model_name)
+                 metadata = {"loaded_docs": [], "Subject": "Terms Example", "model_name": ef.model_name}
+                 client.create_collection(collection, embedding_function=ef, metadata=metadata)
+                 # retrieve_collections.clear()
+                 st.write("Collection " + collection + " successfully created.")
app/exploration.py ADDED
@@ -0,0 +1,51 @@
+ # %%
+
+ from utils import retrieve_collections, get_chroma_client
+
+
+ from load_model import load_embedding
+
+ #retrieve_collections()
+
+ client = get_chroma_client()
+
+ # %%
+ client.reset()
+ # %%
+ collections = tuple([collection.name for collection in client.list_collections()])  ## No embedding function has been stored in the collection...
+
+ ef = load_embedding("hkunlp/instructor-large")
+ collection = "heikostest2"
+ client.create_collection(collection, embedding_function=ef, metadata={"loaded_docs": []})
+
+
+ # %%
+ my_col = client.list_collections()
+
+ # %%
+ my_col.embedding_function
+
+ # %%
+ from langchain.vectorstores import Chroma
+ import load_model
+
+ from load_model import load_embedding
+
+ persist_directory = load_model.persist_directory
+
+ ef = load_embedding("hkunlp/instructor-large")
+ vectorstore = Chroma(
+     collection_name="papers",
+     embedding_function=ef,
+     persist_directory=persist_directory,
+ )
+
+ # %%
+ query = "What did the president say about Ketanji Brown Jackson"
+ docs = vectorstore.similarity_search(query)
+
+
+ # %%
+ docs
+ # %%
+ vectorstore.similarity_search_with_score(query)
app/load_model.py ADDED
@@ -0,0 +1,122 @@
+ # %%
+ # git clone https://huggingface.co/nyanko7/LLaMA-7B
+ # python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu117/torch2.00/index.html
+ # apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
+ from transformers import LlamaForCausalLM, LlamaTokenizer
+ from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings, OpenAIEmbeddings
+ from langchain.llms import LlamaCpp, HuggingFacePipeline
+ from langchain.vectorstores import Chroma
+ from transformers import pipeline
+ import torch
+ torch.backends.cuda.matmul.allow_tf32 = True
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ import streamlit as st
+ import cloudpickle
+ import os
+ from langchain.chains import RetrievalQA
+ from langchain.indexes import VectorstoreIndexCreator
+ from langchain.llms import OpenAI
+ import multiprocessing
+
+ from chromadb.config import Settings
+ import chromadb
+
+ import pathlib
+
+ current_path = str(pathlib.Path(__file__).parent.resolve())
+ print(current_path)
+ persist_directory = current_path + "/VectorStore"
+
+ # %%
+ @st.cache_resource
+ def load_cpu_model():
+     """Does not work at the moment because the CPU model is not persisted."""
+     model_path = "./mymodels/LLaMA-7B/ggml-model-q4_0.bin"
+     device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
+     llm = LlamaCpp(
+         model_path=model_path,
+         n_ctx=6000,
+         n_threads=multiprocessing.cpu_count(),
+         temperature=0.6,
+         top_p=0.95
+     )
+
+     llama_embeddings = LlamaCppEmbeddings(model_path=model_path)
+     return llm
+
+ @st.cache_resource(max_entries=1)
+ def load_gpu_model(used_model):
+     torch.cuda.empty_cache()
+     tokenizer = LlamaTokenizer.from_pretrained(used_model)
+
+     if not torch.cuda.is_available():
+         device_map = {
+             "": "cpu"
+         }
+         quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
+         torch_dtype = torch.float32
+         load_in_8bit = False
+     else:
+         device_map = "auto"
+         quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)  # atm no offload, bc device_map="auto"
+
+
+     base_model = LlamaForCausalLM.from_pretrained(
+         used_model,
+         device_map=device_map,
+         offload_folder=current_path + "/models_gpt/",
+         low_cpu_mem_usage=True,
+         quantization_config=quantization_config,
+         cache_dir=current_path + "/mymodels/"
+     )
+     pipe = pipeline(
+         "text-generation",
+         model=base_model,
+         tokenizer=tokenizer,
+         max_length=8000,
+         temperature=0.6,
+         top_p=0.95,
+         repetition_penalty=1.2
+     )
+     llm = HuggingFacePipeline(pipeline=pipe)
+     return llm
+
+ #@st.cache_resource
+ def load_openai_model(temperature=0.9):
+     return OpenAI(temperature=temperature)
+
+ @st.cache_resource
+ def load_openai_embedding():
+     return OpenAIEmbeddings()
+
+ #@st.cache_resource
+ def load_embedding(model_name):
+     embeddings = HuggingFaceInstructEmbeddings(
+         query_instruction="Represent the query for retrieval: ",
+         model_name=model_name,
+         cache_folder=current_path + "/mymodels/"
+     )
+     return embeddings
+
+ def load_vectorstore(model_name, collection, metadata):
+     embeddings = load_embedding(model_name)
+     client_settings = Settings(
+         chroma_db_impl="duckdb+parquet",
+         persist_directory=persist_directory,
+         anonymized_telemetry=False
+     )
+     vectorstore = Chroma(
+         collection_name=collection,
+         embedding_function=embeddings,
+         client_settings=client_settings,
+         persist_directory=persist_directory,
+         collection_metadata=metadata
+     )
+     return vectorstore
+
+ def create_chain(_llm, collection, model_name, metadata):
+     vectorstore = load_vectorstore(model_name, collection, metadata=metadata)
+     retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
+     chain = RetrievalQA.from_chain_type(llm=_llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
+     return chain
+ # %%
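
For orientation, a minimal usage sketch of how the helpers in app/load_model.py fit together. This mirrors what app/run.py and app/app.py do; the "papers" collection, its metadata, and the query string are taken from run.py and are otherwise purely illustrative, and an OPENAI_API_KEY is assumed to be set.

    import load_model

    # pick one of the three loaders defined above
    llm = load_model.load_openai_model(temperature=0.1)

    # create_chain() opens the Chroma collection, builds a k=4 retriever and
    # wraps it in a RetrievalQA chain that also returns the source documents
    metadata = {"loaded_docs": [], "Subject": "Heikos Papers", "model_name": "hkunlp/instructor-large"}
    chain = load_model.create_chain(llm, collection="papers", model_name="hkunlp/instructor-large", metadata=metadata)
    result = chain({"query": "What is a transformer?"})
    print(result["result"])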
app/load_test.py ADDED
@@ -0,0 +1,29 @@
+ # %%
+ # %%
+ import os
+ import pathlib
+
+ from load_model import load_embedding
+ from utils import get_chroma_client
+ from load_vectors import load_from_web, create_and_add, load_and_split
+
+ collection = "axaterms"
+ client = get_chroma_client()
+ # Load collection to get metadata
+ loaded_collection = client.get_collection(collection)
+
+ # %%
+ model_name = loaded_collection.metadata['model_name']
+
+ # %%
+ print(loaded_collection.json())
+
+
+ # %%
+ client.get_collection(collection).json()  # add_documents destroys the metadata... maybe :)
+ # %%
+
+ #loaded_collection.modify(metadata={"Test":99})
+
+ # %%
+ loaded_collection.json()
app/load_vectors.py ADDED
@@ -0,0 +1,121 @@
+ # %%
+ import nltk
+ from langchain.indexes import VectorstoreIndexCreator
+ from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
+ from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
+ from chromadb.config import Settings
+ import chromadb
+ from chromadb.utils import embedding_functions
+ from hashlib import sha256
+ import cloudpickle
+ import logging
+ import os
+ from load_model import load_embedding, load_vectorstore
+ import torch
+ import re
+ import pathlib
+ import tempfile
+
+
+ current_path = str(pathlib.Path(__file__).parent.resolve())
+
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+ nltk.download('punkt')
+
+ persist_directory = current_path + "/VectorStore"
+ logger = logging.getLogger()
+
+
+ # %%
+
+ def create_collection(collection_name, model_name, client):
+     """Not used atm"""
+     if not torch.cuda.is_available():
+         device = "cpu"
+     else:
+         device = "cuda"
+     ef = embedding_functions.InstructorEmbeddingFunction(
+         model_name=model_name, device=device)
+     client.get_or_create_collection(collection_name, embedding_function=ef)
+     return True
+
+ def create_and_add(collection_name, sub_docs, model_name, metadata):
+     logging.info(f"Adding documents to {collection_name}")
+     embeddings = load_embedding(model_name)
+     vectorstore = load_vectorstore(model_name, collection_name, metadata=metadata)
+     vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
+     vectorstore.persist()
+
+     # Test Vectorstore
+     vectorstore2 = load_vectorstore(model_name, collection_name, metadata=metadata)
+     print(vectorstore2.similarity_search_with_score(query="What is a transformer llm?", k=4))
+
+     return True
+
+ def load_from_file(files):
+
+     saved_files = []
+     with tempfile.TemporaryDirectory() as tmpdirname:
+         for file in files:
+             temp_dir = pathlib.Path(tmpdirname)
+             file_name = os.path.join(temp_dir, file.name)
+             saved_files.append(file_name)
+             with open(file_name, mode='wb') as w:
+                 w.write(file.read())
+
+         # load while the temporary files still exist
+         print(saved_files)
+         loaders = [UnstructuredPDFLoader(pdf) for pdf in saved_files]
+         docs = []
+         print(loaders)
+         for loader in loaders:
+             docs.extend(loader.load())
+     return docs
+
+ def load_from_web(urls, cache=True):
+     docs_list = urls
+     filename = f"{current_path}/.cache/{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"
+
+     isFile = os.path.isfile(filename)
+
+     if cache and isFile:
+         logger.info("Using Cache")
+         pikd = open(filename, "rb")
+         docs = cloudpickle.load(pikd)
+     else:
+         loaders = [OnlinePDFLoader(pdf) for pdf in docs_list]
+         docs = []
+         for loader in loaders:
+             docs.extend(loader.load())
+         with open(filename, 'wb') as output:
+             cloudpickle.dump(docs, output)
+
+     # update metadata
+     i = 0
+     for doc in docs:
+         doc.metadata = {'source': docs_list[i], 'url': docs_list[i], 'owner': 'Heiko Wagner'}
+         i = i + 1
+     return docs
+
+ def load_and_split(docs, chunk_size=700):
+     text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
+     sub_docs = text_splitter.split_documents(docs)
+     return sub_docs
+
+ def metadata_generator(doc, llm, max_token=4000):
+     #query = f"Document = {doc.page_content[1:max_token]} -> Respond with Python code using a dict, filling xxxx like {{'document_type': xxxx, 'summary (max. 30 letters)':'xxxx'}}; respond with at least 10 letters"
+     query = f"""
+     Cluster the following input document into topic categories based on patterns seen within the text. Also mention the reasoning behind how these categories were defined.
+     Output format:
+     {{
+         "DOCUMENT TYPE": "",
+         "SUMMARY": [],
+         "REASONING": ""
+     }}
+
+     Input document:
+     {doc.page_content[1:max_token]}
+     Output:
+     """
+     return llm(query)
app/playground/load_docs.py ADDED
@@ -0,0 +1,155 @@
+ # %%
+ from load_vectors import load_from_web, load_and_split, create_and_add
+
+ docs = [
+ "https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/85ec0278-bf2f-4392-94b9-c086717fa8f6_axa_urd2022_accessible_va.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/d97a94ff-a848-474b-b802-c22afc8311cd_axa_half_year_2022_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/51954d53-c0cf-4f90-84f7-53ee27dbe4e6_axa_ri2021_va_accessible.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/e3f52b5e-d4aa-4fc8-8bcd-f432df86e804_axa_urd_2021_en_accessible.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/4f303cec-a12d-480b-accb-7b56f706f60e_axa-ri2020-en-accessible.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/d6aef906-e41f-40c7-ac9c-29044e98939d_AXA_URD_2020_EN_accessible_b.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F47b47783-ddd1-47c3-912f-bc6e318ebbb3_axa_half_year_2020_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ffd5a8bd8-9ef1-40eb-b953-c268c0ab4bf9_axa-ri2019-en-accessible.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F90abd6c7-80c4-48ef-84bf-1d038670d9b7_axa-urd2019-en.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F3ef6a9cc-6215-4e58-83b5-756774ef5b73_axa_half_year_2019_financial_report2.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F0a5e0bd9-78f2-4ef8-b32c-1d3d35ddce80_axa-ri2018-en-accessible.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F913d1869-3d11-4eb2-b013-4caedb747fab_axa-ddr2018b-en.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F476f79c9-c0c7-4ce3-88ed-4f99b3d22259_axa_half_year_2018_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F01f6966b-c26c-4935-91dc-1b296511ba8c_axa_ri2017_gb_planche.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fec440dc9-69df-41b5-a3af-5b5f4fc29670_axa_reference_document_2017c.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F72c59a61-8124-4066-a86d-bece5f41ce53_axa_us_statutory_statements_fy17.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9237d78f-c1ac-43ca-9623-d0382a5aaaec_axa_us_statutory_statements_3q17.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ffdd639e0-2ea6-4c3f-8a42-8bca4359e858_axa_us_statutory_statements_2q17.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F877e30a9-df72-480f-ac25-edcfcd4049c2_axa_us_statutory_statements_1q17.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F6f3108fd-fabc-4dc6-a984-23eb0dca7a19_axa-ri2016-en_01.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F268bab7a-2e78-4843-844a-fd3ad2d340bc_axa_reference_document_2016.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fd2f66d05-e6ad-47a2-ab72-9bc727bd49c2_axa_half_year_2016_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F7a5f0af2-03c3-4a82-a077-46fdc52e5685_axa_us_statutory_statements_fy16.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fdd643342-e975-473d-af54-c64491252a19_axa_us_statutory_statements_3q16.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F53e10a7a-9348-40dc-935e-01fb0a1d0441_axa_us_statutory_statements_2q16.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F8906bad6-14cb-4594-b7c0-029f8fc2172d_axa_us_statutory_statements_1q16.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F2d8e525a-1161-453a-a14f-817f0f070f79_axa_activity_cr_report_2015_accessible.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F73719a96-c3b1-456b-abaf-63b80c06968c_axa_reference_document_2015.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fe2936c1a-65f0-40db-b34b-bef9c27e91c0_axa_2015_half_year_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fdaac2a30-a3b8-4839-9331-041805836a6f_axa_us_statutory_statements_fy15.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F61a6c98a-08fb-4cb1-b6c0-4d1ef0f72aa9_axa_us_statutory_statements_3q15.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fe0689ffc-5aec-4388-a10e-26d1d1a7eb9a_axa_us_statutory_statements_2q15.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fbfa8ef5b-6533-4773-8502-5170a51735c9_axa_us_statutory_statements_1q15.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fbbb94857-f5d4-4afd-81d0-e85666883936_axa_annual+financial+report_2014.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fb826839c-76c9-48c7-b8c1-9eda7fe3b032_axa_activity_csr_report_2014_va_b.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fac63e0f9-60ba-47c2-9e23-f1d25731c7ee_axa_2014_half_year_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fb7db2a55-8eb6-4131-bc03-698e4bc756d6_axa_us_statutory_statements_fy2014.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F417b48df-c585-4cb6-9d10-719d81228756_axa_us_statutory_statements_3q14.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F4586d978-6fb8-4c44-b934-e15c14143b6d_axa_us_statutory_statements_2q14.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F29cc016e-aff9-49c5-bb04-d55598aab844_axa_us_statutory_statements_1q14.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F25fee379-c187-40e7-bf3a-5fe1423cec0f_axa_annual+financial+report_2013.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F37614ed4-1fe0-483e-a0eb-0acefdedd065_axa_2013_half_year_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Febb51afc-af0e-4aff-9494-5b852b3233e5_axa_us_statutory_statements_fy2013.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fdd4cd68e-710e-4e00-ba96-c7560d738a43_axa_us_statutory_statements_3q13.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Feab93a81-859a-487c-941c-11e4ce08d5f0_axa_us_statutory_statements_2q13.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F879d09d7-8ff7-4c43-9a24-7ee44ee55404_axa_us_statutory_statements_1q13.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9224097f-d703-4efd-8050-6553ef4336f8_axa_annual+financial+report_2012b.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fcb9d1279-948a-4238-ab8f-754e9e10f2a5_axa_activity_csr_report_2012b_va.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fbed64ef2-5078-425a-a616-ffb1947e0b65_axa_2012_half_year_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fa148165a-b818-4ea1-b7ee-7949cc86ff9a_axa_us_statutory_statements_fy2012.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F601ed5e8-189d-4e59-b0d4-d1c1eedb2ffe_axa_us_statutory_statements_3q12.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9637b674-c740-4115-9c90-3a8827516cc0_axa_us_statutory_statements_2q12.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F319a5964-ea51-4d51-96c8-cf6838047b72_axa_us_statutory_statements_1q12.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F0b75d1fe-4b11-4462-9883-4e3bc7532bf4_axa_annual+financial+report_2011.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F17e098ab-3335-4ee1-ade7-058517a952c4_axa_activity_csr_report_2011_vab.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F03996908-7e75-465e-8082-b44f02da326a_axa_us_statutory_statements_fy2011.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fea813e84-7d08-4cf2-bea1-3a01fd4bdf62_axa_us_statutory_statements_3q11.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fda28b496-275b-451d-bffd-108714eb2c39_axa_us_statutory_statements_2q11.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fd6aa2b39-896e-47cf-9882-9985c8d44276_axa_us_statutory_statements_1q11.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fc76f47f4-0917-4fb1-b1ae-78e2a4fbcef5_axa_annual+financial+report_2010c+%281%29.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F68c2771e-5ed8-41d9-bb59-f37f6403b4bf_axa_activity_csr_report_2010_vac.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9b7812a1-a1a2-4e17-9bf2-88c11aac4e08_axa_2010_half_year_financial_report.pdf.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F3f4cc3bd-6823-4ccf-a918-f0c9d9063c2a_axa_us_statutory_statements_fy2010.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F5a8a399f-9a0a-4475-8fbd-5bc0ca1dffe6_axa_us_statutory_statements_3q10.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F67ba6c6d-7063-41d4-ad4e-75d86b15da43_axa_us_statutory_statements_1q10.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fa151a532-da4f-4d12-8b3b-9867df4f9724_axa_annual+financial+report_2009.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F5f89c4dd-d935-47fe-ac69-23fada9bfc96_axa_2009_half_year_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ff322c77a-e2a2-4cd7-88a0-edd8ad4cd021_axa_annual+financial+report_2008.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fb7f88f05-053a-460b-aa4d-6163d3644cfc_axa_activity_csr_report_2008_vad.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ff657a419-e066-485a-a58e-1d2870a6a035_axa_2008_half_year_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F820b669d-b3b5-4c14-986d-2223e2bcbcfb_axa_annual+financial+report_2007.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F2741b55e-9349-47ef-9704-3cbca0853b76_axa_activity_csr_report_2007.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F42159571-a3f1-4d36-b4b9-a5493fcc95e3_axa_2007_half_year_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F40f9da2a-1bcb-4e5e-9380-18f64b3ce86e_axa_annual+financial+report_2006b.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fa6a14e0c-62cd-4812-a2d0-3a0aae8c862d_axa_activity_csr_report_2006b.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Faf242b66-1308-4331-829f-fa91bd0db43e_axa_annual+financial+report_2005.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F6b3313d1-3b72-4f28-bc7b-f445b9b3190c_axa_activity_csr_report_2005.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F048b0d90-b28f-4fc3-bc30-b02cf8e0d6fc_axa_annual+financial+report_2004_ci.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F02acbd05-712f-4b73-93f0-dffa37e2faa2_axa_annual+financial+report_2004_ci.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fcf0b84a5-6da9-499d-985f-530559940494_axa_activity_csr_report_2004.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fafa397b5-d613-40f3-a28f-81bde0d461e2_axa_annual+financial+report_2003.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F2a31ebb9-ba04-4998-982e-9dd336abca1f_axa_annual+financial+report_2002.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F51e5f017-954b-4f81-84f9-15a086bf1e33_axa_annual+financial+report_2002_ci01.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F05fea38c-c626-4aaf-9ead-10e9c8f849c1_axa_annual+financial+report_2002_ci02.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F3e41d00d-42b3-4bfd-babc-8b9f76b73d95_axa_activity_csr_report_2002.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F4f2676f4-d36c-4d2e-b088-ef26878ff28b_axa_annual+financial+report_2001.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ffdfa0941-6fb5-4ce8-9f42-3b0152e72ce2_axa_activity_csr_report_2001.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F98922150-f1c5-4df4-9006-a8ef17a514cd_axa_annual+financial+report_2000.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F1a645a94-1c56-43be-9a5a-94495e902a23_axa_activity_csr_report_2000.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F51c109ca-2bba-45b3-a03b-78fdd16faeca_axa_annual+financial+report_1999.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F21cdedc6-c082-4ae6-abb3-4c57f0cf9dd8_axa_annual+financial+report_1998.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fd3132d9d-b656-470d-ba4f-fe8d51586e4b_axa_activity_csr_report_1998.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F746d88d3-a4f7-4126-b539-a5da353f53d7_axa_annual+financial+report_1997.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F97097956-6cd5-4fb4-a6ea-9aeb32fd9023_axa_activity_csr_report_1997.pdf"
+ ]
+
+
+ docs_tarifs = [
+ "https://www.axa.de/site/axa-de/get/documents_E1805589786/axade/medien/privatkunden/fahrzeugversicherungen/kfz-versicherung/start-and-drive/start-and-drive-versicherungsbedingungen.pdf",
+ "https://www.axa.de/site/axa-de/get/documents_E-298610932/axade/medien/privatkunden/haftpflicht-und-recht/rechtsschutz/versicherungsbedingungen-roland-rechtsschutz.pdf",
+ "https://www.axa.de/site/axa-de/get/documents_E101690225/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-S-5-mio.pdf",
+ "https://www.axa.de/site/axa-de/get/documents_E-1067805129/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-S-10-mio.pdf",
+ "https://www.axa.de/site/axa-de/get/documents_E1026401604/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-M.pdf",
+ "https://www.axa.de/site/axa-de/get/documents_E1450059874/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-L.pdf",
+ "https://www.axa.de/site/axa-de/get/documents_E1636759799/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-S.pdf",
+ "https://www.axa.de/site/axa-de/get/documents_E1147682774/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-M-20%25.pdf",
+ "https://www.axa.de/site/axa-de/get/documents_E1642308493/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-M-40%25.pdf",
+ "https://www.axa.de/site/axa-de/get/documents_E1883536226/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-L.pdf",
+ ]
+
+ docs_list = [
+ "https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/85ec0278-bf2f-4392-94b9-c086717fa8f6_axa_urd2022_accessible_va.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/e3f52b5e-d4aa-4fc8-8bcd-f432df86e804_axa_urd_2021_en_accessible.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/d6aef906-e41f-40c7-ac9c-29044e98939d_AXA_URD_2020_EN_accessible_b.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ffd5a8bd8-9ef1-40eb-b953-c268c0ab4bf9_axa-ri2019-en-accessible.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F0a5e0bd9-78f2-4ef8-b32c-1d3d35ddce80_axa-ri2018-en-accessible.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F01f6966b-c26c-4935-91dc-1b296511ba8c_axa_ri2017_gb_planche.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F6f3108fd-fabc-4dc6-a984-23eb0dca7a19_axa-ri2016-en_01.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fe2936c1a-65f0-40db-b34b-bef9c27e91c0_axa_2015_half_year_financial_report.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fbbb94857-f5d4-4afd-81d0-e85666883936_axa_annual+financial+report_2014.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F25fee379-c187-40e7-bf3a-5fe1423cec0f_axa_annual+financial+report_2013.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9224097f-d703-4efd-8050-6553ef4336f8_axa_annual+financial+report_2012b.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F0b75d1fe-4b11-4462-9883-4e3bc7532bf4_axa_annual+financial+report_2011.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fc76f47f4-0917-4fb1-b1ae-78e2a4fbcef5_axa_annual+financial+report_2010c+%281%29.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fa151a532-da4f-4d12-8b3b-9867df4f9724_axa_annual+financial+report_2009.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ff322c77a-e2a2-4cd7-88a0-edd8ad4cd021_axa_annual+financial+report_2008.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F820b669d-b3b5-4c14-986d-2223e2bcbcfb_axa_annual+financial+report_2007.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F40f9da2a-1bcb-4e5e-9380-18f64b3ce86e_axa_annual+financial+report_2006b.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Faf242b66-1308-4331-829f-fa91bd0db43e_axa_annual+financial+report_2005.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F02acbd05-712f-4b73-93f0-dffa37e2faa2_axa_annual+financial+report_2004_ci.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fafa397b5-d613-40f3-a28f-81bde0d461e2_axa_annual+financial+report_2003.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F2a31ebb9-ba04-4998-982e-9dd336abca1f_axa_annual+financial+report_2002.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F4f2676f4-d36c-4d2e-b088-ef26878ff28b_axa_annual+financial+report_2001.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F98922150-f1c5-4df4-9006-a8ef17a514cd_axa_annual+financial+report_2000.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F51c109ca-2bba-45b3-a03b-78fdd16faeca_axa_annual+financial+report_1999.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F21cdedc6-c082-4ae6-abb3-4c57f0cf9dd8_axa_annual+financial+report_1998.pdf"
+ ,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F746d88d3-a4f7-4126-b539-a5da353f53d7_axa_annual+financial+report_1997.pdf"
+ ]
+
+
+ docs = load_from_web(docs_tarifs)
+ sub_docs = load_and_split(docs, chunk_size=700)
+
+ # %%
+ create_and_add("axa_terms", sub_docs, "hkunlp/instructor-large")
+
+ docs = load_from_web(docs_list)
+ sub_docs = load_and_split(docs)
+
+ # %%
+ create_and_add("axa_gpt", sub_docs, "hkunlp/instructor-large")
app/playground/result.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5b61c0f601cb65f2779f18fdbe5bf47f88d61f23dfbe2afdafb64c951207da8
+ size 429
app/playground/st_render_doc.py ADDED
@@ -0,0 +1,9 @@
+ import streamlit as st
+ import utils as ut
+ import cloudpickle
+
+ filename = "./result.pkl"
+ pikd = open(filename, "rb")
+ result = dict(cloudpickle.load(pikd))
+ del pikd
+ ut.format_result_set(result)
app/run.py ADDED
@@ -0,0 +1,59 @@
+ # This script initializes the models and adds an example collection to the Vectorstore
+ # %%
+ import os
+ import pathlib
+
+ from load_model import load_embedding
+ from utils import get_chroma_client
+ from load_vectors import load_from_web, create_and_add, load_and_split, metadata_generator
+
+ current_path = str(pathlib.Path(__file__).parent.resolve())
+ with open(current_path + '/.openaiapikey', 'r') as reader:
+     os.environ['OPENAI_API_KEY'] = reader.read()
+ import load_model
+
+ # %%
+ #load_model.load_gpu_model("decapoda-research/llama-7b-hf")  # Download local model
+ #llm = load_model.load_openai_model()
+
+ # %%
+ # Load example data
+ client = get_chroma_client()
+ client.reset()
+ ef = load_embedding("hkunlp/instructor-large")
+ collection_name = "papers"
+ metadata = {"loaded_docs": [], "Subject": "Heikos Papers", "model_name": ef.model_name}
+ selected_collection = client.create_collection(collection_name, embedding_function=ef, metadata=metadata)
+
+ docs_tarifs = [
+     "https://edoc.hu-berlin.de/bitstream/handle/18452/5294/33.pdf",
+     "https://arxiv.org/pdf/1702.03556v3.pdf",
+     "https://arxiv.org/pdf/1706.03762"
+ ]
+
+ # %%
+ # Load collection to get metadata
+ loaded_collection = client.get_collection(collection_name)
+ model_name = loaded_collection.metadata['model_name']
+
+ # %%
+
+ docs = load_from_web(docs_tarifs)
+ sub_docs = load_and_split(docs, chunk_size=1000)
+ create_and_add(collection_name, sub_docs, model_name, metadata)
+
+
+
+ # %%
+ #chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name, metadata=metadata)
+ #result = chain({"query": "Ist mein Kinderwagen bei einem Leitungswasserschaden mitversichert?"})
+ #print(result)
+ #llm = load_model.load_openai_model(temperature=0.1)
+
+ #llm = load_model.load_cpu_model()
+
+ #meta = metadata_generator(docs[0], llm)
+ # %%
+ #print(meta)
+
+ # %%
app/utils.py ADDED
@@ -0,0 +1,35 @@
+ import streamlit as st
+ import latex2markdown
+ from langchain.docstore.document import Document
+ import chromadb
+ from chromadb.config import Settings
+ import load_model
+ from load_model import load_embedding
+ from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
+ persist_directory = load_model.persist_directory
+
+ def format_document(document: Document):
+     """TODO: Implement a nice style"""
+     return document.dict()
+
+ def format_result_set(result):
+     st.write(latex2markdown.LaTeX2Markdown(result["result"]).to_markdown())
+
+     agree = st.checkbox('Show source documents')
+     source_documents = result["source_documents"]
+     if agree:
+         st.write('Source Documents:')
+         for document in source_documents:
+             st.write(format_document(document))
+
+ #@st.cache_resource
+ def get_chroma_client():
+     return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
+                                     persist_directory=persist_directory
+                                     ))
+ #@st.cache_data
+ def retrieve_collections():
+     client = get_chroma_client()
+     all_collections = client.list_collections()
+     collections = tuple([{'name': collection.name, 'model_name': collection.metadata['model_name'], "metadata": collection.metadata} for collection in all_collections])
+     return collections
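
A note on the data shape: retrieve_collections() returns a tuple of plain dicts, and app/app.py relies on exactly these keys when it feeds the tuple into st.selectbox and then reads option['name'], option['model_name'] and option['metadata']. A small sketch of that contract; the collection name and metadata values are illustrative only.

    collections = (
        {
            "name": "papers",
            "model_name": "hkunlp/instructor-large",
            "metadata": {"loaded_docs": [], "Subject": "Heikos Papers", "model_name": "hkunlp/instructor-large"},
        },
    )
    option = collections[0]
    print(option["name"], option["model_name"], option["metadata"])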
docker-compose.yaml ADDED
@@ -0,0 +1,26 @@
+ version: "3.9"
+ services:
+   streamlit_app:
+     image: myretrievalgpt
+     build: .
+     tty: true
+     ports:
+       - 7860:7860
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - capabilities: [gpu]
+   dev_app:
+     image: myretrievalgpt
+     tty: true
+     volumes:
+       - ./app:/app
+       - ./root:/root
+     depends_on:
+       - streamlit_app
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - capabilities: [gpu]
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ #git+https://github.com/hwchase17/langchain.git
+ langchain==0.0.154
+ git+https://github.com/huggingface/transformers.git
+ #git+https://github.com/chroma-core/chroma.git
+ chromadb
+ accelerate
+ bitsandbytes
+ InstructorEmbedding
+ cloudpickle
+ streamlit
+ requests==2.28.0
+ latex2markdown
+ openai
+ unstructured[local-inference]
+ llama-cpp-python