itismouad committed on
Commit
b2b64bc
·
1 Parent(s): ffa9554

initial commit of app

Browse files
Files changed (6) hide show
  1. .gitignore +169 -0
  2. Dockerfile +11 -0
  3. app.py +89 -0
  4. chainlit.md +3 -0
  5. requirements.txt +9 -0
  6. utils.py +182 -0
.gitignore ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # misc
10
+ .DS_Store
11
+ .chainlit
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Ignore weights & biases folder
120
+ wandb/
121
+
122
+ # Celery stuff
123
+ celerybeat-schedule
124
+ celerybeat.pid
125
+
126
+ # SageMath parsed files
127
+ *.sage.py
128
+
129
+ # Environments
130
+ .env
131
+ .venv
132
+ env/
133
+ venv/
134
+ ENV/
135
+ env.bak/
136
+ venv.bak/
137
+
138
+ # Spyder project settings
139
+ .spyderproject
140
+ .spyproject
141
+
142
+ # Rope project settings
143
+ .ropeproject
144
+
145
+ # mkdocs documentation
146
+ /site
147
+
148
+ # mypy
149
+ .mypy_cache/
150
+ .dmypy.json
151
+ dmypy.json
152
+
153
+ # Pyre type checker
154
+ .pyre/
155
+
156
+ # pytype static type analyzer
157
+ .pytype/
158
+
159
+ # Cython debug symbols
160
+ cython_debug/
161
+
162
+ # PyCharm
163
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
164
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
165
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
166
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
167
+ #.idea/
168
+
169
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

# Run as an unprivileged user with a home directory for pip's --user installs.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Copy only the requirements first so the dependency layer stays cached until
# requirements.txt itself changes. COPY does not expand "~", so the destination
# is spelled with $HOME.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source last, owned by the runtime user.
COPY --chown=user . $HOME/app

CMD ["chainlit", "run", "app.py", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from operator import itemgetter
2
+ import chainlit as cl
3
+ from langchain.schema.runnable import RunnablePassthrough
4
+ from langchain.vectorstores import FAISS
5
+ from langchain.chains import RetrievalQA
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.prompts.chat import (
8
+ ChatPromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ HumanMessagePromptTemplate,
11
+ )
12
+
13
+ from utils import ArxivLoader, PineconeIndexer
14
+
15
# System prompt for the chat model; the retrieved documents are substituted
# for {context} when the prompt is rendered.
system_template = """
Use the provided context to answer the user's query.

You may not answer the user's query unless there is specific context in the following text.

If you do not know the answer, or cannot answer, please respond with "I don't know".

Context:
{context}
"""

# Chat prompt layout: the system instructions followed by the user's question.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]

prompt = ChatPromptTemplate(messages=messages)
# NOTE(review): chain_type_kwargs is not referenced by the app.py code shown
# in this commit; presumably left over from a RetrievalQA-based version.
chain_type_kwargs = {"prompt": prompt}
33
+
34
@cl.author_rename
def rename(orig_author: str):
    """Return the display name for a message author.

    "RetrievalQA" is mapped to "Learning about Nuclear Fission"; any other
    author name is returned unchanged.
    """
    display_names = {"RetrievalQA": "Learning about Nuclear Fission"}
    if orig_author in display_names:
        return display_names[orig_author]
    return orig_author
38
+
39
@cl.on_chat_start  # marks a function that will be executed at the start of a user session
async def start_chat():
    """Build the retrieval stack for a new chat session.

    Loads the arXiv papers, indexes them in Pinecone, creates the chat model,
    and stores the LLM and the retriever in the Chainlit user session under
    the keys "llm" and "retriever".
    """
    msg = cl.Message(content="Building Index...")
    await msg.send()

    # load documents from Arxiv
    # The loader is synchronous; running it through cl.make_async keeps the
    # event loop free while the papers are fetched.
    axloader = ArxivLoader()
    await cl.make_async(axloader.main)()

    # build index in Pinecone (also synchronous, so off-loaded the same way)
    pi = PineconeIndexer()
    pi.load_embedder()
    await cl.make_async(pi.index_documents)(axloader.documents)
    retriever = pi.get_vectorstore().as_retriever()
    print(pi.index.describe_index_stats())

    # build llm
    llm = ChatOpenAI(
        model="gpt-3.5-turbo",
        temperature=0,
    )

    # Update the existing status message in place instead of sending it again.
    msg.content = "Index built!"
    await msg.update()

    cl.user_session.set("llm", llm)
    cl.user_session.set("retriever", retriever)
67
+
68
@cl.on_message  # marks a function that should be run each time the chatbot receives a message from a user
async def main(message: cl.Message):
    """Answer a user message with the session's retrieval-augmented chain.

    Reads the "llm" and "retriever" objects stored in the user session,
    retrieves context for the message text, renders the prompt, and sends the
    model's reply back as a new Chainlit message.

    Args:
        message: The incoming Chainlit message; its ``content`` is used as
            the question.
    """
    llm = cl.user_session.get("llm")
    retriever = cl.user_session.get("retriever")

    retrieval_augmented_qa_chain = (
        {
            "context": itemgetter("question") | retriever,
            "question": itemgetter("question"),
        }
        # This step passes the values through unchanged, but it must stay:
        # with a runnable between them the two dict literals are composed into
        # a chain, whereas `dict | dict` alone would be a plain dict merge.
        | RunnablePassthrough.assign(
            context=itemgetter("context")
        )
        | {
            "response": prompt | llm,
            "context": itemgetter("context"),
        }
    )

    # Use the async entry point so retrieval and the LLM call do not block
    # the event loop inside this async handler.
    answer = await retrieval_augmented_qa_chain.ainvoke({"question": message.content})

    await cl.Message(content=answer["response"].content).send()
chainlit.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Pythonic RAQA with LangChain & Pinecone
2
+
3
+ This application leverages Chainlit, OpenAI, LangChain, Pinecone and Hugging Face to build a basic RAQA (Retrieval Augmented Question Answering) application on top of a Pinecone index built from arXiv papers about nuclear fission.
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
chainlit==0.7.700
langchain==0.0.350
openai==0.27.8
tiktoken==0.4.0
faiss-cpu==1.7.4
# utils.py calls pinecone.init(), which was removed in pinecone-client 3.0.
pinecone-client>=2.2,<3
arxiv
pypdf
# imported directly by utils.py
tqdm
utils.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+
4
+ import pinecone
5
+ from tqdm.auto import tqdm
6
+ from uuid import uuid4
7
+ import arxiv
8
+
9
+ from langchain.document_loaders import PyPDFLoader
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain.embeddings.openai import OpenAIEmbeddings
12
+ from langchain.embeddings import CacheBackedEmbeddings
13
+ from langchain.storage import LocalFileStore
14
+ from langchain.vectorstores import Pinecone
15
+
16
# Default number of buffered chunks that triggers an upsert in
# PineconeIndexer.index_documents.
INDEX_BATCH_LIMIT = 100
17
+
18
class CharacterTextSplitter:
    """Split text into overlapping chunks measured in characters.

    Thin wrapper around LangChain's ``RecursiveCharacterTextSplitter``
    configured with ``len`` as its length function.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """Configure the underlying splitter.

        Args:
            chunk_size: The character length of a chunk.
            chunk_overlap: The character length of the overlap between
                consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is not strictly greater than
                ``chunk_overlap``.
        """
        # Raise explicitly instead of using `assert`: assertions are stripped
        # under `python -O`, which would silently disable this check.
        if chunk_size <= chunk_overlap:
            raise ValueError("Chunk size must be greater than chunk overlap")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,  # measure chunks by character count
        )

    def split(self, text: str) -> List[str]:
        """Return ``text`` split into chunks by the configured splitter."""
        return self.text_splitter.split_text(text)
40
+
41
class ArxivLoader:
    """Search arXiv for papers and load their PDFs for indexing."""

    def __init__(self, query: str = "Nuclear Fission", max_results: int = 5, encoding: str = "utf-8"):
        """Store the search settings and create empty result containers.

        Args:
            query: Search query sent to arXiv.
            max_results: Maximum number of search results to request.
            encoding: Accepted for backward compatibility; not used by this
                class.
        """
        self.query = query
        self.max_results = max_results

        self.paper_urls = []
        self.documents = []
        self.splitter = CharacterTextSplitter()

    def retrieve_urls(self):
        """Search arXiv and store each result's PDF URL in ``paper_urls``."""
        arxiv_client = arxiv.Client()
        search = arxiv.Search(
            query=self.query,
            max_results=self.max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )

        # Replace the contents in place rather than appending, so calling this
        # method (or main()) more than once does not accumulate duplicates.
        self.paper_urls[:] = [result.pdf_url for result in arxiv_client.results(search)]

    def load_documents(self):
        """Load every URL in ``paper_urls`` with ``PyPDFLoader``.

        ``documents`` ends up with one entry per paper; each entry is whatever
        ``PyPDFLoader.load()`` returns (the indexer iterates it page by page).
        """
        # Rebuilt in place for the same reason as in retrieve_urls().
        self.documents[:] = [PyPDFLoader(paper_url).load() for paper_url in self.paper_urls]

    def format_document(self, document):
        """Split one loaded page into chunks with per-chunk metadata.

        Args:
            document: Object exposing ``page_content`` and a ``metadata``
                mapping with "source" and "page" keys.

        Returns:
            A ``(texts, metadatas)`` tuple of equal length. Each metadata dict
            holds the chunk index ("chunk"), the chunk text ("text"), and the
            page's "source_document" and "page_number".
        """
        metadata = {
            'source_document': document.metadata["source"],
            'page_number': document.metadata["page"],
        }

        record_texts = self.splitter.split(document.page_content)
        record_metadatas = [
            {"chunk": j, "text": text, **metadata}
            for j, text in enumerate(record_texts)
        ]

        return record_texts, record_metadatas

    def main(self):
        """Retrieve the paper URLs, then load the corresponding documents."""
        self.retrieve_urls()
        self.load_documents()
89
+
90
+
91
class PineconeIndexer:
    """Embed document chunks and store them in a Pinecone index."""

    def __init__(self, index_name: str = "arxiv-paper-index", metric: str = "cosine", n_dims: int = 1536):
        """Connect to Pinecone and open ``index_name``, creating it if absent.

        Args:
            index_name: Name of the Pinecone index to use.
            metric: Metric passed to ``pinecone.create_index`` when the index
                has to be created.
            n_dims: Vector dimension passed to ``pinecone.create_index`` when
                the index has to be created.

        Raises:
            KeyError: If ``PINECONE_API_KEY`` or ``PINECONE_ENV`` is missing
                from the environment.
        """
        pinecone.init(
            api_key=os.environ["PINECONE_API_KEY"],
            environment=os.environ["PINECONE_ENV"],
        )

        if index_name not in pinecone.list_indexes():
            # we create a new index
            pinecone.create_index(
                name=index_name,
                metric=metric,
                dimension=n_dims,
            )

        self.index = pinecone.Index(index_name)
        self.arxiv_loader = ArxivLoader()
        # Populated by load_embedder(); defined up front so that using the
        # indexer before loading triggers a lazy load, not an AttributeError.
        self.embedder = None

    def load_embedder(self):
        """Create the embedder: OpenAI embeddings cached on disk.

        The cache lives under ``./cache/`` and is namespaced by the embedding
        model's name.
        """
        store = LocalFileStore("./cache/")

        core_embeddings_model = OpenAIEmbeddings()

        self.embedder = CacheBackedEmbeddings.from_bytes_store(
            core_embeddings_model,
            store,
            namespace=core_embeddings_model.model,
        )

    def _ensure_embedder(self):
        """Load the embedder on first use if load_embedder() was not called."""
        if getattr(self, "embedder", None) is None:
            self.load_embedder()

    @staticmethod
    def _record_id(text, metadata) -> str:
        """Return a deterministic UUID string for a (text, metadata) record."""
        # Local imports: these names are not part of the file's import block.
        import json
        from uuid import NAMESPACE_URL, uuid5

        key = json.dumps({"text": text, "metadata": metadata}, sort_keys=True, default=str)
        return str(uuid5(NAMESPACE_URL, key))

    def upsert(self, texts, metadatas):
        """Embed ``texts`` and upsert them with their metadata into the index.

        Ids are derived from each record's content instead of being random, so
        indexing the same chunks again overwrites the existing vectors rather
        than adding duplicates.

        Args:
            texts: Chunk texts to embed.
            metadatas: One metadata dict per text, in the same order.
        """
        if not texts:
            return

        self._ensure_embedder()
        ids = [self._record_id(text, metadata) for text, metadata in zip(texts, metadatas)]
        embeds = self.embedder.embed_documents(texts)
        # Materialise the triples; a list is the documented input form.
        self.index.upsert(vectors=list(zip(ids, embeds, metadatas)))

    def index_documents(self, documents, batch_limit: int = INDEX_BATCH_LIMIT):
        """Chunk every page of every document and upsert the chunks in batches.

        Args:
            documents: Iterable of documents, each an iterable of pages
                accepted by ``ArxivLoader.format_document``.
            batch_limit: Buffered chunk count at which a batch is flushed. The
                check runs after each page, so a batch may exceed this number.
        """
        texts = []
        metadatas = []

        # iterate through your top-level document
        for document in tqdm(documents):

            # select single document object
            for page in document:
                record_texts, record_metadatas = self.arxiv_loader.format_document(page)

                texts.extend(record_texts)
                metadatas.extend(record_metadatas)

                if len(texts) >= batch_limit:
                    self.upsert(texts, metadatas)

                    texts = []
                    metadatas = []

        # Flush whatever is left below the batch limit.
        if texts:
            self.upsert(texts, metadatas)

    def get_vectorstore(self):
        """Return a LangChain ``Pinecone`` vector store over this index.

        The store embeds queries with the loaded embedder and is given "text"
        as its text key, matching the metadata field written at upsert time.
        """
        self._ensure_embedder()
        return Pinecone(self.index, self.embedder.embed_query, "text")
158
+
159
+
160
if __name__ == "__main__":
    # Manual smoke test: load the papers, split one page, then index everything.

    print("-------------- Loading Arxiv --------------")
    loader = ArxivLoader()
    loader.retrieve_urls()
    loader.load_documents()

    print("\n-------------- Splitting sample doc --------------")
    # First page of the first loaded paper.
    first_page = loader.documents[0][0]

    page_chunks = CharacterTextSplitter().split(first_page.page_content)
    print(len(page_chunks))
    print(page_chunks[0])

    print("\n-------------- testing pinecode indexer --------------")

    indexer = PineconeIndexer()
    indexer.load_embedder()
    indexer.index_documents(loader.documents)

    print(indexer.index.describe_index_stats())