Files changed (10) hide show
  1. .gitignore +0 -2
  2. app.py +43 -170
  3. boto_client.py +0 -54
  4. doctr_ocr.py +17 -0
  5. package.txt +1 -0
  6. poetry.lock +0 -0
  7. pyproject.toml +0 -25
  8. requirements.txt +3 -1
  9. retriever.py +143 -0
  10. s3bucket.py +29 -0
.gitignore CHANGED
@@ -3,5 +3,3 @@ PDFs
3
  Adina_Vector_Database
4
  temp-pdf-files
5
  __pycache__/
6
- pdf_files
7
- .venv
 
3
  Adina_Vector_Database
4
  temp-pdf-files
5
  __pycache__/
 
 
app.py CHANGED
@@ -1,184 +1,57 @@
1
- import os
2
-
3
  import streamlit as st
4
- from langchain_community.vectorstores import FAISS
5
  from langchain_core.messages import AIMessage, HumanMessage
6
- from langchain_core.output_parsers import StrOutputParser
7
- from langchain_core.prompts import ChatPromptTemplate
8
- from langchain_openai.chat_models.azure import ChatOpenAI
9
- from langchain_openai.embeddings.azure import OpenAIEmbeddings
10
- from langchain_text_splitters import RecursiveCharacterTextSplitter
11
-
12
- from boto_client import extract_text_from_pdf
13
-
14
- vector_database_name = "Adina_Vector_Database"
15
- temp_pdf_folder = "temp-pdf-files"
16
- vector_database_path = (
17
- f"{os.environ.get('VECTOR_DATABASE_PATH', '.')}/{vector_database_name}"
18
- )
19
-
20
- RETRIEVER = None
21
-
22
-
23
- def delete_temp_files():
24
- for item in os.listdir(temp_pdf_folder):
25
- file_path = os.path.join(temp_pdf_folder, item)
26
- os.remove(file_path)
27
 
 
28
 
29
- def load_and_split(file):
30
- if not os.path.exists(temp_pdf_folder):
31
- os.makedirs(temp_pdf_folder)
32
- local_filepath = os.path.join(temp_pdf_folder, file.name)
33
- with open(local_filepath, "wb") as f:
34
- f.write(file.getvalue())
35
- text = extract_text_from_pdf(file_path=local_filepath, file_name=file.name)
36
- docs = []
37
- if text:
38
- text_splitter = RecursiveCharacterTextSplitter(
39
- chunk_size=512, chunk_overlap=100
40
- )
41
- texts = text_splitter.split_text(text)
42
- docs = text_splitter.create_documents(
43
- texts=texts, metadatas=[{"file_name": file.name}] * len(texts)
44
- )
45
- delete_temp_files()
46
- return docs
47
-
48
-
49
- def initialize_vector_db():
50
- vector_database = FAISS.from_texts(
51
- ["Adina Cosmetic Ingredients"], OpenAIEmbeddings()
52
- )
53
- vector_database.save_local(vector_database_path)
54
- return vector_database
55
-
56
-
57
- def load_vector_db():
58
- if os.path.exists(vector_database_path):
59
- return FAISS.load_local(
60
- vector_database_path,
61
- OpenAIEmbeddings(),
62
- allow_dangerous_deserialization=True,
63
- )
64
- return initialize_vector_db()
65
 
 
 
 
66
 
67
- def append_to_vector_db(docs: list = []):
68
- global RETRIEVER
69
- existing_vector_db = load_vector_db()
70
- new_vector_db = FAISS.from_documents(docs, OpenAIEmbeddings())
71
- existing_vector_db.merge_from(new_vector_db)
72
- existing_vector_db.save_local(vector_database_path)
73
- RETRIEVER = existing_vector_db.as_retriever()
74
 
 
 
 
 
 
 
 
 
75
 
76
- def create_embeddings(files: list = []):
77
- for file in files:
78
- docs = load_and_split(file)
79
- if docs:
80
- append_to_vector_db(docs=docs)
81
- st.session_state.last_uploaded_files.append(file.name)
82
- st.toast(f"{file.name} processed successfully")
83
- print(f"{file.name} processed successfully")
84
- else:
85
- st.toast(f"{file.name} could not be processed")
86
- print(f"{file.name} could not be processed")
87
 
 
 
88
 
89
- def get_response(user_query, chat_history):
90
- docs = RETRIEVER.invoke(user_query)
91
- additional_info = RETRIEVER.invoke(
92
- " ".join(
93
- [
94
- message.content
95
- for message in chat_history
96
- if isinstance(message, HumanMessage)
97
- ]
98
  )
99
- )
100
- docs_content = [doc.page_content for doc in docs]
101
- for doc in additional_info:
102
- if doc.page_content not in docs_content:
103
- docs.append(doc)
104
- template = """
105
- Your name is ADINA, who provides helpful information about Adina Consmetic Ingredients.
106
- <rules>
107
- - Answer the question based on the context only.
108
- - If the question can not be answered, simply say you can not annswer it.
109
- </rules>
110
- Execute the below mandatory considerations when responding to the inquiries:
111
- --- Tone - Respectful, Patient, and Encouraging:
112
- Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
113
- Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
114
- --- Clarity - Simple, Direct, and Unambiguous:
115
- Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
116
- Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
117
- --- Structure - Organized, Consistent, and Considerate:
118
- Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
119
- --- Empathy and Understanding - Compassionate and Responsive:
120
- Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
121
- Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
122
- Answer the following questions considering the context and/or history of the conversation.
123
- Chat history: {chat_history}
124
- Context: {retrieved_info}
125
- User question: {user_question}
126
- """
127
-
128
- prompt = ChatPromptTemplate.from_template(template)
129
- llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)
130
-
131
- chain = prompt | llm | StrOutputParser()
132
-
133
- return chain.stream(
134
- {
135
- "chat_history": chat_history,
136
- "retrieved_info": docs,
137
- "user_question": user_query,
138
- }
139
- )
140
 
 
141
 
142
- def main():
143
- st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
144
- st.title("Adina Cosmetic Ingredients")
145
- if "last_uploaded_files" not in st.session_state:
146
- st.session_state.last_uploaded_files = []
147
- if "chat_history" not in st.session_state:
148
- st.session_state.chat_history = [
149
- AIMessage(content="Hello, I am Adina. How can I help you?"),
150
- ]
151
- for message in st.session_state.chat_history:
152
- if isinstance(message, AIMessage):
153
- with st.chat_message("AI"):
154
- st.write(message.content)
155
- elif isinstance(message, HumanMessage):
156
- with st.chat_message("Human"):
157
- st.write(message.content)
158
- user_query = st.chat_input("Type your message here...")
159
- if user_query is not None and user_query != "":
160
- st.session_state.chat_history.append(HumanMessage(content=user_query))
161
- with st.chat_message("Human"):
162
- st.markdown(user_query)
163
- with st.chat_message("AI"):
164
- response = st.write_stream(
165
- get_response(
166
- user_query=user_query, chat_history=st.session_state.chat_history
167
- )
168
- )
169
- st.session_state.chat_history.append(AIMessage(content=response))
170
- uploaded_files = st.sidebar.file_uploader(
171
- label="Upload files", type="pdf", accept_multiple_files=True
172
- )
173
- to_be_vectorised_files = [
174
- item
175
- for item in uploaded_files
176
- if item.name not in st.session_state.last_uploaded_files
177
- ]
178
- if to_be_vectorised_files:
179
- create_embeddings(to_be_vectorised_files)
180
-
181
 
182
- if __name__ == "__main__":
183
- RETRIEVER = load_vector_db().as_retriever()
184
- main()
 
 
 
 
 
 
 
 
 
1
"""Streamlit chat UI for Adina Cosmetic Ingredients.

Renders the chat history, streams answers from the retriever-backed LLM
chain, and lets users upload PDFs that get indexed into the vector store.
"""

import streamlit as st
from langchain_core.messages import AIMessage, HumanMessage

from retriever import get_response, get_retriever

st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
st.title("Adina Cosmetic Ingredients")

# Names of files already indexed this session, so Streamlit reruns skip them.
if "last_uploaded_files" not in st.session_state:
    st.session_state.last_uploaded_files = []

# Initialize chat history with a greeting from the assistant.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = [
        AIMessage(content="Hello, I am Adina. How can I help you?"),
    ]

# Replay the conversation so far.
for message in st.session_state.chat_history:
    if isinstance(message, AIMessage):
        with st.chat_message("AI"):
            st.write(message.content)
    elif isinstance(message, HumanMessage):
        with st.chat_message("Human"):
            st.write(message.content)

user_query = st.chat_input("Type your message here...")
if user_query is not None and user_query != "":
    st.session_state.chat_history.append(HumanMessage(content=user_query))

    with st.chat_message("Human"):
        st.markdown(user_query)

    with st.chat_message("AI"):
        response = st.write_stream(
            get_response(
                user_query=user_query, chat_history=st.session_state.chat_history
            )
        )

    st.session_state.chat_history.append(AIMessage(content=response))

# File uploader: index any PDFs not already processed this session.
uploaded_files = st.sidebar.file_uploader(
    label="Upload files", type="pdf", accept_multiple_files=True
)
# file_uploader can return None before any upload; normalise to a list so
# the comprehension below never iterates None.
if not uploaded_files:
    uploaded_files = []

to_be_vectorised_files = [
    item
    for item in uploaded_files
    if item.name not in st.session_state.last_uploaded_files
]
# Called for its side effect of indexing new files into the persisted
# vector store; get_response rebuilds its own retriever, so the returned
# object is not kept here.
get_retriever(to_be_vectorised_files)
st.session_state.last_uploaded_files.extend(
    [item.name for item in to_be_vectorised_files]
)
boto_client.py DELETED
@@ -1,54 +0,0 @@
1
- import os
2
- import time
3
-
4
- import boto3
5
- from dotenv import load_dotenv
6
- from textractor import Textractor
7
- from textractor.data.constants import TextractFeatures
8
- from textractor.data.text_linearization_config import TextLinearizationConfig
9
- from textractor.visualizers.entitylist import EntityList
10
-
11
- load_dotenv()
12
-
13
- AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
14
- AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
15
- AWS_ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL")
16
- AWS_REGION = os.getenv("AWS_REGION")
17
- AWS_S3_BUCKET_NAME = os.getenv("AWS_S3_BUCKET_NAME")
18
-
19
-
20
- def upload_to_s3(file_path, file_name):
21
- s3 = boto3.client(
22
- "s3",
23
- region_name=AWS_REGION,
24
- endpoint_url=AWS_ENDPOINT_URL,
25
- aws_access_key_id=AWS_ACCESS_KEY_ID,
26
- aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
27
- )
28
- s3.upload_file(Filename=file_path, Key=file_name, Bucket=AWS_S3_BUCKET_NAME)
29
-
30
-
31
- def analyze_pdf(file_name):
32
- extractor = Textractor(region_name=AWS_REGION)
33
- file = f"s3://{AWS_S3_BUCKET_NAME}/{file_name}"
34
- document = extractor.start_document_analysis(
35
- file_source=file,
36
- features=[
37
- TextractFeatures.LAYOUT,
38
- TextractFeatures.TABLES,
39
- # TextractFeatures.FORMS,
40
- ],
41
- save_image=False,
42
- )
43
- text = ""
44
- for page in document.pages:
45
- text += page.get_text()
46
- return text
47
-
48
-
49
- def extract_text_from_pdf(file_path, file_name):
50
- try:
51
- upload_to_s3(file_path, file_name)
52
- return analyze_pdf(file_name=file_name)
53
- except Exception as e:
54
- print("Error extracting text from PDF:", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
doctr_ocr.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Thin wrapper around docTR OCR for extracting text from PDF files."""

from doctr.io import read_pdf
from doctr.models import ocr_predictor

# Built once at import time: model download/initialisation is expensive,
# so every call to pdf_extractor reuses the same predictor.
predictor = ocr_predictor(
    pretrained=True,
    detect_orientation=True,
    straighten_pages=True,
)


def pdf_extractor(pdf_file_path: str) -> str:
    """Run OCR over the PDF at *pdf_file_path* and return the rendered text.

    Returns an empty string when reading or prediction fails, so callers
    can use a simple falsy check instead of special-casing None (the
    previous implicit-None return crashed downstream text splitting).
    """
    try:
        docs = read_pdf(pdf_file_path)
        result = predictor(docs)
        return result.render()
    except Exception as e:
        print(f"Error in pdf_extractor: {e}")
        return ""  # explicit falsy result instead of implicit None
package.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python3-opencv
poetry.lock DELETED
The diff for this file is too large to render. See raw diff
 
pyproject.toml DELETED
@@ -1,25 +0,0 @@
1
- [tool.poetry]
2
- name = "adina-poc"
3
- version = "0.1.0"
4
- description = ""
5
- authors = ["Kanha Upadhyay <kanha.upadhyay@sifars.com>"]
6
- readme = "README.md"
7
-
8
- [tool.poetry.dependencies]
9
- python = "^3.10"
10
- langchain = "0.1.16"
11
- streamlit = "1.33.0"
12
- langchain-openai = "0.1.3"
13
- openai = "1.17.1"
14
- langchain-community = "0.0.32"
15
- langchain-text-splitters = "0.0.1"
16
- python-dotenv = "1.0.1"
17
- boto3 = "1.34.84"
18
- langchain-core = "0.1.42"
19
- faiss-cpu = "1.8.0"
20
- amazon-textract-textractor = "1.7.1"
21
-
22
-
23
- [build-system]
24
- requires = ["poetry-core"]
25
- build-backend = "poetry.core.masonry.api"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -8,4 +8,6 @@ python-dotenv==1.0.1
8
  boto3==1.34.84
9
  langchain-core==0.1.42
10
  faiss-cpu==1.8.0
11
- amazon-textract-textractor==1.7.1
 
 
 
8
  boto3==1.34.84
9
  langchain-core==0.1.42
10
  faiss-cpu==1.8.0
11
+ python-doctr==0.8.1
12
+ tf2onnx==1.16.1
13
+ tensorflow==2.15.0
retriever.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from dotenv import load_dotenv
4
+ from langchain.schema import Document
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_core.output_parsers import StrOutputParser
7
+ from langchain_core.prompts import ChatPromptTemplate
8
+ from langchain_openai.chat_models.azure import ChatOpenAI
9
+ from langchain_openai.embeddings.azure import OpenAIEmbeddings
10
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
11
+
12
+ from doctr_ocr import pdf_extractor
13
+ from s3bucket import upload_to_s3
14
+
15
+ load_dotenv()
16
+
17
# Name of the on-disk FAISS index directory.
vector_database_name = "Adina_Vector_Database"
# Scratch directory where uploaded PDFs are written before OCR/upload.
temp_pdf_folder = "temp-pdf-files"


def delete_temp_files():
    """Remove every regular file inside the temp PDF folder.

    Safe to call when the folder does not exist yet (no-op), and skips
    subdirectories instead of crashing on them.
    """
    if not os.path.isdir(temp_pdf_folder):
        return
    for item in os.listdir(temp_pdf_folder):
        file_path = os.path.join(temp_pdf_folder, item)
        # os.remove() raises on directories; only unlink regular files.
        if os.path.isfile(file_path):
            os.remove(file_path)
25
+
26
+
27
def initialize_vector_db():
    """Create a fresh FAISS index seeded with a placeholder document.

    The index is persisted under ``vector_database_name`` so later calls
    can load it and merge new documents in.
    """
    embeddings = OpenAIEmbeddings()
    vector_database = FAISS.from_texts(["Adina Cosmetic Ingredients"], embeddings)
    # Plain variable, not an f-string: the constant is already a str.
    vector_database.save_local(vector_database_name)
31
+
32
+
33
def get_vector_db(docs: list[Document]):
    """Merge *docs* into the persisted FAISS index and return it.

    Falls back to returning the existing index unchanged when embedding
    fails (e.g. *docs* is empty or malformed), so a bad upload never
    corrupts the stored database.
    """
    embeddings = OpenAIEmbeddings()

    def _load_existing():
        # allow_dangerous_deserialization is required to reload the
        # pickle-backed index this module itself wrote earlier.
        return FAISS.load_local(
            vector_database_name, embeddings, allow_dangerous_deserialization=True
        )

    try:
        current_vector_db = FAISS.from_documents(docs, embeddings)
        existing_vector_db = _load_existing()
        existing_vector_db.merge_from(current_vector_db)
        existing_vector_db.save_local(vector_database_name)
        return existing_vector_db
    except Exception as e:
        print(
            "!Warning : Document is empty or not in the correct format. Thus provided pdf(s) are not added to the vector database.",
            e,
        )
        return _load_existing()
55
+
56
+
57
def load_and_split(uploaded_files):
    """OCR each uploaded PDF and split its text into embedding-ready documents.

    Each file is written to the temp folder, archived to S3 (best effort),
    run through the docTR OCR pipeline, and chunked. Files whose OCR step
    yields no text are skipped instead of crashing the whole batch.
    Returns a list of Document chunks tagged with their source file name.
    """
    if not os.path.exists(temp_pdf_folder):
        os.makedirs(temp_pdf_folder)

    # Hoisted out of the loop: one splitter serves every file.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    docs = []
    for file in uploaded_files:
        local_filepath = os.path.join(temp_pdf_folder, file.name)
        with open(local_filepath, "wb") as f:
            f.write(file.getvalue())

        # S3 upload is best-effort archival; OCR proceeds either way.
        if upload_to_s3(file_path=local_filepath, file_name=file.name):
            print(f"\n{file.name} uploaded successfully.")
        else:
            print(f"\nFailed to upload {file.name}.")

        text = pdf_extractor(local_filepath)
        if not text:
            # pdf_extractor yields None/"" on OCR failure; splitting that
            # would raise, so skip this file and keep processing the rest.
            print(f"\nNo text extracted from {file.name}; skipping.")
            continue

        texts = text_splitter.split_text(text)
        # Tag every chunk with its source file for traceability.
        docs.extend(
            text_splitter.create_documents(
                texts, metadatas=[{"file_name": file.name}] * len(texts)
            )
        )
    delete_temp_files()
    return docs
81
+
82
+
83
def get_retriever(uploaded_files):
    """Return a retriever over the persisted vector DB, indexing new files first.

    With no new uploads the existing index is loaded as-is; otherwise the
    uploaded PDFs are OCR'd, embedded and merged in before the retriever
    is built. The index is created on first use.
    """
    # `not os.path.exists(...)` instead of `== False` comparison.
    if not os.path.exists(vector_database_name):
        initialize_vector_db()

    if not uploaded_files:
        embeddings = OpenAIEmbeddings()
        vector_database = FAISS.load_local(
            vector_database_name, embeddings, allow_dangerous_deserialization=True
        )
        return vector_database.as_retriever()

    docs = load_and_split(uploaded_files)
    vector_database = get_vector_db(docs=docs)
    return vector_database.as_retriever()
101
+
102
+
103
def get_response(user_query, chat_history):
    """Stream an LLM answer to *user_query* grounded in retrieved documents.

    Loads the current retriever (no new uploads), fetches documents for the
    query, and returns the streaming output of a prompt | llm | parser
    chain so the caller can render tokens as they arrive.
    """
    retriever = get_retriever(uploaded_files=[])
    docs = retriever.invoke(user_query)

    # Typos fixed in the prompt text: Cosmetic / answer / retrieved.
    template = """
    Your name is ADINA, who provides helpful information about Adina Cosmetic Ingredients.
    <rules>
    - Answer the question based on the retrieved information only.
    - If the question can not be answered, simply say you can not answer it.
    - Avoid mentioning that you are answering based on retrieved information.
    </rules>
    Execute the below mandatory considerations when responding to the inquiries:
    --- Tone - Respectful, Patient, and Encouraging:
    Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
    Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
    --- Clarity - Simple, Direct, and Unambiguous:
    Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
    Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
    --- Structure - Organized, Consistent, and Considerate:
    Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
    --- Empathy and Understanding - Compassionate and Responsive:
    Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
    Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
    Answer the following questions considering the history of the conversation and retrieved information.
    Chat history: {chat_history}
    retrieved information: {retrieved_info}
    User question: {user_question}
    """

    prompt = ChatPromptTemplate.from_template(template)
    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)

    chain = prompt | llm | StrOutputParser()

    return chain.stream(
        {
            "chat_history": chat_history,
            "retrieved_info": docs,
            "user_question": user_query,
        }
    )
s3bucket.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import boto3
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+
9
def upload_to_s3(file_path, file_name, bucket="adina-poc"):
    """Upload *file_path* to a DigitalOcean Spaces bucket under key *file_name*.

    Credentials are read from the AWS_ACCESS_KEY / AWS_SECRET_KEY
    environment variables (populated by dotenv at import time). Returns
    True on success and False on any failure. *bucket* defaults to the
    previously hard-coded bucket, so existing callers are unaffected.
    """
    access_key = os.getenv("AWS_ACCESS_KEY")
    secret_key = os.getenv("AWS_SECRET_KEY")

    try:
        # DigitalOcean Spaces speaks the S3 API; the region/endpoint pair
        # below targets the ams3 datacentre.
        session = boto3.session.Session()
        client = session.client(
            "s3",
            region_name="ams3",
            endpoint_url="https://ams3.digitaloceanspaces.com",
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
        )

        client.upload_file(Filename=file_path, Key=file_name, Bucket=bucket)
        return True

    except Exception as e:
        print("Error uploading file to S3 bucket.", e)
        return False