kanha-upadhyay committed on
Commit
0ec6a0b
1 Parent(s): ceefdfd

first version

Browse files
Files changed (6) hide show
  1. app.py +167 -45
  2. doctr_ocr.py +0 -17
  3. package.txt +2 -1
  4. requirements.txt +2 -3
  5. retriever.py +0 -143
  6. s3bucket.py +0 -2
app.py CHANGED
@@ -1,57 +1,179 @@
 
 
 
 
1
  import streamlit as st
 
2
  from langchain_core.messages import AIMessage, HumanMessage
 
 
 
 
 
3
 
4
- from retriever import get_response, get_retriever
5
 
6
- st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
7
- st.title("Adina Cosmetic Ingredients")
8
 
9
- # last uploaded files
10
- if "last_uploaded_files" not in st.session_state:
11
- st.session_state.last_uploaded_files = []
12
 
13
- # Initialize chat history
14
- if "chat_history" not in st.session_state:
15
- st.session_state.chat_history = [
16
- AIMessage(content="Hello, I am Adina. How can I help you?"),
17
- ]
18
 
19
- # conversation
20
- for message in st.session_state.chat_history:
21
- if isinstance(message, AIMessage):
22
- with st.chat_message("AI"):
23
- st.write(message.content)
24
- elif isinstance(message, HumanMessage):
25
- with st.chat_message("Human"):
26
- st.write(message.content)
27
 
28
- user_query = st.chat_input("Type your message here...")
29
- if user_query is not None and user_query != "":
30
- st.session_state.chat_history.append(HumanMessage(content=user_query))
31
 
32
- with st.chat_message("Human"):
33
- st.markdown(user_query)
 
 
 
 
 
 
 
 
34
 
35
- with st.chat_message("AI"):
36
- response = st.write_stream(
37
- get_response(
38
- user_query=user_query, chat_history=st.session_state.chat_history
39
- )
 
 
 
 
 
 
 
40
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- st.session_state.chat_history.append(AIMessage(content=response))
43
-
44
- # File uploader
45
- uploaded_files = st.sidebar.file_uploader(
46
- label="Upload files", type="pdf", accept_multiple_files=True
47
- )
48
-
49
- to_be_vectorised_files = [
50
- item
51
- for item in uploaded_files
52
- if item.name not in st.session_state.last_uploaded_files
53
- ]
54
- retriever = get_retriever(to_be_vectorised_files)
55
- st.session_state.last_uploaded_files.extend(
56
- [item.name for item in to_be_vectorised_files]
57
- )
 
1
+ import os
2
+
3
+ import pdf2image
4
+ import pytesseract
5
  import streamlit as st
6
+ from langchain_community.vectorstores import FAISS
7
  from langchain_core.messages import AIMessage, HumanMessage
8
+ from langchain_core.output_parsers import StrOutputParser
9
+ from langchain_core.prompts import ChatPromptTemplate
10
+ from langchain_openai.chat_models.azure import ChatOpenAI
11
+ from langchain_openai.embeddings.azure import OpenAIEmbeddings
12
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
13
 
14
+ from s3bucket import upload_to_s3
15
 
16
+ vector_database_name = "Adina_Vector_Database"
17
+ temp_pdf_folder = "temp-pdf-files"
18
 
19
+ RETRIEVER = None
 
 
20
 
 
 
 
 
 
21
 
22
def delete_temp_files():
    """Remove every entry found directly inside the temporary PDF folder.

    The folder path comes from the module-level ``temp_pdf_folder``. Each
    listed entry is handed to ``os.remove``; the folder itself is kept.
    """
    for entry_name in os.listdir(temp_pdf_folder):
        os.remove(os.path.join(temp_pdf_folder, entry_name))
 
 
 
 
26
 
 
 
 
27
 
28
def extract_text(file):
    """Return the OCR text of an uploaded PDF, or ``None`` for other types.

    Args:
        file: Uploaded file object exposing ``type`` (MIME type string) and
            ``getvalue()`` (raw bytes).

    Returns:
        The concatenated text of all pages as a single string, or ``None``
        (after showing a Streamlit error) when the file is not a PDF.
    """
    if file.type != "application/pdf":
        st.error("Invalid file type. Please upload pdf file.")
        return None

    # Rasterise each PDF page, then OCR the page images in order.
    page_images = pdf2image.convert_from_bytes(file.getvalue())
    return "".join(pytesseract.image_to_string(page) for page in page_images)
38
 
39
+
40
def load_and_split(file):
    """Upload a PDF to S3, OCR it, and split the text into documents.

    The file is first written to the temporary PDF folder because
    ``upload_to_s3`` takes a path on disk. The temporary folder is always
    emptied afterwards, even if the upload or the OCR step raises.

    Args:
        file: Uploaded file object exposing ``name``, ``type`` and
            ``getvalue()``.

    Returns:
        A list of documents, each tagged with ``{"file_name": file.name}``
        metadata. The list is empty when no text could be extracted (wrong
        file type or an OCR result with no text).
    """
    os.makedirs(temp_pdf_folder, exist_ok=True)
    # The name comes from the uploader; keep only its final component so the
    # temporary copy cannot be written outside the temp folder.
    local_filepath = os.path.join(temp_pdf_folder, os.path.basename(file.name))
    try:
        with open(local_filepath, "wb") as f:
            f.write(file.getvalue())
        upload_to_s3(file_path=local_filepath, file_name=file.name)
        text = extract_text(file)
    finally:
        # Previously the cleanup was skipped whenever extraction produced
        # no text or an earlier step failed, leaving stale files behind.
        delete_temp_files()

    if not text:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200
    )
    texts = text_splitter.split_text(text)
    return text_splitter.create_documents(
        texts=texts, metadatas=[{"file_name": file.name}] * len(texts)
    )
58
+
59
+
60
def initialize_vector_db():
    """Create a new FAISS store with one seed text and save it to disk.

    Returns:
        The newly created vector store, already saved under the
        module-level ``vector_database_name`` path.
    """
    seed_texts = ["Adina Cosmetic Ingredients"]
    embeddings = OpenAIEmbeddings()
    store = FAISS.from_texts(seed_texts, embeddings)
    store.save_local(vector_database_name)
    return store
66
+
67
+
68
def load_vector_db():
    """Load the saved FAISS store, creating it first if it does not exist.

    Returns:
        The vector store found at ``vector_database_name``, or a newly
        initialised one when nothing exists at that path.
    """
    if not os.path.exists(vector_database_name):
        return initialize_vector_db()

    # NOTE(review): allow_dangerous_deserialization presumably permits
    # loading pickled store data; only acceptable while this folder is
    # written exclusively by this app — confirm against LangChain docs.
    return FAISS.load_local(
        vector_database_name,
        OpenAIEmbeddings(),
        allow_dangerous_deserialization=True,
    )
76
+
77
+
78
def append_to_vector_db(docs: "list | None" = None):
    """Embed ``docs``, merge them into the saved store, refresh RETRIEVER.

    Args:
        docs: Documents to add. ``None`` or an empty list is a no-op: there
            is nothing to embed, so the saved store and the current
            ``RETRIEVER`` are left untouched.

    Side effects:
        Overwrites the store saved at ``vector_database_name`` and rebinds
        the module-level ``RETRIEVER`` to a retriever over the merged store.
    """
    global RETRIEVER
    # Guard first: building an index from no documents cannot succeed, and
    # the former ``[]`` default was a shared mutable object.
    if not docs:
        return

    existing_vector_db = load_vector_db()
    new_vector_db = FAISS.from_documents(docs, OpenAIEmbeddings())
    existing_vector_db.merge_from(new_vector_db)
    existing_vector_db.save_local(vector_database_name)
    RETRIEVER = existing_vector_db.as_retriever()
85
+
86
+
87
def create_embeddings(files: "list | None" = None):
    """OCR, split and index each uploaded file, one file at a time.

    Args:
        files: Uploaded file objects to process. ``None`` or an empty list
            does nothing.

    Side effects:
        Every handled file name is appended to
        ``st.session_state.last_uploaded_files``, including files that
        produced no text, so they are not re-processed on each Streamlit
        rerun. A status line per file is printed to stdout.
    """
    for file in files or []:
        docs = load_and_split(file)
        if docs:
            append_to_vector_db(docs=docs)
            print(file.name, "processed successfully.")
        else:
            # Nothing to embed; report it instead of claiming success or
            # passing an empty/None value on to the vector store.
            print(file.name, "skipped: no text could be extracted.")
        st.session_state.last_uploaded_files.append(file.name)
93
+
94
+
95
def get_response(user_query, chat_history):
    """Stream an answer to ``user_query`` grounded in retrieved documents.

    Args:
        user_query: The user's latest question.
        chat_history: Conversation so far, inserted into the prompt as-is.

    Returns:
        The iterator returned by ``chain.stream`` (prompt | llm | string
        output parser), suitable for ``st.write_stream``.
    """
    # RETRIEVER is only assigned when the file runs as a script or after an
    # upload; fall back to the store on disk so an import does not fail
    # with an AttributeError on None.
    retriever = RETRIEVER if RETRIEVER is not None else load_vector_db().as_retriever()
    docs = retriever.invoke(user_query)

    # Prompt spelling fixed ("Cosmetic", "answer", "retrieved"): this text
    # is sent to the model verbatim.
    template = """
    Your name is ADINA, who provides helpful information about Adina Cosmetic Ingredients.
    <rules>
    - Answer the question based on the retrieved information only.
    - If the question can not be answered, simply say you can not answer it.
    - Avoid mentioning that you are answering based on retrieved information.
    </rules>
    Execute the below mandatory considerations when responding to the inquiries:
    --- Tone - Respectful, Patient, and Encouraging:
    Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
    Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
    --- Clarity - Simple, Direct, and Unambiguous:
    Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
    Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
    --- Structure - Organized, Consistent, and Considerate:
    Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
    --- Empathy and Understanding - Compassionate and Responsive:
    Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
    Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
    Answer the following questions considering the history of the conversation and retrieved information.
    Chat history: {chat_history}
    retrieved information: {retrieved_info}
    User question: {user_question}
    """

    prompt = ChatPromptTemplate.from_template(template)
    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)

    chain = prompt | llm | StrOutputParser()

    return chain.stream(
        {
            "chat_history": chat_history,
            "retrieved_info": docs,
            "user_question": user_query,
        }
    )
135
+
136
+
137
def main():
    """Render the chat page: history, chat input, and the PDF uploader.

    Session state keys used:
        ``last_uploaded_files``: names of files already vectorised.
        ``chat_history``: list of ``AIMessage`` / ``HumanMessage`` objects.
    """
    st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
    st.title("Adina Cosmetic Ingredients")

    state = st.session_state
    if "last_uploaded_files" not in state:
        state.last_uploaded_files = []
    if "chat_history" not in state:
        state.chat_history = [
            AIMessage(content="Hello, I am Adina. How can I help you?"),
        ]

    # Replay the stored conversation; other message types are not shown.
    for message in state.chat_history:
        if isinstance(message, AIMessage):
            role = "AI"
        elif isinstance(message, HumanMessage):
            role = "Human"
        else:
            continue
        with st.chat_message(role):
            st.write(message.content)

    user_query = st.chat_input("Type your message here...")
    if user_query:
        state.chat_history.append(HumanMessage(content=user_query))
        with st.chat_message("Human"):
            st.markdown(user_query)
        with st.chat_message("AI"):
            answer_stream = get_response(
                user_query=user_query, chat_history=state.chat_history
            )
            response = st.write_stream(answer_stream)
        state.chat_history.append(AIMessage(content=response))

    uploaded_files = st.sidebar.file_uploader(
        label="Upload files", type="pdf", accept_multiple_files=True
    )
    # Only vectorise files that have not been handled in this session.
    already_seen = state.last_uploaded_files
    to_be_vectorised_files = [
        item for item in uploaded_files if item.name not in already_seen
    ]
    if to_be_vectorised_files:
        create_embeddings(to_be_vectorised_files)
175
+
176
 
177
if __name__ == "__main__":
    # Build the retriever from the saved (or newly created) vector store
    # before rendering the page.
    vector_db = load_vector_db()
    RETRIEVER = vector_db.as_retriever()
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
doctr_ocr.py DELETED
@@ -1,17 +0,0 @@
1
- from doctr.io import read_pdf
2
- from doctr.models import ocr_predictor
3
-
4
- predictor = ocr_predictor(
5
- pretrained=True,
6
- detect_orientation=True,
7
- straighten_pages=True,
8
- )
9
-
10
-
11
- def pdf_extractor(pdf_file_path: str):
12
- try:
13
- docs = read_pdf(pdf_file_path)
14
- result = predictor(docs)
15
- return result.render()
16
- except Exception as e:
17
- print(f"Error in pdf_extractor: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
package.txt CHANGED
@@ -1 +1,2 @@
1
- python3-opencv
 
 
1
+ tesseract-ocr
2
+ poppler-utils
requirements.txt CHANGED
@@ -8,6 +8,5 @@ python-dotenv==1.0.1
8
  boto3==1.34.84
9
  langchain-core==0.1.42
10
  faiss-cpu==1.8.0
11
- python-doctr==0.8.1
12
- tf2onnx==1.16.1
13
- tensorflow==2.15.0
 
8
  boto3==1.34.84
9
  langchain-core==0.1.42
10
  faiss-cpu==1.8.0
11
+ pdf2image==1.17.0
12
+ pytesseract==0.3.10
 
retriever.py DELETED
@@ -1,143 +0,0 @@
1
- import os
2
-
3
- from dotenv import load_dotenv
4
- from langchain.schema import Document
5
- from langchain_community.vectorstores import FAISS
6
- from langchain_core.output_parsers import StrOutputParser
7
- from langchain_core.prompts import ChatPromptTemplate
8
- from langchain_openai.chat_models.azure import ChatOpenAI
9
- from langchain_openai.embeddings.azure import OpenAIEmbeddings
10
- from langchain_text_splitters import RecursiveCharacterTextSplitter
11
-
12
- from doctr_ocr import pdf_extractor
13
- from s3bucket import upload_to_s3
14
-
15
- load_dotenv()
16
-
17
- vector_database_name = "Adina_Vector_Database"
18
- temp_pdf_folder = "temp-pdf-files"
19
-
20
-
21
- def delete_temp_files():
22
- for item in os.listdir(temp_pdf_folder):
23
- file_path = os.path.join(temp_pdf_folder, item)
24
- os.remove(file_path)
25
-
26
-
27
- def initialize_vector_db():
28
- embeddings = OpenAIEmbeddings()
29
- vector_database = FAISS.from_texts(["Adina Cosmetic Ingredients"], embeddings)
30
- vector_database.save_local(f"{vector_database_name}")
31
-
32
-
33
- def get_vector_db(docs: list[Document]):
34
- embeddings = OpenAIEmbeddings()
35
-
36
- try:
37
- currentVectorDatabase = FAISS.from_documents(docs, embeddings)
38
- existingVectorDatabase = FAISS.load_local(
39
- f"{vector_database_name}", embeddings, allow_dangerous_deserialization=True
40
- )
41
-
42
- existingVectorDatabase.merge_from(currentVectorDatabase)
43
- existingVectorDatabase.save_local(f"{vector_database_name}")
44
-
45
- return existingVectorDatabase
46
-
47
- except Exception as e:
48
- print(
49
- "!Warning : Document is empty or not in the correct format. Thus provided pdf(s) are not added to the vector database.",
50
- e,
51
- )
52
- return FAISS.load_local(
53
- f"{vector_database_name}", embeddings, allow_dangerous_deserialization=True
54
- )
55
-
56
-
57
- def load_and_split(uploaded_files):
58
- if not os.path.exists(temp_pdf_folder):
59
- os.makedirs(temp_pdf_folder)
60
-
61
- docs = []
62
- for file in uploaded_files:
63
- local_filepath = os.path.join(temp_pdf_folder, file.name)
64
- with open(local_filepath, "wb") as f:
65
- f.write(file.getvalue())
66
-
67
- if upload_to_s3(file_path=local_filepath, file_name=file.name):
68
- print(f"\n{file.name} uploaded successfully.")
69
- else:
70
- print(f"\nFailed to upload {file.name}.")
71
-
72
- text = pdf_extractor(local_filepath)
73
-
74
- text_splitter = RecursiveCharacterTextSplitter(
75
- chunk_size=1000, chunk_overlap=200
76
- )
77
- temp_docs = text_splitter.create_documents(text_splitter.split_text(text))
78
- docs.extend(temp_docs)
79
- delete_temp_files()
80
- return docs
81
-
82
-
83
- def get_retriever(uploaded_files):
84
- if os.path.exists(f"{vector_database_name}") == False:
85
- initialize_vector_db()
86
-
87
- if len(uploaded_files) == 0:
88
- embeddings = OpenAIEmbeddings()
89
- vectorDatabase = FAISS.load_local(
90
- f"{vector_database_name}", embeddings, allow_dangerous_deserialization=True
91
- )
92
-
93
- retriever = vectorDatabase.as_retriever()
94
- return retriever
95
-
96
- docs = load_and_split(uploaded_files)
97
- vector_database = get_vector_db(docs=docs)
98
-
99
- retriever = vector_database.as_retriever()
100
- return retriever
101
-
102
-
103
- def get_response(user_query, chat_history):
104
- retriever = get_retriever(uploaded_files=[])
105
- docs = retriever.invoke(user_query)
106
-
107
- template = """
108
- Your name is ADINA, who provides helpful information about Adina Consmetic Ingredients.
109
- <rules>
110
- - Answer the question based on the retrieved information only.
111
- - If the question can not be answered, simply say you can not annswer it.
112
- - Avoid mentioning that you are answering based on retreived information.
113
- </rules>
114
- Execute the below mandatory considerations when responding to the inquiries:
115
- --- Tone - Respectful, Patient, and Encouraging:
116
- Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
117
- Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
118
- --- Clarity - Simple, Direct, and Unambiguous:
119
- Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
120
- Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
121
- --- Structure - Organized, Consistent, and Considerate:
122
- Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
123
- --- Empathy and Understanding - Compassionate and Responsive:
124
- Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
125
- Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
126
- Answer the following questions considering the history of the conversation and retrieved information.
127
- Chat history: {chat_history}
128
- retrieved information: {retrieved_info}
129
- User question: {user_question}
130
- """
131
-
132
- prompt = ChatPromptTemplate.from_template(template)
133
- llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)
134
-
135
- chain = prompt | llm | StrOutputParser()
136
-
137
- return chain.stream(
138
- {
139
- "chat_history": chat_history,
140
- "retrieved_info": docs,
141
- "user_question": user_query,
142
- }
143
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
s3bucket.py CHANGED
@@ -22,8 +22,6 @@ def upload_to_s3(file_path, file_name):
22
  )
23
 
24
  client.upload_file(Filename=file_path, Key=f"{file_name}", Bucket="adina-poc")
25
- return True
26
 
27
  except Exception as e:
28
  print("Error uploading file to S3 bucket.", e)
29
- return False
 
22
  )
23
 
24
  client.upload_file(Filename=file_path, Key=f"{file_name}", Bucket="adina-poc")
 
25
 
26
  except Exception as e:
27
  print("Error uploading file to S3 bucket.", e)