kanha-upadhyay committed on
Commit
ceefdfd
1 Parent(s): 8a7c586

init project

Files changed (7):
  1. .gitignore +5 -0
  2. app.py +57 -0
  3. doctr_ocr.py +17 -0
  4. package.txt +1 -0
  5. requirements.txt +13 -0
  6. retriever.py +143 -0
  7. s3bucket.py +29 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .env
+ PDFs
+ Adina_Vector_Database
+ temp-pdf-files
+ __pycache__/
app.py ADDED
@@ -0,0 +1,57 @@
+ import streamlit as st
+ from langchain_core.messages import AIMessage, HumanMessage
+
+ from retriever import get_response, get_retriever
+
+ st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
+ st.title("Adina Cosmetic Ingredients")
+
+ # Names of files that have already been vectorised this session
+ if "last_uploaded_files" not in st.session_state:
+     st.session_state.last_uploaded_files = []
+
+ # Initialize chat history
+ if "chat_history" not in st.session_state:
+     st.session_state.chat_history = [
+         AIMessage(content="Hello, I am Adina. How can I help you?"),
+     ]
+
+ # Render the conversation so far
+ for message in st.session_state.chat_history:
+     if isinstance(message, AIMessage):
+         with st.chat_message("AI"):
+             st.write(message.content)
+     elif isinstance(message, HumanMessage):
+         with st.chat_message("Human"):
+             st.write(message.content)
+
+ user_query = st.chat_input("Type your message here...")
+ if user_query is not None and user_query != "":
+     st.session_state.chat_history.append(HumanMessage(content=user_query))
+
+     with st.chat_message("Human"):
+         st.markdown(user_query)
+
+     with st.chat_message("AI"):
+         response = st.write_stream(
+             get_response(
+                 user_query=user_query, chat_history=st.session_state.chat_history
+             )
+         )
+
+     st.session_state.chat_history.append(AIMessage(content=response))
+
+ # File uploader: only PDFs not seen before are vectorised
+ uploaded_files = st.sidebar.file_uploader(
+     label="Upload files", type="pdf", accept_multiple_files=True
+ )
+
+ to_be_vectorised_files = [
+     item
+     for item in uploaded_files
+     if item.name not in st.session_state.last_uploaded_files
+ ]
+ retriever = get_retriever(to_be_vectorised_files)
+ st.session_state.last_uploaded_files.extend(
+     [item.name for item in to_be_vectorised_files]
+ )
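Note: the sidebar uploader above only hands PDFs to get_retriever when their names are not already in st.session_state.last_uploaded_files, so each file is vectorised once per session. A minimal sketch of that dedup step, with hypothetical file names:

# hypothetical names, mirroring the list comprehension in app.py
last_uploaded_files = ["catalogue.pdf"]
uploaded_names = ["catalogue.pdf", "msds.pdf"]
to_be_vectorised = [name for name in uploaded_names if name not in last_uploaded_files]
print(to_be_vectorised)  # ['msds.pdf']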
doctr_ocr.py ADDED
@@ -0,0 +1,17 @@
+ from doctr.io import read_pdf
+ from doctr.models import ocr_predictor
+
+ predictor = ocr_predictor(
+     pretrained=True,
+     detect_orientation=True,
+     straighten_pages=True,
+ )
+
+
+ def pdf_extractor(pdf_file_path: str):
+     try:
+         docs = read_pdf(pdf_file_path)
+         result = predictor(docs)
+         return result.render()
+     except Exception as e:
+         print(f"Error in pdf_extractor: {e}")
package.txt ADDED
@@ -0,0 +1 @@
+ python3-opencv
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ langchain==0.1.16
+ streamlit==1.33.0
+ langchain-openai==0.1.3
+ openai==1.17.1
+ langchain-community==0.0.32
+ langchain-text-splitters==0.0.1
+ python-dotenv==1.0.1
+ boto3==1.34.84
+ langchain-core==0.1.42
+ faiss-cpu==1.8.0
+ python-doctr==0.8.1
+ tf2onnx==1.16.1
+ tensorflow==2.15.0
retriever.py ADDED
@@ -0,0 +1,143 @@
+ import os
+
+ from dotenv import load_dotenv
+ from langchain.schema import Document
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai import ChatOpenAI
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ from doctr_ocr import pdf_extractor
+ from s3bucket import upload_to_s3
+
+ load_dotenv()
+
+ vector_database_name = "Adina_Vector_Database"
+ temp_pdf_folder = "temp-pdf-files"
+
+
+ def delete_temp_files():
+     for item in os.listdir(temp_pdf_folder):
+         file_path = os.path.join(temp_pdf_folder, item)
+         os.remove(file_path)
+
+
+ def initialize_vector_db():
+     embeddings = OpenAIEmbeddings()
+     vector_database = FAISS.from_texts(["Adina Cosmetic Ingredients"], embeddings)
+     vector_database.save_local(vector_database_name)
+
+
+ def get_vector_db(docs: list[Document]):
+     embeddings = OpenAIEmbeddings()
+
+     try:
+         currentVectorDatabase = FAISS.from_documents(docs, embeddings)
+         existingVectorDatabase = FAISS.load_local(
+             vector_database_name, embeddings, allow_dangerous_deserialization=True
+         )
+
+         existingVectorDatabase.merge_from(currentVectorDatabase)
+         existingVectorDatabase.save_local(vector_database_name)
+
+         return existingVectorDatabase
+
+     except Exception as e:
+         print(
+             "Warning: documents are empty or not in the expected format; the provided PDF(s) were not added to the vector database.",
+             e,
+         )
+         return FAISS.load_local(
+             vector_database_name, embeddings, allow_dangerous_deserialization=True
+         )
+
+
+ def load_and_split(uploaded_files):
+     if not os.path.exists(temp_pdf_folder):
+         os.makedirs(temp_pdf_folder)
+
+     docs = []
+     for file in uploaded_files:
+         local_filepath = os.path.join(temp_pdf_folder, file.name)
+         with open(local_filepath, "wb") as f:
+             f.write(file.getvalue())
+
+         if upload_to_s3(file_path=local_filepath, file_name=file.name):
+             print(f"\n{file.name} uploaded successfully.")
+         else:
+             print(f"\nFailed to upload {file.name}.")
+
+         text = pdf_extractor(local_filepath) or ""  # guard: pdf_extractor returns None on failure
+
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000, chunk_overlap=200
+         )
+         temp_docs = text_splitter.create_documents(text_splitter.split_text(text))
+         docs.extend(temp_docs)
+     delete_temp_files()
+     return docs
+
+
+ def get_retriever(uploaded_files):
+     if not os.path.exists(vector_database_name):
+         initialize_vector_db()
+
+     if len(uploaded_files) == 0:
+         embeddings = OpenAIEmbeddings()
+         vectorDatabase = FAISS.load_local(
+             vector_database_name, embeddings, allow_dangerous_deserialization=True
+         )
+
+         retriever = vectorDatabase.as_retriever()
+         return retriever
+
+     docs = load_and_split(uploaded_files)
+     vector_database = get_vector_db(docs=docs)
+
+     retriever = vector_database.as_retriever()
+     return retriever
+
+
+ def get_response(user_query, chat_history):
+     retriever = get_retriever(uploaded_files=[])
+     docs = retriever.invoke(user_query)
+
+     template = """
+     Your name is ADINA, and you provide helpful information about Adina Cosmetic Ingredients.
+     <rules>
+     - Answer the question based on the retrieved information only.
+     - If the question cannot be answered, simply say that you cannot answer it.
+     - Avoid mentioning that you are answering based on retrieved information.
+     </rules>
+     Apply the following mandatory considerations when responding to inquiries:
+     --- Tone - Respectful, Patient, and Encouraging:
+     Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when users are trying to learn something new.
+     Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
+     --- Clarity - Simple, Direct, and Unambiguous:
+     Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
+     Use bullet points or numbered lists to break down instructions or information, which can aid comprehension.
+     --- Structure - Organized, Consistent, and Considerate:
+     Include relevant examples or analogies that relate to experiences common in the user's lifetime, which can aid understanding of complex topics.
+     --- Empathy and Understanding - Compassionate and Responsive:
+     Recognize and validate the user's feelings or concerns. Phrases like "It's completely normal to find this challenging" can be comforting.
+     Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
+     Answer the following questions considering the history of the conversation and the retrieved information.
+     Chat history: {chat_history}
+     Retrieved information: {retrieved_info}
+     User question: {user_question}
+     """
+
+     prompt = ChatPromptTemplate.from_template(template)
+     llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)
+
+     chain = prompt | llm | StrOutputParser()
+
+     return chain.stream(
+         {
+             "chat_history": chat_history,
+             "retrieved_info": docs,
+             "user_question": user_query,
+         }
+     )
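get_response returns the output of chain.stream(), i.e. an iterator of text chunks, which is why app.py can pass it directly to st.write_stream. A minimal sketch of consuming it outside Streamlit, assuming the FAISS index exists and OPENAI_API_KEY is set; the question text is hypothetical:

from langchain_core.messages import AIMessage, HumanMessage

from retriever import get_response

history = [AIMessage(content="Hello, I am Adina. How can I help you?")]
for chunk in get_response("Which emulsifiers do you offer?", chat_history=history):
    print(chunk, end="", flush=True)  # each chunk is one piece of the streamed answer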
s3bucket.py ADDED
@@ -0,0 +1,29 @@
+ import os
+
+ import boto3
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ def upload_to_s3(file_path, file_name):
+     ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
+     SECRET_KEY = os.getenv("AWS_SECRET_KEY")
+
+     try:
+         # Initialize a session for DigitalOcean Spaces (S3-compatible endpoint).
+         session = boto3.session.Session()
+         client = session.client(
+             "s3",
+             region_name="ams3",
+             endpoint_url="https://ams3.digitaloceanspaces.com",
+             aws_access_key_id=ACCESS_KEY,
+             aws_secret_access_key=SECRET_KEY,
+         )
+
+         client.upload_file(Filename=file_path, Key=file_name, Bucket="adina-poc")
+         return True
+
+     except Exception as e:
+         print("Error uploading file to S3 bucket.", e)
+         return False
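The project reads its credentials from the environment (loaded from the .env file that .gitignore excludes). AWS_ACCESS_KEY and AWS_SECRET_KEY are referenced in s3bucket.py; OPENAI_API_KEY is an assumption, since the langchain-openai embedding and chat classes read it from the environment by default. A quick sanity-check sketch:

import os

from dotenv import load_dotenv

load_dotenv()
# AWS_* keys are used by s3bucket.py; OPENAI_API_KEY is assumed for OpenAIEmbeddings / ChatOpenAI
for var in ("AWS_ACCESS_KEY", "AWS_SECRET_KEY", "OPENAI_API_KEY"):
    print(f"{var}: {'set' if os.getenv(var) else 'missing'}")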