import os

import streamlit as st
from langchain_community.vectorstores import FAISS
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from boto_client import extract_text_from_pdf
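
# Streamlit chat app for Adina Cosmetic Ingredients: uploaded PDFs are parsed
# by the boto_client helper, chunked and embedded, and stored in a local FAISS
# index that backs the conversational retriever.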

vector_database_name = "Adina_Vector_Database"
temp_pdf_folder = "temp-pdf-files"
vector_database_path = (
    f"{os.environ.get('VECTOR_DATABASE_PATH', '.')}/{vector_database_name}"
)

RETRIEVER = None


def delete_temp_files():
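    """Remove every file left in the temporary PDF folder."""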
    for item in os.listdir(temp_pdf_folder):
        file_path = os.path.join(temp_pdf_folder, item)
        os.remove(file_path)


def load_and_split(file):
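    """Write the uploaded file to disk, extract its text, and split it into
    overlapping 512-character Document chunks tagged with the file name."""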
    if not os.path.exists(temp_pdf_folder):
        os.makedirs(temp_pdf_folder)
    local_filepath = os.path.join(temp_pdf_folder, file.name)
    with open(local_filepath, "wb") as f:
        f.write(file.getvalue())
    text = extract_text_from_pdf(file_path=local_filepath, file_name=file.name)
    docs = []
    if text:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=512, chunk_overlap=100
        )
        texts = text_splitter.split_text(text)
        docs = text_splitter.create_documents(
            texts=texts, metadatas=[{"file_name": file.name}] * len(texts)
        )
    delete_temp_files()
    return docs


def initialize_vector_db():
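    """Create a fresh FAISS index seeded with a placeholder text and save it to disk."""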
    vector_database = FAISS.from_texts(
        ["Adina Cosmetic Ingredients"], OpenAIEmbeddings()
    )
    vector_database.save_local(vector_database_path)
    return vector_database


def load_vector_db():
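    """Load the persisted FAISS index from disk, creating one on first run."""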
    if os.path.exists(vector_database_path):
        return FAISS.load_local(
            vector_database_path,
            OpenAIEmbeddings(),
            allow_dangerous_deserialization=True,
        )
    return initialize_vector_db()


def append_to_vector_db(docs: list):
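    """Merge the new documents into the persisted index and refresh the global retriever."""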
    global RETRIEVER
    existing_vector_db = load_vector_db()
    new_vector_db = FAISS.from_documents(docs, OpenAIEmbeddings())
    existing_vector_db.merge_from(new_vector_db)
    existing_vector_db.save_local(vector_database_path)
    RETRIEVER = existing_vector_db.as_retriever()


def create_embeddings(files: list):
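    """Embed each uploaded file and record its name so it is not processed twice."""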
    for file in files:
        docs = load_and_split(file)
        if docs:
            append_to_vector_db(docs=docs)
            st.session_state.last_uploaded_files.append(file.name)
            st.toast(f"{file.name} processed successfully")
            print(f"{file.name} processed successfully")
        else:
            st.toast(f"{file.name} could not be processed")
            print(f"{file.name} could not be processed")


def get_response(user_query, chat_history):
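    """Retrieve context for the current query and for past user messages,
    then stream the model's answer grounded in that context."""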
    docs = RETRIEVER.invoke(user_query)
    additional_info = RETRIEVER.invoke(
        " ".join(
            [
                message.content
                for message in chat_history
                if isinstance(message, HumanMessage)
            ]
        )
    )
    docs_content = [doc.page_content for doc in docs]
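    # Add history-based hits that are not already among the query results.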
    for doc in additional_info:
        if doc.page_content not in docs_content:
            docs.append(doc)
    template = """
    Your name is ADINA, an assistant who provides helpful information about Adina Cosmetic Ingredients.
    <rules>
    - Answer the question based on the context only.
    - If the question cannot be answered from the context, simply say that you cannot answer it.
    </rules>
    Follow these mandatory considerations when responding to inquiries:
    --- Tone - Respectful, Patient, and Encouraging:
        Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
        Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
    --- Clarity - Simple, Direct, and Unambiguous:
        Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
        Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
    --- Structure - Organized, Consistent, and Considerate:
        Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
    --- Empathy and Understanding - Compassionate and Responsive:
        Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
        Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
    Answer the following question considering the context and the history of the conversation.
    Chat history: {chat_history}
    Context: {retrieved_info}
    User question: {user_question}
    """

    prompt = ChatPromptTemplate.from_template(template)
    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)

    chain = prompt | llm | StrOutputParser()

    return chain.stream(
        {
            "chat_history": chat_history,
            "retrieved_info": docs,
            "user_question": user_query,
        }
    )


def main():
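    """Render the chat UI, replay the history, answer new queries, and
    vectorise any newly uploaded PDFs."""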
    st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
    st.title("Adina Cosmetic Ingredients")
    if "last_uploaded_files" not in st.session_state:
        st.session_state.last_uploaded_files = []
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = [
            AIMessage(content="Hello, I am Adina. How can I help you?"),
        ]
    for message in st.session_state.chat_history:
        if isinstance(message, AIMessage):
            with st.chat_message("AI"):
                st.write(message.content)
        elif isinstance(message, HumanMessage):
            with st.chat_message("Human"):
                st.write(message.content)
    user_query = st.chat_input("Type your message here...")
    if user_query is not None and user_query != "":
        st.session_state.chat_history.append(HumanMessage(content=user_query))
        with st.chat_message("Human"):
            st.markdown(user_query)
        with st.chat_message("AI"):
            response = st.write_stream(
                get_response(
                    user_query=user_query, chat_history=st.session_state.chat_history
                )
            )
        st.session_state.chat_history.append(AIMessage(content=response))
    uploaded_files = st.sidebar.file_uploader(
        label="Upload files", type="pdf", accept_multiple_files=True
    )
    to_be_vectorised_files = [
        item
        for item in uploaded_files
        if item.name not in st.session_state.last_uploaded_files
    ]
    if to_be_vectorised_files:
        create_embeddings(to_be_vectorised_files)


if __name__ == "__main__":
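    # Build the retriever once from the persisted index before serving the UI.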
    RETRIEVER = load_vector_db().as_retriever()
    main()