Spaces:

sifars
/

adina-poc

Sleeping

App Files Files Community

kanha-upadhyay committed on Apr 26, 2024

Commit

ceefdfd

1 Parent(s): 8a7c586

init project

Browse files

Files changed (7) hide show

.gitignore +5 -0
app.py +57 -0
doctr_ocr.py +17 -0
package.txt +1 -0
requirements.txt +13 -0
retriever.py +143 -0
s3bucket.py +29 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.env
+PDFs
+Adina_Vector_Database
+temp-pdf-files
+__pycache__/

app.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import streamlit as st
+from langchain_core.messages import AIMessage, HumanMessage
+from retriever import get_response, get_retriever
+st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
+st.title("Adina Cosmetic Ingredients")
+# last uploaded files
+if "last_uploaded_files" not in st.session_state:
+    st.session_state.last_uploaded_files = []
+# Initialize chat history
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = [
+        AIMessage(content="Hello, I am Adina. How can I help you?"),
+    ]
+# conversation
+for message in st.session_state.chat_history:
+    if isinstance(message, AIMessage):
+        with st.chat_message("AI"):
+            st.write(message.content)
+    elif isinstance(message, HumanMessage):
+        with st.chat_message("Human"):
+            st.write(message.content)
+user_query = st.chat_input("Type your message here...")
+if user_query is not None and user_query != "":
+    st.session_state.chat_history.append(HumanMessage(content=user_query))
+    with st.chat_message("Human"):
+        st.markdown(user_query)
+    with st.chat_message("AI"):
+        response = st.write_stream(
+            get_response(
+                user_query=user_query, chat_history=st.session_state.chat_history
+            )
+        )
+    st.session_state.chat_history.append(AIMessage(content=response))
+# File uploader
+uploaded_files = st.sidebar.file_uploader(
+    label="Upload files", type="pdf", accept_multiple_files=True
+)
+to_be_vectorised_files = [
+    item
+    for item in uploaded_files
+    if item.name not in st.session_state.last_uploaded_files
+]
+retriever = get_retriever(to_be_vectorised_files)
+st.session_state.last_uploaded_files.extend(
+    [item.name for item in to_be_vectorised_files]
+)

doctr_ocr.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from doctr.io import read_pdf
+from doctr.models import ocr_predictor
+# Build the OCR predictor once at import time so that every pdf_extractor()
+# call reuses this instance instead of constructing a new one per PDF.
+# NOTE(review): detect_orientation / straighten_pages presumably make the
+# pipeline cope with rotated scans - confirm against the docTR documentation.
+predictor = ocr_predictor(
+    pretrained=True,
+    detect_orientation=True,
+    straighten_pages=True,
+)
+def pdf_extractor(pdf_file_path: str):
+    try:
+        docs = read_pdf(pdf_file_path)
+        result = predictor(docs)
+        return result.render()
+    except Exception as e:
+        print(f"Error in pdf_extractor: {e}")

package.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ python3-opencv

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+langchain==0.1.16
+streamlit==1.33.0
+langchain-openai==0.1.3
+openai==1.17.1
+langchain-community==0.0.32
+langchain-text-splitters==0.0.1
+python-dotenv==1.0.1
+boto3==1.34.84
+langchain-core==0.1.42
+faiss-cpu==1.8.0
+python-doctr==0.8.1
+tf2onnx==1.16.1
+tensorflow==2.15.0

retriever.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import os
+from dotenv import load_dotenv
+from langchain.schema import Document
+from langchain_community.vectorstores import FAISS
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai.chat_models.azure import ChatOpenAI
+from langchain_openai.embeddings.azure import OpenAIEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from doctr_ocr import pdf_extractor
+from s3bucket import upload_to_s3
+# Load key/value pairs from a local .env file into the process environment.
+# NOTE(review): presumably this is where the OpenAI credentials used by the
+# embeddings and chat model come from - confirm against the deployment setup.
+load_dotenv()
+# Folder the FAISS index is saved to and loaded from.
+vector_database_name = "Adina_Vector_Database"
+# Scratch folder where uploaded PDFs are written before OCR; it is emptied
+# again by delete_temp_files().
+temp_pdf_folder = "temp-pdf-files"
+def delete_temp_files():
+    for item in os.listdir(temp_pdf_folder):
+        file_path = os.path.join(temp_pdf_folder, item)
+        os.remove(file_path)
+def initialize_vector_db():
+    embeddings = OpenAIEmbeddings()
+    vector_database = FAISS.from_texts(["Adina Cosmetic Ingredients"], embeddings)
+    vector_database.save_local(f"{vector_database_name}")
+def get_vector_db(docs: list[Document]):
+    embeddings = OpenAIEmbeddings()
+    try:
+        currentVectorDatabase = FAISS.from_documents(docs, embeddings)
+        existingVectorDatabase = FAISS.load_local(
+            f"{vector_database_name}", embeddings, allow_dangerous_deserialization=True
+        )
+        existingVectorDatabase.merge_from(currentVectorDatabase)
+        existingVectorDatabase.save_local(f"{vector_database_name}")
+        return existingVectorDatabase
+    except Exception as e:
+        print(
+            "!Warning : Document is empty or not in the correct format. Thus provided pdf(s) are not added to the vector database.",
+            e,
+        )
+        return FAISS.load_local(
+            f"{vector_database_name}", embeddings, allow_dangerous_deserialization=True
+        )
+def load_and_split(uploaded_files):
+    if not os.path.exists(temp_pdf_folder):
+        os.makedirs(temp_pdf_folder)
+    docs = []
+    for file in uploaded_files:
+        local_filepath = os.path.join(temp_pdf_folder, file.name)
+        with open(local_filepath, "wb") as f:
+            f.write(file.getvalue())
+        if upload_to_s3(file_path=local_filepath, file_name=file.name):
+            print(f"\n{file.name} uploaded successfully.")
+        else:
+            print(f"\nFailed to upload {file.name}.")
+        text = pdf_extractor(local_filepath)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000, chunk_overlap=200
+        )
+        temp_docs = text_splitter.create_documents(text_splitter.split_text(text))
+        docs.extend(temp_docs)
+    delete_temp_files()
+    return docs
+def get_retriever(uploaded_files):
+    if os.path.exists(f"{vector_database_name}") == False:
+        initialize_vector_db()
+    if len(uploaded_files) == 0:
+        embeddings = OpenAIEmbeddings()
+        vectorDatabase = FAISS.load_local(
+            f"{vector_database_name}", embeddings, allow_dangerous_deserialization=True
+        )
+        retriever = vectorDatabase.as_retriever()
+        return retriever
+    docs = load_and_split(uploaded_files)
+    vector_database = get_vector_db(docs=docs)
+    retriever = vector_database.as_retriever()
+    return retriever
+def get_response(user_query, chat_history):
+    retriever = get_retriever(uploaded_files=[])
+    docs = retriever.invoke(user_query)
+    template = """
+    Your name is ADINA, who provides helpful information about Adina Consmetic Ingredients.
+    <rules>
+    - Answer the question based on the retrieved information only.
+    - If the question can not be answered, simply say you can not annswer it.
+    - Avoid mentioning that you are answering based on retreived information.
+    </rules>
+    Execute the below mandatory considerations when responding to the inquiries:
+    --- Tone - Respectful, Patient, and Encouraging:
+        Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
+        Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
+    --- Clarity - Simple, Direct, and Unambiguous:
+        Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
+        Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
+    --- Structure - Organized, Consistent, and Considerate:
+        Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
+    --- Empathy and Understanding - Compassionate and Responsive:
+        Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
+        Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
+    Answer the following questions considering the history of the conversation and retrieved information.
+    Chat history: {chat_history}
+    retrieved information: {retrieved_info}
+    User question: {user_question}
+    """
+    prompt = ChatPromptTemplate.from_template(template)
+    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)
+    chain = prompt | llm | StrOutputParser()
+    return chain.stream(
+        {
+            "chat_history": chat_history,
+            "retrieved_info": docs,
+            "user_question": user_query,
+        }
+    )

s3bucket.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import os
+import boto3
+from dotenv import load_dotenv
+load_dotenv()
+def upload_to_s3(file_path, file_name):
+    ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
+    SECRET_KEY = os.getenv("AWS_SECRET_KEY")
+    try:
+        # Initialize a session using DigitalOcean Spaces.
+        session = boto3.session.Session()
+        client = session.client(
+            "s3",
+            region_name="ams3",
+            endpoint_url="https://ams3.digitaloceanspaces.com",
+            aws_access_key_id=ACCESS_KEY,
+            aws_secret_access_key=SECRET_KEY,
+        )
+        client.upload_file(Filename=file_path, Key=f"{file_name}", Bucket="adina-poc")
+        return True
+    except Exception as e:
+        print("Error uploading file to S3 bucket.", e)
+        return False