Spaces:

sifars
/

adina-poc

Sleeping

App Files Files Community

test

by kanha-upadhyay - opened Apr 26

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+236

-2983

Files changed (10) hide show

.gitignore +0 -2
app.py +43 -170
boto_client.py +0 -54
doctr_ocr.py +17 -0
package.txt +1 -0
poetry.lock +0 -0
pyproject.toml +0 -25
requirements.txt +3 -1
retriever.py +143 -0
s3bucket.py +29 -0

.gitignore CHANGED Viewed

@@ -3,5 +3,3 @@ PDFs
 Adina_Vector_Database
 temp-pdf-files
 __pycache__/
-pdf_files
-.venv

 Adina_Vector_Database
 temp-pdf-files
 __pycache__/

app.py CHANGED Viewed

@@ -1,184 +1,57 @@
-import os
 import streamlit as st
-from langchain_community.vectorstores import FAISS
 from langchain_core.messages import AIMessage, HumanMessage
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_openai.chat_models.azure import ChatOpenAI
-from langchain_openai.embeddings.azure import OpenAIEmbeddings
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from boto_client import extract_text_from_pdf
-vector_database_name = "Adina_Vector_Database"
-temp_pdf_folder = "temp-pdf-files"
-vector_database_path = (
-    f"{os.environ.get('VECTOR_DATABASE_PATH', '.')}/{vector_database_name}"
-)
-RETRIEVER = None
-def delete_temp_files():
-    for item in os.listdir(temp_pdf_folder):
-        file_path = os.path.join(temp_pdf_folder, item)
-        os.remove(file_path)
-def load_and_split(file):
-    if not os.path.exists(temp_pdf_folder):
-        os.makedirs(temp_pdf_folder)
-    local_filepath = os.path.join(temp_pdf_folder, file.name)
-    with open(local_filepath, "wb") as f:
-        f.write(file.getvalue())
-    text = extract_text_from_pdf(file_path=local_filepath, file_name=file.name)
-    docs = []
-    if text:
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=512, chunk_overlap=100
-        )
-        texts = text_splitter.split_text(text)
-        docs = text_splitter.create_documents(
-            texts=texts, metadatas=[{"file_name": file.name}] * len(texts)
-        )
-    delete_temp_files()
-    return docs
-def initialize_vector_db():
-    vector_database = FAISS.from_texts(
-        ["Adina Cosmetic Ingredients"], OpenAIEmbeddings()
-    )
-    vector_database.save_local(vector_database_path)
-    return vector_database
-def load_vector_db():
-    if os.path.exists(vector_database_path):
-        return FAISS.load_local(
-            vector_database_path,
-            OpenAIEmbeddings(),
-            allow_dangerous_deserialization=True,
-        )
-    return initialize_vector_db()
-def append_to_vector_db(docs: list = []):
-    global RETRIEVER
-    existing_vector_db = load_vector_db()
-    new_vector_db = FAISS.from_documents(docs, OpenAIEmbeddings())
-    existing_vector_db.merge_from(new_vector_db)
-    existing_vector_db.save_local(vector_database_path)
-    RETRIEVER = existing_vector_db.as_retriever()
-def create_embeddings(files: list = []):
-    for file in files:
-        docs = load_and_split(file)
-        if docs:
-            append_to_vector_db(docs=docs)
-            st.session_state.last_uploaded_files.append(file.name)
-            st.toast(f"{file.name} processed successfully")
-            print(f"{file.name} processed successfully")
-        else:
-            st.toast(f"{file.name} could not be processed")
-            print(f"{file.name} could not be processed")
-def get_response(user_query, chat_history):
-    docs = RETRIEVER.invoke(user_query)
-    additional_info = RETRIEVER.invoke(
-        " ".join(
-            [
-                message.content
-                for message in chat_history
-                if isinstance(message, HumanMessage)
-            ]
         )
-    )
-    docs_content = [doc.page_content for doc in docs]
-    for doc in additional_info:
-        if doc.page_content not in docs_content:
-            docs.append(doc)
-    template = """
-    Your name is ADINA, who provides helpful information about Adina Consmetic Ingredients.
-    <rules>
-    - Answer the question based on the context only.
-    - If the question can not be answered, simply say you can not annswer it.
-    </rules>
-    Execute the below mandatory considerations when responding to the inquiries:
-    --- Tone - Respectful, Patient, and Encouraging:
-        Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
-        Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
-    --- Clarity - Simple, Direct, and Unambiguous:
-        Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
-        Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
-    --- Structure - Organized, Consistent, and Considerate:
-        Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
-    --- Empathy and Understanding - Compassionate and Responsive:
-        Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
-        Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
-    Answer the following questions considering the context and/or history of the conversation.
-    Chat history: {chat_history}
-    Context: {retrieved_info}
-    User question: {user_question}
-    """
-    prompt = ChatPromptTemplate.from_template(template)
-    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)
-    chain = prompt | llm | StrOutputParser()
-    return chain.stream(
-        {
-            "chat_history": chat_history,
-            "retrieved_info": docs,
-            "user_question": user_query,
-        }
-    )
-def main():
-    st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
-    st.title("Adina Cosmetic Ingredients")
-    if "last_uploaded_files" not in st.session_state:
-        st.session_state.last_uploaded_files = []
-    if "chat_history" not in st.session_state:
-        st.session_state.chat_history = [
-            AIMessage(content="Hello, I am Adina. How can I help you?"),
-        ]
-    for message in st.session_state.chat_history:
-        if isinstance(message, AIMessage):
-            with st.chat_message("AI"):
-                st.write(message.content)
-        elif isinstance(message, HumanMessage):
-            with st.chat_message("Human"):
-                st.write(message.content)
-    user_query = st.chat_input("Type your message here...")
-    if user_query is not None and user_query != "":
-        st.session_state.chat_history.append(HumanMessage(content=user_query))
-        with st.chat_message("Human"):
-            st.markdown(user_query)
-        with st.chat_message("AI"):
-            response = st.write_stream(
-                get_response(
-                    user_query=user_query, chat_history=st.session_state.chat_history
-                )
-            )
-        st.session_state.chat_history.append(AIMessage(content=response))
-    uploaded_files = st.sidebar.file_uploader(
-        label="Upload files", type="pdf", accept_multiple_files=True
-    )
-    to_be_vectorised_files = [
-        item
-        for item in uploaded_files
-        if item.name not in st.session_state.last_uploaded_files
-    ]
-    if to_be_vectorised_files:
-        create_embeddings(to_be_vectorised_files)
-if __name__ == "__main__":
-    RETRIEVER = load_vector_db().as_retriever()
-    main()

 import streamlit as st
 from langchain_core.messages import AIMessage, HumanMessage
+from retriever import get_response, get_retriever
+st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
+st.title("Adina Cosmetic Ingredients")
+# Names of the PDFs already indexed in this session, so that a Streamlit
+# rerun does not process the same upload a second time.
+if "last_uploaded_files" not in st.session_state:
+    st.session_state.last_uploaded_files = []
+# Seed the chat history with the assistant greeting on first load.
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = [
+        AIMessage(content="Hello, I am Adina. How can I help you?"),
+    ]
+# Replay the stored conversation so it survives reruns.
+for message in st.session_state.chat_history:
+    if isinstance(message, AIMessage):
+        with st.chat_message("AI"):
+            st.write(message.content)
+    elif isinstance(message, HumanMessage):
+        with st.chat_message("Human"):
+            st.write(message.content)
+user_query = st.chat_input("Type your message here...")
+# None (nothing submitted) and "" are both falsy, so one truthiness test
+# covers the two cases.
+if user_query:
+    st.session_state.chat_history.append(HumanMessage(content=user_query))
+    with st.chat_message("Human"):
+        st.markdown(user_query)
+    with st.chat_message("AI"):
+        response = st.write_stream(
+            get_response(
+                user_query=user_query, chat_history=st.session_state.chat_history
+            )
         )
+    st.session_state.chat_history.append(AIMessage(content=response))
+# File uploader
+uploaded_files = st.sidebar.file_uploader(
+    label="Upload files", type="pdf", accept_multiple_files=True
+)
+to_be_vectorised_files = [
+    item
+    for item in uploaded_files
+    if item.name not in st.session_state.last_uploaded_files
+]
+# Only touch the vector database when there is something new to index.
+# Calling get_retriever() with an empty list on every rerun would reload the
+# index from disk for a result this script never uses (get_response() builds
+# its own retriever for each query).
+if to_be_vectorised_files:
+    retriever = get_retriever(to_be_vectorised_files)
+    st.session_state.last_uploaded_files.extend(
+        [item.name for item in to_be_vectorised_files]
+    )

boto_client.py DELETED Viewed

@@ -1,54 +0,0 @@
-import os
-import time
-import boto3
-from dotenv import load_dotenv
-from textractor import Textractor
-from textractor.data.constants import TextractFeatures
-from textractor.data.text_linearization_config import TextLinearizationConfig
-from textractor.visualizers.entitylist import EntityList
-load_dotenv()
-AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
-AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
-AWS_ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL")
-AWS_REGION = os.getenv("AWS_REGION")
-AWS_S3_BUCKET_NAME = os.getenv("AWS_S3_BUCKET_NAME")
-def upload_to_s3(file_path, file_name):
-    s3 = boto3.client(
-        "s3",
-        region_name=AWS_REGION,
-        endpoint_url=AWS_ENDPOINT_URL,
-        aws_access_key_id=AWS_ACCESS_KEY_ID,
-        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
-    )
-    s3.upload_file(Filename=file_path, Key=file_name, Bucket=AWS_S3_BUCKET_NAME)
-def analyze_pdf(file_name):
-    extractor = Textractor(region_name=AWS_REGION)
-    file = f"s3://{AWS_S3_BUCKET_NAME}/{file_name}"
-    document = extractor.start_document_analysis(
-        file_source=file,
-        features=[
-            TextractFeatures.LAYOUT,
-            TextractFeatures.TABLES,
-            # TextractFeatures.FORMS,
-        ],
-        save_image=False,
-    )
-    text = ""
-    for page in document.pages:
-        text += page.get_text()
-    return text
-def extract_text_from_pdf(file_path, file_name):
-    try:
-        upload_to_s3(file_path, file_name)
-        return analyze_pdf(file_name=file_name)
-    except Exception as e:
-        print("Error extracting text from PDF:", e)

doctr_ocr.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from doctr.io import read_pdf
+from doctr.models import ocr_predictor
+# OCR predictor built once at import time with pretrained weights and reused
+# by every pdf_extractor() call; page-orientation detection and page
+# straightening are both switched on.
+predictor = ocr_predictor(
+    pretrained=True,
+    detect_orientation=True,
+    straighten_pages=True,
+)
+def pdf_extractor(pdf_file_path: str) -> str:
+    """Run OCR over a PDF file and return the recognised text.
+
+    Args:
+        pdf_file_path: Path of the PDF on local disk.
+
+    Returns:
+        The text rendered from the OCR result, or an empty string when the
+        file could not be read or the OCR step failed.
+    """
+    try:
+        pages = read_pdf(pdf_file_path)
+        return predictor(pages).render()
+    except Exception as e:
+        # Best-effort extraction: report the failure and hand back an empty
+        # string, so callers that go on to split the text never receive None.
+        print(f"Error in pdf_extractor: {e}")
+        return ""

package.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ python3-opencv

poetry.lock DELETED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml DELETED Viewed

@@ -1,25 +0,0 @@
-[tool.poetry]
-name = "adina-poc"
-version = "0.1.0"
-description = ""
-authors = ["Kanha Upadhyay <kanha.upadhyay@sifars.com>"]
-readme = "README.md"
-[tool.poetry.dependencies]
-python = "^3.10"
-langchain = "0.1.16"
-streamlit = "1.33.0"
-langchain-openai = "0.1.3"
-openai = "1.17.1"
-langchain-community = "0.0.32"
-langchain-text-splitters = "0.0.1"
-python-dotenv = "1.0.1"
-boto3 = "1.34.84"
-langchain-core = "0.1.42"
-faiss-cpu = "1.8.0"
-amazon-textract-textractor = "1.7.1"
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"

requirements.txt CHANGED Viewed

@@ -8,4 +8,6 @@ python-dotenv==1.0.1
 boto3==1.34.84
 langchain-core==0.1.42
 faiss-cpu==1.8.0
-amazon-textract-textractor==1.7.1

 boto3==1.34.84
 langchain-core==0.1.42
 faiss-cpu==1.8.0
+python-doctr==0.8.1
+tf2onnx==1.16.1
+tensorflow==2.15.0

retriever.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import os
+from dotenv import load_dotenv
+from langchain.schema import Document
+from langchain_community.vectorstores import FAISS
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai.chat_models.azure import ChatOpenAI
+from langchain_openai.embeddings.azure import OpenAIEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from doctr_ocr import pdf_extractor
+from s3bucket import upload_to_s3
+# Read settings from a local .env file (if any) into the process environment.
+load_dotenv()
+# Folder name under which the FAISS index is saved and loaded; it is used as
+# a relative path, so it resolves against the current working directory.
+vector_database_name = "Adina_Vector_Database"
+# Scratch folder where uploaded PDFs are written before OCR.
+temp_pdf_folder = "temp-pdf-files"
+def delete_temp_files():
+    """Remove every regular file from the temporary PDF folder.
+
+    Does nothing when the folder does not exist. Sub-directories are left in
+    place, because os.remove() cannot delete a directory and would raise.
+    """
+    if not os.path.isdir(temp_pdf_folder):
+        return
+    for item in os.listdir(temp_pdf_folder):
+        file_path = os.path.join(temp_pdf_folder, item)
+        if os.path.isfile(file_path):
+            os.remove(file_path)
+def initialize_vector_db():
+    """Create the on-disk FAISS index, seeded with a single placeholder text.
+
+    The index is written under ``vector_database_name``; nothing is returned.
+    """
+    seed_texts = ["Adina Cosmetic Ingredients"]
+    seed_index = FAISS.from_texts(seed_texts, OpenAIEmbeddings())
+    seed_index.save_local(vector_database_name)
+def get_vector_db(docs: list[Document]):
+    """Merge ``docs`` into the saved FAISS index and return the resulting store.
+
+    Args:
+        docs: Documents to embed and add. May be empty.
+
+    Returns:
+        The vector store loaded from ``vector_database_name``. When ``docs`` is
+        non-empty and embedding, merging and saving all succeed, it contains
+        the new documents and has been written back to disk. Otherwise the
+        store is returned as it currently exists on disk.
+
+    Raises:
+        Whatever ``FAISS.load_local`` raises when the saved index cannot be
+        loaded; that failure is not masked.
+    """
+    embeddings = OpenAIEmbeddings()
+    def load_saved_index():
+        # NOTE(review): allow_dangerous_deserialization=True unpickles the
+        # saved index. That is only safe while the index folder is written
+        # exclusively by this application.
+        return FAISS.load_local(
+            vector_database_name, embeddings, allow_dangerous_deserialization=True
+        )
+    existing_vector_database = load_saved_index()
+    if not docs:
+        # Nothing to embed: say so explicitly instead of provoking an
+        # exception inside FAISS.from_documents() and catching it below.
+        print(
+            "!Warning : Document is empty or not in the correct format. Thus provided pdf(s) are not added to the vector database."
+        )
+        return existing_vector_database
+    try:
+        current_vector_database = FAISS.from_documents(docs, embeddings)
+        existing_vector_database.merge_from(current_vector_database)
+        existing_vector_database.save_local(vector_database_name)
+        return existing_vector_database
+    except Exception as e:
+        print(
+            "!Warning : Document is empty or not in the correct format. Thus provided pdf(s) are not added to the vector database.",
+            e,
+        )
+        # The in-memory store may be half-merged at this point, so hand back
+        # a fresh copy of what is actually persisted.
+        return load_saved_index()
+def load_and_split(uploaded_files):
+    """Turn uploaded PDFs into text chunks ready for embedding.
+
+    Each file is written to the temporary folder, passed to upload_to_s3(),
+    run through pdf_extractor(), and its text split into overlapping chunks.
+    Files that yield no text are skipped. The temporary folder is emptied
+    afterwards, even if processing raised.
+
+    Args:
+        uploaded_files: Iterable of uploaded-file objects exposing ``.name``
+            and ``.getvalue()``.
+
+    Returns:
+        A list with the chunk documents of every file that produced text.
+    """
+    os.makedirs(temp_pdf_folder, exist_ok=True)
+    # One splitter serves every file; its settings do not vary per file.
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    docs = []
+    try:
+        for file in uploaded_files:
+            local_filepath = os.path.join(temp_pdf_folder, file.name)
+            with open(local_filepath, "wb") as f:
+                f.write(file.getvalue())
+            # A failed upload is reported but does not stop local processing.
+            if upload_to_s3(file_path=local_filepath, file_name=file.name):
+                print(f"\n{file.name} uploaded successfully.")
+            else:
+                print(f"\nFailed to upload {file.name}.")
+            text = pdf_extractor(local_filepath)
+            if not text:
+                # pdf_extractor() gives back nothing usable when OCR fails;
+                # skip the file instead of crashing inside the splitter.
+                print(f"\nNo text could be extracted from {file.name}.")
+                continue
+            docs.extend(text_splitter.create_documents(text_splitter.split_text(text)))
+    finally:
+        # Always clear the scratch folder so a failure does not leave PDFs behind.
+        delete_temp_files()
+    return docs
+def get_retriever(uploaded_files):
+    """Return a retriever over the saved vector database.
+
+    The database is created first if it does not exist yet. When files are
+    given they are processed with load_and_split() and merged into the
+    database through get_vector_db() before the retriever is built.
+
+    Args:
+        uploaded_files: Uploaded PDF objects to index; an empty list (or
+            None) just loads the existing database.
+
+    Returns:
+        The object returned by ``as_retriever()`` on the vector store.
+    """
+    if not os.path.exists(vector_database_name):
+        initialize_vector_db()
+    if not uploaded_files:
+        # NOTE(review): allow_dangerous_deserialization=True unpickles the
+        # saved index; acceptable only while this app is its sole writer.
+        vector_database = FAISS.load_local(
+            vector_database_name,
+            OpenAIEmbeddings(),
+            allow_dangerous_deserialization=True,
+        )
+    else:
+        vector_database = get_vector_db(docs=load_and_split(uploaded_files))
+    return vector_database.as_retriever()
+def get_response(user_query, chat_history):
+    """Answer ``user_query`` from the vector database, streaming the reply.
+
+    The query is used to fetch matching documents from the saved index, and
+    those documents, the chat history and the question are filled into the
+    ADINA prompt and sent to the chat model.
+
+    Args:
+        user_query: The question typed by the user.
+        chat_history: The conversation so far, inserted into the prompt as-is.
+
+    Returns:
+        The iterator produced by ``chain.stream(...)``; the chain ends in a
+        StrOutputParser.
+    """
+    # An empty upload list makes get_retriever() load the existing index only.
+    retriever = get_retriever(uploaded_files=[])
+    docs = retriever.invoke(user_query)
+    template = """
+    Your name is ADINA, who provides helpful information about Adina Cosmetic Ingredients.
+    <rules>
+    - Answer the question based on the retrieved information only.
+    - If the question can not be answered, simply say you can not answer it.
+    - Avoid mentioning that you are answering based on retrieved information.
+    </rules>
+    Execute the below mandatory considerations when responding to the inquiries:
+    --- Tone - Respectful, Patient, and Encouraging:
+        Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
+        Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
+    --- Clarity - Simple, Direct, and Unambiguous:
+        Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
+        Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
+    --- Structure - Organized, Consistent, and Considerate:
+        Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
+    --- Empathy and Understanding - Compassionate and Responsive:
+        Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
+        Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
+    Answer the following questions considering the history of the conversation and retrieved information.
+    Chat history: {chat_history}
+    retrieved information: {retrieved_info}
+    User question: {user_question}
+    """
+    prompt = ChatPromptTemplate.from_template(template)
+    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)
+    chain = prompt | llm | StrOutputParser()
+    return chain.stream(
+        {
+            "chat_history": chat_history,
+            "retrieved_info": docs,
+            "user_question": user_query,
+        }
+    )

s3bucket.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import os
+import boto3
+from dotenv import load_dotenv
+load_dotenv()
+def upload_to_s3(
+    file_path,
+    file_name,
+    *,
+    bucket_name="adina-poc",
+    region_name="ams3",
+    endpoint_url="https://ams3.digitaloceanspaces.com",
+):
+    """Upload a local file to an S3-compatible bucket.
+
+    Credentials are read from the AWS_ACCESS_KEY and AWS_SECRET_KEY
+    environment variables. The defaults target the DigitalOcean Spaces
+    endpoint and bucket this project uses.
+
+    Args:
+        file_path: Path of the local file to upload.
+        file_name: Object key to store the file under.
+        bucket_name: Destination bucket.
+        region_name: Region passed to the S3 client.
+        endpoint_url: Endpoint of the S3-compatible service.
+
+    Returns:
+        True when the upload succeeded, False when any error occurred (the
+        error is printed, not raised).
+    """
+    ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
+    SECRET_KEY = os.getenv("AWS_SECRET_KEY")
+    try:
+        # Initialize a session using DigitalOcean Spaces.
+        session = boto3.session.Session()
+        client = session.client(
+            "s3",
+            region_name=region_name,
+            endpoint_url=endpoint_url,
+            aws_access_key_id=ACCESS_KEY,
+            aws_secret_access_key=SECRET_KEY,
+        )
+        client.upload_file(Filename=file_path, Key=f"{file_name}", Bucket=bucket_name)
+        return True
+    except Exception as e:
+        # Deliberate best-effort: callers branch on the boolean result.
+        print("Error uploading file to S3 bucket.", e)
+        return False