Spaces:

sifars
/

adina-poc

Sleeping

App Files Files Community

kanha-upadhyay committed on Apr 26

Commit

0ec6a0b

•

1 Parent(s): ceefdfd

first version

Browse files

Files changed (6) hide show

app.py +167 -45
doctr_ocr.py +0 -17
package.txt +2 -1
requirements.txt +2 -3
retriever.py +0 -143
s3bucket.py +0 -2

app.py CHANGED Viewed

@@ -1,57 +1,179 @@
 import streamlit as st
 from langchain_core.messages import AIMessage, HumanMessage
-from retriever import get_response, get_retriever
-st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
-st.title("Adina Cosmetic Ingredients")
-# last uploaded files
-if "last_uploaded_files" not in st.session_state:
-    st.session_state.last_uploaded_files = []
-# Initialize chat history
-if "chat_history" not in st.session_state:
-    st.session_state.chat_history = [
-        AIMessage(content="Hello, I am Adina. How can I help you?"),
-    ]
-# conversation
-for message in st.session_state.chat_history:
-    if isinstance(message, AIMessage):
-        with st.chat_message("AI"):
-            st.write(message.content)
-    elif isinstance(message, HumanMessage):
-        with st.chat_message("Human"):
-            st.write(message.content)
-user_query = st.chat_input("Type your message here...")
-if user_query is not None and user_query != "":
-    st.session_state.chat_history.append(HumanMessage(content=user_query))
-    with st.chat_message("Human"):
-        st.markdown(user_query)
-    with st.chat_message("AI"):
-        response = st.write_stream(
-            get_response(
-                user_query=user_query, chat_history=st.session_state.chat_history
-            )
         )
-    st.session_state.chat_history.append(AIMessage(content=response))
-# File uploader
-uploaded_files = st.sidebar.file_uploader(
-    label="Upload files", type="pdf", accept_multiple_files=True
-)
-to_be_vectorised_files = [
-    item
-    for item in uploaded_files
-    if item.name not in st.session_state.last_uploaded_files
-]
-retriever = get_retriever(to_be_vectorised_files)
-st.session_state.last_uploaded_files.extend(
-    [item.name for item in to_be_vectorised_files]
-)

+import os
+import pdf2image
+import pytesseract
 import streamlit as st
+from langchain_community.vectorstores import FAISS
 from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai.chat_models.azure import ChatOpenAI
+from langchain_openai.embeddings.azure import OpenAIEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from s3bucket import upload_to_s3
+# Path passed to FAISS save_local/load_local for the persisted index.
+vector_database_name = "Adina_Vector_Database"
+# Scratch folder where uploaded PDFs are written before being sent to S3.
+temp_pdf_folder = "temp-pdf-files"
+# Module-level retriever read by get_response; assigned in
+# append_to_vector_db and in the __main__ guard.
+RETRIEVER = None
+def delete_temp_files():
+    for item in os.listdir(temp_pdf_folder):
+        file_path = os.path.join(temp_pdf_folder, item)
+        os.remove(file_path)
+def extract_text(file):
+    if file.type == "application/pdf":
+        images = pdf2image.convert_from_bytes(file.getvalue())
+        text = ""
+        for img in images:
+            text += pytesseract.image_to_string(img)
+    else:
+        st.error("Invalid file type. Please upload pdf file.")
+        return None
+    return text
+def load_and_split(file):
+    if not os.path.exists(temp_pdf_folder):
+        os.makedirs(temp_pdf_folder)
+    local_filepath = os.path.join(temp_pdf_folder, file.name)
+    with open(local_filepath, "wb") as f:
+        f.write(file.getvalue())
+    upload_to_s3(file_path=local_filepath, file_name=file.name)
+    text = extract_text(file)
+    if text:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000, chunk_overlap=200
         )
+        texts = text_splitter.split_text(text)
+        docs = text_splitter.create_documents(
+            texts=texts, metadatas=[{"file_name": file.name}] * len(texts)
+        )
+    delete_temp_files()
+    return docs
+def initialize_vector_db():
+    vector_database = FAISS.from_texts(
+        ["Adina Cosmetic Ingredients"], OpenAIEmbeddings()
+    )
+    vector_database.save_local(f"{vector_database_name}")
+    return vector_database
+def load_vector_db():
+    if os.path.exists(f"{vector_database_name}"):
+        return FAISS.load_local(
+            f"{vector_database_name}",
+            OpenAIEmbeddings(),
+            allow_dangerous_deserialization=True,
+        )
+    return initialize_vector_db()
+def append_to_vector_db(docs: list = []):
+    global RETRIEVER
+    existing_vector_db = load_vector_db()
+    new_vector_db = FAISS.from_documents(docs, OpenAIEmbeddings())
+    existing_vector_db.merge_from(new_vector_db)
+    existing_vector_db.save_local(f"{vector_database_name}")
+    RETRIEVER = existing_vector_db.as_retriever()
+def create_embeddings(files: list = []):
+    for file in files:
+        docs = load_and_split(file)
+        append_to_vector_db(docs=docs)
+        st.session_state.last_uploaded_files.append(file.name)
+        print(file.name, "processed successfully.")
+def get_response(user_query, chat_history):
+    docs = RETRIEVER.invoke(user_query)
+    template = """
+    Your name is ADINA, who provides helpful information about Adina Consmetic Ingredients.
+    <rules>
+    - Answer the question based on the retrieved information only.
+    - If the question can not be answered, simply say you can not annswer it.
+    - Avoid mentioning that you are answering based on retreived information.
+    </rules>
+    Execute the below mandatory considerations when responding to the inquiries:
+    --- Tone - Respectful, Patient, and Encouraging:
+        Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
+        Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
+    --- Clarity - Simple, Direct, and Unambiguous:
+        Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
+        Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
+    --- Structure - Organized, Consistent, and Considerate:
+        Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
+    --- Empathy and Understanding - Compassionate and Responsive:
+        Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
+        Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
+    Answer the following questions considering the history of the conversation and retrieved information.
+    Chat history: {chat_history}
+    retrieved information: {retrieved_info}
+    User question: {user_question}
+    """
+    prompt = ChatPromptTemplate.from_template(template)
+    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)
+    chain = prompt | llm | StrOutputParser()
+    return chain.stream(
+        {
+            "chat_history": chat_history,
+            "retrieved_info": docs,
+            "user_question": user_query,
+        }
+    )
+def main():
+    st.set_page_config(page_title="Adina Cosmetic Ingredients", page_icon="")
+    st.title("Adina Cosmetic Ingredients")
+    if "last_uploaded_files" not in st.session_state:
+        st.session_state.last_uploaded_files = []
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = [
+            AIMessage(content="Hello, I am Adina. How can I help you?"),
+        ]
+    for message in st.session_state.chat_history:
+        if isinstance(message, AIMessage):
+            with st.chat_message("AI"):
+                st.write(message.content)
+        elif isinstance(message, HumanMessage):
+            with st.chat_message("Human"):
+                st.write(message.content)
+    user_query = st.chat_input("Type your message here...")
+    if user_query is not None and user_query != "":
+        st.session_state.chat_history.append(HumanMessage(content=user_query))
+        with st.chat_message("Human"):
+            st.markdown(user_query)
+        with st.chat_message("AI"):
+            response = st.write_stream(
+                get_response(
+                    user_query=user_query, chat_history=st.session_state.chat_history
+                )
+            )
+        st.session_state.chat_history.append(AIMessage(content=response))
+    uploaded_files = st.sidebar.file_uploader(
+        label="Upload files", type="pdf", accept_multiple_files=True
+    )
+    to_be_vectorised_files = [
+        item
+        for item in uploaded_files
+        if item.name not in st.session_state.last_uploaded_files
+    ]
+    if to_be_vectorised_files:
+        create_embeddings(to_be_vectorised_files)
+if __name__ == "__main__":
+    # Bind the module-level retriever from the persisted index (created if
+    # absent) before main() runs, because get_response reads RETRIEVER.
+    RETRIEVER = load_vector_db().as_retriever()
+    main()

doctr_ocr.py DELETED Viewed

@@ -1,17 +0,0 @@
-from doctr.io import read_pdf
-from doctr.models import ocr_predictor
-predictor = ocr_predictor(
-    pretrained=True,
-    detect_orientation=True,
-    straighten_pages=True,
-)
-def pdf_extractor(pdf_file_path: str):
-    try:
-        docs = read_pdf(pdf_file_path)
-        result = predictor(docs)
-        return result.render()
-    except Exception as e:
-        print(f"Error in pdf_extractor: {e}")

package.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- python3-opencv


1	+ tesseract-ocr
2	+ poppler-utils

requirements.txt CHANGED Viewed

@@ -8,6 +8,5 @@ python-dotenv==1.0.1
 boto3==1.34.84
 langchain-core==0.1.42
 faiss-cpu==1.8.0
-python-doctr==0.8.1
-tf2onnx==1.16.1
-tensorflow==2.15.0

 boto3==1.34.84
 langchain-core==0.1.42
 faiss-cpu==1.8.0
+pdf2image==1.17.0
+pytesseract==0.3.10

retriever.py DELETED Viewed

@@ -1,143 +0,0 @@
-import os
-from dotenv import load_dotenv
-from langchain.schema import Document
-from langchain_community.vectorstores import FAISS
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_openai.chat_models.azure import ChatOpenAI
-from langchain_openai.embeddings.azure import OpenAIEmbeddings
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from doctr_ocr import pdf_extractor
-from s3bucket import upload_to_s3
-load_dotenv()
-vector_database_name = "Adina_Vector_Database"
-temp_pdf_folder = "temp-pdf-files"
-def delete_temp_files():
-    for item in os.listdir(temp_pdf_folder):
-        file_path = os.path.join(temp_pdf_folder, item)
-        os.remove(file_path)
-def initialize_vector_db():
-    embeddings = OpenAIEmbeddings()
-    vector_database = FAISS.from_texts(["Adina Cosmetic Ingredients"], embeddings)
-    vector_database.save_local(f"{vector_database_name}")
-def get_vector_db(docs: list[Document]):
-    embeddings = OpenAIEmbeddings()
-    try:
-        currentVectorDatabase = FAISS.from_documents(docs, embeddings)
-        existingVectorDatabase = FAISS.load_local(
-            f"{vector_database_name}", embeddings, allow_dangerous_deserialization=True
-        )
-        existingVectorDatabase.merge_from(currentVectorDatabase)
-        existingVectorDatabase.save_local(f"{vector_database_name}")
-        return existingVectorDatabase
-    except Exception as e:
-        print(
-            "!Warning : Document is empty or not in the correct format. Thus provided pdf(s) are not added to the vector database.",
-            e,
-        )
-        return FAISS.load_local(
-            f"{vector_database_name}", embeddings, allow_dangerous_deserialization=True
-        )
-def load_and_split(uploaded_files):
-    if not os.path.exists(temp_pdf_folder):
-        os.makedirs(temp_pdf_folder)
-    docs = []
-    for file in uploaded_files:
-        local_filepath = os.path.join(temp_pdf_folder, file.name)
-        with open(local_filepath, "wb") as f:
-            f.write(file.getvalue())
-        if upload_to_s3(file_path=local_filepath, file_name=file.name):
-            print(f"\n{file.name} uploaded successfully.")
-        else:
-            print(f"\nFailed to upload {file.name}.")
-        text = pdf_extractor(local_filepath)
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000, chunk_overlap=200
-        )
-        temp_docs = text_splitter.create_documents(text_splitter.split_text(text))
-        docs.extend(temp_docs)
-    delete_temp_files()
-    return docs
-def get_retriever(uploaded_files):
-    if os.path.exists(f"{vector_database_name}") == False:
-        initialize_vector_db()
-    if len(uploaded_files) == 0:
-        embeddings = OpenAIEmbeddings()
-        vectorDatabase = FAISS.load_local(
-            f"{vector_database_name}", embeddings, allow_dangerous_deserialization=True
-        )
-        retriever = vectorDatabase.as_retriever()
-        return retriever
-    docs = load_and_split(uploaded_files)
-    vector_database = get_vector_db(docs=docs)
-    retriever = vector_database.as_retriever()
-    return retriever
-def get_response(user_query, chat_history):
-    retriever = get_retriever(uploaded_files=[])
-    docs = retriever.invoke(user_query)
-    template = """
-    Your name is ADINA, who provides helpful information about Adina Consmetic Ingredients.
-    <rules>
-    - Answer the question based on the retrieved information only.
-    - If the question can not be answered, simply say you can not annswer it.
-    - Avoid mentioning that you are answering based on retreived information.
-    </rules>
-    Execute the below mandatory considerations when responding to the inquiries:
-    --- Tone - Respectful, Patient, and Encouraging:
-        Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when they are trying to learn something new.
-        Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
-    --- Clarity - Simple, Direct, and Unambiguous:
-        Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
-        Use bullet points or numbered lists to break down instructions or information, which can aid in comprehension.
-    --- Structure - Organized, Consistent, and Considerate:
-        Include relevant examples or analogies that relate to experiences common in their lifetime, which can aid in understanding complex topics.
-    --- Empathy and Understanding - Compassionate and Responsive:
-        Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
-        Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
-    Answer the following questions considering the history of the conversation and retrieved information.
-    Chat history: {chat_history}
-    retrieved information: {retrieved_info}
-    User question: {user_question}
-    """
-    prompt = ChatPromptTemplate.from_template(template)
-    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", streaming=True)
-    chain = prompt | llm | StrOutputParser()
-    return chain.stream(
-        {
-            "chat_history": chat_history,
-            "retrieved_info": docs,
-            "user_question": user_query,
-        }
-    )

s3bucket.py CHANGED Viewed

@@ -22,8 +22,6 @@ def upload_to_s3(file_path, file_name):
         )
         client.upload_file(Filename=file_path, Key=f"{file_name}", Bucket="adina-poc")
-        return True
     except Exception as e:
         print("Error uploading file to S3 bucket.", e)
-        return False

         )
         client.upload_file(Filename=file_path, Key=f"{file_name}", Bucket="adina-poc")
     except Exception as e:
         print("Error uploading file to S3 bucket.", e)