kanha-upadhyay committed on
Commit 2669ae8
1 Parent(s): 8bc2404

add aws textract

Files changed (8):
  1. .gitignore +2 -0
  2. app.py +20 -31
  3. boto_client.py +54 -0
  4. packages.txt +0 -2
  5. poetry.lock +0 -0
  6. pyproject.toml +25 -0
  7. requirements.txt +1 -2
  8. s3bucket.py +0 -27
.gitignore CHANGED
@@ -3,3 +3,5 @@ PDFs
 Adina_Vector_Database
 temp-pdf-files
 __pycache__/
+pdf_files
+.venv

app.py CHANGED
@@ -1,7 +1,5 @@
 import os
 
-import pdf2image
-import pytesseract
 import streamlit as st
 from langchain_community.vectorstores import FAISS
 from langchain_core.messages import AIMessage, HumanMessage
@@ -11,7 +9,7 @@ from langchain_openai.chat_models.azure import ChatOpenAI
 from langchain_openai.embeddings.azure import OpenAIEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 
-from s3bucket import upload_to_s3
+from boto_client import extract_text_from_pdf
 
 vector_database_name = "Adina_Vector_Database"
 temp_pdf_folder = "temp-pdf-files"
@@ -28,29 +26,17 @@ def delete_temp_files():
         os.remove(file_path)
 
 
-def extract_text(file):
-    if file.type == "application/pdf":
-        images = pdf2image.convert_from_bytes(file.getvalue())
-        text = ""
-        for img in images:
-            text += pytesseract.image_to_string(img)
-    else:
-        st.error("Invalid file type. Please upload pdf file.")
-        return None
-    return text
-
-
 def load_and_split(file):
     if not os.path.exists(temp_pdf_folder):
         os.makedirs(temp_pdf_folder)
     local_filepath = os.path.join(temp_pdf_folder, file.name)
     with open(local_filepath, "wb") as f:
         f.write(file.getvalue())
-    upload_to_s3(file_path=local_filepath, file_name=file.name)
-    text = extract_text(file)
+    text = extract_text_from_pdf(file_path=local_filepath, file_name=file.name)
+    docs = []
     if text:
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000, chunk_overlap=200
+            chunk_size=512, chunk_overlap=100
         )
         texts = text_splitter.split_text(text)
         docs = text_splitter.create_documents(
@@ -90,31 +76,36 @@ def append_to_vector_db(docs: list = []):
 def create_embeddings(files: list = []):
     for file in files:
         docs = load_and_split(file)
-        append_to_vector_db(docs=docs)
-        st.session_state.last_uploaded_files.append(file.name)
-        print(f"{file.name} processed successfully")
-        st.toast(f"{file.name} processed successfully")
+        if docs:
+            append_to_vector_db(docs=docs)
+            st.session_state.last_uploaded_files.append(file.name)
+            st.toast(f"{file.name} processed successfully")
+            print(f"{file.name} processed successfully")
+        else:
+            st.toast(f"{file.name} could not be processed")
+            print(f"{file.name} could not be processed")
 
 
 def get_response(user_query, chat_history):
     docs = RETRIEVER.invoke(user_query)
     additional_info = RETRIEVER.invoke(
-        user_query
-        + ". ".join(
+        " ".join(
             [
                 message.content
-                for message in st.session_state.chat_history
+                for message in chat_history
                 if isinstance(message, HumanMessage)
             ]
         )
     )
-
+    docs_content = [doc.page_content for doc in docs]
+    for doc in additional_info:
+        if doc.page_content not in docs_content:
+            docs.append(doc)
     template = """
     Your name is ADINA, who provides helpful information about Adina Consmetic Ingredients.
     <rules>
-        - Answer the question based on the context and/or additional information only.
+        - Answer the question based on the context only.
         - If the question can not be answered, simply say you can not annswer it.
-        - Avoid mentioning that you are answering based on retreived information.
     </rules>
     Execute the below mandatory considerations when responding to the inquiries:
     --- Tone - Respectful, Patient, and Encouraging:
@@ -128,10 +119,9 @@ def get_response(user_query, chat_history):
     --- Empathy and Understanding - Compassionate and Responsive:
         Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
        Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
-    Answer the following questions considering the history of the conversation, context and/or additional information.
+    Answer the following questions considering the context and/or history of the conversation.
     Chat history: {chat_history}
     Context: {retrieved_info}
-    Additional Information: {additional_info}
     User question: {user_question}
     """
 
@@ -144,7 +134,6 @@ def get_response(user_query, chat_history):
         {
             "chat_history": chat_history,
            "retrieved_info": docs,
-            "additional_info": additional_info,
            "user_question": user_query,
        }
    )
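
The retrieval change above queries the vector store twice, once with the raw user question and once with the concatenated HumanMessage history, then merges the second result set into the first while skipping duplicates by page content. A minimal sketch of that merge and of the history-query construction, using hypothetical in-memory Document objects in place of the real FAISS retriever:

from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage

# Hypothetical retrieval results standing in for RETRIEVER.invoke(...).
docs = [Document(page_content="Adina supplies plant-derived emollients.")]
additional_info = [
    Document(page_content="Adina supplies plant-derived emollients."),  # duplicate
    Document(page_content="Standard lead time is 4-6 weeks."),          # new context
]

# Same de-duplication as the updated get_response(): a document from the
# history-based retrieval is kept only if its content is not already present.
docs_content = [doc.page_content for doc in docs]
for doc in additional_info:
    if doc.page_content not in docs_content:
        docs.append(doc)

# The history query itself is built from HumanMessage contents only,
# so AI responses never feed the second retrieval.
chat_history = [
    HumanMessage(content="What emollients do you offer?"),
    AIMessage(content="We offer several plant-derived emollients."),
]
history_query = " ".join(
    message.content for message in chat_history if isinstance(message, HumanMessage)
)

print(history_query)  # "What emollients do you offer?"
print(len(docs))      # 2 after de-duplication
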
boto_client.py ADDED
@@ -0,0 +1,54 @@
+import os
+import time
+
+import boto3
+from dotenv import load_dotenv
+from textractor import Textractor
+from textractor.data.constants import TextractFeatures
+from textractor.data.text_linearization_config import TextLinearizationConfig
+from textractor.visualizers.entitylist import EntityList
+
+load_dotenv()
+
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+AWS_ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL")
+AWS_REGION = os.getenv("AWS_REGION")
+AWS_S3_BUCKET_NAME = os.getenv("AWS_S3_BUCKET_NAME")
+
+
+def upload_to_s3(file_path, file_name):
+    s3 = boto3.client(
+        "s3",
+        region_name=AWS_REGION,
+        endpoint_url=AWS_ENDPOINT_URL,
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    )
+    s3.upload_file(Filename=file_path, Key=file_name, Bucket=AWS_S3_BUCKET_NAME)
+
+
+def analyze_pdf(file_name):
+    extractor = Textractor(region_name=AWS_REGION)
+    file = f"s3://{AWS_S3_BUCKET_NAME}/{file_name}"
+    document = extractor.start_document_analysis(
+        file_source=file,
+        features=[
+            TextractFeatures.LAYOUT,
+            TextractFeatures.TABLES,
+            TextractFeatures.FORMS,
+        ],
+        save_image=False,
+    )
+    text = ""
+    for page in document.pages:
+        text += page.get_text()
+    return text
+
+
+def extract_text_from_pdf(file_path, file_name):
+    try:
+        upload_to_s3(file_path, file_name)
+        return analyze_pdf(file_name=file_name)
+    except Exception as e:
+        print("Error extracting text from PDF:", e)
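
boto_client.py replaces the deleted s3bucket.py helper: it uploads the PDF to the configured bucket and then runs Textract's asynchronous document analysis on it. A minimal usage sketch, assuming the AWS_* variables are set in the environment or a .env file, the bucket already exists, and the file name shown here is hypothetical:

from boto_client import extract_text_from_pdf

# Uploads the local file to the configured S3 bucket, runs
# start_document_analysis with the LAYOUT, TABLES and FORMS features,
# and returns the extracted text (None if the upload or analysis fails).
text = extract_text_from_pdf(
    file_path="temp-pdf-files/ingredient-spec.pdf",  # hypothetical local path
    file_name="ingredient-spec.pdf",                 # key used in the S3 bucket
)

if text:
    print(text[:500])
else:
    print("Extraction failed; check the AWS credentials and bucket name.")

Note that Textractor is constructed with only region_name, so the analysis step reads credentials from boto3's default chain rather than the explicit keys passed to the S3 client.
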
packages.txt DELETED
@@ -1,2 +0,0 @@
-poppler-utils
-tesseract-ocr

poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,25 @@
+[tool.poetry]
+name = "adina-poc"
+version = "0.1.0"
+description = ""
+authors = ["Kanha Upadhyay <kanha.upadhyay@sifars.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.10"
+langchain = "0.1.16"
+streamlit = "1.33.0"
+langchain-openai = "0.1.3"
+openai = "1.17.1"
+langchain-community = "0.0.32"
+langchain-text-splitters = "0.0.1"
+python-dotenv = "1.0.1"
+boto3 = "1.34.84"
+langchain-core = "0.1.42"
+faiss-cpu = "1.8.0"
+amazon-textract-textractor = "1.7.1"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
requirements.txt CHANGED
@@ -8,5 +8,4 @@ python-dotenv==1.0.1
 boto3==1.34.84
 langchain-core==0.1.42
 faiss-cpu==1.8.0
-pdf2image==1.17.0
-pytesseract==0.3.10
+amazon-textract-textractor==1.7.1

s3bucket.py DELETED
@@ -1,27 +0,0 @@
-import os
-
-import boto3
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-def upload_to_s3(file_path, file_name):
-    ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
-    SECRET_KEY = os.getenv("AWS_SECRET_KEY")
-
-    try:
-        # Initialize a session using DigitalOcean Spaces.
-        session = boto3.session.Session()
-        client = session.client(
-            "s3",
-            region_name="ams3",
-            endpoint_url="https://ams3.digitaloceanspaces.com",
-            aws_access_key_id=ACCESS_KEY,
-            aws_secret_access_key=SECRET_KEY,
-        )
-
-        client.upload_file(Filename=file_path, Key=f"{file_name}", Bucket="adina-poc")
-
-    except Exception as e:
-        print("Error uploading file to S3 bucket.", e)