kanha-upadhyay committed on
Commit 2669ae8
1 Parent(s): 8bc2404

add aws textract

Files changed (8):
  1. .gitignore +2 -0
  2. app.py +20 -31
  3. boto_client.py +54 -0
  4. packages.txt +0 -2
  5. poetry.lock +0 -0
  6. pyproject.toml +25 -0
  7. requirements.txt +1 -2
  8. s3bucket.py +0 -27
.gitignore CHANGED
@@ -3,3 +3,5 @@ PDFs
 Adina_Vector_Database
 temp-pdf-files
 __pycache__/
+pdf_files
+.venv

app.py CHANGED
@@ -1,7 +1,5 @@
 import os
 
-import pdf2image
-import pytesseract
 import streamlit as st
 from langchain_community.vectorstores import FAISS
 from langchain_core.messages import AIMessage, HumanMessage
@@ -11,7 +9,7 @@ from langchain_openai.chat_models.azure import ChatOpenAI
 from langchain_openai.embeddings.azure import OpenAIEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 
-from s3bucket import upload_to_s3
+from boto_client import extract_text_from_pdf
 
 vector_database_name = "Adina_Vector_Database"
 temp_pdf_folder = "temp-pdf-files"
@@ -28,29 +26,17 @@ def delete_temp_files():
         os.remove(file_path)
 
 
-def extract_text(file):
-    if file.type == "application/pdf":
-        images = pdf2image.convert_from_bytes(file.getvalue())
-        text = ""
-        for img in images:
-            text += pytesseract.image_to_string(img)
-    else:
-        st.error("Invalid file type. Please upload pdf file.")
-        return None
-    return text
-
-
 def load_and_split(file):
     if not os.path.exists(temp_pdf_folder):
         os.makedirs(temp_pdf_folder)
     local_filepath = os.path.join(temp_pdf_folder, file.name)
     with open(local_filepath, "wb") as f:
         f.write(file.getvalue())
-    upload_to_s3(file_path=local_filepath, file_name=file.name)
-    text = extract_text(file)
+    text = extract_text_from_pdf(file_path=local_filepath, file_name=file.name)
+    docs = []
     if text:
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000, chunk_overlap=200
+            chunk_size=512, chunk_overlap=100
         )
         texts = text_splitter.split_text(text)
         docs = text_splitter.create_documents(
@@ -90,31 +76,36 @@ def append_to_vector_db(docs: list = []):
 def create_embeddings(files: list = []):
     for file in files:
         docs = load_and_split(file)
-        append_to_vector_db(docs=docs)
-        st.session_state.last_uploaded_files.append(file.name)
-        print(f"{file.name} processed successfully")
-        st.toast(f"{file.name} processed successfully")
+        if docs:
+            append_to_vector_db(docs=docs)
+            st.session_state.last_uploaded_files.append(file.name)
+            st.toast(f"{file.name} processed successfully")
+            print(f"{file.name} processed successfully")
+        else:
+            st.toast(f"{file.name} could not be processed")
+            print(f"{file.name} could not be processed")
 
 
 def get_response(user_query, chat_history):
     docs = RETRIEVER.invoke(user_query)
     additional_info = RETRIEVER.invoke(
-        user_query
-        + ". ".join(
+        " ".join(
             [
                 message.content
-                for message in st.session_state.chat_history
+                for message in chat_history
                 if isinstance(message, HumanMessage)
             ]
         )
     )
-
+    docs_content = [doc.page_content for doc in docs]
+    for doc in additional_info:
+        if doc.page_content not in docs_content:
+            docs.append(doc)
     template = """
     Your name is ADINA, who provides helpful information about Adina Consmetic Ingredients.
     <rules>
-        - Answer the question based on the context and/or additional information only.
+        - Answer the question based on the context only.
         - If the question can not be answered, simply say you can not annswer it.
-        - Avoid mentioning that you are answering based on retreived information.
     </rules>
     Execute the below mandatory considerations when responding to the inquiries:
     --- Tone - Respectful, Patient, and Encouraging:
@@ -128,10 +119,9 @@ def get_response(user_query, chat_history):
     --- Empathy and Understanding - Compassionate and Responsive:
         Recognize and validate their feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
        Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
-    Answer the following questions considering the history of the conversation, context and/or additional information.
+    Answer the following questions considering the context and/or history of the conversation.
     Chat history: {chat_history}
     Context: {retrieved_info}
-    Additional Information: {additional_info}
     User question: {user_question}
     """
 
@@ -144,7 +134,6 @@ def get_response(user_query, chat_history):
         {
             "chat_history": chat_history,
            "retrieved_info": docs,
-            "additional_info": additional_info,
            "user_question": user_query,
        }
    )
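
The retrieval change above queries the vector store twice, once with the raw user question and once with the concatenated HumanMessage history, then merges the second result set into the first while skipping duplicates by page content. A minimal sketch of that merge and of the history-query construction, using hypothetical in-memory Document objects in place of the real FAISS retriever:

from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage

# Hypothetical retrieval results standing in for RETRIEVER.invoke(...).
docs = [Document(page_content="Adina supplies plant-derived emollients.")]
additional_info = [
    Document(page_content="Adina supplies plant-derived emollients."),  # duplicate
    Document(page_content="Standard lead time is 4-6 weeks."),          # new context
]

# Same de-duplication as the updated get_response(): a document from the
# history-based retrieval is kept only if its content is not already present.
docs_content = [doc.page_content for doc in docs]
for doc in additional_info:
    if doc.page_content not in docs_content:
        docs.append(doc)

# The history query itself is built from HumanMessage contents only,
# so AI responses never feed the second retrieval.
chat_history = [
    HumanMessage(content="What emollients do you offer?"),
    AIMessage(content="We offer several plant-derived emollients."),
]
history_query = " ".join(
    message.content for message in chat_history if isinstance(message, HumanMessage)
)

print(history_query)  # "What emollients do you offer?"
print(len(docs))      # 2 after de-duplication
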
boto_client.py ADDED
@@ -0,0 +1,54 @@
+import os
+import time
+
+import boto3
+from dotenv import load_dotenv
+from textractor import Textractor
+from textractor.data.constants import TextractFeatures
+from textractor.data.text_linearization_config import TextLinearizationConfig
+from textractor.visualizers.entitylist import EntityList
+
+load_dotenv()
+
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+AWS_ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL")
+AWS_REGION = os.getenv("AWS_REGION")
+AWS_S3_BUCKET_NAME = os.getenv("AWS_S3_BUCKET_NAME")
+
+
+def upload_to_s3(file_path, file_name):
+    s3 = boto3.client(
+        "s3",
+        region_name=AWS_REGION,
+        endpoint_url=AWS_ENDPOINT_URL,
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    )
+    s3.upload_file(Filename=file_path, Key=file_name, Bucket=AWS_S3_BUCKET_NAME)
+
+
+def analyze_pdf(file_name):
+    extractor = Textractor(region_name=AWS_REGION)
+    file = f"s3://{AWS_S3_BUCKET_NAME}/{file_name}"
+    document = extractor.start_document_analysis(
+        file_source=file,
+        features=[
+            TextractFeatures.LAYOUT,
+            TextractFeatures.TABLES,
+            TextractFeatures.FORMS,
+        ],
+        save_image=False,
+    )
+    text = ""
+    for page in document.pages:
+        text += page.get_text()
+    return text
+
+
+def extract_text_from_pdf(file_path, file_name):
+    try:
+        upload_to_s3(file_path, file_name)
+        return analyze_pdf(file_name=file_name)
+    except Exception as e:
+        print("Error extracting text from PDF:", e)
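
boto_client.py replaces the deleted s3bucket.py helper: it uploads the PDF to the configured bucket and then runs Textract's asynchronous document analysis on it. A minimal usage sketch, assuming the AWS_* variables are set in the environment or a .env file, the bucket already exists, and the file name shown here is hypothetical:

from boto_client import extract_text_from_pdf

# Uploads the local file to the configured S3 bucket, runs
# start_document_analysis with the LAYOUT, TABLES and FORMS features,
# and returns the extracted text (None if the upload or analysis fails).
text = extract_text_from_pdf(
    file_path="temp-pdf-files/ingredient-spec.pdf",  # hypothetical local path
    file_name="ingredient-spec.pdf",                 # key used in the S3 bucket
)

if text:
    print(text[:500])
else:
    print("Extraction failed; check the AWS credentials and bucket name.")

Note that Textractor is constructed with only region_name, so the analysis step reads credentials from boto3's default chain rather than the explicit keys passed to the S3 client.
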
packages.txt DELETED
@@ -1,2 +0,0 @@
-poppler-utils
-tesseract-ocr

poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,25 @@
+[tool.poetry]
+name = "adina-poc"
+version = "0.1.0"
+description = ""
+authors = ["Kanha Upadhyay <kanha.upadhyay@sifars.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.10"
+langchain = "0.1.16"
+streamlit = "1.33.0"
+langchain-openai = "0.1.3"
+openai = "1.17.1"
+langchain-community = "0.0.32"
+langchain-text-splitters = "0.0.1"
+python-dotenv = "1.0.1"
+boto3 = "1.34.84"
+langchain-core = "0.1.42"
+faiss-cpu = "1.8.0"
+amazon-textract-textractor = "1.7.1"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
requirements.txt CHANGED
@@ -8,5 +8,4 @@ python-dotenv==1.0.1
 boto3==1.34.84
 langchain-core==0.1.42
 faiss-cpu==1.8.0
-pdf2image==1.17.0
-pytesseract==0.3.10
+amazon-textract-textractor==1.7.1

s3bucket.py DELETED
@@ -1,27 +0,0 @@
-import os
-
-import boto3
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-def upload_to_s3(file_path, file_name):
-    ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
-    SECRET_KEY = os.getenv("AWS_SECRET_KEY")
-
-    try:
-        # Initialize a session using DigitalOcean Spaces.
-        session = boto3.session.Session()
-        client = session.client(
-            "s3",
-            region_name="ams3",
-            endpoint_url="https://ams3.digitaloceanspaces.com",
-            aws_access_key_id=ACCESS_KEY,
-            aws_secret_access_key=SECRET_KEY,
-        )
-
-        client.upload_file(Filename=file_path, Key=f"{file_name}", Bucket="adina-poc")
-
-    except Exception as e:
-        print("Error uploading file to S3 bucket.", e)