Abhilashvj committed
Commit
7ba9210
1 Parent(s): 26add68

Update app.py

Files changed (1)
  1. app.py +70 -84
app.py CHANGED
@@ -1,3 +1,4 @@
+import datetime
 import json
 import logging
 import os
@@ -6,9 +7,9 @@ import sys
 import uuid
 from json import JSONDecodeError
 from pathlib import Path
-from tqdm.auto import tqdm
-import datetime
 from time import sleep
+
+import openai
 import pandas as pd
 import pinecone
 import streamlit as st
@@ -27,8 +28,7 @@ from haystack.nodes import (
 from haystack.pipelines import ExtractiveQAPipeline, Pipeline
 from markdown import markdown
 from sentence_transformers import SentenceTransformer
-
-import openai
+from tqdm.auto import tqdm

 # get API key from top-right dropdown on OpenAI website
 openai.api_key = st.secrets["OPENAI_API_KEY"]
@@ -36,10 +36,7 @@ index_name = "qa_demo"


 # connect to pinecone environment
-pinecone.init(
-    api_key=st.secrets["pinecone_apikey"],
-    environment="us-east1-gcp"
-)
+pinecone.init(api_key=st.secrets["pinecone_apikey"], environment="us-east1-gcp")
 index_name = "qa-demo"

 embed_model = "text-embedding-ada-002"
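
A note on this hunk: the script defines the index name twice, as "qa_demo" in the context line above and as "qa-demo" here, and the indexing block further down reassigns "qa_demo" again. Pinecone only accepts lowercase letters, digits, and hyphens in index names, so the underscored variant would be rejected; a single constant avoids the drift. A minimal sketch (the constant name is illustrative, not from the commit):

INDEX_NAME = "qa-demo"  # hyphenated: Pinecone rejects "_" in index names
pinecone.init(api_key=st.secrets["pinecone_apikey"], environment="us-east1-gcp")
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(INDEX_NAME, dimension=1536, metric="cosine")
index = pinecone.Index(INDEX_NAME)
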
@@ -49,7 +46,7 @@ preprocessor = PreProcessor(
     clean_header_footer=False,
     split_by="word",
     split_length=200,
-    split_respect_sentence_boundary=True
+    split_respect_sentence_boundary=True,
 )
 file_type_classifier = FileTypeClassifier()
 text_converter = TextConverter()
@@ -59,58 +56,37 @@ docx_converter = DocxToTextConverter()
 # check if the abstractive-question-answering index exists
 if index_name not in pinecone.list_indexes():
     # create the index if it does not exist
-    pinecone.create_index(
-        index_name,
-        dimension=1536,
-        metric="cosine"
-    )
+    pinecone.create_index(index_name, dimension=1536, metric="cosine")

 # connect to abstractive-question-answering index we created
 index = pinecone.Index(index_name)

-FILE_UPLOAD_PATH= "./data/uploads/"
+FILE_UPLOAD_PATH = "./data/uploads/"
 os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)

 limit = 3750

+
 def retrieve(query):
-    res = openai.Embedding.create(
-        input=[query],
-        engine=embed_model
-    )
+    res = openai.Embedding.create(input=[query], engine=embed_model)

     # retrieve from Pinecone
-    xq = res['data'][0]['embedding']
+    xq = res["data"][0]["embedding"]

     # get relevant contexts
     res = index.query(xq, top_k=3, include_metadata=True)
-    contexts = [
-        x['metadata']['text'] for x in res['matches']
-    ]
+    contexts = [x["metadata"]["text"] for x in res["matches"]]

     # build our prompt with the retrieved contexts included
-    prompt_start = (
-        "Answer the question based on the context below.\n\n"+
-        "Context:\n"
-    )
-    prompt_end = (
-        f"\n\nQuestion: {query}\nAnswer:"
-    )
+    prompt_start = "Answer the question based on the context below.\n\n" + "Context:\n"
+    prompt_end = f"\n\nQuestion: {query}\nAnswer:"
     # append contexts until hitting limit
     for i in range(1, len(contexts)):
         if len("\n\n---\n\n".join(contexts[:i])) >= limit:
-            prompt = (
-                prompt_start +
-                "\n\n---\n\n".join(contexts[:i-1]) +
-                prompt_end
-            )
+            prompt = prompt_start + "\n\n---\n\n".join(contexts[: i - 1]) + prompt_end
             break
-        elif i == len(contexts)-1:
-            prompt = (
-                prompt_start +
-                "\n\n---\n\n".join(contexts) +
-                prompt_end
-            )
+        elif i == len(contexts) - 1:
+            prompt = prompt_start + "\n\n---\n\n".join(contexts) + prompt_end
     return prompt, contexts

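One caveat the reformatting preserves: when retrieve() gets back a single context, the loop runs over range(1, 1), never assigns prompt, and the return raises UnboundLocalError. A hedged sketch of an accumulate-until-full variant (the helper name is illustrative, not part of the commit):

def build_prompt(query, contexts, limit=3750):
    # join as many contexts as fit under the character limit; works for 0, 1, or many
    prompt_start = "Answer the question based on the context below.\n\nContext:\n"
    prompt_end = f"\n\nQuestion: {query}\nAnswer:"
    joined = ""
    for context in contexts:
        candidate = joined + ("\n\n---\n\n" if joined else "") + context
        if len(candidate) >= limit:
            break
        joined = candidate
    return prompt_start + joined + prompt_end
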
@@ -118,17 +94,18 @@ def retrieve(query):
 def complete(prompt):
     # query text-davinci-003
     res = openai.Completion.create(
-        engine='text-davinci-003',
+        engine="text-davinci-003",
         prompt=prompt,
         temperature=0,
         max_tokens=400,
         top_p=1,
         frequency_penalty=0,
         presence_penalty=0,
-        stop=None
+        stop=None,
     )
-    return res['choices'][0]['text'].strip()
-
+    return res["choices"][0]["text"].strip()
+
+
 def query(question, top_k_reader, top_k_retriever):
     # first we retrieve relevant items from Pinecone
     query_with_contexts, contexts = retrieve(question)
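
retrieve() and complete() together form the plain retrieve-then-read loop that query() wraps. A usage sketch, assuming the Pinecone index is already populated and the OpenAI key is configured:

prompt, contexts = retrieve("My blog post discusses remote work. Give me statistics.")
answer = complete(prompt)
print(answer)  # completion grounded in the retrieved contexts
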
@@ -154,20 +131,29 @@ indexing_pipeline_with_classification.add_node(
     inputs=["TextConverter", "PdfConverter", "DocxConverter"],
 )

+
 def set_state_if_absent(key, value):
     if key not in st.session_state:
         st.session_state[key] = value

+
 # Adjust to a question that you would like users to see in the search bar when they load the UI:
-DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics.")
-DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "7% more remote workers have been at their current organization for 5 years or fewer")
+DEFAULT_QUESTION_AT_STARTUP = os.getenv(
+    "DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics."
+)
+DEFAULT_ANSWER_AT_STARTUP = os.getenv(
+    "DEFAULT_ANSWER_AT_STARTUP",
+    "7% more remote workers have been at their current organization for 5 years or fewer",
+)

 # Sliders
 DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
 DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))


-st.set_page_config(page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png")
+st.set_page_config(
+    page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png"
+)

 # Persistent state
 set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
@@ -181,6 +167,7 @@ def reset_results(*args):
     st.session_state.results = None
     st.session_state.raw_json = None

+
 # Title
 st.write("# GPT3 and Langchain Demo")
 st.markdown(
@@ -208,46 +195,52 @@ for data_file in data_files:
         f.write(data_file.getbuffer())
         ALL_FILES.append(file_path)
         st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; ✅ ")
-        META_DATA.append({"filename":data_file.name})
-
+        META_DATA.append({"filename": data_file.name})
+

 if len(ALL_FILES) > 0:
     # document_store.update_embeddings(retriever, update_existing_embeddings=False)
-    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)["documents"]
+    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)[
+        "documents"
+    ]
     index_name = "qa_demo"
     # we will use batches of 64
     batch_size = 200
     # docs = docs['documents']
-    with st.spinner(
-        "🧠 &nbsp;&nbsp; Performing indexing of uplaoded documents... \n "
-    ):
+    with st.spinner("🧠 &nbsp;&nbsp; Performing indexing of uploaded documents... \n "):
         for i in range(0, len(docs), batch_size):
             # find end of batch
-            i_end = min(i+batch_size, len(docs))
+            i_end = min(i + batch_size, len(docs))
             # extract batch
             batch = [doc.content for doc in docs[i:i_end]]
             # generate embeddings for batch
             try:
                 res = openai.Embedding.create(input=batch, engine=embed_model)
-            except:
+            except Exception as e:
                 done = False
-                while not done:
+                count = 0
+                while not done and count < 5:
                     sleep(5)
                     try:
                         res = openai.Embedding.create(input=batch, engine=embed_model)
                         done = True
                     except:
+                        count += 1
+
                         pass
-            embeds = [record['embedding'] for record in res['data']]
+                if count >= 5:
+                    st.error(f"🐞 File indexing failed: {str(e)}")
+
+            embeds = [record["embedding"] for record in res["data"]]
             # get metadata
             meta = [doc.meta for doc in docs[i:i_end]]
             # create unique IDs
             ids = [doc.id for doc in docs[i:i_end]]
             # add all to upsert list
-            to_upsert = list(zip(ids, emb, meta))
+            to_upsert = list(zip(ids, embeds, meta))
             # upsert/insert these records to pinecone
             _ = index.upsert(vectors=to_upsert)
-
+
 # top_k_reader = st.sidebar.slider(
 # "Max. number of answers",
 # min_value=1,
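
The retry added in this hunk waits a flat five seconds between attempts and gives up after five failures. If rate limiting is the usual failure mode, exponential backoff is the standard refinement; a sketch of a possible drop-in helper (the function name and backoff constants are illustrative, not from the commit):

def embed_with_backoff(batch, engine=embed_model, retries=5, base_delay=5):
    # retry openai.Embedding.create, doubling the wait after each failure
    delay = base_delay
    for attempt in range(retries):
        try:
            return openai.Embedding.create(input=batch, engine=engine)
        except Exception:
            if attempt == retries - 1:
                raise
            sleep(delay)
            delay *= 2
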
@@ -273,12 +266,12 @@ if len(ALL_FILES) > 0:
 # raw_json = upload_doc(data_file)

 question = st.text_input(
-    value=st.session_state.question,
-    max_chars=100,
-    on_change=reset_results,
-    label="question",
-    label_visibility="hidden",
-)
+    value=st.session_state.question,
+    max_chars=100,
+    on_change=reset_results,
+    label="question",
+    label_visibility="hidden",
+)
 col1, col2 = st.columns(2)
 col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
 col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
@@ -287,23 +280,19 @@ col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html
 run_pressed = col1.button("Run")
 if run_pressed:

-    run_query = (
-        run_pressed or question != st.session_state.question
-    )
+    run_query = run_pressed or question != st.session_state.question
 # Get results for query
 if run_query and question:
     reset_results()
     st.session_state.question = question

-    with st.spinner(
-        "🧠 &nbsp;&nbsp; Performing neural search on documents... \n "
-    ):
+    with st.spinner("🧠 &nbsp;&nbsp; Performing neural search on documents... \n "):
         try:
-            st.session_state.results = query(
-                question, top_k_reader=None, top_k_retriever=None
-            )
+            st.session_state.results = query(question, top_k_reader=None, top_k_retriever=None)
         except JSONDecodeError as je:
-            st.error("👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
+            st.error(
+                "👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?"
+            )
         except Exception as e:
             logging.exception(e)
             if "The server is busy processing requests" in str(e) or "503" in str(e):
@@ -316,7 +305,7 @@ if st.session_state.results:

     st.write("## Results:")

-    for result,contexts in st.session_state.results:
+    for result, contexts in st.session_state.results:
         # answer, context = result.answer, result.context
         # start_idx = context.find(answer)
         # end_idx = start_idx + len(answer)
@@ -328,8 +317,8 @@ if st.session_state.results:
             # unsafe_allow_html=True,
             # )
             st.write(
-                markdown(f"Answer: {result} \n Extracted from context {contexts}"),
-                unsafe_allow_html=True,
+                markdown(f"Answer: {result} \n Extracted from context {contexts}"),
+                unsafe_allow_html=True,
             )
         except:
             # filename = result.meta.get('filename', "")
@@ -338,9 +327,6 @@ if st.session_state.results:
             # unsafe_allow_html=True,
             # )
             st.write(
-                markdown(f"Answer: {result}"),
-                unsafe_allow_html=True,
+                markdown(f"Answer: {result}"),
+                unsafe_allow_html=True,
             )
-
-
-