adding comments

Files changed:
- app.py +52 -58
- auditqa/__pycache__/__init__.cpython-310.pyc +0 -0
- auditqa/__pycache__/doc_process.cpython-310.pyc +0 -0
- auditqa/__pycache__/process_chunks.cpython-310.pyc +0 -0
- auditqa/__pycache__/reports.cpython-310.pyc +0 -0
- auditqa/__pycache__/sample_questions.cpython-310.pyc +0 -0
- auditqa/doc_process.py +3 -0
- auditqa/engine/prompts.py +0 -68
- auditqa/reports.py +0 -6
- requirements.txt +1 -2
app.py
CHANGED
@@ -1,9 +1,7 @@
 import gradio as gr
 import pandas as pd
 import logging
-import numpy as np
 import os
-import time
 import re
 import json
 from uuid import uuid4
@@ -11,15 +9,11 @@ from datetime import datetime
 from pathlib import Path
 from huggingface_hub import CommitScheduler
 from auditqa.sample_questions import QUESTIONS
-from auditqa.engine.prompts import audience_prompts
 from auditqa.reports import files, report_list
-from auditqa.doc_process import process_pdf, get_local_qdrant
 from langchain.schema import (
     HumanMessage,
     SystemMessage,
 )
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain_community.llms import HuggingFaceEndpoint
 from auditqa.process_chunks import load_chunks, getconfig
 from langchain_community.chat_models.huggingface import ChatHuggingFace
@@ -27,15 +21,19 @@ from langchain.retrievers import ContextualCompressionRetriever
 from langchain.retrievers.document_compressors import CrossEncoderReranker
 from langchain_community.cross_encoders import HuggingFaceCrossEncoder
 from qdrant_client.http import models as rest
-#from qdrant_client import QdrantClient
 from dotenv import load_dotenv
-import pkg_resources
 load_dotenv()
+# token to allow access to the Hub; this token should also be
+# valid for calls made to Inference Endpoints
 HF_token = os.environ["HF_TOKEN"]
+
+# create the local logs repo
 JSON_DATASET_DIR = Path("json_dataset")
 JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
 JSON_DATASET_PATH = JSON_DATASET_DIR / f"logs-{uuid4()}.json"
 
+# the logs are written to a dataset repo
+# https://huggingface.co/spaces/Wauplin/space_to_dataset_saver
 scheduler = CommitScheduler(
     repo_id="GIZ/spaces_logs",
     repo_type="dataset",
@@ -44,38 +42,36 @@ scheduler = CommitScheduler(
 )
 
 model_config = getconfig("model_params.cfg")
-#installed_packages = pkg_resources.working_set
-#package_list_ = ""
-#for package in installed_packages:
-#    package_list_ = package_list_ + f"{package.key}=={package.version}\n"
-#print(package_list_)
 
 
-
-
-#
-#
+
+#### VECTOR STORE ####
+# reports contain the already created chunks from the Markdown version of the pdf reports;
+# document processing was done using: https://github.com/axa-group/Parsr
 vectorstores = load_chunks()
-# once the vectore embeddings are created we will qdrant client to access these
-#vectorstores = get_local_qdrant()
 
-
-
-#
+
+#### FUNCTIONS ####
+# the app UI and its functionality are inspired by and adapted from
+# https://huggingface.co/spaces/Ekimetrics/climate-question-answering
+
 
 def save_logs(logs) -> None:
+    """Every interaction with the app saves a log of the question and answer;
+    this is used to gather usage statistics of the app and to evaluate model performance.
+    """
     with scheduler.lock:
         with JSON_DATASET_PATH.open("a") as f:
            json.dump(logs, f)
            f.write("\n")
     logging.info("logging done")
-
+
+
 def make_html_source(source,i):
     """
     takes the text and converts it into html format for display in "source" side tab
     """
     meta = source.metadata
-    # content = source.page_content.split(":",1)[1].strip()
     content = source.page_content.strip()
 
     name = meta['filename']
@@ -120,8 +116,9 @@ def finish_chat():
     return (gr.update(interactive = True,value = ""))
 
 async def chat(query,history,sources,reports,subtype,year):
-    """taking a query and a message history, use a pipeline (reformulation, retriever, answering)
-    """
+    """taking a query and a message history, use a pipeline (reformulation, retriever, answering)
+    to yield a tuple of: (messages in gradio format / messages in langchain format, source documents)
+    """
 
     logging.info(f">> NEW QUESTION : {query}")
     logging.info(f"history:{history}")
@@ -133,13 +130,9 @@ async def chat(query,history,sources,reports,subtype,year):
     docs_html = ""
     output_query = ""
 
-    ##------------------------
-
-
-    else:
-        vectorstore = vectorstores["allreports"]
-
-    ###-------------------------------------Construct Filter------------------------------------
+    ##------------------------fetch collection from vectorstore------------------------------
+    vectorstore = vectorstores["allreports"]
+    ##---------------------construct filter for metadata filtering---------------------------
     if len(reports) == 0:
         ("defining filter for:{}:{}:{}".format(sources,subtype,year))
         filter=rest.Filter(
@@ -165,12 +158,18 @@ async def chat(query,history,sources,reports,subtype,year):
                 )])
 
 
-    ##------------------------------get context
+    ##------------------------------get context----------------------------------------------
     context_retrieved_lst = []
     question_lst= [query]
+
    for question in question_lst:
+        # the similarity score threshold can be used to adjust quality and quantity for the Retriever;
+        # however it needs balancing, as the retrieved results are used again by the Ranker
+        # to fetch the best among the retrieved results
        retriever = vectorstore.as_retriever(
-            search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.6,
+            search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.6,
+                "k": int(model_config.get('retriever','TOP_K')),
+                "filter":filter})
        model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
        compressor = CrossEncoderReranker(model=model, top_n=3)
        compression_retriever = ContextualCompressionRetriever(
@@ -187,7 +186,7 @@ async def chat(query,history,sources,reports,subtype,year):
        context_retrieved_formatted = format_docs(context_retrieved)
        context_retrieved_lst.append(context_retrieved_formatted)
 
-    ##-------------------Prompt
+    ##--------------------------------Prompt--------------------------------------------------
     SYSTEM_PROMPT = """
     You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports. Provide a clear and structured answer based on the passages/context provided and the guidelines.
     Guidelines:
@@ -209,15 +208,13 @@ async def chat(query,history,sources,reports,subtype,year):
     """.format(context = context_retrieved_lst, question=query)
 
     messages = [
-
-
-
-
+        SystemMessage(content=SYSTEM_PROMPT),
+        HumanMessage(
+            content=USER_PROMPT
+        ),]
 
-
+    ##-----------------------getting inference endpoints------------------------------
 
-    # llama-3_1 endpoint = https://howaqfw0lpap12sg.us-east-1.aws.endpoints.huggingface.cloud
-    # llama-3 endpoint = https://nhe9phsr2zhs0e36.eu-west-1.aws.endpoints.huggingface.cloud
     #callbacks = [StreamingStdOutCallbackHandler()]
     llm_qa = HuggingFaceEndpoint(
         endpoint_url= model_config.get('reader','ENDPOINT'),
@@ -226,10 +223,10 @@ async def chat(query,history,sources,reports,subtype,year):
         timeout=70,
         huggingfacehub_api_token=HF_token,)
 
-    # create
+    # create RAG
     chat_model = ChatHuggingFace(llm=llm_qa)
 
-
+    ##-------------------------- get answers ---------------------------------------
     answer_lst = []
     for question, context in zip(question_lst , context_retrieved_lst):
         answer = chat_model.invoke(messages)
@@ -249,10 +246,9 @@ async def chat(query,history,sources,reports,subtype,year):
 
     yield history,docs_html
 
-
+    # logging the event
     try:
         timestamp = str(datetime.now().timestamp())
-        #file_store = "/data/logs/" + timestamp + ".json"
         logs = {
             "system_prompt": SYSTEM_PROMPT,
             "sources":sources,
@@ -271,12 +267,10 @@ async def chat(query,history,sources,reports,subtype,year):
     except Exception as e:
         logging.error(e)
 
-#process_pdf()
 
 
-
-
-# --------------------------------------------------------------------
+
+#### Gradio App ####
 
 # Set up Gradio Theme
 theme = gr.themes.Base(
@@ -323,13 +317,13 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
     with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
         # creating tabs on right panel
         with gr.Tabs() as tabs:
-
+            #---------------- tab for REPORTS SELECTION ----------------------
 
            with gr.Tab("Reports",elem_id = "tab-config",id = 2):
                gr.Markdown("Reminder: To get better results select the specific report/reports")
 
 
-
+                #----- first level filter for selecting the report source/category ----------
                dropdown_sources = gr.Radio(
                    ["Consolidated", "District","Ministry"],
                    label="Select Report Category",
@@ -337,19 +331,19 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
                    interactive=True,
                )
 
-
+                #------ second level filter for selecting the subtype within the report category selected above
                dropdown_category = gr.Dropdown(
                    list(files["Consolidated"].keys()),
                    value = list(files["Consolidated"].keys())[0],
                    label = "Filter for Sub-Type",
                    interactive=True)
 
-
+                #----------- update the second level filter based on values from the first level ----------------
                def rs_change(rs):
                    return gr.update(choices=files[rs], value=list(files[rs].keys())[0])
                dropdown_sources.change(fn=rs_change, inputs=[dropdown_sources], outputs=[dropdown_category])
 
-
+                #--------- select the years for reports -------------------------------------
                dropdown_year = gr.Dropdown(
                    ['2018','2019','2020','2021','2022'],
                    label="Filter for year",
@@ -358,7 +352,7 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
                    interactive=True,
                )
                gr.Markdown("-------------------------------------------------------------------------")
-
+                #---------------- another way to select reports, across category and sub-type ------------
                dropdown_reports = gr.Dropdown(
                    report_list,
                    label="Or select specific reports",
@@ -396,7 +390,7 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
                )
 
                samples.append(group_examples)
-
+            ##------------------- tab for Sources reporting ------------------
            with gr.Tab("Sources",elem_id = "tab-citations",id = 1):
                sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
                docs_textbox = gr.State("")
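Note on the filter built in chat() above: the hunk shows rest.Filter being constructed, but the conditions fall outside the diff context. Below is a minimal, self-contained sketch of the qdrant_client filter pattern it relies on; the payload keys and the example values are assumptions for illustration, not the app's actual schema.

from qdrant_client.http import models as rest

def build_filter(source: str, subtype: str, years: list[str]) -> rest.Filter:
    # must = AND semantics: a chunk matches only if every condition holds
    return rest.Filter(
        must=[
            # assumed payload keys; the app's real metadata schema may differ
            rest.FieldCondition(key="metadata.source", match=rest.MatchValue(value=source)),
            rest.FieldCondition(key="metadata.subtype", match=rest.MatchValue(value=subtype)),
            # MatchAny gives OR semantics across the selected years
            rest.FieldCondition(key="metadata.year", match=rest.MatchAny(any=years)),
        ]
    )

report_filter = build_filter("Ministry", "Ministry of Water and Environment", ["2021", "2022"])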
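The comment added at the retriever explains the threshold/reranker trade-off. As a worked sketch of that two-stage retrieval (not the app's exact code: the in-memory store, embedding model, and k value are illustrative), a generous threshold-filtered vector search feeds a cross-encoder that keeps only the best few:

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant

# toy in-memory store standing in for the collections returned by load_chunks()
vectorstore = Qdrant.from_texts(
    ["The 2021 consolidated report flagged unspent balances in several ministries."],
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    location=":memory:",
)

# stage 1: cast a wide net -- keep up to k chunks whose similarity clears the threshold
# (the 0.6 threshold mirrors the diff; it may need tuning per embedding model)
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.6, "k": 20},
)

# stage 2: a cross-encoder rescores the candidates and keeps only the top 3
reranker = CrossEncoderReranker(
    model=HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base"),
    top_n=3,
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=reranker, base_retriever=retriever,
)

docs = compression_retriever.get_relevant_documents("What did the consolidated report flag?")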
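The reader setup is split across two hunks above; condensed here into one hedged sketch, with a placeholder endpoint URL and illustrative generation parameters standing in for the values the app reads from model_params.cfg:

import os
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.chat_models.huggingface import ChatHuggingFace
from langchain.schema import HumanMessage, SystemMessage

llm_qa = HuggingFaceEndpoint(
    endpoint_url="https://example.endpoints.huggingface.cloud",  # placeholder, not the app's endpoint
    max_new_tokens=512,   # illustrative generation parameters
    temperature=0.1,
    timeout=70,
    huggingfacehub_api_token=os.environ["HF_TOKEN"],
)

# ChatHuggingFace wraps the raw endpoint so it accepts chat-style message lists
chat_model = ChatHuggingFace(llm=llm_qa)

messages = [
    SystemMessage(content="Answer only from the passages provided."),
    HumanMessage(content="Passages: ...\nQuestion: What did the audit find?"),
]
answer = chat_model.invoke(messages)
print(answer.content)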
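The logging half of the diff follows the append-only JSON-lines pattern from the linked Wauplin Space. A condensed sketch of that pattern; folder_path and path_in_repo are assumptions, since the scheduler's remaining arguments fall outside the hunk:

import json
from pathlib import Path
from uuid import uuid4
from huggingface_hub import CommitScheduler

JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
# one file per app instance, so concurrent replicas never write to the same file
JSON_DATASET_PATH = JSON_DATASET_DIR / f"logs-{uuid4()}.json"

scheduler = CommitScheduler(
    repo_id="GIZ/spaces_logs",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,  # assumption: the local folder synced to the repo
    path_in_repo="logs",           # assumption: target folder inside the dataset repo
)

def save_logs(logs: dict) -> None:
    # holding the lock keeps the background commit from uploading a half-written line
    with scheduler.lock:
        with JSON_DATASET_PATH.open("a") as f:
            json.dump(logs, f)
            f.write("\n")

save_logs({"question": "What did the audit find?", "answer": "..."})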
auditqa/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (127 Bytes)

auditqa/__pycache__/doc_process.cpython-310.pyc
ADDED
Binary file (3.22 kB)

auditqa/__pycache__/process_chunks.cpython-310.pyc
ADDED
Binary file (3.29 kB)

auditqa/__pycache__/reports.cpython-310.pyc
ADDED
Binary file (1.68 kB)

auditqa/__pycache__/sample_questions.cpython-310.pyc
ADDED
Binary file (3.65 kB)
auditqa/doc_process.py
CHANGED
@@ -10,6 +10,9 @@ from qdrant_client import QdrantClient
 from auditqa.reports import files, report_list
 device = 'cuda' if cuda.is_available() else 'cpu'
 
+### This script is NO LONGER IN USE #####
+# preprocessed report pdfs are brought along with chunks and added to the existing reports database
+
 # path to the pdf files
 path_to_data = "./data/pdf/"
 
auditqa/engine/prompts.py
DELETED
@@ -1,68 +0,0 @@
-llama_propmt = """<|begin_of_text|>
-<|start_header_id|>system<|end_header_id|>
-You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
-Guidelines:
-- If the passages have useful facts or numbers, use them in your answer.
-- When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
-- Do not use the sentence 'Doc i says ...' to say where information came from.
-- If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
-- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
-- If it makes sense, use bullet points and lists to make your answers easier to understand.
-- You do not need to use every passage. Only use the ones that help answer the question.
-- If the documents do not have the information needed to answer the question, just say you do not have enough information.
-<|eot_id|>
-<|start_header_id|>user<|end_header_id|>
-Passages:
-{context}
------------------------
-Question: {question} - Explained to {audience}
-Answer in {language} with the passages citations:
-<|eot_id|>
-<|start_header_id|>assistant<|end_header_id|>
-"""
-system_propmt = """
-You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
-Guidelines:
-- If the passages have useful facts or numbers, use them in your answer.
-- When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
-- Do not use the sentence 'Doc i says ...' to say where information came from.
-- If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
-- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
-- If it makes sense, use bullet points and lists to make your answers easier to understand.
-- You do not need to use every passage. Only use the ones that help answer the question.
-- If the documents do not have the information needed to answer the question, just say you do not have enough information.
-"""
-user_propmt = """
-Passages:
-{context}
------------------------
-Question: {question} - Explained to {audience}
-Answer in {language} with the passages citations:
-"""
-
-answer_prompt_template = """
-You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
-Guidelines:
-- If the passages have useful facts or numbers, use them in your answer.
-- When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
-- Do not use the sentence 'Doc i says ...' to say where information came from.
-- If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
-- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
-- If it makes sense, use bullet points and lists to make your answers easier to understand.
-- You do not need to use every passage. Only use the ones that help answer the question.
-- If the documents do not have the information needed to answer the question, just say you do not have enough information.
-- Consider by default that the question is about the past century unless it is specified otherwise.
-- If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
------------------------
-Passages:
-{context}
------------------------
-Question: {question} - Explained to {audience}
-Answer in {language} with the passages citations:
-"""
-
-audience_prompts = {
-    "children": "6 year old children that don't know anything about audit and governance and need metaphors to learn",
-    "general": "the general public who know the basics in audit and governance and want to learn more about it without technical terms. Still use references to passages.",
-    "experts": "expert and climate scientists that are not afraid of technical terms",
-}
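For reference, the deleted file used plain str.format templates. A minimal sketch of how they were presumably interpolated (the template is abbreviated and the values are hypothetical; placeholder names match the deleted file):

answer_prompt_template = """
Passages:
{context}
-----------------------
Question: {question} - Explained to {audience}
Answer in {language} with the passages citations:
"""

audience_prompts = {
    "children": "6 year old children that don't know anything about audit and governance and need metaphors to learn",
    "general": "the general public who know the basics in audit and governance and want to learn more about it without technical terms. Still use references to passages.",
    "experts": "expert and climate scientists that are not afraid of technical terms",
}

prompt = answer_prompt_template.format(
    context="[Doc 1] The consolidated report flagged unspent balances...",  # hypothetical passage
    question="What did the consolidated audit find?",                       # hypothetical question
    audience=audience_prompts["general"],
    language="English",
)
print(prompt)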
auditqa/reports.py
CHANGED
@@ -1,9 +1,3 @@
-POSSIBLE_REPORTS = [
-    "Consolidated2021",
-    "MWTS2021",
-    "MWTS2022"
-]
-
 report_list = ['Annual Consolidated OAG audit reports 2018',
                'Annual Consolidated OAG audit reports 2019',
                'Annual Consolidated OAG audit reports 2020',
requirements.txt
CHANGED
@@ -1,6 +1,5 @@
 langchain~=0.1.0
 langchain-huggingface==0.0.3
-#langchainhub~=0.1.14
 python-dotenv
 transformers>=4.35.2
 huggingface_hub==0.23.5
@@ -8,4 +7,4 @@ sentence_transformers~=3.0.1
 langchain-qdrant==0.1.3
 qdrant-client~=1.10.1
 PyMuPDF~=1.23.7
-sentencepiece
+sentencepiece==0.2.0