import os
import gradio as gr
from operator import itemgetter
from pinecone import Pinecone
from huggingface_hub import whoami
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain.prompts.prompt import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain_community.vectorstores import Pinecone as PineconeVectorstore
from eki_esrsqa.utils import (
    make_html_source,
    make_pairs,
    _format_chat_history,
    _combine_documents,
    get_llm,
    init_env,
)

init_env()
chat_model_init = get_llm()
demo_name = "ESRS_QA"
hf_model = "BAAI/bge-base-en-v1.5"
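
# BGE embeddings with normalized vectors, so inner-product search behaves like
# cosine similarity; this should match how the Pinecone index was built.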
embeddings = HuggingFaceBgeEmbeddings(
    model_name=hf_model,
    encode_kwargs={"normalize_embeddings": True},
)
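
# Connect to the existing Pinecone index; "page_content" is the field the
# vector store reads each document's text from.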
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(os.getenv("PINECONE_API_INDEX"))
vectorstore = PineconeVectorstore(index, embeddings, "page_content")
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
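
# With no arguments, AzureChatOpenAI is expected to pick up its endpoint, key,
# deployment and API version from environment variables (presumably set by
# init_env()).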
chat_model = AzureChatOpenAI()
esrs_wiki = """
The Corporate Sustainability Reporting Directive (CSRD) is a mandate that requires all companies to report on their sustainability initiatives. In response to this directive, the European Sustainability Reporting Standards (ESRS) were developed. These standards are a key tool in promoting the transition to a sustainable economy within the EU, providing a structured framework for companies to disclose their sustainability initiatives. The ESRS cover a wide range of environmental, social, and governance (ESG) issues, including climate change, biodiversity, and human rights. Companies that adhere to the ESRS can provide investors with valuable insights into their sustainability impact, thereby informing investment decisions. The ESRS are designed to be highly interoperable with global reporting standards, which helps to avoid unnecessary duplication in reporting by companies. The reporting requirements based on the ESRS will be gradually implemented for different companies over time. In summary, the ESRS play a critical role in fostering sustainable finance and enabling companies to demonstrate their commitment to the green deal agenda while accessing sustainable finance.
---
"""
reformulation_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(reformulation_template)
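
# The condensed standalone question (not the raw follow-up) is what gets
# embedded and sent to the retriever, so follow-ups keep their context.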
answering_template = """
You are an ESG expert with 20 years of experience analyzing corporate sustainability reports.
You specialize in the upcoming CSRD regulation and, more generally, in corporate sustainability disclosure requirements.
{esrs_wiki}
You will answer the question based on the following passages extracted from CSRD specific sustainability guidelines and reports:
```
{context}
```
Guidelines:
1. Context: You'll receive relevant excerpts from a CSRD-specific sustainability guideline or report to address a given question.
2. Relevance: Only include passages directly pertaining to the question; omit irrelevant content.
3. Facts and Figures: Prioritize factual information in your response.
4. Conciseness: Keep answers sharp and succinct, avoiding unnecessary context.
5. Focus: Address the specific question without veering into related topics.
6. Honesty: If unsure, state that you don't know rather than inventing an answer.
7. Source Attribution: When using information from a passage, mention it as [Doc i] at the end of the sentence (where 'i' represents the document number).
8. Multiple Sources: If the same content appears in multiple documents, cite them collectively (e.g., [Doc i, Doc j, Doc k]).
9. Structured Paragraphs: Instead of bullet-point summaries, compile your responses into well-structured paragraphs.
10. Method Focus: When addressing "how" questions, emphasize methods and procedures over outcomes.
11. Selective Usage: You're not obligated to use every passage; include only those relevant to the question.
12. Insufficient Information: If documents lack necessary details, indicate that you don't have enough information.
Question: {question}
Answer:
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(answering_template)
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
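
# When building {context}, each retrieved Document is rendered as its bare
# page_content and the results are concatenated by _combine_documents.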
memory = ConversationBufferMemory(
    return_messages=True, output_key="answer", input_key="question"
)

# First we add a step to load memory
# This adds a "memory" key to the input object
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)

# Now we calculate the standalone question
standalone_question = {
    "standalone_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: _format_chat_history(x["chat_history"]),
    }
    | CONDENSE_QUESTION_PROMPT
    | chat_model
    | StrOutputParser(),
}

# Now we retrieve the documents
retrieved_documents = {
    "docs": itemgetter("standalone_question") | retriever,
    "question": lambda x: x["standalone_question"],
}

# Now we construct the inputs for the final prompt
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"], DEFAULT_DOCUMENT_PROMPT),
    "question": itemgetter("question"),
    "esrs_wiki": lambda x: esrs_wiki,
}

# And finally, we do the part that returns the answers
answer = {
    "answer": final_inputs | ANSWER_PROMPT | chat_model,
    "docs": itemgetter("docs"),
}

# And now we put it all together!
final_chain = loaded_memory | standalone_question | retrieved_documents | answer
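
# Sketch of direct usage outside the Gradio UI (the output keys come from the
# `answer` dict above):
#   out = final_chain.invoke({"question": "What does ESRS E1 cover?"})
#   out["answer"].content  # the model's reply
#   out["docs"]            # the retrieved Documents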
async def chat(
    query: str,
    history: list | None = None,
):
    """Run the pipeline (reformulation, retrieval, answering) on a query and a
    message history, yielding tuples of (textbox value, messages in gradio
    format, messages in langchain format, source documents as HTML)."""
    history = history or []
    source_string = ""
    gradio_format = make_pairs([a.content for a in history]) + [(query, "")]

    # Reset memory and replay the chat history into it.
    memory.clear()
    for message in history:
        memory.chat_memory.add_message(message)

    inputs = {"question": query}
    result = final_chain.astream_log(inputs)

    # JSONPatch paths emitted by astream_log; "AzureChatOpenAI:2" is the second
    # LLM run in the chain, i.e. the answering step.
    reformulated_question_path_id = "/logs/AzureChatOpenAI/streamed_output_str/-"
    retriever_path_id = "/logs/Retriever/final_output"
    final_answer_path_id = "/logs/AzureChatOpenAI:2/streamed_output_str/-"

    async for op in result:
        op = op.ops[0]
        if op["path"] == reformulated_question_path_id:  # reformulated question
            new_token = op["value"]  # str (not displayed)
        elif op["path"] == retriever_path_id:  # documents
            sources = op["value"]["documents"]  # List[Document]
            source_string = "\n\n".join(
                [make_html_source(i, doc) for i, doc in enumerate(sources, 1)]
            )
        elif op["path"] == final_answer_path_id:  # final answer
            new_token = op["value"]  # str
            answer_yet = gradio_format[-1][1]
            gradio_format[-1] = (query, answer_yet + new_token)
        yield "", gradio_format, history, source_string

    memory.save_context(inputs, {"answer": gradio_format[-1][1]})
    yield "", gradio_format, memory.load_memory_variables({})["history"], source_string

with open("./assets/style.css", "r") as f:
    css = f.read()

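# Gate the UI on Hugging Face login: anonymous users would see bloc_1, members
# of the ekimetrics-esrsqa org the app itself (bloc_2), and other logged-in
# users bloc_3. (The blocks and the demo.load hook are currently commented out.)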
def update_visible(oauth_token: gr.OAuthToken | None):
    if oauth_token is None:  # not logged in
        return {
            bloc_1: gr.update(visible=True),
            bloc_2: gr.update(visible=False),
            bloc_3: gr.update(visible=False),
        }
    org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
    if "ekimetrics-esrsqa" in org_names:  # logged in and in the org
        return {
            bloc_1: gr.update(visible=False),
            bloc_2: gr.update(visible=True),
            bloc_3: gr.update(visible=False),
        }
    else:  # logged in but not in the org
        return {
            bloc_1: gr.update(visible=False),
            bloc_2: gr.update(visible=False),
            bloc_3: gr.update(visible=True),
        }

# Set up Gradio Theme
theme = gr.themes.Base(
    primary_hue="blue",
    secondary_hue="red",
    font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
)

init_prompt = """
Hello, I am ESRS Q&A, a conversational assistant designed to help you understand the content of European Sustainability Reporting Standards (ESRS). I will answer your questions based **on the official definition of each ESRS as well as guidelines**.
⚠️ Limitations
*Please note that this chatbot is in an early stage phase, it is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
What do you want to learn ?
"""
with gr.Blocks(title=f"{demo_name}", css=css, theme=theme) as demo:
    # gr.LoginButton()
    # with gr.Column() as bloc_1:
    #     textbox_1 = gr.Textbox("You are not logged in to Hugging Face!", show_label=False)
    # with gr.Column(visible=False) as bloc_3:
    #     textbox_3 = gr.Textbox(
    #         "You are not part of the ESRS Q&A project; request access here: https://huggingface.co/ekimetrics-esrsqa"
    #     )
    with gr.Column(visible=True) as bloc_2:
        with gr.Tab("ESRS Q&A"):
            with gr.Row():
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(
                        value=[(None, init_prompt)],
                        show_copy_button=True,
                        show_label=False,
                        elem_id="chatbot",
                        layout="panel",
                        avatar_images=(None, "https://i.ibb.co/YNyd5W2/logo4.png"),
                    )
                    state = gr.State([])
                    with gr.Row(elem_id="input-message"):
                        ask = gr.Textbox(
                            placeholder="Ask me anything here!",
                            show_label=False,
                            scale=7,
                            lines=1,
                            interactive=True,
                            elem_id="input-textbox",
                        )
                with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
                    with gr.Tab("Sources", elem_id="tab-citations", id=1):
                        sources_textbox = gr.HTML(
                            show_label=False, elem_id="sources-textbox"
                        )
                        docs_textbox = gr.State("")
        with gr.Tab("About", elem_classes="max-height other-tabs"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("WIP")

    # demo.load(update_visible, inputs=None, outputs=[bloc_1, bloc_2, bloc_3])
    ask.submit(
        fn=chat,
        inputs=[
            ask,
            state,
        ],
        outputs=[ask, chatbot, state, sources_textbox],
    )

demo.launch(
    share=True,
    # auth=("", ""),
    debug=True,
)