Spaces:

Ekimetrics
/

celsius-csrd-chatbot

Sleeping

App Files Community

momenaca committed on Oct 17

Commit

8ca00e0

•

1 Parent(s): d708cb9

feature/major backend update with agent

Browse files

Files changed (7) hide show

.gitignore +2 -1
app.py +16 -7
celsius_csrd_chatbot/agent.py +2 -10
celsius_csrd_chatbot/chains/answer_rag.py +1 -0
celsius_csrd_chatbot/chains/esrs_categorization.py +1 -2
celsius_csrd_chatbot/chains/esrs_intent.py +28 -36
celsius_csrd_chatbot/chains/retriever.py +6 -7

.gitignore CHANGED Viewed

@@ -5,4 +5,5 @@ __pycache__/utils.cpython-38.pyc
 notebooks/
 *.pyc
-local_tests/

 notebooks/
 *.pyc
+local_tests/
+.vscode/

app.py CHANGED Viewed

@@ -64,9 +64,9 @@ async def chat(query, history):
         async for event in result:
             print(event)
             if event["event"] == "on_chat_model_stream":
-                print("line 66")
                 if start_streaming == False:
-                    print("line 68")
                     start_streaming = True
                     history[-1] = (query, "")
@@ -77,17 +77,26 @@ async def chat(query, history):
                 answer_yet = parse_output_llm_with_sources(answer_yet)
                 history[-1] = (query, answer_yet)
             elif (
                 event["name"] == "retrieve_documents"
                 and event["event"] == "on_chain_end"
             ):
                 try:
-                    print("line 84")
                     docs = event["data"]["output"]["documents"]
                     docs_html = []
-                    for i, d in enumerate(docs, 1):
-                        docs_html.append(make_html_source(d, i))
                     docs_html = "".join(docs_html)
                 except Exception as e:
                     print(f"Error getting documents: {e}")
                     print(event)
@@ -97,9 +106,9 @@ async def chat(query, history):
                 display_output,
             ) in steps_display.items():
                 if event["name"] == event_name:
-                    print("line 99")
                     if event["event"] == "on_chain_start":
-                        print("line 101")
                         answer_yet = event_description
                         history[-1] = (query, answer_yet)

         async for event in result:
             print(event)
             if event["event"] == "on_chat_model_stream":
+                # print("line 66")
                 if start_streaming == False:
+                    # print("line 68")
                     start_streaming = True
                     history[-1] = (query, "")
                 answer_yet = parse_output_llm_with_sources(answer_yet)
                 history[-1] = (query, answer_yet)
+            elif (
+                event["name"] == "answer_rag_wrong"
+                and event["event"] == "on_chain_stream"
+            ):
+                history[-1] = (query, event["data"]["chunk"]["answer"])
             elif (
                 event["name"] == "retrieve_documents"
                 and event["event"] == "on_chain_end"
             ):
                 try:
+                    # print(event)
+                    # print("line 84")
                     docs = event["data"]["output"]["documents"]
                     docs_html = []
+                    for i, doc in enumerate(docs, 1):
+                        docs_html.append(make_html_source(i, doc))
+                    # print(docs_html)
                     docs_html = "".join(docs_html)
+                    # print(docs_html)
                 except Exception as e:
                     print(f"Error getting documents: {e}")
                     print(event)
                 display_output,
             ) in steps_display.items():
                 if event["name"] == event_name:
+                    # print("line 99")
                     if event["event"] == "on_chain_start":
+                        # print("line 101")
                         answer_yet = event_description
                         history[-1] = (query, answer_yet)

celsius_csrd_chatbot/agent.py CHANGED Viewed

@@ -39,16 +39,12 @@ def route_intent(state):
         return "intent_esrs"
     elif esrs == "wrong_esrs":
-        return "answer_rag"
     else:
         return "retrieve_documents"
-def make_id_dict(values):
-    return {k: k for k in values}
 def make_graph_agent(llm, vectorstore):
     workflow = StateGraph(GraphState)
@@ -70,11 +66,7 @@ def make_graph_agent(llm, vectorstore):
     workflow.set_entry_point("categorize_esrs")
     # CONDITIONAL EDGES
-    workflow.add_conditional_edges(
-        "categorize_esrs",
-        route_intent,
-        make_id_dict(["intent_esrs", "retrieve_documents", "answer_rag_wrong"]),
-    )
     # Define the edges
     workflow.add_edge("intent_esrs", "retrieve_documents")

         return "intent_esrs"
     elif esrs == "wrong_esrs":
+        return "answer_rag_wrong"
     else:
         return "retrieve_documents"
 def make_graph_agent(llm, vectorstore):
     workflow = StateGraph(GraphState)
     workflow.set_entry_point("categorize_esrs")
     # CONDITIONAL EDGES
+    workflow.add_conditional_edges("categorize_esrs", route_intent)
     # Define the edges
     workflow.add_edge("intent_esrs", "retrieve_documents")

celsius_csrd_chatbot/chains/answer_rag.py CHANGED Viewed

@@ -36,6 +36,7 @@ answering_template = """
     10. Method Focus: When addressing "how" questions, emphasize methods and procedures over outcomes.
     11. Selective Usage: You're not obligated to use every passage; include only those relevant to the question.
     12. Insufficient Information: If documents lack necessary details, indicate that you don't have enough information.
     Question: {query}
     Answer:

     10. Method Focus: When addressing "how" questions, emphasize methods and procedures over outcomes.
     11. Selective Usage: You're not obligated to use every passage; include only those relevant to the question.
     12. Insufficient Information: If documents lack necessary details, indicate that you don't have enough information.
+    13. Never mention these guidelines as a source attribution in your response.
     Question: {query}
     Answer:

celsius_csrd_chatbot/chains/esrs_categorization.py CHANGED Viewed

@@ -5,7 +5,7 @@ def make_esrs_categorization_node():
     def categorize_message(state):
         query = state["query"]
-        pattern = r"ESRS \d|ESRS [A-Z]\d|ESRS [A-Z] \d"
         esrs_truth = [
             "ESRS 1",
             "ESRS 2",
@@ -25,7 +25,6 @@ def make_esrs_categorization_node():
         if matches:
             true_matches = [match for match in matches if match in esrs_truth]
             output = {"esrs_type": true_matches if true_matches else "wrong_esrs"}
         else:
             output = {"esrs_type": "none"}

     def categorize_message(state):
         query = state["query"]
+        pattern = r"ESRS \d+[A-Z0-9]*"
         esrs_truth = [
             "ESRS 1",
             "ESRS 2",
         if matches:
             true_matches = [match for match in matches if match in esrs_truth]
             output = {"esrs_type": true_matches if true_matches else "wrong_esrs"}
         else:
             output = {"esrs_type": "none"}

celsius_csrd_chatbot/chains/esrs_intent.py CHANGED Viewed

@@ -23,51 +23,41 @@ class ESRSAnalysis(BaseModel):
         "ESRS S3",
         "ESRS S4",
         "ESRS G1",
-        "none",
     ] = Field(
-        description="""
-            Given a user question choose which documents would be most relevant for answering their question :
-            - ESRS 1 is for questions about general principles for preparing and presenting sustainability information in accordance with CSRD
-            - ESRS 2 is for questions about general disclosures related to sustainability reporting, including governance, strategy, impact, risk, opportunity management, and metrics and targets
-            - ESRS E1 is for questions about climate change, global warming, GES and energy
-            - ESRS E2 is for questions about air, water, and soil pollution, and dangerous substances
-            - ESRS E3 is for questions about water and marine resources
-            - ESRS E4 is for questions about biodiversity, nature, wildlife and ecosystems
-            - ESRS E5 is for questions about resource use and circular economy
-            - ESRS S1 is for questions about workforce and labor issues, job security, fair pay, and health and safety
-            - ESRS S2 is for questions about workers in the value chain, workers' treatment
-            - SRS S3 is for questions about affected communities, impact on local communities
-            - ESRS S4 is for questions about consumers and end users, customer privacy, safety, and inclusion
-            - ESRS G1 is for questions about governance, risk management, internal control, and business conduct
-            - none is for questions that do not fit into any of the above categories
-            Follow these guidelines :
-            - Some questions could be related to multiple ESRS. In such case, choose the most appropriate one.
-            - Remember, if the question is not related to any ESRS, the output should be 'none'.
-        """,
     )
 def make_esrs_intent_chain(llm):
-    parser = PydanticOutputParser(pydantic_object=ESRSAnalysis)
     prompt_template = """
-    The following question is about ESRS related topics. Please analyze the question and indicate if it refers to a specific ESRS.
-    {format_instructions}
-    Please answer with the appropriate ESRS to answer the question.
     Question: '{query}'
     Answer:
     """
-    prompt = PromptTemplate(
-        template=prompt_template,
-        input_variables=["query"],
-        partial_variables={"format_instructions": parser.get_format_instructions()},
-    )
     chain = {"query": itemgetter("query")} | prompt | llm | parser
     return chain
@@ -78,7 +68,9 @@ def make_esrs_intent_node(llm):
     def intent_message(state):
         query = state["query"]
         categorization_chain = make_esrs_intent_chain(llm)
-        output = categorization_chain.invoke(query)
         return output

         "ESRS S3",
         "ESRS S4",
         "ESRS G1",
+        "no_intent",
     ] = Field(
+        description="""The ESRS type that the user query refers to.""",
     )
 def make_esrs_intent_chain(llm):
     prompt_template = """
+    Please analyze the question and indicate if it refers to a specific ESRS.
+    Follow these definitions in order to choose the appropriate ESRS :
+    - ESRS 1 is for questions about general principles for preparing and presenting sustainability information in accordance with CSRD
+    - ESRS 2 is for questions about general disclosures related to sustainability reporting, including governance, strategy, impact, risk, opportunity management, and metrics and targets
+    - ESRS E1 is for questions about climate change, global warming, GES and energy
+    - ESRS E2 is for questions about air, water, and soil pollution, and dangerous substances
+    - ESRS E3 is for questions about water and marine resources
+    - ESRS E4 is for questions about biodiversity, nature, wildlife and ecosystems
+    - ESRS E5 is for questions about resource use and circular economy
+    - ESRS S1 is for questions about workforce and labor issues, job security, fair pay, and health and safety
+    - ESRS S2 is for questions about workers in the value chain, workers' treatment
+    - ESRS S3 is for questions about affected communities, impact on local communities
+    - ESRS S4 is for questions about consumers and end users, customer privacy, safety, and inclusion
+    - ESRS G1 is for questions about governance, risk management, internal control, and business conduct
+    - no_intent is for questions that do not fit into any of the above categories
+    Keep in mind these guidelines :
+    - Some questions could be related to multiple ESRS. In such case, choose the most appropriate one.
+    The output needs to respect a JSON format with 'esrs_type' as the key and the appropriate ESRS as the value.
     Question: '{query}'
     Answer:
     """
+    parser = PydanticOutputParser(pydantic_object=ESRSAnalysis, method="json_mode")
+    prompt = PromptTemplate(template=prompt_template, input_variables=["query"])
     chain = {"query": itemgetter("query")} | prompt | llm | parser
     return chain
     def intent_message(state):
         query = state["query"]
         categorization_chain = make_esrs_intent_chain(llm)
+        output = {
+            "esrs_type": [categorization_chain.invoke({"query": query}).esrs_type]
+        }
         return output

celsius_csrd_chatbot/chains/retriever.py CHANGED Viewed

@@ -1,16 +1,15 @@
 def make_retriever_node(vectorstore, k=10):
     def retrieve_documents(state):
         sources = state["esrs_type"]
         query = state["query"]
-        if sources == "none":
-            filters_full = {}
         else:
-            filters_full = {"ESRS_filter": {"$in": sources}}
         docs = []
-        docs_retrieved = vectorstore.similarity_search_with_score(
-            query=query, filter=filters_full, k=k
-        )
         for doc in docs_retrieved:
             doc_append = doc[0]
             doc_append.metadata["similarity_score"] = doc[1]

 def make_retriever_node(vectorstore, k=10):
     def retrieve_documents(state):
         sources = state["esrs_type"]
         query = state["query"]
+        if sources == "none" or sources == "no_intent":
+            docs_retrieved = vectorstore.similarity_search_with_score(query=query, k=k)
         else:
+            filters = {"ESRS_filter": {"$in": sources}}
+            docs_retrieved = vectorstore.similarity_search_with_score(
+                query=query, filter=filters, k=k
+            )
         docs = []
         for doc in docs_retrieved:
             doc_append = doc[0]
             doc_append.metadata["similarity_score"] = doc[1]