put event handling in separate file

- app.py +22 -131
- climateqa/event_handler.py +120 -0
- front/utils.py +42 -1
app.py
CHANGED
@@ -27,12 +27,11 @@ from azure.storage.fileshare import ShareServiceClient
 
 from utils import create_user_id
 
-from langchain_chroma import Chroma
-from collections import defaultdict
 from gradio_modal import Modal
 
 from PIL import Image
 
+from langchain_core.runnables.schema import StreamEvent
 
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
@@ -49,9 +48,9 @@ from climateqa.engine.keywords import make_keywords_chain
 from climateqa.engine.graph import make_graph_agent,display_graph
 from climateqa.engine.embeddings import get_embeddings_function
 
-from front.utils import …
+from front.utils import serialize_docs,process_figures
 
-from …
+from climateqa.event_handler import init_audience, handle_retrieved_documents, stream_answer,handle_retrieved_owid_graphs
 
 # Load environment variables in local mode
 try:
@@ -121,6 +120,7 @@ reranker = get_reranker("nano")
 # agent = make_graph_agent(llm,vectorstore,reranker)
 agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, reranker=reranker)
 
+
 async def chat(query,history,audience,sources,reports,current_graphs):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
@@ -128,14 +128,7 @@ async def chat(query,history,audience,sources,reports,current_graphs):
     date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     print(f">> NEW QUESTION ({date_now}) : {query}")
 
-    if audience == "Children":
-        audience_prompt = audience_prompts["children"]
-    elif audience == "General public":
-        audience_prompt = audience_prompts["general"]
-    elif audience == "Experts":
-        audience_prompt = audience_prompts["experts"]
-    else:
-        audience_prompt = audience_prompts["experts"]
+    audience_prompt = init_audience(audience)
 
     # Prepare default values
     if sources is None or len(sources) == 0:
@@ -149,14 +142,11 @@ async def chat(query,history,audience,sources,reports,current_graphs):
 
 
     docs = []
-    docs_used = True
     used_figures=[]
     docs_html = ""
     output_query = ""
     output_language = ""
     output_keywords = ""
-    gallery = []
-    updates = []
     start_streaming = False
     graphs_html = ""
     figures = '<div class="figures-container"><p></p> </div>'
@@ -175,79 +165,19 @@ async def chat(query,history,audience,sources,reports,current_graphs):
             node = event["metadata"]["langgraph_node"]
 
             if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" :# when documents are retrieved
-                try:
-                    docs = event["data"]["output"]["documents"]
-                    docs_html = []
-                    textual_docs = [d for d in docs if d.metadata["chunk_type"] == "text"]
-                    for i, d in enumerate(textual_docs, 1):
-                        if d.metadata["chunk_type"] == "text":
-                            docs_html.append(make_html_source(d, i))
-
-                    used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
-                    history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
-
-                    docs_html = "".join(docs_html)
-
-                except Exception as e:
-                    print(f"Error getting documents: {e}")
-                    print(event)
-
+                docs, docs_html, history, used_documents = handle_retrieved_documents(event, history, used_documents)
 
 
             elif event["name"] in steps_display.keys() and event["event"] == "on_chain_start": #display steps
-                event_description,display_output = steps_display[node]
+                event_description, display_output = steps_display[node]
                 if not hasattr(history[-1], 'metadata') or history[-1].metadata["title"] != event_description: # if a new step begins
                     history.append(ChatMessage(role="assistant", content = "", metadata={'title' :event_description}))
 
             elif event["name"] != "transform_query" and event["event"] == "on_chat_model_stream" and node in ["answer_rag", "answer_search","answer_chitchat"]:# if streaming answer
-                if start_streaming == False:
-                    start_streaming = True
-                    history.append(ChatMessage(role="assistant", content = ""))
-                answer_message_content += event["data"]["chunk"].content
-                answer_message_content = parse_output_llm_with_sources(answer_message_content)
-                history[-1] = ChatMessage(role="assistant", content = answer_message_content)
-                # history.append(ChatMessage(role="assistant", content = new_message_content))
+                history, start_streaming, answer_message_content = stream_answer(history, event, start_streaming, answer_message_content)
 
             elif event["name"] in ["retrieve_graphs", "retrieve_graphs_ai"] and event["event"] == "on_chain_end":
-                try:
-                    recommended_content = event["data"]["output"]["recommended_content"]
-
-                    unique_graphs = []
-                    seen_embeddings = set()
-
-                    for x in recommended_content:
-                        embedding = x.metadata["returned_content"]
-
-                        # Check if the embedding has already been seen
-                        if embedding not in seen_embeddings:
-                            unique_graphs.append({
-                                "embedding": embedding,
-                                "metadata": {
-                                    "source": x.metadata["source"],
-                                    "category": x.metadata["category"]
-                                }
-                            })
-                            # Add the embedding to the seen set
-                            seen_embeddings.add(embedding)
-
-
-                    categories = {}
-                    for graph in unique_graphs:
-                        category = graph['metadata']['category']
-                        if category not in categories:
-                            categories[category] = []
-                        categories[category].append(graph['embedding'])
-
-
-                    for category, embeddings in categories.items():
-                        graphs_html += f"<h3>{category}</h3>"
-                        for embedding in embeddings:
-                            graphs_html += f"<div>{embedding}</div>"
-
-
-                except Exception as e:
-                    print(f"Error getting graphs: {e}")
-
+                graphs_html = handle_retrieved_owid_graphs(event, graphs_html)
 
 
             if event["name"] == "transform_query" and event["event"] =="on_chain_end":
@@ -257,7 +187,7 @@ async def chat(query,history,audience,sources,reports,current_graphs):
             if event["name"] == "categorize_intent" and event["event"] == "on_chain_start":
                 print("X")
 
-            yield history, docs_html, output_query, output_language, docs , graphs_html
+            yield history, docs_html, output_query, output_language, docs , graphs_html #,output_query,output_keywords
 
     except Exception as e:
         print(event, "has failed")
@@ -285,52 +215,9 @@ async def chat(query,history,audience,sources,reports,current_graphs):
            print(f"Error logging on Azure Blob Storage: {e}")
        raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)")
 
-
+    yield history, docs_html, output_query, output_language, docs, graphs_html
 
 
-
-
-
-
-    yield history, docs_html, output_query, output_language, docs, graphs_html # gallery, figures, graphs_html#,output_query,output_keywords
-
-# def process_figures(docs, figures, gallery, used_figures =[]):
-def process_figures(docs):
-    gallery=[]
-    used_figures =[]
-    figures = '<div class="figures-container"><p></p> </div>'
-    docs_figures = [d for d in docs if d.metadata["chunk_type"] == "image"]
-    for i, doc in enumerate(docs_figures):
-        if doc.metadata["chunk_type"] == "image":
-            if doc.metadata["figure_code"] != "N/A":
-                title = f"{doc.metadata['figure_code']} - {doc.metadata['short_name']}"
-            else:
-                title = f"{doc.metadata['short_name']}"
-
-
-            if title not in used_figures:
-                used_figures.append(title)
-                try:
-                    key = f"Image {i+1}"
-
-                    image_path = doc.metadata["image_path"].split("documents/")[1]
-                    img = get_image_from_azure_blob_storage(image_path)
-
-                    # Convert the image to a byte buffer
-                    buffered = BytesIO()
-                    max_image_length = 500
-                    img_resized = img.resize((max_image_length, int(max_image_length * img.size[1]/img.size[0])))
-                    img_resized.save(buffered, format="PNG")
-
-                    img_str = base64.b64encode(buffered.getvalue()).decode()
-
-                    figures = figures + make_html_figure_sources(doc, i, img_str)
-                    gallery.append(img)
-                except Exception as e:
-                    print(f"Skipped adding image {i} because of {e}")
-
-    return figures, gallery
-
 def save_feedback(feed: str, user_id):
     if len(feed) > 1:
         timestamp = str(datetime.now().timestamp())
@@ -657,13 +544,15 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t…
 
 
 
-    gr.Markdown(
-
-
-
-
-
-
+    gr.Markdown(
+        """
+        ### More info
+        - See more info at [https://climateqa.com](https://climateqa.com/docs/intro/)
+        - Feedbacks on this [form](https://forms.office.com/e/1Yzgxm6jbp)
+
+        ### Citation
+        """
+    )
     with gr.Accordion(CITATION_LABEL,elem_id="citation", open = False,):
         # # Display citation label and text)
         gr.Textbox(
@@ -721,6 +610,8 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t…
 
 
     sources_raw.change(process_figures, inputs=[sources_raw], outputs=[figures_cards, gallery_component])
+
+
     sources_textbox.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs],[tab_sources, tab_figures, tab_recommended_content])
     figures_cards.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs],[tab_sources, tab_figures, tab_recommended_content])
     current_graphs.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs],[tab_sources, tab_figures, tab_recommended_content])
climateqa/event_handler.py
ADDED
@@ -0,0 +1,120 @@
+from langchain_core.runnables.schema import StreamEvent
+from gradio import ChatMessage
+from climateqa.engine.chains.prompts import audience_prompts
+from front.utils import make_html_source,parse_output_llm_with_sources,serialize_docs,make_toolbox,generate_html_graphs
+import numpy as np
+
+def init_audience(audience :str) -> str:
+    if audience == "Children":
+        audience_prompt = audience_prompts["children"]
+    elif audience == "General public":
+        audience_prompt = audience_prompts["general"]
+    elif audience == "Experts":
+        audience_prompt = audience_prompts["experts"]
+    else:
+        audience_prompt = audience_prompts["experts"]
+    return audience_prompt
+
+def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage], used_documents : list[str]) -> tuple[list, str, list[ChatMessage], list[str]]:
+    """
+    Handles the retrieved documents and returns the HTML representation of the documents
+
+    Args:
+        event (StreamEvent): The event containing the retrieved documents
+        history (list[ChatMessage]): The current message history
+        used_documents (list[str]): The list of used documents
+
+    Returns:
+        tuple[list, str, list[ChatMessage], list[str]]: The retrieved documents, their HTML representation, the updated message history and the updated list of used documents
+    """
+    try:
+        docs = event["data"]["output"]["documents"]
+        docs_html = []
+        textual_docs = [d for d in docs if d.metadata["chunk_type"] == "text"]
+        for i, d in enumerate(textual_docs, 1):
+            if d.metadata["chunk_type"] == "text":
+                docs_html.append(make_html_source(d, i))
+
+        used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
+        history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
+
+        docs_html = "".join(docs_html)
+
+    except Exception as e:
+        print(f"Error getting documents: {e}")
+        print(event)
+    return docs, docs_html, history, used_documents
+
+def stream_answer(history: list[ChatMessage], event : StreamEvent, start_streaming : bool, answer_message_content : str)-> tuple[list[ChatMessage], bool, str]:
+    """
+    Handles the streaming of the answer and updates the history with the new message content
+
+    Args:
+        history (list[ChatMessage]): The current message history
+        event (StreamEvent): The event containing the streamed answer
+        start_streaming (bool): A flag indicating if the streaming has started
+        answer_message_content (str): The accumulated content of the answer message
+
+    Returns:
+        tuple[list[ChatMessage], bool, str]: The updated history, the updated streaming flag and the updated message content
+    """
+    if start_streaming == False:
+        start_streaming = True
+        history.append(ChatMessage(role="assistant", content = ""))
+    answer_message_content += event["data"]["chunk"].content
+    answer_message_content = parse_output_llm_with_sources(answer_message_content)
+    history[-1] = ChatMessage(role="assistant", content = answer_message_content)
+    # history.append(ChatMessage(role="assistant", content = new_message_content))
+    return history, start_streaming, answer_message_content
+
+def handle_retrieved_owid_graphs(event :StreamEvent, graphs_html: str) -> str:
+    """
+    Handles the retrieved OWID graphs and returns the HTML representation of the graphs
+
+    Args:
+        event (StreamEvent): The event containing the retrieved graphs
+        graphs_html (str): The current HTML representation of the graphs
+
+    Returns:
+        str: The updated HTML representation
+    """
+    try:
+        recommended_content = event["data"]["output"]["recommended_content"]
+
+        unique_graphs = []
+        seen_embeddings = set()
+
+        for x in recommended_content:
+            embedding = x.metadata["returned_content"]
+
+            # Check if the embedding has already been seen
+            if embedding not in seen_embeddings:
+                unique_graphs.append({
+                    "embedding": embedding,
+                    "metadata": {
+                        "source": x.metadata["source"],
+                        "category": x.metadata["category"]
+                    }
+                })
+                # Add the embedding to the seen set
+                seen_embeddings.add(embedding)
+
+
+        categories = {}
+        for graph in unique_graphs:
+            category = graph['metadata']['category']
+            if category not in categories:
+                categories[category] = []
+            categories[category].append(graph['embedding'])
+
+
+        for category, embeddings in categories.items():
+            graphs_html += f"<h3>{category}</h3>"
+            for embedding in embeddings:
+                graphs_html += f"<div>{embedding}</div>"
+
+
+    except Exception as e:
+        print(f"Error getting graphs: {e}")
+
+    return graphs_html
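For orientation, here is a minimal sketch of how the extracted handlers chain together, mirroring the dispatch that chat() in app.py now performs. The wrapper name run_chat_events and the payload passed to agent.astream_events are illustrative assumptions (that call sits outside the changed hunks), and history is assumed to be the running, non-empty list of gradio ChatMessage objects.

    from climateqa.event_handler import (
        handle_retrieved_documents, stream_answer, handle_retrieved_owid_graphs
    )

    async def run_chat_events(agent, query, history):
        # Accumulators mirroring the defaults set at the top of chat()
        docs, docs_html, graphs_html = [], "", ""
        used_documents, answer_message_content = [], ""
        start_streaming = False

        # The input dict and `version` flag are assumptions, not shown in this diff
        async for event in agent.astream_events({"user_input": query}, version="v1"):
            node = event["metadata"].get("langgraph_node", "")
            if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents":
                docs, docs_html, history, used_documents = handle_retrieved_documents(event, history, used_documents)
            elif event["event"] == "on_chat_model_stream" and node in ["answer_rag", "answer_search", "answer_chitchat"]:
                history, start_streaming, answer_message_content = stream_answer(history, event, start_streaming, answer_message_content)
            elif event["name"] in ["retrieve_graphs", "retrieve_graphs_ai"] and event["event"] == "on_chain_end":
                graphs_html = handle_retrieved_owid_graphs(event, graphs_html)
            yield history, docs_html, docs, graphs_html

Each handler is a pure-ish state updater: it takes the event plus the current accumulators and returns the updated ones, which keeps chat() itself down to the dispatch shown in the app.py diff above.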
front/utils.py
CHANGED
@@ -1,5 +1,12 @@
 
 import re
+from collections import defaultdict
+from climateqa.utils import get_image_from_azure_blob_storage
+from climateqa.engine.chains.prompts import audience_prompts
+from PIL import Image
+from io import BytesIO
+import base64
+
 
 def make_pairs(lst):
     """from a list of even lenght, make tupple pairs"""
@@ -32,8 +39,42 @@ def parse_output_llm_with_sources(output):
     content_parts = "".join(parts)
     return content_parts
 
+def process_figures(docs):
+    gallery=[]
+    used_figures =[]
+    figures = '<div class="figures-container"><p></p> </div>'
+    docs_figures = [d for d in docs if d.metadata["chunk_type"] == "image"]
+    for i, doc in enumerate(docs_figures):
+        if doc.metadata["chunk_type"] == "image":
+            if doc.metadata["figure_code"] != "N/A":
+                title = f"{doc.metadata['figure_code']} - {doc.metadata['short_name']}"
+            else:
+                title = f"{doc.metadata['short_name']}"
+
+
+            if title not in used_figures:
+                used_figures.append(title)
+                try:
+                    key = f"Image {i+1}"
+
+                    image_path = doc.metadata["image_path"].split("documents/")[1]
+                    img = get_image_from_azure_blob_storage(image_path)
+
+                    # Convert the image to a byte buffer
+                    buffered = BytesIO()
+                    max_image_length = 500
+                    img_resized = img.resize((max_image_length, int(max_image_length * img.size[1]/img.size[0])))
+                    img_resized.save(buffered, format="PNG")
+
+                    img_str = base64.b64encode(buffered.getvalue()).decode()
+
+                    figures = figures + make_html_figure_sources(doc, i, img_str)
+                    gallery.append(img)
+                except Exception as e:
+                    print(f"Skipped adding image {i} because of {e}")
+
+    return figures, gallery
 
-from collections import defaultdict
 
 def generate_html_graphs(graphs):
    # Organize graphs by category