Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Community

timeki committed on Dec 11, 2024

Commit

d396732

1 Parent(s): 094ee34

update display and fix search only

Browse files

Files changed (6) hide show

app.py +25 -23
climateqa/engine/chains/retrieve_documents.py +42 -31
climateqa/engine/chains/retrieve_papers.py +2 -2
climateqa/event_handler.py +2 -1
climateqa/knowledge/openalex.py +3 -2
front/utils.py +5 -2

app.py CHANGED Viewed

@@ -46,7 +46,7 @@ from climateqa.engine.graph import make_graph_agent
 from climateqa.engine.embeddings import get_embeddings_function
 from climateqa.engine.chains.retrieve_papers import find_papers
-from front.utils import serialize_docs,process_figures,make_html_df
 from climateqa.event_handler import init_audience, handle_retrieved_documents, stream_answer,handle_retrieved_owid_graphs
@@ -409,7 +409,6 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
                     with gr.Tab("Sources",elem_id = "tab-sources",id = 1) as tab_sources:
                         sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
-                        docs_textbox = gr.State("")
@@ -439,7 +438,6 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
                                 # Fenêtre simulée pour les Relevant Papers
                                 with gr.Accordion(visible=True, elem_id="papers-relevant-popup",label= "See relevant papers", open= False) as relevant_popup:
                                     papers_html = gr.HTML(show_label=False, elem_id="papers-textbox")
-                                    docs_textbox = gr.State("")
                                 btn_citations_network = gr.Button("Explore papers citations network")
                                 # Fenêtre simulée pour le Citations Network
@@ -458,21 +456,15 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
                 gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
-                with gr.Row():
-                    dropdown_sources = gr.CheckboxGroup(
-                        ["IPCC", "IPBES","IPOS"],
-                        label="Select source",
-                        value=["IPCC"],
-                        interactive=True,
-                    )
-                    dropdown_external_sources = gr.CheckboxGroup(
-                        ["IPCC figures","OpenAlex", "OurWorldInData"],
-                        label="Select database to search for relevant content",
-                        value=["IPCC figures"],
-                        interactive=True,
-                    )
                 dropdown_reports = gr.Dropdown(
                     POSSIBLE_REPORTS,
                     label="Or select specific reports",
@@ -480,8 +472,15 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
                     value=None,
                     interactive=True,
                 )
-                search_only = gr.Checkbox(label="Search only without chating", value=False, interactive=True, elem_id="checkbox-chat")
                 dropdown_audience = gr.Dropdown(
@@ -501,7 +500,7 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
                 dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after])
-                close_config_modal = gr.Button("Close",elem_id="close-config-modal")
                 close_config_modal.click(fn=update_config_modal_visibility, inputs=[config_open], outputs=[config_modal, config_open])
                 # dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after], visible=True)
@@ -589,9 +588,12 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
-    def start_chat(query,history):
         history = history + [ChatMessage(role="user", content=query)]
-        return (gr.update(interactive = False),gr.update(selected=1),history)
     def finish_chat():
         return gr.update(interactive = True,value = "")
@@ -630,14 +632,14 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
         return gr.update(label = recommended_content_notif_label), gr.update(label = sources_notif_label), gr.update(label = figures_notif_label), gr.update(label = graphs_notif_label), gr.update(label = papers_notif_label)
     (textbox
-        .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
         .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
     )
     (examples_hidden
-        .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
         .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )

 from climateqa.engine.embeddings import get_embeddings_function
 from climateqa.engine.chains.retrieve_papers import find_papers
+from front.utils import serialize_docs,process_figures
 from climateqa.event_handler import init_audience, handle_retrieved_documents, stream_answer,handle_retrieved_owid_graphs
                     with gr.Tab("Sources",elem_id = "tab-sources",id = 1) as tab_sources:
                         sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
                                 # Fenêtre simulée pour les Relevant Papers
                                 with gr.Accordion(visible=True, elem_id="papers-relevant-popup",label= "See relevant papers", open= False) as relevant_popup:
                                     papers_html = gr.HTML(show_label=False, elem_id="papers-textbox")
                                 btn_citations_network = gr.Button("Explore papers citations network")
                                 # Fenêtre simulée pour le Citations Network
                 gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
+                # with gr.Row():
+                dropdown_sources = gr.CheckboxGroup(
+                    ["IPCC", "IPBES","IPOS"],
+                    label="Select source (by default search in all sources)",
+                    value=["IPCC"],
+                    interactive=True,
+                )
                 dropdown_reports = gr.Dropdown(
                     POSSIBLE_REPORTS,
                     label="Or select specific reports",
                     value=None,
                     interactive=True,
                 )
+                dropdown_external_sources = gr.CheckboxGroup(
+                    ["IPCC figures","OpenAlex", "OurWorldInData"],
+                    label="Select database to search for relevant content",
+                    value=["IPCC figures"],
+                    interactive=True,
+                )
+                search_only = gr.Checkbox(label="Search only for recommended content without chating", value=False, interactive=True, elem_id="checkbox-chat")
                 dropdown_audience = gr.Dropdown(
                 dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after])
+                close_config_modal = gr.Button("Validate and Close",elem_id="close-config-modal")
                 close_config_modal.click(fn=update_config_modal_visibility, inputs=[config_open], outputs=[config_modal, config_open])
                 # dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after], visible=True)
+    def start_chat(query,history,search_only):
         history = history + [ChatMessage(role="user", content=query)]
+        if search_only:
+            return (gr.update(interactive = False),gr.update(selected=1),history)
+        else:
+            return (gr.update(interactive = False),gr.update(selected=2),history)
     def finish_chat():
         return gr.update(interactive = True,value = "")
         return gr.update(label = recommended_content_notif_label), gr.update(label = sources_notif_label), gr.update(label = figures_notif_label), gr.update(label = graphs_notif_label), gr.update(label = papers_notif_label)
     (textbox
+        .submit(start_chat, [textbox,chatbot, search_only], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
         .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
     )
     (examples_hidden
+        .change(start_chat, [examples_hidden,chatbot, search_only], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
         .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )

climateqa/engine/chains/retrieve_documents.py CHANGED Viewed

@@ -115,6 +115,7 @@ async def get_IPCC_relevant_documents(
     k_images: int = 5,
     namespace:str = "vectors",
     min_size:int = 200,
 ) :
     # Check if all elements in the list are either IPCC or IPBES
@@ -136,41 +137,49 @@ async def get_IPCC_relevant_documents(
     docs_full = []
     docs_images = []
-    # Search for k_summary documents in the summaries dataset
-    filters_summaries = {
-        **filters,
-        "chunk_type":"text",
-        "report_type": { "$in":["SPM"]},
-    }
-    docs_summaries = vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = k_summary)
-    docs_summaries = [x for x in docs_summaries if x[1] > threshold]
-    # docs_summaries = []
-    # Search for k_total - k_summary documents in the full reports dataset
-    filters_full = {
-        **filters,
-        "chunk_type":"text",
-        "report_type": { "$nin":["SPM"]},
-    }
-    k_full = k_total - len(docs_summaries)
-    docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
-    if search_figures:
-        # Images
-        filters_image = {
             **filters,
-            "chunk_type":"image"
         }
-        docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
-    docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
-    # Filter if length are below threshold
-    docs_summaries = [x for x in docs_summaries if len(x.page_content) > min_size]
-    docs_full = [x for x in docs_full if len(x.page_content) > min_size]
     return {
         "docs_summaries" : docs_summaries,
@@ -214,6 +223,7 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
         related_content = []
     search_figures = "IPCC figures" in state["relevant_content_sources"]
     # Get the current question
     current_question = state["remaining_questions"][0]
@@ -242,6 +252,7 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
             k_total = k_before_reranking,
             k_images = k_images_by_question,
             threshold = 0.5,
         )

     k_images: int = 5,
     namespace:str = "vectors",
     min_size:int = 200,
+    search_only:bool = False,
 ) :
     # Check if all elements in the list are either IPCC or IPBES
     docs_full = []
     docs_images = []
+    if search_only:
+        # Only search for images if search_only is True
+        if search_figures:
+            filters_image = {
+                **filters,
+                "chunk_type":"image"
+            }
+            docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
+            docs_images = _add_metadata_and_score(docs_images)
+    else:
+        # Regular search flow for text and optionally images
+        # Search for k_summary documents in the summaries dataset
+        filters_summaries = {
             **filters,
+            "chunk_type":"text",
+            "report_type": { "$in":["SPM"]},
         }
+        docs_summaries = vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = k_summary)
+        docs_summaries = [x for x in docs_summaries if x[1] > threshold]
+        # Search for k_total - k_summary documents in the full reports dataset
+        filters_full = {
+            **filters,
+            "chunk_type":"text",
+            "report_type": { "$nin":["SPM"]},
+        }
+        k_full = k_total - len(docs_summaries)
+        docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
+        if search_figures:
+            # Images
+            filters_image = {
+                **filters,
+                "chunk_type":"image"
+            }
+            docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
+        docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
+        # Filter if length are below threshold
+        docs_summaries = [x for x in docs_summaries if len(x.page_content) > min_size]
+        docs_full = [x for x in docs_full if len(x.page_content) > min_size]
     return {
         "docs_summaries" : docs_summaries,
         related_content = []
     search_figures = "IPCC figures" in state["relevant_content_sources"]
+    search_only = state["search_only"]
     # Get the current question
     current_question = state["remaining_questions"][0]
             k_total = k_before_reranking,
             k_images = k_images_by_question,
             threshold = 0.5,
+            search_only = search_only,
         )

climateqa/engine/chains/retrieve_papers.py CHANGED Viewed

@@ -2,7 +2,7 @@ from climateqa.engine.keywords import make_keywords_chain
 from climateqa.engine.llm import get_llm
 from climateqa.knowledge.openalex import OpenAlex
 from climateqa.engine.chains.answer_rag import make_rag_papers_chain
-from front.utils import make_html_df
 from climateqa.engine.reranker import get_reranker
 oa = OpenAlex()
@@ -47,7 +47,7 @@ async def find_papers(query,after, relevant_content_sources, reranker= reranker)
             df_works = df_works.sort_values("rerank_score",ascending=False)
             docs_html = []
             for i in range(10):
-                docs_html.append(make_html_df(df_works, i))
             docs_html = "".join(docs_html)
             G = oa.make_network(df_works)

 from climateqa.engine.llm import get_llm
 from climateqa.knowledge.openalex import OpenAlex
 from climateqa.engine.chains.answer_rag import make_rag_papers_chain
+from front.utils import make_html_papers
 from climateqa.engine.reranker import get_reranker
 oa = OpenAlex()
             df_works = df_works.sort_values("rerank_score",ascending=False)
             docs_html = []
             for i in range(10):
+                docs_html.append(make_html_papers(df_works, i))
             docs_html = "".join(docs_html)
             G = oa.make_network(df_works)

climateqa/event_handler.py CHANGED Viewed

@@ -36,7 +36,8 @@ def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage],
                 docs_html.append(make_html_source(d, i))
         used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
-        history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
         docs_html = "".join(docs_html)

                 docs_html.append(make_html_source(d, i))
         used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
+        if used_documents!=[]:
+            history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
         docs_html = "".join(docs_html)

climateqa/knowledge/openalex.py CHANGED Viewed

@@ -55,8 +55,9 @@ class OpenAlex():
             df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
             df_works = df_works.drop(columns = ["abstract_inverted_index"])
-            # df_works["subtitle"] = df_works["title"] + " - " + df_works["primary_location"]["source"]["display_name"] + " - " + df_works["publication_year"]
             return df_works
         else:
            raise Exception("Keywords must be a string")

             df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
             df_works = df_works.drop(columns = ["abstract_inverted_index"])
+            df_works["display_name"] = df_works["primary_location"].apply(lambda x :x["source"] if type(x) == dict and 'source' in x else "").apply(lambda x : x["display_name"] if type(x) == dict and "display_name" in x else "")
+            df_works["subtitle"] = df_works["title"].astype(str) + " - " + df_works["display_name"].astype(str) + " - " + df_works["publication_year"].astype(str)
             return df_works
         else:
            raise Exception("Keywords must be a string")

front/utils.py CHANGED Viewed

@@ -228,11 +228,12 @@ def make_html_source(source,i):
     return card
-def make_html_df(df,i):
     title = df['title'][i]
     content = df['abstract'][i]
     url = df['doi'][i]
     publication_date = df['publication_year'][i]
     card = f"""
     <div class="card" id="doc{i}">
@@ -241,8 +242,10 @@ def make_html_df(df,i):
             <p>{content}</p>
         </div>
         <div class="card-footer">
-            <span>{publication_date}</span>
             <a href="{url}" target="_blank" class="pdf-link">
         </div>
     </div>
         """

     return card
+def make_html_papers(df,i):
     title = df['title'][i]
     content = df['abstract'][i]
     url = df['doi'][i]
     publication_date = df['publication_year'][i]
+    subtitle = df['subtitle'][i]
     card = f"""
     <div class="card" id="doc{i}">
             <p>{content}</p>
         </div>
         <div class="card-footer">
+            <span>{subtitle}</span>
             <a href="{url}" target="_blank" class="pdf-link">
+                <span role="img" aria-label="Open paper">🔗</span>
+            </a>
         </div>
     </div>
         """