timeki committed on
Commit
d396732
1 Parent(s): 094ee34

update display and fix search only

Browse files
app.py CHANGED
@@ -46,7 +46,7 @@ from climateqa.engine.graph import make_graph_agent
46
  from climateqa.engine.embeddings import get_embeddings_function
47
  from climateqa.engine.chains.retrieve_papers import find_papers
48
 
49
- from front.utils import serialize_docs,process_figures,make_html_df
50
 
51
  from climateqa.event_handler import init_audience, handle_retrieved_documents, stream_answer,handle_retrieved_owid_graphs
52
 
@@ -409,7 +409,6 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
409
 
410
  with gr.Tab("Sources",elem_id = "tab-sources",id = 1) as tab_sources:
411
  sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
412
- docs_textbox = gr.State("")
413
 
414
 
415
 
@@ -439,7 +438,6 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
439
  # Fenêtre simulée pour les Relevant Papers
440
  with gr.Accordion(visible=True, elem_id="papers-relevant-popup",label= "See relevant papers", open= False) as relevant_popup:
441
  papers_html = gr.HTML(show_label=False, elem_id="papers-textbox")
442
- docs_textbox = gr.State("")
443
 
444
  btn_citations_network = gr.Button("Explore papers citations network")
445
  # Fenêtre simulée pour le Citations Network
@@ -458,21 +456,15 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
458
  gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
459
 
460
 
461
- with gr.Row():
462
 
463
- dropdown_sources = gr.CheckboxGroup(
464
- ["IPCC", "IPBES","IPOS"],
465
- label="Select source",
466
- value=["IPCC"],
467
- interactive=True,
468
- )
469
- dropdown_external_sources = gr.CheckboxGroup(
470
- ["IPCC figures","OpenAlex", "OurWorldInData"],
471
- label="Select database to search for relevant content",
472
- value=["IPCC figures"],
473
- interactive=True,
474
- )
475
-
476
  dropdown_reports = gr.Dropdown(
477
  POSSIBLE_REPORTS,
478
  label="Or select specific reports",
@@ -480,8 +472,15 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
480
  value=None,
481
  interactive=True,
482
  )
 
 
 
 
 
 
 
483
 
484
- search_only = gr.Checkbox(label="Search only without chating", value=False, interactive=True, elem_id="checkbox-chat")
485
 
486
 
487
  dropdown_audience = gr.Dropdown(
@@ -501,7 +500,7 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
501
 
502
  dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after])
503
 
504
- close_config_modal = gr.Button("Close",elem_id="close-config-modal")
505
  close_config_modal.click(fn=update_config_modal_visibility, inputs=[config_open], outputs=[config_modal, config_open])
506
  # dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after], visible=True)
507
 
@@ -589,9 +588,12 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
589
 
590
 
591
 
592
- def start_chat(query,history):
593
  history = history + [ChatMessage(role="user", content=query)]
594
- return (gr.update(interactive = False),gr.update(selected=1),history)
 
 
 
595
 
596
  def finish_chat():
597
  return gr.update(interactive = True,value = "")
@@ -630,14 +632,14 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
630
  return gr.update(label = recommended_content_notif_label), gr.update(label = sources_notif_label), gr.update(label = figures_notif_label), gr.update(label = graphs_notif_label), gr.update(label = papers_notif_label)
631
 
632
  (textbox
633
- .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
634
  .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
635
  .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
636
  # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
637
  )
638
 
639
  (examples_hidden
640
- .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
641
  .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
642
  .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
643
  # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
 
46
  from climateqa.engine.embeddings import get_embeddings_function
47
  from climateqa.engine.chains.retrieve_papers import find_papers
48
 
49
+ from front.utils import serialize_docs,process_figures
50
 
51
  from climateqa.event_handler import init_audience, handle_retrieved_documents, stream_answer,handle_retrieved_owid_graphs
52
 
 
409
 
410
  with gr.Tab("Sources",elem_id = "tab-sources",id = 1) as tab_sources:
411
  sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
 
412
 
413
 
414
 
 
438
  # Fenêtre simulée pour les Relevant Papers
439
  with gr.Accordion(visible=True, elem_id="papers-relevant-popup",label= "See relevant papers", open= False) as relevant_popup:
440
  papers_html = gr.HTML(show_label=False, elem_id="papers-textbox")
 
441
 
442
  btn_citations_network = gr.Button("Explore papers citations network")
443
  # Fenêtre simulée pour le Citations Network
 
456
  gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
457
 
458
 
459
+ # with gr.Row():
460
 
461
+ dropdown_sources = gr.CheckboxGroup(
462
+ ["IPCC", "IPBES","IPOS"],
463
+ label="Select source (by default search in all sources)",
464
+ value=["IPCC"],
465
+ interactive=True,
466
+ )
467
+
 
 
 
 
 
 
468
  dropdown_reports = gr.Dropdown(
469
  POSSIBLE_REPORTS,
470
  label="Or select specific reports",
 
472
  value=None,
473
  interactive=True,
474
  )
475
+
476
+ dropdown_external_sources = gr.CheckboxGroup(
477
+ ["IPCC figures","OpenAlex", "OurWorldInData"],
478
+ label="Select database to search for relevant content",
479
+ value=["IPCC figures"],
480
+ interactive=True,
481
+ )
482
 
483
+ search_only = gr.Checkbox(label="Search only for recommended content without chating", value=False, interactive=True, elem_id="checkbox-chat")
484
 
485
 
486
  dropdown_audience = gr.Dropdown(
 
500
 
501
  dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after])
502
 
503
+ close_config_modal = gr.Button("Validate and Close",elem_id="close-config-modal")
504
  close_config_modal.click(fn=update_config_modal_visibility, inputs=[config_open], outputs=[config_modal, config_open])
505
  # dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after], visible=True)
506
 
 
588
 
589
 
590
 
591
+ def start_chat(query,history,search_only):
592
  history = history + [ChatMessage(role="user", content=query)]
593
+ if search_only:
594
+ return (gr.update(interactive = False),gr.update(selected=1),history)
595
+ else:
596
+ return (gr.update(interactive = False),gr.update(selected=2),history)
597
 
598
  def finish_chat():
599
  return gr.update(interactive = True,value = "")
 
632
  return gr.update(label = recommended_content_notif_label), gr.update(label = sources_notif_label), gr.update(label = figures_notif_label), gr.update(label = graphs_notif_label), gr.update(label = papers_notif_label)
633
 
634
  (textbox
635
+ .submit(start_chat, [textbox,chatbot, search_only], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
636
  .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
637
  .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
638
  # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
639
  )
640
 
641
  (examples_hidden
642
+ .change(start_chat, [examples_hidden,chatbot, search_only], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
643
  .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
644
  .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
645
  # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
climateqa/engine/chains/retrieve_documents.py CHANGED
@@ -115,6 +115,7 @@ async def get_IPCC_relevant_documents(
115
  k_images: int = 5,
116
  namespace:str = "vectors",
117
  min_size:int = 200,
 
118
  ) :
119
 
120
  # Check if all elements in the list are either IPCC or IPBES
@@ -136,41 +137,49 @@ async def get_IPCC_relevant_documents(
136
  docs_full = []
137
  docs_images = []
138
 
139
- # Search for k_summary documents in the summaries dataset
140
- filters_summaries = {
141
- **filters,
142
- "chunk_type":"text",
143
- "report_type": { "$in":["SPM"]},
144
- }
145
-
146
- docs_summaries = vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = k_summary)
147
- docs_summaries = [x for x in docs_summaries if x[1] > threshold]
148
- # docs_summaries = []
149
-
150
- # Search for k_total - k_summary documents in the full reports dataset
151
- filters_full = {
152
- **filters,
153
- "chunk_type":"text",
154
- "report_type": { "$nin":["SPM"]},
155
- }
156
- k_full = k_total - len(docs_summaries)
157
- docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
158
-
159
- if search_figures:
160
- # Images
161
- filters_image = {
162
  **filters,
163
- "chunk_type":"image"
 
164
  }
165
- docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
166
 
 
 
167
 
168
- docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
169
-
170
- # Filter if length are below threshold
171
- docs_summaries = [x for x in docs_summaries if len(x.page_content) > min_size]
172
- docs_full = [x for x in docs_full if len(x.page_content) > min_size]
173
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  return {
176
  "docs_summaries" : docs_summaries,
@@ -214,6 +223,7 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
214
  related_content = []
215
 
216
  search_figures = "IPCC figures" in state["relevant_content_sources"]
 
217
 
218
  # Get the current question
219
  current_question = state["remaining_questions"][0]
@@ -242,6 +252,7 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
242
  k_total = k_before_reranking,
243
  k_images = k_images_by_question,
244
  threshold = 0.5,
 
245
  )
246
 
247
 
 
115
  k_images: int = 5,
116
  namespace:str = "vectors",
117
  min_size:int = 200,
118
+ search_only:bool = False,
119
  ) :
120
 
121
  # Check if all elements in the list are either IPCC or IPBES
 
137
  docs_full = []
138
  docs_images = []
139
 
140
+ if search_only:
141
+ # Only search for images if search_only is True
142
+ if search_figures:
143
+ filters_image = {
144
+ **filters,
145
+ "chunk_type":"image"
146
+ }
147
+ docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
148
+ docs_images = _add_metadata_and_score(docs_images)
149
+ else:
150
+ # Regular search flow for text and optionally images
151
+ # Search for k_summary documents in the summaries dataset
152
+ filters_summaries = {
 
 
 
 
 
 
 
 
 
 
153
  **filters,
154
+ "chunk_type":"text",
155
+ "report_type": { "$in":["SPM"]},
156
  }
 
157
 
158
+ docs_summaries = vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = k_summary)
159
+ docs_summaries = [x for x in docs_summaries if x[1] > threshold]
160
 
161
+ # Search for k_total - k_summary documents in the full reports dataset
162
+ filters_full = {
163
+ **filters,
164
+ "chunk_type":"text",
165
+ "report_type": { "$nin":["SPM"]},
166
+ }
167
+ k_full = k_total - len(docs_summaries)
168
+ docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
169
+
170
+ if search_figures:
171
+ # Images
172
+ filters_image = {
173
+ **filters,
174
+ "chunk_type":"image"
175
+ }
176
+ docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
177
+
178
+ docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
179
+
180
+ # Filter if length are below threshold
181
+ docs_summaries = [x for x in docs_summaries if len(x.page_content) > min_size]
182
+ docs_full = [x for x in docs_full if len(x.page_content) > min_size]
183
 
184
  return {
185
  "docs_summaries" : docs_summaries,
 
223
  related_content = []
224
 
225
  search_figures = "IPCC figures" in state["relevant_content_sources"]
226
+ search_only = state["search_only"]
227
 
228
  # Get the current question
229
  current_question = state["remaining_questions"][0]
 
252
  k_total = k_before_reranking,
253
  k_images = k_images_by_question,
254
  threshold = 0.5,
255
+ search_only = search_only,
256
  )
257
 
258
 
climateqa/engine/chains/retrieve_papers.py CHANGED
@@ -2,7 +2,7 @@ from climateqa.engine.keywords import make_keywords_chain
2
  from climateqa.engine.llm import get_llm
3
  from climateqa.knowledge.openalex import OpenAlex
4
  from climateqa.engine.chains.answer_rag import make_rag_papers_chain
5
- from front.utils import make_html_df
6
  from climateqa.engine.reranker import get_reranker
7
 
8
  oa = OpenAlex()
@@ -47,7 +47,7 @@ async def find_papers(query,after, relevant_content_sources, reranker= reranker)
47
  df_works = df_works.sort_values("rerank_score",ascending=False)
48
  docs_html = []
49
  for i in range(10):
50
- docs_html.append(make_html_df(df_works, i))
51
  docs_html = "".join(docs_html)
52
  G = oa.make_network(df_works)
53
 
 
2
  from climateqa.engine.llm import get_llm
3
  from climateqa.knowledge.openalex import OpenAlex
4
  from climateqa.engine.chains.answer_rag import make_rag_papers_chain
5
+ from front.utils import make_html_papers
6
  from climateqa.engine.reranker import get_reranker
7
 
8
  oa = OpenAlex()
 
47
  df_works = df_works.sort_values("rerank_score",ascending=False)
48
  docs_html = []
49
  for i in range(10):
50
+ docs_html.append(make_html_papers(df_works, i))
51
  docs_html = "".join(docs_html)
52
  G = oa.make_network(df_works)
53
 
climateqa/event_handler.py CHANGED
@@ -36,7 +36,8 @@ def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage],
36
  docs_html.append(make_html_source(d, i))
37
 
38
  used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
39
- history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
 
40
 
41
  docs_html = "".join(docs_html)
42
 
 
36
  docs_html.append(make_html_source(d, i))
37
 
38
  used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
39
+ if used_documents!=[]:
40
+ history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
41
 
42
  docs_html = "".join(docs_html)
43
 
climateqa/knowledge/openalex.py CHANGED
@@ -55,8 +55,9 @@ class OpenAlex():
55
  df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
56
 
57
  df_works = df_works.drop(columns = ["abstract_inverted_index"])
58
- # df_works["subtitle"] = df_works["title"] + " - " + df_works["primary_location"]["source"]["display_name"] + " - " + df_works["publication_year"]
59
-
 
60
  return df_works
61
  else:
62
  raise Exception("Keywords must be a string")
 
55
  df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
56
 
57
  df_works = df_works.drop(columns = ["abstract_inverted_index"])
58
+ df_works["display_name"] = df_works["primary_location"].apply(lambda x :x["source"] if type(x) == dict and 'source' in x else "").apply(lambda x : x["display_name"] if type(x) == dict and "display_name" in x else "")
59
+ df_works["subtitle"] = df_works["title"].astype(str) + " - " + df_works["display_name"].astype(str) + " - " + df_works["publication_year"].astype(str)
60
+
61
  return df_works
62
  else:
63
  raise Exception("Keywords must be a string")
front/utils.py CHANGED
@@ -228,11 +228,12 @@ def make_html_source(source,i):
228
  return card
229
 
230
 
231
- def make_html_df(df,i):
232
  title = df['title'][i]
233
  content = df['abstract'][i]
234
  url = df['doi'][i]
235
  publication_date = df['publication_year'][i]
 
236
 
237
  card = f"""
238
  <div class="card" id="doc{i}">
@@ -241,8 +242,10 @@ def make_html_df(df,i):
241
  <p>{content}</p>
242
  </div>
243
  <div class="card-footer">
244
- <span>{publication_date}</span>
245
  <a href="{url}" target="_blank" class="pdf-link">
 
 
246
  </div>
247
  </div>
248
  """
 
228
  return card
229
 
230
 
231
+ def make_html_papers(df,i):
232
  title = df['title'][i]
233
  content = df['abstract'][i]
234
  url = df['doi'][i]
235
  publication_date = df['publication_year'][i]
236
+ subtitle = df['subtitle'][i]
237
 
238
  card = f"""
239
  <div class="card" id="doc{i}">
 
242
  <p>{content}</p>
243
  </div>
244
  <div class="card-footer">
245
+ <span>{subtitle}</span>
246
  <a href="{url}" target="_blank" class="pdf-link">
247
+ <span role="img" aria-label="Open paper">🔗</span>
248
+ </a>
249
  </div>
250
  </div>
251
  """