Spaces:

timeki
/

UNEP-decisions-qa

Sleeping

App Files Files Community

UNEP-decisions-qa / app.py

timeki

init qa

26ed9d3 4 months ago

raw

history blame

5.95 kB

	from langchain.memory import ConversationBufferMemory

	import pandas as pd
	import gradio as gr

	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores import FAISS

	import os


	import pandas as pd
	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.schema import Document
	import os

	from utils import make_html_source, make_pairs, get_llm, reset_textbox

	from prompt import PROMPT_INTERPRATE_INTENTION, ANSWER_PROMPT


	# python-dotenv is an optional convenience for local development: when it is
	# installed, variables from a `.env` file are loaded into the environment.
	# Only a missing package is tolerated; any other failure is surfaced instead
	# of being silently swallowed.
	try:
	    from dotenv import load_dotenv
	except ImportError:
	    pass
	else:
	    load_dotenv()


	# Load your OpenAI API key
	import os
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
	# Explicit check rather than `assert`: assertions are removed when Python runs
	# with -O, which would let a missing key slip through to the first API call.
	if not OPENAI_API_KEY:
	    raise RuntimeError("Please set your OpenAI API key")

	# Embedding client handed to FAISS when the index is loaded.
	embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

	# SECURITY NOTE(review): `allow_dangerous_deserialization=True` lets the loader
	# unpickle the stored index. Only load a "faiss_index" directory that was
	# produced by a trusted process.
	new_vector_store = FAISS.load_local(
	    "faiss_index", embeddings, allow_dangerous_deserialization=True
	)

	retriever = new_vector_store.as_retriever()

	# Chat model used both for intent detection and for answering.
	llm = get_llm()

	# Module-level conversation buffer; `chat` clears and refills it on each call.
	memory = ConversationBufferMemory(
	    return_messages=True, output_key="answer", input_key="question"
	)

	def make_qa_chain():
	    """Build the answering chain: input mapping -> ANSWER_PROMPT -> llm.

	    The chain expects a dict with the keys ``"context"`` and ``"question"``,
	    forwards both to ``ANSWER_PROMPT`` and sends the resulting prompt to the
	    module-level ``llm``.

	    Returns:
	        The composed chain object produced by piping the three stages.
	    """
	    final_inputs = {
	        "context": lambda x: x["context"],
	        "question": lambda x: x["question"],
	    }

	    # Plain `|` composition (the escaped `\|` form is not valid Python).
	    return final_inputs | ANSWER_PROMPT | llm


	def load_documents_meeting(meeting_number):
	    """Return one ``Document`` per spreadsheet row belonging to a meeting.

	    The rows are read from ``../data/mfls.xlsx``. A row's meeting number is
	    the first space-separated word of its ``Meeting`` cell with the last two
	    characters removed (presumably an ordinal suffix such as "st"/"th" --
	    confirm against the data).

	    Args:
	        meeting_number: Meeting number to select; compared as a string, so
	            both ``"91"`` and ``91`` are accepted.

	    Returns:
	        A list of ``Document`` objects whose ``page_content`` concatenates the
	        ``Meeting``, ``Issues`` and ``Content`` cells and whose metadata
	        carries ``Issues``, ``Title``, ``meeting_number``, ``Agencies`` and
	        ``project``. The list is empty when no row matches.
	    """
	    # Step 1: Load the Excel data
	    excel_file_path = "../data/mfls.xlsx"
	    df = pd.read_excel(excel_file_path)

	    # Step 2: Keep only the rows of the requested meeting.
	    # `astype(str)` keeps the parsing from failing on empty (NaN) cells.
	    df["meeting_number"] = (
	        df["Meeting"].astype(str).str.split(" ").str[0].str[:-2]
	    )
	    df_meeting = df[df["meeting_number"] == str(meeting_number)]

	    # Step 3: Build the documents.
	    # The page content is assembled per row instead of being written back as
	    # a new column: that avoids assigning into a filtered slice and the
	    # failure of the row-wise `apply` assignment when no row matches.
	    return [
	        Document(
	            page_content=f"{row['Meeting']} {row['Issues']} {row['Content']}",
	            metadata={
	                "Issues": row["Issues"],
	                "Title": row["Title"],
	                "meeting_number": row["meeting_number"],
	                "Agencies": row["Agencies"],
	                "project": row["Projects"],
	            },
	        )
	        for _, row in df_meeting.iterrows()
	    ]


	async def chat(
	    query: str,
	    history: list = None,
	):
	    """Answer ``query`` and stream the reply together with its sources.

	    Pipeline: an LLM call interprets the intent of the query, the source
	    documents are gathered (all rows of one meeting, or a similarity search
	    on the FAISS index), then the answer chain is streamed.

	    Args:
	        query: The question typed by the user.
	        history: Previous messages of the conversation. Each item must expose
	            ``.content`` and be accepted by ``memory.chat_memory.add_message``.
	            ``None`` (the default) stands for an empty conversation.

	    Yields:
	        Tuples ``(gradio_format, history, source_string)``:
	        ``gradio_format`` is the list of chat pairs built by ``make_pairs``
	        plus the current ``(query, partial_answer)`` pair, ``history`` is the
	        message history (the refreshed one from ``memory`` on the final
	        yield), and ``source_string`` is the HTML rendering of the sources.
	    """
	    if history is None:
	        history = []

	    source_string = ""
	    gradio_format = make_pairs([a.content for a in history]) + [(query, "")]
	    qa_chain = make_qa_chain()

	    # reset memory
	    # NOTE(review): `memory` is a module-level object that is cleared and
	    # refilled on every call, so concurrent requests would share it.
	    memory.clear()
	    for message in history:
	        memory.chat_memory.add_message(message)

	    inputs = {"question": query}

	    ## INTENT
	    intent = await llm.abatch([PROMPT_INTERPRATE_INTENTION.format_prompt(query = query)])
	    intent = intent[0].content
	    print("intent", intent)

	    ## RETRIEVER
	    # An intent whose first word is "meeting" selects every document of that
	    # meeting; its last word is taken as the meeting number.
	    intent_words = intent.split(" ")
	    if intent_words[0] == "meeting":
	        meeting_number = intent_words[-1]
	        sources = load_documents_meeting(meeting_number)
	    else:
	        sources = new_vector_store.search(query, search_type="similarity", k=5)

	    source_string = "\n\n".join([make_html_source(doc, i) for i, doc in enumerate(sources, 1)])

	    ## RAG
	    inputs_rag = {"question": query, "context": sources}

	    result = qa_chain.astream_log(inputs_rag)

	    retriever_path_id = "/logs/VectorStoreRetriever/final_output"
	    final_answer_path_id = "/streamed_output/-"

	    async for op in result:
	        # Only the first operation of each log patch is inspected.
	        op = op.ops[0]
	        if op['path'] == retriever_path_id: # documents
	            sources = op['value']['documents'] # List[Document]
	            # Same (doc, index) argument order as the call above.
	            # NOTE(review): confirm against utils.make_html_source.
	            source_string = "\n\n".join([make_html_source(doc, i) for i, doc in enumerate(sources, 1)])

	        elif op['path'] == final_answer_path_id: # final answer
	            new_token = op['value'].content # str
	            answer_yet = gradio_format[-1][1]
	            gradio_format[-1] = (query, answer_yet + new_token)

	        # Push an update to the UI after every log patch.
	        yield gradio_format, history, source_string

	    # Store the finished exchange, then emit the refreshed history.
	    memory.save_context(inputs, {"answer": gradio_format[-1][1]})
	    yield gradio_format, memory.load_memory_variables({})["history"], source_string


	### GRADIO UI

	theme = gr.themes.Soft(
	    primary_hue="sky",
	    font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
	)

	demo_name = "UNEP Q&A"

	# The stylesheet is looked up in the current working directory.
	with gr.Blocks(
	    title=demo_name,
	    theme=theme,
	    css_paths=os.path.join(os.getcwd(), "style.css"),
	) as demo:

	    gr.Markdown(f"<h1><center>{demo_name}</center></h1>")

	    with gr.Row():
	        # Left column: conversation and input box.
	        with gr.Column(scale=2):
	            chatbot = gr.Chatbot(
	                value=[("", "Hello ! How can I help you today ?")],
	                elem_id="chatbot",
	                label=f"{demo_name} chatbot",
	                show_label=False,
	            )
	            # Message history passed to and returned by `chat`.
	            state = gr.State([])

	            with gr.Row():
	                ask = gr.Textbox(
	                    show_label=False,
	                    placeholder="Input your question then press enter",
	                )

	        # Right column: HTML rendering of the source documents.
	        with gr.Column(scale=1, variant="panel"):
	            gr.Markdown("### Sources")
	            sources_textbox = gr.HTML(show_label=False)

	    # Submitting a question streams the answer, the history and the sources...
	    ask.submit(
	        fn=chat,
	        inputs=[
	            ask,
	            state,
	        ],
	        outputs=[chatbot, state, sources_textbox],
	    )

	    # ...and, as a second listener on the same event, resets the input box.
	    ask.submit(reset_textbox, [], [ask])

	demo.queue()
	demo.launch(
	    share=True,
	    debug=True,
	)