Spaces:

harithapliyal
/

t5-small-mahabharat-vana-parva-qa

Sleeping

App Files Files Community

t5-small-mahabharat-vana-parva-qa / app.py

harithapliyal

Update app.py

71979cb verified 10 months ago

raw

history blame

3.77 kB

	# NOTE: the "!pip install ..." lines that used to live here are IPython/Colab
	# shell-magic syntax. They are not valid Python, so a plain interpreter
	# (e.g. `streamlit run app.py`) fails with a SyntaxError before executing
	# anything else. Declare the packages in requirements.txt next to this file
	# instead:
	#   transformers
	#   evaluate
	#   sentencepiece
	#   sentence-transformers

	from sentence_transformers import SentenceTransformer
	from sentence_transformers import util
	import streamlit as st

	import pandas as pd
	import torch
	import ast

	QA_VECTOR = '07.2-HBQA_QA_Vector_multi-qa-mpnet-base-dot-v1.csv'

	# @title Load Example Questions
	df_qa = pd.read_csv(QA_VECTOR)

	# Streamlit re-executes this script on every widget interaction. Drawing a
	# fresh random sample on each run would swap the selectbox options right after
	# the user picks one, so the sample is drawn once per session and reused.
	if 'example_qa' not in st.session_state:
	    # Cap the sample size so a CSV with fewer than 5 questions does not make
	    # Series.sample raise ValueError.
	    n_examples = min(5, len(df_qa))
	    st.session_state['example_qa'] = df_qa['Question'].sample(n_examples).to_list()
	example_qa = st.session_state['example_qa']

	question = st.selectbox('Please Select a Question', example_qa)

	st.write('You Question:', question)

	CHUNK_VECTOR = '04.3-HBQA_Chunk_Vector_multi-qa-mpnet-base-dot-v1.csv'

	# Chunk table; later code reads its "Chunk_Id", "Chunk" and
	# "ChunkVector<suffix>" columns.
	df_chunk = pd.read_csv(CHUNK_VECTOR)

	# @title Load Embedding Model
	# embmodelname1 is the suffix appended to "ChunkVector" to form the name of
	# the column holding the stored chunk vectors.
	embmodelname = 'multi-qa-mpnet-base-dot-v1'
	embmodelshort = 'mpnet'
	embmodelname1 = '_multi-qa-mpnet-base-dot-v1'
	embmodel = SentenceTransformer(embmodelname)

	# @title Create Question Embedding
	def get_ques_vector(ques):
	    """Return the embedding of *ques* as produced by the module-level ``embmodel``."""
	    return embmodel.encode(ques)

	question_embedding = get_ques_vector(question)

	# @title Load all chunk vectors into memory
	chunk_id = df_chunk['Chunk_Id'].to_list()

	# Each cell of the vector column is a string literal; ast.literal_eval turns
	# it back into a Python object before the rows are stacked into one tensor.
	vector_column = df_chunk['ChunkVector' + embmodelname1]
	chunk_vector = torch.tensor([ast.literal_eval(raw) for raw in vector_column])

	# @title Predict Chunk Id for Question
	top_k = 5
	hits = util.semantic_search(question_embedding, chunk_vector, top_k=top_k)

	# hits[0] holds the matches for the single query. Each match's 'corpus_id'
	# is the row position inside chunk_vector, which is mapped back to the
	# "Chunk_Id" value stored on that row of df_chunk.
	predictedId = []
	predicted_Docid = []
	for hit in hits[0]:
	    predictedId.append(hit['corpus_id'])
	    predicted_Docid.append(df_chunk.loc[hit['corpus_id'], "Chunk_Id"])
	print (predicted_Docid)

	# @title Load Prediction Model
	from transformers import T5ForConditionalGeneration, T5Tokenizer

	# NOTE(review): this looks like a Google Colab Drive mount path - confirm it
	# exists in the environment where the app is deployed.
	MODEL_FOLDER = '/content/drive/MyDrive/HBQA/t5small-30epoch'
	DEVICE = 'cpu'

	# Tokenizer and model weights are both read from the same folder.
	tokenizer = T5Tokenizer.from_pretrained(MODEL_FOLDER)

	# Load the T5 model and place it on DEVICE in one step.
	model = T5ForConditionalGeneration.from_pretrained(MODEL_FOLDER).to(DEVICE)

	# @title Predict Answer

	def predict_answer(context, question, tokenizer, model):
	    """Generate an answer string for *question* given *context*.

	    The question/context pair is tokenized (truncated and padded to 1500
	    tokens), moved to the module-level ``DEVICE`` as a batch of one, and
	    passed to ``model.generate`` with ``max_length=100``. The decoded text is
	    returned, except that the placeholder ``"xxx"`` is returned when the
	    decoded text is shorter than three characters or begins with one of the
	    markers ``[CLS]``, ``[SEP]`` or ``<s>``.
	    """
	    Q_LEN = 1500

	    encoded = tokenizer(
	        question,
	        context,
	        max_length=Q_LEN,
	        padding="max_length",
	        truncation=True,
	        add_special_tokens=True,
	    )

	    # Build the two model inputs the same way: long tensor on DEVICE with a
	    # leading batch dimension of size one.
	    batch = {
	        key: torch.tensor(encoded[key], dtype=torch.long).to(DEVICE).unsqueeze(0)
	        for key in ("input_ids", "attention_mask")
	    }

	    with torch.no_grad():
	        generated = model.generate(**batch, max_length=100)

	    predicted_answer = tokenizer.decode(generated.flatten(), skip_special_tokens=True)

	    too_short = len(predicted_answer) < 3
	    if too_short or predicted_answer.startswith(('[CLS]', '[SEP]', '<s>')):
	        return "xxx"
	    return predicted_answer

	verbose = False

	# Only attempt an answer when more than three chunks were retrieved.
	if len(predicted_Docid) > 3:
	    ids = predicted_Docid
	else:
	    # `ids` must be bound on this path too, otherwise the loop below raises
	    # NameError instead of reporting that nothing was found.
	    ids = []
	    print("Sorry, No document")

	# Ask the model for one candidate answer per retrieved chunk.
	ans = []
	for doc_id in ids:  # not named `id`, which would shadow the builtin
	    cond = df_chunk["Chunk_Id"] == doc_id
	    chunk = df_chunk.loc[cond, "Chunk"].values[0]

	    pred_ans = predict_answer(chunk, question, tokenizer, model)
	    ans.append(pred_ans)

	    if verbose:
	        print("Pred Ans :", pred_ans)

	if ans:
	    # final prediction with all the joint answers.
	    pred_finalans = predict_answer(" ".join(ans), question, tokenizer, model)
	    st.write('Answer:', pred_finalans)
	else:
	    # Show the message in the page as well; print() only reaches the server log.
	    st.write("Sorry, No document")