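# HBQA demo: pick a sample question, retrieve the most relevant text chunks
# with a sentence-transformers bi-encoder, then generate the final answer with
# a fine-tuned T5 reader.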
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import streamlit as st
import pandas as pd
import torch
import ast
QA_VECTOR = '07.2-HBQA_QA_Vector_multi-qa-mpnet-base-dot-v1.csv'
# @title Load Example Questions
df_qa = pd.read_csv(QA_VECTOR)
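# df_qa holds the precomputed question set; only the 'Question' column is used here.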
example_qa = df_qa['Question'].sample(5).to_list()
question = st.selectbox(
    'Please select a question',
    example_qa)
st.write('Your question:', question)
CHUNK_VECTOR = '04.3-HBQA_Chunk_Vector_multi-qa-mpnet-base-dot-v1.csv'
df_chunk = pd.read_csv(CHUNK_VECTOR)
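# df_chunk maps each Chunk_Id to its raw text ('Chunk') and a precomputed
# embedding column named 'ChunkVector_multi-qa-mpnet-base-dot-v1'.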
# Sanity checks: df_qa.shape, df_qa.head(1), df_chunk.head(1), df_chunk.shape
# @title Load Embedding Model
embmodelname = 'multi-qa-mpnet-base-dot-v1'    # full sentence-transformers model id
embmodelshort = 'mpnet'                        # short tag (not used in this script)
embmodelname1 = '_multi-qa-mpnet-base-dot-v1'  # suffix used in the CSV column names
embmodel = SentenceTransformer(embmodelname)
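# SentenceTransformer downloads and caches the weights from the Hugging Face
# hub on first use; multi-qa-mpnet-base-dot-v1 emits 768-dimensional vectors.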
# @title Create Question Embedding
def get_ques_vector(ques):
    # Encode the question text into a dense embedding with the bi-encoder
    Question_Embeddings = embmodel.encode(ques)
    return Question_Embeddings

question_embedding = get_ques_vector(question)
# @title Load all chunk vectors into memory
chunk_id = df_chunk['Chunk_Id'].to_list()
# Each embedding is stored in the CSV as a stringified Python list, so parse it
# back with ast.literal_eval before stacking the rows into a tensor.
chunk_vector = [ast.literal_eval(v) for v in df_chunk['ChunkVector' + embmodelname1]]
chunk_vector = torch.tensor(chunk_vector)
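# chunk_vector now has shape (num_chunks, 768): one row per corpus chunk.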
# @title Predict Chunk Id for Question
top_k = 5
hits = util.semantic_search(question_embedding, chunk_vector, top_k=top_k)
# util.semantic_search returns one ranked hit list per query; each hit is a
# dict with 'corpus_id' (the row index into chunk_vector) and 'score'.
predictedId = [item['corpus_id'] for item in hits[0]]
predicted_Docid = [df_chunk.loc[i, "Chunk_Id"] for i in predictedId]  # map row index back to its Chunk_Id
print(predicted_Docid)
# @title Load Prediction Model
from transformers import T5ForConditionalGeneration, T5Tokenizer
MODEL_FOLDER = 't5small-30epoch'
# Load the corresponding tokenizer
tokenizer = T5Tokenizer.from_pretrained(MODEL_FOLDER)
# Load the pre-trained T5 model
model = T5ForConditionalGeneration.from_pretrained(MODEL_FOLDER)
DEVICE = 'cpu'
model.to(DEVICE)
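# from_pretrained already puts the model in eval mode, so no explicit
# model.eval() call is needed before inference.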
# @title Predict Answer
def predict_answer(context, question, tokenizer, model):
    Q_LEN = 1500
    # T5 takes the question and context as a single text pair, padded/truncated
    # to a fixed length.
    inputs = tokenizer(question, context, max_length=Q_LEN, padding="max_length",
                       truncation=True, add_special_tokens=True)
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100)
    predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)
    # Treat very short or special-token-only generations as "no answer"
    if len(predicted_answer) < 3:
        predicted_answer = "xxx"
    elif predicted_answer[:5] in ('[CLS]', '[SEP]') or predicted_answer[:3] == '<s>':
        predicted_answer = "xxx"
    return predicted_answer
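# Hypothetical usage (the chunk text below is made up for illustration):
#   predict_answer("The Taj Mahal was commissioned by Shah Jahan in 1632 ...",
#                  "Who commissioned the Taj Mahal?", tokenizer, model)
# should return a short span such as "Shah Jahan".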
verbose = False
if len(predicted_Docid) > 3:
    ids = predicted_Docid
else:
    ids = []  # nothing retrieved, so the answer loop below is skipped
    print("Sorry, no document")
ans = []
for id in ids:
    # Look up the chunk text for this Chunk_Id and run the reader over it
    cond = df_chunk["Chunk_Id"] == id
    chunk = df_chunk.loc[cond]["Chunk"].values[0]
    pred_ans = predict_answer(chunk, question, tokenizer, model)
    ans.append(pred_ans)
    if verbose:
        print("Pred Ans :", pred_ans)
# Final pass: run the reader once more over all candidate answers joined together.
pred_finalans = predict_answer(" ".join(ans), question, tokenizer, model)
st.write('Answer:', pred_finalans)