!pip install -Uq transformers
!pip install -Uq evaluate
!pip install -Uq SentencePiece
!pip install -Uq sentence-transformers

from sentence_transformers import SentenceTransformer, util
import streamlit as st
import pandas as pd
import torch
import ast

QA_VECTOR = '07.2-HBQA_QA_Vector_multi-qa-mpnet-base-dot-v1.csv'

# @title Load Example Questions
df_qa = pd.read_csv(QA_VECTOR)
example_qa = df_qa['Question'].sample(5).to_list()
question = st.selectbox('Please Select a Question', example_qa)
st.write('Your Question:', question)

CHUNK_VECTOR = '04.3-HBQA_Chunk_Vector_multi-qa-mpnet-base-dot-v1.csv'
df_chunk = pd.read_csv(CHUNK_VECTOR)

# @title Load Embedding Model
embmodelname = 'multi-qa-mpnet-base-dot-v1'
embmodelshort = 'mpnet'
embmodelname1 = '_multi-qa-mpnet-base-dot-v1'
embmodel = SentenceTransformer(embmodelname)

# @title Create Question Embedding
def get_ques_vector(ques):
    return embmodel.encode(ques)

question_embedding = get_ques_vector(question)

# @title Load All Chunk Vectors into Memory
chunk_id = df_chunk['Chunk_Id'].to_list()
# Each stored vector is a stringified list, so parse it back with ast.literal_eval.
chunk_vector = [ast.literal_eval(v) for v in df_chunk['ChunkVector' + embmodelname1]]
chunk_vector = torch.tensor(chunk_vector)

# @title Predict Chunk Id for Question
top_k = 5
hits = util.semantic_search(question_embedding, chunk_vector, top_k=top_k)
# Each hit carries the index (corpus_id) into chunk_vector.
predictedId = [item['corpus_id'] for item in hits[0]]
# Map each corpus index back to its corresponding Chunk_Id.
predicted_Docid = [df_chunk.loc[i, "Chunk_Id"] for i in predictedId]
print(predicted_Docid)

# @title Load Prediction Model
from transformers import T5ForConditionalGeneration, T5Tokenizer

MODEL_FOLDER = '/content/drive/MyDrive/HBQA/t5small-30epoch'

# Load the corresponding tokenizer and the fine-tuned T5 model.
tokenizer = T5Tokenizer.from_pretrained(MODEL_FOLDER)
model = T5ForConditionalGeneration.from_pretrained(MODEL_FOLDER)

DEVICE = 'cpu'
model.to(DEVICE)

# @title Predict Answer
def predict_answer(context, question, tokenizer, model):
    Q_LEN = 1500
    inputs = tokenizer(question, context, max_length=Q_LEN,
                       padding="max_length", truncation=True,
                       add_special_tokens=True)
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 max_length=100)

    predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)

    # Flag unusable answers: too short, or consisting only of special tokens.
    if len(predicted_answer) < 3 or predicted_answer.startswith(('[CLS]', '[SEP]')):
        predicted_answer = "xxx"
    return predicted_answer

verbose = False
if len(predicted_Docid) > 0:
    ids = predicted_Docid
else:
    ids = []  # Nothing retrieved: skip the answer loop instead of crashing on an undefined name.
    print("Sorry, no document found.")

ans = []
for id in ids:
    cond = df_chunk["Chunk_Id"] == id
    chunk = df_chunk.loc[cond]["Chunk"].values[0]
    pred_ans = predict_answer(chunk, question, tokenizer, model)
    ans.append(pred_ans)
    if verbose:
        print("Pred Ans :", pred_ans)

# Final prediction with all the joint answers.
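# (Design note: rather than ranking the per-chunk answers by retrieval score,
# the candidates are joined into one synthetic context and passed back through
# the same reader model, letting it reconcile them into a single final answer.)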
pred_finalans = predict_answer(" ".join(ans), question, tokenizer, model)
st.write('Answer:', pred_finalans)
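# To try the app locally, launch the script with Streamlit's CLI. The filename
# app.py below is an assumption, not part of the original notebook:
#   streamlit run app.py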