|
!pip install -Uq transformers
|
!pip install -Uq evaluate |
|
!pip install -Uq SentencePiece |
|
!pip install -Uq sentence-transformers |
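
# Streamlit demo: pick a question, retrieve the most relevant text chunks with a
# sentence-transformer, then generate an answer with a fine-tuned T5 model.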
|
|
|
from sentence_transformers import SentenceTransformer, util
|
import streamlit as st |
|
|
|
import pandas as pd |
|
import torch |
|
import ast |
|
|
|
# CSV with precomputed question embeddings (one row per question)
QA_VECTOR = '07.2-HBQA_QA_Vector_multi-qa-mpnet-base-dot-v1.csv'
|
|
|
|
|
df_qa = pd.read_csv(QA_VECTOR)

# Offer five randomly sampled questions as examples in the UI
example_qa = df_qa['Question'].sample(5).to_list()
|
|
|
question = st.selectbox('Please select a question', example_qa)

st.write('Your question:', question)
|
|
|
# CSV with the document chunks and their precomputed embeddings
CHUNK_VECTOR = '04.3-HBQA_Chunk_Vector_multi-qa-mpnet-base-dot-v1.csv'

df_chunk = pd.read_csv(CHUNK_VECTOR)
|
# Sentence-transformer used to embed both questions and chunks;
# embmodelname1 is the suffix used in the chunk-vector column names below
embmodelname, embmodelshort, embmodelname1 = 'multi-qa-mpnet-base-dot-v1', 'mpnet', '_multi-qa-mpnet-base-dot-v1'

embmodel = SentenceTransformer(embmodelname)
|
|
|
|
|
def get_ques_vector(ques):
    # Encode the question into a dense vector (768-dim for this model)
    return embmodel.encode(ques)


question_embedding = get_ques_vector(question)
|
|
|
|
|
chunk_id = df_chunk['Chunk_Id'].to_list()

# Chunk vectors were saved as stringified lists, so parse them back into floats
chunk_vector = df_chunk['ChunkVector' + embmodelname1].apply(ast.literal_eval).to_list()
chunk_vector = torch.tensor(chunk_vector)
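# Note: parsing every vector with ast.literal_eval is slow for large corpora;
# caching the parsed tensor (e.g. with st.cache_data or a saved .npy file)
# would make Streamlit reruns much faster.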
|
|
|
|
|
top_k = 5
hits = util.semantic_search(question_embedding, chunk_vector, top_k=top_k)
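# Each entry of hits[0] is a dict like {'corpus_id': <row index>, 'score': <float>},
# sorted by score. Note that util.semantic_search scores with cosine similarity by
# default; since multi-qa-mpnet-base-dot-v1 is tuned for dot-product similarity,
# passing score_function=util.dot_score may retrieve slightly better matches.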
|
|
|
# Map the hit indices back to chunk ids
predictedId = [hit['corpus_id'] for hit in hits[0]]
predicted_Docid = [df_chunk.loc[i, "Chunk_Id"] for i in predictedId]
print(predicted_Docid)
|
|
|
|
|
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Fine-tuned T5 checkpoint saved on Google Drive
MODEL_FOLDER = '/content/drive/MyDrive/HBQA/t5small-30epoch'

tokenizer = T5Tokenizer.from_pretrained(MODEL_FOLDER)
model = T5ForConditionalGeneration.from_pretrained(MODEL_FOLDER)

# Inference runs on the CPU; switch to 'cuda' if a GPU is available
DEVICE = 'cpu'
model.to(DEVICE)
|
|
|
|
|
|
|
def predict_answer(context, question, tokenizer, model):
    # T5 uses relative position embeddings, so inputs longer than the usual
    # 512 tokens are possible, at the cost of extra memory and compute
    Q_LEN = 1500

    inputs = tokenizer(question, context, max_length=Q_LEN, padding="max_length",
                       truncation=True, add_special_tokens=True)

    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100)

    predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)

    # Treat empty or degenerate generations as "no answer"
    if len(predicted_answer) < 3:
        predicted_answer = "xxx"
    elif predicted_answer.startswith(('[CLS]', '[SEP]', '<s>')):
        predicted_answer = "xxx"
    return predicted_answer
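
# Quick sanity check (hypothetical strings, not from the dataset):
#   predict_answer("Paris is the capital of France.",
#                  "What is the capital of France?", tokenizer, model)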
|
|
|
verbose = False

# Guard against empty retrieval results (ids would otherwise be undefined below)
if predicted_Docid:
    ids = predicted_Docid
else:
    ids = []
    st.write("Sorry, no document found.")
|
|
|
ans = []
for doc_id in ids:
    # Look up the chunk text for this id
    cond = df_chunk["Chunk_Id"] == doc_id
    chunk = df_chunk.loc[cond, "Chunk"].values[0]

    # Answer the question against this single chunk
    pred_ans = predict_answer(chunk, question, tokenizer, model)
    ans.append(pred_ans)

    if verbose:
        print("Pred Ans :", pred_ans)
|
|
|
|
|
# Fuse the per-chunk answers by asking the model one final time, with the
# individual answers (minus the "xxx" no-answer placeholders) as the context
fused_context = " ".join(a for a in ans if a != "xxx")
pred_finalans = predict_answer(fused_context, question, tokenizer, model)

st.write('Answer:', pred_finalans)
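
# To launch the app (assuming this script is saved as app.py):
#   streamlit run app.py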