!pip install transformers
!pip install -Uq evaluate
!pip install -Uq SentencePiece
!pip install -Uq sentence-transformers
!pip install -Uq streamlit
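# Note: this script is a Streamlit app; the `!pip` lines above assume a
# Colab/Jupyter-style cell, while the app itself is launched with
# `streamlit run <script>.py`.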

from sentence_transformers import SentenceTransformer, util
import streamlit as st

import pandas as pd
import torch
import ast

QA_VECTOR = '07.2-HBQA_QA_Vector_multi-qa-mpnet-base-dot-v1.csv'
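# Assumption: this CSV holds one row per evaluation question; only its
# 'Question' column is used below.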

# @title Load Example Questions
df_qa = pd.read_csv(QA_VECTOR)
example_qa = df_qa['Question'].sample(5).to_list()

question = st.selectbox('Please select a question', example_qa)

st.write('Your question:', question)
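# Streamlit re-executes the whole script on every widget interaction, so the
# retrieval and generation steps below run again for each newly selected question.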

CHUNK_VECTOR = '04.3-HBQA_Chunk_Vector_multi-qa-mpnet-base-dot-v1.csv'

df_chunk = pd.read_csv(CHUNK_VECTOR)

# @title Load Embedding Model
embmodelname = 'multi-qa-mpnet-base-dot-v1'
embmodelname1 = '_multi-qa-mpnet-base-dot-v1'  # suffix of the vector columns in the CSVs
embmodel = SentenceTransformer(embmodelname)
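# multi-qa-mpnet-base-dot-v1 is a 768-dimensional SentenceTransformer trained
# for question-passage retrieval with dot-product scoring.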

# @title Create Question Embedding
def get_ques_vector(ques):
    # Encode the question into the same vector space as the chunk embeddings
    return embmodel.encode(ques)

question_embedding = get_ques_vector(question)
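# For a single string input, embmodel.encode() returns a 1-D numpy array;
# util.semantic_search below accepts numpy arrays and torch tensors alike.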

# @title Load all chunk vectors into memory
chunk_id = df_chunk['Chunk_Id'].to_list()

# The vectors were serialized as stringified Python lists, so parse them back.
chunk_vector = [ast.literal_eval(v) for v in df_chunk['ChunkVector' + embmodelname1]]
chunk_vector = torch.tensor(chunk_vector)
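# Parsing stringified lists with ast.literal_eval is slow for large corpora. A
# common alternative (a sketch, not part of this pipeline) is a binary cache:
#   import numpy as np
#   np.save('chunk_vectors.npy', chunk_vector.numpy())             # one-time save
#   chunk_vector = torch.from_numpy(np.load('chunk_vectors.npy'))  # fast reload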

# @title Predict Chunk Id for Question
top_k = 5
hits = util.semantic_search(question_embedding, chunk_vector, top_k=top_k)

# Hits carry row indices into chunk_vector (corpus_id); map them back to Chunk_Id values.
predicted_idx = [item['corpus_id'] for item in hits[0]]
predicted_Docid = [df_chunk.loc[i, 'Chunk_Id'] for i in predicted_idx]
print(predicted_Docid)
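# Each hit is a dict of the form {'corpus_id': <row index>, 'score': <similarity>}.
# util.semantic_search scores with cosine similarity by default; since this
# embedding model was trained for dot-product retrieval, passing
# score_function=util.dot_score would match its training objective more closely.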

# @title Load Prediction Model
from transformers import T5ForConditionalGeneration, T5Tokenizer

MODEL_FOLDER = '/content/drive/MyDrive/HBQA/t5small-30epoch'

# Load the fine-tuned tokenizer and T5 model from the checkpoint folder
tokenizer = T5Tokenizer.from_pretrained(MODEL_FOLDER)
model = T5ForConditionalGeneration.from_pretrained(MODEL_FOLDER)

DEVICE = 'cpu'
model.to(DEVICE)
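# MODEL_FOLDER assumes a Google Drive mount at /content/drive (Colab). If a GPU
# is available, moving the model over is a one-line change:
#   DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'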

# @title Predict Answer

def predict_answer(context, question, tokenizer, model):
    Q_LEN = 1500  # maximum input length (question + context) in tokens

    inputs = tokenizer(question, context, max_length=Q_LEN,
                       padding='max_length', truncation=True, add_special_tokens=True)

    input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0).to(DEVICE)
    attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100)

    predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)

    # Treat empty or special-token-only generations as "no answer"
    if len(predicted_answer) < 3 or predicted_answer.startswith(('[CLS]', '[SEP]', '<s>')):
        predicted_answer = 'xxx'
    return predicted_answer
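# Illustrative call (hypothetical strings, not from the dataset):
#   predict_answer('Paris is the capital of France.',
#                  'What is the capital of France?', tokenizer, model)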

verbose = False

# Fall back to an empty list when retrieval returns nothing, so the loop below is skipped.
if predicted_Docid:
    ids = predicted_Docid
else:
    ids = []
    print("Sorry, No document")

ans = []
for doc_id in ids:
    # Look up the chunk text for this chunk id
    cond = df_chunk['Chunk_Id'] == doc_id
    chunk = df_chunk.loc[cond, 'Chunk'].values[0]

    # Answer the question against this single chunk
    pred_ans = predict_answer(chunk, question, tokenizer, model)
    ans.append(pred_ans)

    if verbose:
        print('Pred Ans  :', pred_ans)

# Final prediction over the joined per-chunk answers.
pred_finalans = predict_answer(' '.join(ans), question, tokenizer, model)
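# This is a simple two-stage fusion: each top-k chunk is answered independently,
# then the concatenated per-chunk answers become the context for one final
# generation. Note that 'xxx' placeholders from unanswerable chunks pass through
# verbatim; filtering them out before the join might yield a cleaner context.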

st.write('Answer:', pred_finalans)