samarthagarwal23 committed on
Commit
5aa1bf9
1 Parent(s): cfdace1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -16
app.py CHANGED
@@ -11,10 +11,14 @@ import torch
11
  from transformers import pipeline
12
  import pdfminer
13
  from pdfminer.high_level import extract_text
14
- #from termcolor import colored
15
 
16
- len_doc = 400
17
- overlap = 50
 
 
 
 
 
18
 
19
  def read_pdf(file):
20
  text = extract_text(file.name)
@@ -50,10 +54,6 @@ def retrieval(query, top_k_retriver, docs, bm25_):
50
 
51
  return bm25_hits
52
 
53
- qa_model = pipeline("question-answering",
54
- model = "deepset/minilm-uncased-squad2")
55
- #model = "deepset/roberta-base-squad2")
56
-
57
  def qa_ranker(query, docs_, top_k_ranker):
58
  ans = []
59
  for doc in docs_:
@@ -86,33 +86,27 @@ def final_qa_pipeline(file, query):
86
 
87
  bm25 = BM25Okapi(tokenized_corpus)
88
 
89
- top_k_retriver, top_k_ranker = 30,3
90
  lvl1 = retrieval(query, top_k_retriver, docs, bm25)
91
 
92
  if len(lvl1) > 0:
93
  fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker)
94
- #return (fnl_rank[0]["answer"], str(np.round(100*fnl_rank[0]["score"],2))+"%" , fnl_rank[0]['doc'])
95
- #return (print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end']), str(np.round(100*fnl_rank[0]["score"],2))+"%"
96
  top1 = print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end'], str(np.round(100*fnl_rank[0]["score"],1))+"%")
97
  if len(lvl1)>1:
98
  top2 = print_colored(fnl_rank[1]['doc'], fnl_rank[1]['start'], fnl_rank[1]['end'], str(np.round(100*fnl_rank[1]["score"],1))+"%")
99
  else:
100
  top2 = "None"
101
  return (top1, top2)
102
- #for fnl_ in fnl_rank:
103
- # print("\n")
104
- # print_colored(fnl_['doc'], fnl_['start'], fnl_['end'])
105
- # print(colored("Confidence score of ") + colored(str(fnl_['score'])[:4], attrs=['bold']))
106
  else:
107
  return ("No match","No match")
108
 
109
  examples = [
 
 
110
  [os.path.abspath("dbs-annual-report-2020.pdf"), "how much dividend was paid to shareholders ?"],
111
  [os.path.abspath("dbs-annual-report-2020.pdf"), "what are the key risks ?"],
112
  [os.path.abspath("dbs-annual-report-2020.pdf"), "what is the sustainability focus ?"],
113
- [os.path.abspath("dbs-annual-report-2020.pdf"), "what is PURE ?"],
114
  [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "how much are the outstanding shares ?"],
115
- [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "How high is shareholders equity ?"],
116
  [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "what is competitors strategy ?"],
117
  [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "who is the chief executive officer ?"],
118
  ]
 
11
  from transformers import pipeline
12
  import pdfminer
13
  from pdfminer.high_level import extract_text
 
14
 
15
+ len_doc = 500
16
+ overlap = 30
17
+
18
+ qa_model = pipeline("question-answering",
19
+ #model = "deepset/minilm-uncased-squad2")
20
+ model = "deepset/roberta-base-squad2")
21
+
22
 
23
  def read_pdf(file):
24
  text = extract_text(file.name)
 
54
 
55
  return bm25_hits
56
 
 
 
 
 
57
  def qa_ranker(query, docs_, top_k_ranker):
58
  ans = []
59
  for doc in docs_:
 
86
 
87
  bm25 = BM25Okapi(tokenized_corpus)
88
 
89
+ top_k_retriver, top_k_ranker = 20,3
90
  lvl1 = retrieval(query, top_k_retriver, docs, bm25)
91
 
92
  if len(lvl1) > 0:
93
  fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker)
 
 
94
  top1 = print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end'], str(np.round(100*fnl_rank[0]["score"],1))+"%")
95
  if len(lvl1)>1:
96
  top2 = print_colored(fnl_rank[1]['doc'], fnl_rank[1]['start'], fnl_rank[1]['end'], str(np.round(100*fnl_rank[1]["score"],1))+"%")
97
  else:
98
  top2 = "None"
99
  return (top1, top2)
 
 
 
 
100
  else:
101
  return ("No match","No match")
102
 
103
  examples = [
104
+ [os.path.abspath("dbs-annual-report-2020.pdf"), "how many times has DBS won Best bank in the world ?"],
105
+ [os.path.abspath("dbs-annual-report-2020.pdf"), "what is PURE ?"],
106
  [os.path.abspath("dbs-annual-report-2020.pdf"), "how much dividend was paid to shareholders ?"],
107
  [os.path.abspath("dbs-annual-report-2020.pdf"), "what are the key risks ?"],
108
  [os.path.abspath("dbs-annual-report-2020.pdf"), "what is the sustainability focus ?"],
 
109
  [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "how much are the outstanding shares ?"],
 
110
  [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "what is competitors strategy ?"],
111
  [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "who is the chief executive officer ?"],
112
  ]