Spaces:
Runtime error
Runtime error
Commit
•
5aa1bf9
1
Parent(s):
cfdace1
Update app.py
Browse files
app.py
CHANGED
@@ -11,10 +11,14 @@ import torch
|
|
11 |
from transformers import pipeline
|
12 |
import pdfminer
|
13 |
from pdfminer.high_level import extract_text
|
14 |
-
#from termcolor import colored
|
15 |
|
16 |
-
len_doc =
|
17 |
-
overlap =
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
def read_pdf(file):
|
20 |
text = extract_text(file.name)
|
@@ -50,10 +54,6 @@ def retrieval(query, top_k_retriver, docs, bm25_):
|
|
50 |
|
51 |
return bm25_hits
|
52 |
|
53 |
-
qa_model = pipeline("question-answering",
|
54 |
-
model = "deepset/minilm-uncased-squad2")
|
55 |
-
#model = "deepset/roberta-base-squad2")
|
56 |
-
|
57 |
def qa_ranker(query, docs_, top_k_ranker):
|
58 |
ans = []
|
59 |
for doc in docs_:
|
@@ -86,33 +86,27 @@ def final_qa_pipeline(file, query):
|
|
86 |
|
87 |
bm25 = BM25Okapi(tokenized_corpus)
|
88 |
|
89 |
-
top_k_retriver, top_k_ranker =
|
90 |
lvl1 = retrieval(query, top_k_retriver, docs, bm25)
|
91 |
|
92 |
if len(lvl1) > 0:
|
93 |
fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker)
|
94 |
-
#return (fnl_rank[0]["answer"], str(np.round(100*fnl_rank[0]["score"],2))+"%" , fnl_rank[0]['doc'])
|
95 |
-
#return (print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end']), str(np.round(100*fnl_rank[0]["score"],2))+"%"
|
96 |
top1 = print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end'], str(np.round(100*fnl_rank[0]["score"],1))+"%")
|
97 |
if len(lvl1)>1:
|
98 |
top2 = print_colored(fnl_rank[1]['doc'], fnl_rank[1]['start'], fnl_rank[1]['end'], str(np.round(100*fnl_rank[1]["score"],1))+"%")
|
99 |
else:
|
100 |
top2 = "None"
|
101 |
return (top1, top2)
|
102 |
-
#for fnl_ in fnl_rank:
|
103 |
-
# print("\n")
|
104 |
-
# print_colored(fnl_['doc'], fnl_['start'], fnl_['end'])
|
105 |
-
# print(colored("Confidence score of ") + colored(str(fnl_['score'])[:4], attrs=['bold']))
|
106 |
else:
|
107 |
return ("No match","No match")
|
108 |
|
109 |
examples = [
|
|
|
|
|
110 |
[os.path.abspath("dbs-annual-report-2020.pdf"), "how much dividend was paid to shareholders ?"],
|
111 |
[os.path.abspath("dbs-annual-report-2020.pdf"), "what are the key risks ?"],
|
112 |
[os.path.abspath("dbs-annual-report-2020.pdf"), "what is the sustainability focus ?"],
|
113 |
-
[os.path.abspath("dbs-annual-report-2020.pdf"), "what is PURE ?"],
|
114 |
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "how much are the outstanding shares ?"],
|
115 |
-
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "How high is shareholders equity ?"],
|
116 |
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "what is competitors strategy ?"],
|
117 |
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "who is the chief executive officer ?"],
|
118 |
]
|
|
|
11 |
from transformers import pipeline
|
12 |
import pdfminer
|
13 |
from pdfminer.high_level import extract_text
|
|
|
14 |
|
15 |
+
len_doc = 500
|
16 |
+
overlap = 30
|
17 |
+
|
18 |
+
qa_model = pipeline("question-answering",
|
19 |
+
#model = "deepset/minilm-uncased-squad2")
|
20 |
+
model = "deepset/roberta-base-squad2")
|
21 |
+
|
22 |
|
23 |
def read_pdf(file):
|
24 |
text = extract_text(file.name)
|
|
|
54 |
|
55 |
return bm25_hits
|
56 |
|
|
|
|
|
|
|
|
|
57 |
def qa_ranker(query, docs_, top_k_ranker):
|
58 |
ans = []
|
59 |
for doc in docs_:
|
|
|
86 |
|
87 |
bm25 = BM25Okapi(tokenized_corpus)
|
88 |
|
89 |
+
top_k_retriver, top_k_ranker = 20,3
|
90 |
lvl1 = retrieval(query, top_k_retriver, docs, bm25)
|
91 |
|
92 |
if len(lvl1) > 0:
|
93 |
fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker)
|
|
|
|
|
94 |
top1 = print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end'], str(np.round(100*fnl_rank[0]["score"],1))+"%")
|
95 |
if len(lvl1)>1:
|
96 |
top2 = print_colored(fnl_rank[1]['doc'], fnl_rank[1]['start'], fnl_rank[1]['end'], str(np.round(100*fnl_rank[1]["score"],1))+"%")
|
97 |
else:
|
98 |
top2 = "None"
|
99 |
return (top1, top2)
|
|
|
|
|
|
|
|
|
100 |
else:
|
101 |
return ("No match","No match")
|
102 |
|
103 |
examples = [
|
104 |
+
[os.path.abspath("dbs-annual-report-2020.pdf"), "how many times has DBS won Best bank in the world ?"],
|
105 |
+
[os.path.abspath("dbs-annual-report-2020.pdf"), "what is PURE ?"],
|
106 |
[os.path.abspath("dbs-annual-report-2020.pdf"), "how much dividend was paid to shareholders ?"],
|
107 |
[os.path.abspath("dbs-annual-report-2020.pdf"), "what are the key risks ?"],
|
108 |
[os.path.abspath("dbs-annual-report-2020.pdf"), "what is the sustainability focus ?"],
|
|
|
109 |
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "how much are the outstanding shares ?"],
|
|
|
110 |
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "what is competitors strategy ?"],
|
111 |
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "who is the chief executive officer ?"],
|
112 |
]
|