Spaces:
Runtime error
Runtime error
File size: 4,989 Bytes
442d312 472a794 15c3f5d d96dbbc 472a794 442d312 e5374a1 442d312 31f12cb bcbee82 442d312 5aa1bf9 b37de3c 5aa1bf9 442d312 981b258 442d312 4912f11 442d312 d1391ee 442d312 a3e1a90 442d312 a3e1a90 442d312 0e90d70 bc84182 b5b0fd2 854c812 5e68e2f bc84182 854c812 bbce466 c010f14 5e68e2f c010f14 5e68e2f 0e90d70 d7f0548 442d312 89b0019 b37de3c a3e1a90 442d312 d7f0548 0f32c86 d7f0548 442d312 ad9a1e9 5e68e2f bbce466 5e68e2f 442d312 5e68e2f 442d312 2482299 5aa1bf9 bc84182 de8c106 5da1616 d7f0548 2482299 442d312 7ef436d c379db0 5e68e2f 58cbf7b 31f12cb 574b79f 8af8a15 442d312 1a235c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import gradio as gr
import os
import numpy as np
os.system("pip install pdfminer.six rank_bm25 torch transformers")
from gradio.mix import Series
#import re
from rank_bm25 import BM25Okapi
import string
import torch
from transformers import pipeline
import pdfminer
from pdfminer.high_level import extract_text
# Chunking of the extracted PDF text, both measured in characters:
# each chunk is `len_doc` long and consecutive chunks share `overlap` characters.
len_doc, overlap = 500, 15
# Result caps for the two stages: candidates kept by the BM25 retrieval step,
# then answers kept after the question-answering model has scored them.
param_top_k_retriver, param_top_k_ranker = 15, 3
def read_pdf(file):
    """Extract the text of a PDF and cut it into overlapping character chunks.

    Args:
        file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object whose ``name`` attribute holds the path of the PDF.

    Returns:
        A list of strings, each at most ``len_doc`` characters long.
        Consecutive chunks start ``len_doc - overlap`` characters apart, so
        neighbouring chunks share ``overlap`` characters. The list is empty
        when no text could be extracted.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``len_doc``.
    """
    step = len_doc - overlap
    if step <= 0:
        # A non-positive stride would never advance through the text.
        raise ValueError("overlap must be smaller than len_doc")

    # Plain paths are used as-is; anything else is expected to carry `.name`.
    path = file if isinstance(file, (str, os.PathLike)) else file.name
    text = extract_text(path)

    # Split text into smaller docs
    return [text[start:start + len_doc] for start in range(0, len(text), step)]
# We use BM25 as the retriever: it performs the first round of candidate filtering based on word-level matching.
def bm25_tokenizer(text):
    """Turn a string into the list of tokens fed to BM25.

    The text is lower-cased and split on whitespace, then every piece has
    leading/trailing ASCII punctuation stripped. Pieces that end up empty or
    that match one of the stop words are dropped.

    Args:
        text: The document chunk or query to tokenize.

    Returns:
        The surviving tokens, in their original order.
    """
    stop_words = frozenset(
        ('a', 'the', 'am', 'is', 'are', 'who', 'how', 'where', 'when', 'why', 'what')
    )
    stripped_words = (word.strip(string.punctuation) for word in text.lower().split())
    return [word for word in stripped_words if word and word not in stop_words]
def retrieval(query, top_k_retriver, docs, bm25_):
    """Select the chunks that best match the query according to BM25.

    Args:
        query: The question text; it is tokenized with ``bm25_tokenizer``.
        top_k_retriver: Maximum number of candidates to consider.
        docs: The text chunks, indexable by the positions BM25 scores refer to.
        bm25_: An index object exposing ``get_scores(tokens)``.

    Returns:
        A list of dicts with keys ``'corpus_id'`` (index into ``docs``),
        ``'score'`` (BM25 score) and ``'docs'`` (the chunk text), ordered by
        descending score. Only chunks with a score above zero are included.
    """
    scores = bm25_.get_scores(bm25_tokenizer(query))
    # Indices of the highest-scoring chunks, best first.
    best_ids = np.argsort(scores)[::-1][:top_k_retriver]

    hits = []
    for idx in best_ids:
        if scores[idx] > 0:
            hits.append({'corpus_id': idx,
                         'score': scores[idx],
                         'docs': docs[idx]})
    hits.sort(key=lambda hit: hit['score'], reverse=True)
    return hits
def qa_ranker(query, docs_, top_k_ranker, model=None):
    """Run extractive QA over each candidate passage and keep the best answers.

    Args:
        query: The question text.
        docs_: Iterable of passage strings to search for an answer.
        top_k_ranker: Maximum number of answers to return.
        model: Callable invoked as ``model(question=..., context=...)`` that
            returns a dict containing at least a ``'score'`` entry. When
            omitted, a module-level ``qa_model`` is looked up instead, which
            keeps the old three-argument call form working.

    Returns:
        The answer dicts produced by ``model``, each extended with a ``'doc'``
        key holding the passage it came from, sorted by descending
        ``'score'`` and truncated to ``top_k_ranker`` entries.
    """
    if model is None:
        # Backward-compatible fallback; raises NameError if no global exists.
        model = qa_model

    answers = []
    for doc in docs_:
        answer = model(question=query, context=doc)
        answer['doc'] = doc
        answers.append(answer)
    answers.sort(key=lambda item: item['score'], reverse=True)
    return answers[:top_k_ranker]


def cstr(s, color='black'):
    """Wrap ``s`` in a ``<text>`` element styled with the given colour."""
    return "<text style=color:{}>{}</text>".format(color, s)


def cstr_bold(s, color='black'):
    """Wrap ``s`` in a coloured ``<text>`` element and render it bold."""
    return "<text style=color:{}><b>{}</b></text>".format(color, s)


def cstr_break(s, color='black'):
    """Wrap ``s`` in a coloured ``<text>`` element preceded by a line break."""
    return "<text style=color:{}><br>{}</text>".format(color, s)


def print_colored(text, start_idx, end_idx, confidence):
    """Render a passage as HTML with the answer span highlighted.

    Args:
        text: The passage the answer was extracted from.
        start_idx: Start offset of the answer inside ``text``.
        end_idx: End offset (exclusive) of the answer inside ``text``.
        confidence: Already-formatted confidence string, e.g. ``"87.3%"``.

    Returns:
        An HTML string: the passage in black with the answer span in bold
        blue, followed on a new line by the confidence in grey.
    """
    # Local import keeps this block self-contained.
    import html

    conf_str = '- Confidence: ' + confidence
    # The passage is raw PDF text, so escape it before embedding it in markup;
    # otherwise '<' or '&' in the document would be interpreted as HTML.
    pieces = [
        html.escape(text[:start_idx]),
        cstr_bold(html.escape(text[start_idx:end_idx]), color='blue'),
        html.escape(text[end_idx:]),
        cstr_break(conf_str, color='grey'),
    ]
    return cstr(' '.join(pieces), color='black')


# Model used when the caller does not pick one.
_DEFAULT_QA_MODEL = "minilm-uncased-squad2"


def _format_answer(answer):
    """Render one ranked answer dict as highlighted HTML with its confidence."""
    confidence = str(np.round(100 * answer["score"], 1)) + "%"
    return print_colored(answer['doc'], answer['start'], answer['end'], confidence)


def final_qa_pipeline(file, query, model_nm=None):
    """Answer a question about a PDF: BM25 retrieval, then QA re-ranking.

    Args:
        file: The PDF to read, in whatever form ``read_pdf`` accepts.
        query: The question text.
        model_nm: Name of the model under the ``deepset/`` namespace. Falls
            back to ``_DEFAULT_QA_MODEL`` when empty or ``None``.

    Returns:
        A tuple ``(top1, top2)`` of HTML strings for the two best answers.
        Both are ``"No match"`` when the PDF yields no text or no passage
        matches the query; ``top2`` is ``"None"`` when only one answer exists.
    """
    no_match = ("No match", "No match")

    docs = read_pdf(file)
    if not docs:
        # BM25Okapi cannot be built from an empty corpus.
        return no_match

    bm25 = BM25Okapi([bm25_tokenizer(doc) for doc in docs])
    candidates = retrieval(query, param_top_k_retriver, docs, bm25)
    if not candidates:
        return no_match

    # Load the model only once there is something to rank.
    reader = pipeline("question-answering",
                      model="deepset/" + str(model_nm or _DEFAULT_QA_MODEL))
    # The reader is passed explicitly: it is local to this function, so
    # qa_ranker cannot see it through a global lookup.
    ranked = qa_ranker(query,
                       [hit["docs"] for hit in candidates],
                       param_top_k_ranker,
                       model=reader)
    if not ranked:
        return no_match

    top1 = _format_answer(ranked[0])
    # Guard on the list that is actually indexed.
    top2 = _format_answer(ranked[1]) if len(ranked) > 1 else "None"
    return (top1, top2)
# Sample rows for the UI: [absolute path of a PDF, question about it].
# The PDF names are resolved against the current working directory.
examples = [
    [os.path.abspath(pdf_name), question]
    for pdf_name, question in (
        ("dbs-annual-report-2020.pdf", "how many times has DBS won Best bank in the world ?"),
        ("dbs-annual-report-2020.pdf", "what is PURE ?"),
        ("dbs-annual-report-2020.pdf", "how much dividend was paid to shareholders ?"),
        ("dbs-annual-report-2020.pdf", "what are the key risks ?"),
        ("dbs-annual-report-2020.pdf", "what is the sustainability focus ?"),
        ("NASDAQ_AAPL_2020.pdf", "how much are the outstanding shares ?"),
        ("NASDAQ_AAPL_2020.pdf", "what is competitors strategy ?"),
        ("NASDAQ_AAPL_2020.pdf", "who is the chief executive officer ?"),
        ("NASDAQ_MSFT_2020.pdf", "How much is the guided revenue for next quarter?"),
    )
]
# Input widgets, in the order of final_qa_pipeline's parameters
# (file, query, model_nm). The dropdown values are the model names that
# final_qa_pipeline appends to "deepset/".
qa_inputs = [
    gr.inputs.File(label="input pdf file"),
    gr.inputs.Textbox(label="Question:"),
    gr.inputs.Dropdown(choices=["minilm-uncased-squad2", "roberta-base-squad2"], label="Model"),
]

# One HTML panel per element of the (top1, top2) tuple the pipeline returns.
qa_outputs = [
    gr.outputs.HTML(label="Top 1 answer"),
    gr.outputs.HTML(label="Top 2 answer"),
]

qa_description = "Navigate long annual reports by using Machine learning to answer your questions. \nSimply upload any annual report pdf you are interested in and ask model a question OR load an example from below."

iface = gr.Interface(
    fn=final_qa_pipeline,
    inputs=qa_inputs,
    outputs=qa_outputs,
    examples=examples,
    theme="grass",
    title="Question Answering on annual reports",
    description=qa_description,
)

iface.launch(enable_queue=True)