Spaces:
Runtime error
Runtime error
File size: 4,683 Bytes
442d312 472a794 15c3f5d 0e90d70 472a794 442d312 bcbee82 442d312 bc84182 442d312 981b258 442d312 4912f11 442d312 d1391ee 442d312 a3e1a90 442d312 a3e1a90 442d312 dbd16a5 442d312 0e90d70 bc84182 b5b0fd2 854c812 bc84182 854c812 5a3144d bc84182 0e90d70 442d312 89b0019 a58a4e4 a3e1a90 442d312 bc84182 854c812 442d312 b5b0fd2 442d312 2482299 bc84182 de8c106 2482299 442d312 7ef436d 7e61444 b5b0fd2 58cbf7b d4f9cd8 6e382ec 442d312 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import gradio as gr
import os
import numpy as np
os.system("pip install pdfminer.six rank_bm25 torch transformers termcolor")
from gradio.mix import Series
import re
from rank_bm25 import BM25Okapi
import string
import torch
from transformers import pipeline
import pdfminer
from pdfminer.high_level import extract_text
#from termcolor import colored
def read_pdf(file):
text = extract_text(file.name)
# Split text into smaller docs
len_doc = 400
overlap = 50
docs = []
i = 0
while i < len(text):
docs.append(text[i:i+len_doc])
i = i + len_doc - overlap
return docs
# We use BM25 as retriver which will do 1st round of candidate filtering based on word based matching
def bm25_tokenizer(text):
stop_w = ['a', 'the', 'am', 'is' , 'are', 'who', 'how', 'where', 'when', 'why', 'what']
tokenized_doc = []
for token in text.lower().split():
token = token.strip(string.punctuation)
if len(token) > 0 and token not in stop_w:
tokenized_doc.append(token)
return tokenized_doc
def retrieval(query, top_k_retriver, docs, bm25_):
bm25_scores = bm25_.get_scores(bm25_tokenizer(query))
top_n = np.argsort(bm25_scores)[::-1][:top_k_retriver]
bm25_hits = [{'corpus_id': idx,
'score': bm25_scores[idx],
'docs':docs[idx]} for idx in top_n if bm25_scores[idx] > 0]
bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
return bm25_hits
qa_model = pipeline("question-answering",
model = "deepset/roberta-base-squad2")
def qa_ranker(query, docs_, top_k_ranker):
ans = []
for doc in docs_:
answer = qa_model(question = query,
context = doc)
answer['doc'] = doc
ans.append(answer)
return sorted(ans, key=lambda x: x['score'], reverse=True)[:top_k_ranker]
def cstr(s, color='black'):
return "<text style=color:{}>{}</text>".format(color, s)
def cstr_bold(s, color='black'):
return "<text style=color:{}><b>{}</b></text>".format(color, s)
def cstr_break(s, color='black'):
return "<text style=color:{}>{}<br></text>".format(color, s)
def print_colored(text, start_idx, end_idx, confidence):
conf_str = 'Confidence: ' + confidence)
a = cstr_break(' '.join(cstr(' '.join([text[:start_idx], cstr_bold(text[start_idx:end_idx], color='red'), text[end_idx:]]), color='black'), conf_str))
#a = colored(text[:start_idx]) + colored(text[start_idx:end_idx], 'red', 'on_yellow') + colored(text[end_idx:])
return a
def final_qa_pipeline(file, query):
docs = read_pdf(file)
tokenized_corpus = []
for doc in docs:
tokenized_corpus.append(bm25_tokenizer(doc))
bm25 = BM25Okapi(tokenized_corpus)
top_k_retriver, top_k_ranker = 20,1
lvl1 = retrieval(query, top_k_retriver, docs, bm25)
if len(lvl1) > 0:
fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker)
#return (fnl_rank[0]["answer"], str(np.round(100*fnl_rank[0]["score"],2))+"%" , fnl_rank[0]['doc'])
#return (print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end']), str(np.round(100*fnl_rank[0]["score"],2))+"%"
return (print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end'], str(np.round(100*fnl_rank[0]["score"],2))+"%") )
#for fnl_ in fnl_rank:
# print("\n")
# print_colored(fnl_['doc'], fnl_['start'], fnl_['end'])
# print(colored("Confidence score of ") + colored(str(fnl_['score'])[:4], attrs=['bold']))
else:
return ("No match", "0")
examples = [
[os.path.abspath("dbs-annual-report-2020.pdf"), "how much dividend was paid to shareholders ?"],
[os.path.abspath("dbs-annual-report-2020.pdf"), "what are the key risks ?"],
[os.path.abspath("dbs-annual-report-2020.pdf"), "what is the sustainability focus ?"],
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "how much are the outstanding shares ?"],
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "How high is shareholders equity ?"],
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "what is competitors strategy ?"],
]
iface = gr.Interface(
fn = final_qa_pipeline,
inputs = [gr.inputs.File(label="input pdf file"), gr.inputs.Textbox(label="Question:")],
outputs = [gr.outputs.HTML(label="Predicted answer"), gr.outputs.Textbox(label="Confidence") ],
examples=examples,
title = "Question Answering on company annual reports",
description = "Simply upload any annual report pdf you are interested in and ask model a question OR load an example from below."
)
iface.launch() |