import re

import openai
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sentence_transformers import CrossEncoder
from transformers import pipeline

# Maps each extracted passage to the URL it came from.
all_documents = {}


def qa_gpt3(question, context):
    """Answer a question with the OpenAI Completion API, given a context passage."""
    print(question, context)
    openai.api_key = st.secrets["openai_key"]
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Answer given the following context: {context}\n\nQuestion: {question}",
        temperature=0.7,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    print(response)
    return {'answer': response['choices'][0]['text'].strip()}


st.title('Document Question Answering System')

qa_model = None

crawl_urls = st.checkbox('Crawl?', value=False)

document_text = st.text_area(
    label="Links (Comma separated)", height=100,
    value='https://www.databricks.com/blog/2022/11/15/values-define-databricks-culture.html, '
          'https://databricks.com/product/databricks-runtime-for-machine-learning/faq')
query = st.text_input("Query")

qa_option = st.selectbox('Q/A Answerer', ('gpt3', 'a-ware/bart-squadv2'))
tokenizing = st.selectbox(
    'How to Tokenize',
    ("Don't (use entire body as document)",
     'Newline (split by newline character)',
     'Combo'))

if qa_option == 'gpt3':
    qa_model = qa_gpt3
else:
    # Fall back to a local Hugging Face extractive Q/A pipeline.
    qa_model = pipeline("question-answering", model=qa_option)
st.write(f'Using {qa_option} as the Q/A model')

# Cross-encoder used to rank passages by relevance to the query.
encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


def get_relevant_passage(question, documents):
    """Score every (question, passage) pair and return the highest-scoring passage."""
    query_paragraph_list = [(question, para) for para in list(documents.keys())
                            if len(para.strip()) > 0]
    scores = encoder.predict(query_paragraph_list)
    top_5_indices = scores.argsort()[-5:]
    top_5_query_paragraph_list = [query_paragraph_list[i] for i in top_5_indices]
    top_5_query_paragraph_list.reverse()  # highest score first
    return top_5_query_paragraph_list[0][1]


def answer_question(query, context):
    answer = qa_model(question=query, context=context)['answer']
    return answer


def get_documents(document_text, crawl=crawl_urls):
    """Fetch each URL, optionally crawl same-site links, and split pages into passages."""
    urls = document_text.split(',')
    for url in urls:
        url = url.strip()
        st.write(f'Crawling {url}')
        if url in set(all_documents.values()):
            continue
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'html.parser')

        if crawl:
            st.write('Give me a sec, crawling..')
            more_urls = re.findall(
                r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                html)
            # Keep only same-site links that do not end in a file extension.
            more_urls = list(set(
                [m for m in more_urls
                 if m[-4] != '.' and m[-3] != '.'
                 and m.split('/')[:3] == url.split('/')[:3]]))
            for more_url in more_urls:
                all_documents.update(get_documents(more_url, crawl=False))

        # Drop very short lines (menus, buttons, etc.) from the page body.
        body = "\n".join([x for x in soup.body.get_text().split('\n') if len(x) > 10])
        print(body)
        if tokenizing == "Don't (use entire body as document)":
            document_paragraphs = [body]
        elif tokenizing == 'Newline (split by newline character)':
            document_paragraphs = [n for n in body.split('\n') if len(n) > 250]
        elif tokenizing == 'Combo':
            document_paragraphs = [body] + [n for n in body.split('\n') if len(n) > 250]
        for document_paragraph in document_paragraphs:
            all_documents[document_paragraph] = url
    return all_documents


if len(document_text.strip()) > 0 and len(query.strip()) > 0 and qa_model and encoder:
    st.write('Hmmm let me think about that..')
    document_text = document_text.strip()
    documents = get_documents(document_text)
    st.write(f'I am looking through {len(set(documents.values()))} sites')
    query = query.strip()
    context = get_relevant_passage(query, documents)
    answer = answer_question(query, context)
    relevant_url = documents[context]
    st.write('Check the answer below... with reference text')
    st.header("ANSWER: " + answer)
    st.subheader("REFERENCE: " + context)
    st.subheader("REFERENCE URL: " + relevant_url)
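
# A minimal sketch of how to run this app locally. The filename app.py is an
# assumption (the script can be saved under any name); the secrets key
# "openai_key" matches the st.secrets lookup above.
#
#   pip install streamlit openai requests beautifulsoup4 sentence-transformers transformers
#
#   # .streamlit/secrets.toml
#   # openai_key = "sk-..."
#
#   streamlit run app.py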