Spaces:
Build error
Build error
File size: 3,133 Bytes
28ec4f0 a3fdd99 2a34bac 836e16d a3fdd99 f6cc0cb 28c1177 2a34bac a3fdd99 9a54394 5fdc2d5 cc0fbf1 9a54394 58c1223 9a54394 a3fdd99 3a4a956 a3fdd99 3a4a956 f6cc0cb 2a34bac cb95f0e 2d4dc51 67f4a7d d42a71a 2a34bac 9a54394 9097656 762970d f176fb0 762970d fe7b517 c1986cc fe7b517 3a4a956 2a34bac cb95f0e 3a4a956 67f4a7d 2a34bac 67f4a7d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
from haystack.schema import Document
import logging
import base64
from PIL import Image
import validators
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple:
        an in-memory document store, a ``TransformersSummarizer`` using the
        ``facebook/bart-large-cnn`` model, and a ``PreProcessor`` that cleans
        the text and splits it into 200-word chunks while respecting
        sentence boundaries.
    """
    # The decorator hashes any "builtins.SwigPyObject" to None and allows the
    # returned objects to be mutated (the document store is written to later).
    splitting_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    store = InMemoryDocumentStore()
    text_preprocessor = PreProcessor(**splitting_options)
    bart_summarizer = TransformersSummarizer(
        model_name_or_path="facebook/bart-large-cnn"
    )
    return store, bart_summarizer, text_preprocessor
def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of a PDF.

    The module-level ``document_store`` is emptied, the uploaded bytes are
    written to ``temp-path.pdf``, and that file is converted to text,
    preprocessed and written back into the store.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; the script
            passes the value returned by ``st.file_uploader``.

    Returns:
        None. The result is the new content of ``document_store``.
    """
    temp_path = "temp-path.pdf"

    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # Write the raw bytes as-is: encoding them to base64 only to decode them
    # again yields the same bytes and just copies the payload twice.
    with open(temp_path, "wb") as temp_file:
        temp_file.write(pdf_file.read())

    # Convert only after the with-block has closed (and therefore flushed)
    # the file, so the converter never sees a partially written PDF.
    doc = converter.convert(file_path=temp_path, meta=None)
    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)
def summarize(content):
    """Load ``content`` into the document store and summarize what it holds.

    Args:
        content: The uploaded PDF, forwarded unchanged to
            ``pdf_to_document_store``.

    Returns:
        Whatever ``summarizer.predict`` returns when called on every stored
        document with ``generate_single_summary=True``.
    """
    pdf_to_document_store(content)
    stored_documents = document_store.get_all_documents()
    return summarizer.predict(
        documents=stored_documents,
        generate_single_summary=True,
    )
def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in ``st.session_state`` unless it is set.

    An existing entry for ``key`` is left untouched.
    """
    if key in st.session_state:
        return
    st.session_state[key] = value
# --- Page script: runs top to bottom each time Streamlit executes the file ---

# Make sure the "summaries" slot exists before anything reads it below.
set_state_if_absent("summaries", None)

# Module-level names used by pdf_to_document_store() and summarize().
document_store, summarizer, preprocessor = start_haystack()

st.title('TL;DR with Haystack')
image = Image.open('header-image.png')
st.image(image)

st.markdown( """
This Summarization demo uses a [Haystack TransformerSummarizer node](https://haystack.deepset.ai/pipeline_nodes/summarizer). You can upload a PDF file, which will be converted to text with the [Haystack PDFtoTextConverter](https://haystack.deepset.ai/reference/file-converters#pdftotextconverter). In this demo, we produce 1 summary for the whole file you upload. So, the TransformerSummarizer treats the whole thing as one string, which means along with the model limitations, PDFs that have a lot of unneeded text at the beginning produce poor results. For best results, upload a document that has minimal intro and tables at the top.
""", unsafe_allow_html=True)

uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)

if uploaded_file is not None:
    if st.button('Summarize Document'):
        with st.spinner("π Please wait while we produce a summary..."):
            try:
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                # Top-level boundary: keep the traceback in the logs, but also
                # tell the user and drop any summary left over from an earlier
                # run so it is not shown as if it belonged to this file.
                logging.exception(e)
                st.session_state.summaries = None
                st.error("Sorry, the document could not be summarized. Please try another PDF file.")

# Render whatever summaries are currently held in the session state.
if st.session_state.summaries:
    st.write('## Summary')
    for summary in st.session_state.summaries:
        st.write(summary.content)
|