Create some text and make code more general
Files changed:
- __pycache__/custom_renderer.cpython-37.pyc  +0 -0
- app.py  +148 -173
- custom_renderer.py  +0 -3
__pycache__/custom_renderer.cpython-37.pyc
CHANGED
Binary files a/__pycache__/custom_renderer.cpython-37.pyc and b/__pycache__/custom_renderer.cpython-37.pyc differ
app.py
CHANGED
@@ -1,7 +1,9 @@
 import random
 from typing import AnyStr
 
+import itertools
 import streamlit as st
+import torch.nn.parameter
 from bs4 import BeautifulSoup
 import numpy as np
 import base64
@@ -48,8 +50,8 @@ potty_mouth_emojis = [
 
 # Page setup
 st.set_page_config(
-    page_title="
-    page_icon="
+    page_title="Post-processing summarization fact checker",
+    page_icon="
     layout="centered",
     initial_sidebar_state="auto",
     menu_items={
@@ -114,7 +116,7 @@ def format_explainer_html(html_string):
 
 def list_all_article_names() -> list:
     filenames = []
-    for file in os.listdir('./sample-articles/'):
+    for file in sorted(os.listdir('./sample-articles/')):
         if file.endswith('.txt'):
             filenames.append(file.replace('.txt', ''))
     return filenames
@@ -158,123 +160,68 @@ def classify_comment(comment, selected_model):
     st.session_state.results.append(result)
 
 
-# Start session
-if 'results' not in st.session_state:
-    st.session_state.results = []
-
-# Page
-# st.title('🤬 Dutch Toxic Comment Detection')
-# st.markdown("""This demo showcases two Dutch toxic comment detection models.""")
-#
-# # Introduction
-# st.markdown(f"""Both models were trained using a sequence classification task on a translated [Jigsaw Toxicity dataset](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) which contains toxic online comments.
-# The first model is a fine-tuned multilingual [DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased) model whereas the second is a fine-tuned Dutch RoBERTa-based model called [RobBERT](https://huggingface.co/pdelobelle/robbert-v2-dutch-base).""")
-# st.markdown(f"""For a more comprehensive overview of the models check out their model card on 🤗 Model Hub: [distilbert-base-dutch-toxic-comments]({model_names_to_URLs['ml6team/distilbert-base-dutch-cased-toxic-comments']}) and [RobBERT-dutch-base-toxic-comments]({model_names_to_URLs['ml6team/robbert-dutch-base-toxic-comments']}).
-# """)
-# st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
-# <font color="black">
-# <span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
-# </font>
-# tokens indicate toxicity whereas
-# <font color="black">
-# <span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
-# </font> tokens indicate the opposite.
-#
-# Try it yourself! 👇""",
-#             unsafe_allow_html=True)
-
-
-# Demo
-# with st.form("dutch-toxic-comment-detection-input", clear_on_submit=True):
-#     selected_model = st.selectbox('Select a model:', model_names_to_URLs.keys(),
-#     )#index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
-#     text = st.text_area(
-#         label='Enter the comment you want to classify below (in Dutch):')
-#     _, rightmost_col = st.columns([6,1])
-#     submitted = rightmost_col.form_submit_button("Classify",
-#                                                  help="Classify comment")
-
-
-# TODO: should probably set a minimum length of article or something
-selected_article = st.selectbox('Select an article or provide your own:',
-                                list_all_article_names()) # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
-st.session_state.article_text = fetch_article_contents(selected_article)
-article_text = st.text_area(
-    label='Full article text',
-    value=st.session_state.article_text,
-    height=250
-)
-
-
-# _, rightmost_col = st.columns([5, 1])
-# get_summary = rightmost_col.button("Generate summary",
-#                                    help="Generate summary for the given article text")
-
-
 def display_summary(article_name: str):
-    st.subheader("Generated summary")
-    # st.markdown("######")
     summary_content = fetch_summary_contents(article_name)
     soup = BeautifulSoup(summary_content, features="html.parser")
     HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
     st.session_state.summary_output = HTML_WRAPPER.format(soup)
-    st.write(st.session_state.summary_output, unsafe_allow_html=True)
 
 
-
-def
+##@st.cache(hash_funcs={preshed.maps.PreshMap: my_hash_func})
+def get_spacy():
     nlp = spacy.load('en_core_web_lg')
+    return nlp
 
-    article_content = fetch_article_contents(article_name)
-    doc = nlp(article_content)
-    # entities_article = doc.ents
-    entities_article = []
-    for entity in doc.ents:
-        entities_article.append(str(entity))
 
+# TODO: check the output mutation thingy
+@st.cache(hash_funcs={torch.nn.parameter.Parameter: lambda _: None}, allow_output_mutation=True)
+def get_flair_tagger():
+    tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")
+    return tagger
-
-
-
-
-
-        entities_summary.append(str(entity))
-
-    matched_entities = []
-    unmatched_entities = []
-    for entity in entities_summary:
-        # TODO: currently substring matching but probably should do embedding method or idk?
-        if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
-            matched_entities.append(entity)
-        else:
-            unmatched_entities.append(entity)
-    # print(entities_article)
-    # print(entities_summary)
-    return matched_entities, unmatched_entities
 
 
-def get_and_compare_entities_flair(article_name: str):
-
-
+def get_all_entities_per_sentence(text):
+    # load all NER models
+    nlp = get_spacy()
+    tagger = get_flair_tagger()
+    doc = nlp(text)
 
-    article_content = fetch_article_contents(article_name)
-    doc = nlp(article_content)
-    entities_article = []
     sentences = list(doc.sents)
+
+    entities_all_sentences = []
     for sentence in sentences:
+        entities_this_sentence = []
+
+        # SPACY ENTITIES
+        for entity in sentence.ents:
+            entities_this_sentence.append(str(entity))
+
+        # FLAIR ENTITIES
         sentence_entities = Sentence(str(sentence))
         tagger.predict(sentence_entities)
         for entity in sentence_entities.get_spans('ner'):
-
+            entities_this_sentence.append(entity.text)
+        entities_all_sentences.append(entities_this_sentence)
+
+    return entities_all_sentences
+
+
+def get_all_entities(text):
+    all_entities_per_sentence = get_all_entities_per_sentence(text)
+    return list(itertools.chain.from_iterable(all_entities_per_sentence))
+
+
+# TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
+def get_and_compare_entities(article_name: str):
+    article_content = fetch_article_contents(article_name)
+    all_entities_per_sentence = get_all_entities_per_sentence(article_content)
+    #st.session_state.entities_per_sentence_article = all_entities_per_sentence
+    entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
     summary_content = fetch_summary_contents(article_name)
-
-
-
-    for sentence in sentences:
-        sentence_entities = Sentence(str(sentence))
-        tagger.predict(sentence_entities)
-        for entity in sentence_entities.get_spans('ner'):
-            entities_summary.append(entity.text)
+    all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
+    #st.session_state.entities_per_sentence_summary = all_entities_per_sentence
+    entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
     matched_entities = []
     unmatched_entities = []
@@ -284,21 +231,18 @@ def get_and_compare_entities_flair(article_name: str):
             matched_entities.append(entity)
         else:
            unmatched_entities.append(entity)
-    # print(entities_article)
-    # print(entities_summary)
     return matched_entities, unmatched_entities
 
 
 def highlight_entities(article_name: str):
-    st.subheader("Match entities with article")
-    # st.markdown("####")
     summary_content = fetch_summary_contents(article_name)
 
     markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
     markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
     markdown_end = "</mark>"
 
-    matched_entities, unmatched_entities =
+    matched_entities, unmatched_entities = get_and_compare_entities(article_name)
+
     for entity in matched_entities:
         summary_content = summary_content.replace(entity, markdown_start_green + entity + markdown_end)
 
@@ -306,55 +250,40 @@ def highlight_entities(article_name: str):
         summary_content = summary_content.replace(entity, markdown_start_red + entity + markdown_end)
     soup = BeautifulSoup(summary_content, features="html.parser")
 
-    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
+    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
+                margin-bottom: 2.5rem">{}</div> """
 
-
+    return HTML_WRAPPER.format(soup)
 
 
 def render_dependency_parsing(text: str):
-    nlp = spacy.load('en_core_web_sm')
-    #doc = nlp(text)
-    # st.write(displacy.render(doc, style='dep'))
-    #sentence_spans = list(doc.sents)
-    # dep_svg = displacy.serve(sentence_spans, style="dep")
-    # dep_svg = displacy.render(doc, style="dep", jupyter = False,
-    #                           options = {"compact" : False,})
-    # st.image(dep_svg, width = 50,use_column_width=True)
-
-    #visualize_parser(doc)
-    #docs = [doc]
-    #split_sents = True
-    #docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
-    #for sent in docs:
     html = render_sentence_custom(text)
-    # Double newlines seem to mess with the rendering
     html = html.replace("\n\n", "\n")
     st.write(get_svg(html), unsafe_allow_html=True)
-    #st.image(html, width=50, use_column_width=True)
 
 
-
-
+# If deps for article: True, otherwise deps for summary calc
+def check_dependency(article: bool):
     nlp = spacy.load('en_core_web_lg')
+    if article:
+        text = st.session_state.article_text
+        all_entities = get_all_entities_per_sentence(text)
+        #all_entities = st.session_state.entities_per_sentence_article
+    else:
+        text = st.session_state.summary_output
+        all_entities = get_all_entities_per_sentence(text)
+        #all_entities = st.session_state.entities_per_sentence_summary
     doc = nlp(text)
     tok_l = doc.to_json()['tokens']
-    # all_deps = []
     all_deps = ""
+    print(str(all_deps))
+    print("OOPS")
+
     sentences = list(doc.sents)
-
-
-        #
-
-            all_entities.append(str(entity))
-        # # ENTITIES WITH FLAIR:
-        sentence_entities = Sentence(str(sentence))
-        tagger.predict(sentence_entities)
-        for entity in sentence_entities.get_spans('ner'):
-            all_entities.append(entity.text)
-        # ENTITIES WITH XLM ROBERTA
-        # entities_xlm = [entity["word"] for entity in ner_model(str(sentence))]
-        # for entity in entities_xlm:
-        #     all_entities.append(str(entity))
+    print(sentences)
+    for i, sentence in enumerate(sentences):
+        #TODO MONDAY: THE PROBLEM LIES HERE WITH THE SENTENCE!!! (I THINK I KNOW PROBLEM: TEXT SAVED AS SESSION STATE IS HTML NOT PURE TEXT!)
+        print(str(sentence))
        start_id = sentence.start
        end_id = sentence.end
        for t in tok_l:
@@ -362,50 +291,96 @@ def check_dependency(text):
                 continue
             head = tok_l[t['head']]
             if t['dep'] == 'amod':
+                print("AMOD FOUND")
                 object_here = text[t['start']:t['end']]
                 object_target = text[head['start']:head['end']]
+
                 # ONE NEEDS TO BE ENTITY
-                if
-
+                if object_here in all_entities[i]:
+                    print("SENTENCE ADDED")
+                    print(all_deps)
                     all_deps = all_deps.join(str(sentence))
-                elif
-                # all_deps.append(f"'{text[t['start']:t['end']]}' is {t['dep']} of '{text[head['start']:head['end']]}'")
+                elif object_target in all_entities[i]:
                     all_deps = all_deps.join(str(sentence))
                 else:
                     continue
+    #print(f'all depps are {all_deps}')
+    #print(all_deps)
     return all_deps
 
 
-
-
-
-             help="Generate summary for the given article text")
-# Listener
-if get_summary:
-    if article_text:
-        with st.spinner('Generating summary...'):
-            # classify_comment(article_text, selected_model)
+# Start session
+if 'results' not in st.session_state:
+    st.session_state.results = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Page
+st.title('Summarization fact checker')
+
+# INTRODUCTION
+st.header("Introduction")
+st.markdown("""Recent work using transformers on large text corpora has shown great succes when fine-tuned on several
+different downstream NLP tasks. One such task is that of text summarization. The goal of text summarization is to
+generate concise and accurate summaries from input document(s). There are 2 types of summarization: extractive and
+abstractive. **Exstractive summarization** merely copies informative fragments from the input, whereas **abstractive
+summarization** may generate novel words. A good abstractive summary should cover principal information in the input
+and has to be linguistically fluent. This blogpost will focus on this more difficult task of abstractive summary
+generation.""")
+
+st.markdown("""To generate summaries we will use the [PEGASUS] (https://huggingface.co/google/pegasus-cnn_dailymail)
+model, producing abstractive summaries from large articles. These summaries often still contain sentences with
+different kinds of errors. Rather than improving the core model, we will look at possible post-processing steps to
+improve the generated summaries by detecting such possible errors. By comparing contents of the summary with the
+source text, we can create some sort of factualness metric, indicating the trustworthiness of the generated
+summary.""")
+
+# GENERATING SUMMARIES PART
+st.header("Generating summaries")
+st.markdown("Let’s start by selecting an article text for which we want to generate a summary, or you can provide "
+            "text yourself. Note that it’s suggested to provide a sufficiently large text, as otherwise the summary "
+            "generated might not be optimal to start from.")
+
+# TODO: NEED TO CHECK ARTICLE TEXT INSTEAD OF ARTICLE NAME ALSO FREE INPUT OPTION
+selected_article = st.selectbox('Select an article or provide your own:',
+                                list_all_article_names())  # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
+st.session_state.article_text = fetch_article_contents(selected_article)
+article_text = st.text_area(
+    label='Full article text',
+    value=st.session_state.article_text,
+    height=150
+)
+
+st.markdown("Below you can find the generated summary for the article. The summaries of the example articles "
+            "vary in quality, but are chosen as such. Based on some common errors, we will discuss possible "
+            "methods to improve or rank the summaries in the following paragraphs. The idea is that in "
+            "production, you could generate a set of summaries for the same article, with different "
+            "parameters (or even different models). By using post-processing methods and metrics, "
+            "we can detect some errors in summaries, and choose the best one to actually use.")
+if st.session_state.article_text:
+    with st.spinner('Generating summary...'):
+        # classify_comment(article_text, selected_model)
+
+        display_summary(selected_article)
+
+        st.write("**Generated summary:**", st.session_state.summary_output, unsafe_allow_html=True)
+else:
+    st.error('**Error**: No comment to classify. Please provide a comment.',
+             help="Generate summary for the given article text")
+
+# ENTITY MATCHING PART
+st.header("Entity matching")
+st.markdown("**Named entity recognition** (NER) is the task of identifying and categorising key information ("
+            "entities) in text. An entity can be a singular word or a series of words that consistently refers to the "
+            "same thing. Common entity classes are person names, organisations, locations and so on. By applying NER "
+            "to both the article and its summary, we can spot possible **hallucinations**. Hallucinations are words "
+            "generated by the model that are not supported by the source input. ")
+with st.spinner("Calculating and matching entities..."):
+    entity_match_html = highlight_entities(selected_article)
+    st.write(entity_match_html, unsafe_allow_html=True)
+
+# DEPENDENCY PARSING PART
+st.header("Dependency comparison")
+with st.spinner("Doing dependency parsing..."):
+    render_dependency_parsing(check_dependency(False))
 # Results
 # if 'results' in st.session_state and st.session_state.results:
 #     first = True
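
Note on the entity-matching step added above: get_and_compare_entities() collects named entities from both the article and the generated summary, and flags summary entities that never occur in the article as possible hallucinations. A minimal standalone sketch of the same idea, assuming only spaCy (no Streamlit or Flair); compare_entities and its variable names are illustrative and not part of app.py:

import spacy

def compare_entities(article_text: str, summary_text: str):
    # Collect entity strings from article and summary with the same spaCy model the app loads.
    nlp = spacy.load("en_core_web_lg")
    entities_article = [str(ent) for ent in nlp(article_text).ents]
    entities_summary = [str(ent) for ent in nlp(summary_text).ents]

    matched, unmatched = [], []
    for entity in entities_summary:
        # Case-insensitive substring matching, as in get_and_compare_entities();
        # unmatched entities are candidate hallucinations.
        if any(entity.lower() in candidate.lower() for candidate in entities_article):
            matched.append(entity)
        else:
            unmatched.append(entity)
    return matched, unmatched

In app.py the entity lists are additionally enriched with Flair spans from the "flair/ner-english-ontonotes-fast" tagger before matching.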
custom_renderer.py
CHANGED
@@ -102,7 +102,6 @@ def render_sentence_custom(parsed: str):
         if a["label"] == "amod":
             couples = (a["start"], a["end"])
 
-    print(couples)
     x_value_counter = 10
     index_counter = 0
     svg_words = []
@@ -112,13 +111,11 @@ def render_sentence_custom(parsed: str):
         word = word + " "
         pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
         svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
-        print(index_counter)
         if index_counter >= couples[0] and index_counter <= couples[1]:
             coords_test.append(x_value_counter)
             x_value_counter += 50
             index_counter += 1
         x_value_counter += pixel_x_length + 4
-    print(coords_test)
     for i, a in enumerate(arcs):
         if a["label"] == "amod":
             arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
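
custom_renderer.py draws the dependency arcs for the sentences that app.py's check_dependency() keeps, i.e. sentences in which an adjectival modifier (amod) attaches to a recognised entity. A rough standalone sketch of that selection step, assuming spaCy's token API directly instead of the doc.to_json() token list used in app.py; sentences_with_amod_on_entity is an illustrative name:

import spacy

def sentences_with_amod_on_entity(text: str):
    # Keep sentences where an amod dependency touches a named entity; these are
    # the sentences the app would pass on to render_sentence_custom().
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(text)
    selected = []
    for sentence in doc.sents:
        entity_texts = {ent.text for ent in sentence.ents}
        for token in sentence:
            if token.dep_ == "amod":
                # The modifier or its head must be (part of) an entity in this sentence.
                if token.text in entity_texts or token.head.text in entity_texts:
                    selected.append(sentence.text)
                    break
    return selected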