Create some text and make code more general
Files changed:
- __pycache__/custom_renderer.cpython-37.pyc  +0 -0
- app.py  +148 -173
- custom_renderer.py  +0 -3
__pycache__/custom_renderer.cpython-37.pyc
CHANGED
Binary files a/__pycache__/custom_renderer.cpython-37.pyc and b/__pycache__/custom_renderer.cpython-37.pyc differ
app.py
CHANGED
@@ -1,7 +1,9 @@
 import random
 from typing import AnyStr
 
+import itertools
 import streamlit as st
+import torch.nn.parameter
 from bs4 import BeautifulSoup
 import numpy as np
 import base64
@@ -48,8 +50,8 @@ potty_mouth_emojis = [
 
 # Page setup
 st.set_page_config(
-    page_title="
-    page_icon="
+    page_title="Post-processing summarization fact checker",
+    page_icon="
     layout="centered",
     initial_sidebar_state="auto",
     menu_items={
@@ -114,7 +116,7 @@ def format_explainer_html(html_string):
 
 def list_all_article_names() -> list:
     filenames = []
-    for file in os.listdir('./sample-articles/'):
+    for file in sorted(os.listdir('./sample-articles/')):
         if file.endswith('.txt'):
             filenames.append(file.replace('.txt', ''))
     return filenames
@@ -158,123 +160,68 @@ def classify_comment(comment, selected_model):
     st.session_state.results.append(result)
 
 
-# Start session
-if 'results' not in st.session_state:
-    st.session_state.results = []
-
-# Page
-# st.title('🤬 Dutch Toxic Comment Detection')
-# st.markdown("""This demo showcases two Dutch toxic comment detection models.""")
-#
-# # Introduction
-# st.markdown(f"""Both models were trained using a sequence classification task on a translated [Jigsaw Toxicity dataset](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) which contains toxic online comments.
-# The first model is a fine-tuned multilingual [DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased) model whereas the second is a fine-tuned Dutch RoBERTa-based model called [RobBERT](https://huggingface.co/pdelobelle/robbert-v2-dutch-base).""")
-# st.markdown(f"""For a more comprehensive overview of the models check out their model card on 🤗 Model Hub: [distilbert-base-dutch-toxic-comments]({model_names_to_URLs['ml6team/distilbert-base-dutch-cased-toxic-comments']}) and [RobBERT-dutch-base-toxic-comments]({model_names_to_URLs['ml6team/robbert-dutch-base-toxic-comments']}).
-# """)
-# st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
-# <font color="black">
-# <span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
-# </font>
-# tokens indicate toxicity whereas
-# <font color="black">
-# <span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
-# </font> tokens indicate the opposite.
-#
-# Try it yourself! 👇""",
-#             unsafe_allow_html=True)
-
-
-# Demo
-# with st.form("dutch-toxic-comment-detection-input", clear_on_submit=True):
-#     selected_model = st.selectbox('Select a model:', model_names_to_URLs.keys(),
-#     )#index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
-#     text = st.text_area(
-#         label='Enter the comment you want to classify below (in Dutch):')
-#     _, rightmost_col = st.columns([6,1])
-#     submitted = rightmost_col.form_submit_button("Classify",
-#                                                  help="Classify comment")
-
-
-# TODO: should probably set a minimum length of article or something
-selected_article = st.selectbox('Select an article or provide your own:',
-                                list_all_article_names()) # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
-st.session_state.article_text = fetch_article_contents(selected_article)
-article_text = st.text_area(
-    label='Full article text',
-    value=st.session_state.article_text,
-    height=250
-)
-
-
-# _, rightmost_col = st.columns([5, 1])
-# get_summary = rightmost_col.button("Generate summary",
-#                                    help="Generate summary for the given article text")
-
-
 def display_summary(article_name: str):
-    st.subheader("Generated summary")
-    # st.markdown("######")
     summary_content = fetch_summary_contents(article_name)
     soup = BeautifulSoup(summary_content, features="html.parser")
     HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
     st.session_state.summary_output = HTML_WRAPPER.format(soup)
-    st.write(st.session_state.summary_output, unsafe_allow_html=True)
 
 
-
-def
+##@st.cache(hash_funcs={preshed.maps.PreshMap: my_hash_func})
+def get_spacy():
     nlp = spacy.load('en_core_web_lg')
+    return nlp
 
-    article_content = fetch_article_contents(article_name)
-    doc = nlp(article_content)
-    # entities_article = doc.ents
-    entities_article = []
-    for entity in doc.ents:
-        entities_article.append(str(entity))
 
+# TODO: check the output mutation thingy
+@st.cache(hash_funcs={torch.nn.parameter.Parameter: lambda _: None}, allow_output_mutation=True)
+def get_flair_tagger():
+    tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")
+    return tagger
-
-
-
-
-
-        entities_summary.append(str(entity))
-
-    matched_entities = []
-    unmatched_entities = []
-    for entity in entities_summary:
-        # TODO: currently substring matching but probably should do embedding method or idk?
-        if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
-            matched_entities.append(entity)
-        else:
-            unmatched_entities.append(entity)
-    # print(entities_article)
-    # print(entities_summary)
-    return matched_entities, unmatched_entities
 
 
-def get_and_compare_entities_flair(article_name: str):
-
-
+def get_all_entities_per_sentence(text):
+    # load all NER models
+    nlp = get_spacy()
+    tagger = get_flair_tagger()
+    doc = nlp(text)
 
-    article_content = fetch_article_contents(article_name)
-    doc = nlp(article_content)
-    entities_article = []
     sentences = list(doc.sents)
+
+    entities_all_sentences = []
     for sentence in sentences:
+        entities_this_sentence = []
+
+        # SPACY ENTITIES
+        for entity in sentence.ents:
+            entities_this_sentence.append(str(entity))
+
+        # FLAIR ENTITIES
         sentence_entities = Sentence(str(sentence))
         tagger.predict(sentence_entities)
         for entity in sentence_entities.get_spans('ner'):
-
+            entities_this_sentence.append(entity.text)
+        entities_all_sentences.append(entities_this_sentence)
+
+    return entities_all_sentences
+
+
+def get_all_entities(text):
+    all_entities_per_sentence = get_all_entities_per_sentence(text)
+    return list(itertools.chain.from_iterable(all_entities_per_sentence))
+
+
+# TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
+def get_and_compare_entities(article_name: str):
+    article_content = fetch_article_contents(article_name)
+    all_entities_per_sentence = get_all_entities_per_sentence(article_content)
+    #st.session_state.entities_per_sentence_article = all_entities_per_sentence
+    entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
     summary_content = fetch_summary_contents(article_name)
-
-
-
-    for sentence in sentences:
-        sentence_entities = Sentence(str(sentence))
-        tagger.predict(sentence_entities)
-        for entity in sentence_entities.get_spans('ner'):
-            entities_summary.append(entity.text)
+    all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
+    #st.session_state.entities_per_sentence_summary = all_entities_per_sentence
+    entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
     matched_entities = []
     unmatched_entities = []
@@ -284,21 +231,18 @@ def get_and_compare_entities_flair(article_name: str):
             matched_entities.append(entity)
         else:
            unmatched_entities.append(entity)
-    # print(entities_article)
-    # print(entities_summary)
     return matched_entities, unmatched_entities
 
 
 def highlight_entities(article_name: str):
-    st.subheader("Match entities with article")
-    # st.markdown("####")
     summary_content = fetch_summary_contents(article_name)
 
     markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
     markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
     markdown_end = "</mark>"
 
-    matched_entities, unmatched_entities =
+    matched_entities, unmatched_entities = get_and_compare_entities(article_name)
+
     for entity in matched_entities:
         summary_content = summary_content.replace(entity, markdown_start_green + entity + markdown_end)
 
@@ -306,55 +250,40 @@ def highlight_entities(article_name: str):
         summary_content = summary_content.replace(entity, markdown_start_red + entity + markdown_end)
     soup = BeautifulSoup(summary_content, features="html.parser")
 
-    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
+    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
+                margin-bottom: 2.5rem">{}</div> """
 
-
+    return HTML_WRAPPER.format(soup)
 
 
 def render_dependency_parsing(text: str):
-    nlp = spacy.load('en_core_web_sm')
-    #doc = nlp(text)
-    # st.write(displacy.render(doc, style='dep'))
-    #sentence_spans = list(doc.sents)
-    # dep_svg = displacy.serve(sentence_spans, style="dep")
-    # dep_svg = displacy.render(doc, style="dep", jupyter = False,
-    #                           options = {"compact" : False,})
-    # st.image(dep_svg, width = 50,use_column_width=True)
-
-    #visualize_parser(doc)
-    #docs = [doc]
-    #split_sents = True
-    #docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
-    #for sent in docs:
     html = render_sentence_custom(text)
-    # Double newlines seem to mess with the rendering
     html = html.replace("\n\n", "\n")
     st.write(get_svg(html), unsafe_allow_html=True)
-    #st.image(html, width=50, use_column_width=True)
 
 
-
-
+# If deps for article: True, otherwise deps for summary calc
+def check_dependency(article: bool):
     nlp = spacy.load('en_core_web_lg')
+    if article:
+        text = st.session_state.article_text
+        all_entities = get_all_entities_per_sentence(text)
+        #all_entities = st.session_state.entities_per_sentence_article
+    else:
+        text = st.session_state.summary_output
+        all_entities = get_all_entities_per_sentence(text)
+        #all_entities = st.session_state.entities_per_sentence_summary
     doc = nlp(text)
     tok_l = doc.to_json()['tokens']
-    # all_deps = []
     all_deps = ""
+    print(str(all_deps))
+    print("OOPS")
+
     sentences = list(doc.sents)
-
-
-        #
-
-            all_entities.append(str(entity))
-        # # ENTITIES WITH FLAIR:
-        sentence_entities = Sentence(str(sentence))
-        tagger.predict(sentence_entities)
-        for entity in sentence_entities.get_spans('ner'):
-            all_entities.append(entity.text)
-        # ENTITIES WITH XLM ROBERTA
-        # entities_xlm = [entity["word"] for entity in ner_model(str(sentence))]
-        # for entity in entities_xlm:
-        #     all_entities.append(str(entity))
+    print(sentences)
+    for i, sentence in enumerate(sentences):
+        #TODO MONDAY: THE PROBLEM LIES HERE WITH THE SENTENCE!!! (I THINK I KNOW PROBLEM: TEXT SAVED AS SESSION STATE IS HTML NOT PURE TEXT!)
+        print(str(sentence))
        start_id = sentence.start
        end_id = sentence.end
        for t in tok_l:
@@ -362,50 +291,96 @@ def check_dependency(text):
                 continue
             head = tok_l[t['head']]
             if t['dep'] == 'amod':
+                print("AMOD FOUND")
                 object_here = text[t['start']:t['end']]
                 object_target = text[head['start']:head['end']]
+
                 # ONE NEEDS TO BE ENTITY
-                if
-
+                if object_here in all_entities[i]:
+                    print("SENTENCE ADDED")
+                    print(all_deps)
                     all_deps = all_deps.join(str(sentence))
-                elif
-                # all_deps.append(f"'{text[t['start']:t['end']]}' is {t['dep']} of '{text[head['start']:head['end']]}'")
+                elif object_target in all_entities[i]:
                     all_deps = all_deps.join(str(sentence))
                 else:
                     continue
+    #print(f'all depps are {all_deps}')
+    #print(all_deps)
     return all_deps
 
 
-
-
-
-             help="Generate summary for the given article text")
-# Listener
-if get_summary:
-    if article_text:
-        with st.spinner('Generating summary...'):
-            # classify_comment(article_text, selected_model)
+# Start session
+if 'results' not in st.session_state:
+    st.session_state.results = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Page
+st.title('Summarization fact checker')
+
+# INTRODUCTION
+st.header("Introduction")
+st.markdown("""Recent work using transformers on large text corpora has shown great succes when fine-tuned on several
+different downstream NLP tasks. One such task is that of text summarization. The goal of text summarization is to
+generate concise and accurate summaries from input document(s). There are 2 types of summarization: extractive and
+abstractive. **Exstractive summarization** merely copies informative fragments from the input, whereas **abstractive
+summarization** may generate novel words. A good abstractive summary should cover principal information in the input
+and has to be linguistically fluent. This blogpost will focus on this more difficult task of abstractive summary
+generation.""")
+
+st.markdown("""To generate summaries we will use the [PEGASUS] (https://huggingface.co/google/pegasus-cnn_dailymail)
+model, producing abstractive summaries from large articles. These summaries often still contain sentences with
+different kinds of errors. Rather than improving the core model, we will look at possible post-processing steps to
+improve the generated summaries by detecting such possible errors. By comparing contents of the summary with the
+source text, we can create some sort of factualness metric, indicating the trustworthiness of the generated
+summary.""")
+
+# GENERATING SUMMARIES PART
+st.header("Generating summaries")
+st.markdown("Let’s start by selecting an article text for which we want to generate a summary, or you can provide "
+            "text yourself. Note that it’s suggested to provide a sufficiently large text, as otherwise the summary "
+            "generated might not be optimal to start from.")
+
+# TODO: NEED TO CHECK ARTICLE TEXT INSTEAD OF ARTICLE NAME ALSO FREE INPUT OPTION
+selected_article = st.selectbox('Select an article or provide your own:',
+                                list_all_article_names())  # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
+st.session_state.article_text = fetch_article_contents(selected_article)
+article_text = st.text_area(
+    label='Full article text',
+    value=st.session_state.article_text,
+    height=150
+)
+
+st.markdown("Below you can find the generated summary for the article. The summaries of the example articles "
+            "vary in quality, but are chosen as such. Based on some common errors, we will discuss possible "
+            "methods to improve or rank the summaries in the following paragraphs. The idea is that in "
+            "production, you could generate a set of summaries for the same article, with different "
+            "parameters (or even different models). By using post-processing methods and metrics, "
+            "we can detect some errors in summaries, and choose the best one to actually use.")
+if st.session_state.article_text:
+    with st.spinner('Generating summary...'):
+        # classify_comment(article_text, selected_model)
+
+        display_summary(selected_article)
+
+        st.write("**Generated summary:**", st.session_state.summary_output, unsafe_allow_html=True)
+else:
+    st.error('**Error**: No comment to classify. Please provide a comment.',
+             help="Generate summary for the given article text")
+
+# ENTITY MATCHING PART
+st.header("Entity matching")
+st.markdown("**Named entity recognition** (NER) is the task of identifying and categorising key information ("
+            "entities) in text. An entity can be a singular word or a series of words that consistently refers to the "
+            "same thing. Common entity classes are person names, organisations, locations and so on. By applying NER "
+            "to both the article and its summary, we can spot possible **hallucinations**. Hallucinations are words "
+            "generated by the model that are not supported by the source input. ")
+with st.spinner("Calculating and matching entities..."):
+    entity_match_html = highlight_entities(selected_article)
+    st.write(entity_match_html, unsafe_allow_html=True)
+
+# DEPENDENCY PARSING PART
+st.header("Dependency comparison")
+with st.spinner("Doing dependency parsing..."):
+    render_dependency_parsing(check_dependency(False))
 # Results
 # if 'results' in st.session_state and st.session_state.results:
 #     first = True
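
Note on the entity-matching step added above: get_and_compare_entities() collects named entities from both the article and the generated summary, and flags summary entities that never occur in the article as possible hallucinations. A minimal standalone sketch of the same idea, assuming only spaCy (no Streamlit or Flair); compare_entities and its variable names are illustrative and not part of app.py:

import spacy

def compare_entities(article_text: str, summary_text: str):
    # Collect entity strings from article and summary with the same spaCy model the app loads.
    nlp = spacy.load("en_core_web_lg")
    entities_article = [str(ent) for ent in nlp(article_text).ents]
    entities_summary = [str(ent) for ent in nlp(summary_text).ents]

    matched, unmatched = [], []
    for entity in entities_summary:
        # Case-insensitive substring matching, as in get_and_compare_entities();
        # unmatched entities are candidate hallucinations.
        if any(entity.lower() in candidate.lower() for candidate in entities_article):
            matched.append(entity)
        else:
            unmatched.append(entity)
    return matched, unmatched

In app.py the entity lists are additionally enriched with Flair spans from the "flair/ner-english-ontonotes-fast" tagger before matching.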
custom_renderer.py
CHANGED
@@ -102,7 +102,6 @@ def render_sentence_custom(parsed: str):
         if a["label"] == "amod":
             couples = (a["start"], a["end"])
 
-    print(couples)
     x_value_counter = 10
     index_counter = 0
     svg_words = []
@@ -112,13 +111,11 @@ def render_sentence_custom(parsed: str):
         word = word + " "
         pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
         svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
-        print(index_counter)
         if index_counter >= couples[0] and index_counter <= couples[1]:
             coords_test.append(x_value_counter)
             x_value_counter += 50
             index_counter += 1
         x_value_counter += pixel_x_length + 4
-    print(coords_test)
     for i, a in enumerate(arcs):
         if a["label"] == "amod":
             arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
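
custom_renderer.py draws the dependency arcs for the sentences that app.py's check_dependency() keeps, i.e. sentences in which an adjectival modifier (amod) attaches to a recognised entity. A rough standalone sketch of that selection step, assuming spaCy's token API directly instead of the doc.to_json() token list used in app.py; sentences_with_amod_on_entity is an illustrative name:

import spacy

def sentences_with_amod_on_entity(text: str):
    # Keep sentences where an amod dependency touches a named entity; these are
    # the sentences the app would pass on to render_sentence_custom().
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(text)
    selected = []
    for sentence in doc.sents:
        entity_texts = {ent.text for ent in sentence.ents}
        for token in sentence:
            if token.dep_ == "amod":
                # The modifier or its head must be (part of) an entity in this sentence.
                if token.text in entity_texts or token.head.text in entity_texts:
                    selected.append(sentence.text)
                    break
    return selected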