MatthiasC committed on
Commit
08e0095
1 Parent(s): f51bffc

Create some text and make code more general

__pycache__/custom_renderer.cpython-37.pyc CHANGED
Binary files a/__pycache__/custom_renderer.cpython-37.pyc and b/__pycache__/custom_renderer.cpython-37.pyc differ
 
app.py CHANGED
@@ -1,7 +1,9 @@
 import random
 from typing import AnyStr
 
+import itertools
 import streamlit as st
+import torch.nn.parameter
 from bs4 import BeautifulSoup
 import numpy as np
 import base64
@@ -48,8 +50,8 @@ potty_mouth_emojis = [
 
 # Page setup
 st.set_page_config(
-    page_title="Toxic Comment Detection Space",
-    page_icon="🤬",
+    page_title="Post-processing summarization fact checker",
+    page_icon="",
     layout="centered",
     initial_sidebar_state="auto",
     menu_items={
@@ -114,7 +116,7 @@ def format_explainer_html(html_string):
 
 def list_all_article_names() -> list:
     filenames = []
-    for file in os.listdir('./sample-articles/'):
+    for file in sorted(os.listdir('./sample-articles/')):
         if file.endswith('.txt'):
             filenames.append(file.replace('.txt', ''))
     return filenames
@@ -158,123 +160,68 @@ def classify_comment(comment, selected_model):
     st.session_state.results.append(result)
 
 
-# Start session
-if 'results' not in st.session_state:
-    st.session_state.results = []
-
-# Page
-# st.title('🤬 Dutch Toxic Comment Detection')
-# st.markdown("""This demo showcases two Dutch toxic comment detection models.""")
-#
-# # Introduction
-# st.markdown(f"""Both models were trained using a sequence classification task on a translated [Jigsaw Toxicity dataset](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) which contains toxic online comments.
-# The first model is a fine-tuned multilingual [DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased) model whereas the second is a fine-tuned Dutch RoBERTa-based model called [RobBERT](https://huggingface.co/pdelobelle/robbert-v2-dutch-base).""")
-# st.markdown(f"""For a more comprehensive overview of the models check out their model card on 🤗 Model Hub: [distilbert-base-dutch-toxic-comments]({model_names_to_URLs['ml6team/distilbert-base-dutch-cased-toxic-comments']}) and [RobBERT-dutch-base-toxic-comments]({model_names_to_URLs['ml6team/robbert-dutch-base-toxic-comments']}).
-# """)
-# st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
-# <font color="black">
-# <span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
-# </font>
-# tokens indicate toxicity whereas
-# <font color="black">
-# <span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
-# </font> tokens indicate the opposite.
-#
-# Try it yourself! 👇""",
-# unsafe_allow_html=True)
-
-
-# Demo
-# with st.form("dutch-toxic-comment-detection-input", clear_on_submit=True):
-#     selected_model = st.selectbox('Select a model:', model_names_to_URLs.keys(),
-#                                   )#index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
-#     text = st.text_area(
-#         label='Enter the comment you want to classify below (in Dutch):')
-#     _, rightmost_col = st.columns([6,1])
-#     submitted = rightmost_col.form_submit_button("Classify",
-#                                                  help="Classify comment")
-
-
-# TODO: should probably set a minimum length of article or something
-selected_article = st.selectbox('Select an article or provide your own:',
-                                list_all_article_names())  # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
-st.session_state.article_text = fetch_article_contents(selected_article)
-article_text = st.text_area(
-    label='Full article text',
-    value=st.session_state.article_text,
-    height=250
-)
-
-
-# _, rightmost_col = st.columns([5, 1])
-# get_summary = rightmost_col.button("Generate summary",
-#                                    help="Generate summary for the given article text")
-
-
 def display_summary(article_name: str):
-    st.subheader("Generated summary")
-    # st.markdown("######")
     summary_content = fetch_summary_contents(article_name)
     soup = BeautifulSoup(summary_content, features="html.parser")
     HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
     st.session_state.summary_output = HTML_WRAPPER.format(soup)
-    st.write(st.session_state.summary_output, unsafe_allow_html=True)
 
 
-# TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
-def get_and_compare_entities_spacy(article_name: str):
+##@st.cache(hash_funcs={preshed.maps.PreshMap: my_hash_func})
+def get_spacy():
     nlp = spacy.load('en_core_web_lg')
+    return nlp
 
-    article_content = fetch_article_contents(article_name)
-    doc = nlp(article_content)
-    # entities_article = doc.ents
-    entities_article = []
-    for entity in doc.ents:
-        entities_article.append(str(entity))
-
-    summary_content = fetch_summary_contents(article_name)
-    doc = nlp(summary_content)
-    # entities_summary = doc.ents
-    entities_summary = []
-    for entity in doc.ents:
-        entities_summary.append(str(entity))
-
-    matched_entities = []
-    unmatched_entities = []
-    for entity in entities_summary:
-        # TODO: currently substring matching but probably should do embedding method or idk?
-        if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
-            matched_entities.append(entity)
-        else:
-            unmatched_entities.append(entity)
-    # print(entities_article)
-    # print(entities_summary)
-    return matched_entities, unmatched_entities
-
-
-def get_and_compare_entities_flair(article_name: str):
-    nlp = spacy.load('en_core_web_sm')
-    tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")
-
-    article_content = fetch_article_contents(article_name)
-    doc = nlp(article_content)
-    entities_article = []
+
+# TODO: check the output mutation thingy
+@st.cache(hash_funcs={torch.nn.parameter.Parameter: lambda _: None}, allow_output_mutation=True)
+def get_flair_tagger():
+    tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")
+    return tagger
+
+
+def get_all_entities_per_sentence(text):
+    # load all NER models
+    nlp = get_spacy()
+    tagger = get_flair_tagger()
+    doc = nlp(text)
+
     sentences = list(doc.sents)
+
+    entities_all_sentences = []
     for sentence in sentences:
+        entities_this_sentence = []
+
+        # SPACY ENTITIES
+        for entity in sentence.ents:
+            entities_this_sentence.append(str(entity))
+
+        # FLAIR ENTITIES
         sentence_entities = Sentence(str(sentence))
         tagger.predict(sentence_entities)
         for entity in sentence_entities.get_spans('ner'):
-            entities_article.append(entity.text)
+            entities_this_sentence.append(entity.text)
+        entities_all_sentences.append(entities_this_sentence)
+
+    return entities_all_sentences
+
+
+def get_all_entities(text):
+    all_entities_per_sentence = get_all_entities_per_sentence(text)
+    return list(itertools.chain.from_iterable(all_entities_per_sentence))
+
+
+# TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
+def get_and_compare_entities(article_name: str):
+    article_content = fetch_article_contents(article_name)
+    all_entities_per_sentence = get_all_entities_per_sentence(article_content)
+    #st.session_state.entities_per_sentence_article = all_entities_per_sentence
+    entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
     summary_content = fetch_summary_contents(article_name)
-    doc = nlp(summary_content)
-    entities_summary = []
-    sentences = list(doc.sents)
-    for sentence in sentences:
-        sentence_entities = Sentence(str(sentence))
-        tagger.predict(sentence_entities)
-        for entity in sentence_entities.get_spans('ner'):
-            entities_summary.append(entity.text)
+    all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
+    #st.session_state.entities_per_sentence_summary = all_entities_per_sentence
+    entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
     matched_entities = []
     unmatched_entities = []
@@ -284,21 +231,18 @@ def get_and_compare_entities_flair(article_name: str):
             matched_entities.append(entity)
         else:
             unmatched_entities.append(entity)
-    # print(entities_article)
-    # print(entities_summary)
     return matched_entities, unmatched_entities
 
 
 def highlight_entities(article_name: str):
-    st.subheader("Match entities with article")
-    # st.markdown("####")
     summary_content = fetch_summary_contents(article_name)
 
     markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
     markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
     markdown_end = "</mark>"
 
-    matched_entities, unmatched_entities = get_and_compare_entities_spacy(article_name)
+    matched_entities, unmatched_entities = get_and_compare_entities(article_name)
+
     for entity in matched_entities:
         summary_content = summary_content.replace(entity, markdown_start_green + entity + markdown_end)
 
@@ -306,55 +250,40 @@ def highlight_entities(article_name: str):
         summary_content = summary_content.replace(entity, markdown_start_red + entity + markdown_end)
     soup = BeautifulSoup(summary_content, features="html.parser")
 
-    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
+    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
+    margin-bottom: 2.5rem">{}</div> """
 
-    st.write(HTML_WRAPPER.format(soup), unsafe_allow_html=True)
+    return HTML_WRAPPER.format(soup)
 
 
 def render_dependency_parsing(text: str):
-    nlp = spacy.load('en_core_web_sm')
-    #doc = nlp(text)
-    # st.write(displacy.render(doc, style='dep'))
-    #sentence_spans = list(doc.sents)
-    # dep_svg = displacy.serve(sentence_spans, style="dep")
-    # dep_svg = displacy.render(doc, style="dep", jupyter = False,
-    #                           options = {"compact" : False,})
-    # st.image(dep_svg, width = 50,use_column_width=True)
-
-    #visualize_parser(doc)
-    #docs = [doc]
-    #split_sents = True
-    #docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
-    #for sent in docs:
     html = render_sentence_custom(text)
-    # Double newlines seem to mess with the rendering
     html = html.replace("\n\n", "\n")
     st.write(get_svg(html), unsafe_allow_html=True)
-    #st.image(html, width=50, use_column_width=True)
 
 
-def check_dependency(text):
-    tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")
+# If deps for article: True, otherwise deps for summary calc
+def check_dependency(article: bool):
     nlp = spacy.load('en_core_web_lg')
+    if article:
+        text = st.session_state.article_text
+        all_entities = get_all_entities_per_sentence(text)
+        #all_entities = st.session_state.entities_per_sentence_article
+    else:
+        text = st.session_state.summary_output
+        all_entities = get_all_entities_per_sentence(text)
+        #all_entities = st.session_state.entities_per_sentence_summary
     doc = nlp(text)
     tok_l = doc.to_json()['tokens']
-    # all_deps = []
     all_deps = ""
+    print(str(all_deps))
+    print("OOPS")
+
     sentences = list(doc.sents)
-    for sentence in sentences:
-        all_entities = []
-        # # ENTITIES WITH SPACY:
-        for entity in sentence.ents:
-            all_entities.append(str(entity))
-        # # ENTITIES WITH FLAIR:
-        sentence_entities = Sentence(str(sentence))
-        tagger.predict(sentence_entities)
-        for entity in sentence_entities.get_spans('ner'):
-            all_entities.append(entity.text)
-        # ENTITIES WITH XLM ROBERTA
-        # entities_xlm = [entity["word"] for entity in ner_model(str(sentence))]
-        # for entity in entities_xlm:
-        #     all_entities.append(str(entity))
+    print(sentences)
+    for i, sentence in enumerate(sentences):
+        #TODO MONDAY: THE PROBLEM LIES HERE WITH THE SENTENCE!!! (I THINK I KNOW PROBLEM: TEXT SAVED AS SESSION STATE IS HTML NOT PURE TEXT!)
+        print(str(sentence))
        start_id = sentence.start
        end_id = sentence.end
        for t in tok_l:
@@ -362,50 +291,96 @@ def check_dependency(text):
                 continue
             head = tok_l[t['head']]
             if t['dep'] == 'amod':
+                print("AMOD FOUND")
                 object_here = text[t['start']:t['end']]
                 object_target = text[head['start']:head['end']]
+
                 # ONE NEEDS TO BE ENTITY
-                if (object_here in all_entities):
-                    # all_deps.append(f"'{text[t['start']:t['end']]}' is {t['dep']} of '{text[head['start']:head['end']]}'")
+                if object_here in all_entities[i]:
+                    print("SENTENCE ADDED")
+                    print(all_deps)
                     all_deps = all_deps.join(str(sentence))
-                elif (object_target in all_entities):
-                    # all_deps.append(f"'{text[t['start']:t['end']]}' is {t['dep']} of '{text[head['start']:head['end']]}'")
+                elif object_target in all_entities[i]:
                     all_deps = all_deps.join(str(sentence))
                 else:
                     continue
+    #print(f'all depps are {all_deps}')
+    #print(all_deps)
     return all_deps
 
 
-with st.form("article-input"):
-    left_column, _ = st.columns([1, 1])
-    get_summary = left_column.form_submit_button("Generate summary",
-                                                 help="Generate summary for the given article text")
-    # Listener
-    if get_summary:
-        if article_text:
-            with st.spinner('Generating summary...'):
-                # classify_comment(article_text, selected_model)
-
-                display_summary(selected_article)
-        else:
-            st.error('**Error**: No comment to classify. Please provide a comment.')
-
-# Entity part
-with st.form("Entity-part"):
-    left_column, _ = st.columns([1, 1])
-    draw_entities = left_column.form_submit_button("Draw Entities",
-                                                   help="Draw Entities")
-    if draw_entities:
-        with st.spinner("Drawing entities..."):
-            highlight_entities(selected_article)
-
-with st.form("Dependency-usage"):
-    left_column, _ = st.columns([1, 1])
-    parsing = left_column.form_submit_button("Dependency parsing",
-                                             help="Dependency parsing")
-    if parsing:
-        with st.spinner("Doing dependency parsing..."):
-            render_dependency_parsing(check_dependency(fetch_summary_contents(selected_article)))
+# Start session
+if 'results' not in st.session_state:
+    st.session_state.results = []
+
+# Page
+st.title('Summarization fact checker')
+
+# INTRODUCTION
+st.header("Introduction")
+st.markdown("""Recent work using transformers on large text corpora has shown great success when fine-tuned on several
+different downstream NLP tasks. One such task is that of text summarization. The goal of text summarization is to
+generate concise and accurate summaries from input document(s). There are two types of summarization: extractive and
+abstractive. **Extractive summarization** merely copies informative fragments from the input, whereas **abstractive
+summarization** may generate novel words. A good abstractive summary should cover principal information in the input
+and has to be linguistically fluent. This blog post will focus on the more difficult task of abstractive summary
+generation.""")
+
+st.markdown("""To generate summaries we will use the [PEGASUS](https://huggingface.co/google/pegasus-cnn_dailymail)
+model, producing abstractive summaries from large articles. These summaries often still contain sentences with
+different kinds of errors. Rather than improving the core model, we will look at possible post-processing steps to
+improve the generated summaries by detecting such possible errors. By comparing the contents of the summary with the
+source text, we can create a factual consistency metric, indicating the trustworthiness of the generated
+summary.""")

+# GENERATING SUMMARIES PART
+st.header("Generating summaries")
+st.markdown("Let’s start by selecting an article text for which we want to generate a summary, or you can provide "
+            "text yourself. Note that it’s best to provide a sufficiently long text, as otherwise the generated "
+            "summary might not be a good starting point.")
+
+# TODO: NEED TO CHECK ARTICLE TEXT INSTEAD OF ARTICLE NAME ALSO FREE INPUT OPTION
+selected_article = st.selectbox('Select an article or provide your own:',
+                                list_all_article_names())  # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
+st.session_state.article_text = fetch_article_contents(selected_article)
+article_text = st.text_area(
+    label='Full article text',
+    value=st.session_state.article_text,
+    height=150
+)
+
+st.markdown("Below you can find the generated summary for the article. The summaries of the example articles "
+            "vary in quality, and were deliberately chosen that way. Based on some common errors, we will discuss "
+            "possible methods to improve or rank the summaries in the following paragraphs. The idea is that in "
+            "production, you could generate a set of summaries for the same article, with different "
+            "parameters (or even different models). By using post-processing methods and metrics, "
+            "we can detect some errors in summaries, and choose the best one to actually use.")
+if st.session_state.article_text:
+    with st.spinner('Generating summary...'):
+        # classify_comment(article_text, selected_model)
+
+        display_summary(selected_article)
+
+        st.write("**Generated summary:**", st.session_state.summary_output, unsafe_allow_html=True)
+else:
+    st.error('**Error**: No comment to classify. Please provide a comment.',
+             help="Generate summary for the given article text")
+
+# ENTITY MATCHING PART
+st.header("Entity matching")
+st.markdown("**Named entity recognition** (NER) is the task of identifying and categorising key information ("
+            "entities) in text. An entity can be a singular word or a series of words that consistently refers to the "
+            "same thing. Common entity classes are person names, organisations, locations and so on. By applying NER "
+            "to both the article and its summary, we can spot possible **hallucinations**. Hallucinations are words "
+            "generated by the model that are not supported by the source input.")
+with st.spinner("Calculating and matching entities..."):
+    entity_match_html = highlight_entities(selected_article)
+    st.write(entity_match_html, unsafe_allow_html=True)
+
+# DEPENDENCY PARSING PART
+st.header("Dependency comparison")
+with st.spinner("Doing dependency parsing..."):
+    render_dependency_parsing(check_dependency(False))
 # Results
 # if 'results' in st.session_state and st.session_state.results:
 #     first = True
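The entity-matching step that app.py now performs can also be exercised outside Streamlit. Below is a minimal sketch, assuming spaCy's en_core_web_lg model and the Flair flair/ner-english-ontonotes-fast tagger are installed; the names entities_per_sentence and match_entities are illustrative and not part of this repository.

```python
# Minimal sketch of the entity-matching idea, outside Streamlit.
# Assumes `pip install spacy flair` and `python -m spacy download en_core_web_lg`.
import itertools

import spacy
from flair.data import Sentence
from flair.models import SequenceTagger

nlp = spacy.load("en_core_web_lg")
tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")


def entities_per_sentence(text: str) -> list:
    """Collect spaCy and Flair entities for each sentence of `text`."""
    per_sentence = []
    for sentence in nlp(text).sents:
        entities = [str(ent) for ent in sentence.ents]  # spaCy entities
        flair_sentence = Sentence(str(sentence))
        tagger.predict(flair_sentence)
        entities += [span.text for span in flair_sentence.get_spans("ner")]  # Flair entities
        per_sentence.append(entities)
    return per_sentence


def match_entities(article: str, summary: str):
    """Split summary entities into matched ones and possible hallucinations."""
    article_entities = list(itertools.chain.from_iterable(entities_per_sentence(article)))
    summary_entities = list(itertools.chain.from_iterable(entities_per_sentence(summary)))
    matched, unmatched = [], []
    for entity in summary_entities:
        # Same case-insensitive substring test used by get_and_compare_entities() above.
        if any(entity.lower() in candidate.lower() for candidate in article_entities):
            matched.append(entity)
        else:
            unmatched.append(entity)
    return matched, unmatched


if __name__ == "__main__":
    article = "Tim Cook presented the new iPhone in Cupertino on Tuesday."
    summary = "Tim Cook presented the new iPhone in Paris."
    matched, hallucinated = match_entities(article, summary)
    print("matched:", matched)            # e.g. names found in both texts
    print("hallucinated:", hallucinated)  # e.g. 'Paris', unsupported by the article
```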
custom_renderer.py CHANGED
@@ -102,7 +102,6 @@ def render_sentence_custom(parsed: str):
         if a["label"] == "amod":
             couples = (a["start"], a["end"])
 
-    print(couples)
     x_value_counter = 10
     index_counter = 0
     svg_words = []
@@ -112,13 +111,11 @@
         word = word + " "
         pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
         svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
-        print(index_counter)
         if index_counter >= couples[0] and index_counter <= couples[1]:
             coords_test.append(x_value_counter)
             x_value_counter += 50
         index_counter += 1
         x_value_counter += pixel_x_length + 4
-    print(coords_test)
    for i, a in enumerate(arcs):
        if a["label"] == "amod":
            arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
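For reference, the "amod" (adjectival modifier) check that feeds this renderer, check_dependency in app.py, can be approximated with spaCy alone. A simplified sketch, assuming en_core_web_lg is installed and using spaCy entities only (the app additionally uses Flair); the function name is illustrative:

```python
import spacy

nlp = spacy.load("en_core_web_lg")


def sentences_with_entity_amod(text: str) -> list:
    """Return sentences containing an 'amod' relation whose modifier or head matches an entity string."""
    flagged = []
    for sentence in nlp(text).sents:
        entities = [str(ent) for ent in sentence.ents]
        for token in sentence:
            # token.dep_ is the dependency label; token.head is the word being modified.
            if token.dep_ == "amod" and (token.text in entities or token.head.text in entities):
                flagged.append(str(sentence))
                break
    return flagged
```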