MatthiasC committed on
Commit
1f3c19d
1 Parent(s): 10364d0

Use a Transformer NER pipeline instead of Flair, and use a pre-rendered dependency image for one article as a test

Browse files
Files changed (2) hide show
  1. app.py +46 -26
  2. dependency-images/article11.txt +1 -0
app.py CHANGED
@@ -23,15 +23,15 @@ import spacy
23
  from spacy import displacy
24
  from spacy_streamlit import visualize_parser
25
 
26
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
27
  from transformers import pipeline
28
  import os
29
  from transformers_interpret import SequenceClassificationExplainer
30
 
31
-
32
  # USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
33
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34
 
 
35
  @st.experimental_singleton
36
  def get_sentence_embedding_model():
37
  return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
@@ -43,14 +43,21 @@ def get_spacy():
43
  return nlp
44
 
45
 
46
- #TODO: might look into which one is the best here
47
- #TODO: might be useful to make an ml6 preloaded model for flair as this takes ridiculously long to load the first time
48
  @st.experimental_singleton
49
- #@st.cache(suppress_st_warning=True, allow_output_mutation=True)
50
  def get_flair_tagger():
51
  return SequenceTagger.load("flair/ner-english-ontonotes-fast")
52
 
53
 
 
 
 
 
 
 
 
54
  # Page setup
55
  st.set_page_config(
56
  page_title="Post-processing summarization fact checker",
@@ -97,6 +104,12 @@ def fetch_dependency_specific_contents(filename: str) -> AnyStr:
97
  return data
98
 
99
 
 
 
 
 
 
 
100
  def display_summary(article_name: str):
101
  summary_content = fetch_summary_contents(article_name)
102
  st.session_state.summary_output = summary_content
@@ -122,10 +135,16 @@ def get_all_entities_per_sentence(text):
122
  entities_this_sentence.append(str(entity))
123
 
124
  # FLAIR ENTITIES
125
- sentence_entities = Sentence(str(sentence))
126
- tagger.predict(sentence_entities)
127
- for entity in sentence_entities.get_spans('ner'):
128
- entities_this_sentence.append(entity.text)
 
 
 
 
 
 
129
  entities_all_sentences.append(entities_this_sentence)
130
 
131
  return entities_all_sentences
@@ -188,6 +207,7 @@ def highlight_entities(article_name: str):
188
  def render_dependency_parsing(text: str):
189
  html = render_sentence_custom(text)
190
  html = html.replace("\n\n", "\n")
 
191
  st.write(get_svg(html), unsafe_allow_html=True)
192
 
193
 
@@ -275,7 +295,8 @@ currently selected article.""")
275
 
276
  nlp = get_spacy()
277
  sentence_embedding_model = get_sentence_embedding_model()
278
- tagger = get_flair_tagger()
 
279
 
280
  # GENERATING SUMMARIES PART
281
  st.header("Generating summaries")
@@ -309,11 +330,6 @@ else:
309
  st.error('**Error**: No comment to classify. Please provide a comment.',
310
  help="Generate summary for the given article text")
311
 
312
- if is_valid_url(article_text):
313
- print("YES")
314
- else:
315
- print("NO")
316
-
317
 
318
  def render_svg(svg_file):
319
  with open(svg_file, "r") as f:
@@ -390,17 +406,21 @@ st.markdown("However, by empirical testing, we have found that there are certain
390
  "dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
391
  "currently selected article.")
392
  with st.spinner("Doing dependency parsing..."):
393
- summary_deps = check_dependency(False)
394
- article_deps = check_dependency(True)
395
- total_unmatched_deps = []
396
- for summ_dep in summary_deps:
397
- if not any(summ_dep['identifier'] in art_dep['identifier'] for art_dep in article_deps):
398
- total_unmatched_deps.append(summ_dep)
399
- # print(f'ALL UNMATCHED DEPS ARE: {total_unmatched_deps}')
400
- # render_dependency_parsing(check_dependency(False))
401
- if total_unmatched_deps:
402
- for current_drawing_list in total_unmatched_deps:
403
- render_dependency_parsing(current_drawing_list)
 
 
 
 
404
  dep_specific_text = fetch_dependency_specific_contents(selected_article)
405
  soup = BeautifulSoup(dep_specific_text, features="html.parser")
406
  HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
 
23
  from spacy import displacy
24
  from spacy_streamlit import visualize_parser
25
 
26
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
27
  from transformers import pipeline
28
  import os
29
  from transformers_interpret import SequenceClassificationExplainer
30
 
 
31
  # USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
32
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
33
 
34
+
35
  @st.experimental_singleton
36
  def get_sentence_embedding_model():
37
  return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 
43
  return nlp
44
 
45
 
46
+ # TODO: might look into which one is the best here
47
+ # TODO: might be useful to make an ml6 preloaded model for flair as this takes ridiculously long to load the first time
48
  @st.experimental_singleton
49
+ # @st.cache(suppress_st_warning=True, allow_output_mutation=True)
50
  def get_flair_tagger():
51
  return SequenceTagger.load("flair/ner-english-ontonotes-fast")
52
 
53
 
54
+ @st.experimental_singleton
55
+ def get_transformer_pipeline():
56
+ tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
57
+ model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
58
+ return pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
59
+
60
+
61
  # Page setup
62
  st.set_page_config(
63
  page_title="Post-processing summarization fact checker",
 
104
  return data
105
 
106
 
107
+ def fetch_dependency_svg(filename: str) -> AnyStr:
108
+ with open(f'./dependency-images/{filename.lower()}.txt', 'r') as f:
109
+ data = f.read()
110
+ return data
111
+
112
+
113
  def display_summary(article_name: str):
114
  summary_content = fetch_summary_contents(article_name)
115
  st.session_state.summary_output = summary_content
 
135
  entities_this_sentence.append(str(entity))
136
 
137
  # FLAIR ENTITIES
138
+ # sentence_entities = Sentence(str(sentence))
139
+ # tagger.predict(sentence_entities)
140
+ # for entity in sentence_entities.get_spans('ner'):
141
+ # entities_this_sentence.append(entity.text)
142
+
143
+ # XLM ENTITIES
144
+ entities_xlm = [entity["word"] for entity in ner_model(str(sentence))]
145
+ for entity in entities_xlm:
146
+ entities_this_sentence.append(str(entity))
147
+
148
  entities_all_sentences.append(entities_this_sentence)
149
 
150
  return entities_all_sentences
 
207
  def render_dependency_parsing(text: str):
208
  html = render_sentence_custom(text)
209
  html = html.replace("\n\n", "\n")
210
+ # print(get_svg(html))
211
  st.write(get_svg(html), unsafe_allow_html=True)
212
 
213
 
 
295
 
296
  nlp = get_spacy()
297
  sentence_embedding_model = get_sentence_embedding_model()
298
+ # tagger = get_flair_tagger()
299
+ ner_model = get_transformer_pipeline()
300
 
301
  # GENERATING SUMMARIES PART
302
  st.header("Generating summaries")
 
330
  st.error('**Error**: No comment to classify. Please provide a comment.',
331
  help="Generate summary for the given article text")
332
 
 
 
 
 
 
333
 
334
  def render_svg(svg_file):
335
  with open(svg_file, "r") as f:
 
406
  "dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
407
  "currently selected article.")
408
  with st.spinner("Doing dependency parsing..."):
409
+ # TODO: write the right `if` condition here (only when an example article is selected AND its input is unchanged)
410
+ if selected_article == 'article11':
411
+ st.write(fetch_dependency_svg((selected_article)), unsafe_allow_html=True)
412
+ else:
413
+ summary_deps = check_dependency(False)
414
+ article_deps = check_dependency(True)
415
+ total_unmatched_deps = []
416
+ for summ_dep in summary_deps:
417
+ if not any(summ_dep['identifier'] in art_dep['identifier'] for art_dep in article_deps):
418
+ total_unmatched_deps.append(summ_dep)
419
+ # print(f'ALL UNMATCHED DEPS ARE: {total_unmatched_deps}')
420
+ # render_dependency_parsing(check_dependency(False))
421
+ if total_unmatched_deps:
422
+ for current_drawing_list in total_unmatched_deps:
423
+ render_dependency_parsing(current_drawing_list)
424
  dep_specific_text = fetch_dependency_specific_contents(selected_article)
425
  soup = BeautifulSoup(dep_specific_text, features="html.parser")
426
  HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
dependency-images/article11.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ <div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem"><img src="data:image/svg+xml;base64,CiAgPHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB4bWw6bGFuZz0iZW4iIGlkPSIwIiBjbGFzcz0iZGlzcGxhY3kiIHdpZHRoPSIxMjAwIiBoZWlnaHQ9Ijc1IiBkaXJlY3Rpb249Imx0ciIgc3R5bGU9Im1heC13aWR0aDogbm9uZTsgaGVpZ2h0OiA3NXB4OyBjb2xvcjogIzAwMDAwOyBiYWNrZ3JvdW5kOiAjZmZmZmZmOyBmb250LWZhbWlseTogQXJpYWw7IGRpcmVjdGlvbjogbHRyIj4KICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIxMCI+VGhlIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjEwIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI0NiI+aGVhZHBob25lcyA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI0NiI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMTQzIj5zdGFydCA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIxNDMiPjwvdHNwYW4+CiAgPC90ZXh0PgogIAogIDx0ZXh0IGNsYXNzPSJkaXNwbGFjeS10b2tlbiIgZmlsbD0iY3VycmVudENvbG9yIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIHk9IjcwIj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS13b3JkIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjE4MyI+YXQgPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMTgzIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29
yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIyMDUiPiQgPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMjA1Ij48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIyMjIiPjk5OSA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIyMjIiPjwvdHNwYW4+CiAgPC90ZXh0PgogIAogIDx0ZXh0IGNsYXNzPSJkaXNwbGFjeS10b2tlbiIgZmlsbD0iY3VycmVudENvbG9yIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIHk9IjcwIj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS13b3JkIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjI1NyI+YW5kIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjI1NyI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMjkyIj53aWxsIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjI5MiI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMzIzIj5iZSA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIzMjMiPjwvdHNwYW4+CiAgPC90ZXh0PgogIAogIDx0ZXh0IGNsYXNzPSJkaXNwbGFjeS10b2tlbiIgZmlsbD0iY3VycmVudENvbG9yIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIHk9IjcwIj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS13b3JkIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjM0OSI+YXZhaWxhYmxlIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjM0OSI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw
9ImN1cnJlbnRDb2xvciIgeD0iNDIxIj5zdGFydGluZyA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI0MjEiPjwvdHNwYW4+CiAgPC90ZXh0PgogIAogIDx0ZXh0IGNsYXNzPSJkaXNwbGFjeS10b2tlbiIgZmlsbD0iY3VycmVudENvbG9yIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIHk9IjcwIj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS13b3JkIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjQ4MiI+dG9kYXkgPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iNDgyIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI1MzAiPmluIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjUzMCI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iNjAxIj50aGUgPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iNjAxIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI2MzIiPlUuUy4gPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iNjMyIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICAgIDxnIGNsYXNzPSJkaXNwbGFjeS1hcnJvdyI+CiAgICAgICAgPHBhdGggY2xhc3M9ImRpc3BsYWN5LWFyYyIgaWQ9ImFycm93LTAtMCIgc3Ryb2tlLXdpZHRoPSIycHgiIGQ9Ik01NDAsNTAgQzU0MCw1IDY0Miw1IDY0Miw1MCIgZmlsbD0ibm9uZSIgc3Ryb2tlPSJyZWQiLz4KICAgICAgICA8dGV4dCBkeT0iMS4yNWVtIiBzdHlsZT0iZm9udC1zaXplOiAwLjhlbTsgbGV0dGVyLXNwYWNpbmc6IDFweCI+CiAgICAgICAgICAgIDx0ZXh0UGF0aCB4bGluazpocmVmPSIjYXJyb3ctMC0wIiBjbGFzcz0iZGlzcGxhY3ktbGFiZWwiIHN0YXJ0T2Zmc2V0PSI1MCUiIHNpZGU9InJpZ2h0IiBmaWxsPSJyZWQiIHRleHQtYW5jaG9yPSJtaWRkbGUiPnBvYmo8L3R
leHRQYXRoPgogICAgICAgIDwvdGV4dD4KICAgICAgICA8cGF0aCBjbGFzcz0iZGlzcGxhY3ktYXJyb3doZWFkIiBkPSJNNjQyLDUyIEw2NDYsNDQgNjM4LDQ0IiBmaWxsPSJyZWQiLz4KICAgIDwvZz4KICAgIDwvc3ZnPgogIA==" style=""/></div>