Use a transformer NER pipeline instead of Flair, and use a pre-rendered dependency image for one article as a test
Browse files
- app.py +46 -26
- dependency-images/article11.txt +1 -0
app.py
CHANGED
@@ -23,15 +23,15 @@ import spacy
|
|
23 |
from spacy import displacy
|
24 |
from spacy_streamlit import visualize_parser
|
25 |
|
26 |
-
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
27 |
from transformers import pipeline
|
28 |
import os
|
29 |
from transformers_interpret import SequenceClassificationExplainer
|
30 |
|
31 |
-
|
32 |
# USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
|
33 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
34 |
|
|
|
35 |
@st.experimental_singleton
|
36 |
def get_sentence_embedding_model():
|
37 |
return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
@@ -43,14 +43,21 @@ def get_spacy():
|
|
43 |
return nlp
|
44 |
|
45 |
|
46 |
-
#TODO: might look into which one is the best here
|
47 |
-
#TODO: might be useful to make an ml6 preloaded model for flair as this takes ridiculously long to load the first time
|
48 |
@st.experimental_singleton
|
49 |
-
|
50 |
def get_flair_tagger():
|
51 |
return SequenceTagger.load("flair/ner-english-ontonotes-fast")
|
52 |
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
# Page setup
|
55 |
st.set_page_config(
|
56 |
page_title="Post-processing summarization fact checker",
|
@@ -97,6 +104,12 @@ def fetch_dependency_specific_contents(filename: str) -> AnyStr:
|
|
97 |
return data
|
98 |
|
99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
def display_summary(article_name: str):
|
101 |
summary_content = fetch_summary_contents(article_name)
|
102 |
st.session_state.summary_output = summary_content
|
@@ -122,10 +135,16 @@ def get_all_entities_per_sentence(text):
|
|
122 |
entities_this_sentence.append(str(entity))
|
123 |
|
124 |
# FLAIR ENTITIES
|
125 |
-
sentence_entities = Sentence(str(sentence))
|
126 |
-
tagger.predict(sentence_entities)
|
127 |
-
for entity in sentence_entities.get_spans('ner'):
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
entities_all_sentences.append(entities_this_sentence)
|
130 |
|
131 |
return entities_all_sentences
|
@@ -188,6 +207,7 @@ def highlight_entities(article_name: str):
|
|
188 |
def render_dependency_parsing(text: str):
|
189 |
html = render_sentence_custom(text)
|
190 |
html = html.replace("\n\n", "\n")
|
|
|
191 |
st.write(get_svg(html), unsafe_allow_html=True)
|
192 |
|
193 |
|
@@ -275,7 +295,8 @@ currently selected article.""")
|
|
275 |
|
276 |
nlp = get_spacy()
|
277 |
sentence_embedding_model = get_sentence_embedding_model()
|
278 |
-
tagger = get_flair_tagger()
|
|
|
279 |
|
280 |
# GENERATING SUMMARIES PART
|
281 |
st.header("Generating summaries")
|
@@ -309,11 +330,6 @@ else:
|
|
309 |
st.error('**Error**: No comment to classify. Please provide a comment.',
|
310 |
help="Generate summary for the given article text")
|
311 |
|
312 |
-
if is_valid_url(article_text):
|
313 |
-
print("YES")
|
314 |
-
else:
|
315 |
-
print("NO")
|
316 |
-
|
317 |
|
318 |
def render_svg(svg_file):
|
319 |
with open(svg_file, "r") as f:
|
@@ -390,17 +406,21 @@ st.markdown("However, by empirical testing, we have found that there are certain
|
|
390 |
"dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
|
391 |
"currently selected article.")
|
392 |
with st.spinner("Doing dependency parsing..."):
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
|
|
|
|
|
|
|
|
404 |
dep_specific_text = fetch_dependency_specific_contents(selected_article)
|
405 |
soup = BeautifulSoup(dep_specific_text, features="html.parser")
|
406 |
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
|
|
|
23 |
from spacy import displacy
|
24 |
from spacy_streamlit import visualize_parser
|
25 |
|
26 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
|
27 |
from transformers import pipeline
|
28 |
import os
|
29 |
from transformers_interpret import SequenceClassificationExplainer
|
30 |
|
|
|
31 |
# USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
|
32 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
33 |
|
34 |
+
|
35 |
@st.experimental_singleton
|
36 |
def get_sentence_embedding_model():
|
37 |
return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
|
|
43 |
return nlp
|
44 |
|
45 |
|
46 |
+
# TODO: might look into which one is the best here
|
47 |
+
# TODO: might be useful to make an ml6 preloaded model for flair as this takes ridiculously long to load the first time
|
48 |
@st.experimental_singleton
|
49 |
+
# @st.cache(suppress_st_warning=True, allow_output_mutation=True)
|
50 |
def get_flair_tagger():
|
51 |
return SequenceTagger.load("flair/ner-english-ontonotes-fast")
|
52 |
|
53 |
|
54 |
+
@st.experimental_singleton
|
55 |
+
def get_transformer_pipeline():
|
56 |
+
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
|
57 |
+
model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
|
58 |
+
return pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
|
59 |
+
|
60 |
+
|
61 |
# Page setup
|
62 |
st.set_page_config(
|
63 |
page_title="Post-processing summarization fact checker",
|
|
|
104 |
return data
|
105 |
|
106 |
|
107 |
+
def fetch_dependency_svg(filename: str) -> AnyStr:
|
108 |
+
with open(f'./dependency-images/{filename.lower()}.txt', 'r') as f:
|
109 |
+
data = f.read()
|
110 |
+
return data
|
111 |
+
|
112 |
+
|
113 |
def display_summary(article_name: str):
|
114 |
summary_content = fetch_summary_contents(article_name)
|
115 |
st.session_state.summary_output = summary_content
|
|
|
135 |
entities_this_sentence.append(str(entity))
|
136 |
|
137 |
# FLAIR ENTITIES
|
138 |
+
# sentence_entities = Sentence(str(sentence))
|
139 |
+
# tagger.predict(sentence_entities)
|
140 |
+
# for entity in sentence_entities.get_spans('ner'):
|
141 |
+
# entities_this_sentence.append(entity.text)
|
142 |
+
|
143 |
+
# XLM ENTITIES
|
144 |
+
entities_xlm = [entity["word"] for entity in ner_model(str(sentence))]
|
145 |
+
for entity in entities_xlm:
|
146 |
+
entities_this_sentence.append(str(entity))
|
147 |
+
|
148 |
entities_all_sentences.append(entities_this_sentence)
|
149 |
|
150 |
return entities_all_sentences
|
|
|
207 |
def render_dependency_parsing(text: str):
|
208 |
html = render_sentence_custom(text)
|
209 |
html = html.replace("\n\n", "\n")
|
210 |
+
# print(get_svg(html))
|
211 |
st.write(get_svg(html), unsafe_allow_html=True)
|
212 |
|
213 |
|
|
|
295 |
|
296 |
nlp = get_spacy()
|
297 |
sentence_embedding_model = get_sentence_embedding_model()
|
298 |
+
# tagger = get_flair_tagger()
|
299 |
+
ner_model = get_transformer_pipeline()
|
300 |
|
301 |
# GENERATING SUMMARIES PART
|
302 |
st.header("Generating summaries")
|
|
|
330 |
st.error('**Error**: No comment to classify. Please provide a comment.',
|
331 |
help="Generate summary for the given article text")
|
332 |
|
|
|
|
|
|
|
|
|
|
|
333 |
|
334 |
def render_svg(svg_file):
|
335 |
with open(svg_file, "r") as f:
|
|
|
406 |
"dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
|
407 |
"currently selected article.")
|
408 |
with st.spinner("Doing dependency parsing..."):
|
409 |
+
# TODO: write the right if-condition (only when an example article is selected and its input is unchanged)
|
410 |
+
if selected_article == 'article11':
|
411 |
+
st.write(fetch_dependency_svg((selected_article)), unsafe_allow_html=True)
|
412 |
+
else:
|
413 |
+
summary_deps = check_dependency(False)
|
414 |
+
article_deps = check_dependency(True)
|
415 |
+
total_unmatched_deps = []
|
416 |
+
for summ_dep in summary_deps:
|
417 |
+
if not any(summ_dep['identifier'] in art_dep['identifier'] for art_dep in article_deps):
|
418 |
+
total_unmatched_deps.append(summ_dep)
|
419 |
+
# print(f'ALL UNMATCHED DEPS ARE: {total_unmatched_deps}')
|
420 |
+
# render_dependency_parsing(check_dependency(False))
|
421 |
+
if total_unmatched_deps:
|
422 |
+
for current_drawing_list in total_unmatched_deps:
|
423 |
+
render_dependency_parsing(current_drawing_list)
|
424 |
dep_specific_text = fetch_dependency_specific_contents(selected_article)
|
425 |
soup = BeautifulSoup(dep_specific_text, features="html.parser")
|
426 |
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
|
dependency-images/article11.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem"><img src="data:image/svg+xml;base64,CiAgPHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB4bWw6bGFuZz0iZW4iIGlkPSIwIiBjbGFzcz0iZGlzcGxhY3kiIHdpZHRoPSIxMjAwIiBoZWlnaHQ9Ijc1IiBkaXJlY3Rpb249Imx0ciIgc3R5bGU9Im1heC13aWR0aDogbm9uZTsgaGVpZ2h0OiA3NXB4OyBjb2xvcjogIzAwMDAwOyBiYWNrZ3JvdW5kOiAjZmZmZmZmOyBmb250LWZhbWlseTogQXJpYWw7IGRpcmVjdGlvbjogbHRyIj4KICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIxMCI+VGhlIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjEwIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI0NiI+aGVhZHBob25lcyA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI0NiI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMTQzIj5zdGFydCA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIxNDMiPjwvdHNwYW4+CiAgPC90ZXh0PgogIAogIDx0ZXh0IGNsYXNzPSJkaXNwbGFjeS10b2tlbiIgZmlsbD0iY3VycmVudENvbG9yIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIHk9IjcwIj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS13b3JkIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjE4MyI+YXQgPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMTgzIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZ
CIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIyMDUiPiQgPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMjA1Ij48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIyMjIiPjk5OSA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIyMjIiPjwvdHNwYW4+CiAgPC90ZXh0PgogIAogIDx0ZXh0IGNsYXNzPSJkaXNwbGFjeS10b2tlbiIgZmlsbD0iY3VycmVudENvbG9yIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIHk9IjcwIj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS13b3JkIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjI1NyI+YW5kIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjI1NyI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMjkyIj53aWxsIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjI5MiI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iMzIzIj5iZSA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSIzMjMiPjwvdHNwYW4+CiAgPC90ZXh0PgogIAogIDx0ZXh0IGNsYXNzPSJkaXNwbGFjeS10b2tlbiIgZmlsbD0iY3VycmVudENvbG9yIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIHk9IjcwIj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS13b3JkIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjM0OSI+YXZhaWxhYmxlIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjM0OSI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw9I
mN1cnJlbnRDb2xvciIgeD0iNDIxIj5zdGFydGluZyA8L3RzcGFuPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXRhZyIgZHk9IjJlbSIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI0MjEiPjwvdHNwYW4+CiAgPC90ZXh0PgogIAogIDx0ZXh0IGNsYXNzPSJkaXNwbGFjeS10b2tlbiIgZmlsbD0iY3VycmVudENvbG9yIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIHk9IjcwIj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS13b3JkIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjQ4MiI+dG9kYXkgPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iNDgyIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI1MzAiPmluIDwvdHNwYW4+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktdGFnIiBkeT0iMmVtIiBmaWxsPSJjdXJyZW50Q29sb3IiIHg9IjUzMCI+PC90c3Bhbj4KICA8L3RleHQ+CiAgCiAgPHRleHQgY2xhc3M9ImRpc3BsYWN5LXRva2VuIiBmaWxsPSJjdXJyZW50Q29sb3IiIHRleHQtYW5jaG9yPSJzdGFydCIgeT0iNzAiPgogICAgICA8dHNwYW4gY2xhc3M9ImRpc3BsYWN5LXdvcmQiIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iNjAxIj50aGUgPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iNjAxIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICA8dGV4dCBjbGFzcz0iZGlzcGxhY3ktdG9rZW4iIGZpbGw9ImN1cnJlbnRDb2xvciIgdGV4dC1hbmNob3I9InN0YXJ0IiB5PSI3MCI+CiAgICAgIDx0c3BhbiBjbGFzcz0iZGlzcGxhY3ktd29yZCIgZmlsbD0iY3VycmVudENvbG9yIiB4PSI2MzIiPlUuUy4gPC90c3Bhbj4KICAgICAgPHRzcGFuIGNsYXNzPSJkaXNwbGFjeS10YWciIGR5PSIyZW0iIGZpbGw9ImN1cnJlbnRDb2xvciIgeD0iNjMyIj48L3RzcGFuPgogIDwvdGV4dD4KICAKICAgIDxnIGNsYXNzPSJkaXNwbGFjeS1hcnJvdyI+CiAgICAgICAgPHBhdGggY2xhc3M9ImRpc3BsYWN5LWFyYyIgaWQ9ImFycm93LTAtMCIgc3Ryb2tlLXdpZHRoPSIycHgiIGQ9Ik01NDAsNTAgQzU0MCw1IDY0Miw1IDY0Miw1MCIgZmlsbD0ibm9uZSIgc3Ryb2tlPSJyZWQiLz4KICAgICAgICA8dGV4dCBkeT0iMS4yNWVtIiBzdHlsZT0iZm9udC1zaXplOiAwLjhlbTsgbGV0dGVyLXNwYWNpbmc6IDFweCI+CiAgICAgICAgICAgIDx0ZXh0UGF0aCB4bGluazpocmVmPSIjYXJyb3ctMC0wIiBjbGFzcz0iZGlzcGxhY3ktbGFiZWwiIHN0YXJ0T2Zmc2V0PSI1MCUiIHNpZGU9InJpZ2h0IiBmaWxsPSJyZWQiIHRleHQtYW5jaG9yPSJtaWRkbGUiPnBvYmo8L3Rle
HRQYXRoPgogICAgICAgIDwvdGV4dD4KICAgICAgICA8cGF0aCBjbGFzcz0iZGlzcGxhY3ktYXJyb3doZWFkIiBkPSJNNjQyLDUyIEw2NDYsNDQgNjM4LDQ0IiBmaWxsPSJyZWQiLz4KICAgIDwvZz4KICAgIDwvc3ZnPgogIA==" style=""/></div>
|