# Page-scrape residue (not code): "cpi-connect's picture" / "Update app.py" / "1608fe8"
import streamlit as st
from annotated_text import annotated_text
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import pipeline
import requests
import random
import justext
import pickle
from tqdm import tqdm
import torch
import jsonlines
# Streamlit app: download a user-supplied URL, strip boilerplate with jusText,
# run the "cpi-connect/SecureBERT-NER" token-classification pipeline over each
# remaining line of text and render the recognised entities inline with
# ``annotated_text``.

# Browser-like request headers (presumably so that sites which reject the
# default python-requests client still answer -- confirm).
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
}

# Upper bound (seconds) for the page download; without a timeout a stalled
# server would block the app indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Streamlit re-executes this script on every interaction, so an uncached
# pipeline would be rebuilt on each rerun.  ``st.cache_resource`` is missing
# from older Streamlit releases; fall back to an identity decorator there
# (no caching, i.e. the original behaviour).
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_ner_pipeline():
    """Build the SecureBERT-NER token-classification pipeline.

    Returns:
        The callable pipeline; calling it on a string yields a list of
        entity dicts (this script reads their "start", "end" and
        "entity_group" keys).
    """
    # NOTE(review): newer transformers releases may prefer
    # ``aggregation_strategy`` over ``grouped_entities`` -- confirm before
    # changing; the argument is kept as-is to preserve the grouping behaviour.
    return pipeline(
        "token-classification",
        model="cpi-connect/SecureBERT-NER",
        grouped_entities=True,
    )


def _fetch_text_blocks(url):
    """Download *url* and return its non-boilerplate text, one line per item.

    Args:
        url: Address of the page to download.

    Returns:
        A list of non-empty strings: the text of every paragraph jusText does
        not classify as boilerplate, split on newlines.

    Raises:
        requests.RequestException: If the URL is malformed, the request fails
            or times out, or the server answers with an HTTP error status.
    """
    # NOTE(review): the URL comes straight from the text input and is fetched
    # by this process; if untrusted users can reach the app, consider
    # restricting which hosts may be requested.
    with requests.Session() as session:
        session.headers.update(REQUEST_HEADERS)
        response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Do not run entity recognition over 4xx/5xx error pages.
        response.raise_for_status()
        content = response.content

    paragraphs = justext.justext(content, justext.get_stoplist("English"))
    blocks = []
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        # Split on "\n" only, so a paragraph spanning several lines becomes
        # several blocks; empty lines are dropped.
        blocks.extend(line for line in paragraph.text.split("\n") if line)
    return blocks


def _annotate_block(text_block, entities):
    """Interleave plain text and labelled entity spans for ``annotated_text``.

    Args:
        text_block: The string the entities were extracted from.
        entities: Iterable of dicts with "start", "end" and "entity_group"
            keys, in the order the pipeline returned them.

    Returns:
        A list mixing plain strings and ``(text, label)`` tuples.  An entity
        that begins exactly where the previous one ended and carries the same
        label is merged into the previous tuple.  Empty plain-text segments
        are omitted.
    """
    annotated = []
    last_label, last_end = None, 0
    for entity in entities:
        start, end = entity["start"], entity["end"]
        label = entity["entity_group"]
        if annotated and last_label == label and last_end == start:
            # Contiguous continuation of the previous entity: extend it.
            previous_text, previous_label = annotated[-1]
            annotated[-1] = (previous_text + text_block[start:end], previous_label)
        else:
            gap = text_block[last_end:start]
            if gap:
                annotated.append(gap)
            annotated.append((text_block[start:end], label))
        last_label, last_end = label, end

    # Whatever follows the last entity (the whole block if there were none).
    tail = text_block[last_end:]
    if tail:
        annotated.append(tail)
    return annotated


st.title('Identifying Cybersecurity Entities on Webpages')
query_input = st.text_input("URL:")
if query_input:
    try:
        text = _fetch_text_blocks(query_input)
    except requests.RequestException as exc:
        # Show a readable message instead of a raw traceback.
        st.error(f"Could not fetch {query_input}: {exc}")
    else:
        if text:
            # Only load the model once there is something to analyse.
            pipe = _load_ner_pipeline()
            for text_block in text:
                entities = pipe(text_block)
                annotated_text(_annotate_block(text_block, entities))
        else:
            st.warning("No readable text was found on that page.")