# Streamlit app: fetch a web page, strip boilerplate with jusText, run the
# cpi-connect/SecureBERT-NER token-classification pipeline over every remaining
# text block and render the recognised entities inline with annotated_text.

import streamlit as st
from annotated_text import annotated_text
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import pipeline
import requests
import random
import justext
import pickle
from tqdm import tqdm
import torch
import jsonlines

# Upper bound (seconds) on the page download so a dead host cannot hang the app.
REQUEST_TIMEOUT_SECONDS = 30

# Browser-like headers sent with the page request.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
}


@st.cache_resource
def _load_ner_pipeline():
    """Build the NER pipeline once and reuse it across Streamlit reruns.

    ``aggregation_strategy="simple"`` is the non-deprecated spelling of the
    former ``grouped_entities=True`` flag.
    """
    return pipeline(
        "token-classification",
        model="cpi-connect/SecureBERT-NER",
        aggregation_strategy="simple",
    )


def _fetch_text_blocks(url):
    """Download *url* and return its non-boilerplate text as a list of lines.

    Raises:
        requests.RequestException: the URL is malformed, the request fails or
            times out, or the server answers with an HTTP error status.
    """
    # NOTE: the URL comes straight from the user and is fetched by this process.
    with requests.Session() as session:
        session.headers.update(REQUEST_HEADERS)
        response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        content = response.content

    # Nothing to parse; skip jusText entirely for an empty body.
    if not content.strip():
        return []

    paragraphs = justext.justext(content, justext.get_stoplist("English"))
    text = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)
    return [line for line in text.split("\n") if line]


def _build_annotations(text_block, entities):
    """Turn pipeline entities into the argument list for ``annotated_text``.

    Plain text between entities is emitted as ``str``; each entity becomes a
    ``(text, label)`` tuple. Consecutive entities that share a label and touch
    (previous ``end`` equals next ``start``) are merged into a single tuple.
    Empty plain-text segments are omitted.
    """
    annotated = []
    last_label, last_end = None, 0

    for entity in entities:
        start, end = entity["start"], entity["end"]
        label = entity["entity_group"]
        span = text_block[start:end]

        if annotated and label == last_label and start == last_end:
            # Adjacent span with the same label: extend the previous tuple.
            previous_text, previous_label = annotated[-1]
            annotated[-1] = (previous_text + span, previous_label)
        else:
            gap = text_block[last_end:start]
            if gap:
                annotated.append(gap)
            annotated.append((span, label))

        last_label, last_end = label, end

    tail = text_block[last_end:]
    if tail:
        annotated.append(tail)
    return annotated


st.title('Identifying Cybersecurity Entities on Webpages')

query_input = st.text_input("URL:")

if query_input:
    try:
        text = _fetch_text_blocks(query_input)
    except requests.RequestException as exc:
        # Show a readable message instead of a raw traceback in the UI.
        st.error(f"Could not fetch {query_input}: {exc}")
        st.stop()

    if not text:
        st.warning("No main content could be extracted from this page.")
        st.stop()

    pipe = _load_ner_pipeline()

    for text_block in text:
        entities = pipe(text_block)
        # Unpack so each segment is its own positional argument.
        annotated_text(*_build_annotations(text_block, entities))