import justext
import requests
import streamlit as st
from annotated_text import annotated_text
from transformers import pipeline

st.title("Identifying Cybersecurity Entities on Webpages")

query_input = st.text_input("URL:")

if query_input:
    # Fetch the page with browser-like headers so sites that reject the
    # default requests user agent still respond.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }
    s = requests.Session()
    s.headers.update(headers)
    response = s.get(query_input, timeout=30)

    # Strip boilerplate (navigation, footers, ads) with jusText and keep
    # only the main content paragraphs, split into non-empty text blocks.
    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
    text = ""
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            text += paragraph.text + "\n"
    text_blocks = [block for block in text.split("\n") if block != ""]

    # SecureBERT-NER tags cybersecurity entities (malware, threat actors,
    # tools, etc.). aggregation_strategy="simple" replaces the deprecated
    # grouped_entities=True and merges word pieces into whole entities.
    pipe = pipeline(
        "token-classification",
        model="cpi-connect/SecureBERT-NER",
        aggregation_strategy="simple",
    )

    for text_block in text_blocks:
        entities = pipe(text_block)

        # Build the interleaved sequence st-annotated-text expects: plain
        # strings for untagged spans, (text, label) tuples for entities.
        annotated = []
        last_entity, last_idx = None, None
        for entity in entities:
            start, end, group = entity["start"], entity["end"], entity["entity_group"]
            if last_idx is None:
                # First entity: emit the leading plain text, then the entity.
                annotated.append(text_block[:start])
                annotated.append((text_block[start:end], group))
            elif last_entity == group and last_idx == start:
                # Contiguous entity with the same label: merge it into the
                # previous tuple instead of emitting a second annotation.
                annotated[-1] = (annotated[-1][0] + text_block[start:end], annotated[-1][1])
            else:
                # Emit the plain text between entities, then the new entity.
                annotated.append(text_block[last_idx:start])
                annotated.append((text_block[start:end], group))
            last_entity, last_idx = group, end

        # Trailing plain text after the final entity; if no entities were
        # found, last_idx is None and this appends the whole block.
        annotated.append(text_block[last_idx:])
        annotated_text(*annotated)
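
# Note: as written above, pipeline(...) reloads SecureBERT-NER on every
# Streamlit rerun, since each widget interaction re-executes the script.
# A minimal sketch of one common fix, assuming a Streamlit version with
# st.cache_resource (>= 1.18); load_ner_pipeline is a hypothetical helper
# name, not part of the original app:
#
#     @st.cache_resource
#     def load_ner_pipeline():
#         """Load the NER pipeline once and reuse it across reruns."""
#         return pipeline(
#             "token-classification",
#             model="cpi-connect/SecureBERT-NER",
#             aggregation_strategy="simple",
#         )
#
#     pipe = load_ner_pipeline()
#
# With this in place the model download and weight loading happen once per
# server process instead of once per interaction.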