File size: 2,414 Bytes
8e7e2b1
 
 
 
dc4e268
8e7e2b1
 
 
 
 
 
 
 
1608fe8
8e7e2b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc4e268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import streamlit as st
from annotated_text import annotated_text
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import pipeline
import requests 
import random
import justext
import pickle
from tqdm import tqdm
import torch
import jsonlines

# Browser-like headers sent with every page request (presumably so that sites
# which reject the default python-requests client still answer).
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
}

# Upper bound, in seconds, on connecting to / reading from the remote site so
# that an unresponsive host cannot hang the app indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30

_NER_MODEL_NAME = "cpi-connect/SecureBERT-NER"

# Streamlit re-executes this script on every interaction; caching keeps the
# model loaded across reruns. Streamlit releases without `cache_resource`
# fall back to an identity decorator, i.e. the loader runs uncached.
_cache_model = getattr(st, "cache_resource", lambda func: func)


@_cache_model
def _load_ner_pipeline():
    """Build the token-classification pipeline used to tag entities.

    `aggregation_strategy="simple"` is the non-deprecated spelling of
    `grouped_entities=True`: tokens of one entity are merged into a single
    result carrying `entity_group`, `start` and `end`.
    """
    return pipeline(
        "token-classification",
        model=_NER_MODEL_NAME,
        aggregation_strategy="simple",
    )


def _fetch_text_blocks(url):
    """Download `url` and return its non-boilerplate text as a list of lines.

    Args:
        url: Address of the web page to download.

    Returns:
        A list of non-empty strings, one per line of every paragraph that
        jusText does not classify as boilerplate.

    Raises:
        requests.RequestException: If the URL is malformed, the request fails
            or times out, or the server answers with an HTTP error status.
    """
    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        session.headers.update(_REQUEST_HEADERS)
        response = session.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        # Do not run entity extraction on 4xx/5xx error pages.
        response.raise_for_status()
        html = response.content

    paragraphs = justext.justext(html, justext.get_stoplist("English"))
    text_blocks = []
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            text_blocks.extend(line for line in paragraph.text.split("\n") if line)
    return text_blocks


def _annotate(text_block, entities):
    """Interleave plain text and labelled entity spans for `annotated_text`.

    Args:
        text_block: The string that was passed to the NER pipeline.
        entities: The pipeline results for `text_block`; each item is read for
            its "start", "end" and "entity_group" keys.

    Returns:
        A list mixing plain strings (unlabelled text) and `(text, label)`
        tuples (entities), in reading order. Directly adjacent entities that
        share a label are merged into a single tuple.
    """
    annotated = []
    last_label, last_end = None, 0
    for entity in entities:
        start, end, label = entity["start"], entity["end"], entity["entity_group"]
        span = text_block[start:end]
        if last_label == label and last_end == start:
            # Same label and no gap: extend the previous entity tuple, which
            # is always the last element at this point.
            previous_text, _ = annotated[-1]
            annotated[-1] = (previous_text + span, label)
        else:
            gap = text_block[last_end:start]
            if gap:
                annotated.append(gap)
            annotated.append((span, label))
        last_label, last_end = label, end

    # Whatever follows the last entity (or the whole block if none was found).
    tail = text_block[last_end:]
    if tail:
        annotated.append(tail)
    return annotated


st.title('Identifying Cybersecurity Entities on Webpages')

query_input = st.text_input("URL:")
if query_input:
    try:
        text = _fetch_text_blocks(query_input)
    except requests.RequestException as err:
        # Show a readable message instead of crashing with a traceback.
        st.error(f"Could not fetch {query_input}: {err}")
        text = []
    else:
        if not text:
            st.warning("No readable text was found on that page.")

    # Only load the model when there is something to annotate.
    if text:
        pipe = _load_ner_pipeline()
        for text_block in text:
            annotated = _annotate(text_block, pipe(text_block))
            annotated_text(annotated)