Spaces:

CyberPeace-Institute
/

SecureBERT-NER-Space

Sleeping

App Files Files Community

SecureBERT-NER-Space / app.py

cpi-connect

Update app.py

1608fe8 over 1 year ago

raw

history blame contribute delete

2.41 kB

	import streamlit as st
	from annotated_text import annotated_text
	from transformers import AutoModelForTokenClassification
	from transformers import AutoTokenizer
	from transformers import pipeline
	import requests
	import random
	import justext
	import pickle
	from tqdm import tqdm
	import torch
	import jsonlines

	# Streamlit app: fetch a web page, strip boilerplate with jusText, run the
	# SecureBERT-NER token-classification model on each remaining text block and
	# render the detected entities inline.

	# Seconds to wait for the remote server. Without an explicit timeout,
	# `requests` may block forever on an unresponsive host.
	_REQUEST_TIMEOUT = 30

	# Browser-like headers sent with every page fetch.
	_REQUEST_HEADERS = {
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
	    # "*/*" is the wildcard media range; a bare "/" is not a valid value.
	    "Accept": "application/json, text/plain, */*",
	    "Accept-Language": "en-US,en;q=0.5",
	    "Accept-Encoding": "gzip, deflate",
	}


	def _fetch_page(url):
	    """Download *url* and return the raw response body as bytes.

	    Raises:
	        requests.RequestException: for malformed URLs, connection problems,
	            timeouts and non-2xx HTTP status codes.
	    """
	    # The context manager releases the session's pooled connections.
	    with requests.Session() as session:
	        session.headers.update(_REQUEST_HEADERS)
	        response = session.get(url, timeout=_REQUEST_TIMEOUT)
	        # Do not run the model on an HTTP error page.
	        response.raise_for_status()
	        return response.content


	def _extract_text_blocks(html):
	    """Return the non-empty lines of every non-boilerplate paragraph in *html*."""
	    paragraphs = justext.justext(html, justext.get_stoplist("English"))
	    kept = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)
	    return [line for line in kept.split("\n") if line]


	def _annotate_block(text_block, entities):
	    """Turn pipeline output for *text_block* into `annotated_text` segments.

	    Args:
	        text_block: the string that was passed to the pipeline.
	        entities: iterable of dicts with "start", "end" and "entity_group"
	            keys, in order of appearance in *text_block*.

	    Returns:
	        A list mixing plain strings (unlabelled text) and
	        ``(text, label)`` tuples (entities). Consecutive entities that share
	        a label and touch each other are merged into a single tuple.
	    """
	    segments = []
	    cursor = 0  # end offset of the text consumed so far
	    last_label = None
	    for entity in entities:
	        start, end = entity["start"], entity["end"]
	        label = entity["entity_group"]
	        span = text_block[start:end]
	        if label == last_label and start == cursor:
	            # Same label and directly adjacent: extend the previous entity.
	            # The last segment is always an entity tuple here because no
	            # plain text lies between `cursor` and `start`.
	            previous_text, _ = segments[-1]
	            segments[-1] = (previous_text + span, label)
	        else:
	            gap = text_block[cursor:start]
	            if gap:
	                segments.append(gap)
	            segments.append((span, label))
	        last_label, cursor = label, end

	    tail = text_block[cursor:]
	    if tail:
	        segments.append(tail)
	    return segments


	def _render_entities(url):
	    """Fetch *url*, run NER over its main text and render the result."""
	    # NOTE(review): the URL is user-supplied and fetched server-side, so this
	    # can reach hosts visible only to the server (SSRF). Consider restricting
	    # the allowed hosts if the app is deployed on a private network.
	    try:
	        html = _fetch_page(url)
	    except requests.RequestException as exc:
	        st.error(f"Could not retrieve the page: {exc}")
	        return

	    text_blocks = _extract_text_blocks(html)
	    if not text_blocks:
	        # Nothing to analyse, so skip the expensive model load.
	        st.warning("No readable text was found on this page.")
	        return

	    # NOTE(review): `grouped_entities` is deprecated in recent transformers
	    # releases in favour of `aggregation_strategy="simple"` - confirm against
	    # the pinned version before switching. The pipeline is also rebuilt on
	    # every Streamlit rerun; caching it would avoid repeated model loads.
	    pipe = pipeline("token-classification", model="cpi-connect/SecureBERT-NER", grouped_entities=True)

	    for text_block in text_blocks:
	        annotated_text(_annotate_block(text_block, pipe(text_block)))


	st.title('Identifying Cybersecurity Entities on Webpages')

	query_input = st.text_input("URL:")
	if query_input:
	    _render_entities(query_input)