import streamlit as st
from annotated_text import annotated_text
from transformers import AutoModelForTokenClassification, AutoTokenizer
import requests
import random
import justext
import pickle
from tqdm import tqdm
import torch
import jsonlines

st.sidebar.markdown("Enter the URL to be processed!")

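# Load the fine-tuned SecureBERT NER model and its tokenizer from the local
# checkpoint. add_prefix_space=True is required because SecureBERT uses a
# RoBERTa-style byte-level BPE tokenizer; num_labels=40 matches the number
# of entity classes the model was fine-tuned on.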
model_checkpoint = "../SecureBERT-finetuned-ner/"
device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=40).to(device)

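# The pickle file stores the NER class names, used to map predicted label ids
# back to human-readable tags.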
with open("../ner_classes.pkl", "rb") as f:
    ner_classes = pickle.load(f)

query_input = st.text_input("URL:")
if query_input:
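    # Browser-like headers so sites that block default Python user agents
    # still serve the page.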
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }

    s = requests.Session()
    s.headers.update(headers)

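    # Fetch the page and keep only the main-content paragraphs: jusText
    # classifies and drops boilerplate such as navigation, ads, and footers.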
    response = s.get(query_input)
    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
    text = ""
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            text += paragraph.text + "\n"

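    # Split the extracted content into individual lines for line-by-line NER.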
    text = text.split("\n")
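
    # A minimal sketch (an assumption, not the confirmed next step of this app)
    # of how the pieces above could be combined: run token classification over
    # each extracted line and render the predictions with annotated_text. It
    # assumes ner_classes maps label ids to tag names and that "O" marks
    # non-entity tokens; subword and special tokens are shown as-is for brevity.
    for line in text:
        if not line.strip():
            continue
        inputs = tokenizer(line, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        predictions = logits.argmax(dim=-1)[0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
        annotated = []
        for token, pred in zip(tokens, predictions):
            label = ner_classes[pred]
            if label == "O":
                annotated.append(token + " ")
            else:
                annotated.append((token, label))
        annotated_text(*annotated)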