"""Streamlit app: fetch a web page, strip boilerplate with jusText, and
split the remaining content into lines for NER tagging with a locally
fine-tuned SecureBERT token-classification model."""

import streamlit as st
from annotated_text import annotated_text
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
import requests
import random
import justext
import pickle
from tqdm import tqdm
import torch
import jsonlines

st.sidebar.markdown("Enter the URLs to be processed!")

# Local fine-tuned SecureBERT NER checkpoint (path relative to the app dir).
model_checkpoint = "../SecureBERT-finetuned-ner/"
device = "cpu"

# add_prefix_space=True is required for RoBERTa-style tokenizers used on
# pre-split words in token classification.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=40
).to(device)

# NOTE(review): pickle.load is unsafe on untrusted data — acceptable here only
# because the artifact ships with the model; confirm it is trusted.
with open("../ner_classes.pkl", "rb") as f:
    ner_classes = pickle.load(f)

query_input = st.text_input("URL:")

if query_input:
    # Browser-like headers so sites that block the default python-requests
    # user agent still serve the page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }
    s = requests.Session()
    s.headers.update(headers)
    # Fix: the original request had no timeout, so a stalled server would
    # hang the Streamlit app indefinitely.
    response = s.get(query_input, timeout=30)

    # Keep only content paragraphs; jusText classifies nav/ads/footers as
    # boilerplate.
    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
    # Fix: build the text in one join instead of quadratic += concatenation.
    # (p.text + "\n" per paragraph reproduces the original result exactly,
    # including the empty-input case.)
    text = "".join(p.text + "\n" for p in paragraphs if not p.is_boilerplate)
    text = text.split("\n")