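"""Streamlit demo: load a fine-tuned SecureBERT token-classification (NER) model,
fetch a user-supplied URL, and extract the page's main text with jusText."""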
import streamlit as st
from annotated_text import annotated_text
from transformers import AutoModelForTokenClassification, AutoTokenizer
import requests
import random
import justext
import pickle
from tqdm import tqdm
import torch
import jsonlines
st.sidebar.markdown("Enter the URLs to be processed!")
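# Load the fine-tuned SecureBERT NER checkpoint, its tokenizer, and the pickled
# label classes from local paths relative to the app.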
model_checkpoint = "../SecureBERT-finetuned-ner/"
device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=40).to(device)
with open("../ner_classes.pkl", "rb") as f:
    ner_classes = pickle.load(f)
query_input = st.text_input("URL:")
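# When a URL is submitted, fetch it with browser-like headers, then keep only
# the paragraphs jusText does not flag as boilerplate.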
if query_input:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }
    s = requests.Session()
    s.headers.update(headers)
    response = s.get(query_input)
    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
    text = ""
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            text += paragraph.text + "\n"
    text = text.split("\n")