"""Streamlit app: fetch a web page, strip boilerplate with jusText, and
split the remaining content into lines for NER tagging with a locally
fine-tuned SecureBERT token-classification model."""

import streamlit as st
from annotated_text import annotated_text
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
import requests
import random
import justext
import pickle
from tqdm import tqdm
import torch
import jsonlines

st.sidebar.markdown("Enter the URLs to be processed!")

# Local fine-tuned SecureBERT NER checkpoint (path relative to the app dir).
model_checkpoint = "../SecureBERT-finetuned-ner/"
device = "cpu"

# add_prefix_space=True is required for RoBERTa-style tokenizers used on
# pre-split words in token classification.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=40
).to(device)

# NOTE(review): pickle.load is unsafe on untrusted data — acceptable here only
# because the artifact ships with the model; confirm it is trusted.
with open("../ner_classes.pkl", "rb") as f:
    ner_classes = pickle.load(f)

query_input = st.text_input("URL:")

if query_input:
    # Browser-like headers so sites that block the default python-requests
    # user agent still serve the page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }
    s = requests.Session()
    s.headers.update(headers)
    # Fix: the original request had no timeout, so a stalled server would
    # hang the Streamlit app indefinitely.
    response = s.get(query_input, timeout=30)

    # Keep only content paragraphs; jusText classifies nav/ads/footers as
    # boilerplate.
    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
    # Fix: build the text in one join instead of quadratic += concatenation.
    # (p.text + "\n" per paragraph reproduces the original result exactly,
    # including the empty-input case.)
    text = "".join(p.text + "\n" for p in paragraphs if not p.is_boilerplate)
    text = text.split("\n")