cpi-connect committed on
Commit
8e7e2b1
1 Parent(s): 80269cb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -0
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
from annotated_text import annotated_text
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
import requests
import random
import justext
import pickle
from tqdm import tqdm
import torch
import jsonlines

# Streamlit app: fetch a user-supplied web page, strip its boilerplate with
# jusText, and prepare the remaining text for token classification (NER)
# with a locally fine-tuned SecureBERT model.

st.sidebar.markdown("Enter the URLs to be processed!")

# Path to the local fine-tuned checkpoint; inference runs on CPU.
model_checkpoint = "../SecureBERT-finetuned-ner/"
device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=40).to(device)

# Label-id -> class-name mapping produced at training time.
# NOTE(review): unpickling is only safe because this file is a local training
# artifact; never load a pickle received from an untrusted source.
with open("../ner_classes.pkl", "rb") as f:
    ner_classes = pickle.load(f)

query_input = st.text_input("URL:")
if query_input:
    # Browser-like headers: some sites reject the default requests user agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }

    s = requests.Session()
    s.headers.update(headers)

    # Fix: bound the request with a timeout so an unresponsive host cannot
    # hang the app forever, and fail loudly on an HTTP error status instead
    # of silently parsing an error page as article content.
    response = s.get(query_input, timeout=30)
    response.raise_for_status()

    # Keep only the non-boilerplate paragraphs of the page.
    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
    # Fix: single join instead of quadratic `text += ...` concatenation;
    # each kept paragraph is still terminated by "\n" exactly as before.
    text = "".join(
        paragraph.text + "\n"
        for paragraph in paragraphs
        if not paragraph.is_boilerplate
    )

    text = text.split("\n")