rmdhirr committed on
Commit
b1ddb38
1 Parent(s): b5b6842

Update app.py

Files changed (1)
  1. app.py +55 -20
app.py CHANGED
@@ -1,18 +1,13 @@
 import gradio as gr
 import tensorflow as tf
-import pickle
 import numpy as np
-from sklearn.preprocessing import LabelEncoder
-
-# Load saved components
-with open('preprocessing_params.pkl', 'rb') as f:
-    preprocessing_params = pickle.load(f)
-with open('label_encoder.pkl', 'rb') as f:
-    label_encoder = pickle.load(f)
-with open('url_tokenizer.pkl', 'rb') as f:
-    url_tokenizer = pickle.load(f)
-with open('html_tokenizer.pkl', 'rb') as f:
-    html_tokenizer = pickle.load(f)
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+import re
 
 # Load the model
 model = tf.keras.models.load_model('new_phishing_detection_model.keras')
@@ -22,26 +17,66 @@ model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
               loss='binary_crossentropy',
               metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
 
-# Function to preprocess input
+# Preprocessing functions
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('wordnet')
+
+STOPWORDS = set(stopwords.words('english'))
+lemmatizer = WordNetLemmatizer()
+
+def preprocess_url(url):
+    url = url.lower()
+    url = re.sub(r'https?://', '', url)
+    url = re.sub(r'www\.', '', url)
+    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
+    url = re.sub(r'\s+', ' ', url).strip()
+    tokens = word_tokenize(url)
+    tokens = [word for word in tokens if word not in STOPWORDS]
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]
+    return ' '.join(tokens)
+
+def preprocess_html(html):
+    html = re.sub(r'<[^>]+>', ' ', html)
+    html = html.lower()
+    html = re.sub(r'https?://', '', html)
+    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
+    html = re.sub(r'\s+', ' ', html).strip()
+    tokens = word_tokenize(html)
+    tokens = [word for word in tokens if word not in STOPWORDS]
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]
+    return ' '.join(tokens)
+
+max_url_length = 180
+max_html_length = 2000
+max_words = 10000
+
+url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
+html_tokenizer = Tokenizer(num_words=max_words)
+
+# Dummy fit to initialize tokenizers
+url_tokenizer.fit_on_texts(["dummy"])
+html_tokenizer.fit_on_texts(["dummy"])
+
 def preprocess_input(input_text, tokenizer, max_length):
     sequences = tokenizer.texts_to_sequences([input_text])
-    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
+    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
     return padded_sequences
 
-# Function to get prediction
 def get_prediction(input_text, input_type):
     is_url = input_type == "URL"
     if is_url:
-        input_data = preprocess_input(input_text, url_tokenizer, preprocessing_params['max_url_length'])
-        input_data = [input_data, np.zeros((1, preprocessing_params['max_html_length']))]  # dummy HTML input
+        cleaned_text = preprocess_url(input_text)
+        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
+        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
     else:
-        input_data = preprocess_input(input_text, html_tokenizer, preprocessing_params['max_html_length'])
-        input_data = [np.zeros((1, preprocessing_params['max_url_length'])), input_data]  # dummy URL input
+        cleaned_text = preprocess_html(input_text)
+        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
+        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
 
     prediction = model.predict(input_data)[0][0]
     return prediction
 
-# Gradio UI
 def phishing_detection(input_text, input_type):
     prediction = get_prediction(input_text, input_type)
     if prediction > 0.5:
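
One caveat in the new version: both tokenizers are fit only on the placeholder string "dummy", so at inference time texts_to_sequences can only map tokens seen during that fit, and real input is largely reduced to empty sequences before padding. The previous revision avoided this by loading tokenizers that had been fit during training (url_tokenizer.pkl, html_tokenizer.pkl). A minimal sketch of restoring that behavior, assuming those pickle files still ship alongside the app:

import pickle

# Load tokenizers fit on the training corpus.
# File names taken from the previous revision of app.py; this assumes
# the .pkl artifacts are still present in the repository.
with open('url_tokenizer.pkl', 'rb') as f:
    url_tokenizer = pickle.load(f)
with open('html_tokenizer.pkl', 'rb') as f:
    html_tokenizer = pickle.load(f)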
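
The hunk ends mid-function, before the Gradio wiring. For context, a minimal sketch of how a function like phishing_detection is typically exposed with gr.Interface; the component choices, labels, and the assumption that the function returns a label string based on the 0.5 threshold are illustrative, not the commit's actual code:

# Hypothetical UI wiring, not part of this commit.
# Assumes phishing_detection returns a display string (e.g. "Phishing" /
# "Legitimate") derived from the prediction > 0.5 branch above.
iface = gr.Interface(
    fn=phishing_detection,
    inputs=[
        gr.Textbox(label="Input Text", lines=5),
        gr.Radio(["URL", "HTML"], label="Input Type"),
    ],
    outputs=gr.Textbox(label="Prediction"),
    title="Phishing Detection",
)
iface.launch()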