import gradio as gr
import nltk
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
import pandas as pd
# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
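# Note: on newer NLTK releases (3.9+), word_tokenize looks for the 'punkt_tab'
# resource; downloading it as well is a safe no-op on older versions.
nltk.download('punkt_tab')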
# Load Stopwords and Initialize Lemmatizer
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess URL data
def preprocess_url(url):
    url = url.lower()                        # Convert to lowercase
    url = re.sub(r'https?://', '', url)      # Remove http:// or https://
    url = re.sub(r'www\.', '', url)          # Remove www.
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)  # Replace special characters with spaces
    url = re.sub(r'\s+', ' ', url).strip()   # Collapse extra whitespace
    tokens = word_tokenize(url)                                  # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)

# Function to clean and preprocess HTML data
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)       # Strip HTML tags
    html = html.lower()                        # Convert to lowercase
    html = re.sub(r'https?://', '', html)      # Remove http:// or https://
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)  # Replace special characters with spaces
    html = re.sub(r'\s+', ' ', html).strip()   # Collapse extra whitespace
    tokens = word_tokenize(html)                                 # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)
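
# Illustrative example (not part of the app): the URL
# "https://www.example.com/secure-login?user=1" preprocesses to roughly
# "example com secure login user 1" after lowercasing, stripping the scheme,
# replacing special characters, tokenizing, and removing stopwords.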

# Load trained model
model = keras.models.load_model('new_phishing_detection_model.keras')
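# The saved model is assumed to be a two-branch network with separate inputs
# for the URL sequence and the HTML sequence, matching the
# [url_padded, html_padded] pair passed to model.predict() below.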

# Maximum sequence lengths (URL / HTML) and vocabulary size; these should match the training configuration
max_url_length = 180
max_html_length = 2000
max_words = 10000

# Load the fitted tokenizers
with open('url_tokenizer.pkl', 'rb') as file:
    url_tokenizer = pickle.load(file)
with open('html_tokenizer.pkl', 'rb') as file:
    html_tokenizer = pickle.load(file)
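# (For reference, these pickles are assumed to have been produced at training
#  time with something like:
#      url_tokenizer = Tokenizer(num_words=max_words)
#      url_tokenizer.fit_on_texts(train_urls)
#      pickle.dump(url_tokenizer, open('url_tokenizer.pkl', 'wb'))
#  where train_urls is a hypothetical list of preprocessed training URLs.)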

# Define the prediction function
def predict_phishing(url, html):
    cleaned_url = preprocess_url(url)
    cleaned_html = preprocess_html(html)
    new_url_sequences = url_tokenizer.texts_to_sequences([cleaned_url])
    new_url_padded = pad_sequences(new_url_sequences, maxlen=max_url_length, padding='post', truncating='post')
    new_html_sequences = html_tokenizer.texts_to_sequences([cleaned_html])
    new_html_padded = pad_sequences(new_html_sequences, maxlen=max_html_length, padding='post', truncating='post')
    new_predictions_prob = model.predict([new_url_padded, new_html_padded])
    new_predictions = (new_predictions_prob > 0.5).astype(int)
    predicted_category = "Phishing" if new_predictions[0][0] == 1 else "Legitimate"
    predicted_probability = f"{new_predictions_prob[0][0]:.4f}"
    return predicted_category, predicted_probability
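
# Quick sanity check outside the UI (hypothetical inputs):
#     category, prob = predict_phishing("http://example.com/login",
#                                        "<html><body>Please verify your account</body></html>")
#     print(category, prob)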

# Create Gradio Interface
interface = gr.Interface(
    fn=predict_phishing,
    inputs=[
        gr.components.Textbox(label="URL"),
        gr.components.Textbox(label="HTML Snippet")
    ],
    outputs=[
        gr.components.Textbox(label="Predicted Category"),
        gr.components.Textbox(label="Predicted Probability")
    ],
    title="Phishing Detection Model",
    description="Enter a URL and its HTML content to predict whether it is phishing or legitimate."
)

# Launch the Gradio interface
interface.launch()
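# On Hugging Face Spaces the app is served automatically; when running locally,
# a temporary public link can be requested with interface.launch(share=True).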