import gradio as gr
import nltk
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
# Ensure necessary NLTK resources are downloaded
def _ensure_nltk_resources():
    """Download any NLTK data this module uses that is not installed yet.

    Each resource is probed by actually using it. NLTK raises ``LookupError``
    when the backing data is missing, and only then are the associated
    packages requested, so nothing is downloaded when the data is already
    installed.
    """
    probes = (
        (lambda: stopwords.words('english'), ('stopwords',)),
        # Depending on the NLTK release, word_tokenize reads its model from
        # either 'punkt' or 'punkt_tab', so request both when the probe fails.
        (lambda: word_tokenize('probe'), ('punkt', 'punkt_tab')),
        # Some NLTK releases also need 'omw-1.4' before WordNet will load.
        (lambda: WordNetLemmatizer().lemmatize('probe'), ('wordnet', 'omw-1.4')),
    )
    for probe, packages in probes:
        try:
            probe()
        except LookupError:
            for package in packages:
                # nltk.download does not raise on failure by default, so an
                # offline start still surfaces the original LookupError at
                # the first real use of the missing resource.
                nltk.download(package, quiet=True)


_ensure_nltk_resources()

# Load Stopwords and Initialize Lemmatizer
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Function to clean and preprocess URL data
def preprocess_url(url):
    """Turn a URL into a space-separated string of lemmatized tokens.

    The URL is lowercased, ``http://``/``https://`` and ``www.`` are removed,
    every non-alphanumeric character becomes a space, and whitespace runs are
    collapsed. The result is tokenized with ``word_tokenize``; tokens found
    in ``STOPWORDS`` are dropped and the rest are lemmatized.

    Args:
        url: The URL text to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    substitutions = (
        (r'https?://', ''),       # Remove http or https
        (r'www\.', ''),           # Remove www
        (r'[^a-zA-Z0-9]', ' '),   # Remove special characters
        (r'\s+', ' '),            # Remove extra spaces
    )
    text = url.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    # Stopwords are filtered before lemmatization, exactly one pass per token.
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in STOPWORDS
    )
    return ' '.join(lemmas)
# Function to clean and preprocess HTML data
def preprocess_html(html):
    """Turn an HTML snippet into a space-separated string of lemmatized tokens.

    Tags are replaced by spaces first, then the text is lowercased,
    ``http://``/``https://`` prefixes are removed, non-alphanumeric characters
    become spaces, and whitespace runs are collapsed. The result is tokenized
    with ``word_tokenize``; tokens found in ``STOPWORDS`` are dropped and the
    rest are lemmatized.

    Args:
        html: The HTML text to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', html)
    lowered = without_tags.lower()
    without_scheme = re.sub(r'https?://', '', lowered)
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', without_scheme)
    collapsed = re.sub(r'\s+', ' ', alnum_only).strip()

    kept = []
    for token in word_tokenize(collapsed):
        if token in STOPWORDS:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)
def _load_pickle(path):
    """Return the object unpickled from the file at *path*.

    NOTE: unpickling can execute arbitrary code, so these artifact files must
    come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load trained model
model = keras.models.load_model('new_phishing_detection_model.keras')

# Define maximum length and number of words
max_url_length = 180
max_html_length = 2000
max_words = 10000

# Load the fitted tokenizers
url_tokenizer = _load_pickle('url_tokenizer.pkl')
html_tokenizer = _load_pickle('html_tokenizer.pkl')

# Load the label encoder
label_encoder = _load_pickle('label_encoder.pkl')
# Define the prediction function
def predict_phishing(url, html):
    """Classify a URL/HTML pair with the loaded model.

    Both inputs are cleaned, converted to integer sequences with their
    respective tokenizers, padded/truncated at the end to ``max_url_length``
    and ``max_html_length``, and passed to ``model.predict`` together.

    Args:
        url: URL text; ``None`` (or any falsy value) is treated as ''.
        html: HTML text; ``None`` (or any falsy value) is treated as ''.

    Returns:
        A ``(category, probability)`` tuple: the label decoded by
        ``label_encoder`` (capitalized) and the first model output formatted
        as a string with four decimal places.
    """
    # An empty UI field may arrive as None; the preprocessors need a str.
    cleaned_url = preprocess_url(url or '')
    cleaned_html = preprocess_html(html or '')

    url_padded = pad_sequences(
        url_tokenizer.texts_to_sequences([cleaned_url]),
        maxlen=max_url_length,
        padding='post',
        truncating='post',
    )
    html_padded = pad_sequences(
        html_tokenizer.texts_to_sequences([cleaned_html]),
        maxlen=max_html_length,
        padding='post',
        truncating='post',
    )

    probabilities = model.predict([url_padded, html_padded])

    # Adjust threshold if needed
    # Flatten before decoding: the thresholded output keeps the model's 2-D
    # shape (presumably (1, 1), as the [0][0] indexing below implies), while
    # scikit-learn's LabelEncoder.inverse_transform expects a 1-D array and
    # would otherwise emit a DataConversionWarning on every call.
    predicted_labels = (probabilities > 0.6).astype(int).ravel()
    predicted_category = label_encoder.inverse_transform(predicted_labels)[0]
    predicted_probability = f"{probabilities[0][0]:.4f}"
    return predicted_category.capitalize(), predicted_probability
# Create Gradio Interface
interface = gr.Interface(
gr.components.Textbox(label="HTML Snippet", lines=10, placeholder="Paste HTML content here")
gr.components.Textbox(label="Predicted Category"),
gr.components.Textbox(label="Predicted Probability")
title="Phishing Detection Model",
description="Enter a URL and its HTML content to predict if it's spam or legitimate. It's recommended to provide both for accurate results.",
.interface-container {
border: 2px solid #4CAF50;
border-radius: 10px;
padding: 20px;
text-align: center;
.gr-textbox, .gr-textbox textarea, .gr-button {
margin-left: auto !important;
margin-right: auto !important;
# Footer text
footer = gr.Markdown("""
<div style="text-align: center;">
Made with ❤️ by Ramadhirra<br>
Model by Ramadhirra<br>
WebUI by Ramadhirra
# Combine the interface and footer
app = gr.Blocks()
with app:
# Launch the Gradio interface