import gradio as gr
import nltk
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load stopwords and initialize lemmatizer
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess URL data
def preprocess_url(url):
    url = url.lower()                        # Convert to lowercase
    url = re.sub(r'https?://', '', url)      # Remove http or https
    url = re.sub(r'www\.', '', url)          # Remove www
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)  # Remove special characters
    url = re.sub(r'\s+', ' ', url).strip()   # Remove extra spaces
    tokens = word_tokenize(url)              # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)

# Function to clean and preprocess HTML data
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)       # Remove HTML tags
    html = html.lower()                        # Convert to lowercase
    html = re.sub(r'https?://', '', html)      # Remove http or https
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)  # Remove special characters
    html = re.sub(r'\s+', ' ', html).strip()   # Remove extra spaces
    tokens = word_tokenize(html)               # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)

# Load trained model
model = keras.models.load_model('new_phishing_detection_model.keras')

# Define maximum sequence lengths and vocabulary size
max_url_length = 180
max_html_length = 2000
max_words = 10000

# Load the fitted tokenizers
with open('url_tokenizer.pkl', 'rb') as file:
    url_tokenizer = pickle.load(file)
with open('html_tokenizer.pkl', 'rb') as file:
    html_tokenizer = pickle.load(file)

# Load the label encoder
with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)

# Define the prediction function
def predict_phishing(url, html):
    cleaned_url = preprocess_url(url)
    cleaned_html = preprocess_html(html)

    new_url_sequences = url_tokenizer.texts_to_sequences([cleaned_url])
    new_url_padded = pad_sequences(new_url_sequences, maxlen=max_url_length,
                                   padding='post', truncating='post')

    new_html_sequences = html_tokenizer.texts_to_sequences([cleaned_html])
    new_html_padded = pad_sequences(new_html_sequences, maxlen=max_html_length,
                                    padding='post', truncating='post')

    new_predictions_prob = model.predict([new_url_padded, new_html_padded])
    new_predictions = (new_predictions_prob > 0.6).astype(int)  # Adjust threshold if needed

    # Flatten to 1-D before inverse_transform to avoid a shape warning
    predicted_category = label_encoder.inverse_transform(new_predictions.ravel())[0]
    predicted_probability = f"{new_predictions_prob[0][0]:.4f}"
    return predicted_category.capitalize(), predicted_probability

# Create Gradio interface
interface = gr.Interface(
    fn=predict_phishing,
    inputs=[
        gr.components.Textbox(label="URL"),
        gr.components.Textbox(label="HTML Snippet", lines=10, placeholder="Paste HTML content here")
    ],
    outputs=[
        gr.components.Textbox(label="Predicted Category"),
        gr.components.Textbox(label="Predicted Probability")
    ],
    title="Phishing Detection Model",
    description="Enter a URL and its HTML content to predict whether it is phishing or legitimate. "
                "It's recommended to provide both for accurate results.",
    live=True,
    css="""
    .interface-container {
        border: 2px solid #4CAF50;
        border-radius: 10px;
        padding: 20px;
        text-align: center;
    }
    .gr-textbox, .gr-textbox textarea, .gr-button {
        margin-left: auto !important;
        margin-right: auto !important;
    }
    """
)

# Footer text
footer = gr.Markdown("""
---