import gradio as gr
import nltk
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
import pandas as pd

# Ensure necessary NLTK resources are downloaded.
# Newer NLTK releases (3.9+) make word_tokenize() load the pickle-free
# 'punkt_tab' tables instead of the legacy 'punkt' models, so both are fetched:
# 'punkt' keeps older NLTK installs working, 'punkt_tab' keeps newer ones working.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Load Stopwords and Initialize Lemmatizer
# A set gives O(1) membership tests when filtering tokens.
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess URL data
def preprocess_url(url):
    """Turn a raw URL into a space-separated string of cleaned tokens.

    The URL is lowercased, every ``http://`` / ``https://`` and ``www.``
    occurrence is removed (anywhere in the string, not only at the start),
    all non-alphanumeric characters become spaces, and runs of whitespace
    are collapsed. The result is tokenized with ``word_tokenize``; English
    stopwords are dropped and the remaining tokens are lemmatized.

    Args:
        url: The URL text to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Substitutions are applied in this exact order; each one feeds the next.
    substitutions = (
        (r'https?://', ''),       # scheme
        (r'www\.', ''),           # "www." prefix
        (r'[^a-zA-Z0-9]', ' '),   # anything that is not a letter or digit
        (r'\s+', ' '),            # collapse whitespace runs
    )
    text = url.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in STOPWORDS
    )

# Function to clean and preprocess HTML data
def preprocess_html(html):
    """Turn an HTML snippet into a space-separated string of cleaned tokens.

    Tags (``<...>``) are replaced by spaces before lowercasing, then every
    ``http://`` / ``https://`` occurrence is removed, non-alphanumeric
    characters become spaces, and whitespace runs are collapsed. The
    remaining text is tokenized with ``word_tokenize``; English stopwords
    are dropped and the surviving tokens are lemmatized.

    Args:
        html: The HTML text to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Strip markup first so tag names and attributes never reach the tokenizer.
    text = re.sub(r'<[^>]+>', ' ', html).lower()

    # Remaining clean-up steps, applied in order.
    for pattern, replacement in (
        (r'https?://', ''),       # scheme
        (r'[^a-zA-Z0-9]', ' '),   # anything that is not a letter or digit
        (r'\s+', ' '),            # collapse whitespace runs
    ):
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    kept = []
    for token in word_tokenize(text):
        if token in STOPWORDS:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Load trained model
model = keras.models.load_model('new_phishing_detection_model.keras')

# Padding lengths for the URL and HTML sequences, plus the vocabulary size.
# NOTE(review): max_words is not referenced anywhere else in this section.
max_url_length, max_html_length = 180, 2000
max_words = 10000


def _load_pickle(path):
    """Return the object unpickled from the binary file at *path*."""
    # pickle.load can execute arbitrary code embedded in the file, so the
    # tokenizer files must come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the fitted tokenizers
url_tokenizer = _load_pickle('url_tokenizer.pkl')
html_tokenizer = _load_pickle('html_tokenizer.pkl')

# Define the prediction function
def predict_phishing(url, html):
    """Classify a URL / HTML pair with the loaded model.

    Both inputs are cleaned with their dedicated preprocessors, converted to
    integer sequences by the matching fitted tokenizer, and post-padded /
    post-truncated to ``max_url_length`` and ``max_html_length``. The model
    receives the two padded batches (URL first, HTML second).

    Args:
        url: Raw URL text.
        html: Raw HTML text associated with the URL.

    Returns:
        A ``(category, probability)`` tuple of strings: ``category`` is
        ``"Spam"`` when the model's score exceeds 0.5 and ``"Legitimate"``
        otherwise; ``probability`` is that score formatted to four decimals.
    """
    url_text = preprocess_url(url)
    html_text = preprocess_html(html)

    # Each input is wrapped in a one-element list: a batch of size 1.
    url_batch = pad_sequences(
        url_tokenizer.texts_to_sequences([url_text]),
        maxlen=max_url_length,
        padding='post',
        truncating='post',
    )
    html_batch = pad_sequences(
        html_tokenizer.texts_to_sequences([html_text]),
        maxlen=max_html_length,
        padding='post',
        truncating='post',
    )

    # First row, first column of the prediction is the score for this sample.
    score = model.predict([url_batch, html_batch])[0][0]

    if score > 0.5:
        label = "Spam"
    else:
        label = "Legitimate"
    return label, f"{score:.4f}"

# Create Gradio Interface
# Input widgets, in the order predict_phishing expects its arguments.
url_input = gr.components.Textbox(label="URL")
html_input = gr.components.Textbox(label="HTML Snippet")

# Output widgets, in the order predict_phishing returns its values.
category_output = gr.components.Textbox(label="Predicted Category")
probability_output = gr.components.Textbox(label="Predicted Probability")

interface = gr.Interface(
    fn=predict_phishing,
    inputs=[url_input, html_input],
    outputs=[category_output, probability_output],
    title="Phishing Detection Model",
    description="Enter a URL and its HTML content to predict if it's spam or legitimate.",
)

# Launch the Gradio interface
interface.launch()