"""Fine-tune ``bert-large-uncased`` as a binary phishing classifier.

Pipeline: load the ``ealvaradob/phishing-dataset`` ("combined_reduced"
configuration), split it 80/20, tokenize the ``text`` column, fine-tune a
two-label sequence-classification head with the Hugging Face ``Trainer``,
save model and tokenizer to ``./phishing_model``, then reload them and
classify one sample message.
"""

from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Step 1: Load Dataset
dataset = load_dataset(
    "ealvaradob/phishing-dataset",
    "combined_reduced",
    trust_remote_code=True,
)

# Step 2: Convert to Pandas and Split
# A fixed random_state keeps the 80/20 split reproducible between runs.
df = dataset["train"].to_pandas()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 3: Convert Back to Hugging Face Dataset
# preserve_index=False keeps the shuffled pandas index from becoming a column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Step 4: Tokenizer Initialization
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")


# Step 5: Preprocess Function
def preprocess_data(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded rows have the same length.

    Args:
        examples: Batch passed in by ``Dataset.map(..., batched=True)``;
            must expose a ``'text'`` entry.

    Returns:
        The tokenizer's encoding for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Step 6: Tokenize the Dataset
tokenized_train = train_dataset.map(preprocess_data, batched=True)
tokenized_test = test_dataset.map(preprocess_data, batched=True)

# Remove unused columns and set format for PyTorch
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

# Step 7: Model Initialization
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-large-uncased",
    num_labels=2,
)

# Step 8: Training Arguments
training_args = TrainingArguments(
    # Explicit checkpoint directory: older transformers releases require
    # output_dir, and keeping it separate from ./phishing_model avoids mixing
    # per-epoch checkpoints with the final exported model.
    output_dir="./phishing_model_checkpoints",
    # NOTE(review): newer transformers releases rename this keyword to
    # `eval_strategy` -- switch if the installed version rejects it.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_steps=10,
)

# Step 9: Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Step 10: Train the Model
trainer.train()

# Step 11: Save the Model
# The tokenizer is saved next to the weights so the directory can be reloaded
# on its own for inference.
model.save_pretrained("./phishing_model")
tokenizer.save_pretrained("./phishing_model")

# Step 12: Inference Example
# Load the saved model for inference
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_model")
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_model")

# Example input
text = "Your account has been compromised, please reset your password now!"
inputs = loaded_tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)

# Run inference
loaded_model.eval()
with torch.no_grad():  # no gradients are needed for a forward-only pass
    outputs = loaded_model(**inputs)

# The predicted class is the index of the largest logit.
prediction = torch.argmax(outputs.logits, dim=-1).item()
print(f"Predicted label: {prediction}")  # 0 = non-phishing, 1 = phishing