# Fine-tune bert-large-uncased for binary phishing detection (0 = non-phishing,
# 1 = phishing) on the ealvaradob/phishing-dataset "combined_reduced" config,
# then save the model and run a single-sentence inference example.
# Third-party dependencies: HF datasets/transformers for data and model,
# scikit-learn for the train/test split, torch for inference.
import torch

from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Step 1: Load Dataset
# trust_remote_code is required because this dataset ships its own loading script.
dataset = load_dataset("ealvaradob/phishing-dataset", "combined_reduced", trust_remote_code=True)

# Step 2: Convert to Pandas and Split (80/20; fixed seed for reproducibility)
df = dataset['train'].to_pandas()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 3: Convert Back to Hugging Face Dataset
# preserve_index=False drops the pandas index so it doesn't become a column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
# Step 4: Tokenizer Initialization
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

# Step 5: Preprocess Function
def preprocess_data(examples):
    """Tokenize a batch of examples to fixed-length (512-token) inputs.

    `examples` is the batched dict handed in by `Dataset.map(batched=True)`;
    the raw text lives in this dataset's 'text' column. Truncation and
    max-length padding give uniform tensors so no collator padding is needed.
    """
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
# Step 6: Tokenize the Dataset (batched=True tokenizes many rows per call)
tokenized_train = train_dataset.map(preprocess_data, batched=True)
tokenized_test = test_dataset.map(preprocess_data, batched=True)

# Drop the raw 'text' column — the model consumes only token ids / masks —
# and expose the remaining columns as PyTorch tensors for the Trainer.
tokenized_train = tokenized_train.remove_columns(['text'])
tokenized_test = tokenized_test.remove_columns(['text'])
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")
# Step 7: Model Initialization
# Adds a fresh 2-way classification head on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=2)

# Step 8: Training Arguments
# Fix: `output_dir` (checkpoint/output destination) is a required argument of
# TrainingArguments; the original omitted it, so construction would fail.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",     # evaluate on the test split once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,               # L2-style regularization via AdamW
    save_strategy="epoch",           # checkpoint once per epoch
    logging_steps=10,
)
# Step 9: Trainer Setup
# No data_collator is needed: preprocessing already pads every row to
# max_length, so default batching produces uniform tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Step 10: Train the Model
trainer.train()
# Step 11: Save the Model
# Write model weights/config and tokenizer files to one directory so the
# pair can be reloaded together with from_pretrained.
model.save_pretrained("./phishing_model")
tokenizer.save_pretrained("./phishing_model")
# Step 12: Inference Example
# Reload the fine-tuned artifacts from disk, mimicking a fresh deployment.
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_model")
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_model")

# Example input
text = "Your account has been compromised, please reset your password now!"
inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Run inference: eval() disables dropout; no_grad() skips gradient tracking.
loaded_model.eval()
with torch.no_grad():
    outputs = loaded_model(**inputs)
# argmax over the 2 logits picks the predicted class index.
prediction = torch.argmax(outputs.logits, dim=-1).item()
print(f"Predicted label: {prediction}")  # 0 = non-phishing, 1 = phishing