Spaces:
Sleeping
Sleeping
File size: 2,752 Bytes
7730772 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import inspect

import torch
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
# Step 1: Load Dataset
dataset = load_dataset(
    "ealvaradob/phishing-dataset",
    "combined_reduced",
    trust_remote_code=True,
)

# Step 2: Convert to Pandas and Split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
df = dataset["train"].to_pandas()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 3: Convert Back to Hugging Face Dataset
# preserve_index=False: do not carry the pandas index over as a column.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, test_df)
)
# Step 4: Tokenizer Initialization
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")


# Step 5: Preprocess Function
def preprocess_data(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    Sequences longer than 512 tokens are truncated and shorter ones are
    padded up to 512, so every encoded example has the same length.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw text
            (a batch of rows when used with ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer's output for ``examples['text']``.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)
# Step 6: Tokenize the Dataset
# Each split goes through the same pipeline: tokenize in batches, drop the
# raw 'text' column that is no longer needed, and return PyTorch tensors.
tokenized_train = (
    train_dataset.map(preprocess_data, batched=True)
    .remove_columns(["text"])
    .with_format("torch")
)
tokenized_test = (
    test_dataset.map(preprocess_data, batched=True)
    .remove_columns(["text"])
    .with_format("torch")
)
# Step 7: Model Initialization
# Load the pretrained "bert-large-uncased" checkpoint with a
# sequence-classification head configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-large-uncased",
    num_labels=2,
)
# Step 8: Training Arguments
# transformers renamed `evaluation_strategy` to `eval_strategy`, and recent
# releases reject the old keyword. Use whichever name the installed version
# accepts so the script runs on both sides of the rename.
_accepted_training_args = inspect.signature(TrainingArguments).parameters
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in _accepted_training_args
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    # Directory for checkpoints written during training. Releases where
    # this argument is required raise a TypeError when it is omitted; the
    # name mirrors the default recent releases use when it is left out.
    output_dir="trainer_output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Save a checkpoint once per epoch, on the same schedule as evaluation.
    save_strategy="epoch",
    # Log training metrics every 10 optimizer steps.
    logging_steps=10,
    # Evaluate on the eval dataset once per epoch.
    **{_eval_strategy_arg: "epoch"},
)
# Step 9: Trainer Setup
# The held-out split is passed as the evaluation set for the training run.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_test,
}
trainer = Trainer(**trainer_kwargs)

# Step 10: Train the Model
trainer.train()
# Step 11: Save the Model
# Model and tokenizer are written to the same directory so both can be
# reloaded from a single path.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./phishing_model")
# Step 12: Inference Example
# Reload tokenizer and model from the directory they were saved to, and put
# the model in evaluation mode before running it.
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_model")
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_model")
loaded_model.eval()

# Example input
text = "Your account has been compromised, please reset your password now!"
inputs = loaded_tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)

# Run inference without tracking gradients; the predicted class is the
# index of the largest logit.
with torch.no_grad():
    outputs = loaded_model(**inputs)
prediction = outputs.logits.argmax(dim=-1).item()

print(f"Predicted label: {prediction}")  # 0 = non-phishing, 1 = phishing
|