alwinvargheset@outlook.com committed on
Commit
7730772
·
1 Parent(s): c63277b

added_model

Browse files
README.md DELETED
@@ -1,13 +0,0 @@
1
- ---
2
- title: Phishing Email Detector
3
- emoji: 👀
4
- colorFrom: indigo
5
- colorTo: blue
6
- sdk: streamlit
7
- sdk_version: 1.40.2
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import imaplib
3
+ import email
4
+ from email.header import decode_header
5
+ import torch
6
+ from transformers import BertTokenizer, BertForSequenceClassification
7
+ import re
8
+
9
class EmailProcessor:
    """Helpers for fetching recent IMAP messages and normalising their text."""

    @staticmethod
    def decode_email_content(content, default_charset='utf-8'):
        """Return ``content`` as ``str``, decoding bytes defensively.

        Bytes are decoded with ``default_charset`` first. If the bytes are not
        valid in that charset, or the charset name is unknown to Python, the
        bytes are decoded as ISO-8859-1 instead, which maps every byte value
        and therefore cannot fail. Non-bytes values are converted with ``str``.
        """
        if not isinstance(content, bytes):
            return str(content)
        try:
            return content.decode(default_charset)
        except (UnicodeDecodeError, LookupError):
            # LookupError covers charset labels Python has no codec for.
            return content.decode('iso-8859-1')

    @staticmethod
    def clean_text(text):
        """Remove ``<...>`` tags and collapse whitespace runs to one space."""
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def _decode_subject(raw_subject):
        """Decode a raw Subject header (possibly absent) into one string."""
        if raw_subject is None:
            return "(no subject)"
        fragments = []
        # A header may consist of several encoded words, each with its own charset.
        for value, charset in decode_header(raw_subject):
            if isinstance(value, bytes):
                value = EmailProcessor.decode_email_content(value, charset or 'utf-8')
            fragments.append(value)
        return ''.join(fragments)

    @staticmethod
    def _extract_body(message):
        """Concatenate the decoded payloads that make up the message text."""
        if message.is_multipart():
            parts = [
                part for part in message.walk()
                if part.get_content_type() == "text/plain"
            ]
        else:
            parts = [message]

        chunks = []
        for part in parts:
            payload = part.get_payload(decode=True)
            if payload:
                charset = part.get_content_charset() or 'utf-8'
                chunks.append(EmailProcessor.decode_email_content(payload, charset))
        return ''.join(chunks)

    @staticmethod
    def get_emails(email_address, password, imap_server, imap_port):
        """Fetch the last five messages listed by an ``ALL`` search of INBOX.

        Args:
            email_address: Login name passed to the IMAP server.
            password: Password (or app password) for the account.
            imap_server: Host name of the IMAP server.
            imap_port: TLS port of the IMAP server.

        Returns:
            ``(emails, None)`` on success, where ``emails`` is a list of dicts
            with ``'subject'`` and ``'content'`` keys, or ``(None, message)``
            when any step fails, ``message`` being the exception text.
        """
        try:
            # The context manager logs out even if a later step raises, so the
            # connection is not left open on errors.
            with imaplib.IMAP4_SSL(imap_server, int(imap_port)) as imap:
                imap.login(email_address, password)
                imap.select('INBOX')

                _, message_numbers = imap.search(None, 'ALL')

                emails = []
                for num in message_numbers[0].split()[-5:]:
                    _, msg_data = imap.fetch(num, '(RFC822)')
                    # Skip responses that do not carry a (header, body) tuple.
                    if not msg_data or not isinstance(msg_data[0], tuple):
                        continue
                    message = email.message_from_bytes(msg_data[0][1])

                    emails.append({
                        'subject': EmailProcessor._decode_subject(message["subject"]),
                        'content': EmailProcessor.clean_text(
                            EmailProcessor._extract_body(message)
                        ),
                    })

                imap.close()
            return emails, None

        except Exception as e:
            # Boundary for the UI: report the failure as text instead of raising.
            return None, str(e)
74
+
75
class PhishingDetector:
    """Scores text with a BERT sequence-classification model loaded from disk."""

    def __init__(self, model_path="./phishing_model"):
        """Load tokenizer and model from ``model_path`` and put the model in eval mode."""
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        classifier = BertForSequenceClassification.from_pretrained(
            model_path,
            num_labels=2
        )
        self.model = classifier.to(self.device)
        self.model.eval()

    def predict(self, text):
        """Return the softmax probability of class index 1 for ``text``.

        The text is cleaned with ``EmailProcessor.clean_text`` and truncated
        to 512 tokens before it is fed to the model. Gradients are disabled
        for the whole call.
        """
        with torch.no_grad():
            encoded = self.tokenizer(
                EmailProcessor.clean_text(text),
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            )
            # Move every input tensor to the same device as the model.
            on_device = {name: tensor.to(self.device) for name, tensor in encoded.items()}
            logits = self.model(**on_device).logits
            class_probs = torch.softmax(logits, dim=1)
            return class_probs[0][1].item()
100
+
101
# Page header
st.title("📧 Email Phishing Detector")
st.write("Connect your email account to analyze messages for potential phishing attempts.")

# Connection settings, collected through widgets placed in the sidebar
sidebar = st.sidebar
sidebar.header("Email Settings")
email_address = sidebar.text_input("Email Address", key="email_address_input")
password = sidebar.text_input("Password", type="password", key="password_input")
imap_server = sidebar.text_input("IMAP Server", value="imap.gmail.com", key="imap_server_input")
imap_port = sidebar.number_input("IMAP Port", value=993, key="imap_port_input")
112
+
113
# The detector is built through st.cache_resource so the model is not
# re-created on every script rerun.
@st.cache_resource
def load_detector():
    """Build the PhishingDetector with its default model path."""
    return PhishingDetector()


# model_loaded records whether `detector` is available to the sections below.
try:
    detector = load_detector()
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    model_loaded = False
else:
    model_loaded = True
124
+
125
# Manual text analysis: score arbitrary pasted text with the loaded detector.
st.markdown("### 📝 Manual Text Analysis")
manual_text = st.text_area("Enter text to analyze:", height=100, key="manual_text_input")
# `detector` only exists when the model loaded successfully; checking
# model_loaded first keeps a failed load from surfacing as a NameError when
# the button is clicked (the load error itself is already shown above).
if model_loaded and st.button("Analyze Text", key="analyze_text_btn") and manual_text.strip():
    with st.spinner("Analyzing text..."):
        phishing_score = detector.predict(manual_text)
        # Scores above 0.5 are coloured red, everything else green.
        risk_color = "red" if phishing_score > 0.5 else "green"
        st.markdown(f"**Phishing Risk Score:** <span style='color:{risk_color}'>{phishing_score:.2%}</span>", unsafe_allow_html=True)

        if phishing_score > 0.8:
            st.error("⚠️ High Risk: This text shows strong indicators of being a phishing attempt!")
        elif phishing_score > 0.5:
            st.warning("⚠️ Medium Risk: This text shows some suspicious characteristics.")
        else:
            st.success("✅ Low Risk: This text appears to be legitimate.")
140
+
141
# Email analysis: fetch recent messages over IMAP and score each one.
st.markdown("### 📨 Email Analysis")
if model_loaded and st.button("Analyze Emails", key="analyze_emails_btn"):
    if not email_address or not password:
        st.warning("Please enter your email credentials.")
    else:
        with st.spinner("Connecting to email..."):
            emails, error = EmailProcessor.get_emails(email_address, password, imap_server, imap_port)

        # get_emails reports failure through a non-None message. Comparing
        # against None (rather than truthiness) keeps an exception whose text
        # is empty from being misreported as an empty inbox.
        if error is not None:
            st.error(f"Error connecting to email: {error}")
        elif emails:
            st.success("Successfully retrieved emails!")

            for i, email_data in enumerate(emails):
                with st.expander(f"Email {i+1}: {email_data['subject']}"):
                    phishing_score = detector.predict(email_data['content'])

                    # Scores above 0.5 are coloured red, everything else green.
                    risk_color = "red" if phishing_score > 0.5 else "green"
                    st.markdown(f"**Phishing Risk Score:** <span style='color:{risk_color}'>{phishing_score:.2%}</span>", unsafe_allow_html=True)

                    if phishing_score > 0.8:
                        st.error("⚠️ High Risk: This email shows strong indicators of being a phishing attempt!")
                    elif phishing_score > 0.5:
                        st.warning("⚠️ Medium Risk: This email shows some suspicious characteristics.")
                    else:
                        st.success("✅ Low Risk: This email appears to be legitimate.")

                    # Per-email widget key keeps the text areas distinct.
                    st.text_area("Email Content", email_data['content'], height=100, key=f"email_content_{i}")
        else:
            st.warning("No emails found in inbox.")
171
+
172
# Sidebar help: usage instructions followed by a short description of the app.
with st.sidebar:
    st.markdown("---")
    st.markdown("""
### Instructions
1. Enter your email credentials
2. For Gmail:
- Use an App Password instead of your regular password
- Enable 2FA and generate an App Password from Google Account settings
3. Click "Analyze Emails" to scan your recent emails
""")

    st.markdown("---")
    st.markdown("""
### About
This application uses a BERT-based model to detect phishing attempts in emails.
You can either:
1. Analyze your emails directly by connecting your email account
2. Manually input text to analyze for phishing content
""")
phishing_model/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
phishing_model/README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # BERT FINETUNED ON PHISHING DETECTION
4
+
5
+ This model is a fine-tuned version of [bert-large-uncased](https://huggingface.co/bert-large-uncased) on a [phishing dataset](https://huggingface.co/datasets/ealvaradob/phishing-dataset),
6
+ capable of detecting phishing in its four most common forms: URLs, Emails, SMS messages and even websites.
7
+
8
+ It achieves the following results on the evaluation set:
9
+
10
+ - Loss: 0.1953
11
+ - Accuracy: 0.9717
12
+ - Precision: 0.9658
13
+ - Recall: 0.9670
14
+ - False Positive Rate: 0.0249
15
+
16
+ ## Model description
17
+
18
+ BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion.
19
+ This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why
20
+ it can use lots of publicly available data) with an automatic process to generate inputs and labels from
21
+ those texts.
22
+
23
+
24
+
25
+ ## Motivation and Purpose
26
+
27
+ Phishing is one of the most frequent and most expensive cyber-attacks according to several security reports.
28
+ This model aims to efficiently and accurately prevent phishing attacks against individuals and organizations.
29
+ To achieve it, BERT was trained on a diverse and robust dataset containing: URLs, SMS Messages, Emails and
30
+ Websites, which allows the model to extend its detection capability beyond the usual and to be used in various
31
+ contexts.
32
+
33
+
34
+ ### Training results
35
+
36
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy | Precision | Recall | False Positive Rate |
37
+ |:-------------:|:-----:|:-----:|:---------------:|:--------:|:---------:|:------:|:-------------------:|
38
+ | 0.1487 | 1.0 | 3866 | 0.1454 | 0.9596 | 0.9709 | 0.9320 | 0.0203 |
39
+ | 0.0805 | 2.0 | 7732 | 0.1389 | 0.9691 | 0.9663 | 0.9601 | 0.0243 |
40
+ | 0.0389 | 3.0 | 11598 | 0.1779 | 0.9683 | 0.9778 | 0.9461 | 0.0156 |
41
+ | 0.0091 | 4.0 | 15464 | 0.1953 | 0.9717 | 0.9658 | 0.9670 | 0.0249 |
42
+
43
+
phishing_model/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bert-large-uncased",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "id2label": {
13
+ "0": "benign",
14
+ "1": "phishing"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 4096,
18
+ "label2id": {
19
+ "benign": 0,
20
+ "phishing": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 16,
26
+ "num_hidden_layers": 24,
27
+ "pad_token_id": 0,
28
+ "position_embedding_type": "absolute",
29
+ "problem_type": "single_label_classification",
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.34.1",
32
+ "type_vocab_size": 2,
33
+ "use_cache": true,
34
+ "vocab_size": 30522
35
+ }
phishing_model/gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
phishing_model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7fc8fd8ff9eb431b5876bff2e94d0ba31987fc2301942b65d1306eba9d18646
3
+ size 1340710638
phishing_model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
phishing_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
phishing_model/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "BertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
phishing_model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d104fd966c5439370d740371ebeae1a9b747a93c604762957f98ecfeec61108
3
+ size 4536
phishing_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ datasets
4
+ scikit-learn
5
+ streamlit
6
+ tqdm
7
+ email-validator
8
+ regex>=2023.5.5
train.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset, Dataset
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
3
+ from sklearn.model_selection import train_test_split
4
+ import torch
5
+
6
# Fine-tunes bert-large-uncased as a two-class classifier on the phishing
# dataset, saves model and tokenizer to ./phishing_model, then runs one sample
# prediction with the saved artefacts.

# Step 1: Load Dataset
# NOTE(review): trust_remote_code=True runs the dataset repository's loading
# script on this machine; keep it only for sources you trust.
dataset = load_dataset("ealvaradob/phishing-dataset", "combined_reduced", trust_remote_code=True)

# Step 2: Convert to Pandas and Split (80/20, fixed seed for reproducibility)
df = dataset['train'].to_pandas()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 3: Convert Back to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Step 4: Tokenizer Initialization
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")


# Step 5: Preprocess Function
def preprocess_data(examples):
    """Tokenize the 'text' column, padding/truncating every row to 512 tokens."""
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)


# Step 6: Tokenize the Dataset
tokenized_train = train_dataset.map(preprocess_data, batched=True)
tokenized_test = test_dataset.map(preprocess_data, batched=True)

# Remove unused columns and set format for PyTorch
tokenized_train = tokenized_train.remove_columns(['text'])
tokenized_test = tokenized_test.remove_columns(['text'])
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

# Step 7: Model Initialization
model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=2)

# Step 8: Training Arguments
# output_dir is given explicitly because older transformers releases require
# it; per-epoch checkpoints are written there.
training_kwargs = dict(
    output_dir="./phishing_model_checkpoints",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_steps=10,
)
# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy`; try the new name first and fall back to the old one so the
# script runs on either side of the rename.
try:
    training_args = TrainingArguments(eval_strategy="epoch", **training_kwargs)
except TypeError:
    training_args = TrainingArguments(evaluation_strategy="epoch", **training_kwargs)

# Step 9: Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Step 10: Train the Model
trainer.train()

# Step 11: Save the Model
model.save_pretrained("./phishing_model")
tokenizer.save_pretrained("./phishing_model")

# Step 12: Inference Example
# Load the saved model for inference
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_model")
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_model")

# Example input
text = "Your account has been compromised, please reset your password now!"
inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Run inference
loaded_model.eval()
with torch.no_grad():
    outputs = loaded_model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()

print(f"Predicted label: {prediction}")  # 0 = non-phishing, 1 = phishing