File size: 1,536 Bytes
92c34be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#import fireducks.pandas as pd
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

class DataProcessor:
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
        self.label_encoders = {}
        self.scaler = StandardScaler()

    def preprocess_data(self):
        for column in ['IP', 'Hostnames', 'OS']:
            self.label_encoders[column] = LabelEncoder()
            self.data[column] = self.label_encoders[column].fit_transform(self.data[column].astype(str))

        self.data['Port'] = self.scaler.fit_transform(self.data[['Port']])
        self.data['Timestamp'] = pd.to_datetime(self.data['Timestamp']).astype(int) / 10**9
        return self.data

    def get_features_and_labels(self, label_column='Anomaly'):
        if label_column not in self.data.columns:
            raise ValueError(f"Label column '{label_column}' not found in data.")

        X = self.data.drop([label_column], axis=1)
        y = self.data[label_column]
        return X, y

    def split_data(self, X, y, test_size=0.2, random_state=42):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return X_train, X_test, y_train, y_test

# # Example usage:
# processor = DataProcessor('shodan_scan_results.csv')
# processed_data = processor.preprocess_data()
# X, y = processor.get_features_and_labels()
# X_train, X_test, y_train, y_test = processor.split_data(X, y)