import os
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from nt_gan import GAN
from nt_gg import GG
dataset_directory = 'datasets'
saved_models_path = 'outputs'
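
# Expected layout (an assumption inferred from the paths used below): input .arff
# files live under ./datasets, and all models, plots and CSV results go to ./outputs.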


def prepare_architecture(arff_data_path):
    """
    Prepares everything the GAN needs from an ARFF file: loads the data,
    integer-encodes the nominal attributes and min-max scales the features.
    :param arff_data_path: path to the .arff data file
    :return: the scaled data, the reverse meta-data mapping, the column names,
             the fitted scaler and the number of features
    """
    data, meta_data = arff.loadarff(arff_data_path)  # reads the arff file into a (data, meta) tuple
    df = pd.DataFrame(data)
    columns = df.columns
    transformed_data, x, x_scaled, meta_data_rev, min_max_scaler = create_scaled_data(df, meta_data)
    number_of_features = len(transformed_data.columns)  # defines the GAN input/output dimensionality
    return x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features
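
# Example call (a sketch; assumes ./datasets/adult.arff exists, as used by first_question below):
#   x_scaled, meta_data_rev, columns, min_max_scaler, n_features = prepare_architecture(
#       os.path.join(dataset_directory, 'adult.arff'))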


def create_scaled_data(df, meta_data):
    """
    Integer-encodes the nominal attributes and min-max scales all features.
    :param df: the raw data frame loaded from the arff file
    :param meta_data: the arff meta-data describing each attribute
    :return: the encoded data frame, its numpy array, the scaled array,
             the reverse code-to-value mapping and the fitted scaler
    """
    # Codes start from one and not zero because zero is reserved for NaN/unknown values
    meta_data_dict = {k: {a.replace(' ', ''): b + 1 for b, a in enumerate(v.values)}
                      for k, v in meta_data._attributes.items() if v.type_name != 'numeric'}
    meta_data_rev = {k: {b + 1: a.replace(' ', '') for b, a in enumerate(v.values)}
                     for k, v in meta_data._attributes.items() if v.type_name != 'numeric'}
    transformed_data = df.copy()
    for col in df.columns:
        if col in meta_data_dict:
            # Nominal values load as bytes (e.g. b'Male'), so str(x).split('\'')[1] recovers the raw
            # value; values missing from the meta-data are treated as NaN (code 0)
            transformed_data[col] = transformed_data[col].apply(
                lambda x: meta_data_dict[col][str(x).split('\'')[1]]
                if str(x).split('\'')[1] in meta_data_dict[col] else 0)
    x = transformed_data.values  # returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return transformed_data, x, x_scaled, meta_data_rev, min_max_scaler
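
# Illustrative encoding for the adult dataset's 'sex' attribute (the value order is an
# assumption about the arff header, shown only to make the two mappings concrete):
#   meta_data_dict['sex'] == {'Female': 1, 'Male': 2}
#   meta_data_rev['sex']  == {1: 'Female', 2: 'Male'}
# An unseen value encodes to 0, which re_scaled_data later decodes to NaN.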


def re_scaled_data(data, columns, meta_data_rev, min_max_scaler):
    """
    Re-scales the generated (fake) data back to the original format.
    :param data: the scaled data we want to bring back to the original feature space
    :param columns: the original column names
    :param meta_data_rev: mapping from integer codes back to nominal values
    :param min_max_scaler: the scaler fitted in create_scaled_data
    :return: a data frame in the original format
    """
    data_inv = min_max_scaler.inverse_transform(data)
    df = pd.DataFrame(data_inv, columns=columns)
    transformed_data = df.copy()
    for col in transformed_data.columns:
        if col in meta_data_rev:
            # Codes that round outside the mapping cannot be decoded, so we treat them as NaN
            transformed_data[col] = transformed_data[col].apply(
                lambda x: meta_data_rev[col][int(round(x))] if int(round(x)) in meta_data_rev[col] else np.nan)
    return transformed_data
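
# Minimal round-trip sketch (assumes ./datasets/adult.arff exists, as in the questions below):
#   data, meta = arff.loadarff(os.path.join(dataset_directory, 'adult.arff'))
#   transformed, x, x_scaled, rev, scaler = create_scaled_data(pd.DataFrame(data), meta)
#   restored = re_scaled_data(x_scaled, transformed.columns, rev, scaler)
# Numeric columns come back up to float precision; nominal columns are decoded through
# meta_data_rev, with codes outside the mapping becoming NaN.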


def first_question():
    """
    Runs the experiments for the first question: trains a GAN per hyperparameter
    combination and records the losses and distance statistics for each run.
    :return:
    """
    to_plot_losses = True
    results_output = os.path.join(saved_models_path, 'question_one_results.csv')
    results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [], 'dis_loss': [],
               'activation': [], 'fooled_len': [], 'not_fooled_len': [], 'mean_min_distance_fooled': [],
               'mean_min_distance_not_fooled': [], 'mean_min_distance_gap': []}
    # Selection score used to compare runs: w1 * (MMDF + MMDNF) - w3 * (MMDG) + w2 * (NFL / 100)
    # where MMDF = mean_min_distance_fooled, MMDNF = mean_min_distance_not_fooled,
    # MMDG = MMDNF - MMDF (the gap) and NFL = not_fooled_len.
    # Full grid used for the search:
    # data_name = ["adult", "bank-full"]
    # learning_rate = [0.01, 0.001, 0.0001]
    # epochs = [5, 10, 15]
    # batch_size = [64, 128, 1024]
    # alpha_relu = [0.2, 0.5]
    # dropout = [0.3, 0.5]
    data_name = ["adult"]
    learning_rate = [0.001]
    epochs = [10]
    batch_size = [128]
    alpha_relu = [0.5]
    dropout = [0.5]
    loss = 'binary_crossentropy'
    activation = 'sigmoid'
    for data in data_name:
        for lr in learning_rate:
            for ep in epochs:
                for bs in batch_size:
                    for al in alpha_relu:
                        for dr in dropout:
                            arff_data_path = os.path.join(dataset_directory, f'{data}.arff')
                            model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_al_{al}_dr_{dr}'
                            pca_output = os.path.join(saved_models_path, f'{model_name}_pca.png')
                            fooled_output = os.path.join(saved_models_path, f'{model_name}_fooled.csv')
                            not_fooled_output = os.path.join(saved_models_path, f'{model_name}_not_fooled.csv')
                            x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features = prepare_architecture(
                                arff_data_path)
                            gan_obj = GAN(number_of_features=number_of_features, saved_models_path=saved_models_path,
                                          learning_rate=lr, alpha_relu=al, dropout=dr,
                                          loss=loss, activation=activation)
                            gen_loss, dis_loss = gan_obj.train(scaled_data=x_scaled, epochs=ep, batch_size=bs,
                                                               to_plot_losses=to_plot_losses, model_name=model_name)
                            dis_fooled_scaled, dis_not_fooled_scaled, mean_min_distance_fooled, mean_min_distance_not_fooled = gan_obj.test(
                                scaled_data=x_scaled, sample_num=100, pca_output=pca_output)
                            # Decode the generated samples back to the original feature space before saving
                            dis_fooled = re_scaled_data(data=dis_fooled_scaled, columns=columns,
                                                        meta_data_rev=meta_data_rev,
                                                        min_max_scaler=min_max_scaler)
                            dis_fooled.to_csv(fooled_output)
                            dis_not_fooled = re_scaled_data(data=dis_not_fooled_scaled, columns=columns,
                                                            meta_data_rev=meta_data_rev,
                                                            min_max_scaler=min_max_scaler)
                            dis_not_fooled.to_csv(not_fooled_output)
                            results['dataset'].append(data)
                            results['lr'].append(lr)
                            results['ep'].append(ep)
                            results['bs'].append(bs)
                            results['alpha'].append(al)
                            results['dropout'].append(dr)
                            results['gen_loss'].append(gen_loss)
                            results['dis_loss'].append(dis_loss)
                            results['activation'].append(activation)
                            results['fooled_len'].append(len(dis_fooled_scaled))
                            results['not_fooled_len'].append(len(dis_not_fooled_scaled))
                            results['mean_min_distance_fooled'].append(mean_min_distance_fooled)
                            results['mean_min_distance_not_fooled'].append(mean_min_distance_not_fooled)
                            results['mean_min_distance_gap'].append(mean_min_distance_not_fooled - mean_min_distance_fooled)
    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(results_output, index=False)
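
# The nested loops above amount to a plain grid search; an equivalent, purely
# illustrative sketch with itertools.product over the same hyperparameter lists:
#   from itertools import product
#   for data, lr, ep, bs, al, dr in product(data_name, learning_rate, epochs,
#                                           batch_size, alpha_relu, dropout):
#       ...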


def second_question():
    """
    Runs the experiments for the second question: trains the general generator (GG)
    on each dataset and records the generator loss and the mean probability error.
    :return:
    """
    data_name = ["adult", "bank-full"]
    learning_rate = [0.001]
    epochs = [10]
    batch_size = [128]
    alpha_relu = [0.2]
    dropout = [0.3]
    results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [],
               'proba_error': []}
    combs = len(data_name) * len(learning_rate) * len(epochs) * len(batch_size) * len(alpha_relu) * len(dropout)
    i = 1
    for data in data_name:
        for lr in learning_rate:
            for ep in epochs:
                for bs in batch_size:
                    for al in alpha_relu:
                        for dr in dropout:
                            print(f'Running combination {i}/{combs}')
                            data_path = os.path.join(dataset_directory, f'{data}.arff')
                            model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_part2'
                            x_scaled, meta_data_rev, cols, min_max_scaler, feature_num = prepare_architecture(data_path)
                            general_generator = GG(feature_num, saved_models_path, lr, dr, al)
                            # The last column is treated as the label; the remaining columns are features
                            x_train, x_test, y_train, y_test = train_test_split(x_scaled[:, :-1], x_scaled[:, -1],
                                                                                test_size=0.1)
                            general_generator.train_gg(x_train, y_train, ep, bs, model_name, data, saved_models_path,
                                                       True)
                            error = general_generator.get_error()
                            results['dataset'].append(data)
                            results['lr'].append(lr)
                            results['ep'].append(ep)
                            results['bs'].append(bs)
                            results['alpha'].append(al)
                            results['dropout'].append(dr)
                            results['gen_loss'].append(general_generator.losses['gen_loss'][-1])
                            results['proba_error'].append(error.mean())
                            i += 1
                            # Test set performance
                            general_generator.plot_discriminator_results(x_test, y_test, data, saved_models_path)
                            general_generator.plot_generator_results(data, saved_models_path)
    results_output = os.path.join(saved_models_path, 'question_two_results.csv')
    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(results_output, index=False)


def main():
    # first_question()
    second_question()


if __name__ == '__main__':
    main()