import os

import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from nt_gan import GAN
from nt_gg import GG

dataset_directory = 'datasets'
saved_models_path = 'outputs'

def prepare_architecture(arff_data_path):
    """
    Loads an ARFF file and prepares its data for training: nominal
    attributes are encoded as integers and all features are min-max
    scaled to [0, 1].
    :param arff_data_path: path to the ARFF data file
    :return: a tuple of (scaled data, reverse nominal encoding, column
             names, fitted scaler, number of features)
    """
    data, meta_data = arff.loadarff(arff_data_path)
    df = pd.DataFrame(data)
    columns = df.columns
    transformed_data, x, x_scaled, meta_data_rev, min_max_scaler = create_scaled_data(df, meta_data)

    number_of_features = len(transformed_data.columns)

    return x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features
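
# A minimal usage sketch (the file name 'adult.arff' is an assumption based on
# the datasets used below):
#
#     x_scaled, meta_data_rev, columns, scaler, n_features = prepare_architecture(
#         os.path.join(dataset_directory, 'adult.arff'))
#
# x_scaled is then an (n_samples, n_features) float array with values in [0, 1].
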
def create_scaled_data(df, meta_data):
    """
    Encodes the nominal attributes as positive integers (0 is reserved
    for values missing from the attribute definition) and min-max scales
    all features to [0, 1].
    :param df: the raw data as a DataFrame
    :param meta_data: the ARFF meta-data describing the attributes
    :return: a tuple of (integer-encoded data, raw feature matrix, scaled
             feature matrix, reverse nominal encoding, fitted scaler)
    """
    meta_data_dict = {k: {a.replace(' ', ''): b + 1 for b, a in enumerate(v.values)} for k, v in
                      meta_data._attributes.items() if
                      v.type_name != 'numeric'}
    meta_data_rev = {k: {b + 1: a.replace(' ', '') for b, a in enumerate(v.values)} for k, v in
                     meta_data._attributes.items() if
                     v.type_name != 'numeric'}
    transformed_data = df.copy()
    for col in df.columns:
        if col in meta_data_dict:
            # loadarff returns nominal values as bytes (e.g. b'Male'), so the
            # raw value is recovered from the quoted part of str(x)
            transformed_data[col] = transformed_data[col].apply(
                lambda x: meta_data_dict[col][str(x).split('\'')[1]] if str(x).split('\'')[1] in meta_data_dict[
                    col] else 0)
    x = transformed_data.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return transformed_data, x, x_scaled, meta_data_rev, min_max_scaler
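
# For illustration (attribute name and values assumed): an ARFF attribute
# 'sex' with values ('Male', 'Female') yields
#
#     meta_data_dict['sex'] == {'Male': 1, 'Female': 2}
#     meta_data_rev['sex'] == {1: 'Male', 2: 'Female'}
#
# and any value outside the attribute definition is encoded as 0.
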
def re_scaled_data(data, columns, meta_data_rev, min_max_scaler):
    """
    Re-scales generated (fake) data back to the original format: the
    min-max scaling is inverted and the integer codes of the nominal
    attributes are mapped back to their original values.
    :param data: the scaled data to transform back
    :param columns: the original column names
    :param meta_data_rev: reverse nominal encoding from create_scaled_data
    :param min_max_scaler: the scaler fitted on the original data
    :return: a DataFrame in the original format
    """
    data_inv = min_max_scaler.inverse_transform(data)
    df = pd.DataFrame(data_inv, columns=columns)
    transformed_data = df.copy()
    for col in transformed_data.columns:
        if col in meta_data_rev:
            # round each value to the nearest integer code; codes without a
            # matching nominal value become NaN
            transformed_data[col] = transformed_data[col].apply(
                lambda x: meta_data_rev[col][int(round(x))] if int(round(x)) in meta_data_rev[
                    col] else np.nan)
    return transformed_data
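
# Round-trip sketch (generate() is a hypothetical sampling call; the real
# method name depends on the GAN class):
#
#     fake_scaled = gan_obj.generate(sample_num=100)
#     fake_df = re_scaled_data(data=fake_scaled, columns=columns,
#                              meta_data_rev=meta_data_rev,
#                              min_max_scaler=min_max_scaler)
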
def first_question():
    """
    Runs the experiments for the first question: trains a GAN for every
    hyper-parameter combination, saves the generated samples that fooled
    (and did not fool) the discriminator, and writes a summary CSV.
    :return: None
    """
    to_plot_losses = True
    results_output = os.path.join(saved_models_path, 'question_one_results.csv')
    results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [], 'dis_loss': [],
               'activation': [], 'fooled_len': [], 'not_fooled_len': [], 'mean_min_distance_fooled': [],
               'mean_min_distance_not_fooled': [], 'mean_min_distance_gap': []}

    data_name = ["adult"]
    learning_rate = [0.001]
    epochs = [10]
    batch_size = [128]
    alpha_relu = [0.5]
    dropout = [0.5]
    loss = 'binary_crossentropy'
    activation = 'sigmoid'

    for data in data_name:
        for lr in learning_rate:
            for ep in epochs:
                for bs in batch_size:
                    for al in alpha_relu:
                        for dr in dropout:
                            arff_data_path = os.path.join(dataset_directory, f'{data}.arff')
                            model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_al_{al}_dr_{dr}'
                            pca_output = os.path.join(saved_models_path, f'{model_name}_pca.png')
                            fooled_output = os.path.join(saved_models_path, f'{model_name}_fooled.csv')
                            not_fooled_output = os.path.join(saved_models_path, f'{model_name}_not_fooled.csv')

                            x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features = prepare_architecture(
                                arff_data_path)
                            gan_obj = GAN(number_of_features=number_of_features, saved_models_path=saved_models_path,
                                          learning_rate=lr, alpha_relu=al, dropout=dr,
                                          loss=loss, activation=activation)
                            gen_loss, dis_loss = gan_obj.train(scaled_data=x_scaled, epochs=ep, batch_size=bs,
                                                               to_plot_losses=to_plot_losses, model_name=model_name)
                            dis_fooled_scaled, dis_not_fooled_scaled, mean_min_distance_fooled, mean_min_distance_not_fooled = gan_obj.test(
                                scaled_data=x_scaled, sample_num=100, pca_output=pca_output)
                            dis_fooled = re_scaled_data(data=dis_fooled_scaled, columns=columns,
                                                        meta_data_rev=meta_data_rev,
                                                        min_max_scaler=min_max_scaler)
                            dis_fooled.to_csv(fooled_output)
                            dis_not_fooled = re_scaled_data(data=dis_not_fooled_scaled, columns=columns,
                                                            meta_data_rev=meta_data_rev,
                                                            min_max_scaler=min_max_scaler)
                            dis_not_fooled.to_csv(not_fooled_output)
                            results['dataset'].append(data)
                            results['lr'].append(lr)
                            results['ep'].append(ep)
                            results['bs'].append(bs)
                            results['alpha'].append(al)
                            results['dropout'].append(dr)
                            results['gen_loss'].append(gen_loss)
                            results['dis_loss'].append(dis_loss)
                            results['activation'].append(activation)
                            results['fooled_len'].append(len(dis_fooled_scaled))
                            results['not_fooled_len'].append(len(dis_not_fooled_scaled))
                            results['mean_min_distance_fooled'].append(mean_min_distance_fooled)
                            results['mean_min_distance_not_fooled'].append(mean_min_distance_not_fooled)
                            results['mean_min_distance_gap'].append(mean_min_distance_not_fooled - mean_min_distance_fooled)
    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(results_output, index=False)
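
# Reading the summary (this follows from how the gap is computed above):
# 'mean_min_distance_gap' is mean_min_distance_not_fooled minus
# mean_min_distance_fooled, so a positive gap means the samples that fooled
# the discriminator lie closer, on average, to the real data.
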
def second_question():
    """
    Runs the experiments for the second question: trains the general
    generator (GG) for every hyper-parameter combination, plots its
    results per dataset, and writes a summary CSV.
    :return: None
    """
    data_name = ["adult", "bank-full"]
    learning_rate = [0.001]
    epochs = [10]
    batch_size = [128]
    alpha_relu = [0.2]
    dropout = [0.3]
    results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [], 'proba_error': []}
    combs = len(data_name) * len(learning_rate) * len(epochs) * len(batch_size) * len(alpha_relu) * len(dropout)
    i = 1
    for data in data_name:
        for lr in learning_rate:
            for ep in epochs:
                for bs in batch_size:
                    for al in alpha_relu:
                        for dr in dropout:
                            print(f'Running combination {i}/{combs}')
                            data_path = os.path.join(dataset_directory, f'{data}.arff')
                            model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_part2'
                            x_scaled, meta_data_rev, cols, min_max_scaler, feature_num = prepare_architecture(data_path)
                            general_generator = GG(feature_num, saved_models_path, lr, dr, al)
                            # the last column is treated as the label; the rest are features
                            x_train, x_test, y_train, y_test = train_test_split(x_scaled[:, :-1], x_scaled[:, -1], test_size=0.1)
                            general_generator.train_gg(x_train, y_train, ep, bs, model_name, data, saved_models_path, True)
                            error = general_generator.get_error()
                            results['dataset'].append(data)
                            results['lr'].append(lr)
                            results['ep'].append(ep)
                            results['bs'].append(bs)
                            results['alpha'].append(al)
                            results['dropout'].append(dr)
                            results['gen_loss'].append(general_generator.losses['gen_loss'][-1])
                            results['proba_error'].append(error.mean())
                            i += 1

        general_generator.plot_discriminator_results(x_test, y_test, data, saved_models_path)
        general_generator.plot_generator_results(data, saved_models_path)

    results_output = os.path.join(saved_models_path, 'question_two_results.csv')
    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(results_output, index=False)
def main():
    # first_question()  # uncomment to also run the question-one experiments
    second_question()


if __name__ == '__main__':
    main()