import os
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from nt_gan import GAN
from nt_gg import GG
dataset_directory = 'datasets'
saved_models_path = 'outputs'
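# Both paths are relative to the working directory: 'datasets' is assumed to hold the ARFF
# files and 'outputs' is assumed to exist for the saved models, plots and result CSVs.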
def prepare_architecture(arff_data_path):
"""
This function create the architecture of the GAN network.
The generator and the discriminator are created and then combined into the GAN model
:param arff_data_path: data path for the arff file
:return: a dictionary with all the relevant variables for the next stages
"""
    data, meta_data = arff.loadarff(arff_data_path)  # loadarff reads the ARFF file into a tuple of the data and its meta-data
df = pd.DataFrame(data)
columns = df.columns
transformed_data, x, x_scaled, meta_data_rev, min_max_scaler = create_scaled_data(df, meta_data)
    number_of_features = len(transformed_data.columns)  # the number of features is used to define the GAN architecture
return x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features
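
# A minimal usage sketch (assumes './datasets/adult.arff' exists, as in first_question below):
#   x_scaled, meta_data_rev, columns, min_max_scaler, n_features = prepare_architecture('./datasets/adult.arff')
#   x_scaled.shape   # (n_rows, n_features), every value scaled to [0, 1]
#   meta_data_rev    # per nominal column, a mapping from integer code back to its original value
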
def create_scaled_data(df, meta_data):
"""
:param df:
:param meta_data:
:return:
"""
    meta_data_dict = {k: {a.replace(' ', ''): b + 1 for b, a in enumerate(v.values)} for k, v in
                      meta_data._attributes.items() if
                      v.type_name != 'numeric'}  # Codes start from one because zero is reserved for NaN values
    meta_data_rev = {k: {b + 1: a.replace(' ', '') for b, a in enumerate(v.values)} for k, v in
                     meta_data._attributes.items() if
                     v.type_name != 'numeric'}  # Codes start from one because zero is reserved for NaN values
transformed_data = df.copy()
for col in df.columns:
if col in meta_data_dict:
            # Values that cannot be found in the meta-data are treated as NaN and encoded as zero
transformed_data[col] = transformed_data[col].apply(
lambda x: meta_data_dict[col][str(x).split('\'')[1]] if str(x).split('\'')[1] in meta_data_dict[
col] else 0)
x = transformed_data.values # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
return transformed_data, x, x_scaled, meta_data_rev, min_max_scaler
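
# A sketch of the two mappings for a hypothetical nominal attribute 'sex' whose ARFF header
# lists the values ('Male', 'Female'):
#   meta_data_dict['sex'] == {'Male': 1, 'Female': 2}   # encoding; values not in the header become 0
#   meta_data_rev['sex'] == {1: 'Male', 2: 'Female'}    # decoding, used by re_scaled_data below
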
def re_scaled_data(data, columns, meta_data_rev, min_max_scaler):
"""
This function re-scaled the fake data to the original format.
:param data: the data we want to re scaled
:param columns:
:param meta_data_rev:
:return:
"""
data_inv = min_max_scaler.inverse_transform(data)
df = pd.DataFrame(data_inv, columns=columns)
transformed_data = df.copy()
for col in transformed_data.columns:
if col in meta_data_rev:
            # Codes that cannot be found in the reverse meta-data mapping are turned into NaN
transformed_data[col] = transformed_data[col].apply(
lambda x: meta_data_rev[col][int(round(x))] if int(round(x)) in meta_data_rev[
col] else np.nan)
return transformed_data
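
# A minimal round-trip sketch (assumes the variables come from prepare_architecture on the same dataset):
#   restored = re_scaled_data(data=x_scaled, columns=columns, meta_data_rev=meta_data_rev,
#                             min_max_scaler=min_max_scaler)
#   restored.head()  # nominal columns show their original string values again, unknown codes become NaN
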
def first_question():
"""
This function answers the first question
:return:
"""
to_plot_losses = True
    results_output = os.path.join(saved_models_path, 'question_one_results.csv')
results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [], 'dis_loss': [],
'activation': [], 'fooled_len': [], 'not_fooled_len': [], 'mean_min_distance_fooled': [],
'mean_min_distance_not_fooled': [], 'mean_min_distance_gap': []}
    # Selection score: w1 * (MMDF + MMDNF) - w3 * MMDG + w2 * (NFL / 100)
    # where MMDF / MMDNF are the mean min distances of the fooled / not-fooled samples,
    # MMDG = MMDNF - MMDF is their gap, and NFL is the number of not-fooled samples
# data_name = ["adult", "bank-full"]
# learning_rate = [0.01, 0.001, 0.0001]
# epochs = [5, 10, 15]
# batch_size = [64, 128, 1024]
# alpha_relu = [0.2, 0.5]
# dropout = [0.3, 0.5]
data_name = ["adult"]
learning_rate = [0.001]
epochs = [10]
batch_size = [128]
alpha_relu = [0.5]
dropout = [0.5]
loss = 'binary_crossentropy'
activation = 'sigmoid'
for data in data_name:
for lr in learning_rate:
for ep in epochs:
for bs in batch_size:
for al in alpha_relu:
for dr in dropout:
arff_data_path = f'./datasets/{data}.arff'
model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_al_{al}_dr_{dr}'
pca_output = os.path.join(saved_models_path, f'{model_name}_pca.png')
fooled_output = os.path.join(saved_models_path, f'{model_name}_fooled.csv')
not_fooled_output = os.path.join(saved_models_path, f'{model_name}_not_fooled.csv')
x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features = prepare_architecture(
arff_data_path)
gan_obj = GAN(number_of_features=number_of_features, saved_models_path=saved_models_path,
learning_rate=lr, alpha_relu=al, dropout=dr,
loss=loss, activation=activation)
gen_loss, dis_loss = gan_obj.train(scaled_data=x_scaled, epochs=ep, batch_size=bs,
to_plot_losses=to_plot_losses, model_name=model_name)
dis_fooled_scaled, dis_not_fooled_scaled, mean_min_distance_fooled, mean_min_distance_not_fooled = gan_obj.test(
scaled_data=x_scaled, sample_num=100, pca_output=pca_output)
dis_fooled = re_scaled_data(data=dis_fooled_scaled, columns=columns,
meta_data_rev=meta_data_rev,
min_max_scaler=min_max_scaler)
dis_fooled.to_csv(fooled_output)
dis_not_fooled = re_scaled_data(data=dis_not_fooled_scaled, columns=columns,
meta_data_rev=meta_data_rev,
min_max_scaler=min_max_scaler)
dis_not_fooled.to_csv(not_fooled_output)
results['dataset'].append(data)
results['lr'].append(lr)
results['ep'].append(ep)
results['bs'].append(bs)
results['alpha'].append(al)
results['dropout'].append(dr)
results['gen_loss'].append(gen_loss)
results['dis_loss'].append(dis_loss)
results['activation'].append(activation)
results['fooled_len'].append(len(dis_fooled_scaled))
results['not_fooled_len'].append(len(dis_not_fooled_scaled))
results['mean_min_distance_fooled'].append(mean_min_distance_fooled)
results['mean_min_distance_not_fooled'].append(mean_min_distance_not_fooled)
results['mean_min_distance_gap'].append(mean_min_distance_not_fooled-mean_min_distance_fooled)
results_df = pd.DataFrame.from_dict(results)
results_df.to_csv(results_output, index=False)
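
# A minimal sketch of the selection score described in the comment inside first_question.
# The weights w1, w2, w3 and the helper name score_results are illustrative assumptions,
# not part of the original pipeline.
def score_results(results_df, w1=1.0, w2=1.0, w3=1.0):
    """Adds a 'score' column: w1 * (MMDF + MMDNF) - w3 * MMDG + w2 * (NFL / 100)."""
    mmdf = results_df['mean_min_distance_fooled']
    mmdnf = results_df['mean_min_distance_not_fooled']
    mmdg = results_df['mean_min_distance_gap']
    nfl = results_df['not_fooled_len']
    scored = results_df.copy()
    scored['score'] = w1 * (mmdf + mmdnf) - w3 * mmdg + w2 * (nfl / 100)
    return scored
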
def second_question():
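    """
    Answers the second question: trains the general generator (GG) on each dataset,
    records the final generator loss and the mean probability error, and plots the
    discriminator and generator results on a held-out test split.
    """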
data_name = ["adult", "bank-full"]
learning_rate = [0.001]
epochs = [10]
batch_size = [128]
alpha_relu = [0.2]
dropout = [0.3]
results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [], 'proba_error': []}
combs = len(data_name) * len(learning_rate) * len(epochs) * len(batch_size) * len(alpha_relu) * len(dropout)
i = 1
for data in data_name:
for lr in learning_rate:
for ep in epochs:
for bs in batch_size:
for al in alpha_relu:
for dr in dropout:
print(f'Running combination {i}/{combs}')
data_path = f'./datasets/{data}.arff'
model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_part2'
x_scaled, meta_data_rev, cols, min_max_scaler, feature_num = prepare_architecture(data_path)
general_generator = GG(feature_num, saved_models_path, lr, dr, al)
x_train, x_test, y_train, y_test = train_test_split(x_scaled[:, :-1], x_scaled[:, -1], test_size=0.1)
general_generator.train_gg(x_train, y_train, ep, bs, model_name, data, saved_models_path, True)
error = general_generator.get_error()
results['dataset'].append(data)
results['lr'].append(lr)
results['ep'].append(ep)
results['bs'].append(bs)
results['alpha'].append(al)
results['dropout'].append(dr)
results['gen_loss'].append(general_generator.losses['gen_loss'][-1])
results['proba_error'].append(error.mean())
i += 1
# Test set performance
general_generator.plot_discriminator_results(x_test, y_test, data, saved_models_path)
general_generator.plot_generator_results(data, saved_models_path)
    results_output = os.path.join(saved_models_path, 'question_two_results.csv')
results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(results_output, index=False)
def main():
# first_question()
second_question()
if __name__ == '__main__':
main()