SalazarPevelll
be
f291f4a
"""The DataProvider class serve as a helper module for retriving subject model data"""
from abc import ABC, abstractmethod
import os
import gc
import time
from singleVis.utils import *
from singleVis.eval.evaluate import evaluate_inv_accu
"""
DataContainer module
1. prepare data
2. estimate boundary
3. provide data
"""
class DataProviderAbstractClass(ABC):
    """Abstract interface shared by every data provider.

    Keeps the content location, the subject model and the epoch interval
    (start ``s``, end ``e``, period ``p``) the provider works on.
    """

    def __init__(self, content_path, model, epoch_start, epoch_end, epoch_period):
        """Store the content path, the model and the epoch interval."""
        self.mode = "abstract"
        self.content_path, self.model = content_path, model
        # s / e / p: first epoch, last epoch and step between two epochs
        self.s, self.e, self.p = epoch_start, epoch_end, epoch_period

    @property
    @abstractmethod
    def train_num(self):
        """Number of training samples (must be provided by subclasses)."""

    @property
    @abstractmethod
    def test_num(self):
        """Number of testing samples (must be provided by subclasses)."""

    @abstractmethod
    def _meta_data(self):
        """Prepare the data (must be provided by subclasses)."""

    @abstractmethod
    def _estimate_boundary(self):
        """Estimate the boundary (must be provided by subclasses)."""

    def update_interval(self, epoch_s, epoch_e):
        """Replace the start and end of the epoch interval; the period is kept."""
        self.s, self.e = epoch_s, epoch_e
class DataProvider(DataProviderAbstractClass):
    """Common base of the concrete providers.

    Remembers the device, the class names, the checkpoint folder prefix
    (``epoch_name``) and the ``Model`` directory below ``content_path``.
    """

    def __init__(self, content_path, model, epoch_start, epoch_end, epoch_period, device, classes, epoch_name, verbose=1):
        """Store the configuration; prints a short notice when ``verbose`` is truthy."""
        self.content_path, self.model = content_path, model
        self.s, self.e, self.p = epoch_start, epoch_end, epoch_period
        self.DEVICE = device
        self.classes = classes
        self.verbose = verbose
        self.epoch_name = epoch_name
        # every checkpoint folder lives below <content_path>/Model
        self.model_path = os.path.join(self.content_path, "Model")
        if verbose:
            print("Finish initialization...")

    @property
    def train_num(self):
        """Number of entries in ``index.json`` of the first checkpoint (``self.s``)."""
        checkpoint_dir = "{}_{}".format(self.epoch_name, self.s)
        index_path = os.path.join(self.content_path, "Model", checkpoint_dir, "index.json")
        with open(index_path, "r") as index_file:
            entries = json.load(index_file)
        return len(entries)

    @property
    def test_num(self):
        """Length of the stored testing dataset."""
        data_file = os.path.join(self.content_path, "Testing_data", "testing_dataset_data.pth")
        testing_data = torch.load(data_file, map_location="cpu")
        count = len(testing_data)
        # the dataset was only needed for its length, so release it right away
        del testing_data
        gc.collect()
        return count

    def _meta_data(self):
        """Left to the concrete providers."""
        raise NotImplementedError

    def _estimate_boundary(self):
        """Left to the concrete providers."""
        raise NotImplementedError
class NormalDataProvider(DataProvider):
    """Data provider for a subject model that is checkpointed once per epoch.

    Everything that belongs to epoch ``n`` is read from / written to
    ``<content_path>/Model/<epoch_name>_<n>/``.
    """

    def __init__(self, content_path, model, epoch_start, epoch_end, epoch_period, device, classes, epoch_name, verbose=1):
        super().__init__(content_path, model, epoch_start, epoch_end, epoch_period, device, classes, epoch_name, verbose)
        self.mode = "normal"

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _epoch_file(self, epoch, file_name):
        """Return the path of ``file_name`` inside the checkpoint folder of ``epoch``."""
        return os.path.join(self.model_path, "{}_{:d}".format(self.epoch_name, epoch), file_name)

    def _load_dataset(self, folder, file_name):
        """Load ``<content_path>/<folder>/<file_name>`` on CPU, then move it to ``self.DEVICE``."""
        data = torch.load(os.path.join(self.content_path, folder, file_name), map_location="cpu")
        return data.to(self.DEVICE)

    def _record_time(self, key, value):
        """Store ``value`` under ``key`` in ``Model/time.json``, keeping the other entries."""
        save_dir = os.path.join(self.model_path, "time.json")
        if os.path.exists(save_dir):
            with open(save_dir, "r") as f:
                evaluation = json.load(f)
        else:
            evaluation = dict()
        evaluation[key] = value
        with open(save_dir, 'w') as f:
            json.dump(evaluation, f)

    def _load_subject_model(self, epoch):
        """Load the (non-strict) weights of ``epoch`` into ``self.model`` and switch to eval mode."""
        model_location = self._epoch_file(epoch, "subject_model.pth")
        self.model.load_state_dict(torch.load(model_location, map_location=torch.device("cpu")), strict=False)
        self.model = self.model.to(self.DEVICE)
        self.model.eval()

    def _save_borders(self, repr_model, border_points, epoch, prefix):
        """Save the border points and their representations for ``epoch``.

        Writes ``<prefix>border_centers.npy`` (output of ``repr_model``) and
        ``<prefix>ori_border_centers.npy`` (the raw border points).
        """
        border_points = border_points.to(self.DEVICE)
        border_centers = batch_run(repr_model, border_points)
        np.save(self._epoch_file(epoch, prefix + "border_centers.npy"), border_centers)
        np.save(self._epoch_file(epoch, prefix + "ori_border_centers.npy"), border_points.cpu().numpy())

    # ------------------------------------------------------------------
    # data preparation
    # ------------------------------------------------------------------
    @property
    def representation_dim(self):
        """Flattened size of one training representation of epoch ``self.s``.

        :return: product of all but the first dimension of ``train_data.npy``,
            or None when the file cannot be loaded
        """
        train_data_loc = self._epoch_file(self.s, "train_data.npy")
        try:
            train_data = np.load(train_data_loc)
        except Exception:
            return None
        return np.prod(train_data.shape[1:])

    def _meta_data(self):
        """Compute and save train/test representations for every epoch of the interval.

        For each epoch in ``range(self.s, self.e + 1, self.p)`` the checkpoint
        is loaded, ``self.model.feature`` is run over the data through
        ``batch_run`` and the results are saved as ``train_data.npy`` and
        ``test_data.npy``. The average time per epoch is stored under
        ``"data_inference"`` in ``Model/time.json``.
        """
        time_inference = list()
        training_data = self._load_dataset("Training_data", "training_dataset_data.pth")
        testing_data = self._load_dataset("Testing_data", "testing_dataset_data.pth")

        for n_epoch in range(self.s, self.e + 1, self.p):
            t_s = time.time()

            # make it possible to choose a subset of testing data for testing.
            # The subset goes into a per-epoch variable: overwriting
            # ``testing_data`` would make later epochs index an already
            # reduced tensor.
            test_index_file = self._epoch_file(n_epoch, "test_index.json")
            if os.path.exists(test_index_file):
                test_index = load_labelled_data_index(test_index_file)
                epoch_testing_data = testing_data[test_index]
            else:
                epoch_testing_data = testing_data

            # strict load first: fails early if the checkpoint does not match the model
            model_location = self._epoch_file(n_epoch, "subject_model.pth")
            self.model.load_state_dict(torch.load(model_location, map_location=torch.device("cpu")))
            self.model = self.model.to(self.DEVICE)
            self.model.eval()

            repr_model = self.feature_function(n_epoch)

            # training data
            data_pool_representation = batch_run(repr_model, training_data)
            np.save(self._epoch_file(n_epoch, "train_data.npy"), data_pool_representation)

            # test data
            test_data_representation = batch_run(repr_model, epoch_testing_data)
            np.save(self._epoch_file(n_epoch, "test_data.npy"), test_data_representation)

            t_e = time.time()
            time_inference.append(t_e - t_s)
            if self.verbose > 0:
                print("Finish inferencing data for Epoch {:d}...".format(n_epoch))

        # an empty interval would otherwise divide by zero
        average = sum(time_inference) / len(time_inference) if time_inference else 0.0
        print(
            "Average time for inferencing data: {:.4f}".format(average))

        # save result
        self._record_time("data_inference", round(average, 3))

        del training_data
        del testing_data
        gc.collect()

    def _estimate_boundary(self, num, l_bound):
        '''
        Preprocessing data. This process includes find_border_points and find_border_centers
        save data for later training

        For every epoch of the interval two sets of border points are generated
        with ``get_border_points`` from the training samples listed in that
        epoch's ``index.json``; the second set is saved with a ``test_`` prefix.
        Only the first generation is timed; the average is stored under
        ``"data_B_gene"`` in ``Model/time.json``.

        :param num: passed to ``get_border_points`` as ``num_adv_eg``
        :param l_bound: passed to ``get_border_points`` as ``l_bound``
        '''
        time_borders_gen = list()
        training_data = self._load_dataset("Training_data", "training_dataset_data.pth")

        for n_epoch in range(self.s, self.e + 1, self.p):
            index = load_labelled_data_index(self._epoch_file(n_epoch, "index.json"))
            # select from the full dataset every time; re-assigning
            # ``training_data`` would shrink it from one epoch to the next
            epoch_training_data = training_data[index]

            repr_model = self.feature_function(n_epoch)

            t0 = time.time()
            confs = batch_run(self.model, epoch_training_data)
            preds = np.argmax(confs, axis=1).squeeze()
            # TODO how to choose the number of boundary points?
            border_points, _, _ = get_border_points(model=self.model, input_x=epoch_training_data, confs=confs, predictions=preds, device=self.DEVICE, l_bound=l_bound, num_adv_eg=num, lambd=0.05, verbose=0)
            t1 = time.time()
            time_borders_gen.append(round(t1 - t0, 4))
            self._save_borders(repr_model, border_points, n_epoch, prefix="")

            # second, independent generation stored as the "test" border set
            border_points, _, _ = get_border_points(model=self.model, input_x=epoch_training_data, confs=confs, predictions=preds, device=self.DEVICE, l_bound=l_bound, num_adv_eg=num, lambd=0.05, verbose=0)
            self._save_borders(repr_model, border_points, n_epoch, prefix="test_")

            if self.verbose > 0:
                print("Finish generating borders for Epoch {:d}...".format(n_epoch))

        average = sum(time_borders_gen) / len(time_borders_gen) if time_borders_gen else 0.0
        print(
            "Average time for generate border points: {:.4f}".format(average))

        # save result
        self._record_time("data_B_gene", round(average, 3))

    def initialize(self, num, l_bound):
        """Run ``_meta_data`` followed by ``_estimate_boundary(num, l_bound)``."""
        self._meta_data()
        self._estimate_boundary(num, l_bound)

    # ------------------------------------------------------------------
    # data access
    # ------------------------------------------------------------------
    def train_representation(self, epoch):
        """Return the saved training representations of ``epoch`` selected by ``index.json``.

        :return: numpy.ndarray, or None when the data cannot be loaded
        """
        train_data_loc = self._epoch_file(epoch, "train_data.npy")
        index = load_labelled_data_index(self._epoch_file(epoch, "index.json"))
        try:
            train_data = np.load(train_data_loc)
            train_data = train_data[index]
        except Exception:
            print("no train data saved for Epoch {}".format(epoch))
            train_data = None
        return train_data

    def train_labels(self, epoch):
        """Return all training labels as a numpy array.

        The labels are not restricted to the indices of ``index.json``.

        :param epoch: only used in the message printed on failure
        :return: numpy.ndarray, or None when the labels cannot be loaded
        """
        training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth")
        try:
            training_labels = torch.load(training_data_loc, map_location="cpu")
            training_labels = np.array(training_labels)
        except Exception:
            print("no train labels saved for Epoch {}".format(epoch))
            training_labels = None
        return training_labels

    def test_representation(self, epoch):
        """Return the saved test representations of ``epoch``.

        When ``test_index.json`` exists for the epoch, the loaded array is
        indexed with it.

        :return: numpy.ndarray, or None when the data cannot be loaded
        """
        data_loc = self._epoch_file(epoch, "test_data.npy")
        try:
            test_data = np.load(data_loc)
            index_file = self._epoch_file(epoch, "test_index.json")
            if os.path.exists(index_file):
                index = load_labelled_data_index(index_file)
                test_data = test_data[index]
        except Exception:
            print("no test data saved for Epoch {}".format(epoch))
            test_data = None
        return test_data

    def test_labels(self, epoch):
        """Return the test labels of ``epoch`` as a numpy array.

        :return: numpy.ndarray, or None when the labels cannot be loaded
            (previously this case crashed on ``None.cpu()``)
        """
        testing_data_loc = os.path.join(self.content_path, "Testing_data", "testing_dataset_label.pth")
        try:
            # map_location keeps labels saved from a GPU loadable on CPU-only hosts
            testing_labels = torch.load(testing_data_loc, map_location="cpu").to(device="cpu")
            index_file = self._epoch_file(epoch, "test_index.json")
            if os.path.exists(index_file):
                idxs = load_labelled_data_index(index_file)
                # NOTE(review): the real labels are replaced by zeros (one per
                # index) whenever a test index exists; the original selection
                # ``testing_labels[idxs]`` was disabled. Confirm this
                # placeholder is intended before relying on testing accuracy.
                testing_labels = torch.zeros(len(idxs))
        except Exception:
            print("no test labels saved for Epoch {}".format(epoch))
            return None
        return testing_labels.cpu().numpy()

    def border_representation(self, epoch):
        """Return ``border_centers.npy`` of ``epoch``, or an empty array when unavailable."""
        border_centers_loc = self._epoch_file(epoch, "border_centers.npy")
        try:
            border_centers = np.load(border_centers_loc)
        except Exception:
            print("no border points saved for Epoch {}".format(epoch))
            border_centers = np.array([])
        return border_centers

    def test_border_representation(self, epoch):
        """Return ``test_border_centers.npy`` of ``epoch``, or an empty array when unavailable."""
        border_centers_loc = self._epoch_file(epoch, "test_border_centers.npy")
        try:
            border_centers = np.load(border_centers_loc)
        except Exception:
            print("no border points saved for Epoch {}".format(epoch))
            border_centers = np.array([])
        return border_centers

    def max_norm(self, epoch):
        """Return the largest row norm of the selected training representations.

        :return: maximum of ``np.linalg.norm(train_data, axis=1)``, or None when
            the data cannot be loaded
        """
        train_data_loc = self._epoch_file(epoch, "train_data.npy")
        index = load_labelled_data_index(self._epoch_file(epoch, "index.json"))
        try:
            train_data = np.load(train_data_loc)
            train_data = train_data[index]
            max_x = np.linalg.norm(train_data, axis=1).max()
        except Exception:
            print("no train data saved for Epoch {}".format(epoch))
            max_x = None
        return max_x

    # ------------------------------------------------------------------
    # model access
    # ------------------------------------------------------------------
    def prediction_function(self, epoch):
        """Load the checkpoint of ``epoch`` and return ``self.model.prediction``."""
        self._load_subject_model(epoch)
        return self.model.prediction

    def feature_function(self, epoch):
        """Load the checkpoint of ``epoch`` and return ``self.model.feature``."""
        self._load_subject_model(epoch)
        return self.model.feature

    def get_pred(self, epoch, data):
        '''
        get the prediction score for data in epoch_id
        :param data: numpy.ndarray
        :param epoch: epoch whose checkpoint is used
        :return: pred, the output of ``batch_run`` on the prediction function
        '''
        prediction_func = self.prediction_function(epoch)
        data = torch.from_numpy(data)
        data = data.to(self.DEVICE)
        pred = batch_run(prediction_func, data)
        return pred

    def training_accu(self, epoch):
        """Evaluate the predictions on the training representations of ``epoch``.

        :return: the value of ``evaluate_inv_accu(labels, predictions)``
        """
        data = self.train_representation(epoch)
        labels = self.train_labels(epoch)
        pred = self.get_pred(epoch, data).argmax(-1)
        val = evaluate_inv_accu(labels, pred)
        return val

    def testing_accu(self, epoch):
        """Evaluate the predictions on the test representations of ``epoch``.

        :return: the value of ``evaluate_inv_accu(labels, predictions)``
        """
        data = self.test_representation(epoch)
        labels = self.test_labels(epoch)
        test_index_file = self._epoch_file(epoch, "test_index.json")
        if os.path.exists(test_index_file):
            # NOTE(review): ``test_labels`` already returns one entry per test
            # index in this case, so this second selection looks redundant --
            # confirm against the data layout.
            index = load_labelled_data_index(test_index_file)
            labels = labels[index]
        pred = self.get_pred(epoch, data).argmax(-1)
        val = evaluate_inv_accu(labels, pred)
        return val

    def is_deltaB(self, epoch, data):
        """
        check whether input vectors are lying on delta-boundary or not
        :param epoch: epoch whose checkpoint is used
        :param data: numpy.ndarray
        :return: result of ``is_B`` on the predictions (documented as a boolean
            numpy.ndarray, True stands for is_delta_boundary)
        """
        preds = self.get_pred(epoch, data)
        border = is_B(preds)
        return border

    def checkpoint_path(self, epoch):
        """Return the checkpoint folder of ``epoch``."""
        path = os.path.join(self.model_path, "{}_{}".format(self.epoch_name, epoch))
        return path
class ActiveLearningDataProvider(DataProvider):
    """Data provider for active learning with one checkpoint per iteration.

    Files of iteration ``i`` live in
    ``<content_path>/Model/<iteration_name>_<i>/``; ``index.json`` in that
    folder holds the indices of the labelled samples.
    """

    def __init__(self, content_path, model, base_epoch_start, device, classes, iteration_name="Iteration",verbose=1):
        # dummy input as epoch_end and epoch_period
        super().__init__(content_path, model, base_epoch_start, base_epoch_start, 1, device, classes, iteration_name, verbose)
        self.mode = "al"
        self.iteration_name = iteration_name

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _iteration_file(self, iteration, file_name):
        """Return the path of ``file_name`` inside the folder of ``iteration``."""
        return os.path.join(self.model_path, "{}_{:d}".format(self.iteration_name, iteration), file_name)

    def _load_dataset(self, folder, file_name):
        """Load ``<content_path>/<folder>/<file_name>`` on CPU, then move it to ``self.DEVICE``."""
        data = torch.load(os.path.join(self.content_path, folder, file_name), map_location="cpu")
        return data.to(self.DEVICE)

    def _log_al_time(self, key, iteration, value):
        """Store ``value`` at ``[key][str(iteration)]`` in ``Model/time_al.json``."""
        save_dir = os.path.join(self.model_path, "time_al.json")
        if os.path.exists(save_dir):
            with open(save_dir, "r") as f:
                evaluation = json.load(f)
        else:
            evaluation = dict()
        if key not in evaluation:
            evaluation[key] = dict()
        evaluation[key][str(iteration)] = value
        with open(save_dir, 'w') as f:
            json.dump(evaluation, f)

    def _load_subject_model(self, iteration):
        """Load the weights of ``iteration`` into ``self.model`` and switch to eval mode."""
        model_location = self._iteration_file(iteration, "subject_model.pth")
        self.model.load_state_dict(torch.load(model_location, map_location=torch.device("cpu")))
        self.model.to(self.DEVICE)
        self.model.eval()

    def _save_borders(self, repr_model, border_points, iteration, prefix):
        """Save the border points and their representations for ``iteration``.

        Writes ``<prefix>border_centers.npy`` (output of ``repr_model``) and
        ``<prefix>ori_border_centers.npy`` (the raw border points).
        """
        border_points = border_points.to(self.DEVICE)
        border_centers = batch_run(repr_model, border_points)
        np.save(self._iteration_file(iteration, prefix + "border_centers.npy"), border_centers)
        np.save(self._iteration_file(iteration, prefix + "ori_border_centers.npy"), border_points.cpu().numpy())

    # ------------------------------------------------------------------
    # pool bookkeeping
    # ------------------------------------------------------------------
    @property
    def pool_num(self):
        """Number of labels in the whole training pool."""
        return len(self.train_labels_all())

    def label_num(self, iteration):
        """Number of labelled samples at ``iteration``."""
        return len(self.get_labeled_idx(iteration))

    def representation_dim(self, iteration):
        """Flattened size of one training representation of ``iteration``.

        :return: product of all but the first dimension of ``train_data.npy``,
            or None when the file cannot be loaded
        """
        train_data_loc = self._iteration_file(iteration, "train_data.npy")
        try:
            train_data = np.load(train_data_loc)
        except Exception:
            return None
        return np.prod(train_data.shape[1:])

    def get_labeled_idx(self, iteration):
        """Return the labelled indices of ``iteration`` (from ``index.json``) as a numpy array."""
        index_file = self._iteration_file(iteration, "index.json")
        lb_idxs = np.array(load_labelled_data_index(index_file))
        return lb_idxs

    def get_unlabeled_idx(self, pool_num, lb_idx):
        """Return the indices in ``range(pool_num)`` that are not in ``lb_idx``."""
        tot_idx = np.arange(pool_num)
        # !Noted that tot need to be the first argument
        ulb_idx = np.setdiff1d(tot_idx, lb_idx)
        return ulb_idx

    # ------------------------------------------------------------------
    # data preparation
    # ------------------------------------------------------------------
    def _meta_data(self, iteration):
        """Compute and save train/test representations for ``iteration``.

        Runs ``self.model.feature`` over the training and testing data through
        ``batch_run``, saves ``train_data.npy`` / ``test_data.npy`` and records
        the elapsed time under ``"data_inference"`` in ``Model/time_al.json``.
        """
        training_data = self._load_dataset("Training_data", "training_dataset_data.pth")
        testing_data = self._load_dataset("Testing_data", "testing_dataset_data.pth")

        t_s = time.time()
        repr_model = self.feature_function(iteration)

        # training data
        data_pool_representation = batch_run(repr_model, training_data)
        np.save(self._iteration_file(iteration, "train_data.npy"), data_pool_representation)

        # test data
        test_data_representation = batch_run(repr_model, testing_data)
        np.save(self._iteration_file(iteration, "test_data.npy"), test_data_representation)
        t_e = time.time()

        if self.verbose > 0:
            print("Finish inferencing data for Iteration {:d} in {:.2f} seconds...".format(iteration, t_e-t_s))

        # save result
        self._log_al_time("data_inference", iteration, round(t_e - t_s, 3))

        del training_data
        del testing_data
        gc.collect()

    def _estimate_boundary(self, iteration, num, l_bound):
        '''
        Preprocessing data. This process includes find_border_points and find_border_centers
        save data for later training

        Two sets of border points are generated with ``get_border_points`` from
        the labelled training samples of ``iteration``; the second set is saved
        with a ``test_`` prefix. Only the first generation is timed and recorded
        under ``"data_B_gene"`` in ``Model/time_al.json``.

        :param iteration: iteration whose checkpoint and index are used
        :param num: passed to ``get_border_points`` as ``num_adv_eg``
        :param l_bound: passed to ``get_border_points`` as ``l_bound``
        '''
        training_data = self._load_dataset("Training_data", "training_dataset_data.pth")
        index = load_labelled_data_index(self._iteration_file(iteration, "index.json"))
        training_data = training_data[index]

        repr_model = self.feature_function(iteration)

        t0 = time.time()
        confs = batch_run(self.model, training_data)
        preds = np.argmax(confs, axis=1).squeeze()
        # TODO how to choose the number of boundary points?
        border_points, _, _ = get_border_points(model=self.model, input_x=training_data, confs=confs, predictions=preds, device=self.DEVICE, l_bound=l_bound, num_adv_eg=num, lambd=0.05, verbose=0)
        t1 = time.time()
        self._save_borders(repr_model, border_points, iteration, prefix="")

        # second, independent generation stored as the "test" border set
        border_points, _, _ = get_border_points(model=self.model, input_x=training_data, confs=confs, predictions=preds, device=self.DEVICE, l_bound=l_bound, num_adv_eg=num, lambd=0.05, verbose=0)
        self._save_borders(repr_model, border_points, iteration, prefix="test_")

        if self.verbose > 0:
            print("Finish generating borders for Iteration {:d} in {:.2f} seconds ...".format(iteration, t1-t0))

        # save result
        self._log_al_time("data_B_gene", iteration, round(t1-t0, 3))

    def initialize_iteration(self, iteration, num, l_bound):
        """Run ``_meta_data`` followed by ``_estimate_boundary`` for ``iteration``."""
        self._meta_data(iteration)
        self._estimate_boundary(iteration, num, l_bound)

    # ------------------------------------------------------------------
    # data access
    # ------------------------------------------------------------------
    def train_representation(self, iteration):
        """Return the representations of the labelled training samples.

        :return: numpy.ndarray, or None when the data cannot be loaded
        """
        train_data_loc = self._iteration_file(iteration, "train_data.npy")
        try:
            idxs = self.get_labeled_idx(iteration)
            train_data = np.load(train_data_loc)[idxs]
        except Exception:
            print("no train data saved for Iteration {}".format(iteration))
            train_data = None
        return train_data

    def train_labels(self, epoch):
        """Return the labels of the labelled training samples.

        :return: numpy.ndarray, or None when the labels cannot be loaded
            (previously this case crashed on ``None.numpy()``)
        """
        training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth")
        try:
            idxs = self.get_labeled_idx(epoch)
            training_labels = torch.load(training_data_loc, map_location="cpu")[idxs]
        except Exception:
            print("no train labels saved for Iteration {}".format(epoch))
            return None
        return training_labels.numpy()

    def train_representation_ulb(self, iteration):
        """Return the representations of the unlabelled training samples.

        :return: numpy.ndarray, or None when the data cannot be loaded
        """
        train_data_loc = self._iteration_file(iteration, "train_data.npy")
        lb_idxs = self.get_labeled_idx(iteration)
        try:
            train_data = np.load(train_data_loc)
            ulb_idxs = self.get_unlabeled_idx(len(train_data), lb_idxs)
            train_data = train_data[ulb_idxs]
        except Exception:
            print("no train data saved for Iteration {}".format(iteration))
            train_data = None
        return train_data

    def train_labels_ulb(self, epoch):
        """Return the labels of the unlabelled training samples.

        :return: numpy.ndarray, or None when the labels cannot be loaded
        """
        training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth")
        lb_idxs = self.get_labeled_idx(epoch)
        try:
            training_labels = torch.load(training_data_loc, map_location="cpu")
            ulb_idxs = self.get_unlabeled_idx(len(training_labels), lb_idxs)
            training_labels = training_labels[ulb_idxs]
        except Exception:
            print("no train labels saved for Iteration {}".format(epoch))
            return None
        return training_labels.numpy()

    def train_representation_all(self, iteration):
        """Return the representations of the whole training pool, or None when unavailable."""
        train_data_loc = self._iteration_file(iteration, "train_data.npy")
        try:
            train_data = np.load(train_data_loc)
        except Exception:
            print("no train data saved for Iteration {}".format(iteration))
            train_data = None
        return train_data

    def train_labels_all(self):
        """Return the labels of the whole training pool, or None when unavailable."""
        training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth")
        try:
            training_labels = torch.load(training_data_loc, map_location="cpu")
        except Exception:
            print("no train labels saved")
            return None
        return training_labels.numpy()

    def test_representation(self, epoch):
        """Return the saved test representations of iteration ``epoch``.

        When ``test_index.json`` exists, the loaded array is indexed with it.

        :return: numpy.ndarray, or None when the data cannot be loaded
        """
        data_loc = self._iteration_file(epoch, "test_data.npy")
        try:
            test_data = np.load(data_loc)
            index_file = self._iteration_file(epoch, "test_index.json")
            if os.path.exists(index_file):
                index = load_labelled_data_index(index_file)
                test_data = test_data[index]
        except Exception:
            print("no test data saved for Iteration {}".format(epoch))
            test_data = None
        return test_data

    def test_labels(self, epoch):
        """Return the test labels, restricted to ``test_index.json`` when it exists.

        :return: numpy.ndarray, or None when the labels cannot be loaded
        """
        testing_data_loc = os.path.join(self.content_path, "Testing_data", "testing_dataset_label.pth")
        try:
            testing_labels = torch.load(testing_data_loc, map_location="cpu").numpy()
            index_file = self._iteration_file(epoch, "test_index.json")
            if os.path.exists(index_file):
                idxs = load_labelled_data_index(index_file)
                testing_labels = testing_labels[idxs]
        except Exception:
            print("no test labels saved for Iteration {}".format(epoch))
            testing_labels = None
        return testing_labels

    def border_representation(self, epoch):
        """Return ``border_centers.npy`` of iteration ``epoch``, or an empty array when unavailable."""
        border_centers_loc = self._iteration_file(epoch, "border_centers.npy")
        try:
            border_centers = np.load(border_centers_loc)
        except Exception:
            print("no border points saved for Iteration {}".format(epoch))
            border_centers = np.array([])
        return border_centers

    def test_border_representation(self, epoch):
        """Return ``test_border_centers.npy`` of iteration ``epoch``, or an empty array when unavailable."""
        border_centers_loc = self._iteration_file(epoch, "test_border_centers.npy")
        try:
            border_centers = np.load(border_centers_loc)
        except Exception:
            # message aligned with the other iteration-based accessors
            print("no border points saved for Iteration {}".format(epoch))
            border_centers = np.array([])
        return border_centers

    def max_norm(self, epoch):
        """Return the largest row norm of the labelled training representations.

        :return: maximum of ``np.linalg.norm(train_data, axis=1)``, or None when
            the representations cannot be loaded
        """
        train_data = self.train_representation(epoch)
        if train_data is None:
            return None
        max_x = np.linalg.norm(train_data, axis=1).max()
        return max_x

    # ------------------------------------------------------------------
    # model access
    # ------------------------------------------------------------------
    def prediction_function(self, iteration):
        """Load the checkpoint of ``iteration`` and return ``self.model.prediction``."""
        self._load_subject_model(iteration)
        return self.model.prediction

    def feature_function(self, epoch):
        """Load the checkpoint of iteration ``epoch`` and return ``self.model.feature``."""
        self._load_subject_model(epoch)
        return self.model.feature

    def get_pred(self, iteration, data):
        '''
        get the prediction score for data in the given iteration
        :param data: numpy.ndarray
        :param iteration: iteration whose checkpoint is used
        :return: pred, the output of ``batch_run`` on the prediction function
        '''
        prediction_func = self.prediction_function(iteration)
        data = torch.from_numpy(data)
        data = data.to(self.DEVICE)
        pred = batch_run(prediction_func, data)
        return pred

    def training_accu(self, epoch):
        """Evaluate the predictions on the labelled training samples.

        :return: the value of ``evaluate_inv_accu(labels, predictions)``
        """
        # the labelled subset is what train_representation / train_labels return;
        # the former ``*_lb`` names do not exist on this class
        data = self.train_representation(epoch)
        labels = self.train_labels(epoch)
        pred = self.get_pred(epoch, data).argmax(1)
        val = evaluate_inv_accu(labels, pred)
        return val

    def testing_accu(self, epoch):
        """Evaluate the predictions on the test samples.

        :return: the value of ``evaluate_inv_accu(labels, predictions)``
        """
        data = self.test_representation(epoch)
        labels = self.test_labels(epoch)
        pred = self.get_pred(epoch, data).argmax(1)
        val = evaluate_inv_accu(labels, pred)
        return val

    def is_deltaB(self, epoch, data):
        """
        check whether input vectors are lying on delta-boundary or not
        :param epoch: iteration whose checkpoint is used
        :param data: numpy.ndarray
        :return: result of ``is_B`` on the predictions (documented as a boolean
            numpy.ndarray, True stands for is_delta_boundary)
        """
        preds = self.get_pred(epoch, data)
        border = is_B(preds)
        return border

    def checkpoint_path(self, epoch):
        """Return the checkpoint folder of iteration ``epoch``."""
        path = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, epoch))
        return path
class DenseActiveLearningDataProvider(ActiveLearningDataProvider):
def __init__(self, content_path, model, base_epoch_start, epoch_num, device, classes, iteration_name="Iteration", epoch_name="Epoch", verbose=1):
    """Set up a provider whose iterations each contain ``epoch_num`` epochs.

    The parent stores ``iteration_name``; afterwards the epoch prefix is set
    to ``epoch_name`` and the epoch interval is fixed to 1..``epoch_num``
    with period 1.
    """
    super().__init__(content_path, model, base_epoch_start, device, classes, iteration_name, verbose)
    self.mode = "dense_al"
    self.epoch_name = epoch_name
    self.epoch_num = epoch_num
    # epochs inside one iteration: start, period, end
    self.s, self.p, self.e = 1, 1, epoch_num
def representation_dim(self):
train_data_loc = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, self.s), "{}_{:d}".format(self.epoch_name, self.epoch_num), "train_data.npy")
try:
train_data = np.load(train_data_loc)
repr_dim = np.prod(train_data.shape[1:])
return repr_dim
except Exception as e:
return None
def _meta_data(self, iteration):
    """Compute and save train/test representations for every epoch of ``iteration``.

    For each epoch in 1..``self.epoch_num`` the feature function returned by
    ``self.feature_function(iteration, epoch)`` is run over the data through
    ``batch_run``; the results are saved as ``train_data.npy`` and
    ``test_data.npy`` in the epoch folder. The average time per epoch is
    stored under ``"data_inference"`` in ``Model/SV_time.json``.

    :param iteration: iteration whose epochs are processed
    """
    time_inference = list()
    training_data = torch.load(os.path.join(self.content_path, "Training_data", "training_dataset_data.pth"),
                               map_location="cpu")
    training_data = training_data.to(self.DEVICE)
    testing_data = torch.load(os.path.join(self.content_path, "Testing_data", "testing_dataset_data.pth"),
                              map_location="cpu")
    testing_data = testing_data.to(self.DEVICE)

    # make it possible to choose a subset of testing data for testing
    test_index_file = os.path.join(self.model_path, "{}_{:d}".format(self.iteration_name, iteration), "test_index.json")
    if os.path.exists(test_index_file):
        test_index = load_labelled_data_index(test_index_file)
        testing_data = testing_data[test_index]

    iteration_dir = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration))
    for n_epoch in range(1, self.epoch_num+1, 1):
        # restart the timer for every epoch so the stored value really is a
        # per-epoch average instead of an average of cumulative times
        t_s = time.time()
        repr_model = self.feature_function(iteration, n_epoch)
        epoch_dir = os.path.join(iteration_dir, "{}_{:d}".format(self.epoch_name, n_epoch))

        # training data
        data_pool_representation = batch_run(repr_model, training_data)
        np.save(os.path.join(epoch_dir, "train_data.npy"), data_pool_representation)

        # test data
        test_data_representation = batch_run(repr_model, testing_data)
        np.save(os.path.join(epoch_dir, "test_data.npy"), test_data_representation)

        t_e = time.time()
        time_inference.append(t_e-t_s)
    if self.verbose > 0:
        print("Finish inferencing data for Iteration {:d}...".format(iteration))

    # no epochs would otherwise divide by zero
    average = sum(time_inference) / len(time_inference) if time_inference else 0.0
    print(
        "Average time for inferencing data: {:.4f}...".format(average))

    # save result
    save_dir = os.path.join(self.model_path, "SV_time.json")
    if os.path.exists(save_dir):
        with open(save_dir, "r") as f:
            evaluation = json.load(f)
    else:
        evaluation = dict()
    evaluation["data_inference"] = round(average, 3)
    with open(save_dir, 'w') as f:
        json.dump(evaluation, f)

    del training_data
    del testing_data
    gc.collect()
def _estimate_boundary(self, iteration, num, l_bound):
    '''
    Preprocessing data. This process includes find_border_points and find_border_centers
    save data for later training

    For every epoch in 1..``self.epoch_num`` two sets of border points are
    generated with ``get_border_points`` from the labelled training samples of
    ``iteration``; the second set is saved with a ``test_`` prefix. Only the
    first generation is timed; the average is stored under ``"data_B_gene"``
    in ``Model/SV_time.json``.

    :param iteration: iteration whose index and checkpoints are used
    :param num: passed to ``get_border_points`` as ``num_adv_eg``
    :param l_bound: passed to ``get_border_points`` as ``l_bound``
    '''
    time_borders_gen = list()
    training_data = torch.load(os.path.join(self.content_path, "Training_data", "training_dataset_data.pth"),
                               map_location="cpu")
    training_data = training_data.to(self.DEVICE)

    iteration_dir = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration))
    # the labelled index belongs to the iteration, so select once from the
    # full dataset; selecting again inside the loop would index an already
    # reduced tensor from the second epoch on
    index = load_labelled_data_index(os.path.join(iteration_dir, "index.json"))
    labelled_data = training_data[index]

    for n_epoch in range(1, self.epoch_num+1, 1):
        epoch_dir = os.path.join(iteration_dir, "{}_{:d}".format(self.epoch_name, n_epoch))
        repr_model = self.feature_function(iteration, n_epoch)

        t0 = time.time()
        confs = batch_run(self.model, labelled_data)
        preds = np.argmax(confs, axis=1).squeeze()
        # TODO how to choose the number of boundary points?
        border_points, _, _ = get_border_points(model=self.model, input_x=labelled_data, confs=confs, predictions=preds, device=self.DEVICE, l_bound=l_bound, num_adv_eg=num, lambd=0.05, verbose=0)
        t1 = time.time()
        time_borders_gen.append(round(t1 - t0, 4))

        # get gap layer data
        border_points = border_points.to(self.DEVICE)
        border_centers = batch_run(repr_model, border_points)
        np.save(os.path.join(epoch_dir, "border_centers.npy"), border_centers)
        np.save(os.path.join(epoch_dir, "ori_border_centers.npy"), border_points.cpu().numpy())

        # second, independent generation stored as the "test" border set
        border_points, _, _ = get_border_points(model=self.model, input_x=labelled_data, confs=confs, predictions=preds, device=self.DEVICE, l_bound=l_bound, num_adv_eg=num, lambd=0.05, verbose=0)
        border_points = border_points.to(self.DEVICE)
        border_centers = batch_run(repr_model, border_points)
        np.save(os.path.join(epoch_dir, "test_border_centers.npy"), border_centers)
        np.save(os.path.join(epoch_dir, "test_ori_border_centers.npy"), border_points.cpu().numpy())

        if self.verbose > 0:
            print("Finish generating borders for Epoch {:d}...".format(n_epoch))

    # no epochs would otherwise divide by zero
    average = sum(time_borders_gen) / len(time_borders_gen) if time_borders_gen else 0.0
    print(
        "Average time for generate border points for each iteration: {:.4f}".format(average))

    # save result
    save_dir = os.path.join(self.model_path, "SV_time.json")
    if os.path.exists(save_dir):
        with open(save_dir, "r") as f:
            evaluation = json.load(f)
    else:
        evaluation = dict()
    evaluation["data_B_gene"] = round(average, 3)
    with open(save_dir, 'w') as f:
        json.dump(evaluation, f)
def train_representation(self, iteration, epoch):
    """Return the representations of the labelled training samples.

    :param iteration: iteration folder to read from
    :param epoch: epoch folder inside the iteration
    :return: numpy.ndarray, or None when the data cannot be loaded
    """
    iteration_dir = "{}_{:d}".format(self.iteration_name, iteration)
    epoch_dir = "{}_{}".format(self.epoch_name, epoch)
    data_file = os.path.join(self.model_path, iteration_dir, epoch_dir, "train_data.npy")
    labelled = self.get_labeled_idx(iteration)
    try:
        return np.load(data_file)[labelled]
    except Exception:
        print("no train data saved for Iteration {}".format(iteration))
        return None
def train_representation_all(self, iteration, epoch):
# load train data
train_data_loc = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration), "{}_{:d}".format(self.epoch_name, epoch), "train_data.npy")
try:
train_data = np.load(train_data_loc)
except Exception as e:
print("no train data saved for Iteration {} Epoch {}".format(iteration, epoch))
train_data = None
return train_data
def train_representation_ulb(self, iteration, epoch):
    """Load the stored training representations of the unlabelled samples.

    The unlabelled rows are those returned by ``self.get_unlabeled_idx`` for
    the pool size of ``train_data.npy`` and the labelled indices of
    ``iteration``.

    :param iteration: iteration number (formatted with ``{:d}``)
    :param epoch: epoch identifier used in the folder name
    :return: the unlabelled rows, or None when loading or indexing fails
    """
    iteration_dir = "{}_{:d}".format(self.iteration_name, iteration)
    epoch_dir = "{}_{}".format(self.epoch_name, epoch)
    data_file = os.path.join(self.model_path, iteration_dir, epoch_dir, "train_data.npy")
    labelled = self.get_labeled_idx(iteration)
    try:
        pool = np.load(data_file)
        unlabelled = self.get_unlabeled_idx(pool_num=len(pool), lb_idx=labelled)
        return pool[unlabelled]
    except Exception:
        print("no train data saved for Iteration {}".format(iteration))
        return None
def train_labels_ulb(self, iteration):
    """Load the training labels of the samples that are unlabelled in ``iteration``.

    Labels are read from ``Training_data/training_dataset_label.pth`` below
    ``self.content_path`` and subset with ``self.get_unlabeled_idx``.

    :param iteration: iteration whose labelled indices define the unlabelled pool
    :return: the unlabelled labels converted with ``.numpy()``, or None when
        the labels cannot be loaded or indexed
    """
    training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth")
    lb_idxs = self.get_labeled_idx(iteration)
    try:
        training_labels = torch.load(training_data_loc, map_location="cpu")
        ulb_idxs = self.get_unlabeled_idx(len(training_labels), lb_idxs)
        training_labels = training_labels[ulb_idxs]
    except Exception:
        print("no train labels saved for Iteration {}".format(iteration))
        # Return early: falling through would call .numpy() on None and
        # raise AttributeError instead of reporting the missing labels.
        return None
    return training_labels.numpy()
def train_labels(self, iteration):
    """Load the training labels of the samples that are labelled in ``iteration``.

    The labelled indices come from ``index.json`` in the iteration folder
    (read with ``load_labelled_data_index``); the labels come from
    ``Training_data/training_dataset_label.pth`` below ``self.content_path``.

    :param iteration: iteration number (formatted with ``{:d}``)
    :return: the labelled labels converted with ``.numpy()``, or None when
        the labels cannot be loaded or indexed
    """
    training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth")
    index_file = os.path.join(self.model_path, "{}_{:d}".format(self.iteration_name, iteration), "index.json")
    # Reading the index stays outside the try block, as before: a missing
    # index file still raises instead of being reported as missing labels.
    lb_idxs = np.array(load_labelled_data_index(index_file))
    try:
        training_labels = torch.load(training_data_loc, map_location="cpu")
        training_labels = training_labels[lb_idxs]
    except Exception:
        print("no train labels saved for Iteration {}".format(iteration))
        # Return early: falling through would call .numpy() on None and
        # raise AttributeError instead of reporting the missing labels.
        return None
    return training_labels.numpy()
def train_labels_all(self):
    """Load all training labels.

    Labels are read from ``Training_data/training_dataset_label.pth`` below
    ``self.content_path``.

    :return: the labels converted with ``.numpy()``, or None when they cannot be loaded
    """
    training_data_loc = os.path.join(self.content_path, "Training_data", "training_dataset_label.pth")
    try:
        training_labels = torch.load(training_data_loc, map_location="cpu")
    except Exception:
        print("no train labels saved")
        # Return early: falling through would call .numpy() on None and
        # raise AttributeError instead of reporting the missing labels.
        return None
    return training_labels.numpy()
def test_representation(self, iteration, epoch):
data_loc = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration), "{}_{}".format(self.epoch_name, epoch), "test_data.npy")
try:
test_data = np.load(data_loc)
index_file = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration), "test_index.json")
if os.path.exists(index_file):
index = load_labelled_data_index(index_file)
test_data = test_data[index]
except Exception as e:
print("no test data saved for Iteration {} Epoch {}".format(iteration, epoch))
test_data = None
return test_data
def border_representation(self, iteration, epoch):
    """Load the stored border centers for one checkpoint.

    :param iteration: iteration identifier used in the folder name
    :param epoch: epoch number (formatted with ``{:d}``)
    :return: content of ``border_centers.npy``, or an empty array when it cannot be loaded
    """
    checkpoint_dir = os.path.join(
        self.model_path,
        "{}_{}".format(self.iteration_name, iteration),
        "{}_{:d}".format(self.epoch_name, epoch),
    )
    try:
        return np.load(os.path.join(checkpoint_dir, "border_centers.npy"))
    except Exception:
        print("no border points saved for Epoch {}".format(epoch))
        return np.array([])
def test_border_representation(self, iteration, epoch):
border_centers_loc = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration), "{}_{:d}".format(self.epoch_name, epoch),
"test_border_centers.npy")
try:
border_centers = np.load(border_centers_loc)
except Exception as e:
print("no border points saved for Iteration {} Epoch {}".format(iteration, epoch))
border_centers = np.array([])
return border_centers
def max_norm(self, iteration, epoch):
    """Return the largest row-wise norm among the labelled training representations.

    The labelled rows are those listed in the iteration's ``index.json``
    (read with ``load_labelled_data_index``); norms are taken along axis 1
    with ``np.linalg.norm``.

    :param iteration: iteration identifier used in the folder name
    :param epoch: epoch number (formatted with ``{:d}``)
    :return: the maximum norm, or None when the data cannot be loaded or indexed
    """
    iteration_dir = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration))
    data_file = os.path.join(iteration_dir, "{}_{:d}".format(self.epoch_name, epoch), "train_data.npy")
    # The index is read outside the try block, so a missing index.json raises.
    labelled = load_labelled_data_index(os.path.join(iteration_dir, "index.json"))
    try:
        row_norms = np.linalg.norm(np.load(data_file)[labelled], axis=1)
        return row_norms.max()
    except Exception:
        print("no train data saved for Iteration {} Epoch {}".format(iteration, epoch))
        return None
def prediction_function(self, iteration, epoch):
    """Restore the subject model of one checkpoint and return its ``prediction`` callable.

    Loads ``subject_model.pth`` into ``self.model``, moves the model to
    ``self.DEVICE`` and switches it to eval mode.

    :param iteration: iteration identifier used in the folder name
    :param epoch: epoch number (formatted with ``{:d}``)
    :return: ``self.model.prediction``
    """
    checkpoint_dir = os.path.join(
        self.model_path,
        "{}_{}".format(self.iteration_name, iteration),
        "{}_{:d}".format(self.epoch_name, epoch),
    )
    # Weights are first mapped to CPU, then the model is moved to the target device.
    weights = torch.load(os.path.join(checkpoint_dir, "subject_model.pth"), map_location=torch.device("cpu"))
    self.model.load_state_dict(weights)
    self.model.to(self.DEVICE)
    self.model.eval()
    return self.model.prediction
def feature_function(self, iteration, epoch):
    """Restore the subject model of one checkpoint and return its ``feature`` callable.

    Loads ``subject_model.pth`` into ``self.model``, rebinds ``self.model`` to
    the result of moving it to ``self.DEVICE`` and switches it to eval mode.

    :param iteration: iteration identifier used in the folder name
    :param epoch: epoch number (formatted with ``{:d}``)
    :return: ``self.model.feature``
    """
    checkpoint_dir = os.path.join(
        self.model_path,
        "{}_{}".format(self.iteration_name, iteration),
        "{}_{:d}".format(self.epoch_name, epoch),
    )
    # Weights are first mapped to CPU, then the model is moved to the target device.
    weights = torch.load(os.path.join(checkpoint_dir, "subject_model.pth"), map_location=torch.device("cpu"))
    self.model.load_state_dict(weights)
    self.model = self.model.to(self.DEVICE)
    self.model.eval()
    return self.model.feature
def get_pred(self, iteration, epoch, data):
    '''
    get the prediction scores of data under the subject model of one checkpoint
    :param iteration: iteration of the checkpoint to load
    :param epoch: epoch of the checkpoint to load
    :param data: numpy.ndarray, converted with torch.from_numpy and moved to self.DEVICE
    :return: the result of batch_run over the prediction function
        (described as numpy.ndarray by the original docstring)
    '''
    # The model is restored before the input is converted, as in the original order.
    predict = self.prediction_function(iteration, epoch)
    inputs = torch.from_numpy(data).to(self.DEVICE)
    return batch_run(predict, inputs)
def training_accu(self, iteration, epoch):
    """Evaluate the subject model of one checkpoint on the labelled training samples.

    The argmax over the last axis of the prediction scores is compared with
    the labels through ``evaluate_inv_accu``.

    :param iteration: iteration of the checkpoint
    :param epoch: epoch of the checkpoint
    :return: the value returned by ``evaluate_inv_accu``
    """
    # NOTE(review): train_representation_lb / train_labels_lb are not defined in
    # the visible part of this class -- confirm they exist with these signatures.
    representations = self.train_representation_lb(iteration, epoch)
    targets = self.train_labels_lb(iteration)
    scores = self.get_pred(iteration, epoch, representations)
    return evaluate_inv_accu(targets, scores.argmax(-1))
def testing_accu(self, iteration, epoch):
    """Evaluate the subject model of one checkpoint on the test samples.

    The argmax over the last axis of the prediction scores is compared with
    the test labels through ``evaluate_inv_accu``. When the checkpoint folder
    contains a ``test_index.json``, the labels are subset with the indices
    read by ``load_labelled_data_index``.

    :param iteration: iteration of the checkpoint
    :param epoch: epoch of the checkpoint
    :return: the value returned by ``evaluate_inv_accu``
    """
    # test_representation and get_pred both require the iteration; calling
    # them with the epoch alone raised TypeError.
    data = self.test_representation(iteration, epoch)
    # NOTE(review): test_labels is defined outside this section; its call is
    # left unchanged -- confirm whether it also expects the iteration.
    labels = self.test_labels(epoch)
    test_index_file = os.path.join(self.model_path, "{}_{}".format(self.iteration_name, iteration), "{}_{}".format(self.epoch_name, epoch), "test_index.json")
    if os.path.exists(test_index_file):
        index = load_labelled_data_index(test_index_file)
        labels = labels[index]
    pred = self.get_pred(iteration, epoch, data).argmax(-1)
    return evaluate_inv_accu(labels, pred)
def is_deltaB(self, iteration, epoch, data):
    """
    check whether input vectors are lying on delta-boundary or not
    :param iteration: iteration of the checkpoint used for prediction
    :param epoch: epoch of the checkpoint used for prediction
    :param data: numpy.ndarray
    :return: the result of is_B on the prediction scores (described by the
        original docstring as a boolean numpy.ndarray, True stands for is_delta_boundary)
    """
    scores = self.get_pred(iteration, epoch, data)
    return is_B(scores)
def single_checkpoint_path(self, iteration, epoch):
    """Build the folder path of one checkpoint below ``self.model_path``.

    :param iteration: iteration identifier used in the folder name
    :param epoch: epoch identifier used in the folder name
    :return: ``<model_path>/<iteration_name>_<iteration>/<epoch_name>_<epoch>``
    """
    iteration_dir = "{}_{}".format(self.iteration_name, iteration)
    epoch_dir = "{}_{}".format(self.epoch_name, epoch)
    return os.path.join(self.model_path, iteration_dir, epoch_dir)