SalazarPevelll
be
f291f4a
raw
history blame
15.8 kB
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.cluster import Birch
from pynndescent import NNDescent
from sklearn.neighbors import NearestNeighbors
# TODO random ignore
def find_cluster(trajectories, sub_labels, new_sample):
    """Decide whether a new sample belongs to an existing cluster.

    The sample is matched to a reference trajectory with a ball-tree
    neighbour query, and that trajectory's cluster is accepted only if the
    match distance is smaller than the largest neighbour distance observed
    inside that cluster.

    Parameters
    ----------
    trajectories : ndarray, shape (n_samples, n_features)
        Flattened reference trajectories.
    sub_labels : ndarray, shape (n_samples,)
        Cluster label of every reference trajectory.
    new_sample : ndarray, shape (n_features,)
        Flattened trajectory of the sample to place.

    Returns
    -------
    (cls_idx, nearest_neighbor_idx)
        The accepted cluster label and the index of the matched reference
        trajectory, or ``(-1, -1)`` when the sample is not close enough to
        the cluster (or the cluster is too small to measure its spread).
    """
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(trajectories)
    distances, indices = nbrs.kneighbors(new_sample[np.newaxis, :])
    # NOTE(review): column 1 is the *second* closest reference point; this
    # presumably assumes the query itself is part of `trajectories` - confirm.
    nearest_neighbor_idx = indices[0, 1]
    nearest_neighbor_dist = distances[0, 1]

    cls_idx = sub_labels[nearest_neighbor_idx]
    # flatnonzero keeps the index array 1-D even when the cluster has a single
    # member, so samples_in_cls always stays (n_members, n_features).
    samples_in_cls = trajectories[np.flatnonzero(sub_labels == cls_idx)]
    n_members = samples_in_cls.shape[0]
    if n_members < 2:
        # A single-member cluster has no intra-cluster neighbour distance to
        # compare against, so the sample cannot be accepted into it.
        return -1, -1

    # number of trees in random projection forest
    n_trees = min(64, 5 + int(round(n_members ** 0.5 / 20.0)))
    # max number of nearest neighbor iters to perform
    n_iters = max(5, int(round(np.log2(n_members))))
    # get nearest neighbors
    nnd = NNDescent(
        samples_in_cls,
        n_neighbors=2,
        metric="euclidean",
        n_trees=n_trees,
        n_iters=n_iters,
        max_candidates=60,
        verbose=False
    )
    _, dists = nnd.neighbor_graph
    # Column 1 holds each member's distance to its second graph neighbour
    # (presumably column 0 is the member itself - confirm against pynndescent).
    max_dist = dists[:, 1].max()

    if nearest_neighbor_dist < max_dist:
        return cls_idx, nearest_neighbor_idx
    return -1, -1
class TrajectoryManager:
    """Trajectory manager with no feedback.

    Clusters samples by the shape of their recent 2D embedding trajectories
    and samples candidates from the clusters that are not considered clean.
    """

    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        """Store the embeddings and derive per-step velocity and acceleration.

        Parameters
        ----------
        embeddings_2d : ndarray, shape (train_num, epoch_num, 2)
            all 2d embeddings of representations by timevis
        cls_num : int
            the number of classes to cluster
        period : int
            We only look at the last *period* epochs of trajectory
        metric : str
            Which feature to cluster on: "v" (first differences of the
            trajectory) or "a" (second differences).
        """
        self.embeddings_2d = embeddings_2d
        train_num, time_steps, _ = embeddings_2d.shape
        self.train_num = train_num
        self.time_steps = time_steps
        self.period = period
        self.metric = metric
        self.cls_num = cls_num

        recent = self.embeddings_2d[:, -period:, :]
        # Finite differences along the epoch axis.
        self.v = recent[:, 1:, :] - recent[:, :-1, :]
        self.a = self.v[:, 1:, :] - self.v[:, :-1, :]

    def clustered(self):
        """Cluster the trajectories with Birch and initialise sampling state.

        Sets ``predict_sub_labels``, ``suspect_clean``, ``sample_rate`` and
        ``selected``.

        Raises
        ------
        ValueError
            If ``self.metric`` is neither "v" nor "a".
        """
        # Validate before clustering: previously an unknown metric was only
        # printed and the method then failed on the unfitted model.
        if self.metric == "v":
            features = self.v
        elif self.metric == "a":
            features = self.a
        else:
            raise ValueError(
                "Not a valid metric: {!r} (expected 'v' or 'a')".format(self.metric)
            )

        brc = Birch(n_clusters=self.cls_num)
        brc.fit(features.reshape(self.train_num, -1))
        self.predict_sub_labels = brc.labels_

        # The three most populated clusters are treated as clean.
        self.suspect_clean = np.argsort(np.bincount(self.predict_sub_labels))[-3:]
        # to be updated each time
        self.sample_rate = np.ones(self.cls_num)
        self.sample_rate[self.suspect_clean] = 0.
        self.selected = np.zeros(self.train_num)

    def _not_selected(self):
        """Return the 1-D array of indices that have not been selected yet."""
        # flatnonzero stays 1-D even when a single index is left; a squeezed
        # 0-d array would be read by np.random.choice as a population size.
        return np.flatnonzero(self.selected == 0)

    def sample_batch(self, budget):
        """Draw *budget* indices uniformly (with replacement) from the unselected samples."""
        not_selected = self._not_selected()
        s_idxs = np.random.choice(not_selected, size=budget)
        return s_idxs

    def sample_normal(self, budget):
        """Draw *budget* unselected indices (with replacement) from the clean clusters.

        The per-sample weight is ``1 - sample_rate`` of the sample's cluster,
        so only clusters whose sample rate was zeroed can be drawn.
        """
        # sample class
        normal_rate = 1 - self.sample_rate
        sample_rate = normal_rate[self.predict_sub_labels]
        # check how many left
        not_selected = self._not_selected()
        # select one
        weights = sample_rate[not_selected]
        normed_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected, p=normed_rate, size=budget)
        return s_idxs

    def update_belief(self, idxs):
        """Mark *idxs* as selected so they are excluded from later batches."""
        # update parameters
        if len(idxs) > 0:
            self.selected[idxs] = 1
class FeedbackTrajectoryManager(TrajectoryManager):
    """Trajectory manager with feedback.

    Samples abnormal samples based on trajectories, re-weighting each cluster
    by the accept/reject feedback recorded through ``update_belief``.
    """

    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def clustered(self):
        """Cluster as the base class does, then reset the feedback records."""
        super().clustered()
        # 1 marks a sample the user accepted / rejected; updated by update_belief.
        self.user_acc = np.zeros(self.train_num)
        self.user_rej = np.zeros(self.train_num)

    def sample_batch(self, budget, return_scores=False):
        """Draw *budget* distinct unselected samples, weighted by cluster feedback.

        Each cluster gets ``exploit + explore``, where *exploit* is the
        accepted fraction of its queried samples and *explore* is the fraction
        of its samples not queried yet. The sum is multiplied by the cluster's
        ``sample_rate`` so clusters flagged as clean keep weight zero.

        Parameters
        ----------
        budget : int
            Number of indices to draw (without replacement).
        return_scores : bool
            When true, also return the un-normalised weight of each drawn sample.

        Raises
        ------
        ValueError
            If a sample is recorded as both accepted and rejected.
        """
        acc_mask = self.user_acc == 1
        rej_mask = self.user_rej == 1
        if np.any(acc_mask & rej_mask):
            raise ValueError("Intersection between acc idxs and rej idxs!")

        exploit_rate = np.zeros(self.cls_num)
        explore_rate = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            members = self.predict_sub_labels == cls
            # Counting on boolean masks works for clusters of any size; the
            # squeezed index arrays used before broke len() on singletons.
            cls_size = np.count_nonzero(members)
            if cls_size == 0:
                # No sample carries this label, so its rate is never read.
                continue
            acc_num = np.count_nonzero(acc_mask & members)
            rej_num = np.count_nonzero(rej_mask & members)
            query_sum = acc_num + rej_num
            if query_sum > 0:
                exploit_rate[cls] = acc_num / query_sum
            explore_rate[cls] = 1 - query_sum / cls_size

        # remove clean cls
        rate = (explore_rate + exploit_rate) * self.sample_rate
        sample_rate = rate[self.predict_sub_labels]
        # flatnonzero keeps the candidates 1-D even when one sample is left.
        not_selected = np.flatnonzero(self.selected == 0)
        weights = sample_rate[not_selected]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)
        if return_scores:
            scores = sample_rate[s_idxs]
            return s_idxs, scores
        return s_idxs

    def update_belief(self, acc_idxs, rej_idxs):
        """Record accepted and rejected indices and mark both as selected."""
        if len(acc_idxs) > 0:
            self.user_acc[acc_idxs] = 1
            self.selected[acc_idxs] = 1
        if len(rej_idxs) > 0:
            self.user_rej[rej_idxs] = 1
            self.selected[rej_idxs] = 1
class TBSampling(TrajectoryManager):
    """with no memory, for user study

    The already-labelled indices are passed in on every call instead of being
    stored on the instance. ``clustered()`` must have been called so that
    ``sample_rate`` and ``predict_sub_labels`` exist.
    """

    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_scores=True):
        """Draw *budget* distinct unlabelled samples weighted by cluster sample rate.

        Parameters
        ----------
        acc_idxs, rej_idxs : sequence of int
            Indices already accepted / rejected; they are excluded from the draw.
        budget : int
            Number of indices to draw (without replacement).
        return_scores : bool
            When true, also return the un-normalised weight of each drawn sample.

        Raises
        ------
        ValueError
            If an index appears in both *acc_idxs* and *rej_idxs*.
        """
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise ValueError("Intersection between acc idxs and rej idxs!")

        labelled = np.zeros(self.train_num, dtype=bool)
        # The length guards skip empty inputs, which may not be usable as
        # index arrays (e.g. an empty float ndarray).
        if len(acc_idxs) > 0:
            labelled[acc_idxs] = True
        if len(rej_idxs) > 0:
            labelled[rej_idxs] = True

        sample_rate = self.sample_rate[self.predict_sub_labels]
        # flatnonzero keeps the candidates 1-D even when one sample is left; a
        # squeezed 0-d array made np.random.choice reject the weights.
        not_selected = np.flatnonzero(~labelled)
        weights = sample_rate[not_selected]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)
        if return_scores:
            scores = sample_rate[s_idxs]
            return s_idxs, scores
        return s_idxs
class FeedbackSampling(TrajectoryManager):
    """with no memory, for user study

    Feedback-weighted sampling where the accepted and rejected indices are
    passed in on every call instead of being stored on the instance.
    ``clustered()`` must have been called so that ``sample_rate`` and
    ``predict_sub_labels`` exist.
    """

    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_scores=True):
        """Draw *budget* distinct unlabelled samples, weighted by cluster feedback.

        Each cluster gets ``exploit + explore``, where *exploit* is the
        accepted fraction of its queried samples and *explore* is the fraction
        of its samples not queried yet. The sum is multiplied by the cluster's
        ``sample_rate`` so clusters flagged as clean keep weight zero.

        Parameters
        ----------
        acc_idxs, rej_idxs : sequence of int
            Indices the user accepted / rejected so far.
        budget : int
            Number of indices to draw (without replacement).
        return_scores : bool
            When true, also return the un-normalised weight of each drawn sample.

        Raises
        ------
        ValueError
            If an index appears in both *acc_idxs* and *rej_idxs*.
        """
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise ValueError("Intersection between acc idxs and rej idxs!")

        acc_mask = np.zeros(self.train_num, dtype=bool)
        rej_mask = np.zeros(self.train_num, dtype=bool)
        # The length guards skip empty inputs, which may not be usable as
        # index arrays (e.g. an empty float ndarray).
        if len(acc_idxs) > 0:
            acc_mask[acc_idxs] = True
        if len(rej_idxs) > 0:
            rej_mask[rej_idxs] = True

        exploit_rate = np.zeros(self.cls_num)
        explore_rate = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            members = self.predict_sub_labels == cls
            # Counting on boolean masks works for clusters of any size; the
            # squeezed index arrays used before broke len() on singletons.
            cls_size = np.count_nonzero(members)
            if cls_size == 0:
                # No sample carries this label, so its rate is never read.
                continue
            acc_num = np.count_nonzero(acc_mask & members)
            rej_num = np.count_nonzero(rej_mask & members)
            query_sum = acc_num + rej_num
            if query_sum > 0:
                exploit_rate[cls] = acc_num / query_sum
            explore_rate[cls] = 1 - query_sum / cls_size

        # remove clean cls
        rate = (explore_rate + exploit_rate) * self.sample_rate
        sample_rate = rate[self.predict_sub_labels]
        # flatnonzero keeps the candidates 1-D even when one sample is left.
        not_selected = np.flatnonzero(~(acc_mask | rej_mask))
        weights = sample_rate[not_selected]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)
        if return_scores:
            scores = sample_rate[s_idxs]
            return s_idxs, scores
        return s_idxs
# TODO: support additional features by storing the per-feature data in a dictionary.
class Recommender:
    """Recommend samples based on uncertainty and embeddings.

    Every sample is described by four scores: the supplied uncertainty and
    the rarity of its cluster under three views of its recent 2D trajectory
    (position, velocity and acceleration). Rarity of a cluster is
    ``1 - cluster_size / train_num``.
    """

    def __init__(self, uncertainty, embeddings_2d, cls_num, period):
        """Store the inputs and derive flattened trajectory features.

        Parameters
        ----------
        uncertainty : ndarray, shape (train_num,)
            Per-sample uncertainty score.
        embeddings_2d : ndarray, shape (train_num, epoch_num, 2)
            2D embedding of every sample at every epoch.
        cls_num : int
            Number of clusters requested from Birch for each feature view.
        period : int
            Only the last *period* epochs of each trajectory are used.
        """
        self.uncertainty = uncertainty
        self.embeddings_2d = embeddings_2d
        train_num, time_steps, _ = embeddings_2d.shape
        self.train_num = train_num
        self.time_steps = time_steps
        self.period = period
        self.cls_num = cls_num

        recent = self.embeddings_2d[:, -period:, :]
        # Finite differences along the epoch axis, flattened per sample.
        velocity = recent[:, 1:, :] - recent[:, :-1, :]
        self.position = recent.reshape(self.train_num, -1)
        self.a = (velocity[:, 1:, :] - velocity[:, :-1, :]).reshape(self.train_num, -1)
        self.v = velocity.reshape(self.train_num, -1)

    @property
    def _sample_p_scores(self):
        """Per-sample rarity score of the position cluster."""
        return self.p_scores[self.predict_p_sub_labels]

    @property
    def _sample_v_scores(self):
        """Per-sample rarity score of the velocity cluster."""
        return self.v_scores[self.predict_v_sub_labels]

    @property
    def _sample_a_scores(self):
        """Per-sample rarity score of the acceleration cluster."""
        return self.a_scores[self.predict_a_sub_labels]

    def _cluster_rarity(self, features):
        """Cluster *features* with Birch; return (labels, per-cluster rarity)."""
        brc = Birch(n_clusters=self.cls_num)
        brc.fit(features.reshape(self.train_num, -1))
        labels = brc.labels_
        counts = np.bincount(labels, minlength=self.cls_num)[:self.cls_num]
        return labels, 1 - counts / self.train_num

    def clustered(self):
        """Cluster the velocity, acceleration and position views and score each cluster."""
        self.predict_v_sub_labels, self.v_scores = self._cluster_rarity(self.v)
        self.predict_a_sub_labels, self.a_scores = self._cluster_rarity(self.a)
        self.predict_p_sub_labels, self.p_scores = self._cluster_rarity(self.position)

    @staticmethod
    def _check_disjoint(acc_idxs, rej_idxs):
        """Raise if an index is listed as both accepted and rejected."""
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise ValueError("Intersection between acc idxs and rej idxs!")

    def _mean_scores(self):
        """Unweighted mean of the four per-sample scores."""
        return (
            self.uncertainty
            + self._sample_v_scores
            + self._sample_a_scores
            + self._sample_p_scores
        ) / 4

    def _unlabelled(self, acc_idxs, rej_idxs):
        """Return the 1-D array of indices in neither feedback list."""
        labelled = np.zeros(self.train_num, dtype=bool)
        for idxs in (acc_idxs, rej_idxs):
            # Skip empty inputs: they may not be usable as index arrays.
            if len(idxs) > 0:
                labelled[idxs] = True
        # flatnonzero stays 1-D even when a single candidate is left; a
        # squeezed 0-d array breaks np.random.choice.
        return np.flatnonzero(~labelled)

    def sample_batch_init(self, acc_idxs, rej_idxs, budget):
        """Draw *budget* distinct unlabelled samples, favouring high mean scores.

        Returns
        -------
        (s_idxs, scores)
            The drawn indices and their mean scores.

        Raises
        ------
        ValueError
            If an index appears in both *acc_idxs* and *rej_idxs*.
        """
        self._check_disjoint(acc_idxs, rej_idxs)
        scores = self._mean_scores()
        not_selected_idxs = self._unlabelled(acc_idxs, rej_idxs)
        weights = scores[not_selected_idxs]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected_idxs, p=norm_rate, size=budget, replace=False)
        return s_idxs, scores[s_idxs]

    def _fit_feedback(self, acc_idxs, rej_idxs):
        """Fit a ridge model on the feedback; return (scores, not_selected, model).

        Accepted samples are the positive targets (1) and rejected samples
        the negative targets (0); the fitted model then scores every sample.
        """
        self._check_disjoint(acc_idxs, rej_idxs)
        # Cast to int explicitly: concatenating an empty list with integer
        # indices would otherwise produce a float array that cannot index X.
        acc = np.asarray(acc_idxs, dtype=int).reshape(-1)
        rej = np.asarray(rej_idxs, dtype=int).reshape(-1)

        X = np.vstack((
            self.uncertainty,
            self._sample_v_scores,
            self._sample_a_scores,
            self._sample_p_scores,
        )).transpose([1, 0])

        exp_idxs = np.concatenate((acc, rej), axis=0)
        target_Y = np.zeros(len(exp_idxs))
        target_Y[:len(acc)] = 1

        # NOTE(review): the model is fitted on the feedback rows only, so this
        # presumably needs at least one feedback index - confirm with callers.
        krr = Ridge(alpha=1.0)
        krr.fit(X[exp_idxs], target_Y)
        scores = krr.predict(X)
        not_selected = np.setdiff1d(np.arange(self.train_num), exp_idxs)
        return scores, not_selected, krr

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_coef=False):
        """Return the *budget* unlabelled samples with the highest predicted score.

        Parameters
        ----------
        acc_idxs, rej_idxs : sequence of int
            Indices the user accepted / rejected so far.
        budget : int
            Number of indices to return (fewer if not enough remain).
        return_coef : bool
            When true, also return the ridge coefficients of the four scores.

        Raises
        ------
        ValueError
            If an index appears in both *acc_idxs* and *rej_idxs*.
        """
        scores, not_selected, krr = self._fit_feedback(acc_idxs, rej_idxs)
        order = np.argsort(scores[not_selected])
        # Slice from an explicit start: order[-budget:] would return every
        # candidate when budget == 0.
        top = order[max(order.size - budget, 0):]
        selected_idxs = not_selected[top]
        if return_coef:
            return selected_idxs, scores[selected_idxs], krr.coef_
        return selected_idxs, scores[selected_idxs]

    def sample_batch_normal_init(self, acc_idxs, rej_idxs, budget):
        """Draw *budget* distinct unlabelled samples, favouring low mean scores.

        Returns
        -------
        (s_idxs, scores)
            The drawn indices and their mean scores.

        Raises
        ------
        ValueError
            If an index appears in both *acc_idxs* and *rej_idxs*.
        """
        self._check_disjoint(acc_idxs, rej_idxs)
        scores = self._mean_scores()
        not_selected_idxs = self._unlabelled(acc_idxs, rej_idxs)
        # Invert the score so low-scoring samples get the larger weight.
        weights = 1 - scores[not_selected_idxs]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected_idxs, p=norm_rate, size=budget, replace=False)
        return s_idxs, scores[s_idxs]

    def sample_batch_normal(self, acc_idxs, rej_idxs, budget):
        """Return the *budget* unlabelled samples with the lowest predicted score.

        Raises
        ------
        ValueError
            If an index appears in both *acc_idxs* and *rej_idxs*.
        """
        scores, not_selected, _ = self._fit_feedback(acc_idxs, rej_idxs)
        order = np.argsort(scores[not_selected])
        selected_idxs = not_selected[order[:max(budget, 0)]]
        return selected_idxs, scores[selected_idxs]

    def score_new_sample(self, sample_trajectory, return_nearest=False):
        """Score an unseen trajectory against the existing clusters.

        Parameters
        ----------
        sample_trajectory : ndarray, shape (steps, 2)
            Trajectory of the new sample; presumably *steps* must match the
            window stored at construction - verify against caller.
        return_nearest : bool
            When true, also return the matched reference index per view
            (-1 where no cluster was accepted).

        Returns
        -------
        (p_score, v_score, a_score), optionally followed by
        (p_nearest_idx, v_nearest_idx, a_nearest_idx).
        """
        new_position = sample_trajectory.reshape(-1)
        step = sample_trajectory[1:, :] - sample_trajectory[:-1, :]
        new_a = (step[1:, :] - step[:-1, :]).reshape(-1)
        new_v = step.reshape(-1)

        position_cls, p_nearest_idx = find_cluster(self.position, self.predict_p_sub_labels, new_position)
        v_cls, v_nearest_idx = find_cluster(self.v, self.predict_v_sub_labels, new_v)
        a_cls, a_nearest_idx = find_cluster(self.a, self.predict_a_sub_labels, new_a)

        # A sample that fits no cluster gets the score of a one-member cluster.
        unmatched = 1 - 1 / self.train_num
        new_p_score = self.p_scores[position_cls] if position_cls >= 0 else unmatched
        new_v_score = self.v_scores[v_cls] if v_cls >= 0 else unmatched
        new_a_score = self.a_scores[a_cls] if a_cls >= 0 else unmatched
        if return_nearest:
            return (new_p_score, new_v_score, new_a_score), (p_nearest_idx, v_nearest_idx, a_nearest_idx)
        return new_p_score, new_v_score, new_a_score