|
import numpy as np |
|
from sklearn.linear_model import Ridge |
|
from sklearn.cluster import Birch |
|
from pynndescent import NNDescent |
|
from sklearn.neighbors import NearestNeighbors |
|
|
|
|
|
def find_cluster(trajectories, sub_labels, new_sample):
    """Assign a new sample to an existing cluster, or reject it as an outlier.

    A neighbour of ``new_sample`` among ``trajectories`` selects a candidate
    cluster. The sample is accepted only if its distance to that neighbour is
    smaller than the largest neighbour distance observed between members of
    the candidate cluster.

    Parameters
    ----------
    trajectories : ndarray, shape (n_samples, n_features)
        Feature vectors of the known samples.
    sub_labels : ndarray, shape (n_samples,)
        Cluster label of every row of ``trajectories``.
    new_sample : ndarray, shape (n_features,)
        Feature vector of the sample to place.

    Returns
    -------
    tuple
        ``(cls_idx, nearest_neighbor_idx)`` when the sample is accepted,
        ``(-1, -1)`` otherwise.
    """
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(trajectories)
    distances, indices = nbrs.kneighbors(new_sample[np.newaxis, :])
    # NOTE(review): column 1 is the *second* closest row. That skips the
    # closest match, which is only the query itself when ``new_sample`` is
    # also a row of ``trajectories`` -- confirm this is intended for callers
    # that pass genuinely new samples.
    nearest_neighbor_idx = indices[0, 1]
    nearest_neighbor_dist = distances[0, 1]

    cls_idx = sub_labels[nearest_neighbor_idx]
    # A boolean mask always yields a 2-D (members, features) array. The
    # previous ``argwhere(...).squeeze()`` index collapsed to a single 1-D row
    # when the cluster had exactly one member.
    samples_in_cls = trajectories[np.asarray(sub_labels) == cls_idx]
    cls_size = samples_in_cls.shape[0]
    if cls_size < 2:
        # A one-member cluster has no intra-cluster neighbour distance to
        # compare against, so the sample cannot be accepted into it.
        return -1, -1

    # Index-building effort grows with the cluster size.
    n_trees = min(64, 5 + int(round(cls_size ** 0.5 / 20.0)))
    n_iters = max(5, int(round(np.log2(cls_size))))

    nnd = NNDescent(
        samples_in_cls,
        n_neighbors=2,
        metric="euclidean",
        n_trees=n_trees,
        n_iters=n_iters,
        max_candidates=60,
        verbose=False
    )
    _, dists = nnd.neighbor_graph
    # Column 1 is used as each member's neighbour distance (column 0 is
    # presumably the member itself -- verify against pynndescent).
    max_dist = dists[:, 1].max()

    if nearest_neighbor_dist < max_dist:
        return cls_idx, nearest_neighbor_idx
    return -1, -1
|
|
|
|
|
|
|
class TrajectoryManager:
    """Trajectory manager with no feedback.

    Clusters per-sample trajectory features and samples from the clusters
    that are not among the most populated ones.
    """

    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        """Store the embeddings and derive velocity/acceleration features.

        Parameters
        ----------
        embeddings_2d : ndarray, shape (train_num, epoch_num, 2)
            all 2d embeddings of representations by timevis
        cls_num : int
            the number of classes to cluster
        period : int
            We only look at the last *period* epochs of trajectory
        metric : str
            Which feature ``clustered`` uses: ``"v"`` (first differences
            along the epoch axis) or ``"a"`` (second differences).
        """
        self.embeddings_2d = embeddings_2d
        train_num, time_steps, _ = embeddings_2d.shape
        self.train_num = train_num
        self.time_steps = time_steps
        self.period = period
        self.metric = metric
        self.cls_num = cls_num

        # Only the trailing ``period`` epochs contribute to the features.
        window = self.embeddings_2d[:, -period:, :]
        self.v = window[:, 1:, :] - window[:, :-1, :]
        self.a = self.v[:, 1:, :] - self.v[:, :-1, :]

    def _not_selected(self):
        """Return the 1-D array of indices whose ``selected`` flag is 0."""
        # flatnonzero always returns a 1-D array. ``argwhere(...).squeeze()``
        # produced a 0-d array when exactly one index remained, which
        # ``np.random.choice`` interprets as a population size.
        return np.flatnonzero(self.selected == 0)

    def clustered(self):
        """Cluster the chosen feature with Birch and initialise sampling state.

        Sets ``predict_sub_labels`` (cluster label per sample),
        ``suspect_clean`` (up to three clusters with the largest member
        counts), ``sample_rate`` (1 per cluster, 0 for the ``suspect_clean``
        ones) and ``selected`` (all zeros).

        Raises
        ------
        ValueError
            If ``self.metric`` is neither ``"v"`` nor ``"a"``.
        """
        if self.metric == "v":
            features = self.v
        elif self.metric == "a":
            features = self.a
        else:
            # Fail before clustering. Previously this only printed a message
            # and then crashed on the unfitted estimator's ``labels_``.
            raise ValueError(
                f"Not a valid metric: {self.metric!r} (expected 'v' or 'a')"
            )

        brc = Birch(n_clusters=self.cls_num)
        brc.fit(features.reshape(self.train_num, -1))

        self.predict_sub_labels = brc.labels_
        # The three most populated clusters are excluded from sampling.
        self.suspect_clean = np.argsort(np.bincount(self.predict_sub_labels))[-3:]

        self.sample_rate = np.ones(self.cls_num)
        self.sample_rate[self.suspect_clean] = 0.
        self.selected = np.zeros(self.train_num)

    def sample_batch(self, budget):
        """Draw ``budget`` indices uniformly from the not-yet-selected samples.

        NOTE(review): the draw uses ``np.random.choice``'s default
        ``replace=True``, so the batch may contain duplicates; the sibling
        samplers in this file use ``replace=False`` -- confirm the intent.
        """
        return np.random.choice(self._not_selected(), size=budget)

    def sample_normal(self, budget):
        """Draw ``budget`` unselected indices weighted by ``1 - sample_rate``.

        The weight of a sample is that of its cluster, so with the rates set
        by ``clustered`` only members of the ``suspect_clean`` clusters can
        be drawn. Sampling is with replacement.
        """
        normal_rate = 1 - self.sample_rate
        per_sample_rate = normal_rate[self.predict_sub_labels]

        candidates = self._not_selected()
        weights = per_sample_rate[candidates]
        normed_rate = weights / np.sum(weights)
        return np.random.choice(candidates, p=normed_rate, size=budget)

    def update_belief(self, idxs):
        """Mark ``idxs`` as selected so later batches skip them."""
        if len(idxs) > 0:
            self.selected[idxs] = 1
|
|
|
|
|
class FeedbackTrajectoryManager(TrajectoryManager):
    """Trajectory manager with feedback.

    Samples from trajectory clusters, re-weighting each cluster by the
    accept/reject feedback recorded through ``update_belief``.
    """

    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def clustered(self):
        """Run the parent clustering and reset the stored feedback flags."""
        super().clustered()

        # 1 marks a sample the user accepted / rejected, 0 means no feedback.
        self.user_acc = np.zeros(self.train_num)
        self.user_rej = np.zeros(self.train_num)

    def sample_batch(self, budget, return_scores=False):
        """Draw ``budget`` distinct unselected samples, weighted by feedback.

        Each cluster gets an *exploit* rate (share of its queried samples
        that were accepted) and an *explore* rate (share of its samples not
        yet queried). Their sum, multiplied by the cluster's ``sample_rate``,
        is the sampling weight of every member.

        Parameters
        ----------
        budget : int
            Number of indices to draw (without replacement).
        return_scores : bool
            When true, also return the weight of every drawn index.

        Returns
        -------
        ndarray, or (ndarray, ndarray) when ``return_scores`` is true.

        Raises
        ------
        Exception
            If a sample is flagged both accepted and rejected.
        """
        # Boolean masks replace ``argwhere(...).squeeze()``, whose 0-d result
        # for a single match made the later ``len()`` calls raise TypeError.
        accepted = self.user_acc == 1
        rejected = self.user_rej == 1
        if np.any(accepted & rejected):
            raise Exception("Intersection between acc idxs and rej idxs!")

        labels = self.predict_sub_labels
        exploit_rate = np.zeros(self.cls_num)
        explore_rate = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            in_cls = labels == cls
            cls_size = np.count_nonzero(in_cls)
            if cls_size == 0:
                # No member can be drawn from an empty cluster; skipping it
                # avoids the 0/0 NaN the explore rate would otherwise get.
                continue
            acc_num = np.count_nonzero(accepted & in_cls)
            rej_num = np.count_nonzero(rejected & in_cls)
            query_sum = acc_num + rej_num
            if query_sum > 0:
                exploit_rate[cls] = acc_num / query_sum
            explore_rate[cls] = 1 - query_sum / cls_size

        rate = (explore_rate + exploit_rate) * self.sample_rate
        sample_rate = rate[labels]
        not_selected = np.flatnonzero(self.selected == 0)
        weights = sample_rate[not_selected]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)

        if return_scores:
            return s_idxs, sample_rate[s_idxs]
        return s_idxs

    def update_belief(self, acc_idxs, rej_idxs):
        """Record accepted / rejected indices and mark them as selected."""
        if len(acc_idxs) > 0:
            self.user_acc[acc_idxs] = 1
            self.selected[acc_idxs] = 1
        if len(rej_idxs) > 0:
            self.user_rej[rej_idxs] = 1
            self.selected[rej_idxs] = 1
|
|
|
|
|
class TBSampling(TrajectoryManager):
    """with no memory, for user study

    The accepted / rejected indices are passed on every call instead of being
    stored on the instance.
    """

    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_scores=True):
        """Draw ``budget`` distinct samples that have no feedback yet.

        Every candidate is weighted by the ``sample_rate`` of its cluster.

        Parameters
        ----------
        acc_idxs, rej_idxs : sequence of int
            Indices already accepted / rejected; they are excluded.
        budget : int
            Number of indices to draw (without replacement).
        return_scores : bool
            When true, also return the weight of every drawn index.

        Returns
        -------
        (ndarray, ndarray) when ``return_scores`` is true, else ndarray.

        Raises
        ------
        Exception
            If an index appears in both ``acc_idxs`` and ``rej_idxs``.
        """
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise Exception("Intersection between acc idxs and rej idxs!")

        has_feedback = np.zeros(self.train_num, dtype=bool)
        if len(acc_idxs) > 0:
            has_feedback[acc_idxs] = True
        if len(rej_idxs) > 0:
            has_feedback[rej_idxs] = True

        sample_rate = self.sample_rate[self.predict_sub_labels]
        # flatnonzero keeps the candidates 1-D even when a single one is
        # left; ``argwhere(...).squeeze()`` gave a 0-d array in that case and
        # broke ``np.random.choice``.
        not_selected = np.flatnonzero(~has_feedback)
        weights = sample_rate[not_selected]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)

        if return_scores:
            return s_idxs, sample_rate[s_idxs]
        return s_idxs
|
|
|
|
|
class FeedbackSampling(TrajectoryManager):
    """with no memory, for user study

    The accepted / rejected indices are passed on every call instead of being
    stored on the instance.
    """

    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_scores=True):
        """Draw ``budget`` distinct samples, weighted by the given feedback.

        Each cluster gets an *exploit* rate (share of its queried samples
        that were accepted) and an *explore* rate (share of its samples not
        yet queried). Their sum, multiplied by the cluster's ``sample_rate``,
        is the sampling weight of every member.

        Parameters
        ----------
        acc_idxs, rej_idxs : sequence of int
            Indices already accepted / rejected; they are excluded.
        budget : int
            Number of indices to draw (without replacement).
        return_scores : bool
            When true, also return the weight of every drawn index.

        Returns
        -------
        (ndarray, ndarray) when ``return_scores`` is true, else ndarray.

        Raises
        ------
        Exception
            If an index appears in both ``acc_idxs`` and ``rej_idxs``.
        """
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise Exception("Intersection between acc idxs and rej idxs!")

        accepted = np.zeros(self.train_num, dtype=bool)
        rejected = np.zeros(self.train_num, dtype=bool)
        if len(acc_idxs) > 0:
            accepted[acc_idxs] = True
        if len(rej_idxs) > 0:
            rejected[rej_idxs] = True

        labels = self.predict_sub_labels
        exploit_rate = np.zeros(self.cls_num)
        explore_rate = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            # Masks replace ``argwhere(...).squeeze()``, whose 0-d result for
            # a one-member cluster made ``len(cls_idxs)`` raise TypeError.
            in_cls = labels == cls
            cls_size = np.count_nonzero(in_cls)
            if cls_size == 0:
                # No member can be drawn from an empty cluster; skipping it
                # avoids the 0/0 NaN the explore rate would otherwise get.
                continue
            acc_num = np.count_nonzero(accepted & in_cls)
            rej_num = np.count_nonzero(rejected & in_cls)
            query_sum = acc_num + rej_num
            if query_sum > 0:
                exploit_rate[cls] = acc_num / query_sum
            explore_rate[cls] = 1 - query_sum / cls_size

        rate = (explore_rate + exploit_rate) * self.sample_rate
        sample_rate = rate[labels]
        not_selected = np.flatnonzero(~(accepted | rejected))
        weights = sample_rate[not_selected]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)

        if return_scores:
            return s_idxs, sample_rate[s_idxs]
        return s_idxs
|
|
|
|
|
class Recommender:
    """Recommend samples based on uncertainty and embeddings.

    Three trajectory features (position, velocity, acceleration) are
    clustered independently. Every cluster is scored by
    ``1 - cluster_size / train_num`` and a sample inherits the score of its
    cluster in each of the three clusterings.
    """

    def __init__(self, uncertainty, embeddings_2d, cls_num, period):
        """Store the inputs and derive the flattened trajectory features.

        Parameters
        ----------
        uncertainty : ndarray, shape (train_num,)
            Per-sample uncertainty, combined with the cluster scores.
        embeddings_2d : ndarray, shape (train_num, epoch_num, 2)
            2-D embedding of every sample at every epoch.
        cls_num : int
            Number of clusters for each clustering.
        period : int
            Only the last ``period`` epochs of a trajectory are used.
        """
        self.uncertainty = uncertainty
        self.embeddings_2d = embeddings_2d
        train_num, time_steps, _ = embeddings_2d.shape
        self.train_num = train_num
        self.time_steps = time_steps
        self.period = period
        self.cls_num = cls_num

        window = self.embeddings_2d[:, -period:, :]
        velocity = window[:, 1:, :] - window[:, :-1, :]
        acceleration = velocity[:, 1:, :] - velocity[:, :-1, :]
        # Every feature is stored flattened to (train_num, n_features).
        self.position = window.reshape(self.train_num, -1)
        self.a = acceleration.reshape(self.train_num, -1)
        self.v = velocity.reshape(self.train_num, -1)

    @property
    def _sample_p_scores(self):
        """Position-cluster score of every sample."""
        return self.p_scores[self.predict_p_sub_labels]

    @property
    def _sample_v_scores(self):
        """Velocity-cluster score of every sample."""
        return self.v_scores[self.predict_v_sub_labels]

    @property
    def _sample_a_scores(self):
        """Acceleration-cluster score of every sample."""
        return self.a_scores[self.predict_a_sub_labels]

    def _cluster_scores(self, features):
        """Cluster ``features`` with Birch; return (labels, per-cluster scores)."""
        brc = Birch(n_clusters=self.cls_num)
        brc.fit(features.reshape(self.train_num, -1))
        labels = brc.labels_
        # The smaller a cluster, the closer its score is to 1.
        scores = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            scores[cls] = 1 - np.sum(labels == cls) / self.train_num
        return labels, scores

    def clustered(self):
        """Cluster velocity, acceleration and position features.

        Sets ``predict_{v,a,p}_sub_labels`` and ``{v,a,p}_scores``.
        """
        self.predict_v_sub_labels, self.v_scores = self._cluster_scores(self.v)
        self.predict_a_sub_labels, self.a_scores = self._cluster_scores(self.a)
        self.predict_p_sub_labels, self.p_scores = self._cluster_scores(self.position)

    def _mean_scores(self):
        """Average of the uncertainty and the three cluster scores per sample."""
        total = (
            self.uncertainty
            + self._sample_v_scores
            + self._sample_a_scores
            + self._sample_p_scores
        )
        return total / 4

    def _unlabeled_indices(self, acc_idxs, rej_idxs):
        """Return the sorted 1-D indices that are in neither feedback list."""
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise Exception("Intersection between acc idxs and rej idxs!")
        labeled = np.zeros(self.train_num, dtype=bool)
        if len(acc_idxs) > 0:
            labeled[acc_idxs] = True
        if len(rej_idxs) > 0:
            labeled[rej_idxs] = True
        # flatnonzero stays 1-D when one index is left; the previous
        # ``argwhere(...).squeeze()`` became 0-d and broke np.random.choice.
        return np.flatnonzero(~labeled)

    def _fit_feedback_scores(self, acc_idxs, rej_idxs):
        """Fit a ridge regressor on the feedback and score every sample.

        Accepted samples are regression targets of 1, rejected ones of 0;
        the features are the uncertainty and the three cluster scores.

        Returns ``(scores, not_selected, model)``.
        """
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise Exception("Intersection between acc idxs and rej idxs!")

        X = np.vstack((
            self.uncertainty,
            self._sample_v_scores,
            self._sample_a_scores,
            self._sample_p_scores,
        )).T

        # Cast to int first: concatenating with an empty list/array yields a
        # float array, which cannot be used as an index.
        acc = np.asarray(acc_idxs, dtype=np.int64).reshape(-1)
        rej = np.asarray(rej_idxs, dtype=np.int64).reshape(-1)
        exp_idxs = np.concatenate((acc, rej), axis=0)

        target_Y = np.zeros(len(exp_idxs))
        target_Y[:len(acc)] = 1
        krr = Ridge(alpha=1.0)
        krr.fit(X[exp_idxs], target_Y)
        scores = krr.predict(X)

        not_selected = np.setdiff1d(np.arange(self.train_num), exp_idxs)
        return scores, not_selected, krr

    def sample_batch_init(self, acc_idxs, rej_idxs, budget):
        """Randomly draw ``budget`` unlabeled samples, favouring high scores.

        The draw is without replacement, with probability proportional to
        the mean of the uncertainty and the three cluster scores.

        Returns
        -------
        (ndarray, ndarray)
            Drawn indices and their scores.

        Raises
        ------
        Exception
            If an index appears in both ``acc_idxs`` and ``rej_idxs``.
        """
        scores = self._mean_scores()
        not_selected_idxs = self._unlabeled_indices(acc_idxs, rej_idxs)
        weights = scores[not_selected_idxs]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected_idxs, p=norm_rate, size=budget, replace=False)
        return s_idxs, scores[s_idxs]

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_coef=False):
        """Return the ``budget`` unlabeled samples with the highest ridge score.

        Parameters
        ----------
        acc_idxs, rej_idxs : sequence of int
            Accepted / rejected indices used to fit the regressor.
        budget : int
            Number of indices to return.
        return_coef : bool
            When true, also return the fitted regression coefficients.

        Returns
        -------
        (indices, scores) or (indices, scores, coef), ordered by ascending
        score.

        Raises
        ------
        Exception
            If an index appears in both ``acc_idxs`` and ``rej_idxs``.
        """
        scores, not_selected, krr = self._fit_feedback_scores(acc_idxs, rej_idxs)
        order = np.argsort(scores[not_selected])
        # An explicit start index keeps ``budget=0`` empty; the slice
        # ``[-0:]`` would return every candidate.
        top = order[max(len(order) - budget, 0):]
        selected_idxs = not_selected[top]
        if return_coef:
            return selected_idxs, scores[selected_idxs], krr.coef_
        return selected_idxs, scores[selected_idxs]

    def sample_batch_normal_init(self, acc_idxs, rej_idxs, budget):
        """Randomly draw ``budget`` unlabeled samples, favouring low scores.

        Same as ``sample_batch_init`` but the probability is proportional to
        ``1 - score``.

        Returns
        -------
        (ndarray, ndarray)
            Drawn indices and their scores.

        Raises
        ------
        Exception
            If an index appears in both ``acc_idxs`` and ``rej_idxs``.
        """
        scores = self._mean_scores()
        not_selected_idxs = self._unlabeled_indices(acc_idxs, rej_idxs)
        weights = 1 - scores[not_selected_idxs]
        norm_rate = weights / np.sum(weights)
        s_idxs = np.random.choice(not_selected_idxs, p=norm_rate, size=budget, replace=False)
        return s_idxs, scores[s_idxs]

    def sample_batch_normal(self, acc_idxs, rej_idxs, budget):
        """Return the ``budget`` unlabeled samples with the lowest ridge score.

        Returns
        -------
        (indices, scores), ordered by ascending score.

        Raises
        ------
        Exception
            If an index appears in both ``acc_idxs`` and ``rej_idxs``.
        """
        scores, not_selected, _ = self._fit_feedback_scores(acc_idxs, rej_idxs)
        lowest = np.argsort(scores[not_selected])[:budget]
        selected_idxs = not_selected[lowest]
        return selected_idxs, scores[selected_idxs]

    def _score_of_cluster(self, cluster_scores, cls):
        """Score for cluster ``cls``; a negative ``cls`` means "no cluster"."""
        if cls >= 0:
            return cluster_scores[cls]
        # An unassigned sample is scored like a cluster with a single member.
        return 1 - 1 / self.train_num

    def score_new_sample(self, sample_trajectory, return_nearest=False):
        """Score a trajectory that was not part of the clustered data.

        Parameters
        ----------
        sample_trajectory : ndarray, shape (n_epochs, 2)
            Trajectory of the new sample; presumably ``n_epochs`` must match
            the window used at construction so the features line up --
            verify against callers.
        return_nearest : bool
            When true, also return the neighbour index found for each of the
            three features (-1 when the sample was not assigned).

        Returns
        -------
        (p_score, v_score, a_score), optionally followed by
        (p_nearest_idx, v_nearest_idx, a_nearest_idx).
        """
        new_position = sample_trajectory.reshape(-1)
        new_velocity = sample_trajectory[1:, :] - sample_trajectory[:-1, :]
        new_a = (new_velocity[1:, :] - new_velocity[:-1, :]).reshape(-1)
        new_v = new_velocity.reshape(-1)

        position_cls, p_nearest_idx = find_cluster(self.position, self.predict_p_sub_labels, new_position)
        v_cls, v_nearest_idx = find_cluster(self.v, self.predict_v_sub_labels, new_v)
        a_cls, a_nearest_idx = find_cluster(self.a, self.predict_a_sub_labels, new_a)

        new_p_score = self._score_of_cluster(self.p_scores, position_cls)
        new_v_score = self._score_of_cluster(self.v_scores, v_cls)
        new_a_score = self._score_of_cluster(self.a_scores, a_cls)
        if return_nearest:
            return (new_p_score, new_v_score, new_a_score), (p_nearest_idx, v_nearest_idx, a_nearest_idx)
        return new_p_score, new_v_score, new_a_score
|
|
|
|
|
|
|
|
|
|
|
|