File size: 15,772 Bytes
f291f4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.cluster import Birch
from pynndescent import NNDescent
from sklearn.neighbors import NearestNeighbors
# TODO random ignore

def find_cluster(trajectories, sub_labels, new_sample):
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(trajectories)
    distances, indices = nbrs.kneighbors(new_sample[np.newaxis,:])
    nearest_neighbor_idx = indices[0, 1]
    nearest_neighbor_dist = distances[0, 1]

    cls_idx = sub_labels[nearest_neighbor_idx]
    samples_in_cls = trajectories[np.argwhere(sub_labels==cls_idx).squeeze()]


    # number of trees in random projection forest
    n_trees = min(64, 5 + int(round(samples_in_cls.shape[0] ** 0.5 / 20.0)))
    # max number of nearest neighbor iters to perform
    n_iters = max(5, int(round(np.log2(samples_in_cls.shape[0]))))
    # get nearest neighbors
    nnd = NNDescent(
        samples_in_cls,
        n_neighbors=2,
        metric="euclidean",
        n_trees=n_trees,
        n_iters=n_iters,
        max_candidates=60,
        verbose=False
    )
    _, dists = nnd.neighbor_graph
    dists = dists[:, 1]
    max_dist = dists.max()
    if nearest_neighbor_dist< max_dist:
        return cls_idx, nearest_neighbor_idx
    else:
        return -1, -1

    

class TrajectoryManager:
    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        """ trajectory manager with no feedback
        sample abnormal samples based on trajectories
        Parameters
        ----------
        samples: ndarray, shape(train_num, repr_dim)
        embeddings_2d : ndarray, shape (train_num, epoch_num, 2)
            all 2d embeddings of representations by timevis
        cls_num: int 
            the number of classes to cluster
        period: int
            We only look at the last *period* epochs of trajectory
        """
        self.embeddings_2d = embeddings_2d
        train_num,time_steps, _ = embeddings_2d.shape
        self.train_num = train_num
        self.time_steps = time_steps
        self.period = period
        self.metric = metric
        self.cls_num = cls_num

        self.v = self.embeddings_2d[:, -period:,:][:,1:,:] - self.embeddings_2d[:, -period:,:][:,:-1,:]
        self.a = self.v[:,1:,:]-self.v[:,:-1,:]
    
    def clustered(self):
        brc = Birch(n_clusters=self.cls_num)
        if self.metric == "v":
            brc.fit(self.v.reshape(self.train_num, -1))
        elif self.metric == "a":
            brc.fit(self.a.reshape(self.train_num, -1))
        else:
            print("Not a valid metric")
        
        self.predict_sub_labels = brc.labels_
        self.suspect_clean = np.argsort(np.bincount(self.predict_sub_labels))[-3:]

        # to be updated each time
        self.sample_rate = np.ones(self.cls_num)
        self.sample_rate[self.suspect_clean] = 0.
        self.selected = np.zeros(self.train_num)
    
    def sample_batch(self, budget):
        not_selected = np.argwhere(self.selected==0).squeeze()
        s_idxs = np.random.choice(not_selected, size=budget)
        return s_idxs
        
    def sample_normal(self, budget):
        # sample class
        normal_rate = 1 - self.sample_rate
        sample_rate = normal_rate[self.predict_sub_labels]
        # check how many left
        not_selected = np.argwhere(self.selected==0).squeeze()
        # select one
        normed_rate = sample_rate[not_selected]/np.sum(sample_rate[not_selected])
        s_idxs = np.random.choice(not_selected, p=normed_rate,size=budget)
        return s_idxs
    
    def update_belief(self, idxs):
        # update parameters
        if len(idxs)>0:
            self.selected[idxs] = 1
    

class FeedbackTrajectoryManager(TrajectoryManager):
    """ trajectory manager with feedback
    sample abnormal samples based on trajectories
    """
    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)
    
    def clustered(self):
        super().clustered()
        # to be updated
        # self.selected
        # self.sample_rate
        self.user_acc = np.zeros(self.train_num)
        self.user_rej = np.zeros(self.train_num)
    
    def sample_batch(self, budget, return_scores=False):
        acc_idxs = np.argwhere(self.user_acc==1).squeeze()
        rej_idxs = np.argwhere(self.user_rej==1).squeeze() 

        acc_rate = np.zeros(self.train_num)
        rej_rate = np.zeros(self.train_num)
        if len(acc_idxs)>0:
            acc_rate[acc_idxs]=1.
        if len(rej_idxs)>0:
            rej_rate[rej_idxs]=1.

        if len(np.intersect1d(acc_idxs, rej_idxs))>0:
            raise Exception("Intersection between acc idxs and rej idxs!")

        exploit_rate = np.zeros(self.cls_num)
        explore_rate = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            cls_idxs = np.argwhere(self.predict_sub_labels==cls).squeeze()
            acc_num = np.sum(acc_rate[cls_idxs])
            rej_num = np.sum(rej_rate[cls_idxs])
            query_sum = acc_num + rej_num
            if query_sum > 0:
                exploit_rate[cls] = acc_num/query_sum
            explore_rate[cls] = 1 - query_sum/len(cls_idxs)
        
        # remove clean cls
        rate = (explore_rate + exploit_rate)* self.sample_rate
        sample_rate = rate[self.predict_sub_labels]
        not_selected = np.argwhere(self.selected==0).squeeze()
        norm_rate = sample_rate[not_selected]/np.sum(sample_rate[not_selected])
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)

        if return_scores:
            scores = sample_rate[s_idxs]
            return s_idxs, scores
        return s_idxs
    
    def update_belief(self, acc_idxs, rej_idxs):
        if len(acc_idxs)>0:
            self.user_acc[acc_idxs]=1
            self.selected[acc_idxs] = 1
        if len(rej_idxs)>0:
            self.user_rej[rej_idxs]=1
            self.selected[rej_idxs] = 1
    

class TBSampling(TrajectoryManager):
    """with no memory, for user study"""
    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_scores=True):
        selected = np.zeros(self.train_num)
        if len(acc_idxs)>0:
            selected[acc_idxs] = 1.
        if len(rej_idxs)>0:
            selected[rej_idxs] = 1.
        if len(np.intersect1d(acc_idxs, rej_idxs))>0:
            raise Exception("Intersection between acc idxs and rej idxs!")
        
        sample_rate = self.sample_rate[self.predict_sub_labels]
        not_selected = np.argwhere(selected==0).squeeze()
        norm_rate = sample_rate[not_selected]/np.sum(sample_rate[not_selected])
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)
        if return_scores:
            scores = sample_rate[s_idxs]
            return s_idxs, scores
        return s_idxs


class FeedbackSampling(TrajectoryManager):
    """with no memory, for user study"""
    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_scores=True):
        acc_rate = np.zeros(self.train_num)
        rej_rate = np.zeros(self.train_num)
        selected = np.zeros(self.train_num)
        if len(acc_idxs)>0:
            acc_rate[acc_idxs]=1.
            selected[acc_idxs] = 1.
        if len(rej_idxs)>0:
            rej_rate[rej_idxs]=1.
            selected[rej_idxs] = 1.
        if len(np.intersect1d(acc_idxs, rej_idxs))>0:
            raise Exception("Intersection between acc idxs and rej idxs!")

        exploit_rate = np.zeros(self.cls_num)
        explore_rate = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            cls_idxs = np.argwhere(self.predict_sub_labels==cls).squeeze()
            acc_num = np.sum(acc_rate[cls_idxs])
            rej_num = np.sum(rej_rate[cls_idxs])
            query_sum = acc_num + rej_num
            if query_sum > 0:
                exploit_rate[cls] = acc_num/query_sum
            explore_rate[cls] = 1 - query_sum/len(cls_idxs)
        
        # remove clean cls
        rate = (explore_rate + exploit_rate)* self.sample_rate
        sample_rate = rate[self.predict_sub_labels]
        not_selected = np.argwhere(selected==0).squeeze()
        norm_rate = sample_rate[not_selected]/np.sum(sample_rate[not_selected])
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)
        if return_scores:
            scores = sample_rate[s_idxs]
            return s_idxs, scores
        return s_idxs

# TODO: extension for new features. make features a dictionary
class Recommender:
    def __init__(self, uncertainty, embeddings_2d, cls_num, period):
        """ Recommend samples based on uncertainty and embeddings
        """
        self.uncertainty = uncertainty
        self.embeddings_2d = embeddings_2d
        train_num,time_steps, _ = embeddings_2d.shape
        self.train_num = train_num
        self.time_steps = time_steps
        self.period = period
        self.cls_num = cls_num
        
        self.position = self.embeddings_2d[:, -period:,:].reshape(self.train_num, -1)
        self.v = self.embeddings_2d[:, -period:,:][:,1:,:] - self.embeddings_2d[:, -period:,:][:,:-1,:]
        self.a = (self.v[:,1:,:]-self.v[:,:-1,:]).reshape(self.train_num, -1)
        self.v = self.v.reshape(self.train_num, -1)
    
    @property
    def _sample_p_scores(self):
        return self.p_scores[self.predict_p_sub_labels]
    
    @property
    def _sample_v_scores(self):
        return self.v_scores[self.predict_v_sub_labels]
    
    @property
    def _sample_a_scores(self):
        return self.a_scores[self.predict_a_sub_labels]
    
    def clustered(self):
        brc = Birch(n_clusters=self.cls_num)
        brc.fit(self.v.reshape(self.train_num, -1))
        self.predict_v_sub_labels = brc.labels_
        self.v_scores = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            self.v_scores[cls] = 1 - np.sum(self.predict_v_sub_labels==cls)/self.train_num

        brc = Birch(n_clusters=self.cls_num)
        brc.fit(self.a.reshape(self.train_num, -1))
        self.predict_a_sub_labels = brc.labels_
        self.a_scores = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            self.a_scores[cls] = 1 - np.sum(self.predict_a_sub_labels==cls)/self.train_num

        brc = Birch(n_clusters=self.cls_num)
        brc.fit(self.position.reshape(self.train_num, -1))
        self.predict_p_sub_labels = brc.labels_
        self.p_scores = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            self.p_scores[cls] = 1 - np.sum(self.predict_p_sub_labels==cls)/self.train_num

    def sample_batch_init(self, acc_idxs, rej_idxs, budget):
        scores = (self.uncertainty + self.v_scores[self.predict_v_sub_labels]+self.a_scores[self.predict_a_sub_labels]+self.p_scores[self.predict_p_sub_labels])/4
        selected = np.zeros(self.train_num)
        if len(acc_idxs)>0:
            selected[acc_idxs] = 1.
        if len(rej_idxs)>0:
            selected[rej_idxs] = 1.
        if len(np.intersect1d(acc_idxs, rej_idxs))>0:
            raise Exception("Intersection between acc idxs and rej idxs!")
        not_selected_idxs = np.argwhere(selected==0).squeeze()
        norm_rate = scores[not_selected_idxs]/np.sum(scores[not_selected_idxs])
        s_idxs = np.random.choice(not_selected_idxs, p=norm_rate, size=budget, replace=False)
        return s_idxs, scores[s_idxs]
    
    def sample_batch(self, acc_idxs, rej_idxs, budget, return_coef=False):
        if len(np.intersect1d(acc_idxs, rej_idxs))>0:
            raise Exception("Intersection between acc idxs and rej idxs!")
            
        s1 = self.uncertainty
        s2 = self.v_scores[self.predict_v_sub_labels]
        s3 = self.a_scores[self.predict_a_sub_labels]
        s4 = self.p_scores[self.predict_p_sub_labels]
        X = np.vstack((s1,s2,s3,s4)).transpose([1,0])

        exp_idxs = np.concatenate((acc_idxs, rej_idxs), axis=0)
        target_X = X[exp_idxs]
        target_Y = np.zeros(len(exp_idxs))
        target_Y[:len(acc_idxs)] = 1
        krr = Ridge(alpha=1.0)
        krr.fit(target_X, target_Y)
        scores = krr.predict(X)

        not_selected = np.setdiff1d(np.arange(self.train_num), exp_idxs)
        remain_scores = scores[not_selected]
        args = np.argsort(remain_scores)[-budget:]
        selected_idxs = not_selected[args]
        if return_coef:
            return selected_idxs, scores[selected_idxs], krr.coef_
        return selected_idxs, scores[selected_idxs]
    
    def sample_batch_normal_init(self, acc_idxs, rej_idxs, budget):
        # scores = (self.uncertainty + self.cls_scores[self.predict_sub_labels])/2
        scores = (self.uncertainty + self.v_scores[self.predict_v_sub_labels]+self.a_scores[self.predict_a_sub_labels]+self.p_scores[self.predict_p_sub_labels])/4
        
        selected = np.zeros(self.train_num)
        if len(acc_idxs)>0:
            selected[acc_idxs] = 1.
        if len(rej_idxs)>0:
            selected[rej_idxs] = 1.
        if len(np.intersect1d(acc_idxs, rej_idxs))>0:
            raise Exception("Intersection between acc idxs and rej idxs!")
        not_selected_idxs = np.argwhere(selected==0).squeeze()

        norm_rate = (1 - scores[not_selected_idxs])/np.sum((1 - scores[not_selected_idxs]))
        s_idxs = np.random.choice(not_selected_idxs, p=norm_rate, size=budget, replace=False)
        return s_idxs, scores[s_idxs]
    
    def sample_batch_normal(self, acc_idxs, rej_idxs, budget):
        if len(np.intersect1d(acc_idxs, rej_idxs))>0:
            raise Exception("Intersection between acc idxs and rej idxs!")
            
        s1 = self.uncertainty
        s2 = self.v_scores[self.predict_v_sub_labels]
        s3 = self.a_scores[self.predict_a_sub_labels]
        s4 = self.p_scores[self.predict_p_sub_labels]
        X = np.vstack((s1,s2,s3,s4)).transpose([1,0])

        exp_idxs = np.concatenate((acc_idxs, rej_idxs), axis=0)
        target_X = X[exp_idxs]
        target_Y = np.zeros(len(exp_idxs))
        target_Y[:len(acc_idxs)] = 1
        krr = Ridge(alpha=1.0)
        krr.fit(target_X, target_Y)
        scores = krr.predict(X)

        not_selected = np.setdiff1d(np.arange(self.train_num), exp_idxs)
        remain_scores = scores[not_selected]
        args = np.argsort(remain_scores)[:budget]
        selected_idxs = not_selected[args]
        return selected_idxs, scores[selected_idxs]
    
    def score_new_sample(self, sample_trajectory, return_nearest=False):
        new_position = sample_trajectory.reshape(-1)
        new_v = sample_trajectory[1:, :] - sample_trajectory[:-1, :]
        new_a = (new_v[1:,:]-new_v[:-1,:]).reshape(-1)
        new_v = new_v.reshape(-1)

        position_cls, p_nearest_idx = find_cluster(self.position, self.predict_p_sub_labels, new_position)
        v_cls, v_nearest_idx = find_cluster(self.v, self.predict_v_sub_labels, new_v)
        a_cls, a_nearest_idx = find_cluster(self.a, self.predict_a_sub_labels, new_a)

        new_p_score = self.p_scores[position_cls] if position_cls>=0 else 1-1/self.train_num
        new_v_score = self.v_scores[v_cls] if v_cls>=0 else 1-1/self.train_num
        new_a_score = self.a_scores[a_cls] if a_cls>=0 else 1-1/self.train_num
        if return_nearest:
            return (new_p_score, new_v_score, new_a_score), (p_nearest_idx, v_nearest_idx, a_nearest_idx)
        return new_p_score, new_v_score, new_a_score