# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Returns points that minimizes the maximum distance of any point to a center. namely hausdorff distance
Implements the k-Center-Greedy method in
Ozan Sener and Silvio Savarese. A Geometric Approach to Active Learning for
Convolutional Neural Networks. https://arxiv.org/abs/1708.00489 2017
Distance metric defaults to l2 distance. Features used to calculate distance
are either raw features or if a model has transform method then uses the output
of model.transform(X).
Can be extended to a robust k centers algorithm that ignores a certain number of
outlier datapoints. Resulting centers are solution to multiple integer program.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import numpy as np
import math
from sklearn.metrics import pairwise_distances

class kCenterGreedy(object):

    def __init__(self, X, metric='euclidean'):
        self.features = X.reshape(len(X), -1)
        self.name = 'kcenter'
        self.metric = metric
        self.min_distances = None
        self.n_obs = self.features.shape[0]
        self.already_selected = []

    def update_distances(self, cluster_centers, only_new=True, reset_dist=False):
        """Update min distances given cluster centers.

        Args:
            cluster_centers: indices of cluster centers
            only_new: only calculate distances for newly selected points and
                update min_distances.
            reset_dist: whether to reset min_distances.
        """
        if reset_dist:
            self.min_distances = None
        if only_new:
            cluster_centers = [d for d in cluster_centers
                               if d not in self.already_selected]
        if len(cluster_centers) > 0:
            # Update min_distances for all examples given the new cluster centers.
            x = self.features[cluster_centers]
            dist = pairwise_distances(self.features, x, metric=self.metric)
            if self.min_distances is None:
                self.min_distances = np.min(dist, axis=1).reshape(-1, 1)
            else:
                self.min_distances = np.minimum(self.min_distances, dist)

    def select_batch_with_budgets(self, already_selected, budgets, return_min=False):
        """Diversity-promoting active learning method that greedily forms a batch
        to minimize the maximum distance to a cluster center among all unlabeled
        datapoints.

        Args:
            already_selected: indices of datapoints already selected
            budgets: batch size (number of new points to select)
            return_min: if True, also return the final covering (Hausdorff) radius

        Returns:
            indices of points selected to minimize distance to cluster centers
        """
        print('Calculating distances...')
        t0 = time.time()
        self.update_distances(already_selected, only_new=False, reset_dist=True)
        t1 = time.time()
        print("Calculated distances for {:d} points in {:.2f} seconds.".format(
            len(already_selected), t1 - t0))

        new_batch = []
        for _ in range(budgets):
            ind = np.argmax(self.min_distances)
            # New examples should not be in already selected since those points
            # should have min_distance of zero to a cluster center.
            assert ind not in already_selected
            self.update_distances([ind], only_new=True, reset_dist=False)
            new_batch.append(ind)
        print('Hausdorff distance is {:.2f} with {:d} points'.format(
            self.min_distances.max(), len(already_selected) + len(new_batch)))

        self.already_selected = np.concatenate((already_selected, np.array(new_batch)))
        if return_min:
            return new_batch, self.min_distances.max()
        return new_batch

    def covering_perc(self, p):
        """Return the radius needed to cover a fraction p of the datapoints,
        i.e. the p-quantile of the current min_distances."""
        sorted_dists = np.sort(self.min_distances.reshape(-1))
        return sorted_dists[int(self.n_obs * p)]

    def hausdorff(self):
        """Return the current covering radius, i.e. the directed Hausdorff
        distance from the full data set to the selected centers."""
        return self.min_distances.max()

    def select_batch_with_cn(self, already_selected, r_max, c_c0, d_d0, p=0.95, return_min=False):
        """Greedily adds the point farthest from the current centers until the
        covering radius, rescaled by c_c0 and d_d0, drops below r_max.

        Args:
            already_selected: indices of datapoints already selected
            r_max: target covering radius; selection stops once
                r_cover / c_c0 / d_d0 < r_max
            c_c0, d_d0: scaling factors applied to the covering radius
            p: quantile kept for the commented-out variant below (unused here)
            return_min: if True, also return the final covering (Hausdorff) radius

        Returns:
            indices of points selected to minimize distance to cluster centers
        """
        print('Calculating distances...')
        t0 = time.time()
        self.update_distances(already_selected, only_new=False, reset_dist=True)
        t1 = time.time()
        print("Calculated distances for {:d} points in {:.2f} seconds.".format(
            len(already_selected), t1 - t0))

        new_batch = []
        while True:
            ind = np.argmax(self.min_distances)
            r_cover = self.min_distances[ind][0]
            if r_cover / c_c0 / d_d0 < r_max:
                break
            # New examples should not be in already selected since those points
            # should have min_distance of zero to a cluster center.
            assert ind not in already_selected
            self.update_distances([ind], only_new=True, reset_dist=False)
            new_batch.append(ind)
        # sorted = np.sort(self.min_distances.reshape(-1))
        # r_cover = sorted[int(self.n_obs*p)-1]
        print('Hausdorff distance is {:.2f} with {:d} points'.format(
            r_cover, len(already_selected) + len(new_batch)))

        self.already_selected = np.concatenate((already_selected, np.array(new_batch)))
        if return_min:
            return new_batch, self.min_distances.max()
        return new_batch

def fps(data, num):
    """Farthest point sampling via DGL (requires torch, dgl and a CUDA device)."""
    import torch
    from dgl.geometry import farthest_point_sampler

    data = torch.from_numpy(data[np.newaxis, :, :]).to(device=torch.device("cuda"))
    point_idxs = farthest_point_sampler(data, num).squeeze(0)
    point_idxs = point_idxs.cpu().numpy()
    return point_idxs

if __name__ == "__main__":
    data = np.random.rand(500, 10)
    selected_idxs = np.random.choice(500, size=10, replace=False)
    kc = kCenterGreedy(data)
    _ = kc.select_batch_with_budgets(selected_idxs, 50)
    c0 = kc.hausdorff()
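    # Illustrative follow-up (added; not part of the original script): reuse c0
    # as a reference radius for the radius-based selection. c_c0 = d_d0 = 1.0 are
    # placeholder scalings and 0.9 * c0 is an arbitrary target radius.
    print("Covering radius c0 = {:.2f}".format(c0))
    print("95% covering radius = {:.2f}".format(kc.covering_perc(0.95)))
    _ = kc.select_batch_with_cn(kc.already_selected, r_max=0.9 * c0, c_c0=1.0, d_d0=1.0)
    # Optional comparison with DGL's farthest point sampler; skipped gracefully
    # when torch, dgl, or a CUDA device is unavailable.
    try:
        print(fps(data.astype(np.float32), 10))
    except Exception as err:
        print("fps demo skipped: {}".format(err))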