Peverell
/

code_training_dynamic

Model card Files Files and versions Community

code_training_dynamic / saved_models /codesearch_simp /singleVis /intrinsic_dim.py

SalazarPevelll

f291f4a 11 months ago

3.38 kB

	from audioop import mul
	import numpy as np
	from pynndescent import NNDescent
	from sklearn.linear_model import LinearRegression
	from tqdm import tqdm


	class IntrinsicDim:
	def __init__(self, data, metric="euclidean"):
	# normalize and if redundant
	self.data = np.unique(data.reshape(len(data), -1), axis=0)

	self.metric = metric
	self.name = "Intrinsic Dimension"


	def find_mu(self):
	# number of trees in random projection forest
	n_trees = min(64, 5 + int(round((self.data.shape[0]) ** 0.5 / 20.0)))
	# max number of nearest neighbor iters to perform
	n_iters = max(5, int(round(np.log2(self.data.shape[0]))))
	# distance metric
	# get nearest neighbors
	nnd = NNDescent(
	self.data,
	n_neighbors=3,
	metric=self.metric,
	n_trees=n_trees,
	n_iters=n_iters,
	max_candidates=10,
	verbose=False
	)
	_, knn_dists = nnd.neighbor_graph
	mu = knn_dists[:, 2] / knn_dists[:, 1]
	return mu

	def estimate_id_fast(self):
	mu = self.find_mu()
	N = self.data.shape[0]
	sort_idx = np.argsort(mu)
	Femp = np.arange(N)/N
	lr = LinearRegression(fit_intercept=False)
	lr.fit(np.log(mu[sort_idx]).reshape(-1,1), -np.log(1-Femp).reshape(-1,1))
	d = lr.coef_[0][0]
	return d

	def estimate_id(self):
	N = self.data.shape[0]
	mu = np.zeros(N)
	for i in tqdm(range(N)):
	dist = np.sort(np.sqrt(np.sum((self.data[i]-self.data)**2, axis=1)))
	r1, r2 = dist[dist>0][:2]
	mu[i]=r2/r1
	sort_idx = np.argsort(mu)
	Femp = np.arange(N)/N
	lr = LinearRegression(fit_intercept=False)
	lr.fit(np.log(mu[sort_idx]).reshape(-1,1), -np.log(1-Femp).reshape(-1,1))
	d = lr.coef_[0][0]

	return d

	def twonn_dimension(self, return_xy=False):
	N = len(self.data)
	mu = []
	for i in tqdm(range(N)):
	dist = np.sort(np.sqrt(np.sum((self.data[i]-self.data)**2, axis=1)))
	r1, r2 = dist[dist>0][:2]
	mu.append((i+1,r2/r1))
	sigma_i = dict(zip(range(1,len(mu)+1), np.array(sorted(mu, key=lambda x: x[1]))[:,0].astype(int)))
	mu = dict(mu)
	F_i = {}
	for i in mu:
	F_i[sigma_i[i]] = i/N
	x = np.log([mu[i] for i in sorted(mu.keys())])
	y = np.array([1-F_i[i] for i in sorted(mu.keys())])
	x = x[y>0]
	y = y[y>0]
	y = -1*np.log(y)
	d = np.linalg.lstsq(np.vstack([x, np.zeros(len(x))]).T, y, rcond=None)[0][0]
	if return_xy:
	return d, x, y
	else:
	return d

	def twonn_dimension_fast(self):
	N = len(self.data)
	mu = self.find_mu().tolist()
	mu = list(enumerate(mu, start=1))
	sigma_i = dict(zip(range(1,len(mu)+1), np.array(sorted(mu, key=lambda x: x[1]))[:,0].astype(int)))
	mu = dict(mu)
	F_i = {}
	for i in mu:
	F_i[sigma_i[i]] = i/N
	x = np.log([mu[i] for i in sorted(mu.keys())])
	y = np.array([1-F_i[i] for i in sorted(mu.keys())])
	x = x[y>0]
	y = y[y>0]
	y = -1*np.log(y)
	d = np.linalg.lstsq(np.vstack([x, np.zeros(len(x))]).T, y, rcond=None)[0][0]
	return d