Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions mapalign/dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,12 @@ def compute_nearest_neighbor_graph(K, n_neighbors=50):
return K


def compute_affinity(X, method='markov', eps=None, metric='euclidean'):
def compute_affinity(X, method='markov', f_neighbors = 0.01, eps=None, metric='euclidean'):
"""Compute the similarity or affinity matrix between the samples in X

:param X: A set of samples with number of rows > 1
:param method: 'markov' or 'cauchy' kernel (default: markov)
:param f_neighbors: Fraction of nearest neighbors to consider for automatic eps calculation
:param eps: scaling factor for kernel
:param metric: metric to compute pairwise distances
:return: a similarity matrix
Expand All @@ -147,8 +148,8 @@ def compute_affinity(X, method='markov', eps=None, metric='euclidean'):
import numpy as np
D = squareform(pdist(X, metric=metric))
if eps is None:
k = int(max(2, np.round(D.shape[0] * 0.01)))
eps = 2 * np.median(np.sort(D, axis=0)[k+1, :])**2
k = int(max(2, np.round(D.shape[0] * f_neighbors)))
eps = np.median(np.sort(D, axis=0)[0:k+1, :]**2)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if i remember this correctly, eps is based on the k+1 th entry rather than than everything upto that point. it basically provides a scaling factor given the sorted distance.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The approach currently implemented is more liberal than the one I propose and both of them are correct. All manifold learning approaches are based on local similarities and these should be emphasized - I think the median of all the values up to the k-th captures this property more accurately. Besides, I think that parametrization of the kernel in terms of average distances to a certain fraction of nearest neighbors is more natural, because it doesn't depend on the scale of the affinities that one uses.

if method == 'markov':
affinity_matrix = np.exp(-(D * D) / eps)
elif method == 'cauchy':
Expand Down
75 changes: 40 additions & 35 deletions mapalign/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
"""

import numpy as np
import scipy.sparse as sps

has_sklearn = True
try:
import sklearn
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics.pairwise import rbf_kernel
except ImportError:
has_sklearn = False

Expand Down Expand Up @@ -77,32 +81,8 @@ def compute_diffusion_map(L, alpha=0.5, n_components=None, diffusion_time=0,
raise ImportError('Checks require scikit-learn, but not found')

ndim = L.shape[0]
if overwrite:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

overwrite is no longer supported in the rewrite. i think this should still be used as a memory saving option.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mfalkiewicz - this is not supported in the refactor

L_alpha = L
else:
L_alpha = L.copy()

if alpha > 0:
# Step 2
d = np.array(L_alpha.sum(axis=1)).flatten()
d_alpha = np.power(d, -alpha)
if use_sparse:
L_alpha.data *= d_alpha[L_alpha.indices]
L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
L_alpha.data *= d_alpha[L_alpha.indices]
L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
else:
L_alpha = d_alpha[:, np.newaxis] * L_alpha
L_alpha = L_alpha * d_alpha[np.newaxis, :]

# Step 3
d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
if use_sparse:
L_alpha.data *= d_alpha[L_alpha.indices]
else:
L_alpha = d_alpha[:, np.newaxis] * L_alpha

M = L_alpha
M = _compute_markov_matrix(L, use_sparse = use_sparse, alpha = alpha)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

M = _compute_markov_matrix(L if overwrite else L.copy(), use_sparse = use_sparse, alpha = alpha)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding this comment and overwrite - I agree that it would be nice to have a memory saving option, but I am not sure if this refactor will allow to manipulate that. I don't know the details of the lexical scope of Python, but maybe you could enlighten me. _compute_markov_matrix creates a local binding A. If we pass an alias as an argument (L) rather than the object itself (L.copy()), is A also going to be an alias? In other words, if we pass an alias to variable that is in the lexical scope of function compute_diffusion_map as an argument to function _compute_markov_matrix, is _compute_markov_matrix going to mutate the object that the alias refers to OR create it's local copy and mutate that instead?

Sorry for slightly confusing description, but I don't know how to phrase this in a simpler way.


from scipy.sparse.linalg import eigs, eigsh
if eigen_solver is None:
Expand Down Expand Up @@ -130,6 +110,37 @@ def compute_diffusion_map(L, alpha=0.5, n_components=None, diffusion_time=0,
return_result)


def _compute_markov_matrix(A, use_sparse, alpha = 0):
"""
Computes Markov matrix from affinity matrix. Density normalization is optional.

:param A: Affinity matrix
:param alpha: Diffusion operator parameter
:return: Transition matrix
"""

if alpha > 0:
# Step 2
d = np.array(A.sum(axis=1)).flatten()
d_alpha = np.power(d, -alpha)
if use_sparse:
A.data *= d_alpha[A.indices]
A = sps.csr_matrix(A.transpose().toarray())
A.data *= d_alpha[A.indices]
A = sps.csr_matrix(A.transpose().toarray())
else:
A = d_alpha[:, np.newaxis] * A
A = A * d_alpha[np.newaxis, :]

# Step 3
d_alpha = np.power(np.array(A.sum(axis=1)).flatten(), -1)
if use_sparse:
A.data *= d_alpha[A.indices]
else:
A = d_alpha[:, np.newaxis] * A

return A

def _step_5(lambdas, vectors, ndim, n_components, diffusion_time, return_result):
"""
This is a helper function for diffusion map computation.
Expand Down Expand Up @@ -345,9 +356,8 @@ def _get_affinity_matrix(self, X, Y=None):
"""
if self.affinity == 'precomputed':
self.affinity_matrix_ = X
return self.affinity_matrix_
if self.affinity == 'nearest_neighbors':
if sparse.issparse(X):
if sps.issparse(X):
warnings.warn("Nearest neighbors affinity currently does "
"not support sparse input, falling back to "
"rbf affinity")
Expand All @@ -362,21 +372,16 @@ def _get_affinity_matrix(self, X, Y=None):
# currently only symmetric affinity_matrix supported
self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ +
self.affinity_matrix_.T)
return self.affinity_matrix_
if self.affinity == 'rbf':
self.gamma_ = (self.gamma
if self.gamma is not None else 1.0 / X.shape[1])
self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
return self.affinity_matrix_
if self.affinity in ['markov', 'cauchy']:
from .dist import compute_affinity
self.affinity_matrix_ = compute_affinity(X,
method=self.affinity,
eps=self.gamma,
metric=self.metric)
return self.affinity_matrix_
self.affinity_matrix_ = self.affinity(X)
return self.affinity_matrix_
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the function description should be modified if this function is not going to return the affinity matrix.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I re-added the return of affinity_matrix_, but not as a field in self. The reason is that in the code I am currently writing I want to be able to bind results of several affinity calculations to different fields in the object.


def fit(self, X, y=None):
"""Fit the model from data in X.
Expand Down Expand Up @@ -413,14 +418,14 @@ def fit(self, X, y=None):
raise ValueError(("'affinity' is expected to be an affinity "
"name or a callable. Got: %s") % self.affinity)

affinity_matrix = self._get_affinity_matrix(X)
self._get_affinity_matrix(X)
if self.use_variant:
self.embedding_ = compute_diffusion_map_psd(affinity_matrix,
self.embedding_ = compute_diffusion_map_psd(self.affinity_matrix_,
alpha=self.alpha,
n_components=self.n_components,
diffusion_time=self.diffusion_time)
else:
self.embedding_ = compute_diffusion_map(affinity_matrix,
self.embedding_ = compute_diffusion_map(self.affinity_matrix_,
alpha=self.alpha,
n_components=self.n_components,
diffusion_time=self.diffusion_time,
Expand Down