Source code for msmbuilder.metrics.baseclasses

import abc
import re
import numpy as np
import warnings

from core import cdist, pdist


class AbstractDistanceMetric(object):
    """Abstract base class for distance metrics.

    All distance metrics should inherit from this abstract class.

    Provides a naive implementation of all_pairwise and one_to_many in terms
    of the abstract method one_to_all, which may be overridden by subclasses.
    """

    # Python 2 metaclass hook. Python 3 ignores this attribute, so the
    # abstract-method enforcement is only active when run under Python 2.
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def prepare_trajectory(self, trajectory):
        """Prepare trajectory in a format that is more convenient to take
        distances on.

        Parameters
        ----------
        trajectory : msmbuilder.Trajectory
            Trajectory to prepare

        Returns
        -------
        prepared_traj : array-like
            the exact form of the prepared_traj is subclass specific, but it
            should support fancy indexing

        Notes
        -----
        For RMSD, this is going to mean making word-aligned padded arrays
        (TheoData) suitable for fast calculation, for dihedral-space
        distances means computing the dihedral angles, etc.
        """
        return

    @abc.abstractmethod
    def one_to_all(self, prepared_traj1, prepared_traj2, index1):
        """Calculate the vector of distances from the index1th frame of
        prepared_traj1 to all of the frames in prepared_traj2.

        Parameters
        ----------
        prepared_traj1 : prepared_trajectory
            First prepared trajectory
        prepared_traj2 : prepared_trajectory
            Second prepared trajectory
        index1 : int
            index in `prepared_traj1`

        Returns
        -------
        distances : ndarray
            vector of distances of length len(prepared_traj2)

        Notes
        -----
        Although this might seem to be a special case of one_to_many(), it
        can often be implemented in a much more optimized way because it
        doesn't require construction of the indices2 array and array slicing
        in python is kind of slow.
        """
        return

    def one_to_many(self, prepared_traj1, prepared_traj2, index1, indices2):
        """Calculate a vector of distances from the index1th frame of
        prepared_traj1 to all of the indices2 frames of prepared_traj2.

        Parameters
        ----------
        prepared_traj1 : prepared_trajectory
            First prepared trajectory
        prepared_traj2 : prepared_trajectory
            Second prepared trajectory
        index1 : int
            index in `prepared_traj1`
        indices2 : ndarray
            list of indices in `prepared_traj2` to calculate the distances to

        Returns
        -------
        Vector of distances of length len(indices2)

        Notes
        -----
        This default slices `prepared_traj2` with `indices2` and delegates to
        one_to_all. A subclass should be able to provide a more efficient
        implementation of this.
        """
        return self.one_to_all(prepared_traj1, prepared_traj2[indices2], index1)

    def all_pairwise(self, prepared_traj):
        """Calculate condensed distance matrix of all pairwise distances

        See `scipy.spatial.distance.squareform` for information on how to
        convert the condensed distance matrix to a redundant square matrix

        Parameters
        ----------
        prepared_traj : array_like
            Prepared trajectory

        Returns
        -------
        Y : ndarray
            A 1D array of length n * (n - 1) / 2 (n = len(prepared_traj))
            containing the distance from each frame to each later frame

        See Also
        --------
        fast_pdist
        scipy.spatial.distance.squareform
        """
        traj_length = len(prepared_traj)
        # Floor division keeps the size an integer: true division would hand
        # np.ones a float, which it rejects as an array shape.
        n_pairs = traj_length * (traj_length - 1) // 2
        # Pre-fill with -1 so any slot that was never written stands out.
        output = -1 * np.ones(n_pairs)

        start = 0
        for i in range(traj_length):
            # Only frames after i: the (i, j) pairs with j <= i are redundant.
            cmp_indices = np.arange(i + 1, traj_length)
            stop = start + len(cmp_indices)
            output[start:stop] = self.one_to_many(prepared_traj, prepared_traj,
                                                  i, cmp_indices)
            start = stop
        return output
class Vectorized(AbstractDistanceMetric):
    """Represent MSM frames as vectors in some arbitrary vector space, and then
    use standard vector space metrics.

    Some examples of this might be extracting the contact map or dihedral
    angles.

    In order to be a full featured DistanceMetric, a subclass of Vectorized
    implements its own prepare_trajectory() method, Vectorized provides the
    remainder.

    allowable_scipy_metrics gives the list of metrics which your client can
    use. If the vector space that you're projecting your trajectory onto is
    just a space of boolean vectors, then you probably don't want to allow
    euclidean distance, for instance.

    default_scipy_metric is the metric that will be used by your default
    metric if the user leaves the 'metric' field blank/unspecified.

    default_scipy_p is the default value of 'p' that will be used if left
    unspecified. The value 'p' is ONLY used for the minkowski (pnorm) metric,
    so otherwise the scipy.spatial.distance code ignores it anyways.

    See http://docs.scipy.org/doc/scipy/reference/spatial.distance.html for a
    description of all the distance metrics and how they work.
    """

    # 'kulsinki' is a historical misspelling of scipy's 'kulsinski'. It is
    # kept so existing callers still pass validation, and the correct
    # spelling is accepted as well.
    # NOTE(review): confirm that core.cdist/pdist accept 'kulsinski'.
    allowable_scipy_metrics = ['braycurtis', 'canberra', 'chebyshev',
                               'cityblock', 'correlation', 'cosine',
                               'euclidean', 'minkowski', 'sqeuclidean',
                               'dice', 'kulsinki', 'kulsinski', 'matching',
                               'rogerstanimoto', 'russellrao',
                               'sokalmichener', 'sokalsneath', 'yule',
                               'seuclidean', 'mahalanobis', 'sqmahalanobis']

    def __init__(self, metric='euclidean', p=2, V=None, VI=None):
        """Create a Vectorized metric

        Parameters
        ----------
        metric : str
            Distance metric to equip the vector space with. Must be one of
            the names in `allowable_scipy_metrics`. See
            http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
            for details
        p : int, optional
            p-norm order, used for metric='minkowski'
        V : ndarray, optional
            variances, used for metric='seuclidean'
        VI : ndarray, optional
            inverse covariance matrix, used for metric='mahalanobis'

        Raises
        ------
        TypeError
            If `metric` is not in `allowable_scipy_metrics`
        ValueError
            If metric='seuclidean' without `V`, or metric is 'mahalanobis' or
            'sqmahalanobis' without `VI`
        """
        self._validate_scipy_metric(metric)
        self.metric = metric
        self.p = p
        self.V = V
        self.VI = VI

        if self.metric == 'seuclidean' and V is None:
            raise ValueError('To use seuclidean, you need to supply V')
        if self.metric in ['mahalanobis', 'sqmahalanobis'] and VI is None:
            raise ValueError('To used mahalanobis or sqmahalanobis, you need to supply VI')

    def _validate_scipy_metric(self, metric):
        """Ensure that "metric" is an "allowable" metric (in
        allowable_scipy_metrics); raise TypeError otherwise."""
        if metric not in self.allowable_scipy_metrics:
            raise TypeError('%s is an unrecognize metric. "metric" must be one of %s' % (metric, str(self.allowable_scipy_metrics)))

    def one_to_many(self, prepared_traj1, prepared_traj2, index1, indices2):
        """Calculate a vector of distances from one frame of the first
        trajectory to many frames of the second trajectory

        The distances calculated are from the `index1`th frame of
        `prepared_traj1` to the frames in `prepared_traj2` with indices
        `indices2`

        Parameters
        ----------
        prepared_traj1 : ndarray
            First prepared trajectory
        prepared_traj2 : ndarray
            Second prepared trajectory
        index1 : int
            index in `prepared_traj1` (Python int or NumPy integer scalar)
        indices2 : ndarray
            list of indices in `prepared_traj2` to calculate the distances to

        Returns
        -------
        distances : ndarray
            Vector of distances of length len(indices2)

        Raises
        ------
        TypeError
            If `index1` is not an integer
        """
        # Accept NumPy integer scalars too: indices taken from arrays
        # (np.argmin, np.arange, ...) are not instances of the builtin int.
        if not isinstance(index1, (int, np.integer)):
            raise TypeError('index1 must be of type int.')
        # [[index1]] keeps the single frame 2D, giving a (len(indices2), 1)
        # result whose only column is the distance vector.
        out = cdist(prepared_traj2[indices2], prepared_traj1[[index1]],
                    metric=self.metric, p=self.p, V=self.V, VI=self.VI)
        return out[:, 0]

    def one_to_all(self, prepared_traj1, prepared_traj2, index1):
        """Measure the distance from one frame to every frame in a trajectory

        The distances calculated are from the `index1`th frame of
        `prepared_traj1` to all the frames in `prepared_traj2`. Although this
        is similar to one_to_many, it can often be computed faster

        Parameters
        ----------
        prepared_traj1 : ndarray
            First prepared trajectory
        prepared_traj2 : ndarray
            Second prepared trajectory
        index1 : int
            index in `prepared_traj1` (Python int or NumPy integer scalar)

        Returns
        -------
        distances : ndarray
            A vector of distances of length len(prepared_traj2)

        Raises
        ------
        TypeError
            If `index1` is not an integer
        """
        # Accept NumPy integer scalars as well as the builtin int.
        if not isinstance(index1, (int, np.integer)):
            raise TypeError('index1 must be of type int.')
        out2 = cdist(prepared_traj2, prepared_traj1[[index1]],
                     metric=self.metric, p=self.p, V=self.V, VI=self.VI)
        return out2[:, 0]

    def many_to_many(self, prepared_traj1, prepared_traj2, indices1, indices2):
        """Get a matrix of distances from each frame in a set to each other
        frame in a second set.

        Calculate a MATRIX of distances from the frames in prepared_traj1 with
        indices `indices1` to the frames in prepared_traj2 with indices
        `indices2`, using supplied metric.

        Parameters
        ----------
        prepared_traj1 : ndarray
            First prepared trajectory
        prepared_traj2 : ndarray
            Second prepared trajectory
        indices1 : array_like
            list of indices in `prepared_traj1` to calculate the distances
            from
        indices2 : array_like
            list of indices in `prepared_traj2` to calculate the distances to

        Returns
        -------
        distances : ndarray
            A 2D array of shape len(indices1) * len(indices2)
        """
        out = cdist(prepared_traj1[indices1], prepared_traj2[indices2],
                    metric=self.metric, p=self.p, V=self.V, VI=self.VI)
        return out

    def all_to_all(self, prepared_traj1, prepared_traj2):
        """Get a matrix of distances from all frames in one traj to all frames
        in another

        Emits a RuntimeWarning when both arguments are the same object, since
        all_pairwise is the better tool for that case.

        Parameters
        ----------
        prepared_traj1 : ndarray
            First prepared trajectory
        prepared_traj2 : ndarray
            Second prepared trajectory

        Returns
        -------
        distances : ndarray
            A 2D array of shape len(prepared_traj1) * len(prepared_traj2)
        """
        if prepared_traj1 is prepared_traj2:
            # warnings.warn takes (message, category). Passing a string as
            # the category raises TypeError instead of warning.
            message = ("it's not recommended to use this method to calculate "
                       "the full pairwise distance matrix for one trajectory "
                       "to itself (as you're doing). Use all_pairwise, which "
                       "will be more efficient if you reall need the results "
                       "as a 2D matrix (why?) then you can always use "
                       "scipy.spatial.distance.squareform() on the output of "
                       "all_pairwise()")
            warnings.warn(message, RuntimeWarning)

        out = cdist(prepared_traj1, prepared_traj2, metric=self.metric,
                    p=self.p, V=self.V, VI=self.VI)
        return out

    def all_pairwise(self, prepared_traj):
        """Calculate a "condensed" distance matrix of all the pairwise
        distances between each frame with each other frame in prepared_traj

        The condensed distance matrix can be converted to the redundant square
        form if desired

        Parameters
        ----------
        prepared_traj : ndarray
            Prepared trajectory

        Returns
        -------
        distances : ndarray
            1D vector of length len(prepared_traj) choose 2 holding the
            distance between prepared_traj[i] and prepared_traj[j] for each
            pair i < j

        See Also
        --------
        scipy.spatial.distance.pdist
        scipy.spatial.distance.squareform
        """
        out = pdist(prepared_traj, metric=self.metric, p=self.p, V=self.V,
                    VI=self.VI)
        return out