1# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- 2# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 3# 4# MDAnalysis --- https://www.mdanalysis.org 5# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors 6# (see the file AUTHORS for the full list of names) 7# 8# Released under the GNU Public Licence, v2 or any higher version 9# 10# Please cite your use of MDAnalysis in published work: 11# 12# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler, 13# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein. 14# MDAnalysis: A Python package for the rapid analysis of molecular dynamics 15# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th 16# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy. 17# 18# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. 19# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. 20# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 21# 22 23r"""Principal Component Analysis (PCA) --- :mod:`MDAnalysis.analysis.pca` 24===================================================================== 25 26:Authors: John Detlefs 27:Year: 2016 28:Copyright: GNU Public License v3 29 30.. versionadded:: 0.16.0 31 32This module contains the linear dimensions reduction method Principal Component 33Analysis (PCA). PCA sorts a simulation into 3N directions of descending 34variance, with N being the number of atoms. These directions are called 35the principal components. The dimensions to be analyzed are reduced by only 36looking at a few projections of the first principal components. To learn how to 37run a Principal Component Analysis, please refer to the :ref:`PCA-tutorial`. 38 39The PCA problem is solved by solving the eigenvalue problem of the covariance 40matrix, a :math:`3N \times 3N` matrix where the element :math:`(i, j)` is the 41covariance between coordinates :math:`i` and :math:`j`. The principal 42components are the eigenvectors of this matrix. 43 44For each eigenvector, its eigenvalue is the variance that the eigenvector 45explains. Stored in :attr:`PCA.cumulated_variance`, a ratio for each number of 46eigenvectors up to index :math:`i` is provided to quickly find out how many 47principal components are needed to explain the amount of variance reflected by 48those :math:`i` eigenvectors. For most data, :attr:`PCA.cumulated_variance` 49will be approximately equal to one for some :math:`n` that is significantly 50smaller than the total number of components, these are the components of 51interest given by Principal Component Analysis. 52 53From here, we can project a trajectory onto these principal components and 54attempt to retrieve some structure from our high dimensional data. 55 56For a basic introduction to the module, the :ref:`PCA-tutorial` shows how 57to perform Principal Component Analysis. 58 59.. _PCA-tutorial: 60 61PCA Tutorial 62------------ 63 64The example uses files provided as part of the MDAnalysis test suite 65(in the variables :data:`~MDAnalysis.tests.datafiles.PSF` and 66:data:`~MDAnalysis.tests.datafiles.DCD`). This tutorial shows how to use the 67PCA class. 68 69First load all modules and test data 70 71 >>> import MDAnalysis as mda 72 >>> import MDAnalysis.analysis.pca as pca 73 >>> from MDAnalysis.tests.datafiles import PSF, DCD 74 75Given a universe containing trajectory data we can perform Principal Component 76Analyis by using the class :class:`PCA` and retrieving the principal 77components. 78 79 >>> u = mda.Universe(PSF, DCD) 80 >>> PSF_pca = pca.PCA(u, select='backbone') 81 >>> PSF_pca.run() 82 83Inspect the components to determine the principal components you would like 84to retain. The choice is arbitrary, but I will stop when 95 percent of the 85variance is explained by the components. This cumulated variance by the 86components is conveniently stored in the one-dimensional array attribute 87``cumulated_variance``. The value at the ith index of `cumulated_variance` 88is the sum of the variances from 0 to i. 89 90 >>> n_pcs = np.where(PSF_pca.cumulated_variance > 0.95)[0][0] 91 >>> atomgroup = u.select_atoms('backbone') 92 >>> pca_space = PSF_pca.transform(atomgroup, n_components=n_pcs) 93 94From here, inspection of the ``pca_space`` and conclusions to be drawn from the 95data are left to the user. 96 97Classes and Functions 98--------------------- 99 100.. autoclass:: PCA 101.. autofunction:: cosine_content 102 103""" 104from __future__ import division, absolute_import 105from six.moves import range 106import warnings 107 108import numpy as np 109import scipy.integrate 110 111from MDAnalysis import Universe 112from MDAnalysis.analysis.align import _fit_to 113from MDAnalysis.lib.log import ProgressMeter 114 115from .base import AnalysisBase 116 117 118class PCA(AnalysisBase): 119 """Principal component analysis on an MD trajectory. 120 121 After initializing and calling method with a universe or an atom group, 122 principal components ordering the atom coordinate data by decreasing 123 variance will be available for analysis. As an example: 124 125 >>> pca = PCA(universe, select='backbone').run() 126 >>> pca_space = pca.transform(universe.select_atoms('backbone'), 3) 127 128 generates the principal components of the backbone of the atomgroup and 129 then transforms those atomgroup coordinates by the direction of those 130 variances. Please refer to the :ref:`PCA-tutorial` for more detailed 131 instructions. 132 133 Attributes 134 ---------- 135 p_components: array, (n_components, n_atoms * 3) 136 The principal components of the feature space, 137 representing the directions of maximum variance in the data. 138 variance : array (n_components, ) 139 The raw variance explained by each eigenvector of the covariance 140 matrix. 141 cumulated_variance : array, (n_components, ) 142 Percentage of variance explained by the selected components and the sum 143 of the components preceding it. If a subset of components is not chosen 144 then all components are stored and the cumulated variance will converge 145 to 1. 146 pca_space : array (n_frames, n_components) 147 After running :meth:`pca.transform` the projection of the 148 positions onto the principal components will exist here. 149 mean_atoms: MDAnalyis atomgroup 150 After running :meth:`PCA.run`, the mean position of all the atoms 151 used for the creation of the covariance matrix will exist here. 152 153 Methods 154 ------- 155 transform(atomgroup, n_components=None) 156 Take an atomgroup or universe with the same number of atoms as was 157 used for the calculation in :meth:`PCA.run` and project it onto the 158 principal components. 159 160 Notes 161 ----- 162 Computation can be speed up by supplying a precalculated mean structure 163 164 .. versionchanged:: 0.19.0 165 The start frame is used when performing selections and calculating 166 mean positions. Previously the 0th frame was always used. 167 """ 168 169 def __init__(self, universe, select='all', align=False, mean=None, 170 n_components=None, **kwargs): 171 """ 172 Parameters 173 ---------- 174 universe: Universe 175 Universe 176 select: string, optional 177 A valid selection statement for choosing a subset of atoms from 178 the atomgroup. 179 align: boolean, optional 180 If True, the trajectory will be aligned to a reference 181 structure. 182 mean: MDAnalysis atomgroup, optional 183 An optional reference structure to be used as the mean of the 184 covariance matrix. 185 n_components : int, optional 186 The number of principal components to be saved, default saves 187 all principal components, Default: None 188 verbose : bool (optional) 189 Show detailed progress of the calculation if set to ``True``; the 190 default is ``False``. 191 """ 192 super(PCA, self).__init__(universe.trajectory, **kwargs) 193 self._u = universe 194 195 # for transform function 196 self.align = align 197 198 self._calculated = False 199 self.n_components = n_components 200 self._select = select 201 self._mean = mean 202 203 def _prepare(self): 204 # access start index 205 self._u.trajectory[self.start] 206 # reference will be start index 207 self._reference = self._u.select_atoms(self._select) 208 self._atoms = self._u.select_atoms(self._select) 209 self._n_atoms = self._atoms.n_atoms 210 211 if self._mean is None: 212 self.mean = np.zeros(self._n_atoms*3) 213 self._calc_mean = True 214 else: 215 self.mean = self._mean.positions 216 self._calc_mean = False 217 218 if self.n_frames == 1: 219 raise ValueError('No covariance information can be gathered from a' 220 'single trajectory frame.\n') 221 n_dim = self._n_atoms * 3 222 self.cov = np.zeros((n_dim, n_dim)) 223 self._ref_atom_positions = self._reference.positions 224 self._ref_cog = self._reference.center_of_geometry() 225 self._ref_atom_positions -= self._ref_cog 226 227 if self._calc_mean: 228 interval = int(self.n_frames // 100) 229 interval = interval if interval > 0 else 1 230 format = ("Mean Calculation Step" 231 "%(step)5d/%(numsteps)d [%(percentage)5.1f%%]\r") 232 mean_pm = ProgressMeter(self.n_frames if self.n_frames else 1, 233 interval=interval, verbose=self._verbose, 234 format=format) 235 for i, ts in enumerate(self._u.trajectory[self.start:self.stop: 236 self.step]): 237 if self.align: 238 mobile_cog = self._atoms.center_of_geometry() 239 mobile_atoms, old_rmsd = _fit_to(self._atoms.positions, 240 self._ref_atom_positions, 241 self._atoms, 242 mobile_com=mobile_cog, 243 ref_com=self._ref_cog) 244 else: 245 self.mean += self._atoms.positions.ravel() 246 mean_pm.echo(i) 247 self.mean /= self.n_frames 248 249 self.mean_atoms = self._atoms 250 self.mean_atoms.positions = self._atoms.positions 251 252 def _single_frame(self): 253 if self.align: 254 mobile_cog = self._atoms.center_of_geometry() 255 mobile_atoms, old_rmsd = _fit_to(self._atoms.positions, 256 self._ref_atom_positions, 257 self._atoms, 258 mobile_com=mobile_cog, 259 ref_com=self._ref_cog) 260 # now all structures are aligned to reference 261 x = mobile_atoms.positions.ravel() 262 else: 263 x = self._atoms.positions.ravel() 264 x -= self.mean 265 self.cov += np.dot(x[:, np.newaxis], x[:, np.newaxis].T) 266 267 def _conclude(self): 268 self.cov /= self.n_frames - 1 269 e_vals, e_vects = np.linalg.eig(self.cov) 270 sort_idx = np.argsort(e_vals)[::-1] 271 self.variance = e_vals[sort_idx] 272 self.variance = self.variance[:self.n_components] 273 self.p_components = e_vects[:self.n_components, sort_idx] 274 self.cumulated_variance = (np.cumsum(self.variance) / 275 np.sum(self.variance)) 276 self._calculated = True 277 278 def transform(self, atomgroup, n_components=None, start=None, stop=None, 279 step=None): 280 """Apply the dimensionality reduction on a trajectory 281 282 Parameters 283 ---------- 284 atomgroup: MDAnalysis atomgroup/ Universe 285 The atomgroup or universe containing atoms to be PCA transformed. 286 n_components: int, optional 287 The number of components to be projected onto, Default none: maps 288 onto all components. 289 start: int, optional 290 The frame to start on for the PCA transform. Default: None becomes 291 0, the first frame index. 292 stop: int, optional 293 Frame index to stop PCA transform. Default: None becomes n_frames. 294 Iteration stops *before* this frame number, which means that the 295 trajectory would be read until the end. 296 step: int, optional 297 Number of frames to skip over for PCA transform. Default: None 298 becomes 1. 299 300 Returns 301 ------- 302 pca_space : array, shape (number of frames, number of components) 303 304 .. versionchanged:: 0.19.0 305 Transform now requires that :meth:`run` has been called before, 306 otherwise a :exc:`ValueError` is raised. 307 """ 308 if not self._calculated: 309 raise ValueError('Call run() on the PCA before using transform') 310 311 if isinstance(atomgroup, Universe): 312 atomgroup = atomgroup.atoms 313 314 if(self._n_atoms != atomgroup.n_atoms): 315 raise ValueError('PCA has been fit for' 316 '{} atoms. Your atomgroup' 317 'has {} atoms'.format(self._n_atoms, 318 atomgroup.n_atoms)) 319 if not (self._atoms.types == atomgroup.types).all(): 320 warnings.warn('Atom types do not match with types used to fit PCA') 321 322 traj = atomgroup.universe.trajectory 323 start, stop, step = traj.check_slice_indices(start, stop, step) 324 n_frames = len(range(start, stop, step)) 325 326 dim = (n_components if n_components is not None else 327 self.p_components.shape[1]) 328 329 dot = np.zeros((n_frames, dim)) 330 331 for i, ts in enumerate(traj[start:stop:step]): 332 xyz = atomgroup.positions.ravel() - self.mean 333 dot[i] = np.dot(xyz, self.p_components[:, :n_components]) 334 335 return dot 336 337 338def cosine_content(pca_space, i): 339 """Measure the cosine content of the PCA projection. 340 341 The cosine content of pca projections can be used as an indicator if a 342 simulation is converged. Values close to 1 are an indicator that the 343 simulation isn't converged. For values below 0.7 no statement can be made. 344 If you use this function please cite [BerkHess1]_. 345 346 347 Parameters 348 ---------- 349 pca_space: array, shape (number of frames, number of components) 350 The PCA space to be analyzed. 351 i: int 352 The index of the pca_component projectection to be analyzed. 353 354 Returns 355 ------- 356 A float reflecting the cosine content of the ith projection in the PCA 357 space. The output is bounded by 0 and 1, with 1 reflecting an agreement 358 with cosine while 0 reflects complete disagreement. 359 360 References 361 ---------- 362 .. [BerkHess1] Berk Hess. Convergence of sampling in protein simulations. 363 Phys. Rev. E 65, 031910 (2002). 364 """ 365 366 t = np.arange(len(pca_space)) 367 T = len(pca_space) 368 cos = np.cos(np.pi * t * (i + 1) / T) 369 return ((2.0 / T) * (scipy.integrate.simps(cos*pca_space[:, i])) ** 2 / 370 scipy.integrate.simps(pca_space[:, i] ** 2)) 371