1# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
2# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
3#
4# MDAnalysis --- https://www.mdanalysis.org
5# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
6# (see the file AUTHORS for the full list of names)
7#
8# Released under the GNU Public Licence, v2 or any higher version
9#
10# Please cite your use of MDAnalysis in published work:
11#
12# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
13# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
14# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
15# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
16# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
17#
18# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
19# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
20# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
21#
22
23r"""Principal Component Analysis (PCA) --- :mod:`MDAnalysis.analysis.pca`
24=====================================================================
25
26:Authors: John Detlefs
27:Year: 2016
28:Copyright: GNU Public License v3
29
30.. versionadded:: 0.16.0
31
32This module contains the linear dimensions reduction method Principal Component
33Analysis (PCA). PCA sorts a simulation into 3N directions of descending
34variance, with N being the number of atoms. These directions are called
35the principal components. The dimensions to be analyzed are reduced by only
36looking at a few projections of the first principal components. To learn how to
37run a Principal Component Analysis, please refer to the :ref:`PCA-tutorial`.
38
39The PCA problem is solved by solving the eigenvalue problem of the covariance
40matrix, a :math:`3N \times 3N` matrix where the element :math:`(i, j)` is the
41covariance between coordinates :math:`i` and :math:`j`. The principal
42components are the eigenvectors of this matrix.
43
44For each eigenvector, its eigenvalue is the variance that the eigenvector
45explains. Stored in :attr:`PCA.cumulated_variance`, a ratio for each number of
46eigenvectors up to index :math:`i` is provided to quickly find out how many
47principal components are needed to explain the amount of variance reflected by
48those :math:`i` eigenvectors. For most data, :attr:`PCA.cumulated_variance`
49will be approximately equal to one for some :math:`n` that is significantly
50smaller than the total number of components, these are the components of
51interest given by Principal Component Analysis.
52
53From here, we can project a trajectory onto these principal components and
54attempt to retrieve some structure from our high dimensional data.
55
56For a basic introduction to the module, the :ref:`PCA-tutorial` shows how
57to perform Principal Component Analysis.
58
59.. _PCA-tutorial:
60
61PCA Tutorial
62------------
63
64The example uses files provided as part of the MDAnalysis test suite
65(in the variables :data:`~MDAnalysis.tests.datafiles.PSF` and
66:data:`~MDAnalysis.tests.datafiles.DCD`). This tutorial shows how to use the
67PCA class.
68
69First load all modules and test data
70
71    >>> import MDAnalysis as mda
72    >>> import MDAnalysis.analysis.pca as pca
73    >>> from MDAnalysis.tests.datafiles import PSF, DCD
74
75Given a universe containing trajectory data we can perform Principal Component
76Analyis by using the class :class:`PCA` and retrieving the principal
77components.
78
79    >>> u = mda.Universe(PSF, DCD)
80    >>> PSF_pca = pca.PCA(u, select='backbone')
81    >>> PSF_pca.run()
82
83Inspect the components to determine the principal components you would like
84to retain. The choice is arbitrary, but I will stop when 95 percent of the
85variance is explained by the components. This cumulated variance by the
86components is conveniently stored in the one-dimensional array attribute
87``cumulated_variance``. The value at the ith index of `cumulated_variance`
88is the sum of the variances from 0 to i.
89
90    >>> n_pcs = np.where(PSF_pca.cumulated_variance > 0.95)[0][0]
91    >>> atomgroup = u.select_atoms('backbone')
92    >>> pca_space = PSF_pca.transform(atomgroup, n_components=n_pcs)
93
94From here, inspection of the ``pca_space`` and conclusions to be drawn from the
95data are left to the user.
96
97Classes and Functions
98---------------------
99
100.. autoclass:: PCA
101.. autofunction:: cosine_content
102
103"""
104from __future__ import division, absolute_import
105from six.moves import range
106import warnings
107
108import numpy as np
109import scipy.integrate
110
111from MDAnalysis import Universe
112from MDAnalysis.analysis.align import _fit_to
113from MDAnalysis.lib.log import ProgressMeter
114
115from .base import AnalysisBase
116
117
118class PCA(AnalysisBase):
119    """Principal component analysis on an MD trajectory.
120
121    After initializing and calling method with a universe or an atom group,
122    principal components ordering the atom coordinate data by decreasing
123    variance will be available for analysis. As an example:
124
125        >>> pca = PCA(universe, select='backbone').run()
126        >>> pca_space =  pca.transform(universe.select_atoms('backbone'), 3)
127
128    generates the principal components of the backbone of the atomgroup and
129    then transforms those atomgroup coordinates by the direction of those
130    variances. Please refer to the :ref:`PCA-tutorial` for more detailed
131    instructions.
132
133    Attributes
134    ----------
135    p_components: array, (n_components, n_atoms * 3)
136        The principal components of the feature space,
137        representing the directions of maximum variance in the data.
138    variance : array (n_components, )
139        The raw variance explained by each eigenvector of the covariance
140        matrix.
141    cumulated_variance : array, (n_components, )
142        Percentage of variance explained by the selected components and the sum
143        of the components preceding it. If a subset of components is not chosen
144        then all components are stored and the cumulated variance will converge
145        to 1.
146    pca_space : array (n_frames, n_components)
147        After running :meth:`pca.transform` the projection of the
148        positions onto the principal components will exist here.
149    mean_atoms: MDAnalyis atomgroup
150        After running :meth:`PCA.run`, the mean position of all the atoms
151        used for the creation of the covariance matrix will exist here.
152
153    Methods
154    -------
155    transform(atomgroup, n_components=None)
156        Take an atomgroup or universe with the same number of atoms as was
157        used for the calculation in :meth:`PCA.run` and project it onto the
158        principal components.
159
160    Notes
161    -----
162    Computation can be speed up by supplying a precalculated mean structure
163
164    .. versionchanged:: 0.19.0
165       The start frame is used when performing selections and calculating
166       mean positions.  Previously the 0th frame was always used.
167    """
168
169    def __init__(self, universe, select='all', align=False, mean=None,
170                 n_components=None, **kwargs):
171        """
172        Parameters
173        ----------
174        universe: Universe
175            Universe
176        select: string, optional
177            A valid selection statement for choosing a subset of atoms from
178            the atomgroup.
179        align: boolean, optional
180            If True, the trajectory will be aligned to a reference
181            structure.
182        mean: MDAnalysis atomgroup, optional
183            An optional reference structure to be used as the mean of the
184            covariance matrix.
185        n_components : int, optional
186            The number of principal components to be saved, default saves
187            all principal components, Default: None
188        verbose : bool (optional)
189             Show detailed progress of the calculation if set to ``True``; the
190             default is ``False``.
191        """
192        super(PCA, self).__init__(universe.trajectory, **kwargs)
193        self._u = universe
194
195        # for transform function
196        self.align = align
197
198        self._calculated = False
199        self.n_components = n_components
200        self._select = select
201        self._mean = mean
202
203    def _prepare(self):
204        # access start index
205        self._u.trajectory[self.start]
206        # reference will be start index
207        self._reference = self._u.select_atoms(self._select)
208        self._atoms = self._u.select_atoms(self._select)
209        self._n_atoms = self._atoms.n_atoms
210
211        if self._mean is None:
212            self.mean = np.zeros(self._n_atoms*3)
213            self._calc_mean = True
214        else:
215            self.mean = self._mean.positions
216            self._calc_mean = False
217
218        if self.n_frames == 1:
219            raise ValueError('No covariance information can be gathered from a'
220                             'single trajectory frame.\n')
221        n_dim = self._n_atoms * 3
222        self.cov = np.zeros((n_dim, n_dim))
223        self._ref_atom_positions = self._reference.positions
224        self._ref_cog = self._reference.center_of_geometry()
225        self._ref_atom_positions -= self._ref_cog
226
227        if self._calc_mean:
228            interval = int(self.n_frames // 100)
229            interval = interval if interval > 0 else 1
230            format = ("Mean Calculation Step"
231                      "%(step)5d/%(numsteps)d [%(percentage)5.1f%%]\r")
232            mean_pm = ProgressMeter(self.n_frames if self.n_frames else 1,
233                                    interval=interval, verbose=self._verbose,
234                                    format=format)
235            for i, ts in enumerate(self._u.trajectory[self.start:self.stop:
236                                                      self.step]):
237                if self.align:
238                    mobile_cog = self._atoms.center_of_geometry()
239                    mobile_atoms, old_rmsd = _fit_to(self._atoms.positions,
240                                                     self._ref_atom_positions,
241                                                     self._atoms,
242                                                     mobile_com=mobile_cog,
243                                                     ref_com=self._ref_cog)
244                else:
245                    self.mean += self._atoms.positions.ravel()
246                mean_pm.echo(i)
247            self.mean /= self.n_frames
248
249        self.mean_atoms = self._atoms
250        self.mean_atoms.positions = self._atoms.positions
251
252    def _single_frame(self):
253        if self.align:
254            mobile_cog = self._atoms.center_of_geometry()
255            mobile_atoms, old_rmsd = _fit_to(self._atoms.positions,
256                                             self._ref_atom_positions,
257                                             self._atoms,
258                                             mobile_com=mobile_cog,
259                                             ref_com=self._ref_cog)
260            # now all structures are aligned to reference
261            x = mobile_atoms.positions.ravel()
262        else:
263            x = self._atoms.positions.ravel()
264        x -= self.mean
265        self.cov += np.dot(x[:, np.newaxis], x[:, np.newaxis].T)
266
267    def _conclude(self):
268        self.cov /= self.n_frames - 1
269        e_vals, e_vects = np.linalg.eig(self.cov)
270        sort_idx = np.argsort(e_vals)[::-1]
271        self.variance = e_vals[sort_idx]
272        self.variance = self.variance[:self.n_components]
273        self.p_components = e_vects[:self.n_components, sort_idx]
274        self.cumulated_variance = (np.cumsum(self.variance) /
275                                   np.sum(self.variance))
276        self._calculated = True
277
278    def transform(self, atomgroup, n_components=None, start=None, stop=None,
279                  step=None):
280        """Apply the dimensionality reduction on a trajectory
281
282        Parameters
283        ----------
284        atomgroup: MDAnalysis atomgroup/ Universe
285            The atomgroup or universe containing atoms to be PCA transformed.
286        n_components: int, optional
287            The number of components to be projected onto, Default none: maps
288            onto all components.
289        start: int, optional
290            The frame to start on for the PCA transform. Default: None becomes
291            0, the first frame index.
292        stop: int, optional
293            Frame index to stop PCA transform. Default: None becomes n_frames.
294            Iteration stops *before* this frame number, which means that the
295            trajectory would be read until the end.
296        step: int, optional
297            Number of frames to skip over for PCA transform. Default: None
298            becomes 1.
299
300        Returns
301        -------
302        pca_space : array, shape (number of frames, number of components)
303
304        .. versionchanged:: 0.19.0
305           Transform now requires that :meth:`run` has been called before,
306           otherwise a :exc:`ValueError` is raised.
307        """
308        if not self._calculated:
309            raise ValueError('Call run() on the PCA before using transform')
310
311        if isinstance(atomgroup, Universe):
312            atomgroup = atomgroup.atoms
313
314        if(self._n_atoms != atomgroup.n_atoms):
315            raise ValueError('PCA has been fit for'
316                             '{} atoms. Your atomgroup'
317                             'has {} atoms'.format(self._n_atoms,
318                                                   atomgroup.n_atoms))
319        if not (self._atoms.types == atomgroup.types).all():
320            warnings.warn('Atom types do not match with types used to fit PCA')
321
322        traj = atomgroup.universe.trajectory
323        start, stop, step = traj.check_slice_indices(start, stop, step)
324        n_frames = len(range(start, stop, step))
325
326        dim = (n_components if n_components is not None else
327               self.p_components.shape[1])
328
329        dot = np.zeros((n_frames, dim))
330
331        for i, ts in enumerate(traj[start:stop:step]):
332            xyz = atomgroup.positions.ravel() - self.mean
333            dot[i] = np.dot(xyz, self.p_components[:, :n_components])
334
335        return dot
336
337
338def cosine_content(pca_space, i):
339    """Measure the cosine content of the PCA projection.
340
341    The cosine content of pca projections can be used as an indicator if a
342    simulation is converged. Values close to 1 are an indicator that the
343    simulation isn't converged. For values below 0.7 no statement can be made.
344    If you use this function please cite [BerkHess1]_.
345
346
347    Parameters
348    ----------
349    pca_space: array, shape (number of frames, number of components)
350        The PCA space to be analyzed.
351    i: int
352        The index of the pca_component projectection to be analyzed.
353
354    Returns
355    -------
356    A float reflecting the cosine content of the ith projection in the PCA
357    space. The output is bounded by 0 and 1, with 1 reflecting an agreement
358    with cosine while 0 reflects complete disagreement.
359
360    References
361    ----------
362    .. [BerkHess1] Berk Hess. Convergence of sampling in protein simulations.
363                   Phys. Rev. E 65, 031910 (2002).
364    """
365
366    t = np.arange(len(pca_space))
367    T = len(pca_space)
368    cos = np.cos(np.pi * t * (i + 1) / T)
369    return ((2.0 / T) * (scipy.integrate.simps(cos*pca_space[:, i])) ** 2 /
370            scipy.integrate.simps(pca_space[:, i] ** 2))
371