1###########################################################################
2#                                                                         #
3#    physical_validation,                                                 #
4#    a python package to test the physical validity of MD results         #
5#                                                                         #
6#    Written by Michael R. Shirts <michael.shirts@colorado.edu>           #
7#               Pascal T. Merz <pascal.merz@colorado.edu>                 #
8#                                                                         #
9#    Copyright (C) 2012 University of Virginia                            #
10#              (C) 2017 University of Colorado Boulder                    #
11#                                                                         #
12#    This library is free software; you can redistribute it and/or        #
13#    modify it under the terms of the GNU Lesser General Public           #
14#    License as published by the Free Software Foundation; either         #
15#    version 2.1 of the License, or (at your option) any later version.   #
16#                                                                         #
17#    This library is distributed in the hope that it will be useful,      #
18#    but WITHOUT ANY WARRANTY; without even the implied warranty of       #
19#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    #
20#    Lesser General Public License for more details.                      #
21#                                                                         #
22#    You should have received a copy of the GNU Lesser General Public     #
23#    License along with this library; if not, write to the                #
24#    Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,     #
25#    Boston, MA 02110-1301 USA                                            #
26#                                                                         #
27###########################################################################
28r"""
29This file reimplements most functionality of the checkensemble.py code
30originally published on https://github.com/shirtsgroup/checkensemble. It
31serves as the low-level functionality of the high-level module
32:mod:`physical_validation.ensemble`.
33"""
34from __future__ import division
35import numpy as np
36import scipy.optimize
37
38import pymbar
39
40from . import trajectory
41from . import error as pv_error
42from . import plot
43
44
45def generate_histograms(traj1, traj2, g1, g2, bins):
46
47    n1 = np.size(traj1)
48    n2 = np.size(traj2)
49
50    h1 = np.histogram(traj1, bins=bins)[0]/n1
51    h2 = np.histogram(traj2, bins=bins)[0]/n2
52    dh1 = np.sqrt(g1 * h1 * (1 - h1) / n1)
53    dh2 = np.sqrt(g2 * h2 * (1 - h2) / n2)
54
55    return h1, h2, dh1, dh2
56
57
58def do_linear_fit(traj1, traj2, g1, g2, bins,
59                  screen=False, filename=None,
60                  trueslope=0.0, trueoffset=0.0,
61                  units=None):
62
63    h1, h2, dh1, dh2 = generate_histograms(traj1, traj2, g1, g2, bins)
64
65    #  v  copied from checkensemble.py  v
66    ratio = np.log(h2 / h1)
67    dratio = np.sqrt((dh1/h1)**2 + (dh2/h2)**2)
68
69    usedat = np.isfinite(ratio)
70    y = ratio[usedat]
71    nuse = len(y)
72    weights = 1.0/dratio[usedat]
73
74    xaxis = (bins[:-1] + bins[1:])/2
75    x = xaxis[usedat]
76
77    x_mat = np.ones([nuse, 2])
78    x_mat[:, 1] = x
79
80    w = np.diag(weights)
81    wx = np.dot(w, x_mat)
82    wy = np.dot(w, y)
83    wx_t = np.transpose(wx)
84    z = np.dot(wx_t, wx)
85    wxy = np.dot(wx_t, wy)
86
87    a = np.linalg.solve(z, wxy)
88    da_matrix = np.transpose(np.linalg.inv(z))
89    da = np.zeros(2)
90    da[0] = np.sqrt(da_matrix[0, 0])
91    da[1] = np.sqrt(da_matrix[1, 1])
92
93    # the true line is y = df + dp*x, where y is ln P_1(X)/P_2(X)
94    #  ^  end copied from checkensemble.py  ^
95
96    do_plot = screen or filename is not None
97    if do_plot:
98        true = trueoffset+trueslope*xaxis
99        fit = a[0] + a[1]*xaxis
100
101        data = [{'x': xaxis,
102                 'y': ratio,
103                 'y_err': dratio,
104                 'name': 'Simulation'},
105                {'x': xaxis,
106                 'y': fit,
107                 'name': 'Fit to simulation'},
108                {'x': xaxis,
109                 'y': true,
110                 'name': 'Analytical ratio'}]
111
112        if units is not None:
113            units = ' [' + units + ']'
114        else:
115            units = ''
116
117        annot = ('{:.1f}'.format(abs((a[1] - trueslope) / da[1])) +
118                 ' quantiles')
119
120        plot.plot(data,
121                  legend='best',
122                  title='Log probability ratio',
123                  xlabel='Energy' + units,
124                  ylabel=r'$\log\frac{P_2(E)}{P_1(E)}$',
125                  filename=filename,
126                  screen=screen,
127                  axtext=annot)
128
129    return a, da
130
131
132def do_max_likelihood_fit(traj1, traj2, g1, g2,
133                          init_params=None,
134                          verbose=False):
135
136    # ============================================================= #
137    # Define (negative) log-likelihood function and its derivatives #
138    # ============================================================= #
139    def log_likelihood(a, ene1, ene2):
140        # Returns negative of eq (8) of check_ensemble paper
141        #
142        # Uses log (1/f(x)) == -log(f(x))
143        # and log(1 + e^x) == log(e^x (e^-x + 1)) == x + log(1 + e^-x)
144        #     ^(a)                                   ^(b)
145        # form (a) -> 0 for x->-inf, -> inf for x->inf
146        # form (b) -> NaN for x->-inf, -> x for x->inf
147        # combined: -> 0 for x-> -inf, -> x for x-> inf
148        def log_1_plus_exp(y):
149            def f(yy):
150                with np.errstate(over='raise'):
151                    try:
152                        xx = np.log(1 + np.exp(yy))
153                    except FloatingPointError:
154                        xx = yy + np.log(1 + np.exp(-yy))
155                    return xx
156            return np.vectorize(f)(y)
157
158        if a.size == 2:
159            return (np.sum(log_1_plus_exp(a[0] + a[1]*ene1)) +
160                    np.sum(log_1_plus_exp(-a[0] - a[1]*ene2)))
161        else:
162            return (np.sum(log_1_plus_exp(a[0] + a[1]*ene1[0] + a[2]*ene1[1])) +
163                    np.sum(log_1_plus_exp(-a[0] - a[1]*ene2[0] - a[2]*ene2[1])))
164
165    def da_log_likelihood(a, ene1, ene2):
166        # Returns the first derivative wrt the parameters a of log_likelihood
167        #
168        # d/da0 log(1 + exp(a0 + a1*E)) == exp(a0 + a1*E) / (1 + exp(a0 + a1*E))
169        #                               == 1 / (1 + exp(-a0 - a1*E))
170        # d/da1 log(1 + exp(a0 + a1*E)) == E * exp(a0 + a1*E) / (1 + exp(a0 + a1*E))
171        #                               == E / (1 + exp(-a0 - a1*E))
172        def inv_1_plus_exp(y):
173            def f(yy):
174                with np.errstate(over='raise'):
175                    try:
176                        xx = 1. / (1 + np.exp(yy))
177                    except FloatingPointError:
178                        xx = 0.
179                    return xx
180            return np.vectorize(f)(y)
181
182        if a.size == 2:
183            d = np.zeros(2)
184            d[0] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1)) -
185                    np.sum(inv_1_plus_exp(a[0] + a[1]*ene2)))
186            d[1] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1) * ene1) -
187                    np.sum(inv_1_plus_exp(a[0] + a[1]*ene2) * ene2))
188        else:
189            d = np.zeros(3)
190            d[0] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1[0] - a[2]*ene1[1])) -
191                    np.sum(inv_1_plus_exp(a[0] + a[1]*ene2[0] + a[2]*ene2[1])))
192            d[1] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1[0] - a[2]*ene1[1]) * ene1[0]) -
193                    np.sum(inv_1_plus_exp(a[0] + a[1]*ene2[0] + a[2]*ene2[1]) * ene2[0]))
194            d[2] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1[0] - a[2]*ene1[1]) * ene1[1]) -
195                    np.sum(inv_1_plus_exp(a[0] + a[1]*ene2[0] + a[2]*ene2[1]) * ene2[1]))
196
197        return d
198
199    def hess_log_likelihood(a, ene1, ene2):
200        # Returns the hessian wrt the parameters a of log_likelihood
201        # fac1 = 1 / (2 + 2*cosh(a0 + a1*ene1))
202        # h1 = [[ fac1,      ene1*fac1    ],
203        #       [ ene1*fac1, ene1**2*fac1 ]]
204        # fac2 = 1 / (2 + 2*cosh(a0 + a1*ene2))
205        # h2 = [[ fac2,      ene2*fac2    ],
206        #       [ ene2*fac2, ene2**2*fac2 ]]
207        # h = h1 + h2
208
209        if a.size == 2:
210            fac1 = 1 / (2 + 2*np.cosh(a[0] + a[1]*ene1))
211            fac2 = 1 / (2 + 2*np.cosh(a[0] + a[1]*ene2))
212
213            h = np.zeros((2, 2))
214
215            h[0, 0] = np.sum(fac1) + np.sum(fac2)
216            h[0, 1] = h[1, 0] = np.sum(ene1 * fac1) + np.sum(ene2 * fac2)
217            h[1, 1] = np.sum(ene1 * ene1 * fac1) + np.sum(ene2 * ene2 * fac2)
218
219        else:
220            fac1 = 1 / (2 + 2*np.cosh(a[0] + a[1]*ene1[0] + a[2]*ene1[1]))
221            fac2 = 1 / (2 + 2*np.cosh(a[0] + a[1]*ene2[0] + a[2]*ene2[1]))
222
223            h = np.zeros((3, 3))
224
225            h[0, 0] = np.sum(fac1) + np.sum(fac2)
226            h[1, 1] = np.sum(ene1[0] * ene1[0] * fac1) + np.sum(ene2[0] * ene2[0] * fac2)
227            h[2, 2] = np.sum(ene1[1] * ene1[1] * fac1) + np.sum(ene2[1] * ene2[1] * fac2)
228
229            h[0, 1] = h[1, 0] = np.sum(ene1[0] * fac1) + np.sum(ene2[0] * fac2)
230            h[0, 2] = h[2, 0] = np.sum(ene1[1] * fac1) + np.sum(ene2[1] * fac2)
231            h[1, 2] = h[2, 1] = (np.sum(ene1[0] * ene1[1] * fac1) +
232                                 np.sum(ene2[0] * ene2[1] * fac2))
233
234        return h
235
236    # ==================================================== #
237    # Minimize the negative of the log likelihood function #
238    # ==================================================== #
239    if init_params is None:
240        init_params = np.zeros(traj1.ndim + 1)
241    else:
242        init_params = np.array(init_params)
243
244    min_res = scipy.optimize.minimize(
245        log_likelihood,
246        x0=init_params,
247        args=(traj1, traj2),
248        method='dogleg',
249        jac=da_log_likelihood,
250        hess=hess_log_likelihood
251    )
252
253    # fallback options
254    if not min_res.success:
255        if verbose:
256            print('Note: Max-Likelihood minimization failed using \'dogleg\' method. '
257                  'Trying to vary initial parameters.')
258        min_res_1 = scipy.optimize.minimize(
259            log_likelihood,
260            x0=init_params * 0.9,
261            args=(traj1, traj2),
262            method='dogleg',
263            jac=da_log_likelihood,
264            hess=hess_log_likelihood
265        )
266        min_res_2 = scipy.optimize.minimize(
267            log_likelihood,
268            x0=init_params * 1.1,
269            args=(traj1, traj2),
270            method='dogleg',
271            jac=da_log_likelihood,
272            hess=hess_log_likelihood
273        )
274        if min_res_1.success and min_res_2.success and np.allclose(min_res_1.x, min_res_2.x):
275            min_res = min_res_1
276
277    if not min_res.success:
278        # dogleg was unsuccessful using alternative starting point
279        if verbose:
280            print('Note: Max-Likelihood minimization failed using \'dogleg\' method. '
281                  'Trying method \'nelder-mead\'.')
282        min_res = scipy.optimize.minimize(
283            log_likelihood,
284            x0=init_params * 0.9,
285            args=(traj1, traj2),
286            method='nelder-mead'
287        )
288
289    if not min_res.success:
290        raise RuntimeError('MaxLikelihood: Unable to minimize function.')
291
292    final_params = min_res.x
293
294    # ======================= #
295    # Calculate uncertainties #
296    # ======================= #
297    cov = np.linalg.inv(hess_log_likelihood(final_params, traj1, traj2))
298    final_error = np.sqrt(np.diag(cov))*np.sqrt(np.average([g1, g2]))
299
300    return final_params, final_error
301
302
303def check_bins(traj1, traj2, bins):
304    # check for empty bins
305    h1, _ = np.histogram(traj1, bins=bins)
306    h2, _ = np.histogram(traj2, bins=bins)
307    empty = np.where((h1 == 0) | (h2 == 0))[0]
308
309    if np.size(empty) == 0:
310        return bins
311    elif np.size(empty) == 1:
312        empty = empty[0]
313        if empty > np.size(bins) / 2:
314            return bins[:empty]
315        else:
316            return bins[empty+1:]
317    else:
318        # find longest non-empty interval
319        empty = np.insert(np.append(empty, [40]), 0, [-1])
320        max_interval = np.argmax(empty[1:] - empty[:-1])
321        left = empty[max_interval] + 1
322        right = empty[max_interval + 1]
323        return bins[left:right]
324
325
326def print_stats(title,
327                fitvals, dfitvals,
328                kb, param1, param2, trueslope,
329                temp=None, pvconvert=None,
330                dtemp=False, dpress=False, dmu=False,
331                dtempdpress=False, dtempdmu=False):
332
333    # if simple 1d:
334    #     fitvals = [df, slope]
335    #     dfitvals = [ddf, dslope]
336    # if simple 2d:
337    #     fitvals = [df, slope0, slope1]
338    #     dfitvals = [ddf, dslope0, dslope1]
339    # if bootstrapped 1d:
340    #     fitvals = [[df, slope], [df, slope], ...]
341    #     dfitvals = None
342    # if bootstrapped 2d:
343    #     fitvals = [[df, slope0, slope1], [df, slope0, slope1], ...]
344    #     dfitvals = None
345    if fitvals.ndim > 1:
346        dfitvals = np.std(fitvals, axis=0)
347        fitvals = np.average(fitvals, axis=0)
348
349    if np.ndim(trueslope) == 0:
350        trueslopes = np.array([trueslope])
351    else:
352        trueslopes = trueslope
353
354    free_energy = fitvals[0]
355    slopes = fitvals[1:]
356    dfree_energy = dfitvals[0]
357    dslopes = dfitvals[1:]
358
359    print('='*50)
360    print(title)
361    print('='*50)
362    print('Free energy')
363    print('    {:.5f} +/- {:.5f}'.format(free_energy, dfree_energy))
364    print('{:27s}      |  {:s}'.format('Estimated slope', 'True slope'))
365    for slope, dslope, trueslope in zip(slopes, dslopes, trueslopes):
366        print('    {:<9.6f} +/- {:<9.6f}      |  {:<9.6f}'.format(slope, dslope, trueslope))
367        quant = np.abs((slope-trueslope)/dslope)
368        print('    ({:.2f} quantiles from true slope)'.format(quant))
369
370    if dtemp or dtempdpress or dtempdmu:
371        # slope is estimated beta2 - beta1
372        # kb * slope == 1/T1' - 1/T2' == (T2' - T1')/(T1'*T2')
373        # So we'll assume dT' == T1' - T2' ~= kb * slope * T1*T2
374        slope = slopes[0]
375        dslope = dslopes[0]
376        if dtemp:
377            t1 = param1
378            t2 = param2
379        else:
380            t1 = param1[0]
381            t2 = param2[0]
382        print('{:27s}      |  {:s}'.format('Estimated dT', 'True dT'))
383        print('    {:<6.1f} +/- {:<6.1f}            |  {:<6.1f}'.format(
384            kb * slope * t1 * t2,
385            kb * dslope * t1 * t2,
386            t2 - t1
387        ))
388    if dpress or dtempdpress:
389        # slope is estimated (P1 - P2)/beta*pvconvert (1d), or
390        #                    (P1/b1 - P2/b2)*pvconvert (2d)
391        if temp is None and dtempdpress:
392            temp = .5*(param1[0] + param2[0])
393        if dpress:
394            press = -slopes[0] * (kb*temp) / pvconvert
395            ddpress = -dslopes[0] * (kb*temp) / pvconvert
396            truepress = -trueslopes[0] * (kb*temp) / pvconvert
397        else:
398            press = -slopes[1] * (kb*temp) / pvconvert
399            ddpress = -dslopes[1] * (kb*temp) / pvconvert
400            truepress = -trueslopes[1] * (kb*temp) / pvconvert
401        print('{:27s}      |  {:s}'.format('Estimated dP', 'True dP'))
402        print('    {:<6.1f} +/- {:<6.1f}            |  {:<6.1f}'.format(
403            press, np.abs(ddpress), truepress
404        ))
405    if dmu or dtempdmu:
406        pass
407    print('='*50)
408
409
410def estimate_interval(ens_string, ens_temp,
411                      energy, kb,
412                      ens_press=None,
413                      volume=None, pvconvert=None,
414                      verbosity=1, cutoff=0.001,
415                      tunit='', punit=''):
416    result = {}
417    if ens_string == 'NVT':
418        # Discard burn-in period and time-correlated frames
419        energy = trajectory.equilibrate(energy, verbose=(verbosity > 1), name='Energy')
420        energy = trajectory.decorrelate(energy, verbose=(verbosity > 1), name='Energy')
421        energy = trajectory.cut_tails(energy, cut=cutoff, verbose=(verbosity > 2), name='Energy')
422
423        # dT
424        sig = np.std(energy)
425        result['dT'] = 2*kb*ens_temp*ens_temp/sig
426    elif ens_string == 'NPT':
427        enthalpy = energy + pvconvert * ens_press * volume
428        traj_2d = np.array([energy, volume])
429        # Discard burn-in period and time-correlated frames
430        enthalpy = trajectory.equilibrate(enthalpy, verbose=(verbosity > 1), name='Enthalpy')
431        enthalpy = trajectory.decorrelate(enthalpy, verbose=(verbosity > 1), name='Enthalpy')
432        enthalpy = trajectory.cut_tails(enthalpy, cut=cutoff, verbose=(verbosity > 2), name='Enthalpy')
433        volume_1d = trajectory.equilibrate(volume, verbose=(verbosity > 1), name='Volume')
434        volume_1d = trajectory.decorrelate(volume_1d, verbose=(verbosity > 1), name='Volume')
435        volume_1d = trajectory.cut_tails(volume_1d, cut=cutoff, verbose=(verbosity > 2), name='Volume')
436        traj_2d = trajectory.equilibrate(traj_2d, verbose=(verbosity > 1), name='2D-Trajectory')
437        traj_2d = trajectory.decorrelate(traj_2d, facs=[1, pvconvert * ens_press], verbose=(verbosity > 1), name='2D-Trajectory')
438        traj_2d = trajectory.cut_tails(traj_2d, cut=cutoff, verbose=(verbosity > 2), name='2D-Trajectory')
439
440        # dT
441        sig = np.std(enthalpy)
442        result['dT'] = 2*kb*ens_temp*ens_temp/sig
443        # dP
444        sig = np.std(volume_1d)*pvconvert
445        result['dP'] = 2*kb*ens_temp/sig
446        # dTdP
447        cov = np.cov(traj_2d)
448        sig = np.sqrt(np.diag(cov))
449        sig[1] *= pvconvert
450        result['dTdP'] = [2*kb*ens_temp*ens_temp/sig[0],
451                          2*kb*ens_temp/sig[1]]
452    else:
453        raise pv_error.InputError('ens_str', 'Unrecognized ensemble string.')
454
455    if verbosity > 0:
456        print('A rule of thumb states that good error recognition can be expected when\n'
457              'spacing the tip of the distributions by about two standard deviations.\n'
458              'Based on this rule, and the assumption that the standard deviation of the\n'
459              'distributions is largely independent of the state point, here\'s an estimate\n'
460              'for the interval given the current simulation:')
461        if ens_string == 'NVT':
462            print('Current trajectory: NVT, T = {:.2f} {:s}'.format(ens_temp, tunit))
463            print('Suggested interval: dT = {:.1f} {:s}'.format(result['dT'], tunit))
464        if ens_string == 'NPT':
465            print('Current trajectory: NPT, T = {:.2f} {:s}, P = {:.2f} {:s}'.format(
466                ens_temp, tunit, ens_press, punit))
467            print('Suggested interval:')
468            print('  Temperature-only: dT = {:.1f} {:s}'.format(result['dT'], tunit))
469            print('  Pressure-only: dP = {:.1f} {:s}'.format(result['dP'], punit))
470            print('  Combined: dT = {:.1f} {:s}, dP = {:.1f} {:s}'.format(
471                result['dTdP'][0], tunit, result['dTdP'][1], punit))
472
473
474def check_1d(traj1, traj2, param1, param2, kb,
475             quantity, dtemp=False, dpress=False, dmu=False,
476             temp=None, pvconvert=None,
477             nbins=40, cutoff=0.001, seed=None,
478             verbosity=1, screen=False, filename=None):
479    r"""
480    Checks whether the energy trajectories of two simulation performed at
481    different temperatures have sampled distributions at the analytically
482    expected ratio.
483
484    Parameters
485    ----------
486    traj1 : array-like
487        Trajectory of the first simulation
488        If dtemp:
489
490            * NVT: Potential energy U or total energy E = U + K
491            * NPT: Enthalpy H = U + pV or total energy E = H + K
492
493        If dpress:
494
495            * NPT: Volume V
496
497    traj2 : array-like
498        Trajectory of the second simulation
499        If dtemp:
500
501            * NVT: Potential energy U or total energy E = U + K
502            * NPT: Enthalpy H = U + pV or total energy E = H + K
503
504        If dpress:
505
506            * NPT: Volume V
507
508    param1 : float
509        Target temperature or pressure of the first simulation
510    param2 : float
511        Target temperature or pressure of the second simulation
512    kb : float
513        Boltzmann constant in same units as the energy trajectories
514    quantity : str
515        Name of quantity analyzed (used for printing only)
516    dtemp : bool, optional
517        Set to True if trajectories were simulated at different temperature
518        Default: False.
519    dpress : bool, optional
520        Set to True if trajectories were simulated at different pressure
521        Default: False.
522    temp : float, optional
523        The temperature in equal temperature, differring pressure NPT simulations.
524        Needed to print optimal dP.
525    pvconvert : float, optional
526        Conversion from pressure * volume to energy units.
527        Needed to print optimal dP.
528    dmu : bool, optional
529        Set to True if trajectories were simulated at different chemical potential
530        Default: False.
531    nbins : int, optional
532        Number of bins used to assess distributions of the trajectories
533        Default: 40
534    cutoff : float, optional
535        Tail cutoff of distributions.
536        Default: 0.001 (0.1%)
537    seed : int, optional
538        If set, bootstrapping will be reproducible.
539        Default: None, bootstrapping non-reproducible.
540    verbosity : int, optional
541        Verbosity level.
542        Default: 1 (only most important output)
543    screen : bool, optional
544        Plot distributions on screen.
545        Default: False.
546    filename : string, optional
547        Plot distributions to `filename`.pdf.
548        Default: None.
549
550    Returns
551    -------
552
553    """
554
555    if (not (dtemp or dpress or dmu) or
556       (dtemp and dpress) or
557       (dtemp and dmu) or
558       (dpress and dmu)):
559        raise pv_error.InputError(['dtemp', 'dpress', 'dmu'],
560                                  'Need to specify exactly one of `dtemp`, `dpress` and `dmu`.')
561
562    if dmu:
563        raise NotImplementedError('check_1d: Testing of `dmu` not implemented.')
564
565    if seed is not None:
566        raise NotImplementedError('check_1d: Bootstrapping not implemented.')
567
568    if dpress and (temp is None or pvconvert is None):
569        raise pv_error.InputError(['dpress', 'temp', 'pvconvert'],
570                                  '`ensemble.check_1d` with `dpress=True` requires `temp` and `pvconvert`.')
571
572    # =============================== #
573    # prepare constants, strings etc. #
574    # =============================== #
575    pstring = 'ln(P_2(' + quantity + ')/P_1(' + quantity + '))'
576    trueslope = 0
577    if dtemp:
578        trueslope = 1/(kb * param1) - 1/(kb * param2)
579    elif dpress:
580        trueslope = (param1 - param2) / (kb * temp) * pvconvert
581
582    if verbosity > 1:
583        print('Analytical slope of {:s}: {:.8f}'.format(pstring, trueslope))
584
585    quant = {}
586
587    # ==================== #
588    # prepare trajectories #
589    # ==================== #
590    # Discard burn-in period and time-correlated frames
591    traj1 = trajectory.prepare(traj1, cut=cutoff, verbosity=verbosity, name='Trajectory 1')
592    traj2 = trajectory.prepare(traj2, cut=cutoff, verbosity=verbosity, name='Trajectory 2')
593
594    # calculate inefficiency
595    g1 = pymbar.timeseries.statisticalInefficiency(traj1)
596    g2 = pymbar.timeseries.statisticalInefficiency(traj2)
597
598    # calculate overlap
599    traj1_full = traj1
600    traj2_full = traj2
601    traj1, traj2, min_ene, max_ene = trajectory.overlap(
602        traj1=traj1_full, traj2=traj2_full,
603    )
604    if verbosity > 0:
605        print('Overlap is {:.1%} of trajectory 1 and {:.1%} of trajectory 2.'.format(
606            traj1.shape[0] / traj1_full.shape[0],
607            traj2.shape[0] / traj2_full.shape[0]
608        ))
609    if verbosity > 0 and dtemp:
610        sig1 = np.std(traj1_full)
611        sig2 = np.std(traj2_full)
612        dt1 = 2*kb*param1*param1/sig1
613        dt2 = 2*kb*param2*param2/sig2
614        if verbosity > 1:
615            print('A rule of thumb states that a good overlap is found when dT/T = (2*kB*T)/(sig),\n'
616                  'where sig is the standard deviation of the energy distribution.\n'
617                  'For the current trajectories, dT = {:.1f}, sig1 = {:.1f} and sig2 = {:.1f}.\n'
618                  'According to the rule of thumb, given T1, a good dT is dT = {:.1f}, and\n'
619                  '                                given T2, a good dT is dT = {:.1f}.'.format(
620                      param2-param1, sig1, sig2, dt1, dt2)
621                  )
622        print('Rule of thumb estimates that dT = {:.1f} would be optimal '
623              '(currently, dT = {:.1f})'.format(.5*(dt1+dt2), param2-param1))
624    if verbosity > 0 and dpress:
625        sig1 = np.std(traj1_full)*pvconvert
626        sig2 = np.std(traj2_full)*pvconvert
627        dp1 = 2*kb*temp/sig1
628        dp2 = 2*kb*temp/sig2
629        if verbosity > 1:
630            print('A rule of thumb states that a good overlap is found when dP = (2*kB*T)/(sig),\n'
631                  'where sig is the standard deviation of the volume distribution.\n'
632                  'For the current trajectories, dP = {:.1f}, sig1 = {:.1g} and sig2 = {:.1g}.\n'
633                  'According to the rule of thumb, given P1, a good dP is dP = {:.1f}, and\n'
634                  '                                given P2, a good dP is dP = {:.1f}.'.format(
635                      param2-param1, sig1, sig2, dp1, dp2)
636                  )
637        print('Rule of thumb estimates that dP = {:.1f} would be optimal '
638              '(currently, dP = {:.1f})'.format(.5*(dp1+dp2), param2-param1))
639    if not min_ene:
640        raise pv_error.InputError(['traj1', 'traj2'],
641                                  'No overlap between trajectories.')
642    # calculate bins
643    bins = np.linspace(min_ene, max_ene, nbins+1)
644    bins = check_bins(traj1, traj2, bins)
645    if np.size(bins) < 3:
646        raise pv_error.InputError(['traj1', 'traj2', 'nbins', 'cutoff'],
647                                  'Less than 3 bins were filled in the overlap region.\n'
648                                  'Ensure sufficient overlap between the trajectories, and '
649                                  'consider increasing `cutoff` or `nbins` if there is '
650                                  'sufficient overlap but unusually long tails.')
651
652    w_f = -trueslope * traj1_full
653    w_r = trueslope * traj2_full
654
655    if verbosity > 2:
656        print('Computing log of partition functions using pymbar.BAR...')
657    df, ddf = pymbar.BAR(w_f, w_r)
658    if verbosity > 2:
659        print('Using {:.5f} for log of partition functions as computed from BAR.'.format(df))
660        print('Uncertainty in quantity is {:.5f}.'.format(ddf))
661        print('Assuming this is negligible compared to sampling error at individual points.')
662
663    # ========== #
664    # linear fit #
665    # ========== #
666    if verbosity > 2:
667        print('Computing linear fit parameters (for plotting / comparison)')
668
669    fitvals, dfitvals = do_linear_fit(
670        traj1=traj1_full, traj2=traj2_full, g1=g1, g2=g2, bins=bins,
671        screen=screen, filename=filename,
672        trueslope=trueslope, trueoffset=df,
673        units=None
674    )
675
676    slope = fitvals[1]
677    dslope = dfitvals[1]
678    quant['linear'] = [abs((slope - trueslope)/dslope)]
679    if verbosity > 1:
680        print_stats(
681            title='Linear Fit Analysis (analytical error)',
682            fitvals=fitvals,
683            dfitvals=dfitvals,
684            kb=kb,
685            param1=param1,
686            param2=param2,
687            trueslope=trueslope,
688            temp=temp, pvconvert=pvconvert,
689            dtemp=dtemp, dpress=dpress, dmu=dmu
690        )
691
692    # ================== #
693    # max-likelihood fit #
694    # ================== #
695    if verbosity > 2:
696        print('Computing the maximum likelihood parameters')
697
698    fitvals, dfitvals = do_max_likelihood_fit(traj1_full, traj2_full, g1, g2,
699                                              init_params=[df, trueslope],
700                                              verbose=(verbosity > 1))
701
702    slope = fitvals[1]
703    dslope = dfitvals[1]
704    quant['maxLikelihood'] = [abs((slope - trueslope)/dslope)]
705    if verbosity > 0:
706        print_stats(
707            title='Maximum Likelihood Analysis (analytical error)',
708            fitvals=fitvals,
709            dfitvals=dfitvals,
710            kb=kb,
711            param1=param1,
712            param2=param2,
713            trueslope=trueslope,
714            temp=temp, pvconvert=pvconvert,
715            dtemp=dtemp, dpress=dpress, dmu=dmu
716        )
717
718    return quant['maxLikelihood']
719
720
721def check_2d(traj1, traj2, param1, param2, kb, pvconvert,
722             quantity, dtempdpress=False, dtempdmu=False,
723             cutoff=0.001, seed=None,
724             verbosity=1, screen=False, filename=None):
725    r"""
726    Checks whether the energy trajectories of two simulation performed at
727    different temperatures have sampled distributions at the analytically
728    expected ratio.
729
730    Parameters
731    ----------
732    traj1 : array-like, 2d
733        Trajectory of the first simulation
734        If dtempdpress:
735
736            * traj[0,:]: Potential energy U or total energy E = U + K
737            * traj[1,:]: Volume V
738    traj2 : array-like, 2d
739        Trajectory of the second simulation
740        If dtempdpress:
741
742            * traj[0,:]: Potential energy U or total energy E = U + K
743            * traj[1,:]: Volume V
744    param1 : array-like
745        If dtempdpress:
746            Target temperature and pressure of the first simulation
747    param2 : array-like
748        If dtempdpress:
749            Target temperature and pressure of the first simulation
750    kb : float
751        Boltzmann constant in same units as the energy trajectories
752    pvconvert : float
753        Conversion from pressure * volume to energy units
754    quantity : List[str]
755        Names of quantities analyzed (used for printing only)
756    dtempdpress : bool, optional
757        Set to True if trajectories were simulated at different
758        temperature and pressure
759        Default: False.
760    dtempdmu : bool, optional
761        Set to True if trajectories were simulated at different
762        temperature and chemical potential
763        Default: False.
764    cutoff : float
765        Tail cutoff of distributions.
766        Default: 0.001 (0.1%)
767    seed : int
768        If set, bootstrapping will be reproducible.
769        Default: None, bootstrapping non-reproducible.
770    verbosity : int
771        Verbosity level.
772        Default: 1 (only most important output)
773    screen : bool, optional
774        Plot distributions on screen.
775        Default: False.
776    filename : string, optional
777        Plot distributions to `filename`.pdf.
778        Default: None.
779
780    Returns
781    -------
782
783    """
784
785    if not (dtempdpress or dtempdmu) or (dtempdpress and dtempdmu):
786        raise pv_error.InputError(['dtempdpress', 'dtempdmu'],
787                                  'Need to specify exactly one of `dtempdpress` and `dtempdmu`.')
788
789    if dtempdmu:
790        raise NotImplementedError('check_2d: Testing of `dtempdmu` not implemented.')
791
792    if seed is not None:
793        raise NotImplementedError('check_2d: Bootstrapping not implemented.')
794
795    if screen or filename is not None:
796        raise NotImplementedError('check_2d: Plotting not implemented.')
797
798    # =============================== #
799    # prepare constants, strings etc. #
800    # =============================== #
801    pstring = ('ln(P_2(' + quantity[0] + ', ' + quantity[1] + ')/' +
802               'P_1(' + quantity[0] + ', ' + quantity[1] + '))')
803    trueslope = np.zeros(2)
804    facs = [None, None]
805    if dtempdpress:
806        trueslope = np.array([
807            1/(kb * param1[0]) - 1/(kb * param2[0]),
808            pvconvert*(1/(kb * param1[0]) * param1[1] - 1/(kb * param2[0]) * param2[1])
809        ])
810        facs = [[1, param1[1]], [1, param2[1]]]
811
812    if verbosity > 1:
813        print('Analytical slope of {:s}: {:.8f}, {:.8f}'.format(
814            pstring, trueslope[0], trueslope[1]
815        ))
816
817    quant = {}
818
819    # ==================== #
820    # prepare trajectories #
821    # ==================== #
822    # Discard burn-in period and time-correlated frames
823    traj1 = trajectory.prepare(traj1, cut=cutoff, facs=facs[0],
824                               verbosity=verbosity, name='Trajectory 1')
825    traj2 = trajectory.prepare(traj2, cut=cutoff, facs=facs[1],
826                               verbosity=verbosity, name='Trajectory 2')
827
828    # calculate inefficiency
829    g1 = np.array([
830            pymbar.timeseries.statisticalInefficiency(traj1[0]),
831            pymbar.timeseries.statisticalInefficiency(traj1[1])
832        ])
833    g2 = np.array([
834            pymbar.timeseries.statisticalInefficiency(traj2[0]),
835            pymbar.timeseries.statisticalInefficiency(traj2[1])
836        ])
837
838    # calculate overlap
839    traj1_full = traj1
840    traj2_full = traj2
841    traj1, traj2, min_ene, max_ene = trajectory.overlap(
842        traj1=traj1_full, traj2=traj2_full,
843    )
844    if verbosity > 0:
845        print('Overlap is {:.1%} of trajectory 1 and {:.1%} of trajectory 2.'.format(
846            traj1.shape[1] / traj1_full.shape[1],
847            traj2.shape[1] / traj2_full.shape[1]
848        ))
849    if verbosity > 0 and dtempdpress:
850        cov1 = np.cov(traj1_full)
851        sig1 = np.sqrt(np.diag(cov1))
852        sig1[1] *= pvconvert
853        cov2 = np.cov(traj2_full)
854        sig2 = np.sqrt(np.diag(cov2))
855        sig2[1] *= pvconvert
856        dt1 = 2*kb*param1[0]*param1[0]/sig1[0]
857        dt2 = 2*kb*param2[0]*param2[0]/sig2[0]
858        dp1 = 2*kb*param1[0]/sig1[1]
859        dp2 = 2*kb*param2[0]/sig2[1]
860        if verbosity > 1:
861            print('A rule of thumb states that a good overlap can be expected when choosing state\n'
862                  'points separated by about 2 standard deviations.\n'
863                  'For the current trajectories, dT = {:.1f}, and dP = {:.1f},\n'
864                  'with standard deviations sig1 = [{:.1f}, {:.1g}], and sig2 = [{:.1f}, {:.1g}].\n'
865                  'According to the rule of thumb, given point 1, the estimate is dT = {:.1f}, dP = {:.1f}, and\n'
866                  '                                given point 2, the estimate is dT = {:.1f}, dP = {:.1f}.'.format(
867                      param2[0]-param1[0], param2[1]-param1[1],
868                      sig1[0], sig1[1], sig2[0], sig2[1],
869                      dt1, dt2, dp1, dp2)
870                  )
871        print('Rule of thumb estimates that (dT,dP) = ({:.1f},{:.1f}) would be optimal '
872              '(currently, (dT,dP) = ({:.1f},{:.1f}))'.format(.5*(dt1+dt2), .5*(dp1+dp2),
873                                                              param2[0]-param1[0], param2[1]-param1[1]))
874    if min_ene is None:
875        raise pv_error.InputError(['traj1', 'traj2'],
876                                  'No overlap between trajectories.')
877
878    w_f = -trueslope[0] * traj1_full[0] - trueslope[1] * traj1_full[1]
879    w_r = trueslope[0] * traj2_full[0] + trueslope[1] * traj2_full[1]
880
881    if verbosity > 2:
882        print('Computing log of partition functions using pymbar.BAR...')
883    df, ddf = pymbar.BAR(w_f, w_r)
884    if verbosity > 2:
885        print('Using {:.5f} for log of partition functions as computed from BAR.'.format(df))
886        print('Uncertainty in quantity is {:.5f}.'.format(ddf))
887        print('Assuming this is negligible compared to sampling error at individual points.')
888
889    # ================== #
890    # max-likelihood fit #
891    # ================== #
892    if verbosity > 2:
893        print('Computing the maximum likelihood parameters')
894
895    fitvals, dfitvals = do_max_likelihood_fit(traj1_full, traj2_full, g1, g2,
896                                              init_params=[df, trueslope[0], trueslope[1]],
897                                              verbose=(verbosity > 1))
898
899    slope = fitvals[1:]
900    dslope = dfitvals[1:]
901    quant['maxLikelihood'] = np.abs((slope - trueslope)/dslope)
902    if verbosity > 0:
903        print_stats(
904            title='Maximum Likelihood Analysis (analytical error)',
905            fitvals=fitvals,
906            dfitvals=dfitvals,
907            kb=kb,
908            param1=param1,
909            param2=param2,
910            trueslope=trueslope,
911            pvconvert=pvconvert,
912            dtempdpress=dtempdpress, dtempdmu=dtempdmu
913        )
914
915    return quant['maxLikelihood']
916