1########################################################################### 2# # 3# physical_validation, # 4# a python package to test the physical validity of MD results # 5# # 6# Written by Michael R. Shirts <michael.shirts@colorado.edu> # 7# Pascal T. Merz <pascal.merz@colorado.edu> # 8# # 9# Copyright (C) 2012 University of Virginia # 10# (C) 2017 University of Colorado Boulder # 11# # 12# This library is free software; you can redistribute it and/or # 13# modify it under the terms of the GNU Lesser General Public # 14# License as published by the Free Software Foundation; either # 15# version 2.1 of the License, or (at your option) any later version. # 16# # 17# This library is distributed in the hope that it will be useful, # 18# but WITHOUT ANY WARRANTY; without even the implied warranty of # 19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # 20# Lesser General Public License for more details. # 21# # 22# You should have received a copy of the GNU Lesser General Public # 23# License along with this library; if not, write to the # 24# Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, # 25# Boston, MA 02110-1301 USA # 26# # 27########################################################################### 28r""" 29This file reimplements most functionality of the checkensemble.py code 30originally published on https://github.com/shirtsgroup/checkensemble. It 31serves as the low-level functionality of the high-level module 32:mod:`physical_validation.ensemble`. 33""" 34from __future__ import division 35import numpy as np 36import scipy.optimize 37 38import pymbar 39 40from . import trajectory 41from . import error as pv_error 42from . import plot 43 44 45def generate_histograms(traj1, traj2, g1, g2, bins): 46 47 n1 = np.size(traj1) 48 n2 = np.size(traj2) 49 50 h1 = np.histogram(traj1, bins=bins)[0]/n1 51 h2 = np.histogram(traj2, bins=bins)[0]/n2 52 dh1 = np.sqrt(g1 * h1 * (1 - h1) / n1) 53 dh2 = np.sqrt(g2 * h2 * (1 - h2) / n2) 54 55 return h1, h2, dh1, dh2 56 57 58def do_linear_fit(traj1, traj2, g1, g2, bins, 59 screen=False, filename=None, 60 trueslope=0.0, trueoffset=0.0, 61 units=None): 62 63 h1, h2, dh1, dh2 = generate_histograms(traj1, traj2, g1, g2, bins) 64 65 # v copied from checkensemble.py v 66 ratio = np.log(h2 / h1) 67 dratio = np.sqrt((dh1/h1)**2 + (dh2/h2)**2) 68 69 usedat = np.isfinite(ratio) 70 y = ratio[usedat] 71 nuse = len(y) 72 weights = 1.0/dratio[usedat] 73 74 xaxis = (bins[:-1] + bins[1:])/2 75 x = xaxis[usedat] 76 77 x_mat = np.ones([nuse, 2]) 78 x_mat[:, 1] = x 79 80 w = np.diag(weights) 81 wx = np.dot(w, x_mat) 82 wy = np.dot(w, y) 83 wx_t = np.transpose(wx) 84 z = np.dot(wx_t, wx) 85 wxy = np.dot(wx_t, wy) 86 87 a = np.linalg.solve(z, wxy) 88 da_matrix = np.transpose(np.linalg.inv(z)) 89 da = np.zeros(2) 90 da[0] = np.sqrt(da_matrix[0, 0]) 91 da[1] = np.sqrt(da_matrix[1, 1]) 92 93 # the true line is y = df + dp*x, where y is ln P_1(X)/P_2(X) 94 # ^ end copied from checkensemble.py ^ 95 96 do_plot = screen or filename is not None 97 if do_plot: 98 true = trueoffset+trueslope*xaxis 99 fit = a[0] + a[1]*xaxis 100 101 data = [{'x': xaxis, 102 'y': ratio, 103 'y_err': dratio, 104 'name': 'Simulation'}, 105 {'x': xaxis, 106 'y': fit, 107 'name': 'Fit to simulation'}, 108 {'x': xaxis, 109 'y': true, 110 'name': 'Analytical ratio'}] 111 112 if units is not None: 113 units = ' [' + units + ']' 114 else: 115 units = '' 116 117 annot = ('{:.1f}'.format(abs((a[1] - trueslope) / da[1])) + 118 ' quantiles') 119 120 plot.plot(data, 121 legend='best', 122 title='Log probability ratio', 123 xlabel='Energy' + units, 124 ylabel=r'$\log\frac{P_2(E)}{P_1(E)}$', 125 filename=filename, 126 screen=screen, 127 axtext=annot) 128 129 return a, da 130 131 132def do_max_likelihood_fit(traj1, traj2, g1, g2, 133 init_params=None, 134 verbose=False): 135 136 # ============================================================= # 137 # Define (negative) log-likelihood function and its derivatives # 138 # ============================================================= # 139 def log_likelihood(a, ene1, ene2): 140 # Returns negative of eq (8) of check_ensemble paper 141 # 142 # Uses log (1/f(x)) == -log(f(x)) 143 # and log(1 + e^x) == log(e^x (e^-x + 1)) == x + log(1 + e^-x) 144 # ^(a) ^(b) 145 # form (a) -> 0 for x->-inf, -> inf for x->inf 146 # form (b) -> NaN for x->-inf, -> x for x->inf 147 # combined: -> 0 for x-> -inf, -> x for x-> inf 148 def log_1_plus_exp(y): 149 def f(yy): 150 with np.errstate(over='raise'): 151 try: 152 xx = np.log(1 + np.exp(yy)) 153 except FloatingPointError: 154 xx = yy + np.log(1 + np.exp(-yy)) 155 return xx 156 return np.vectorize(f)(y) 157 158 if a.size == 2: 159 return (np.sum(log_1_plus_exp(a[0] + a[1]*ene1)) + 160 np.sum(log_1_plus_exp(-a[0] - a[1]*ene2))) 161 else: 162 return (np.sum(log_1_plus_exp(a[0] + a[1]*ene1[0] + a[2]*ene1[1])) + 163 np.sum(log_1_plus_exp(-a[0] - a[1]*ene2[0] - a[2]*ene2[1]))) 164 165 def da_log_likelihood(a, ene1, ene2): 166 # Returns the first derivative wrt the parameters a of log_likelihood 167 # 168 # d/da0 log(1 + exp(a0 + a1*E)) == exp(a0 + a1*E) / (1 + exp(a0 + a1*E)) 169 # == 1 / (1 + exp(-a0 - a1*E)) 170 # d/da1 log(1 + exp(a0 + a1*E)) == E * exp(a0 + a1*E) / (1 + exp(a0 + a1*E)) 171 # == E / (1 + exp(-a0 - a1*E)) 172 def inv_1_plus_exp(y): 173 def f(yy): 174 with np.errstate(over='raise'): 175 try: 176 xx = 1. / (1 + np.exp(yy)) 177 except FloatingPointError: 178 xx = 0. 179 return xx 180 return np.vectorize(f)(y) 181 182 if a.size == 2: 183 d = np.zeros(2) 184 d[0] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1)) - 185 np.sum(inv_1_plus_exp(a[0] + a[1]*ene2))) 186 d[1] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1) * ene1) - 187 np.sum(inv_1_plus_exp(a[0] + a[1]*ene2) * ene2)) 188 else: 189 d = np.zeros(3) 190 d[0] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1[0] - a[2]*ene1[1])) - 191 np.sum(inv_1_plus_exp(a[0] + a[1]*ene2[0] + a[2]*ene2[1]))) 192 d[1] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1[0] - a[2]*ene1[1]) * ene1[0]) - 193 np.sum(inv_1_plus_exp(a[0] + a[1]*ene2[0] + a[2]*ene2[1]) * ene2[0])) 194 d[2] = (np.sum(inv_1_plus_exp(-a[0] - a[1]*ene1[0] - a[2]*ene1[1]) * ene1[1]) - 195 np.sum(inv_1_plus_exp(a[0] + a[1]*ene2[0] + a[2]*ene2[1]) * ene2[1])) 196 197 return d 198 199 def hess_log_likelihood(a, ene1, ene2): 200 # Returns the hessian wrt the parameters a of log_likelihood 201 # fac1 = 1 / (2 + 2*cosh(a0 + a1*ene1)) 202 # h1 = [[ fac1, ene1*fac1 ], 203 # [ ene1*fac1, ene1**2*fac1 ]] 204 # fac2 = 1 / (2 + 2*cosh(a0 + a1*ene2)) 205 # h2 = [[ fac2, ene2*fac2 ], 206 # [ ene2*fac2, ene2**2*fac2 ]] 207 # h = h1 + h2 208 209 if a.size == 2: 210 fac1 = 1 / (2 + 2*np.cosh(a[0] + a[1]*ene1)) 211 fac2 = 1 / (2 + 2*np.cosh(a[0] + a[1]*ene2)) 212 213 h = np.zeros((2, 2)) 214 215 h[0, 0] = np.sum(fac1) + np.sum(fac2) 216 h[0, 1] = h[1, 0] = np.sum(ene1 * fac1) + np.sum(ene2 * fac2) 217 h[1, 1] = np.sum(ene1 * ene1 * fac1) + np.sum(ene2 * ene2 * fac2) 218 219 else: 220 fac1 = 1 / (2 + 2*np.cosh(a[0] + a[1]*ene1[0] + a[2]*ene1[1])) 221 fac2 = 1 / (2 + 2*np.cosh(a[0] + a[1]*ene2[0] + a[2]*ene2[1])) 222 223 h = np.zeros((3, 3)) 224 225 h[0, 0] = np.sum(fac1) + np.sum(fac2) 226 h[1, 1] = np.sum(ene1[0] * ene1[0] * fac1) + np.sum(ene2[0] * ene2[0] * fac2) 227 h[2, 2] = np.sum(ene1[1] * ene1[1] * fac1) + np.sum(ene2[1] * ene2[1] * fac2) 228 229 h[0, 1] = h[1, 0] = np.sum(ene1[0] * fac1) + np.sum(ene2[0] * fac2) 230 h[0, 2] = h[2, 0] = np.sum(ene1[1] * fac1) + np.sum(ene2[1] * fac2) 231 h[1, 2] = h[2, 1] = (np.sum(ene1[0] * ene1[1] * fac1) + 232 np.sum(ene2[0] * ene2[1] * fac2)) 233 234 return h 235 236 # ==================================================== # 237 # Minimize the negative of the log likelihood function # 238 # ==================================================== # 239 if init_params is None: 240 init_params = np.zeros(traj1.ndim + 1) 241 else: 242 init_params = np.array(init_params) 243 244 min_res = scipy.optimize.minimize( 245 log_likelihood, 246 x0=init_params, 247 args=(traj1, traj2), 248 method='dogleg', 249 jac=da_log_likelihood, 250 hess=hess_log_likelihood 251 ) 252 253 # fallback options 254 if not min_res.success: 255 if verbose: 256 print('Note: Max-Likelihood minimization failed using \'dogleg\' method. ' 257 'Trying to vary initial parameters.') 258 min_res_1 = scipy.optimize.minimize( 259 log_likelihood, 260 x0=init_params * 0.9, 261 args=(traj1, traj2), 262 method='dogleg', 263 jac=da_log_likelihood, 264 hess=hess_log_likelihood 265 ) 266 min_res_2 = scipy.optimize.minimize( 267 log_likelihood, 268 x0=init_params * 1.1, 269 args=(traj1, traj2), 270 method='dogleg', 271 jac=da_log_likelihood, 272 hess=hess_log_likelihood 273 ) 274 if min_res_1.success and min_res_2.success and np.allclose(min_res_1.x, min_res_2.x): 275 min_res = min_res_1 276 277 if not min_res.success: 278 # dogleg was unsuccessful using alternative starting point 279 if verbose: 280 print('Note: Max-Likelihood minimization failed using \'dogleg\' method. ' 281 'Trying method \'nelder-mead\'.') 282 min_res = scipy.optimize.minimize( 283 log_likelihood, 284 x0=init_params * 0.9, 285 args=(traj1, traj2), 286 method='nelder-mead' 287 ) 288 289 if not min_res.success: 290 raise RuntimeError('MaxLikelihood: Unable to minimize function.') 291 292 final_params = min_res.x 293 294 # ======================= # 295 # Calculate uncertainties # 296 # ======================= # 297 cov = np.linalg.inv(hess_log_likelihood(final_params, traj1, traj2)) 298 final_error = np.sqrt(np.diag(cov))*np.sqrt(np.average([g1, g2])) 299 300 return final_params, final_error 301 302 303def check_bins(traj1, traj2, bins): 304 # check for empty bins 305 h1, _ = np.histogram(traj1, bins=bins) 306 h2, _ = np.histogram(traj2, bins=bins) 307 empty = np.where((h1 == 0) | (h2 == 0))[0] 308 309 if np.size(empty) == 0: 310 return bins 311 elif np.size(empty) == 1: 312 empty = empty[0] 313 if empty > np.size(bins) / 2: 314 return bins[:empty] 315 else: 316 return bins[empty+1:] 317 else: 318 # find longest non-empty interval 319 empty = np.insert(np.append(empty, [40]), 0, [-1]) 320 max_interval = np.argmax(empty[1:] - empty[:-1]) 321 left = empty[max_interval] + 1 322 right = empty[max_interval + 1] 323 return bins[left:right] 324 325 326def print_stats(title, 327 fitvals, dfitvals, 328 kb, param1, param2, trueslope, 329 temp=None, pvconvert=None, 330 dtemp=False, dpress=False, dmu=False, 331 dtempdpress=False, dtempdmu=False): 332 333 # if simple 1d: 334 # fitvals = [df, slope] 335 # dfitvals = [ddf, dslope] 336 # if simple 2d: 337 # fitvals = [df, slope0, slope1] 338 # dfitvals = [ddf, dslope0, dslope1] 339 # if bootstrapped 1d: 340 # fitvals = [[df, slope], [df, slope], ...] 341 # dfitvals = None 342 # if bootstrapped 2d: 343 # fitvals = [[df, slope0, slope1], [df, slope0, slope1], ...] 344 # dfitvals = None 345 if fitvals.ndim > 1: 346 dfitvals = np.std(fitvals, axis=0) 347 fitvals = np.average(fitvals, axis=0) 348 349 if np.ndim(trueslope) == 0: 350 trueslopes = np.array([trueslope]) 351 else: 352 trueslopes = trueslope 353 354 free_energy = fitvals[0] 355 slopes = fitvals[1:] 356 dfree_energy = dfitvals[0] 357 dslopes = dfitvals[1:] 358 359 print('='*50) 360 print(title) 361 print('='*50) 362 print('Free energy') 363 print(' {:.5f} +/- {:.5f}'.format(free_energy, dfree_energy)) 364 print('{:27s} | {:s}'.format('Estimated slope', 'True slope')) 365 for slope, dslope, trueslope in zip(slopes, dslopes, trueslopes): 366 print(' {:<9.6f} +/- {:<9.6f} | {:<9.6f}'.format(slope, dslope, trueslope)) 367 quant = np.abs((slope-trueslope)/dslope) 368 print(' ({:.2f} quantiles from true slope)'.format(quant)) 369 370 if dtemp or dtempdpress or dtempdmu: 371 # slope is estimated beta2 - beta1 372 # kb * slope == 1/T1' - 1/T2' == (T2' - T1')/(T1'*T2') 373 # So we'll assume dT' == T1' - T2' ~= kb * slope * T1*T2 374 slope = slopes[0] 375 dslope = dslopes[0] 376 if dtemp: 377 t1 = param1 378 t2 = param2 379 else: 380 t1 = param1[0] 381 t2 = param2[0] 382 print('{:27s} | {:s}'.format('Estimated dT', 'True dT')) 383 print(' {:<6.1f} +/- {:<6.1f} | {:<6.1f}'.format( 384 kb * slope * t1 * t2, 385 kb * dslope * t1 * t2, 386 t2 - t1 387 )) 388 if dpress or dtempdpress: 389 # slope is estimated (P1 - P2)/beta*pvconvert (1d), or 390 # (P1/b1 - P2/b2)*pvconvert (2d) 391 if temp is None and dtempdpress: 392 temp = .5*(param1[0] + param2[0]) 393 if dpress: 394 press = -slopes[0] * (kb*temp) / pvconvert 395 ddpress = -dslopes[0] * (kb*temp) / pvconvert 396 truepress = -trueslopes[0] * (kb*temp) / pvconvert 397 else: 398 press = -slopes[1] * (kb*temp) / pvconvert 399 ddpress = -dslopes[1] * (kb*temp) / pvconvert 400 truepress = -trueslopes[1] * (kb*temp) / pvconvert 401 print('{:27s} | {:s}'.format('Estimated dP', 'True dP')) 402 print(' {:<6.1f} +/- {:<6.1f} | {:<6.1f}'.format( 403 press, np.abs(ddpress), truepress 404 )) 405 if dmu or dtempdmu: 406 pass 407 print('='*50) 408 409 410def estimate_interval(ens_string, ens_temp, 411 energy, kb, 412 ens_press=None, 413 volume=None, pvconvert=None, 414 verbosity=1, cutoff=0.001, 415 tunit='', punit=''): 416 result = {} 417 if ens_string == 'NVT': 418 # Discard burn-in period and time-correlated frames 419 energy = trajectory.equilibrate(energy, verbose=(verbosity > 1), name='Energy') 420 energy = trajectory.decorrelate(energy, verbose=(verbosity > 1), name='Energy') 421 energy = trajectory.cut_tails(energy, cut=cutoff, verbose=(verbosity > 2), name='Energy') 422 423 # dT 424 sig = np.std(energy) 425 result['dT'] = 2*kb*ens_temp*ens_temp/sig 426 elif ens_string == 'NPT': 427 enthalpy = energy + pvconvert * ens_press * volume 428 traj_2d = np.array([energy, volume]) 429 # Discard burn-in period and time-correlated frames 430 enthalpy = trajectory.equilibrate(enthalpy, verbose=(verbosity > 1), name='Enthalpy') 431 enthalpy = trajectory.decorrelate(enthalpy, verbose=(verbosity > 1), name='Enthalpy') 432 enthalpy = trajectory.cut_tails(enthalpy, cut=cutoff, verbose=(verbosity > 2), name='Enthalpy') 433 volume_1d = trajectory.equilibrate(volume, verbose=(verbosity > 1), name='Volume') 434 volume_1d = trajectory.decorrelate(volume_1d, verbose=(verbosity > 1), name='Volume') 435 volume_1d = trajectory.cut_tails(volume_1d, cut=cutoff, verbose=(verbosity > 2), name='Volume') 436 traj_2d = trajectory.equilibrate(traj_2d, verbose=(verbosity > 1), name='2D-Trajectory') 437 traj_2d = trajectory.decorrelate(traj_2d, facs=[1, pvconvert * ens_press], verbose=(verbosity > 1), name='2D-Trajectory') 438 traj_2d = trajectory.cut_tails(traj_2d, cut=cutoff, verbose=(verbosity > 2), name='2D-Trajectory') 439 440 # dT 441 sig = np.std(enthalpy) 442 result['dT'] = 2*kb*ens_temp*ens_temp/sig 443 # dP 444 sig = np.std(volume_1d)*pvconvert 445 result['dP'] = 2*kb*ens_temp/sig 446 # dTdP 447 cov = np.cov(traj_2d) 448 sig = np.sqrt(np.diag(cov)) 449 sig[1] *= pvconvert 450 result['dTdP'] = [2*kb*ens_temp*ens_temp/sig[0], 451 2*kb*ens_temp/sig[1]] 452 else: 453 raise pv_error.InputError('ens_str', 'Unrecognized ensemble string.') 454 455 if verbosity > 0: 456 print('A rule of thumb states that good error recognition can be expected when\n' 457 'spacing the tip of the distributions by about two standard deviations.\n' 458 'Based on this rule, and the assumption that the standard deviation of the\n' 459 'distributions is largely independent of the state point, here\'s an estimate\n' 460 'for the interval given the current simulation:') 461 if ens_string == 'NVT': 462 print('Current trajectory: NVT, T = {:.2f} {:s}'.format(ens_temp, tunit)) 463 print('Suggested interval: dT = {:.1f} {:s}'.format(result['dT'], tunit)) 464 if ens_string == 'NPT': 465 print('Current trajectory: NPT, T = {:.2f} {:s}, P = {:.2f} {:s}'.format( 466 ens_temp, tunit, ens_press, punit)) 467 print('Suggested interval:') 468 print(' Temperature-only: dT = {:.1f} {:s}'.format(result['dT'], tunit)) 469 print(' Pressure-only: dP = {:.1f} {:s}'.format(result['dP'], punit)) 470 print(' Combined: dT = {:.1f} {:s}, dP = {:.1f} {:s}'.format( 471 result['dTdP'][0], tunit, result['dTdP'][1], punit)) 472 473 474def check_1d(traj1, traj2, param1, param2, kb, 475 quantity, dtemp=False, dpress=False, dmu=False, 476 temp=None, pvconvert=None, 477 nbins=40, cutoff=0.001, seed=None, 478 verbosity=1, screen=False, filename=None): 479 r""" 480 Checks whether the energy trajectories of two simulation performed at 481 different temperatures have sampled distributions at the analytically 482 expected ratio. 483 484 Parameters 485 ---------- 486 traj1 : array-like 487 Trajectory of the first simulation 488 If dtemp: 489 490 * NVT: Potential energy U or total energy E = U + K 491 * NPT: Enthalpy H = U + pV or total energy E = H + K 492 493 If dpress: 494 495 * NPT: Volume V 496 497 traj2 : array-like 498 Trajectory of the second simulation 499 If dtemp: 500 501 * NVT: Potential energy U or total energy E = U + K 502 * NPT: Enthalpy H = U + pV or total energy E = H + K 503 504 If dpress: 505 506 * NPT: Volume V 507 508 param1 : float 509 Target temperature or pressure of the first simulation 510 param2 : float 511 Target temperature or pressure of the second simulation 512 kb : float 513 Boltzmann constant in same units as the energy trajectories 514 quantity : str 515 Name of quantity analyzed (used for printing only) 516 dtemp : bool, optional 517 Set to True if trajectories were simulated at different temperature 518 Default: False. 519 dpress : bool, optional 520 Set to True if trajectories were simulated at different pressure 521 Default: False. 522 temp : float, optional 523 The temperature in equal temperature, differring pressure NPT simulations. 524 Needed to print optimal dP. 525 pvconvert : float, optional 526 Conversion from pressure * volume to energy units. 527 Needed to print optimal dP. 528 dmu : bool, optional 529 Set to True if trajectories were simulated at different chemical potential 530 Default: False. 531 nbins : int, optional 532 Number of bins used to assess distributions of the trajectories 533 Default: 40 534 cutoff : float, optional 535 Tail cutoff of distributions. 536 Default: 0.001 (0.1%) 537 seed : int, optional 538 If set, bootstrapping will be reproducible. 539 Default: None, bootstrapping non-reproducible. 540 verbosity : int, optional 541 Verbosity level. 542 Default: 1 (only most important output) 543 screen : bool, optional 544 Plot distributions on screen. 545 Default: False. 546 filename : string, optional 547 Plot distributions to `filename`.pdf. 548 Default: None. 549 550 Returns 551 ------- 552 553 """ 554 555 if (not (dtemp or dpress or dmu) or 556 (dtemp and dpress) or 557 (dtemp and dmu) or 558 (dpress and dmu)): 559 raise pv_error.InputError(['dtemp', 'dpress', 'dmu'], 560 'Need to specify exactly one of `dtemp`, `dpress` and `dmu`.') 561 562 if dmu: 563 raise NotImplementedError('check_1d: Testing of `dmu` not implemented.') 564 565 if seed is not None: 566 raise NotImplementedError('check_1d: Bootstrapping not implemented.') 567 568 if dpress and (temp is None or pvconvert is None): 569 raise pv_error.InputError(['dpress', 'temp', 'pvconvert'], 570 '`ensemble.check_1d` with `dpress=True` requires `temp` and `pvconvert`.') 571 572 # =============================== # 573 # prepare constants, strings etc. # 574 # =============================== # 575 pstring = 'ln(P_2(' + quantity + ')/P_1(' + quantity + '))' 576 trueslope = 0 577 if dtemp: 578 trueslope = 1/(kb * param1) - 1/(kb * param2) 579 elif dpress: 580 trueslope = (param1 - param2) / (kb * temp) * pvconvert 581 582 if verbosity > 1: 583 print('Analytical slope of {:s}: {:.8f}'.format(pstring, trueslope)) 584 585 quant = {} 586 587 # ==================== # 588 # prepare trajectories # 589 # ==================== # 590 # Discard burn-in period and time-correlated frames 591 traj1 = trajectory.prepare(traj1, cut=cutoff, verbosity=verbosity, name='Trajectory 1') 592 traj2 = trajectory.prepare(traj2, cut=cutoff, verbosity=verbosity, name='Trajectory 2') 593 594 # calculate inefficiency 595 g1 = pymbar.timeseries.statisticalInefficiency(traj1) 596 g2 = pymbar.timeseries.statisticalInefficiency(traj2) 597 598 # calculate overlap 599 traj1_full = traj1 600 traj2_full = traj2 601 traj1, traj2, min_ene, max_ene = trajectory.overlap( 602 traj1=traj1_full, traj2=traj2_full, 603 ) 604 if verbosity > 0: 605 print('Overlap is {:.1%} of trajectory 1 and {:.1%} of trajectory 2.'.format( 606 traj1.shape[0] / traj1_full.shape[0], 607 traj2.shape[0] / traj2_full.shape[0] 608 )) 609 if verbosity > 0 and dtemp: 610 sig1 = np.std(traj1_full) 611 sig2 = np.std(traj2_full) 612 dt1 = 2*kb*param1*param1/sig1 613 dt2 = 2*kb*param2*param2/sig2 614 if verbosity > 1: 615 print('A rule of thumb states that a good overlap is found when dT/T = (2*kB*T)/(sig),\n' 616 'where sig is the standard deviation of the energy distribution.\n' 617 'For the current trajectories, dT = {:.1f}, sig1 = {:.1f} and sig2 = {:.1f}.\n' 618 'According to the rule of thumb, given T1, a good dT is dT = {:.1f}, and\n' 619 ' given T2, a good dT is dT = {:.1f}.'.format( 620 param2-param1, sig1, sig2, dt1, dt2) 621 ) 622 print('Rule of thumb estimates that dT = {:.1f} would be optimal ' 623 '(currently, dT = {:.1f})'.format(.5*(dt1+dt2), param2-param1)) 624 if verbosity > 0 and dpress: 625 sig1 = np.std(traj1_full)*pvconvert 626 sig2 = np.std(traj2_full)*pvconvert 627 dp1 = 2*kb*temp/sig1 628 dp2 = 2*kb*temp/sig2 629 if verbosity > 1: 630 print('A rule of thumb states that a good overlap is found when dP = (2*kB*T)/(sig),\n' 631 'where sig is the standard deviation of the volume distribution.\n' 632 'For the current trajectories, dP = {:.1f}, sig1 = {:.1g} and sig2 = {:.1g}.\n' 633 'According to the rule of thumb, given P1, a good dP is dP = {:.1f}, and\n' 634 ' given P2, a good dP is dP = {:.1f}.'.format( 635 param2-param1, sig1, sig2, dp1, dp2) 636 ) 637 print('Rule of thumb estimates that dP = {:.1f} would be optimal ' 638 '(currently, dP = {:.1f})'.format(.5*(dp1+dp2), param2-param1)) 639 if not min_ene: 640 raise pv_error.InputError(['traj1', 'traj2'], 641 'No overlap between trajectories.') 642 # calculate bins 643 bins = np.linspace(min_ene, max_ene, nbins+1) 644 bins = check_bins(traj1, traj2, bins) 645 if np.size(bins) < 3: 646 raise pv_error.InputError(['traj1', 'traj2', 'nbins', 'cutoff'], 647 'Less than 3 bins were filled in the overlap region.\n' 648 'Ensure sufficient overlap between the trajectories, and ' 649 'consider increasing `cutoff` or `nbins` if there is ' 650 'sufficient overlap but unusually long tails.') 651 652 w_f = -trueslope * traj1_full 653 w_r = trueslope * traj2_full 654 655 if verbosity > 2: 656 print('Computing log of partition functions using pymbar.BAR...') 657 df, ddf = pymbar.BAR(w_f, w_r) 658 if verbosity > 2: 659 print('Using {:.5f} for log of partition functions as computed from BAR.'.format(df)) 660 print('Uncertainty in quantity is {:.5f}.'.format(ddf)) 661 print('Assuming this is negligible compared to sampling error at individual points.') 662 663 # ========== # 664 # linear fit # 665 # ========== # 666 if verbosity > 2: 667 print('Computing linear fit parameters (for plotting / comparison)') 668 669 fitvals, dfitvals = do_linear_fit( 670 traj1=traj1_full, traj2=traj2_full, g1=g1, g2=g2, bins=bins, 671 screen=screen, filename=filename, 672 trueslope=trueslope, trueoffset=df, 673 units=None 674 ) 675 676 slope = fitvals[1] 677 dslope = dfitvals[1] 678 quant['linear'] = [abs((slope - trueslope)/dslope)] 679 if verbosity > 1: 680 print_stats( 681 title='Linear Fit Analysis (analytical error)', 682 fitvals=fitvals, 683 dfitvals=dfitvals, 684 kb=kb, 685 param1=param1, 686 param2=param2, 687 trueslope=trueslope, 688 temp=temp, pvconvert=pvconvert, 689 dtemp=dtemp, dpress=dpress, dmu=dmu 690 ) 691 692 # ================== # 693 # max-likelihood fit # 694 # ================== # 695 if verbosity > 2: 696 print('Computing the maximum likelihood parameters') 697 698 fitvals, dfitvals = do_max_likelihood_fit(traj1_full, traj2_full, g1, g2, 699 init_params=[df, trueslope], 700 verbose=(verbosity > 1)) 701 702 slope = fitvals[1] 703 dslope = dfitvals[1] 704 quant['maxLikelihood'] = [abs((slope - trueslope)/dslope)] 705 if verbosity > 0: 706 print_stats( 707 title='Maximum Likelihood Analysis (analytical error)', 708 fitvals=fitvals, 709 dfitvals=dfitvals, 710 kb=kb, 711 param1=param1, 712 param2=param2, 713 trueslope=trueslope, 714 temp=temp, pvconvert=pvconvert, 715 dtemp=dtemp, dpress=dpress, dmu=dmu 716 ) 717 718 return quant['maxLikelihood'] 719 720 721def check_2d(traj1, traj2, param1, param2, kb, pvconvert, 722 quantity, dtempdpress=False, dtempdmu=False, 723 cutoff=0.001, seed=None, 724 verbosity=1, screen=False, filename=None): 725 r""" 726 Checks whether the energy trajectories of two simulation performed at 727 different temperatures have sampled distributions at the analytically 728 expected ratio. 729 730 Parameters 731 ---------- 732 traj1 : array-like, 2d 733 Trajectory of the first simulation 734 If dtempdpress: 735 736 * traj[0,:]: Potential energy U or total energy E = U + K 737 * traj[1,:]: Volume V 738 traj2 : array-like, 2d 739 Trajectory of the second simulation 740 If dtempdpress: 741 742 * traj[0,:]: Potential energy U or total energy E = U + K 743 * traj[1,:]: Volume V 744 param1 : array-like 745 If dtempdpress: 746 Target temperature and pressure of the first simulation 747 param2 : array-like 748 If dtempdpress: 749 Target temperature and pressure of the first simulation 750 kb : float 751 Boltzmann constant in same units as the energy trajectories 752 pvconvert : float 753 Conversion from pressure * volume to energy units 754 quantity : List[str] 755 Names of quantities analyzed (used for printing only) 756 dtempdpress : bool, optional 757 Set to True if trajectories were simulated at different 758 temperature and pressure 759 Default: False. 760 dtempdmu : bool, optional 761 Set to True if trajectories were simulated at different 762 temperature and chemical potential 763 Default: False. 764 cutoff : float 765 Tail cutoff of distributions. 766 Default: 0.001 (0.1%) 767 seed : int 768 If set, bootstrapping will be reproducible. 769 Default: None, bootstrapping non-reproducible. 770 verbosity : int 771 Verbosity level. 772 Default: 1 (only most important output) 773 screen : bool, optional 774 Plot distributions on screen. 775 Default: False. 776 filename : string, optional 777 Plot distributions to `filename`.pdf. 778 Default: None. 779 780 Returns 781 ------- 782 783 """ 784 785 if not (dtempdpress or dtempdmu) or (dtempdpress and dtempdmu): 786 raise pv_error.InputError(['dtempdpress', 'dtempdmu'], 787 'Need to specify exactly one of `dtempdpress` and `dtempdmu`.') 788 789 if dtempdmu: 790 raise NotImplementedError('check_2d: Testing of `dtempdmu` not implemented.') 791 792 if seed is not None: 793 raise NotImplementedError('check_2d: Bootstrapping not implemented.') 794 795 if screen or filename is not None: 796 raise NotImplementedError('check_2d: Plotting not implemented.') 797 798 # =============================== # 799 # prepare constants, strings etc. # 800 # =============================== # 801 pstring = ('ln(P_2(' + quantity[0] + ', ' + quantity[1] + ')/' + 802 'P_1(' + quantity[0] + ', ' + quantity[1] + '))') 803 trueslope = np.zeros(2) 804 facs = [None, None] 805 if dtempdpress: 806 trueslope = np.array([ 807 1/(kb * param1[0]) - 1/(kb * param2[0]), 808 pvconvert*(1/(kb * param1[0]) * param1[1] - 1/(kb * param2[0]) * param2[1]) 809 ]) 810 facs = [[1, param1[1]], [1, param2[1]]] 811 812 if verbosity > 1: 813 print('Analytical slope of {:s}: {:.8f}, {:.8f}'.format( 814 pstring, trueslope[0], trueslope[1] 815 )) 816 817 quant = {} 818 819 # ==================== # 820 # prepare trajectories # 821 # ==================== # 822 # Discard burn-in period and time-correlated frames 823 traj1 = trajectory.prepare(traj1, cut=cutoff, facs=facs[0], 824 verbosity=verbosity, name='Trajectory 1') 825 traj2 = trajectory.prepare(traj2, cut=cutoff, facs=facs[1], 826 verbosity=verbosity, name='Trajectory 2') 827 828 # calculate inefficiency 829 g1 = np.array([ 830 pymbar.timeseries.statisticalInefficiency(traj1[0]), 831 pymbar.timeseries.statisticalInefficiency(traj1[1]) 832 ]) 833 g2 = np.array([ 834 pymbar.timeseries.statisticalInefficiency(traj2[0]), 835 pymbar.timeseries.statisticalInefficiency(traj2[1]) 836 ]) 837 838 # calculate overlap 839 traj1_full = traj1 840 traj2_full = traj2 841 traj1, traj2, min_ene, max_ene = trajectory.overlap( 842 traj1=traj1_full, traj2=traj2_full, 843 ) 844 if verbosity > 0: 845 print('Overlap is {:.1%} of trajectory 1 and {:.1%} of trajectory 2.'.format( 846 traj1.shape[1] / traj1_full.shape[1], 847 traj2.shape[1] / traj2_full.shape[1] 848 )) 849 if verbosity > 0 and dtempdpress: 850 cov1 = np.cov(traj1_full) 851 sig1 = np.sqrt(np.diag(cov1)) 852 sig1[1] *= pvconvert 853 cov2 = np.cov(traj2_full) 854 sig2 = np.sqrt(np.diag(cov2)) 855 sig2[1] *= pvconvert 856 dt1 = 2*kb*param1[0]*param1[0]/sig1[0] 857 dt2 = 2*kb*param2[0]*param2[0]/sig2[0] 858 dp1 = 2*kb*param1[0]/sig1[1] 859 dp2 = 2*kb*param2[0]/sig2[1] 860 if verbosity > 1: 861 print('A rule of thumb states that a good overlap can be expected when choosing state\n' 862 'points separated by about 2 standard deviations.\n' 863 'For the current trajectories, dT = {:.1f}, and dP = {:.1f},\n' 864 'with standard deviations sig1 = [{:.1f}, {:.1g}], and sig2 = [{:.1f}, {:.1g}].\n' 865 'According to the rule of thumb, given point 1, the estimate is dT = {:.1f}, dP = {:.1f}, and\n' 866 ' given point 2, the estimate is dT = {:.1f}, dP = {:.1f}.'.format( 867 param2[0]-param1[0], param2[1]-param1[1], 868 sig1[0], sig1[1], sig2[0], sig2[1], 869 dt1, dt2, dp1, dp2) 870 ) 871 print('Rule of thumb estimates that (dT,dP) = ({:.1f},{:.1f}) would be optimal ' 872 '(currently, (dT,dP) = ({:.1f},{:.1f}))'.format(.5*(dt1+dt2), .5*(dp1+dp2), 873 param2[0]-param1[0], param2[1]-param1[1])) 874 if min_ene is None: 875 raise pv_error.InputError(['traj1', 'traj2'], 876 'No overlap between trajectories.') 877 878 w_f = -trueslope[0] * traj1_full[0] - trueslope[1] * traj1_full[1] 879 w_r = trueslope[0] * traj2_full[0] + trueslope[1] * traj2_full[1] 880 881 if verbosity > 2: 882 print('Computing log of partition functions using pymbar.BAR...') 883 df, ddf = pymbar.BAR(w_f, w_r) 884 if verbosity > 2: 885 print('Using {:.5f} for log of partition functions as computed from BAR.'.format(df)) 886 print('Uncertainty in quantity is {:.5f}.'.format(ddf)) 887 print('Assuming this is negligible compared to sampling error at individual points.') 888 889 # ================== # 890 # max-likelihood fit # 891 # ================== # 892 if verbosity > 2: 893 print('Computing the maximum likelihood parameters') 894 895 fitvals, dfitvals = do_max_likelihood_fit(traj1_full, traj2_full, g1, g2, 896 init_params=[df, trueslope[0], trueslope[1]], 897 verbose=(verbosity > 1)) 898 899 slope = fitvals[1:] 900 dslope = dfitvals[1:] 901 quant['maxLikelihood'] = np.abs((slope - trueslope)/dslope) 902 if verbosity > 0: 903 print_stats( 904 title='Maximum Likelihood Analysis (analytical error)', 905 fitvals=fitvals, 906 dfitvals=dfitvals, 907 kb=kb, 908 param1=param1, 909 param2=param2, 910 trueslope=trueslope, 911 pvconvert=pvconvert, 912 dtempdpress=dtempdpress, dtempdmu=dtempdmu 913 ) 914 915 return quant['maxLikelihood'] 916