1 /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 Copyright (c) 2017-2021 The plumed team
3 (see the PEOPLE file at the root of the distribution for a list of names)
4
5 See http://www.plumed.org for more information.
6
7 This file is part of plumed, version 2.
8
9 plumed is free software: you can redistribute it and/or modify
10 it under the terms of the GNU Lesser General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13
14 plumed is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU Lesser General Public License for more details.
18
19 You should have received a copy of the GNU Lesser General Public License
20 along with plumed. If not, see <http://www.gnu.org/licenses/>.
21 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
22 #include "colvar/Colvar.h"
23 #include "colvar/ActionRegister.h"
24 #include "core/PlumedMain.h"
25 #include "tools/Matrix.h"
26 #include "core/GenericMolInfo.h"
27 #include "core/ActionSet.h"
28 #include "tools/File.h"
29
30 #include <string>
31 #include <cmath>
32 #include <map>
33 #include <numeric>
34 #include <ctime>
35 #include "tools/Random.h"
36
37 using namespace std;
38
39 namespace PLMD {
40 namespace isdb {
41
42 //+PLUMEDOC ISDB_COLVAR EMMI
43 /*
44 Calculate the fit of a structure or ensemble of structures with a cryo-EM density map.
45
46 This action implements the multi-scale Bayesian approach to cryo-EM data fitting introduced in Ref. \cite Hanot113951 .
47 This method allows efficient and accurate structural modeling of cryo-electron microscopy density maps at multiple scales, from coarse-grained to atomistic resolution, by addressing the presence of random and systematic errors in the data, sample heterogeneity, data correlation, and noise correlation.
48
49 The experimental density map is fit by a Gaussian Mixture Model (GMM), which is provided as an external file specified by the keyword
50 GMM_FILE. We are currently working on a web server to perform
51 this operation. In the meantime, the user can request a stand-alone version of the GMM code at massimiliano.bonomi_AT_gmail.com.
52
53 When run in single-replica mode, this action allows atomistic, flexible refinement of an individual structure into a density map.
54 Combined with a multi-replica framework (such as the -multi option in GROMACS), the user can model an ensemble of structures using
55 the Metainference approach \cite Bonomi:2016ip .
56
57 \warning
58 To use \ref EMMI, the user should always add a \ref MOLINFO line and specify a pdb file of the system.
59
60 \note
61 To enhance sampling in single-structure refinement, one can use a Replica Exchange Method, such as Parallel Tempering.
62 In this case, the user should add the NO_AVER flag to the input line.
63
64 \note
65 \ref EMMI can be used in combination with periodic and non-periodic systems. In the latter case, one should
66 add the NOPBC flag to the input line
67
68 \par Examples
69
70 In this example, we perform a single-structure refinement based on an experimental cryo-EM map. The map is fit with a GMM, whose
71 parameters are listed in the file GMM_fit.dat. This file contains one line per GMM component in the following format:
72
73 \plumedfile
74 #! FIELDS Id Weight Mean_0 Mean_1 Mean_2 Cov_00 Cov_01 Cov_02 Cov_11 Cov_12 Cov_22 Beta
75 0 2.9993805e+01 6.54628 10.37820 -0.92988 2.078920e-02 1.216254e-03 5.990827e-04 2.556246e-02 8.411835e-03 2.486254e-02 1
76 1 2.3468312e+01 6.56095 10.34790 -0.87808 1.879859e-02 6.636049e-03 3.682865e-04 3.194490e-02 1.750524e-03 3.017100e-02 1
77 ...
78 \endplumedfile
79
80 To accelerate the computation of the Bayesian score, one can:
81 - use neighbor lists, specified by the keywords NL_CUTOFF and NL_STRIDE;
82 - calculate the restraint every other step (or more).
83
84 All the heavy atoms of the system are used to calculate the density map. This list can conveniently be provided
85 using a GROMACS index file.
86
87 The input file looks as follows:
88
89 \plumedfile
90 # include pdb info
91 MOLINFO STRUCTURE=prot.pdb
92
93 # all heavy atoms
94 protein-h: GROUP NDX_FILE=index.ndx NDX_GROUP=Protein-H
95
96 # create EMMI score
97 gmm: EMMI NOPBC SIGMA_MIN=0.01 TEMP=300.0 NL_STRIDE=100 NL_CUTOFF=0.01 GMM_FILE=GMM_fit.dat ATOMS=protein-h
98
99 # translate into bias - apply every 2 steps
100 emr: BIASVALUE ARG=gmm.scoreb STRIDE=2
101
102 PRINT ARG=emr.* FILE=COLVAR STRIDE=500 FMT=%20.10f
103 \endplumedfile
104
105
106 */
107 //+ENDPLUMEDOC
108
109 class EMMI : public Colvar {
110
111 private:
112
113 // temperature in kbt
114 double kbt_;
115 // model GMM - atom types
116 vector<unsigned> GMM_m_type_;
117 // model GMM - list of atom sigmas - one per atom type
118 vector<double> GMM_m_s_;
119 // model GMM - list of atom weights - one per atom type
120 vector<double> GMM_m_w_;
121 // data GMM - means, weights, and covariances + beta option
122 vector<Vector> GMM_d_m_;
123 vector<double> GMM_d_w_;
124 vector< VectorGeneric<6> > GMM_d_cov_;
125 vector<int> GMM_d_beta_;
126 vector < vector<int> > GMM_d_grps_;
127 // overlaps
128 vector<double> ovmd_;
129 vector<double> ovdd_;
130 vector<double> ovmd_ave_;
131 // and derivatives
132 vector<Vector> ovmd_der_;
133 vector<Vector> atom_der_;
134 vector<double> GMMid_der_;
135 // constants
136 double cfact_;
137 double inv_sqrt2_, sqrt2_pi_;
138 // metainference
139 unsigned nrep_;
140 unsigned replica_;
141 vector<double> sigma_;
142 vector<double> sigma_min_;
143 vector<double> sigma_max_;
144 vector<double> dsigma_;
145 // list of prefactors for overlap between two Gaussians
146 // pre_fact = 1.0 / (2pi)**1.5 / sqrt(det_md) * Wm * Wd
147 vector<double> pre_fact_;
148 // inverse of the sum of model and data covariances matrices
149 vector< VectorGeneric<6> > inv_cov_md_;
150 // neighbor list
151 double nl_cutoff_;
152 unsigned nl_stride_;
153 bool first_time_;
154 bool no_aver_;
155 vector<unsigned> nl_;
156 // parallel stuff
157 unsigned size_;
158 unsigned rank_;
159 // pbc
160 bool pbc_;
161 // Monte Carlo stuff
162 int MCstride_;
163 double MCaccept_;
164 double MCtrials_;
165 Random random_;
166 // status stuff
167 unsigned int statusstride_;
168 string statusfilename_;
169 OFile statusfile_;
170 bool first_status_;
171 // regression
172 unsigned nregres_;
173 double scale_;
174 double scale_min_;
175 double scale_max_;
176 double dscale_;
177 // tabulated exponential
178 double dpcutoff_;
179 double dexp_;
180 unsigned nexp_;
181 vector<double> tab_exp_;
182 // simulated annealing
183 unsigned nanneal_;
184 double kanneal_;
185 double anneal_;
186 // prior exponent
187 double prior_;
188 // noise type
189 unsigned noise_;
190 // total score and virial;
191 double ene_;
192 Tensor virial_;
193 // model overlap file
194 unsigned int ovstride_;
195 string ovfilename_;
196
197 // write file with model overlap
198 void write_model_overlap(long int step);
199 // get median of vector
200 double get_median(vector<double> &v);
201 // annealing
202 double get_annealing(long int step);
203 // do regression
204 double scaleEnergy(double s);
205 double doRegression();
206 // read and write status
207 void read_status();
208 void print_status(long int step);
209 // accept or reject
210 bool doAccept(double oldE, double newE, double kbt);
211 // do MonteCarlo
212 void doMonteCarlo();
213 // read error file
214 vector<double> read_exp_errors(string errfile);
215 // read experimental overlaps
216 vector<double> read_exp_overlaps(string ovfile);
217 // calculate model GMM weights and covariances
218 vector<double> get_GMM_m(vector<AtomNumber> &atoms);
219 // read data GMM file
220 void get_GMM_d(string gmm_file);
221 // check GMM data
222 void check_GMM_d(VectorGeneric<6> &cov, double w);
223 // auxiliary method
224 void calculate_useful_stuff(double reso);
225 // get pref_fact and inv_cov_md
226 double get_prefactor_inverse (const VectorGeneric<6> &GMM_cov_0, const VectorGeneric<6> &GMM_cov_1,
227 double &GMM_w_0, double &GMM_w_1,
228 VectorGeneric<6> &sum, VectorGeneric<6> &inv_sum);
229 // calculate self overlaps between data GMM components - ovdd_
230 double get_self_overlap(unsigned id);
231 // calculate overlap between two Gaussians
232 double get_overlap(const Vector &m_m, const Vector &d_m, double &pre_fact,
233 const VectorGeneric<6> &inv_cov_md, Vector &ov_der);
234 // calculate exponent of overlap for neighbor list update
235 double get_exp_overlap(const Vector &m_m, const Vector &d_m,
236 const VectorGeneric<6> &inv_cov_md);
237 // update the neighbor list
238 void update_neighbor_list();
239 // calculate overlap
240 void calculate_overlap();
241 // Gaussian noise
242 void calculate_Gauss();
243 // Outliers noise
244 void calculate_Outliers();
245 // Marginal noise
246 void calculate_Marginal();
247
248 public:
249 static void registerKeywords( Keywords& keys );
250 explicit EMMI(const ActionOptions&);
251 // active methods:
252 void prepare() override;
253 void calculate() override;
254 };
255
256 PLUMED_REGISTER_ACTION(EMMI,"EMMI")
257
registerKeywords(Keywords & keys)258 void EMMI::registerKeywords( Keywords& keys ) {
259 Colvar::registerKeywords( keys );
260 keys.add("atoms","ATOMS","atoms for which we calculate the density map, typically all heavy atoms");
261 keys.add("compulsory","GMM_FILE","file with the parameters of the GMM components");
262 keys.add("compulsory","NL_CUTOFF","The cutoff in overlap for the neighbor list");
263 keys.add("compulsory","NL_STRIDE","The frequency with which we are updating the neighbor list");
264 keys.add("compulsory","SIGMA_MIN","minimum uncertainty");
265 keys.add("compulsory","RESOLUTION", "Cryo-EM map resolution");
266 keys.add("compulsory","NOISETYPE","functional form of the noise (GAUSS, OUTLIERS, MARGINAL)");
267 keys.add("optional","SIGMA0","initial value of the uncertainty");
268 keys.add("optional","DSIGMA","MC step for uncertainties");
269 keys.add("optional","MC_STRIDE", "Monte Carlo stride");
270 keys.add("optional","ERR_FILE","file with experimental or GMM fit errors");
271 keys.add("optional","OV_FILE","file with experimental overlaps");
272 keys.add("optional","NORM_DENSITY","integral of the experimental density");
273 keys.add("optional","STATUS_FILE","write a file with all the data useful for restart");
274 keys.add("optional","WRITE_STRIDE","write the status to a file every N steps, this can be used for restart");
275 keys.add("optional","REGRESSION","regression stride");
276 keys.add("optional","REG_SCALE_MIN","regression minimum scale");
277 keys.add("optional","REG_SCALE_MAX","regression maximum scale");
278 keys.add("optional","REG_DSCALE","regression maximum scale MC move");
279 keys.add("optional","SCALE","scale factor");
280 keys.add("optional","ANNEAL", "Length of annealing cycle");
281 keys.add("optional","ANNEAL_FACT", "Annealing temperature factor");
282 keys.add("optional","TEMP","temperature");
283 keys.add("optional","PRIOR", "exponent of uncertainty prior");
284 keys.add("optional","WRITE_OV_STRIDE","write model overlaps every N steps");
285 keys.add("optional","WRITE_OV","write a file with model overlaps");
286 keys.addFlag("NO_AVER",false,"don't do ensemble averaging in multi-replica mode");
287 componentsAreNotOptional(keys);
288 keys.addOutputComponent("scoreb","default","Bayesian score");
289 keys.addOutputComponent("acc", "NOISETYPE","MC acceptance for uncertainty");
290 keys.addOutputComponent("scale", "REGRESSION","scale factor");
291 keys.addOutputComponent("accscale", "REGRESSION","MC acceptance for scale regression");
292 keys.addOutputComponent("enescale", "REGRESSION","MC energy for scale regression");
293 keys.addOutputComponent("anneal","ANNEAL","annealing factor");
294 }
295
EMMI(const ActionOptions & ao)296 EMMI::EMMI(const ActionOptions&ao):
297 PLUMED_COLVAR_INIT(ao),
298 inv_sqrt2_(0.707106781186548),
299 sqrt2_pi_(0.797884560802865),
300 first_time_(true), no_aver_(false), pbc_(true),
301 MCstride_(1), MCaccept_(0.), MCtrials_(0.),
302 statusstride_(0), first_status_(true),
303 nregres_(0), scale_(1.),
304 dpcutoff_(15.0), nexp_(1000000), nanneal_(0),
305 kanneal_(0.), anneal_(1.), prior_(1.), ovstride_(0)
306 {
307 // periodic boundary conditions
308 bool nopbc=!pbc_;
309 parseFlag("NOPBC",nopbc);
310 pbc_=!nopbc;
311
312 // list of atoms
313 vector<AtomNumber> atoms;
314 parseAtomList("ATOMS", atoms);
315
316 // file with data GMM
317 string GMM_file;
318 parse("GMM_FILE", GMM_file);
319
320 // type of data noise
321 string noise;
322 parse("NOISETYPE",noise);
323 if (noise=="GAUSS") noise_ = 0;
324 else if(noise=="OUTLIERS") noise_ = 1;
325 else if(noise=="MARGINAL") noise_ = 2;
326 else error("Unknown noise type!");
327
328 // minimum value for error
329 double sigma_min;
330 parse("SIGMA_MIN", sigma_min);
331 if(sigma_min<0) error("SIGMA_MIN should be greater or equal to zero");
332
333 // the following parameters must be specified with noise type 0 and 1
334 double sigma_ini, dsigma;
335 if(noise_!=2) {
336 // initial value of the uncertainty
337 parse("SIGMA0", sigma_ini);
338 if(sigma_ini<=0) error("you must specify a positive SIGMA0");
339 // MC parameters
340 parse("DSIGMA", dsigma);
341 if(dsigma<0) error("you must specify a positive DSIGMA");
342 parse("MC_STRIDE", MCstride_);
343 if(dsigma>0 && MCstride_<=0) error("you must specify a positive MC_STRIDE");
344 // status file parameters
345 parse("WRITE_STRIDE", statusstride_);
346 if(statusstride_<=0) error("you must specify a positive WRITE_STRIDE");
347 parse("STATUS_FILE", statusfilename_);
348 if(statusfilename_=="") statusfilename_ = "MISTATUS"+getLabel();
349 else statusfilename_ = statusfilename_+getLabel();
350 }
351
352 // error file
353 string errfile;
354 parse("ERR_FILE", errfile);
355
356 // file with experimental overlaps
357 string ovfile;
358 parse("OV_FILE", ovfile);
359
360 // integral of the experimetal density
361 double norm_d = 0.0;
362 parse("NORM_DENSITY", norm_d);
363
364 // temperature
365 double temp=0.0;
366 parse("TEMP",temp);
367 // convert temp to kbt
368 if(temp>0.0) kbt_=plumed.getAtoms().getKBoltzmann()*temp;
369 else kbt_=plumed.getAtoms().getKbT();
370
371 // exponent of uncertainty prior
372 parse("PRIOR",prior_);
373
374 // simulated annealing stuff
375 parse("ANNEAL", nanneal_);
376 parse("ANNEAL_FACT", kanneal_);
377 if(nanneal_>0 && kanneal_<=1.0) error("with ANNEAL, ANNEAL_FACT must be greater than 1");
378
379 // regression stride
380 parse("REGRESSION",nregres_);
381 // other regression parameters
382 if(nregres_>0) {
383 parse("REG_SCALE_MIN",scale_min_);
384 parse("REG_SCALE_MAX",scale_max_);
385 parse("REG_DSCALE",dscale_);
386 // checks
387 if(scale_max_<=scale_min_) error("with REGRESSION, REG_SCALE_MAX must be greater than REG_SCALE_MIN");
388 if(dscale_<=0.) error("with REGRESSION, REG_DSCALE must be positive");
389 }
390
391 // scale factor
392 parse("SCALE", scale_);
393
394 // read map resolution
395 double reso;
396 parse("RESOLUTION", reso);
397 if(reso<=0.) error("RESOLUTION should be strictly positive");
398
399 // neighbor list stuff
400 parse("NL_CUTOFF",nl_cutoff_);
401 if(nl_cutoff_<=0.0) error("NL_CUTOFF should be explicitly specified and positive");
402 parse("NL_STRIDE",nl_stride_);
403 if(nl_stride_<=0) error("NL_STRIDE should be explicitly specified and positive");
404
405 // averaging or not
406 parseFlag("NO_AVER",no_aver_);
407
408 // write overlap file
409 parse("WRITE_OV_STRIDE", ovstride_);
410 parse("WRITE_OV", ovfilename_);
411 if(ovstride_>0 && ovfilename_=="") error("With WRITE_OV_STRIDE you must specify WRITE_OV");
412
413 checkRead();
414
415 // set parallel stuff
416 size_=comm.Get_size();
417 rank_=comm.Get_rank();
418
419 // get number of replicas
420 if(rank_==0) {
421 if(no_aver_) {
422 nrep_ = 1;
423 } else {
424 nrep_ = multi_sim_comm.Get_size();
425 }
426 replica_ = multi_sim_comm.Get_rank();
427 } else {
428 nrep_ = 0;
429 replica_ = 0;
430 }
431 comm.Sum(&nrep_,1);
432 comm.Sum(&replica_,1);
433
434 log.printf(" atoms involved : ");
435 for(unsigned i=0; i<atoms.size(); ++i) log.printf("%d ",atoms[i].serial());
436 log.printf("\n");
437 log.printf(" GMM data file : %s\n", GMM_file.c_str());
438 if(no_aver_) log.printf(" without ensemble averaging\n");
439 log.printf(" type of data noise : %s\n", noise.c_str());
440 log.printf(" neighbor list cutoff : %lf\n", nl_cutoff_);
441 log.printf(" neighbor list stride : %u\n", nl_stride_);
442 log.printf(" minimum uncertainty : %f\n",sigma_min);
443 log.printf(" scale factor : %lf\n",scale_);
444 if(nregres_>0) {
445 log.printf(" regression stride : %u\n", nregres_);
446 log.printf(" regression minimum scale : %lf\n", scale_min_);
447 log.printf(" regression maximum scale : %lf\n", scale_max_);
448 log.printf(" regression maximum scale MC move : %lf\n", dscale_);
449 }
450 if(noise_!=2) {
451 log.printf(" initial value of the uncertainty : %f\n",sigma_ini);
452 log.printf(" max MC move in uncertainty : %f\n",dsigma);
453 log.printf(" MC stride : %u\n", MCstride_);
454 log.printf(" reading/writing to status file : %s\n",statusfilename_.c_str());
455 log.printf(" with stride : %u\n",statusstride_);
456 }
457 if(errfile.size()>0) log.printf(" reading experimental errors from file : %s\n", errfile.c_str());
458 if(ovfile.size()>0) log.printf(" reading experimental overlaps from file : %s\n", ovfile.c_str());
459 log.printf(" temperature of the system in energy unit : %f\n",kbt_);
460 log.printf(" prior exponent : %f\n",prior_);
461 log.printf(" number of replicas for averaging: %u\n",nrep_);
462 log.printf(" id of the replica : %u\n",replica_);
463 if(nanneal_>0) {
464 log.printf(" length of annealing cycle : %u\n",nanneal_);
465 log.printf(" annealing factor : %f\n",kanneal_);
466 }
467 if(ovstride_>0) {
468 log.printf(" stride for writing model overlaps : %u\n",ovstride_);
469 log.printf(" file for writing model overlaps : %s\n", ovfilename_.c_str());
470 }
471
472 // set constant quantity before calculating stuff
473 cfact_ = 1.0/pow( 2.0*pi, 1.5 );
474
475 // calculate model GMM constant parameters
476 vector<double> GMM_m_w = get_GMM_m(atoms);
477
478 // read data GMM parameters
479 get_GMM_d(GMM_file);
480 log.printf(" number of GMM components : %u\n", static_cast<unsigned>(GMM_d_m_.size()));
481
482 // normalize atom weight map
483 if(norm_d <= 0.0) norm_d = accumulate(GMM_d_w_.begin(), GMM_d_w_.end(), 0.0);
484 double norm_m = accumulate(GMM_m_w.begin(), GMM_m_w.end(), 0.0);
485 // renormalization
486 for(unsigned i=0; i<GMM_m_w_.size(); ++i) GMM_m_w_[i] *= norm_d / norm_m;
487
488 // read experimental errors
489 vector<double> exp_err;
490 if(errfile.size()>0) exp_err = read_exp_errors(errfile);
491
492 // get self overlaps between data GMM components
493 if(ovfile.size()>0) {
494 ovdd_ = read_exp_overlaps(ovfile);
495 } else {
496 for(unsigned i=0; i<GMM_d_m_.size(); ++i) {
497 double ov = get_self_overlap(i);
498 ovdd_.push_back(ov);
499 }
500 }
501
502 log.printf(" number of GMM groups : %u\n", static_cast<unsigned>(GMM_d_grps_.size()));
503 // cycle on GMM groups
504 for(unsigned Gid=0; Gid<GMM_d_grps_.size(); ++Gid) {
505 log.printf(" group %d\n", Gid);
506 // calculate median overlap and experimental error
507 vector<double> ovdd;
508 vector<double> err;
509 // cycle on the group members
510 for(unsigned i=0; i<GMM_d_grps_[Gid].size(); ++i) {
511 // GMM id
512 int GMMid = GMM_d_grps_[Gid][i];
513 // add to experimental error
514 if(errfile.size()>0) err.push_back(exp_err[GMMid]);
515 else err.push_back(0.);
516 // add to GMM overlap
517 ovdd.push_back(ovdd_[GMMid]);
518 }
519 // calculate median quantities
520 double ovdd_m = get_median(ovdd);
521 double err_m = get_median(err);
522 // print out statistics
523 log.printf(" # of members : %zu\n", GMM_d_grps_[Gid].size());
524 log.printf(" median overlap : %lf\n", ovdd_m);
525 log.printf(" median error : %lf\n", err_m);
526 // add minimum value of sigma for this group of GMMs
527 sigma_min_.push_back(sqrt(err_m*err_m+sigma_min*ovdd_m*sigma_min*ovdd_m));
528 // these are only needed with Gaussian and Outliers noise models
529 if(noise_!=2) {
530 // set dsigma
531 dsigma_.push_back(dsigma * ovdd_m);
532 // set sigma max
533 sigma_max_.push_back(10.0*ovdd_m + sigma_min_[Gid] + dsigma_[Gid]);
534 // initialize sigma
535 sigma_.push_back(std::max(sigma_min_[Gid],std::min(sigma_ini*ovdd_m,sigma_max_[Gid])));
536 }
537 }
538
539 // read status file if restarting
540 if(getRestart() && noise_!=2) read_status();
541
542 // calculate auxiliary stuff
543 calculate_useful_stuff(reso);
544
545 // prepare data and derivative vectors
546 ovmd_.resize(ovdd_.size());
547 atom_der_.resize(GMM_m_type_.size());
548 GMMid_der_.resize(ovdd_.size());
549
550 // clear things that are no longer needed
551 GMM_d_cov_.clear();
552
553 // add components
554 addComponentWithDerivatives("scoreb"); componentIsNotPeriodic("scoreb");
555
556 if(noise_!=2) {addComponent("acc"); componentIsNotPeriodic("acc");}
557
558 if(nregres_>0) {
559 addComponent("scale"); componentIsNotPeriodic("scale");
560 addComponent("accscale"); componentIsNotPeriodic("accscale");
561 addComponent("enescale"); componentIsNotPeriodic("enescale");
562 }
563
564 if(nanneal_>0) {addComponent("anneal"); componentIsNotPeriodic("anneal");}
565
566 // initialize random seed
567 unsigned iseed;
568 if(rank_==0) iseed = time(NULL)+replica_;
569 else iseed = 0;
570 comm.Sum(&iseed, 1);
571 random_.setSeed(-iseed);
572
573 // request the atoms
574 requestAtoms(atoms);
575
576 // print bibliography
577 log<<" Bibliography "<<plumed.cite("Bonomi, Camilloni, Bioinformatics, 33, 3999 (2017)");
578 log<<plumed.cite("Hanot, Bonomi, Greenberg, Sali, Nilges, Vendruscolo, Pellarin, bioRxiv doi: 10.1101/113951 (2017)");
579 log<<plumed.cite("Bonomi, Pellarin, Vendruscolo, Biophys. J. 114, 1604 (2018)");
580 if(!no_aver_ && nrep_>1)log<<plumed.cite("Bonomi, Camilloni, Cavalli, Vendruscolo, Sci. Adv. 2, e150117 (2016)");
581 log<<"\n";
582 }
583
write_model_overlap(long int step)584 void EMMI::write_model_overlap(long int step)
585 {
586 OFile ovfile;
587 ovfile.link(*this);
588 std::string num; Tools::convert(step,num);
589 string name = ovfilename_+"-"+num;
590 ovfile.open(name);
591 ovfile.setHeavyFlush();
592 ovfile.fmtField("%10.7e ");
593 // write overlaps
594 for(int i=0; i<ovmd_.size(); ++i) {
595 ovfile.printField("Model", ovmd_[i]);
596 ovfile.printField("ModelScaled", scale_ * ovmd_[i]);
597 ovfile.printField("Data", ovdd_[i]);
598 ovfile.printField();
599 }
600 ovfile.close();
601 }
602
get_median(vector<double> & v)603 double EMMI::get_median(vector<double> &v)
604 {
605 // dimension of vector
606 unsigned size = v.size();
607 // in case of only one entry
608 if (size==1) {
609 return v[0];
610 } else {
611 // reorder vector
612 sort(v.begin(), v.end());
613 // odd or even?
614 if (size%2==0) {
615 return (v[size/2-1]+v[size/2])/2.0;
616 } else {
617 return v[size/2];
618 }
619 }
620 }
621
read_status()622 void EMMI::read_status()
623 {
624 double MDtime;
625 // open file
626 IFile *ifile = new IFile();
627 ifile->link(*this);
628 if(ifile->FileExist(statusfilename_)) {
629 ifile->open(statusfilename_);
630 while(ifile->scanField("MD_time", MDtime)) {
631 for(unsigned i=0; i<sigma_.size(); ++i) {
632 // convert i to string
633 std::string num; Tools::convert(i,num);
634 // read entries
635 ifile->scanField("s"+num, sigma_[i]);
636 }
637 // new line
638 ifile->scanField();
639 }
640 ifile->close();
641 } else {
642 error("Cannot find status file "+statusfilename_+"\n");
643 }
644 delete ifile;
645 }
646
print_status(long int step)647 void EMMI::print_status(long int step)
648 {
649 // if first time open the file
650 if(first_status_) {
651 first_status_ = false;
652 statusfile_.link(*this);
653 statusfile_.open(statusfilename_);
654 statusfile_.setHeavyFlush();
655 statusfile_.fmtField("%6.3e ");
656 }
657 // write fields
658 double MDtime = static_cast<double>(step)*getTimeStep();
659 statusfile_.printField("MD_time", MDtime);
660 for(unsigned i=0; i<sigma_.size(); ++i) {
661 // convert i to string
662 std::string num; Tools::convert(i,num);
663 // print entry
664 statusfile_.printField("s"+num, sigma_[i]);
665 }
666 statusfile_.printField();
667 }
668
doAccept(double oldE,double newE,double kbt)669 bool EMMI::doAccept(double oldE, double newE, double kbt) {
670 bool accept = false;
671 // calculate delta energy
672 double delta = ( newE - oldE ) / kbt;
673 // if delta is negative always accept move
674 if( delta < 0.0 ) {
675 accept = true;
676 } else {
677 // otherwise extract random number
678 double s = random_.RandU01();
679 if( s < exp(-delta) ) { accept = true; }
680 }
681 return accept;
682 }
683
doMonteCarlo()684 void EMMI::doMonteCarlo()
685 {
686 // extract random GMM group
687 unsigned nGMM = static_cast<unsigned>(floor(random_.RandU01()*static_cast<double>(GMM_d_grps_.size())));
688 if(nGMM==GMM_d_grps_.size()) nGMM -= 1;
689
690 // generate random move
691 double shift = dsigma_[nGMM] * ( 2.0 * random_.RandU01() - 1.0 );
692 // new sigma
693 double new_s = sigma_[nGMM] + shift;
694 // check boundaries
695 if(new_s > sigma_max_[nGMM]) {new_s = 2.0 * sigma_max_[nGMM] - new_s;}
696 if(new_s < sigma_min_[nGMM]) {new_s = 2.0 * sigma_min_[nGMM] - new_s;}
697 // old s2
698 double old_inv_s2 = 1.0 / sigma_[nGMM] / sigma_[nGMM];
699 // new s2
700 double new_inv_s2 = 1.0 / new_s / new_s;
701
702 // cycle on GMM group and calculate old and new energy
703 double old_ene = 0.0;
704 double new_ene = 0.0;
705 double ng = static_cast<double>(GMM_d_grps_[nGMM].size());
706
707 // in case of Gaussian noise
708 if(noise_==0) {
709 double chi2 = 0.0;
710 for(unsigned i=0; i<GMM_d_grps_[nGMM].size(); ++i) {
711 // id GMM component
712 int GMMid = GMM_d_grps_[nGMM][i];
713 // deviation
714 double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] );
715 // add to chi2
716 chi2 += dev * dev;
717 }
718 // final energy calculation: add normalization and prior
719 old_ene = 0.5 * kbt_ * ( chi2 * old_inv_s2 - (ng+prior_) * std::log(old_inv_s2) );
720 new_ene = 0.5 * kbt_ * ( chi2 * new_inv_s2 - (ng+prior_) * std::log(new_inv_s2) );
721 }
722
723 // in case of Outliers noise
724 if(noise_==1) {
725 for(unsigned i=0; i<GMM_d_grps_[nGMM].size(); ++i) {
726 // id GMM component
727 int GMMid = GMM_d_grps_[nGMM][i];
728 // calculate deviation
729 double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] );
730 // add to energies
731 old_ene += std::log( 1.0 + 0.5 * dev * dev * old_inv_s2);
732 new_ene += std::log( 1.0 + 0.5 * dev * dev * new_inv_s2);
733 }
734 // final energy calculation: add normalization and prior
735 old_ene = kbt_ * ( old_ene + (ng+prior_) * std::log(sigma_[nGMM]) );
736 new_ene = kbt_ * ( new_ene + (ng+prior_) * std::log(new_s) );
737 }
738
739 // increment number of trials
740 MCtrials_ += 1.0;
741
742 // accept or reject
743 bool accept = doAccept(old_ene/anneal_, new_ene/anneal_, kbt_);
744 if(accept) {
745 sigma_[nGMM] = new_s;
746 MCaccept_ += 1.0;
747 }
748 // local communication
749 if(rank_!=0) {
750 for(unsigned i=0; i<sigma_.size(); ++i) sigma_[i] = 0.0;
751 MCaccept_ = 0.0;
752 }
753 if(size_>1) {
754 comm.Sum(&sigma_[0], sigma_.size());
755 comm.Sum(&MCaccept_, 1);
756 }
757 }
758
read_exp_errors(string errfile)759 vector<double> EMMI::read_exp_errors(string errfile)
760 {
761 int nexp, idcomp;
762 double err;
763 vector<double> exp_err;
764 // open file
765 IFile *ifile = new IFile();
766 if(ifile->FileExist(errfile)) {
767 ifile->open(errfile);
768 // scan for number of experimental errors
769 ifile->scanField("Nexp", nexp);
770 // cycle on GMM components
771 while(ifile->scanField("Id",idcomp)) {
772 // total experimental error
773 double err_tot = 0.0;
774 // cycle on number of experimental overlaps
775 for(unsigned i=0; i<nexp; ++i) {
776 string ss; Tools::convert(i,ss);
777 ifile->scanField("Err"+ss, err);
778 // add to total error
779 err_tot += err*err;
780 }
781 // new line
782 ifile->scanField();
783 // calculate RMSE
784 err_tot = sqrt(err_tot/static_cast<double>(nexp));
785 // add to global
786 exp_err.push_back(err_tot);
787 }
788 ifile->close();
789 } else {
790 error("Cannot find ERR_FILE "+errfile+"\n");
791 }
792 return exp_err;
793 }
794
read_exp_overlaps(string ovfile)795 vector<double> EMMI::read_exp_overlaps(string ovfile)
796 {
797 int idcomp;
798 double ov;
799 vector<double> ovdd;
800 // open file
801 IFile *ifile = new IFile();
802 if(ifile->FileExist(ovfile)) {
803 ifile->open(ovfile);
804 // cycle on GMM components
805 while(ifile->scanField("Id",idcomp)) {
806 // read experimental overlap
807 ifile->scanField("Overlap", ov);
808 // add to ovdd
809 ovdd.push_back(ov);
810 // new line
811 ifile->scanField();
812 }
813 ifile->close();
814 } else {
815 error("Cannot find OV_FILE "+ovfile+"\n");
816 }
817 return ovdd;
818 }
819
get_GMM_m(vector<AtomNumber> & atoms)820 vector<double> EMMI::get_GMM_m(vector<AtomNumber> &atoms)
821 {
822 // list of weights - one per atom
823 vector<double> GMM_m_w;
824
825 auto* moldat=plumed.getActionSet().selectLatest<GenericMolInfo*>(this);
826 // map of atom types to A and B coefficients of scattering factor
827 // f(s) = A * exp(-B*s**2)
828 // B is in Angstrom squared
829 // map between an atom type and an index
830 map<string, unsigned> type_map;
831 type_map["C"]=0;
832 type_map["O"]=1;
833 type_map["N"]=2;
834 type_map["S"]=3;
835 // fill in sigma vector
836 GMM_m_s_.push_back(0.01*15.146); // type 0
837 GMM_m_s_.push_back(0.01*8.59722); // type 1
838 GMM_m_s_.push_back(0.01*11.1116); // type 2
839 GMM_m_s_.push_back(0.01*15.8952); // type 3
840 // fill in weight vector
841 GMM_m_w_.push_back(2.49982); // type 0
842 GMM_m_w_.push_back(1.97692); // type 1
843 GMM_m_w_.push_back(2.20402); // type 2
844 GMM_m_w_.push_back(5.14099); // type 3
845
846 // check if MOLINFO line is present
847 if( moldat ) {
848 log<<" MOLINFO DATA found with label " <<moldat->getLabel()<<", using proper atom names\n";
849 for(unsigned i=0; i<atoms.size(); ++i) {
850 // get atom name
851 string name = moldat->getAtomName(atoms[i]);
852 char type;
853 // get atom type
854 char first = name.at(0);
855 // GOLDEN RULE: type is first letter, if not a number
856 if (!isdigit(first)) {
857 type = first;
858 // otherwise is the second
859 } else {
860 type = name.at(1);
861 }
862 // check if key in map
863 std::string type_s = std::string(1,type);
864 if(type_map.find(type_s) != type_map.end()) {
865 // save atom type
866 GMM_m_type_.push_back(type_map[type_s]);
867 // this will be normalized in the final density
868 GMM_m_w.push_back(GMM_m_w_[type_map[type_s]]);
869 } else {
870 error("Wrong atom type "+type_s+" from atom name "+name+"\n");
871 }
872 }
873 } else {
874 error("MOLINFO DATA not found\n");
875 }
876 return GMM_m_w;
877 }
878
check_GMM_d(VectorGeneric<6> & cov,double w)879 void EMMI::check_GMM_d(VectorGeneric<6> &cov, double w)
880 {
881
882 // check if positive defined, by calculating the 3 leading principal minors
883 double pm1 = cov[0];
884 double pm2 = cov[0]*cov[3]-cov[1]*cov[1];
885 double pm3 = cov[0]*(cov[3]*cov[5]-cov[4]*cov[4])-cov[1]*(cov[1]*cov[5]-cov[4]*cov[2])+cov[2]*(cov[1]*cov[4]-cov[3]*cov[2]);
886 // apply Sylvester’s criterion
887 if(pm1<=0.0 || pm2<=0.0 || pm3<=0.0)
888 error("check data GMM: covariance matrix is not positive defined");
889
890 // check if weight is positive
891 if(w<=0) error("check data GMM: weight must be positive");
892 }
893
894 // read GMM data file in PLUMED format:
get_GMM_d(string GMM_file)895 void EMMI::get_GMM_d(string GMM_file)
896 {
897 VectorGeneric<6> cov;
898
899 // open file
900 std::unique_ptr<IFile> ifile(new IFile);
901 if(ifile->FileExist(GMM_file)) {
902 ifile->open(GMM_file);
903 int idcomp;
904 while(ifile->scanField("Id",idcomp)) {
905 int beta;
906 double w, m0, m1, m2;
907 ifile->scanField("Weight",w);
908 ifile->scanField("Mean_0",m0);
909 ifile->scanField("Mean_1",m1);
910 ifile->scanField("Mean_2",m2);
911 ifile->scanField("Cov_00",cov[0]);
912 ifile->scanField("Cov_01",cov[1]);
913 ifile->scanField("Cov_02",cov[2]);
914 ifile->scanField("Cov_11",cov[3]);
915 ifile->scanField("Cov_12",cov[4]);
916 ifile->scanField("Cov_22",cov[5]);
917 ifile->scanField("Beta",beta);
918 // check input
919 check_GMM_d(cov, w);
920 // check beta
921 if(beta<0) error("Beta must be positive!");
922 // center of the Gaussian
923 GMM_d_m_.push_back(Vector(m0,m1,m2));
924 // covariance matrix
925 GMM_d_cov_.push_back(cov);
926 // weight
927 GMM_d_w_.push_back(w);
928 // beta
929 GMM_d_beta_.push_back(beta);
930 // new line
931 ifile->scanField();
932 }
933 } else {
934 error("Cannot find GMM_FILE "+GMM_file+"\n");
935 }
936 // now create a set from beta (unique set of values)
937 set<int> bu(GMM_d_beta_.begin(), GMM_d_beta_.end());
938 // now prepare the group vector
939 GMM_d_grps_.resize(bu.size());
940 // and fill it in
941 for(unsigned i=0; i<GMM_d_beta_.size(); ++i) {
942 if(GMM_d_beta_[i]>=GMM_d_grps_.size()) error("Check Beta values");
943 GMM_d_grps_[GMM_d_beta_[i]].push_back(i);
944 }
945 }
946
calculate_useful_stuff(double reso)947 void EMMI::calculate_useful_stuff(double reso)
948 {
949 // We use the following definition for resolution:
950 // the Fourier transform of the density distribution in real space
951 // f(s) falls to 1/e of its maximum value at wavenumber 1/resolution
952 // i.e. from f(s) = A * exp(-B*s**2) -> Res = sqrt(B).
953 // average value of B
954 double Bave = 0.0;
955 for(unsigned i=0; i<GMM_m_type_.size(); ++i) {
956 Bave += GMM_m_s_[GMM_m_type_[i]];
957 }
958 Bave /= static_cast<double>(GMM_m_type_.size());
959 // calculate blur factor
960 double blur = 0.0;
961 if(reso*reso>Bave) blur = reso*reso-Bave;
962 else warning("PLUMED should not be used with maps at resolution better than 0.3 nm");
963 // add blur to B
964 for(unsigned i=0; i<GMM_m_s_.size(); ++i) GMM_m_s_[i] += blur;
965 // calculate average resolution
966 double ave_res = 0.0;
967 for(unsigned i=0; i<GMM_m_type_.size(); ++i) {
968 ave_res += sqrt(GMM_m_s_[GMM_m_type_[i]]);
969 }
970 ave_res = ave_res / static_cast<double>(GMM_m_type_.size());
971 log.printf(" experimental map resolution : %3.2f\n", reso);
972 log.printf(" predicted map resolution : %3.2f\n", ave_res);
973 log.printf(" blur factor : %f\n", blur);
974 // now calculate useful stuff
975 VectorGeneric<6> cov, sum, inv_sum;
976 // cycle on all atoms types (4 for the moment)
977 for(unsigned i=0; i<GMM_m_s_.size(); ++i) {
978 // the Gaussian in density (real) space is the FT of scattering factor
979 // f(r) = A * (pi/B)**1.5 * exp(-pi**2/B*r**2)
980 double s = sqrt ( 0.5 * GMM_m_s_[i] ) / pi;
981 // covariance matrix for spherical Gaussian
982 cov[0]=s*s; cov[1]=0.0; cov[2]=0.0;
983 cov[3]=s*s; cov[4]=0.0;
984 cov[5]=s*s;
985 // cycle on all data GMM
986 for(unsigned j=0; j<GMM_d_m_.size(); ++j) {
987 // we need the sum of the covariance matrices
988 for(unsigned k=0; k<6; ++k) sum[k] = cov[k] + GMM_d_cov_[j][k];
989 // and to calculate its determinant
990 double det = sum[0]*(sum[3]*sum[5]-sum[4]*sum[4]);
991 det -= sum[1]*(sum[1]*sum[5]-sum[4]*sum[2]);
992 det += sum[2]*(sum[1]*sum[4]-sum[3]*sum[2]);
993 // calculate prefactor - model weights are already normalized
994 double pre_fact = cfact_ / sqrt(det) * GMM_d_w_[j] * GMM_m_w_[i];
995 // and its inverse
996 inv_sum[0] = (sum[3]*sum[5] - sum[4]*sum[4])/det;
997 inv_sum[1] = (sum[2]*sum[4] - sum[1]*sum[5])/det;
998 inv_sum[2] = (sum[1]*sum[4] - sum[2]*sum[3])/det;
999 inv_sum[3] = (sum[0]*sum[5] - sum[2]*sum[2])/det;
1000 inv_sum[4] = (sum[2]*sum[1] - sum[0]*sum[4])/det;
1001 inv_sum[5] = (sum[0]*sum[3] - sum[1]*sum[1])/det;
1002 // now we store the prefactor
1003 pre_fact_.push_back(pre_fact);
1004 // and the inverse of the sum
1005 inv_cov_md_.push_back(inv_sum);
1006 }
1007 }
1008 // tabulate exponential
1009 dexp_ = dpcutoff_ / static_cast<double> (nexp_-1);
1010 for(unsigned i=0; i<nexp_; ++i) {
1011 tab_exp_.push_back(exp(-static_cast<double>(i) * dexp_));
1012 }
1013 }
1014
1015 // get prefactors
get_prefactor_inverse(const VectorGeneric<6> & GMM_cov_0,const VectorGeneric<6> & GMM_cov_1,double & GMM_w_0,double & GMM_w_1,VectorGeneric<6> & sum,VectorGeneric<6> & inv_sum)1016 double EMMI::get_prefactor_inverse
1017 (const VectorGeneric<6> &GMM_cov_0, const VectorGeneric<6> &GMM_cov_1,
1018 double &GMM_w_0, double &GMM_w_1,
1019 VectorGeneric<6> &sum, VectorGeneric<6> &inv_sum)
1020 {
1021 // we need the sum of the covariance matrices
1022 for(unsigned k=0; k<6; ++k) sum[k] = GMM_cov_0[k] + GMM_cov_1[k];
1023
1024 // and to calculate its determinant
1025 double det = sum[0]*(sum[3]*sum[5]-sum[4]*sum[4]);
1026 det -= sum[1]*(sum[1]*sum[5]-sum[4]*sum[2]);
1027 det += sum[2]*(sum[1]*sum[4]-sum[3]*sum[2]);
1028
1029 // the prefactor is
1030 double pre_fact = cfact_ / sqrt(det) * GMM_w_0 * GMM_w_1;
1031
1032 // and its inverse
1033 inv_sum[0] = (sum[3]*sum[5] - sum[4]*sum[4])/det;
1034 inv_sum[1] = (sum[2]*sum[4] - sum[1]*sum[5])/det;
1035 inv_sum[2] = (sum[1]*sum[4] - sum[2]*sum[3])/det;
1036 inv_sum[3] = (sum[0]*sum[5] - sum[2]*sum[2])/det;
1037 inv_sum[4] = (sum[2]*sum[1] - sum[0]*sum[4])/det;
1038 inv_sum[5] = (sum[0]*sum[3] - sum[1]*sum[1])/det;
1039
1040 // return pre-factor
1041 return pre_fact;
1042 }
1043
get_self_overlap(unsigned id)1044 double EMMI::get_self_overlap(unsigned id)
1045 {
1046 double ov_tot = 0.0;
1047 VectorGeneric<6> sum, inv_sum;
1048 Vector ov_der;
1049 // start loop
1050 for(unsigned i=0; i<GMM_d_m_.size(); ++i) {
1051 // call auxiliary method
1052 double pre_fact = get_prefactor_inverse(GMM_d_cov_[id], GMM_d_cov_[i],
1053 GMM_d_w_[id], GMM_d_w_[i], sum, inv_sum);
1054 // add overlap to ov_tot
1055 ov_tot += get_overlap(GMM_d_m_[id], GMM_d_m_[i], pre_fact, inv_sum, ov_der);
1056 }
1057 // and return it
1058 return ov_tot;
1059 }
1060
1061 // get overlap and derivatives
get_overlap(const Vector & m_m,const Vector & d_m,double & pre_fact,const VectorGeneric<6> & inv_cov_md,Vector & ov_der)1062 double EMMI::get_overlap(const Vector &m_m, const Vector &d_m, double &pre_fact,
1063 const VectorGeneric<6> &inv_cov_md, Vector &ov_der)
1064 {
1065 Vector md;
1066 // calculate vector difference m_m-d_m with/without pbc
1067 if(pbc_) md = pbcDistance(d_m, m_m);
1068 else md = delta(d_m, m_m);
1069 // calculate product of transpose of md and inv_cov_md
1070 double p_x = md[0]*inv_cov_md[0]+md[1]*inv_cov_md[1]+md[2]*inv_cov_md[2];
1071 double p_y = md[0]*inv_cov_md[1]+md[1]*inv_cov_md[3]+md[2]*inv_cov_md[4];
1072 double p_z = md[0]*inv_cov_md[2]+md[1]*inv_cov_md[4]+md[2]*inv_cov_md[5];
1073 // calculate product of prod and md
1074 double ov = md[0]*p_x+md[1]*p_y+md[2]*p_z;
1075 // final calculation
1076 ov = pre_fact * exp(-0.5*ov);
1077 // derivatives
1078 ov_der = ov * Vector(p_x, p_y, p_z);
1079 return ov;
1080 }
1081
1082 // get the exponent of the overlap
get_exp_overlap(const Vector & m_m,const Vector & d_m,const VectorGeneric<6> & inv_cov_md)1083 double EMMI::get_exp_overlap(const Vector &m_m, const Vector &d_m,
1084 const VectorGeneric<6> &inv_cov_md)
1085 {
1086 Vector md;
1087 // calculate vector difference m_m-d_m with/without pbc
1088 if(pbc_) md = pbcDistance(d_m, m_m);
1089 else md = delta(d_m, m_m);
1090 // calculate product of transpose of md and inv_cov_md
1091 double p_x = md[0]*inv_cov_md[0]+md[1]*inv_cov_md[1]+md[2]*inv_cov_md[2];
1092 double p_y = md[0]*inv_cov_md[1]+md[1]*inv_cov_md[3]+md[2]*inv_cov_md[4];
1093 double p_z = md[0]*inv_cov_md[2]+md[1]*inv_cov_md[4]+md[2]*inv_cov_md[5];
1094 // calculate product of prod and md
1095 double ov = md[0]*p_x+md[1]*p_y+md[2]*p_z;
1096 return ov;
1097 }
1098
update_neighbor_list()1099 void EMMI::update_neighbor_list()
1100 {
1101 // dimension of GMM and atom vectors
1102 unsigned GMM_d_size = GMM_d_m_.size();
1103 unsigned GMM_m_size = GMM_m_type_.size();
1104 // local neighbor list
1105 vector < unsigned > nl_l;
1106 // clear old neighbor list
1107 nl_.clear();
1108
1109 // cycle on GMM components - in parallel
1110 for(unsigned id=rank_; id<GMM_d_size; id+=size_) {
1111 // overlap lists and map
1112 vector<double> ov_l;
1113 map<double, unsigned> ov_m;
1114 // total overlap with id
1115 double ov_tot = 0.0;
1116 // cycle on all atoms
1117 for(unsigned im=0; im<GMM_m_size; ++im) {
1118 // get index in auxiliary lists
1119 unsigned kaux = GMM_m_type_[im] * GMM_d_size + id;
1120 // calculate exponent of overlap
1121 double expov = get_exp_overlap(GMM_d_m_[id], getPosition(im), inv_cov_md_[kaux]);
1122 // get index of 0.5*expov in tabulated exponential
1123 unsigned itab = static_cast<unsigned> (round( 0.5*expov/dexp_ ));
1124 // check boundaries and skip atom in case
1125 if(itab >= tab_exp_.size()) continue;
1126 // in case calculate overlap
1127 double ov = pre_fact_[kaux] * tab_exp_[itab];
1128 // add to list
1129 ov_l.push_back(ov);
1130 // and map to retrieve atom index
1131 ov_m[ov] = im;
1132 // increase ov_tot
1133 ov_tot += ov;
1134 }
1135 // check if zero size -> ov_tot = 0
1136 if(ov_l.size()==0) continue;
1137 // define cutoff
1138 double ov_cut = ov_tot * nl_cutoff_;
1139 // sort ov_l in ascending order
1140 std::sort(ov_l.begin(), ov_l.end());
1141 // integrate ov_l
1142 double res = 0.0;
1143 for(unsigned i=0; i<ov_l.size(); ++i) {
1144 res += ov_l[i];
1145 // if exceeding the cutoff for overlap, stop
1146 if(res >= ov_cut) break;
1147 else ov_m.erase(ov_l[i]);
1148 }
1149 // now add atoms to neighborlist
1150 for(map<double, unsigned>::iterator it=ov_m.begin(); it!=ov_m.end(); ++it)
1151 nl_l.push_back(id*GMM_m_size+it->second);
1152 // end cycle on GMM components in parallel
1153 }
1154 // find total dimension of neighborlist
1155 vector <int> recvcounts(size_, 0);
1156 recvcounts[rank_] = nl_l.size();
1157 comm.Sum(&recvcounts[0], size_);
1158 int tot_size = accumulate(recvcounts.begin(), recvcounts.end(), 0);
1159 // resize neighbor stuff
1160 nl_.resize(tot_size);
1161 // calculate vector of displacement
1162 vector<int> disp(size_);
1163 disp[0] = 0;
1164 int rank_size = 0;
1165 for(unsigned i=0; i<size_-1; ++i) {
1166 rank_size += recvcounts[i];
1167 disp[i+1] = rank_size;
1168 }
1169 // Allgather neighbor list
1170 comm.Allgatherv(&nl_l[0], recvcounts[rank_], &nl_[0], &recvcounts[0], &disp[0]);
1171 // now resize derivatives
1172 ovmd_der_.resize(tot_size);
1173 }
1174
prepare()1175 void EMMI::prepare()
1176 {
1177 if(getExchangeStep()) first_time_=true;
1178 }
1179
1180 // overlap calculator
calculate_overlap()1181 void EMMI::calculate_overlap() {
1182
1183 if(first_time_ || getExchangeStep() || getStep()%nl_stride_==0) {
1184 update_neighbor_list();
1185 first_time_=false;
1186 }
1187
1188 // clean temporary vectors
1189 for(unsigned i=0; i<ovmd_.size(); ++i) ovmd_[i] = 0.0;
1190 for(unsigned i=0; i<ovmd_der_.size(); ++i) ovmd_der_[i] = Vector(0,0,0);
1191
1192 // we have to cycle over all model and data GMM components in the neighbor list
1193 unsigned GMM_d_size = GMM_d_m_.size();
1194 unsigned GMM_m_size = GMM_m_type_.size();
1195 for(unsigned i=rank_; i<nl_.size(); i=i+size_) {
1196 // get data (id) and atom (im) indexes
1197 unsigned id = nl_[i] / GMM_m_size;
1198 unsigned im = nl_[i] % GMM_m_size;
1199 // get index in auxiliary lists
1200 unsigned kaux = GMM_m_type_[im] * GMM_d_size + id;
1201 // add overlap with im component of model GMM
1202 ovmd_[id] += get_overlap(GMM_d_m_[id], getPosition(im), pre_fact_[kaux],
1203 inv_cov_md_[kaux], ovmd_der_[i]);
1204 }
1205 // communicate stuff
1206 if(size_>1) {
1207 comm.Sum(&ovmd_[0], ovmd_.size());
1208 comm.Sum(&ovmd_der_[0][0], 3*ovmd_der_.size());
1209 }
1210 }
1211
scaleEnergy(double s)1212 double EMMI::scaleEnergy(double s)
1213 {
1214 double ene = 0.0;
1215 for(unsigned i=0; i<ovdd_.size(); ++i) {
1216 ene += std::log( abs ( s * ovmd_[i] - ovdd_[i] ) );
1217 }
1218 return ene;
1219 }
1220
doRegression()1221 double EMMI::doRegression()
1222 {
1223 // standard MC parameters
1224 unsigned MCsteps = 100000;
1225 double kbtmin = 1.0;
1226 double kbtmax = 10.0;
1227 unsigned ncold = 5000;
1228 unsigned nhot = 2000;
1229 double MCacc = 0.0;
1230 double kbt, ebest, scale_best;
1231
1232 // initial value of scale factor and energy
1233 double scale = random_.RandU01() * ( scale_max_ - scale_min_ ) + scale_min_;
1234 double ene = scaleEnergy(scale);
1235 // set best energy
1236 ebest = ene;
1237
1238 // MC loop
1239 for(unsigned istep=0; istep<MCsteps; ++istep) {
1240 // get temperature
1241 if(istep%(ncold+nhot)<ncold) kbt = kbtmin;
1242 else kbt = kbtmax;
1243 // propose move in scale
1244 double ds = dscale_ * ( 2.0 * random_.RandU01() - 1.0 );
1245 double new_scale = scale + ds;
1246 // check boundaries
1247 if(new_scale > scale_max_) {new_scale = 2.0 * scale_max_ - new_scale;}
1248 if(new_scale < scale_min_) {new_scale = 2.0 * scale_min_ - new_scale;}
1249 // new energy
1250 double new_ene = scaleEnergy(new_scale);
1251 // accept or reject
1252 bool accept = doAccept(ene, new_ene, kbt);
1253 // in case of acceptance
1254 if(accept) {
1255 scale = new_scale;
1256 ene = new_ene;
1257 MCacc += 1.0;
1258 }
1259 // save best
1260 if(ene<ebest) {
1261 ebest = ene;
1262 scale_best = scale;
1263 }
1264 }
1265 // calculate acceptance
1266 double accscale = MCacc / static_cast<double>(MCsteps);
1267 // global communication
1268 if(!no_aver_ && nrep_>1) {
1269 if(replica_!=0) {
1270 scale_best = 0.0;
1271 ebest = 0.0;
1272 accscale = 0.0;
1273 }
1274 if(rank_==0) {
1275 multi_sim_comm.Sum(&scale_best, 1);
1276 multi_sim_comm.Sum(&ebest, 1);
1277 multi_sim_comm.Sum(&accscale, 1);
1278 }
1279 }
1280 // local communication
1281 if(rank_!=0) {
1282 scale_best = 0.0;
1283 ebest = 0.0;
1284 accscale = 0.0;
1285 }
1286 if(size_>1) {
1287 comm.Sum(&scale_best, 1);
1288 comm.Sum(&ebest, 1);
1289 comm.Sum(&accscale, 1);
1290 }
1291 // set scale parameters
1292 getPntrToComponent("accscale")->set(accscale);
1293 getPntrToComponent("enescale")->set(ebest);
1294 // return scale value
1295 return scale_best;
1296 }
1297
get_annealing(long int step)1298 double EMMI::get_annealing(long int step)
1299 {
1300 // default no annealing
1301 double fact = 1.0;
1302 // position in annealing cycle
1303 unsigned nc = step%(4*nanneal_);
1304 // useful doubles
1305 double ncd = static_cast<double>(nc);
1306 double nn = static_cast<double>(nanneal_);
1307 // set fact
1308 if(nc>=nanneal_ && nc<2*nanneal_) fact = (kanneal_-1.0) / nn * ( ncd - nn ) + 1.0;
1309 if(nc>=2*nanneal_ && nc<3*nanneal_) fact = kanneal_;
1310 if(nc>=3*nanneal_) fact = (1.0-kanneal_) / nn * ( ncd - 3.0*nn) + kanneal_;
1311 return fact;
1312 }
1313
calculate()1314 void EMMI::calculate()
1315 {
1316
1317 // calculate CV
1318 calculate_overlap();
1319
1320 // rescale factor for ensemble average
1321 double escale = 1.0 / static_cast<double>(nrep_);
1322
1323 // in case of ensemble averaging, calculate average overlap
1324 if(!no_aver_ && nrep_>1) {
1325 // if master node, calculate average across replicas
1326 if(rank_==0) {
1327 multi_sim_comm.Sum(&ovmd_[0], ovmd_.size());
1328 for(unsigned i=0; i<ovmd_.size(); ++i) ovmd_[i] *= escale;
1329 } else {
1330 for(unsigned i=0; i<ovmd_.size(); ++i) ovmd_[i] = 0.0;
1331 }
1332 // local communication
1333 if(size_>1) comm.Sum(&ovmd_[0], ovmd_.size());
1334 }
1335
1336 // get time step
1337 long int step = getStep();
1338
1339 // do regression
1340 if(nregres_>0) {
1341 if(step%nregres_==0 && !getExchangeStep()) scale_ = doRegression();
1342 // set scale component
1343 getPntrToComponent("scale")->set(scale_);
1344 }
1345
1346 // write model overlap to file
1347 if(ovstride_>0 && step%ovstride_==0) write_model_overlap(step);
1348
1349 // clear energy and virial
1350 ene_ = 0.0;
1351 virial_.zero();
1352
1353 // Gaussian noise
1354 if(noise_==0) calculate_Gauss();
1355
1356 // Outliers noise
1357 if(noise_==1) calculate_Outliers();
1358
1359 // Marginal noise
1360 if(noise_==2) calculate_Marginal();
1361
1362 // get annealing rescale factor
1363 if(nanneal_>0) {
1364 anneal_ = get_annealing(step);
1365 getPntrToComponent("anneal")->set(anneal_);
1366 }
1367
1368 // annealing rescale
1369 ene_ /= anneal_;
1370
1371 // in case of ensemble averaging
1372 if(!no_aver_ && nrep_>1) {
1373 // if master node, sum der_GMMid derivatives and ene
1374 if(rank_==0) {
1375 multi_sim_comm.Sum(&GMMid_der_[0], GMMid_der_.size());
1376 multi_sim_comm.Sum(&ene_, 1);
1377 } else {
1378 // set der_GMMid derivatives and energy to zero
1379 for(unsigned i=0; i<GMMid_der_.size(); ++i) GMMid_der_[i]=0.0;
1380 ene_ = 0.0;
1381 }
1382 // local communication
1383 if(size_>1) {
1384 comm.Sum(&GMMid_der_[0], GMMid_der_.size());
1385 comm.Sum(&ene_, 1);
1386 }
1387 }
1388
1389 // clean temporary vector
1390 for(unsigned i=0; i<atom_der_.size(); ++i) atom_der_[i] = Vector(0,0,0);
1391
1392 // get derivatives of bias with respect to atoms
1393 for(unsigned i=rank_; i<nl_.size(); i=i+size_) {
1394 // get indexes of data and model component
1395 unsigned id = nl_[i] / GMM_m_type_.size();
1396 unsigned im = nl_[i] % GMM_m_type_.size();
1397 // chain rule + replica normalization
1398 Vector tot_der = GMMid_der_[id] * ovmd_der_[i] * escale * scale_ / anneal_;
1399 Vector pos;
1400 if(pbc_) pos = pbcDistance(GMM_d_m_[id], getPosition(im)) + GMM_d_m_[id];
1401 else pos = getPosition(im);
1402 // increment derivatives and virial
1403 atom_der_[im] += tot_der;
1404 virial_ += Tensor(pos, -tot_der);
1405 }
1406
1407 // communicate local derivatives and virial
1408 if(size_>1) {
1409 comm.Sum(&atom_der_[0][0], 3*atom_der_.size());
1410 comm.Sum(virial_);
1411 }
1412
1413 // set derivatives, virial, and score
1414 for(unsigned i=0; i<atom_der_.size(); ++i) setAtomsDerivatives(getPntrToComponent("scoreb"), i, atom_der_[i]);
1415 setBoxDerivatives(getPntrToComponent("scoreb"), virial_);
1416 getPntrToComponent("scoreb")->set(ene_);
1417
1418 // This part is needed only for Gaussian and Outliers noise models
1419 if(noise_!=2) {
1420
1421 // do Montecarlo
1422 if(dsigma_[0]>0 && step%MCstride_==0 && !getExchangeStep()) doMonteCarlo();
1423
1424 // print status
1425 if(step%statusstride_==0) print_status(step);
1426
1427 // calculate acceptance ratio
1428 double acc = MCaccept_ / MCtrials_;
1429
1430 // set value
1431 getPntrToComponent("acc")->set(acc);
1432
1433 }
1434
1435 }
1436
calculate_Gauss()1437 void EMMI::calculate_Gauss()
1438 {
1439 // cycle on all the GMM groups
1440 for(unsigned i=0; i<GMM_d_grps_.size(); ++i) {
1441 double eneg = 0.0;
1442 // cycle on all the members of the group
1443 for(unsigned j=0; j<GMM_d_grps_[i].size(); ++j) {
1444 // id of the GMM component
1445 int GMMid = GMM_d_grps_[i][j];
1446 // calculate deviation
1447 double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] ) / sigma_[i];
1448 // add to group energy
1449 eneg += 0.5 * dev * dev;
1450 // store derivative for later
1451 GMMid_der_[GMMid] = kbt_ * dev / sigma_[i];
1452 }
1453 // add to total energy along with normalizations and prior
1454 ene_ += kbt_ * ( eneg + (static_cast<double>(GMM_d_grps_[i].size())+prior_) * std::log(sigma_[i]) );
1455 }
1456 }
1457
calculate_Outliers()1458 void EMMI::calculate_Outliers()
1459 {
1460 // cycle on all the GMM groups
1461 for(unsigned i=0; i<GMM_d_grps_.size(); ++i) {
1462 // cycle on all the members of the group
1463 double eneg = 0.0;
1464 for(unsigned j=0; j<GMM_d_grps_[i].size(); ++j) {
1465 // id of the GMM component
1466 int GMMid = GMM_d_grps_[i][j];
1467 // calculate deviation
1468 double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] ) / sigma_[i];
1469 // add to group energy
1470 eneg += std::log( 1.0 + 0.5 * dev * dev );
1471 // store derivative for later
1472 GMMid_der_[GMMid] = kbt_ / ( 1.0 + 0.5 * dev * dev ) * dev / sigma_[i];
1473 }
1474 // add to total energy along with normalizations and prior
1475 ene_ += kbt_ * ( eneg + (static_cast<double>(GMM_d_grps_[i].size())+prior_) * std::log(sigma_[i]) );
1476 }
1477 }
1478
calculate_Marginal()1479 void EMMI::calculate_Marginal()
1480 {
1481 // cycle on all the GMM groups
1482 for(unsigned i=0; i<GMM_d_grps_.size(); ++i) {
1483 // cycle on all the members of the group
1484 for(unsigned j=0; j<GMM_d_grps_[i].size(); ++j) {
1485 // id of the GMM component
1486 int GMMid = GMM_d_grps_[i][j];
1487 // calculate deviation
1488 double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] );
1489 // calculate errf
1490 double errf = erf ( dev * inv_sqrt2_ / sigma_min_[i] );
1491 // add to group energy
1492 ene_ += -kbt_ * std::log ( 0.5 / dev * errf ) ;
1493 // store derivative for later
1494 GMMid_der_[GMMid] = - kbt_/errf*sqrt2_pi_*exp(-0.5*dev*dev/sigma_min_[i]/sigma_min_[i])/sigma_min_[i]+kbt_/dev;
1495 }
1496 }
1497 }
1498
1499 }
1500 }
1501