1 /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2    Copyright (c) 2017-2021 The plumed team
3    (see the PEOPLE file at the root of the distribution for a list of names)
4 
5    See http://www.plumed.org for more information.
6 
7    This file is part of plumed, version 2.
8 
9    plumed is free software: you can redistribute it and/or modify
10    it under the terms of the GNU Lesser General Public License as published by
11    the Free Software Foundation, either version 3 of the License, or
12    (at your option) any later version.
13 
14    plumed is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU Lesser General Public License for more details.
18 
19    You should have received a copy of the GNU Lesser General Public License
20    along with plumed.  If not, see <http://www.gnu.org/licenses/>.
21 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
22 #include "colvar/Colvar.h"
23 #include "colvar/ActionRegister.h"
24 #include "core/PlumedMain.h"
25 #include "tools/Matrix.h"
26 #include "core/GenericMolInfo.h"
27 #include "core/ActionSet.h"
28 #include "tools/File.h"
29 
30 #include <string>
31 #include <cmath>
32 #include <map>
33 #include <numeric>
34 #include <ctime>
35 #include "tools/Random.h"
36 
37 using namespace std;
38 
39 namespace PLMD {
40 namespace isdb {
41 
42 //+PLUMEDOC ISDB_COLVAR EMMI
43 /*
44 Calculate the fit of a structure or ensemble of structures with a cryo-EM density map.
45 
46 This action implements the multi-scale Bayesian approach to cryo-EM data fitting introduced in  Ref. \cite Hanot113951 .
47 This method allows efficient and accurate structural modeling of cryo-electron microscopy density maps at multiple scales, from coarse-grained to atomistic resolution, by addressing the presence of random and systematic errors in the data, sample heterogeneity, data correlation, and noise correlation.
48 
49 The experimental density map is fit by a Gaussian Mixture Model (GMM), which is provided as an external file specified by the keyword
50 GMM_FILE. We are currently working on a web server to perform
51 this operation. In the meantime, the user can request a stand-alone version of the GMM code at massimiliano.bonomi_AT_gmail.com.
52 
53 When run in single-replica mode, this action allows atomistic, flexible refinement of an individual structure into a density map.
54 Combined with a multi-replica framework (such as the -multi option in GROMACS), the user can model an ensemble of structures using
55 the Metainference approach \cite Bonomi:2016ip .
56 
57 \warning
58 To use \ref EMMI, the user should always add a \ref MOLINFO line and specify a pdb file of the system.
59 
60 \note
61 To enhance sampling in single-structure refinement, one can use a Replica Exchange Method, such as Parallel Tempering.
62 In this case, the user should add the NO_AVER flag to the input line.
63 
64 \note
65 \ref EMMI can be used in combination with periodic and non-periodic systems. In the latter case, one should
66 add the NOPBC flag to the input line
67 
68 \par Examples
69 
70 In this example, we perform a single-structure refinement based on an experimental cryo-EM map. The map is fit with a GMM, whose
71 parameters are listed in the file GMM_fit.dat. This file contains one line per GMM component in the following format:
72 
73 \plumedfile
74 #! FIELDS Id Weight Mean_0 Mean_1 Mean_2 Cov_00 Cov_01 Cov_02 Cov_11 Cov_12 Cov_22 Beta
75      0  2.9993805e+01   6.54628 10.37820 -0.92988  2.078920e-02 1.216254e-03 5.990827e-04 2.556246e-02 8.411835e-03 2.486254e-02  1
76      1  2.3468312e+01   6.56095 10.34790 -0.87808  1.879859e-02 6.636049e-03 3.682865e-04 3.194490e-02 1.750524e-03 3.017100e-02  1
77      ...
78 \endplumedfile
79 
80 To accelerate the computation of the Bayesian score, one can:
81 - use neighbor lists, specified by the keywords NL_CUTOFF and NL_STRIDE;
82 - calculate the restraint every other step (or more).
83 
84 All the heavy atoms of the system are used to calculate the density map. This list can conveniently be provided
85 using a GROMACS index file.
86 
87 The input file looks as follows:
88 
89 \plumedfile
90 # include pdb info
91 MOLINFO STRUCTURE=prot.pdb
92 
93 #  all heavy atoms
94 protein-h: GROUP NDX_FILE=index.ndx NDX_GROUP=Protein-H
95 
96 # create EMMI score
97 gmm: EMMI NOPBC SIGMA_MIN=0.01 TEMP=300.0 NL_STRIDE=100 NL_CUTOFF=0.01 GMM_FILE=GMM_fit.dat ATOMS=protein-h
98 
99 # translate into bias - apply every 2 steps
100 emr: BIASVALUE ARG=gmm.scoreb STRIDE=2
101 
102 PRINT ARG=emr.* FILE=COLVAR STRIDE=500 FMT=%20.10f
103 \endplumedfile
104 
105 
106 */
107 //+ENDPLUMEDOC
108 
109 class EMMI : public Colvar {
110 
111 private:
112 
113 // temperature in kbt
114   double kbt_;
115 // model GMM - atom types
116   vector<unsigned> GMM_m_type_;
117 // model GMM - list of atom sigmas - one per atom type
118   vector<double> GMM_m_s_;
119 // model GMM - list of atom weights - one per atom type
120   vector<double> GMM_m_w_;
121 // data GMM - means, weights, and covariances + beta option
122   vector<Vector>             GMM_d_m_;
123   vector<double>             GMM_d_w_;
124   vector< VectorGeneric<6> > GMM_d_cov_;
125   vector<int>                GMM_d_beta_;
126   vector < vector<int> >     GMM_d_grps_;
127 // overlaps
128   vector<double> ovmd_;
129   vector<double> ovdd_;
130   vector<double> ovmd_ave_;
131 // and derivatives
132   vector<Vector> ovmd_der_;
133   vector<Vector> atom_der_;
134   vector<double> GMMid_der_;
135 // constants
136   double cfact_;
137   double inv_sqrt2_, sqrt2_pi_;
138 // metainference
139   unsigned nrep_;
140   unsigned replica_;
141   vector<double> sigma_;
142   vector<double> sigma_min_;
143   vector<double> sigma_max_;
144   vector<double> dsigma_;
145 // list of prefactors for overlap between two Gaussians
146 // pre_fact = 1.0 / (2pi)**1.5 / sqrt(det_md) * Wm * Wd
147   vector<double> pre_fact_;
148 // inverse of the sum of model and data covariances matrices
149   vector< VectorGeneric<6> > inv_cov_md_;
150 // neighbor list
151   double   nl_cutoff_;
152   unsigned nl_stride_;
153   bool first_time_;
154   bool no_aver_;
155   vector<unsigned> nl_;
156 // parallel stuff
157   unsigned size_;
158   unsigned rank_;
159 // pbc
160   bool pbc_;
161 // Monte Carlo stuff
162   int      MCstride_;
163   double   MCaccept_;
164   double   MCtrials_;
165   Random   random_;
166   // status stuff
167   unsigned int statusstride_;
168   string       statusfilename_;
169   OFile        statusfile_;
170   bool         first_status_;
171   // regression
172   unsigned nregres_;
173   double scale_;
174   double scale_min_;
175   double scale_max_;
176   double dscale_;
177   // tabulated exponential
178   double dpcutoff_;
179   double dexp_;
180   unsigned nexp_;
181   vector<double> tab_exp_;
182   // simulated annealing
183   unsigned nanneal_;
184   double   kanneal_;
185   double   anneal_;
186   // prior exponent
187   double prior_;
188   // noise type
189   unsigned noise_;
190   // total score and virial;
191   double ene_;
192   Tensor virial_;
193   // model overlap file
194   unsigned int ovstride_;
195   string       ovfilename_;
196 
197 // write file with model overlap
198   void write_model_overlap(long int step);
199 // get median of vector
200   double get_median(vector<double> &v);
201 // annealing
202   double get_annealing(long int step);
203 // do regression
204   double scaleEnergy(double s);
205   double doRegression();
206 // read and write status
207   void read_status();
208   void print_status(long int step);
209 // accept or reject
210   bool doAccept(double oldE, double newE, double kbt);
211 // do MonteCarlo
212   void doMonteCarlo();
213 // read error file
214   vector<double> read_exp_errors(string errfile);
215 // read experimental overlaps
216   vector<double> read_exp_overlaps(string ovfile);
217 // calculate model GMM weights and covariances
218   vector<double> get_GMM_m(vector<AtomNumber> &atoms);
219 // read data GMM file
220   void get_GMM_d(string gmm_file);
221 // check GMM data
222   void check_GMM_d(VectorGeneric<6> &cov, double w);
223 // auxiliary method
224   void calculate_useful_stuff(double reso);
225 // get pref_fact and inv_cov_md
226   double get_prefactor_inverse (const VectorGeneric<6> &GMM_cov_0, const VectorGeneric<6> &GMM_cov_1,
227                                 double &GMM_w_0, double &GMM_w_1,
228                                 VectorGeneric<6> &sum, VectorGeneric<6> &inv_sum);
229 // calculate self overlaps between data GMM components - ovdd_
230   double get_self_overlap(unsigned id);
231 // calculate overlap between two Gaussians
232   double get_overlap(const Vector &m_m, const Vector &d_m, double &pre_fact,
233                      const VectorGeneric<6> &inv_cov_md, Vector &ov_der);
234 // calculate exponent of overlap for neighbor list update
235   double get_exp_overlap(const Vector &m_m, const Vector &d_m,
236                          const VectorGeneric<6> &inv_cov_md);
237 // update the neighbor list
238   void update_neighbor_list();
239 // calculate overlap
240   void calculate_overlap();
241 // Gaussian noise
242   void calculate_Gauss();
243 // Outliers noise
244   void calculate_Outliers();
245 // Marginal noise
246   void calculate_Marginal();
247 
248 public:
249   static void registerKeywords( Keywords& keys );
250   explicit EMMI(const ActionOptions&);
251 // active methods:
252   void prepare() override;
253   void calculate() override;
254 };
255 
256 PLUMED_REGISTER_ACTION(EMMI,"EMMI")
257 
registerKeywords(Keywords & keys)258 void EMMI::registerKeywords( Keywords& keys ) {
259   Colvar::registerKeywords( keys );
260   keys.add("atoms","ATOMS","atoms for which we calculate the density map, typically all heavy atoms");
261   keys.add("compulsory","GMM_FILE","file with the parameters of the GMM components");
262   keys.add("compulsory","NL_CUTOFF","The cutoff in overlap for the neighbor list");
263   keys.add("compulsory","NL_STRIDE","The frequency with which we are updating the neighbor list");
264   keys.add("compulsory","SIGMA_MIN","minimum uncertainty");
265   keys.add("compulsory","RESOLUTION", "Cryo-EM map resolution");
266   keys.add("compulsory","NOISETYPE","functional form of the noise (GAUSS, OUTLIERS, MARGINAL)");
267   keys.add("optional","SIGMA0","initial value of the uncertainty");
268   keys.add("optional","DSIGMA","MC step for uncertainties");
269   keys.add("optional","MC_STRIDE", "Monte Carlo stride");
270   keys.add("optional","ERR_FILE","file with experimental or GMM fit errors");
271   keys.add("optional","OV_FILE","file with experimental overlaps");
272   keys.add("optional","NORM_DENSITY","integral of the experimental density");
273   keys.add("optional","STATUS_FILE","write a file with all the data useful for restart");
274   keys.add("optional","WRITE_STRIDE","write the status to a file every N steps, this can be used for restart");
275   keys.add("optional","REGRESSION","regression stride");
276   keys.add("optional","REG_SCALE_MIN","regression minimum scale");
277   keys.add("optional","REG_SCALE_MAX","regression maximum scale");
278   keys.add("optional","REG_DSCALE","regression maximum scale MC move");
279   keys.add("optional","SCALE","scale factor");
280   keys.add("optional","ANNEAL", "Length of annealing cycle");
281   keys.add("optional","ANNEAL_FACT", "Annealing temperature factor");
282   keys.add("optional","TEMP","temperature");
283   keys.add("optional","PRIOR", "exponent of uncertainty prior");
284   keys.add("optional","WRITE_OV_STRIDE","write model overlaps every N steps");
285   keys.add("optional","WRITE_OV","write a file with model overlaps");
286   keys.addFlag("NO_AVER",false,"don't do ensemble averaging in multi-replica mode");
287   componentsAreNotOptional(keys);
288   keys.addOutputComponent("scoreb","default","Bayesian score");
289   keys.addOutputComponent("acc",   "NOISETYPE","MC acceptance for uncertainty");
290   keys.addOutputComponent("scale", "REGRESSION","scale factor");
291   keys.addOutputComponent("accscale", "REGRESSION","MC acceptance for scale regression");
292   keys.addOutputComponent("enescale", "REGRESSION","MC energy for scale regression");
293   keys.addOutputComponent("anneal","ANNEAL","annealing factor");
294 }
295 
EMMI(const ActionOptions & ao)296 EMMI::EMMI(const ActionOptions&ao):
297   PLUMED_COLVAR_INIT(ao),
298   inv_sqrt2_(0.707106781186548),
299   sqrt2_pi_(0.797884560802865),
300   first_time_(true), no_aver_(false), pbc_(true),
301   MCstride_(1), MCaccept_(0.), MCtrials_(0.),
302   statusstride_(0), first_status_(true),
303   nregres_(0), scale_(1.),
304   dpcutoff_(15.0), nexp_(1000000), nanneal_(0),
305   kanneal_(0.), anneal_(1.), prior_(1.), ovstride_(0)
306 {
307   // periodic boundary conditions
308   bool nopbc=!pbc_;
309   parseFlag("NOPBC",nopbc);
310   pbc_=!nopbc;
311 
312   // list of atoms
313   vector<AtomNumber> atoms;
314   parseAtomList("ATOMS", atoms);
315 
316   // file with data GMM
317   string GMM_file;
318   parse("GMM_FILE", GMM_file);
319 
320   // type of data noise
321   string noise;
322   parse("NOISETYPE",noise);
323   if      (noise=="GAUSS")   noise_ = 0;
324   else if(noise=="OUTLIERS") noise_ = 1;
325   else if(noise=="MARGINAL") noise_ = 2;
326   else error("Unknown noise type!");
327 
328   // minimum value for error
329   double sigma_min;
330   parse("SIGMA_MIN", sigma_min);
331   if(sigma_min<0) error("SIGMA_MIN should be greater or equal to zero");
332 
333   // the following parameters must be specified with noise type 0 and 1
334   double sigma_ini, dsigma;
335   if(noise_!=2) {
336     // initial value of the uncertainty
337     parse("SIGMA0", sigma_ini);
338     if(sigma_ini<=0) error("you must specify a positive SIGMA0");
339     // MC parameters
340     parse("DSIGMA", dsigma);
341     if(dsigma<0) error("you must specify a positive DSIGMA");
342     parse("MC_STRIDE", MCstride_);
343     if(dsigma>0 && MCstride_<=0) error("you must specify a positive MC_STRIDE");
344     // status file parameters
345     parse("WRITE_STRIDE", statusstride_);
346     if(statusstride_<=0) error("you must specify a positive WRITE_STRIDE");
347     parse("STATUS_FILE",  statusfilename_);
348     if(statusfilename_=="") statusfilename_ = "MISTATUS"+getLabel();
349     else                    statusfilename_ = statusfilename_+getLabel();
350   }
351 
352   // error file
353   string errfile;
354   parse("ERR_FILE", errfile);
355 
356   // file with experimental overlaps
357   string ovfile;
358   parse("OV_FILE", ovfile);
359 
360   // integral of the experimetal density
361   double norm_d = 0.0;
362   parse("NORM_DENSITY", norm_d);
363 
364   // temperature
365   double temp=0.0;
366   parse("TEMP",temp);
367   // convert temp to kbt
368   if(temp>0.0) kbt_=plumed.getAtoms().getKBoltzmann()*temp;
369   else kbt_=plumed.getAtoms().getKbT();
370 
371   // exponent of uncertainty prior
372   parse("PRIOR",prior_);
373 
374   // simulated annealing stuff
375   parse("ANNEAL", nanneal_);
376   parse("ANNEAL_FACT", kanneal_);
377   if(nanneal_>0 && kanneal_<=1.0) error("with ANNEAL, ANNEAL_FACT must be greater than 1");
378 
379   // regression stride
380   parse("REGRESSION",nregres_);
381   // other regression parameters
382   if(nregres_>0) {
383     parse("REG_SCALE_MIN",scale_min_);
384     parse("REG_SCALE_MAX",scale_max_);
385     parse("REG_DSCALE",dscale_);
386     // checks
387     if(scale_max_<=scale_min_) error("with REGRESSION, REG_SCALE_MAX must be greater than REG_SCALE_MIN");
388     if(dscale_<=0.) error("with REGRESSION, REG_DSCALE must be positive");
389   }
390 
391   // scale factor
392   parse("SCALE", scale_);
393 
394   // read map resolution
395   double reso;
396   parse("RESOLUTION", reso);
397   if(reso<=0.) error("RESOLUTION should be strictly positive");
398 
399   // neighbor list stuff
400   parse("NL_CUTOFF",nl_cutoff_);
401   if(nl_cutoff_<=0.0) error("NL_CUTOFF should be explicitly specified and positive");
402   parse("NL_STRIDE",nl_stride_);
403   if(nl_stride_<=0) error("NL_STRIDE should be explicitly specified and positive");
404 
405   // averaging or not
406   parseFlag("NO_AVER",no_aver_);
407 
408   // write overlap file
409   parse("WRITE_OV_STRIDE", ovstride_);
410   parse("WRITE_OV", ovfilename_);
411   if(ovstride_>0 && ovfilename_=="") error("With WRITE_OV_STRIDE you must specify WRITE_OV");
412 
413   checkRead();
414 
415   // set parallel stuff
416   size_=comm.Get_size();
417   rank_=comm.Get_rank();
418 
419   // get number of replicas
420   if(rank_==0) {
421     if(no_aver_) {
422       nrep_ = 1;
423     } else {
424       nrep_ = multi_sim_comm.Get_size();
425     }
426     replica_ = multi_sim_comm.Get_rank();
427   } else {
428     nrep_ = 0;
429     replica_ = 0;
430   }
431   comm.Sum(&nrep_,1);
432   comm.Sum(&replica_,1);
433 
434   log.printf("  atoms involved : ");
435   for(unsigned i=0; i<atoms.size(); ++i) log.printf("%d ",atoms[i].serial());
436   log.printf("\n");
437   log.printf("  GMM data file : %s\n", GMM_file.c_str());
438   if(no_aver_) log.printf("  without ensemble averaging\n");
439   log.printf("  type of data noise : %s\n", noise.c_str());
440   log.printf("  neighbor list cutoff : %lf\n", nl_cutoff_);
441   log.printf("  neighbor list stride : %u\n",  nl_stride_);
442   log.printf("  minimum uncertainty : %f\n",sigma_min);
443   log.printf("  scale factor : %lf\n",scale_);
444   if(nregres_>0) {
445     log.printf("  regression stride : %u\n", nregres_);
446     log.printf("  regression minimum scale : %lf\n", scale_min_);
447     log.printf("  regression maximum scale : %lf\n", scale_max_);
448     log.printf("  regression maximum scale MC move : %lf\n", dscale_);
449   }
450   if(noise_!=2) {
451     log.printf("  initial value of the uncertainty : %f\n",sigma_ini);
452     log.printf("  max MC move in uncertainty : %f\n",dsigma);
453     log.printf("  MC stride : %u\n", MCstride_);
454     log.printf("  reading/writing to status file : %s\n",statusfilename_.c_str());
455     log.printf("  with stride : %u\n",statusstride_);
456   }
457   if(errfile.size()>0) log.printf("  reading experimental errors from file : %s\n", errfile.c_str());
458   if(ovfile.size()>0)  log.printf("  reading experimental overlaps from file : %s\n", ovfile.c_str());
459   log.printf("  temperature of the system in energy unit : %f\n",kbt_);
460   log.printf("  prior exponent : %f\n",prior_);
461   log.printf("  number of replicas for averaging: %u\n",nrep_);
462   log.printf("  id of the replica : %u\n",replica_);
463   if(nanneal_>0) {
464     log.printf("  length of annealing cycle : %u\n",nanneal_);
465     log.printf("  annealing factor : %f\n",kanneal_);
466   }
467   if(ovstride_>0) {
468     log.printf("  stride for writing model overlaps : %u\n",ovstride_);
469     log.printf("  file for writing model overlaps : %s\n", ovfilename_.c_str());
470   }
471 
472   // set constant quantity before calculating stuff
473   cfact_ = 1.0/pow( 2.0*pi, 1.5 );
474 
475   // calculate model GMM constant parameters
476   vector<double> GMM_m_w = get_GMM_m(atoms);
477 
478   // read data GMM parameters
479   get_GMM_d(GMM_file);
480   log.printf("  number of GMM components : %u\n", static_cast<unsigned>(GMM_d_m_.size()));
481 
482   // normalize atom weight map
483   if(norm_d <= 0.0) norm_d = accumulate(GMM_d_w_.begin(), GMM_d_w_.end(), 0.0);
484   double norm_m = accumulate(GMM_m_w.begin(),  GMM_m_w.end(),  0.0);
485   // renormalization
486   for(unsigned i=0; i<GMM_m_w_.size(); ++i) GMM_m_w_[i] *= norm_d / norm_m;
487 
488   // read experimental errors
489   vector<double> exp_err;
490   if(errfile.size()>0) exp_err = read_exp_errors(errfile);
491 
492   // get self overlaps between data GMM components
493   if(ovfile.size()>0) {
494     ovdd_ = read_exp_overlaps(ovfile);
495   } else {
496     for(unsigned i=0; i<GMM_d_m_.size(); ++i) {
497       double ov = get_self_overlap(i);
498       ovdd_.push_back(ov);
499     }
500   }
501 
502   log.printf("  number of GMM groups : %u\n", static_cast<unsigned>(GMM_d_grps_.size()));
503   // cycle on GMM groups
504   for(unsigned Gid=0; Gid<GMM_d_grps_.size(); ++Gid) {
505     log.printf("    group %d\n", Gid);
506     // calculate median overlap and experimental error
507     vector<double> ovdd;
508     vector<double> err;
509     // cycle on the group members
510     for(unsigned i=0; i<GMM_d_grps_[Gid].size(); ++i) {
511       // GMM id
512       int GMMid = GMM_d_grps_[Gid][i];
513       // add to experimental error
514       if(errfile.size()>0) err.push_back(exp_err[GMMid]);
515       else                 err.push_back(0.);
516       // add to GMM overlap
517       ovdd.push_back(ovdd_[GMMid]);
518     }
519     // calculate median quantities
520     double ovdd_m = get_median(ovdd);
521     double err_m  = get_median(err);
522     // print out statistics
523     log.printf("     # of members : %zu\n", GMM_d_grps_[Gid].size());
524     log.printf("     median overlap : %lf\n", ovdd_m);
525     log.printf("     median error : %lf\n", err_m);
526     // add minimum value of sigma for this group of GMMs
527     sigma_min_.push_back(sqrt(err_m*err_m+sigma_min*ovdd_m*sigma_min*ovdd_m));
528     // these are only needed with Gaussian and Outliers noise models
529     if(noise_!=2) {
530       // set dsigma
531       dsigma_.push_back(dsigma * ovdd_m);
532       // set sigma max
533       sigma_max_.push_back(10.0*ovdd_m + sigma_min_[Gid] + dsigma_[Gid]);
534       // initialize sigma
535       sigma_.push_back(std::max(sigma_min_[Gid],std::min(sigma_ini*ovdd_m,sigma_max_[Gid])));
536     }
537   }
538 
539   // read status file if restarting
540   if(getRestart() && noise_!=2) read_status();
541 
542   // calculate auxiliary stuff
543   calculate_useful_stuff(reso);
544 
545   // prepare data and derivative vectors
546   ovmd_.resize(ovdd_.size());
547   atom_der_.resize(GMM_m_type_.size());
548   GMMid_der_.resize(ovdd_.size());
549 
550   // clear things that are no longer needed
551   GMM_d_cov_.clear();
552 
553   // add components
554   addComponentWithDerivatives("scoreb"); componentIsNotPeriodic("scoreb");
555 
556   if(noise_!=2) {addComponent("acc"); componentIsNotPeriodic("acc");}
557 
558   if(nregres_>0) {
559     addComponent("scale");     componentIsNotPeriodic("scale");
560     addComponent("accscale");  componentIsNotPeriodic("accscale");
561     addComponent("enescale");  componentIsNotPeriodic("enescale");
562   }
563 
564   if(nanneal_>0) {addComponent("anneal"); componentIsNotPeriodic("anneal");}
565 
566   // initialize random seed
567   unsigned iseed;
568   if(rank_==0) iseed = time(NULL)+replica_;
569   else iseed = 0;
570   comm.Sum(&iseed, 1);
571   random_.setSeed(-iseed);
572 
573   // request the atoms
574   requestAtoms(atoms);
575 
576   // print bibliography
577   log<<"  Bibliography "<<plumed.cite("Bonomi, Camilloni, Bioinformatics, 33, 3999 (2017)");
578   log<<plumed.cite("Hanot, Bonomi, Greenberg, Sali, Nilges, Vendruscolo, Pellarin, bioRxiv doi: 10.1101/113951 (2017)");
579   log<<plumed.cite("Bonomi, Pellarin, Vendruscolo, Biophys. J. 114, 1604 (2018)");
580   if(!no_aver_ && nrep_>1)log<<plumed.cite("Bonomi, Camilloni, Cavalli, Vendruscolo, Sci. Adv. 2, e150117 (2016)");
581   log<<"\n";
582 }
583 
write_model_overlap(long int step)584 void EMMI::write_model_overlap(long int step)
585 {
586   OFile ovfile;
587   ovfile.link(*this);
588   std::string num; Tools::convert(step,num);
589   string name = ovfilename_+"-"+num;
590   ovfile.open(name);
591   ovfile.setHeavyFlush();
592   ovfile.fmtField("%10.7e ");
593 // write overlaps
594   for(int i=0; i<ovmd_.size(); ++i) {
595     ovfile.printField("Model", ovmd_[i]);
596     ovfile.printField("ModelScaled", scale_ * ovmd_[i]);
597     ovfile.printField("Data", ovdd_[i]);
598     ovfile.printField();
599   }
600   ovfile.close();
601 }
602 
get_median(vector<double> & v)603 double EMMI::get_median(vector<double> &v)
604 {
605 // dimension of vector
606   unsigned size = v.size();
607 // in case of only one entry
608   if (size==1) {
609     return v[0];
610   } else {
611     // reorder vector
612     sort(v.begin(), v.end());
613     // odd or even?
614     if (size%2==0) {
615       return (v[size/2-1]+v[size/2])/2.0;
616     } else {
617       return v[size/2];
618     }
619   }
620 }
621 
read_status()622 void EMMI::read_status()
623 {
624   double MDtime;
625 // open file
626   IFile *ifile = new IFile();
627   ifile->link(*this);
628   if(ifile->FileExist(statusfilename_)) {
629     ifile->open(statusfilename_);
630     while(ifile->scanField("MD_time", MDtime)) {
631       for(unsigned i=0; i<sigma_.size(); ++i) {
632         // convert i to string
633         std::string num; Tools::convert(i,num);
634         // read entries
635         ifile->scanField("s"+num, sigma_[i]);
636       }
637       // new line
638       ifile->scanField();
639     }
640     ifile->close();
641   } else {
642     error("Cannot find status file "+statusfilename_+"\n");
643   }
644   delete ifile;
645 }
646 
print_status(long int step)647 void EMMI::print_status(long int step)
648 {
649 // if first time open the file
650   if(first_status_) {
651     first_status_ = false;
652     statusfile_.link(*this);
653     statusfile_.open(statusfilename_);
654     statusfile_.setHeavyFlush();
655     statusfile_.fmtField("%6.3e ");
656   }
657 // write fields
658   double MDtime = static_cast<double>(step)*getTimeStep();
659   statusfile_.printField("MD_time", MDtime);
660   for(unsigned i=0; i<sigma_.size(); ++i) {
661     // convert i to string
662     std::string num; Tools::convert(i,num);
663     // print entry
664     statusfile_.printField("s"+num, sigma_[i]);
665   }
666   statusfile_.printField();
667 }
668 
doAccept(double oldE,double newE,double kbt)669 bool EMMI::doAccept(double oldE, double newE, double kbt) {
670   bool accept = false;
671   // calculate delta energy
672   double delta = ( newE - oldE ) / kbt;
673   // if delta is negative always accept move
674   if( delta < 0.0 ) {
675     accept = true;
676   } else {
677     // otherwise extract random number
678     double s = random_.RandU01();
679     if( s < exp(-delta) ) { accept = true; }
680   }
681   return accept;
682 }
683 
doMonteCarlo()684 void EMMI::doMonteCarlo()
685 {
686   // extract random GMM group
687   unsigned nGMM = static_cast<unsigned>(floor(random_.RandU01()*static_cast<double>(GMM_d_grps_.size())));
688   if(nGMM==GMM_d_grps_.size()) nGMM -= 1;
689 
690   // generate random move
691   double shift = dsigma_[nGMM] * ( 2.0 * random_.RandU01() - 1.0 );
692   // new sigma
693   double new_s = sigma_[nGMM] + shift;
694   // check boundaries
695   if(new_s > sigma_max_[nGMM]) {new_s = 2.0 * sigma_max_[nGMM] - new_s;}
696   if(new_s < sigma_min_[nGMM]) {new_s = 2.0 * sigma_min_[nGMM] - new_s;}
697   // old s2
698   double old_inv_s2 = 1.0 / sigma_[nGMM] / sigma_[nGMM];
699   // new s2
700   double new_inv_s2 = 1.0 / new_s / new_s;
701 
702   // cycle on GMM group and calculate old and new energy
703   double old_ene = 0.0;
704   double new_ene = 0.0;
705   double ng = static_cast<double>(GMM_d_grps_[nGMM].size());
706 
707   // in case of Gaussian noise
708   if(noise_==0) {
709     double chi2 = 0.0;
710     for(unsigned i=0; i<GMM_d_grps_[nGMM].size(); ++i) {
711       // id GMM component
712       int GMMid = GMM_d_grps_[nGMM][i];
713       // deviation
714       double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] );
715       // add to chi2
716       chi2 += dev * dev;
717     }
718     // final energy calculation: add normalization and prior
719     old_ene = 0.5 * kbt_ * ( chi2 * old_inv_s2 - (ng+prior_) * std::log(old_inv_s2) );
720     new_ene = 0.5 * kbt_ * ( chi2 * new_inv_s2 - (ng+prior_) * std::log(new_inv_s2) );
721   }
722 
723   // in case of Outliers noise
724   if(noise_==1) {
725     for(unsigned i=0; i<GMM_d_grps_[nGMM].size(); ++i) {
726       // id GMM component
727       int GMMid = GMM_d_grps_[nGMM][i];
728       // calculate deviation
729       double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] );
730       // add to energies
731       old_ene += std::log( 1.0 + 0.5 * dev * dev * old_inv_s2);
732       new_ene += std::log( 1.0 + 0.5 * dev * dev * new_inv_s2);
733     }
734     // final energy calculation: add normalization and prior
735     old_ene = kbt_ * ( old_ene + (ng+prior_) * std::log(sigma_[nGMM]) );
736     new_ene = kbt_ * ( new_ene + (ng+prior_) * std::log(new_s) );
737   }
738 
739   // increment number of trials
740   MCtrials_ += 1.0;
741 
742   // accept or reject
743   bool accept = doAccept(old_ene/anneal_, new_ene/anneal_, kbt_);
744   if(accept) {
745     sigma_[nGMM] = new_s;
746     MCaccept_ += 1.0;
747   }
748   // local communication
749   if(rank_!=0) {
750     for(unsigned i=0; i<sigma_.size(); ++i) sigma_[i] = 0.0;
751     MCaccept_ = 0.0;
752   }
753   if(size_>1) {
754     comm.Sum(&sigma_[0], sigma_.size());
755     comm.Sum(&MCaccept_, 1);
756   }
757 }
758 
read_exp_errors(string errfile)759 vector<double> EMMI::read_exp_errors(string errfile)
760 {
761   int nexp, idcomp;
762   double err;
763   vector<double> exp_err;
764 // open file
765   IFile *ifile = new IFile();
766   if(ifile->FileExist(errfile)) {
767     ifile->open(errfile);
768     // scan for number of experimental errors
769     ifile->scanField("Nexp", nexp);
770     // cycle on GMM components
771     while(ifile->scanField("Id",idcomp)) {
772       // total experimental error
773       double err_tot = 0.0;
774       // cycle on number of experimental overlaps
775       for(unsigned i=0; i<nexp; ++i) {
776         string ss; Tools::convert(i,ss);
777         ifile->scanField("Err"+ss, err);
778         // add to total error
779         err_tot += err*err;
780       }
781       // new line
782       ifile->scanField();
783       // calculate RMSE
784       err_tot = sqrt(err_tot/static_cast<double>(nexp));
785       // add to global
786       exp_err.push_back(err_tot);
787     }
788     ifile->close();
789   } else {
790     error("Cannot find ERR_FILE "+errfile+"\n");
791   }
792   return exp_err;
793 }
794 
read_exp_overlaps(string ovfile)795 vector<double> EMMI::read_exp_overlaps(string ovfile)
796 {
797   int idcomp;
798   double ov;
799   vector<double> ovdd;
800 // open file
801   IFile *ifile = new IFile();
802   if(ifile->FileExist(ovfile)) {
803     ifile->open(ovfile);
804     // cycle on GMM components
805     while(ifile->scanField("Id",idcomp)) {
806       // read experimental overlap
807       ifile->scanField("Overlap", ov);
808       // add to ovdd
809       ovdd.push_back(ov);
810       // new line
811       ifile->scanField();
812     }
813     ifile->close();
814   } else {
815     error("Cannot find OV_FILE "+ovfile+"\n");
816   }
817   return ovdd;
818 }
819 
get_GMM_m(vector<AtomNumber> & atoms)820 vector<double> EMMI::get_GMM_m(vector<AtomNumber> &atoms)
821 {
822   // list of weights - one per atom
823   vector<double> GMM_m_w;
824 
825   auto* moldat=plumed.getActionSet().selectLatest<GenericMolInfo*>(this);
826   // map of atom types to A and B coefficients of scattering factor
827   // f(s) = A * exp(-B*s**2)
828   // B is in Angstrom squared
829   // map between an atom type and an index
830   map<string, unsigned> type_map;
831   type_map["C"]=0;
832   type_map["O"]=1;
833   type_map["N"]=2;
834   type_map["S"]=3;
835   // fill in sigma vector
836   GMM_m_s_.push_back(0.01*15.146);  // type 0
837   GMM_m_s_.push_back(0.01*8.59722); // type 1
838   GMM_m_s_.push_back(0.01*11.1116); // type 2
839   GMM_m_s_.push_back(0.01*15.8952); // type 3
840   // fill in weight vector
841   GMM_m_w_.push_back(2.49982); // type 0
842   GMM_m_w_.push_back(1.97692); // type 1
843   GMM_m_w_.push_back(2.20402); // type 2
844   GMM_m_w_.push_back(5.14099); // type 3
845 
846   // check if MOLINFO line is present
847   if( moldat ) {
848     log<<"  MOLINFO DATA found with label " <<moldat->getLabel()<<", using proper atom names\n";
849     for(unsigned i=0; i<atoms.size(); ++i) {
850       // get atom name
851       string name = moldat->getAtomName(atoms[i]);
852       char type;
853       // get atom type
854       char first = name.at(0);
855       // GOLDEN RULE: type is first letter, if not a number
856       if (!isdigit(first)) {
857         type = first;
858         // otherwise is the second
859       } else {
860         type = name.at(1);
861       }
862       // check if key in map
863       std::string type_s = std::string(1,type);
864       if(type_map.find(type_s) != type_map.end()) {
865         // save atom type
866         GMM_m_type_.push_back(type_map[type_s]);
867         // this will be normalized in the final density
868         GMM_m_w.push_back(GMM_m_w_[type_map[type_s]]);
869       } else {
870         error("Wrong atom type "+type_s+" from atom name "+name+"\n");
871       }
872     }
873   } else {
874     error("MOLINFO DATA not found\n");
875   }
876   return GMM_m_w;
877 }
878 
check_GMM_d(VectorGeneric<6> & cov,double w)879 void EMMI::check_GMM_d(VectorGeneric<6> &cov, double w)
880 {
881 
882 // check if positive defined, by calculating the 3 leading principal minors
883   double pm1 = cov[0];
884   double pm2 = cov[0]*cov[3]-cov[1]*cov[1];
885   double pm3 = cov[0]*(cov[3]*cov[5]-cov[4]*cov[4])-cov[1]*(cov[1]*cov[5]-cov[4]*cov[2])+cov[2]*(cov[1]*cov[4]-cov[3]*cov[2]);
886 // apply Sylvester’s criterion
887   if(pm1<=0.0 || pm2<=0.0 || pm3<=0.0)
888     error("check data GMM: covariance matrix is not positive defined");
889 
890 // check if weight is positive
891   if(w<=0) error("check data GMM: weight must be positive");
892 }
893 
894 // read GMM data file in PLUMED format:
get_GMM_d(string GMM_file)895 void EMMI::get_GMM_d(string GMM_file)
896 {
897   VectorGeneric<6> cov;
898 
899 // open file
900   std::unique_ptr<IFile> ifile(new IFile);
901   if(ifile->FileExist(GMM_file)) {
902     ifile->open(GMM_file);
903     int idcomp;
904     while(ifile->scanField("Id",idcomp)) {
905       int beta;
906       double w, m0, m1, m2;
907       ifile->scanField("Weight",w);
908       ifile->scanField("Mean_0",m0);
909       ifile->scanField("Mean_1",m1);
910       ifile->scanField("Mean_2",m2);
911       ifile->scanField("Cov_00",cov[0]);
912       ifile->scanField("Cov_01",cov[1]);
913       ifile->scanField("Cov_02",cov[2]);
914       ifile->scanField("Cov_11",cov[3]);
915       ifile->scanField("Cov_12",cov[4]);
916       ifile->scanField("Cov_22",cov[5]);
917       ifile->scanField("Beta",beta);
918       // check input
919       check_GMM_d(cov, w);
920       // check beta
921       if(beta<0) error("Beta must be positive!");
922       // center of the Gaussian
923       GMM_d_m_.push_back(Vector(m0,m1,m2));
924       // covariance matrix
925       GMM_d_cov_.push_back(cov);
926       // weight
927       GMM_d_w_.push_back(w);
928       // beta
929       GMM_d_beta_.push_back(beta);
930       // new line
931       ifile->scanField();
932     }
933   } else {
934     error("Cannot find GMM_FILE "+GMM_file+"\n");
935   }
936   // now create a set from beta (unique set of values)
937   set<int> bu(GMM_d_beta_.begin(), GMM_d_beta_.end());
938   // now prepare the group vector
939   GMM_d_grps_.resize(bu.size());
940   // and fill it in
941   for(unsigned i=0; i<GMM_d_beta_.size(); ++i) {
942     if(GMM_d_beta_[i]>=GMM_d_grps_.size()) error("Check Beta values");
943     GMM_d_grps_[GMM_d_beta_[i]].push_back(i);
944   }
945 }
946 
calculate_useful_stuff(double reso)947 void EMMI::calculate_useful_stuff(double reso)
948 {
949   // We use the following definition for resolution:
950   // the Fourier transform of the density distribution in real space
951   // f(s) falls to 1/e of its maximum value at wavenumber 1/resolution
952   // i.e. from f(s) = A * exp(-B*s**2) -> Res = sqrt(B).
953   // average value of B
954   double Bave = 0.0;
955   for(unsigned i=0; i<GMM_m_type_.size(); ++i) {
956     Bave += GMM_m_s_[GMM_m_type_[i]];
957   }
958   Bave /= static_cast<double>(GMM_m_type_.size());
959   // calculate blur factor
960   double blur = 0.0;
961   if(reso*reso>Bave) blur = reso*reso-Bave;
962   else warning("PLUMED should not be used with maps at resolution better than 0.3 nm");
963   // add blur to B
964   for(unsigned i=0; i<GMM_m_s_.size(); ++i) GMM_m_s_[i] += blur;
965   // calculate average resolution
966   double ave_res = 0.0;
967   for(unsigned i=0; i<GMM_m_type_.size(); ++i) {
968     ave_res += sqrt(GMM_m_s_[GMM_m_type_[i]]);
969   }
970   ave_res = ave_res / static_cast<double>(GMM_m_type_.size());
971   log.printf("  experimental map resolution : %3.2f\n", reso);
972   log.printf("  predicted map resolution : %3.2f\n", ave_res);
973   log.printf("  blur factor : %f\n", blur);
974   // now calculate useful stuff
975   VectorGeneric<6> cov, sum, inv_sum;
976   // cycle on all atoms types (4 for the moment)
977   for(unsigned i=0; i<GMM_m_s_.size(); ++i) {
978     // the Gaussian in density (real) space is the FT of scattering factor
979     // f(r) = A * (pi/B)**1.5 * exp(-pi**2/B*r**2)
980     double s = sqrt ( 0.5 * GMM_m_s_[i] ) / pi;
981     // covariance matrix for spherical Gaussian
982     cov[0]=s*s; cov[1]=0.0; cov[2]=0.0;
983     cov[3]=s*s; cov[4]=0.0;
984     cov[5]=s*s;
985     // cycle on all data GMM
986     for(unsigned j=0; j<GMM_d_m_.size(); ++j) {
987       // we need the sum of the covariance matrices
988       for(unsigned k=0; k<6; ++k) sum[k] = cov[k] + GMM_d_cov_[j][k];
989       // and to calculate its determinant
990       double det = sum[0]*(sum[3]*sum[5]-sum[4]*sum[4]);
991       det -= sum[1]*(sum[1]*sum[5]-sum[4]*sum[2]);
992       det += sum[2]*(sum[1]*sum[4]-sum[3]*sum[2]);
993       // calculate prefactor - model weights are already normalized
994       double pre_fact =  cfact_ / sqrt(det) * GMM_d_w_[j] * GMM_m_w_[i];
995       // and its inverse
996       inv_sum[0] = (sum[3]*sum[5] - sum[4]*sum[4])/det;
997       inv_sum[1] = (sum[2]*sum[4] - sum[1]*sum[5])/det;
998       inv_sum[2] = (sum[1]*sum[4] - sum[2]*sum[3])/det;
999       inv_sum[3] = (sum[0]*sum[5] - sum[2]*sum[2])/det;
1000       inv_sum[4] = (sum[2]*sum[1] - sum[0]*sum[4])/det;
1001       inv_sum[5] = (sum[0]*sum[3] - sum[1]*sum[1])/det;
1002       // now we store the prefactor
1003       pre_fact_.push_back(pre_fact);
1004       // and the inverse of the sum
1005       inv_cov_md_.push_back(inv_sum);
1006     }
1007   }
1008   // tabulate exponential
1009   dexp_ = dpcutoff_ / static_cast<double> (nexp_-1);
1010   for(unsigned i=0; i<nexp_; ++i) {
1011     tab_exp_.push_back(exp(-static_cast<double>(i) * dexp_));
1012   }
1013 }
1014 
1015 // get prefactors
get_prefactor_inverse(const VectorGeneric<6> & GMM_cov_0,const VectorGeneric<6> & GMM_cov_1,double & GMM_w_0,double & GMM_w_1,VectorGeneric<6> & sum,VectorGeneric<6> & inv_sum)1016 double EMMI::get_prefactor_inverse
1017 (const VectorGeneric<6> &GMM_cov_0, const VectorGeneric<6> &GMM_cov_1,
1018  double &GMM_w_0, double &GMM_w_1,
1019  VectorGeneric<6> &sum, VectorGeneric<6> &inv_sum)
1020 {
1021 // we need the sum of the covariance matrices
1022   for(unsigned k=0; k<6; ++k) sum[k] = GMM_cov_0[k] + GMM_cov_1[k];
1023 
1024 // and to calculate its determinant
1025   double det = sum[0]*(sum[3]*sum[5]-sum[4]*sum[4]);
1026   det -= sum[1]*(sum[1]*sum[5]-sum[4]*sum[2]);
1027   det += sum[2]*(sum[1]*sum[4]-sum[3]*sum[2]);
1028 
1029 // the prefactor is
1030   double pre_fact =  cfact_ / sqrt(det) * GMM_w_0 * GMM_w_1;
1031 
1032 // and its inverse
1033   inv_sum[0] = (sum[3]*sum[5] - sum[4]*sum[4])/det;
1034   inv_sum[1] = (sum[2]*sum[4] - sum[1]*sum[5])/det;
1035   inv_sum[2] = (sum[1]*sum[4] - sum[2]*sum[3])/det;
1036   inv_sum[3] = (sum[0]*sum[5] - sum[2]*sum[2])/det;
1037   inv_sum[4] = (sum[2]*sum[1] - sum[0]*sum[4])/det;
1038   inv_sum[5] = (sum[0]*sum[3] - sum[1]*sum[1])/det;
1039 
1040 // return pre-factor
1041   return pre_fact;
1042 }
1043 
get_self_overlap(unsigned id)1044 double EMMI::get_self_overlap(unsigned id)
1045 {
1046   double ov_tot = 0.0;
1047   VectorGeneric<6> sum, inv_sum;
1048   Vector ov_der;
1049 // start loop
1050   for(unsigned i=0; i<GMM_d_m_.size(); ++i) {
1051     // call auxiliary method
1052     double pre_fact = get_prefactor_inverse(GMM_d_cov_[id], GMM_d_cov_[i],
1053                                             GMM_d_w_[id],   GMM_d_w_[i], sum, inv_sum);
1054     // add overlap to ov_tot
1055     ov_tot += get_overlap(GMM_d_m_[id], GMM_d_m_[i], pre_fact, inv_sum, ov_der);
1056   }
1057 // and return it
1058   return ov_tot;
1059 }
1060 
1061 // get overlap and derivatives
get_overlap(const Vector & m_m,const Vector & d_m,double & pre_fact,const VectorGeneric<6> & inv_cov_md,Vector & ov_der)1062 double EMMI::get_overlap(const Vector &m_m, const Vector &d_m, double &pre_fact,
1063                          const VectorGeneric<6> &inv_cov_md, Vector &ov_der)
1064 {
1065   Vector md;
1066   // calculate vector difference m_m-d_m with/without pbc
1067   if(pbc_) md = pbcDistance(d_m, m_m);
1068   else     md = delta(d_m, m_m);
1069   // calculate product of transpose of md and inv_cov_md
1070   double p_x = md[0]*inv_cov_md[0]+md[1]*inv_cov_md[1]+md[2]*inv_cov_md[2];
1071   double p_y = md[0]*inv_cov_md[1]+md[1]*inv_cov_md[3]+md[2]*inv_cov_md[4];
1072   double p_z = md[0]*inv_cov_md[2]+md[1]*inv_cov_md[4]+md[2]*inv_cov_md[5];
1073   // calculate product of prod and md
1074   double ov = md[0]*p_x+md[1]*p_y+md[2]*p_z;
1075   // final calculation
1076   ov = pre_fact * exp(-0.5*ov);
1077   // derivatives
1078   ov_der = ov * Vector(p_x, p_y, p_z);
1079   return ov;
1080 }
1081 
1082 // get the exponent of the overlap
get_exp_overlap(const Vector & m_m,const Vector & d_m,const VectorGeneric<6> & inv_cov_md)1083 double EMMI::get_exp_overlap(const Vector &m_m, const Vector &d_m,
1084                              const VectorGeneric<6> &inv_cov_md)
1085 {
1086   Vector md;
1087   // calculate vector difference m_m-d_m with/without pbc
1088   if(pbc_) md = pbcDistance(d_m, m_m);
1089   else     md = delta(d_m, m_m);
1090   // calculate product of transpose of md and inv_cov_md
1091   double p_x = md[0]*inv_cov_md[0]+md[1]*inv_cov_md[1]+md[2]*inv_cov_md[2];
1092   double p_y = md[0]*inv_cov_md[1]+md[1]*inv_cov_md[3]+md[2]*inv_cov_md[4];
1093   double p_z = md[0]*inv_cov_md[2]+md[1]*inv_cov_md[4]+md[2]*inv_cov_md[5];
1094   // calculate product of prod and md
1095   double ov = md[0]*p_x+md[1]*p_y+md[2]*p_z;
1096   return ov;
1097 }
1098 
update_neighbor_list()1099 void EMMI::update_neighbor_list()
1100 {
1101   // dimension of GMM and atom vectors
1102   unsigned GMM_d_size = GMM_d_m_.size();
1103   unsigned GMM_m_size = GMM_m_type_.size();
1104   // local neighbor list
1105   vector < unsigned > nl_l;
1106   // clear old neighbor list
1107   nl_.clear();
1108 
1109   // cycle on GMM components - in parallel
1110   for(unsigned id=rank_; id<GMM_d_size; id+=size_) {
1111     // overlap lists and map
1112     vector<double> ov_l;
1113     map<double, unsigned> ov_m;
1114     // total overlap with id
1115     double ov_tot = 0.0;
1116     // cycle on all atoms
1117     for(unsigned im=0; im<GMM_m_size; ++im) {
1118       // get index in auxiliary lists
1119       unsigned kaux = GMM_m_type_[im] * GMM_d_size + id;
1120       // calculate exponent of overlap
1121       double expov = get_exp_overlap(GMM_d_m_[id], getPosition(im), inv_cov_md_[kaux]);
1122       // get index of 0.5*expov in tabulated exponential
1123       unsigned itab = static_cast<unsigned> (round( 0.5*expov/dexp_ ));
1124       // check boundaries and skip atom in case
1125       if(itab >= tab_exp_.size()) continue;
1126       // in case calculate overlap
1127       double ov = pre_fact_[kaux] * tab_exp_[itab];
1128       // add to list
1129       ov_l.push_back(ov);
1130       // and map to retrieve atom index
1131       ov_m[ov] = im;
1132       // increase ov_tot
1133       ov_tot += ov;
1134     }
1135     // check if zero size -> ov_tot = 0
1136     if(ov_l.size()==0) continue;
1137     // define cutoff
1138     double ov_cut = ov_tot * nl_cutoff_;
1139     // sort ov_l in ascending order
1140     std::sort(ov_l.begin(), ov_l.end());
1141     // integrate ov_l
1142     double res = 0.0;
1143     for(unsigned i=0; i<ov_l.size(); ++i) {
1144       res += ov_l[i];
1145       // if exceeding the cutoff for overlap, stop
1146       if(res >= ov_cut) break;
1147       else ov_m.erase(ov_l[i]);
1148     }
1149     // now add atoms to neighborlist
1150     for(map<double, unsigned>::iterator it=ov_m.begin(); it!=ov_m.end(); ++it)
1151       nl_l.push_back(id*GMM_m_size+it->second);
1152     // end cycle on GMM components in parallel
1153   }
1154   // find total dimension of neighborlist
1155   vector <int> recvcounts(size_, 0);
1156   recvcounts[rank_] = nl_l.size();
1157   comm.Sum(&recvcounts[0], size_);
1158   int tot_size = accumulate(recvcounts.begin(), recvcounts.end(), 0);
1159   // resize neighbor stuff
1160   nl_.resize(tot_size);
1161   // calculate vector of displacement
1162   vector<int> disp(size_);
1163   disp[0] = 0;
1164   int rank_size = 0;
1165   for(unsigned i=0; i<size_-1; ++i) {
1166     rank_size += recvcounts[i];
1167     disp[i+1] = rank_size;
1168   }
1169   // Allgather neighbor list
1170   comm.Allgatherv(&nl_l[0], recvcounts[rank_], &nl_[0], &recvcounts[0], &disp[0]);
1171   // now resize derivatives
1172   ovmd_der_.resize(tot_size);
1173 }
1174 
prepare()1175 void EMMI::prepare()
1176 {
1177   if(getExchangeStep()) first_time_=true;
1178 }
1179 
1180 // overlap calculator
calculate_overlap()1181 void EMMI::calculate_overlap() {
1182 
1183   if(first_time_ || getExchangeStep() || getStep()%nl_stride_==0) {
1184     update_neighbor_list();
1185     first_time_=false;
1186   }
1187 
1188   // clean temporary vectors
1189   for(unsigned i=0; i<ovmd_.size(); ++i)     ovmd_[i] = 0.0;
1190   for(unsigned i=0; i<ovmd_der_.size(); ++i) ovmd_der_[i] = Vector(0,0,0);
1191 
1192   // we have to cycle over all model and data GMM components in the neighbor list
1193   unsigned GMM_d_size = GMM_d_m_.size();
1194   unsigned GMM_m_size = GMM_m_type_.size();
1195   for(unsigned i=rank_; i<nl_.size(); i=i+size_) {
1196     // get data (id) and atom (im) indexes
1197     unsigned id = nl_[i] / GMM_m_size;
1198     unsigned im = nl_[i] % GMM_m_size;
1199     // get index in auxiliary lists
1200     unsigned kaux = GMM_m_type_[im] * GMM_d_size + id;
1201     // add overlap with im component of model GMM
1202     ovmd_[id] += get_overlap(GMM_d_m_[id], getPosition(im), pre_fact_[kaux],
1203                              inv_cov_md_[kaux], ovmd_der_[i]);
1204   }
1205   // communicate stuff
1206   if(size_>1) {
1207     comm.Sum(&ovmd_[0], ovmd_.size());
1208     comm.Sum(&ovmd_der_[0][0], 3*ovmd_der_.size());
1209   }
1210 }
1211 
scaleEnergy(double s)1212 double EMMI::scaleEnergy(double s)
1213 {
1214   double ene = 0.0;
1215   for(unsigned i=0; i<ovdd_.size(); ++i) {
1216     ene += std::log( abs ( s * ovmd_[i] - ovdd_[i] ) );
1217   }
1218   return ene;
1219 }
1220 
doRegression()1221 double EMMI::doRegression()
1222 {
1223 // standard MC parameters
1224   unsigned MCsteps = 100000;
1225   double kbtmin = 1.0;
1226   double kbtmax = 10.0;
1227   unsigned ncold = 5000;
1228   unsigned nhot = 2000;
1229   double MCacc = 0.0;
1230   double kbt, ebest, scale_best;
1231 
1232 // initial value of scale factor and energy
1233   double scale = random_.RandU01() * ( scale_max_ - scale_min_ ) + scale_min_;
1234   double ene = scaleEnergy(scale);
1235 // set best energy
1236   ebest = ene;
1237 
1238 // MC loop
1239   for(unsigned istep=0; istep<MCsteps; ++istep) {
1240     // get temperature
1241     if(istep%(ncold+nhot)<ncold) kbt = kbtmin;
1242     else kbt = kbtmax;
1243     // propose move in scale
1244     double ds = dscale_ * ( 2.0 * random_.RandU01() - 1.0 );
1245     double new_scale = scale + ds;
1246     // check boundaries
1247     if(new_scale > scale_max_) {new_scale = 2.0 * scale_max_ - new_scale;}
1248     if(new_scale < scale_min_) {new_scale = 2.0 * scale_min_ - new_scale;}
1249     // new energy
1250     double new_ene = scaleEnergy(new_scale);
1251     // accept or reject
1252     bool accept = doAccept(ene, new_ene, kbt);
1253     // in case of acceptance
1254     if(accept) {
1255       scale = new_scale;
1256       ene = new_ene;
1257       MCacc += 1.0;
1258     }
1259     // save best
1260     if(ene<ebest) {
1261       ebest = ene;
1262       scale_best = scale;
1263     }
1264   }
1265 // calculate acceptance
1266   double accscale = MCacc / static_cast<double>(MCsteps);
1267 // global communication
1268   if(!no_aver_ && nrep_>1) {
1269     if(replica_!=0) {
1270       scale_best = 0.0;
1271       ebest = 0.0;
1272       accscale = 0.0;
1273     }
1274     if(rank_==0) {
1275       multi_sim_comm.Sum(&scale_best, 1);
1276       multi_sim_comm.Sum(&ebest, 1);
1277       multi_sim_comm.Sum(&accscale, 1);
1278     }
1279   }
1280   // local communication
1281   if(rank_!=0) {
1282     scale_best = 0.0;
1283     ebest = 0.0;
1284     accscale = 0.0;
1285   }
1286   if(size_>1) {
1287     comm.Sum(&scale_best, 1);
1288     comm.Sum(&ebest, 1);
1289     comm.Sum(&accscale, 1);
1290   }
1291 // set scale parameters
1292   getPntrToComponent("accscale")->set(accscale);
1293   getPntrToComponent("enescale")->set(ebest);
1294 // return scale value
1295   return scale_best;
1296 }
1297 
get_annealing(long int step)1298 double EMMI::get_annealing(long int step)
1299 {
1300 // default no annealing
1301   double fact = 1.0;
1302 // position in annealing cycle
1303   unsigned nc = step%(4*nanneal_);
1304 // useful doubles
1305   double ncd = static_cast<double>(nc);
1306   double nn  = static_cast<double>(nanneal_);
1307 // set fact
1308   if(nc>=nanneal_   && nc<2*nanneal_) fact = (kanneal_-1.0) / nn * ( ncd - nn ) + 1.0;
1309   if(nc>=2*nanneal_ && nc<3*nanneal_) fact = kanneal_;
1310   if(nc>=3*nanneal_)                  fact = (1.0-kanneal_) / nn * ( ncd - 3.0*nn) + kanneal_;
1311   return fact;
1312 }
1313 
calculate()1314 void EMMI::calculate()
1315 {
1316 
1317 // calculate CV
1318   calculate_overlap();
1319 
1320   // rescale factor for ensemble average
1321   double escale = 1.0 / static_cast<double>(nrep_);
1322 
1323   // in case of ensemble averaging, calculate average overlap
1324   if(!no_aver_ && nrep_>1) {
1325     // if master node, calculate average across replicas
1326     if(rank_==0) {
1327       multi_sim_comm.Sum(&ovmd_[0], ovmd_.size());
1328       for(unsigned i=0; i<ovmd_.size(); ++i) ovmd_[i] *= escale;
1329     } else {
1330       for(unsigned i=0; i<ovmd_.size(); ++i) ovmd_[i] = 0.0;
1331     }
1332     // local communication
1333     if(size_>1) comm.Sum(&ovmd_[0], ovmd_.size());
1334   }
1335 
1336   // get time step
1337   long int step = getStep();
1338 
1339   // do regression
1340   if(nregres_>0) {
1341     if(step%nregres_==0 && !getExchangeStep()) scale_ = doRegression();
1342     // set scale component
1343     getPntrToComponent("scale")->set(scale_);
1344   }
1345 
1346   // write model overlap to file
1347   if(ovstride_>0 && step%ovstride_==0) write_model_overlap(step);
1348 
1349   // clear energy and virial
1350   ene_ = 0.0;
1351   virial_.zero();
1352 
1353   // Gaussian noise
1354   if(noise_==0) calculate_Gauss();
1355 
1356   // Outliers noise
1357   if(noise_==1) calculate_Outliers();
1358 
1359   // Marginal noise
1360   if(noise_==2) calculate_Marginal();
1361 
1362   // get annealing rescale factor
1363   if(nanneal_>0) {
1364     anneal_ = get_annealing(step);
1365     getPntrToComponent("anneal")->set(anneal_);
1366   }
1367 
1368   // annealing rescale
1369   ene_ /= anneal_;
1370 
1371   // in case of ensemble averaging
1372   if(!no_aver_ && nrep_>1) {
1373     // if master node, sum der_GMMid derivatives and ene
1374     if(rank_==0) {
1375       multi_sim_comm.Sum(&GMMid_der_[0], GMMid_der_.size());
1376       multi_sim_comm.Sum(&ene_, 1);
1377     } else {
1378       // set der_GMMid derivatives and energy to zero
1379       for(unsigned i=0; i<GMMid_der_.size(); ++i) GMMid_der_[i]=0.0;
1380       ene_ = 0.0;
1381     }
1382     // local communication
1383     if(size_>1) {
1384       comm.Sum(&GMMid_der_[0], GMMid_der_.size());
1385       comm.Sum(&ene_, 1);
1386     }
1387   }
1388 
1389   // clean temporary vector
1390   for(unsigned i=0; i<atom_der_.size(); ++i) atom_der_[i] = Vector(0,0,0);
1391 
1392   // get derivatives of bias with respect to atoms
1393   for(unsigned i=rank_; i<nl_.size(); i=i+size_) {
1394     // get indexes of data and model component
1395     unsigned id = nl_[i] / GMM_m_type_.size();
1396     unsigned im = nl_[i] % GMM_m_type_.size();
1397     // chain rule + replica normalization
1398     Vector tot_der = GMMid_der_[id] * ovmd_der_[i] * escale * scale_ / anneal_;
1399     Vector pos;
1400     if(pbc_) pos = pbcDistance(GMM_d_m_[id], getPosition(im)) + GMM_d_m_[id];
1401     else     pos = getPosition(im);
1402     // increment derivatives and virial
1403     atom_der_[im] += tot_der;
1404     virial_ += Tensor(pos, -tot_der);
1405   }
1406 
1407   // communicate local derivatives and virial
1408   if(size_>1) {
1409     comm.Sum(&atom_der_[0][0], 3*atom_der_.size());
1410     comm.Sum(virial_);
1411   }
1412 
1413   // set derivatives, virial, and score
1414   for(unsigned i=0; i<atom_der_.size(); ++i) setAtomsDerivatives(getPntrToComponent("scoreb"), i, atom_der_[i]);
1415   setBoxDerivatives(getPntrToComponent("scoreb"), virial_);
1416   getPntrToComponent("scoreb")->set(ene_);
1417 
1418   // This part is needed only for Gaussian and Outliers noise models
1419   if(noise_!=2) {
1420 
1421     // do Montecarlo
1422     if(dsigma_[0]>0 && step%MCstride_==0 && !getExchangeStep()) doMonteCarlo();
1423 
1424     // print status
1425     if(step%statusstride_==0) print_status(step);
1426 
1427     // calculate acceptance ratio
1428     double acc = MCaccept_ / MCtrials_;
1429 
1430     // set value
1431     getPntrToComponent("acc")->set(acc);
1432 
1433   }
1434 
1435 }
1436 
calculate_Gauss()1437 void EMMI::calculate_Gauss()
1438 {
1439   // cycle on all the GMM groups
1440   for(unsigned i=0; i<GMM_d_grps_.size(); ++i) {
1441     double eneg = 0.0;
1442     // cycle on all the members of the group
1443     for(unsigned j=0; j<GMM_d_grps_[i].size(); ++j) {
1444       // id of the GMM component
1445       int GMMid = GMM_d_grps_[i][j];
1446       // calculate deviation
1447       double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] ) / sigma_[i];
1448       // add to group energy
1449       eneg += 0.5 * dev * dev;
1450       // store derivative for later
1451       GMMid_der_[GMMid] = kbt_ * dev / sigma_[i];
1452     }
1453     // add to total energy along with normalizations and prior
1454     ene_ += kbt_ * ( eneg + (static_cast<double>(GMM_d_grps_[i].size())+prior_) * std::log(sigma_[i]) );
1455   }
1456 }
1457 
calculate_Outliers()1458 void EMMI::calculate_Outliers()
1459 {
1460   // cycle on all the GMM groups
1461   for(unsigned i=0; i<GMM_d_grps_.size(); ++i) {
1462     // cycle on all the members of the group
1463     double eneg = 0.0;
1464     for(unsigned j=0; j<GMM_d_grps_[i].size(); ++j) {
1465       // id of the GMM component
1466       int GMMid = GMM_d_grps_[i][j];
1467       // calculate deviation
1468       double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] ) / sigma_[i];
1469       // add to group energy
1470       eneg += std::log( 1.0 + 0.5 * dev * dev );
1471       // store derivative for later
1472       GMMid_der_[GMMid] = kbt_ / ( 1.0 + 0.5 * dev * dev ) * dev / sigma_[i];
1473     }
1474     // add to total energy along with normalizations and prior
1475     ene_ += kbt_ * ( eneg + (static_cast<double>(GMM_d_grps_[i].size())+prior_) * std::log(sigma_[i]) );
1476   }
1477 }
1478 
calculate_Marginal()1479 void EMMI::calculate_Marginal()
1480 {
1481   // cycle on all the GMM groups
1482   for(unsigned i=0; i<GMM_d_grps_.size(); ++i) {
1483     // cycle on all the members of the group
1484     for(unsigned j=0; j<GMM_d_grps_[i].size(); ++j) {
1485       // id of the GMM component
1486       int GMMid = GMM_d_grps_[i][j];
1487       // calculate deviation
1488       double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] );
1489       // calculate errf
1490       double errf = erf ( dev * inv_sqrt2_ / sigma_min_[i] );
1491       // add to group energy
1492       ene_ += -kbt_ * std::log ( 0.5 / dev * errf ) ;
1493       // store derivative for later
1494       GMMid_der_[GMMid] = - kbt_/errf*sqrt2_pi_*exp(-0.5*dev*dev/sigma_min_[i]/sigma_min_[i])/sigma_min_[i]+kbt_/dev;
1495     }
1496   }
1497 }
1498 
1499 }
1500 }
1501