1 /***************************************************************************
2  *   Copyright (C) 2009 by BUI Quang Minh   *
3  *   minh.bui@univie.ac.at   *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20 #include "partitionmodel.h"
21 #include "alignment/superalignment.h"
22 #include "model/rategamma.h"
23 #include "model/modelmarkov.h"
24 
PartitionModel()25 PartitionModel::PartitionModel()
26         : ModelFactory()
27 {
28 	linked_alpha = -1.0;
29     opt_gamma_invar = false;
30 }
31 
PartitionModel(Params & params,PhyloSuperTree * tree,ModelsBlock * models_block)32 PartitionModel::PartitionModel(Params &params, PhyloSuperTree *tree, ModelsBlock *models_block)
33         : ModelFactory()
34 {
35 	store_trans_matrix = params.store_trans_matrix;
36 	is_storing = false;
37 	joint_optimize = params.optimize_model_rate_joint;
38 	fused_mix_rate = false;
39     linked_alpha = -1.0;
40     opt_gamma_invar = false;
41 
42 	// create dummy model
43 	model = new ModelSubst(tree->aln->num_states);
44 	site_rate = new RateHeterogeneity();
45 	site_rate->setTree(tree);
46 
47 //    string model_name = params.model_name;
48     PhyloSuperTree::iterator it;
49     int part;
50     if (params.link_alpha) {
51         params.gamma_shape = fabs(params.gamma_shape);
52         linked_alpha = params.gamma_shape;
53     }
54     double init_by_divmat = false;
55     if (params.model_name_init && strcmp(params.model_name_init, "DIVMAT") == 0) {
56         init_by_divmat = true;
57         params.model_name_init = NULL;
58     }
59     for (it = tree->begin(), part = 0; it != tree->end(); it++, part++) {
60         ASSERT(!((*it)->getModelFactory()));
61         string model_name = (*it)->aln->model_name;
62         if (model_name == "") // if empty, take model name from command option
63         	model_name = params.model_name;
64         (*it)->setModelFactory(new ModelFactory(params, model_name, (*it), models_block));
65         (*it)->setModel((*it)->getModelFactory()->model);
66         (*it)->setRate((*it)->getModelFactory()->site_rate);
67 
68         // link models between partitions
69         if (params.link_model) {
70             (*it)->getModel()->fixParameters(true);
71             if (linked_models.find((*it)->getModel()->getName()) == linked_models.end()) {
72                 linked_models[(*it)->getModel()->getName()] = (*it)->getModel();
73             }
74         } else if ((*it)->aln->getNSeq() < tree->aln->getNSeq() && params.partition_type != TOPO_UNLINKED &&
75             (*it)->getModel()->freq_type == FREQ_EMPIRICAL && (*it)->aln->seq_type != SEQ_CODON) {
76         	// modify state_freq to account for empty sequences
77         	(*it)->aln->computeStateFreq((*it)->getModel()->state_freq, (*it)->aln->getNSite() * (tree->aln->getNSeq() - (*it)->aln->getNSeq()));
78         	(*it)->getModel()->decomposeRateMatrix();
79         }
80 
81         //string taxa_set = ((SuperAlignment*)tree->aln)->getPattern(part);
82         //(*it)->copyTree(tree, taxa_set);
83         //(*it)->drawTree(cout);
84     }
85     if (init_by_divmat) {
86         ASSERT(0 && "init_by_div_mat not working");
87         int nstates = linked_models.begin()->second->num_states;
88         double *pair_freq = new double[nstates * nstates];
89         double *state_freq = new double[nstates];
90         tree->aln->computeDivergenceMatrix(pair_freq, state_freq);
91         /*
92         MatrixXd divmat = Map<Matrix<double,Dynamic, Dynamic, RowMajor> > (pair_freq, nstates, nstates);
93         cout << "DivMat: " << endl << divmat << endl;
94         auto pi = Map<VectorXd>(state_freq, nstates);
95         MatrixXd Q = (pi.asDiagonal() * divmat).log();
96         cout << "Q: " << endl << Q << endl;
97         cout << "rowsum: " << Q.rowwise().sum() << endl;
98         Map<Matrix<double,Dynamic, Dynamic, RowMajor> >(pair_freq, nstates, nstates) = Q;
99          */
100         ((ModelMarkov*)linked_models.begin()->second)->setFullRateMatrix(pair_freq, state_freq);
101         ((ModelMarkov*)linked_models.begin()->second)->decomposeRateMatrix();
102         delete [] state_freq;
103         delete [] pair_freq;
104 
105     } else
106     for (auto mit = linked_models.begin(); mit != linked_models.end(); mit++) {
107         PhyloSuperTree *stree = (PhyloSuperTree*)site_rate->phylo_tree;
108         if (mit->second->freq_type != FREQ_ESTIMATE && mit->second->freq_type != FREQ_EMPIRICAL)
109             continue;
110         // count state occurrences
111         size_t *sum_state_counts = NULL;
112         int num_parts = 0;
113         for (it = stree->begin(); it != stree->end(); it++) {
114             if ((*it)->getModel()->getName() == mit->second->getName()) {
115                 num_parts++;
116                 if ((*it)->aln->seq_type == SEQ_CODON)
117                     outError("Linking codon models not supported");
118                 if ((*it)->aln->seq_type == SEQ_POMO)
119                     outError("Linking POMO models not supported");
120                 size_t state_counts[(*it)->aln->STATE_UNKNOWN+1];
121                 size_t unknown_states = 0;
122                 if( params.partition_type != TOPO_UNLINKED)
123                     unknown_states = (*it)->aln->getNSite() * (tree->aln->getNSeq() - (*it)->aln->getNSeq());
124                 (*it)->aln->countStates(state_counts, unknown_states);
125                 if (!sum_state_counts) {
126                     sum_state_counts = new size_t[(*it)->aln->STATE_UNKNOWN+1];
127                     memset(sum_state_counts, 0, sizeof(size_t)*((*it)->aln->STATE_UNKNOWN+1));
128                 }
129                 for (int state = 0; state <= (*it)->aln->STATE_UNKNOWN; state++)
130                     sum_state_counts[state] += state_counts[state];
131             }
132         }
133         cout << "Linking " << mit->first << " model across " << num_parts << " partitions" << endl;
134         int nstates = mit->second->num_states;
135         double sum_state_freq[nstates];
136         // convert counts to frequencies
137         for (it = stree->begin(); it != stree->end(); it++) {
138             if ((*it)->getModel()->getName() == mit->second->getName()) {
139                 (*it)->aln->convertCountToFreq(sum_state_counts, sum_state_freq);
140                 break;
141             }
142         }
143 
144         cout << "Mean state frequencies:";
145         int prec = cout.precision(8);
146         for (int state = 0; state < mit->second->num_states; state++)
147             cout << " " << sum_state_freq[state];
148         cout << endl;
149         cout.precision(prec);
150 
151         for (it = stree->begin(); it != stree->end(); it++)
152             if ((*it)->getModel()->getName() == mit->second->getName()) {
153                 ((ModelMarkov*)(*it)->getModel())->adaptStateFrequency(sum_state_freq);
154                 (*it)->getModel()->decomposeRateMatrix();
155             }
156         delete [] sum_state_counts;
157     }
158 }
159 
setCheckpoint(Checkpoint * checkpoint)160 void PartitionModel::setCheckpoint(Checkpoint *checkpoint) {
161 	ModelFactory::setCheckpoint(checkpoint);
162     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
163     for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++)
164 		(*it)->getModelFactory()->setCheckpoint(checkpoint);
165 }
166 
startCheckpoint()167 void PartitionModel::startCheckpoint() {
168     checkpoint->startStruct("PartitionModel");
169 }
170 
saveCheckpoint()171 void PartitionModel::saveCheckpoint() {
172     startCheckpoint();
173     CKP_SAVE(linked_alpha);
174     for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
175         checkpoint->startStruct(it->first);
176         bool fixed = it->second->fixParameters(false);
177         it->second->saveCheckpoint();
178         it->second->fixParameters(fixed);
179         checkpoint->endStruct();
180     }
181     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
182     int part = 0;
183     for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++, part++) {
184         checkpoint->startStruct((*it)->aln->name);
185         (*it)->getModelFactory()->saveCheckpoint();
186         checkpoint->endStruct();
187     }
188     endCheckpoint();
189 
190     CheckpointFactory::saveCheckpoint();
191 }
192 
restoreCheckpoint()193 void PartitionModel::restoreCheckpoint() {
194     CheckpointFactory::restoreCheckpoint();
195     startCheckpoint();
196     CKP_RESTORE(linked_alpha);
197 
198     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
199     int part = 0;
200     for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++, part++) {
201         checkpoint->startStruct((*it)->aln->name);
202         (*it)->getModelFactory()->restoreCheckpoint();
203         checkpoint->endStruct();
204     }
205 
206     // restore linked models
207     for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
208         checkpoint->startStruct(it->first);
209         for (auto tit = tree->begin(); tit != tree->end(); tit++)
210             if ((*tit)->getModel()->getName() == it->first) {
211                 bool fixed = (*tit)->getModel()->fixParameters(false);
212                 (*tit)->getModel()->restoreCheckpoint();
213                 (*tit)->getModel()->fixParameters(fixed);
214             }
215         checkpoint->endStruct();
216     }
217 
218     endCheckpoint();
219 }
220 
getNParameters(int brlen_type)221 int PartitionModel::getNParameters(int brlen_type) {
222     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
223 	int df = 0;
224     for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++) {
225     	df += (*it)->getModelFactory()->getNParameters(brlen_type);
226     }
227     if (linked_alpha > 0)
228         df ++;
229     for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
230         bool fixed = it->second->fixParameters(false);
231         df += it->second->getNDim() + it->second->getNDimFreq();
232         it->second->fixParameters(fixed);
233     }
234     return df;
235 }
236 
computeFunction(double shape)237 double PartitionModel::computeFunction(double shape) {
238     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
239     double res = 0.0;
240     int ntrees = tree->size();
241     linked_alpha = shape;
242     if (tree->part_order.empty()) tree->computePartitionOrder();
243 #ifdef _OPENMP
244 #pragma omp parallel for reduction(+: res) schedule(dynamic) if(tree->num_threads > 1)
245 #endif
246     for (int j = 0; j < ntrees; j++) {
247         int i = tree->part_order[j];
248         if (tree->at(i)->getRate()->isGammaRate())
249             res += tree->at(i)->getRate()->computeFunction(shape);
250     }
251     if (res == 0.0)
252         outError("No partition has Gamma rate heterogeneity!");
253 	return res;
254 }
255 
optimizeLinkedAlpha(bool write_info,double gradient_epsilon)256 double PartitionModel::optimizeLinkedAlpha(bool write_info, double gradient_epsilon) {
257     if (write_info)
258         cout << "Optimizing linked gamma shape..." << endl;
259 	double negative_lh;
260 	double current_shape = linked_alpha;
261 	double ferror, optx;
262 	optx = minimizeOneDimen(site_rate->getTree()->params->min_gamma_shape, current_shape, MAX_GAMMA_SHAPE, max(gradient_epsilon, TOL_GAMMA_SHAPE), &negative_lh, &ferror);
263     double tree_lh = site_rate->getTree()->computeLikelihood();
264     if (write_info) {
265         cout << "Linked alpha across partitions: " << linked_alpha << endl;
266         cout << "Linked alpha log-likelihood: " << tree_lh << endl;
267     }
268 	return tree_lh;
269 
270 }
271 
getNDim()272 int PartitionModel::getNDim() {
273     return model->getNDim();
274 }
275 
targetFunk(double x[])276 double PartitionModel::targetFunk(double x[]) {
277     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
278 
279     double res = 0;
280     int ntrees = tree->size();
281     if (tree->part_order.empty()) tree->computePartitionOrder();
282 #ifdef _OPENMP
283 #pragma omp parallel for reduction(+: res) schedule(dynamic) if(tree->num_threads > 1)
284 #endif
285     for (int j = 0; j < ntrees; j++) {
286         int i = tree->part_order[j];
287         ModelSubst *part_model = tree->at(i)->getModel();
288         if (part_model->getName() != model->getName())
289             continue;
290         bool fixed = part_model->fixParameters(false);
291         res += part_model->targetFunk(x);
292         part_model->fixParameters(fixed);
293     }
294     if (res == 0.0)
295         outError("No partition has model ", model->getName());
296     return res;
297 }
298 
setVariables(double * variables)299 void PartitionModel::setVariables(double *variables) {
300     model->setVariables(variables);
301 }
302 
getVariables(double * variables)303 bool PartitionModel::getVariables(double *variables) {
304     bool changed = false;
305     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
306     for (auto it = tree->begin(); it != tree->end(); it++)
307         if ((*it)->getModel()->getName() == model->getName())
308             changed |= (*it)->getModel()->getVariables(variables);
309     return changed;
310 }
311 
scaleStateFreq(bool sum_one)312 void PartitionModel::scaleStateFreq(bool sum_one) {
313     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
314     for (auto it = tree->begin(); it != tree->end(); it++)
315         if ((*it)->getModel()->getName() == model->getName())
316             ((ModelMarkov*)(*it)->getModel())->scaleStateFreq(sum_one);
317 }
318 
optimizeLinkedModel(bool write_info,double gradient_epsilon)319 double PartitionModel::optimizeLinkedModel(bool write_info, double gradient_epsilon) {
320     int ndim = getNDim();
321 
322     // return if nothing to be optimized
323     if (ndim == 0) return 0.0;
324 
325     if (write_info)
326         cout << "Optimizing linked " << model->getName() << " parameters across all partitions (" << ndim << " free parameters)" << endl;
327 
328     if (verbose_mode >= VB_MAX)
329         cout << "Optimizing " << model->name << " model parameters..." << endl;
330 
331     //if (freq_type == FREQ_ESTIMATE) scaleStateFreq(false);
332 
333     double *variables = new double[ndim+1]; // used for BFGS numerical recipes
334     double *variables2 = new double[ndim+1]; // used for L-BFGS-B
335     double *upper_bound = new double[ndim+1];
336     double *lower_bound = new double[ndim+1];
337     bool *bound_check = new bool[ndim+1];
338     double score;
339 
340 
341     // by BFGS algorithm
342     setVariables(variables);
343     setVariables(variables2);
344     ((ModelMarkov*)model)->setBounds(lower_bound, upper_bound, bound_check);
345     // expand the bound for linked model
346 //    for (int i = 1; i <= ndim; i++) {
347 //        lower_bound[i] = MIN_RATE*0.2;
348 //        upper_bound[i] = MAX_RATE*2.0;
349 //    }
350     if (Params::getInstance().optimize_alg.find("BFGS-B") == string::npos)
351         score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_RATE));
352     else
353         score = -L_BFGS_B(ndim, variables+1, lower_bound+1, upper_bound+1, max(gradient_epsilon, TOL_RATE));
354 
355     bool changed = getVariables(variables);
356 
357     /* 2019-09-05: REMOVED due to numerical issue (NAN) with L-BFGS-B
358     // 2017-12-06: more robust optimization using 2 different routines
359     // when estimates are at boundary
360     score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_RATE));
361     bool changed = getVariables(variables);
362 
363     if (model->isUnstableParameters()) {
364         // parameters at boundary, restart with L-BFGS-B with parameters2
365         double score2 = -L_BFGS_B(ndim, variables2+1, lower_bound+1, upper_bound+1, max(gradient_epsilon, TOL_RATE));
366         if (score2 > score+0.1) {
367             if (verbose_mode >= VB_MED)
368                 cout << "NICE: L-BFGS-B found better parameters with LnL=" << score2 << " than BFGS LnL=" << score << endl;
369             changed = getVariables(variables2);
370             score = score2;
371         } else {
372             // otherwise, revert what BFGS found
373             changed = getVariables(variables);
374         }
375     }
376     */
377 
378     // BQM 2015-09-07: normalize state_freq
379     if (model->isReversible() && model->freq_type == FREQ_ESTIMATE) {
380         scaleStateFreq(true);
381         changed = true;
382     }
383     if (changed) {
384         PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
385         for (auto it = tree->begin(); it != tree->end(); it++)
386             if ((*it)->getModel()->getName() == model->getName())
387                 (*it)->getModel()->decomposeRateMatrix();
388         site_rate->phylo_tree->clearAllPartialLH();
389         score = site_rate->phylo_tree->computeLikelihood();
390     }
391 
392     delete [] bound_check;
393     delete [] lower_bound;
394     delete [] upper_bound;
395     delete [] variables2;
396     delete [] variables;
397 
398     if (write_info) {
399         cout << "Linked-model log-likelihood: " << score << endl;
400     }
401 
402     return score;
403 }
404 
optimizeLinkedModels(bool write_info,double gradient_epsilon)405 double PartitionModel::optimizeLinkedModels(bool write_info, double gradient_epsilon) {
406     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
407     double tree_lh;
408     for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
409         ModelSubst *saved_model = model;
410         model = it->second;
411         PhyloSuperTree::iterator part_tree;
412         // un-fix model parameters
413         for (part_tree = tree->begin(); part_tree != tree->end(); part_tree++)
414             if ((*part_tree)->getModel()->getName() == model->getName())
415                 (*part_tree)->getModel()->fixParameters(false);
416 
417         // main call to optimize linked model parameters
418         tree_lh = optimizeLinkedModel(write_info, gradient_epsilon);
419 
420         // fix model parameters again
421         for (part_tree = tree->begin(); part_tree != tree->end(); part_tree++)
422             if ((*part_tree)->getModel()->getName() == model->getName())
423                 (*part_tree)->getModel()->fixParameters(true);
424 
425         saveCheckpoint();
426         getCheckpoint()->dump();
427         model = saved_model;
428     }
429 
430     return site_rate->phylo_tree->computeLikelihood();
431 }
432 
reportLinkedModel(ostream & out)433 void PartitionModel::reportLinkedModel(ostream &out) {
434     if (linked_alpha > 0.0)
435         out << "Linked alpha across partitions: " << linked_alpha << endl;
436     for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
437         out << "Linked model " << it->first << ": " << endl;
438         it->second->report(out);
439     }
440 }
441 
isLinkedModel()442 bool PartitionModel::isLinkedModel() {
443     return Params::getInstance().link_alpha || (linked_models.size()>0);
444 }
445 
optimizeParameters(int fixed_len,bool write_info,double logl_epsilon,double gradient_epsilon)446 double PartitionModel::optimizeParameters(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
447     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
448     double prev_tree_lh = -DBL_MAX, tree_lh = 0.0;
449     int ntrees = tree->size();
450 
451     for (int step = 0; step < Params::getInstance().model_opt_steps; step++) {
452         tree_lh = 0.0;
453         if (tree->part_order.empty()) tree->computePartitionOrder();
454         #ifdef _OPENMP
455         #pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(tree->num_threads > 1)
456         #endif
457         for (int i = 0; i < ntrees; i++) {
458             int part = tree->part_order[i];
459             double score;
460             if (opt_gamma_invar)
461                 score = tree->at(part)->getModelFactory()->optimizeParametersGammaInvar(fixed_len,
462                     write_info && verbose_mode >= VB_MED,
463                     logl_epsilon/min(ntrees,10), gradient_epsilon/min(ntrees,10));
464             else
465                 score = tree->at(part)->getModelFactory()->optimizeParameters(fixed_len,
466                     write_info && verbose_mode >= VB_MED,
467                     logl_epsilon/min(ntrees,10), gradient_epsilon/min(ntrees,10));
468             tree_lh += score;
469             if (write_info)
470 #ifdef _OPENMP
471 #pragma omp critical
472 #endif
473             {
474                 cout << "Partition " << tree->at(part)->aln->name
475                      << " / Model: " << tree->at(part)->getModelName()
476                      << " / df: " << tree->at(part)->getModelFactory()->getNParameters(fixed_len)
477                 << " / LogL: " << score << endl;
478             }
479         }
480         //return ModelFactory::optimizeParameters(fixed_len, write_info);
481 
482         if (!isLinkedModel())
483             break;
484 
485         if (verbose_mode >= VB_MED || write_info)
486             cout << step+1 << ". Log-likelihood: " << tree_lh << endl;
487 
488         if (tree->params->link_alpha) {
489             tree_lh = optimizeLinkedAlpha(write_info, gradient_epsilon);
490         }
491 
492         // optimize linked models
493         if (!linked_models.empty()) {
494             double new_tree_lh = optimizeLinkedModels(write_info, gradient_epsilon);
495             ASSERT(new_tree_lh > tree_lh - 0.1);
496             tree_lh = new_tree_lh;
497         }
498 
499         if (tree_lh-logl_epsilon*10 < prev_tree_lh)
500             break;
501         prev_tree_lh = tree_lh;
502     }
503 
504     if (verbose_mode >= VB_MED || write_info)
505 		cout << "Optimal log-likelihood: " << tree_lh << endl;
506     // write linked_models
507     if (verbose_mode <= VB_MIN && write_info) {
508         for (auto it = linked_models.begin(); it != linked_models.end(); it++)
509             it->second->writeInfo(cout);
510     }
511     return tree_lh;
512 }
513 
514 
optimizeParametersGammaInvar(int fixed_len,bool write_info,double logl_epsilon,double gradient_epsilon)515 double PartitionModel::optimizeParametersGammaInvar(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
516     opt_gamma_invar = true;
517     double tree_lh = optimizeParameters(fixed_len, write_info, logl_epsilon, gradient_epsilon);
518     opt_gamma_invar = false;
519     return tree_lh;
520 }
521 
522 
~PartitionModel()523 PartitionModel::~PartitionModel()
524 {
525 }
526 
isUnstableParameters()527 bool PartitionModel::isUnstableParameters() {
528     PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
529 
530 	for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++)
531 		if ((*it)->getModelFactory()->isUnstableParameters()) {
532 			return true;
533 		}
534 	return false;
535 }
536