1 /***************************************************************************
2 * Copyright (C) 2009 by BUI Quang Minh *
3 * minh.bui@univie.ac.at *
4 * *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
9 * *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ***************************************************************************/
20 #include "partitionmodel.h"
21 #include "alignment/superalignment.h"
22 #include "model/rategamma.h"
23 #include "model/modelmarkov.h"
24
PartitionModel()25 PartitionModel::PartitionModel()
26 : ModelFactory()
27 {
28 linked_alpha = -1.0;
29 opt_gamma_invar = false;
30 }
31
PartitionModel(Params & params,PhyloSuperTree * tree,ModelsBlock * models_block)32 PartitionModel::PartitionModel(Params ¶ms, PhyloSuperTree *tree, ModelsBlock *models_block)
33 : ModelFactory()
34 {
35 store_trans_matrix = params.store_trans_matrix;
36 is_storing = false;
37 joint_optimize = params.optimize_model_rate_joint;
38 fused_mix_rate = false;
39 linked_alpha = -1.0;
40 opt_gamma_invar = false;
41
42 // create dummy model
43 model = new ModelSubst(tree->aln->num_states);
44 site_rate = new RateHeterogeneity();
45 site_rate->setTree(tree);
46
47 // string model_name = params.model_name;
48 PhyloSuperTree::iterator it;
49 int part;
50 if (params.link_alpha) {
51 params.gamma_shape = fabs(params.gamma_shape);
52 linked_alpha = params.gamma_shape;
53 }
54 double init_by_divmat = false;
55 if (params.model_name_init && strcmp(params.model_name_init, "DIVMAT") == 0) {
56 init_by_divmat = true;
57 params.model_name_init = NULL;
58 }
59 for (it = tree->begin(), part = 0; it != tree->end(); it++, part++) {
60 ASSERT(!((*it)->getModelFactory()));
61 string model_name = (*it)->aln->model_name;
62 if (model_name == "") // if empty, take model name from command option
63 model_name = params.model_name;
64 (*it)->setModelFactory(new ModelFactory(params, model_name, (*it), models_block));
65 (*it)->setModel((*it)->getModelFactory()->model);
66 (*it)->setRate((*it)->getModelFactory()->site_rate);
67
68 // link models between partitions
69 if (params.link_model) {
70 (*it)->getModel()->fixParameters(true);
71 if (linked_models.find((*it)->getModel()->getName()) == linked_models.end()) {
72 linked_models[(*it)->getModel()->getName()] = (*it)->getModel();
73 }
74 } else if ((*it)->aln->getNSeq() < tree->aln->getNSeq() && params.partition_type != TOPO_UNLINKED &&
75 (*it)->getModel()->freq_type == FREQ_EMPIRICAL && (*it)->aln->seq_type != SEQ_CODON) {
76 // modify state_freq to account for empty sequences
77 (*it)->aln->computeStateFreq((*it)->getModel()->state_freq, (*it)->aln->getNSite() * (tree->aln->getNSeq() - (*it)->aln->getNSeq()));
78 (*it)->getModel()->decomposeRateMatrix();
79 }
80
81 //string taxa_set = ((SuperAlignment*)tree->aln)->getPattern(part);
82 //(*it)->copyTree(tree, taxa_set);
83 //(*it)->drawTree(cout);
84 }
85 if (init_by_divmat) {
86 ASSERT(0 && "init_by_div_mat not working");
87 int nstates = linked_models.begin()->second->num_states;
88 double *pair_freq = new double[nstates * nstates];
89 double *state_freq = new double[nstates];
90 tree->aln->computeDivergenceMatrix(pair_freq, state_freq);
91 /*
92 MatrixXd divmat = Map<Matrix<double,Dynamic, Dynamic, RowMajor> > (pair_freq, nstates, nstates);
93 cout << "DivMat: " << endl << divmat << endl;
94 auto pi = Map<VectorXd>(state_freq, nstates);
95 MatrixXd Q = (pi.asDiagonal() * divmat).log();
96 cout << "Q: " << endl << Q << endl;
97 cout << "rowsum: " << Q.rowwise().sum() << endl;
98 Map<Matrix<double,Dynamic, Dynamic, RowMajor> >(pair_freq, nstates, nstates) = Q;
99 */
100 ((ModelMarkov*)linked_models.begin()->second)->setFullRateMatrix(pair_freq, state_freq);
101 ((ModelMarkov*)linked_models.begin()->second)->decomposeRateMatrix();
102 delete [] state_freq;
103 delete [] pair_freq;
104
105 } else
106 for (auto mit = linked_models.begin(); mit != linked_models.end(); mit++) {
107 PhyloSuperTree *stree = (PhyloSuperTree*)site_rate->phylo_tree;
108 if (mit->second->freq_type != FREQ_ESTIMATE && mit->second->freq_type != FREQ_EMPIRICAL)
109 continue;
110 // count state occurrences
111 size_t *sum_state_counts = NULL;
112 int num_parts = 0;
113 for (it = stree->begin(); it != stree->end(); it++) {
114 if ((*it)->getModel()->getName() == mit->second->getName()) {
115 num_parts++;
116 if ((*it)->aln->seq_type == SEQ_CODON)
117 outError("Linking codon models not supported");
118 if ((*it)->aln->seq_type == SEQ_POMO)
119 outError("Linking POMO models not supported");
120 size_t state_counts[(*it)->aln->STATE_UNKNOWN+1];
121 size_t unknown_states = 0;
122 if( params.partition_type != TOPO_UNLINKED)
123 unknown_states = (*it)->aln->getNSite() * (tree->aln->getNSeq() - (*it)->aln->getNSeq());
124 (*it)->aln->countStates(state_counts, unknown_states);
125 if (!sum_state_counts) {
126 sum_state_counts = new size_t[(*it)->aln->STATE_UNKNOWN+1];
127 memset(sum_state_counts, 0, sizeof(size_t)*((*it)->aln->STATE_UNKNOWN+1));
128 }
129 for (int state = 0; state <= (*it)->aln->STATE_UNKNOWN; state++)
130 sum_state_counts[state] += state_counts[state];
131 }
132 }
133 cout << "Linking " << mit->first << " model across " << num_parts << " partitions" << endl;
134 int nstates = mit->second->num_states;
135 double sum_state_freq[nstates];
136 // convert counts to frequencies
137 for (it = stree->begin(); it != stree->end(); it++) {
138 if ((*it)->getModel()->getName() == mit->second->getName()) {
139 (*it)->aln->convertCountToFreq(sum_state_counts, sum_state_freq);
140 break;
141 }
142 }
143
144 cout << "Mean state frequencies:";
145 int prec = cout.precision(8);
146 for (int state = 0; state < mit->second->num_states; state++)
147 cout << " " << sum_state_freq[state];
148 cout << endl;
149 cout.precision(prec);
150
151 for (it = stree->begin(); it != stree->end(); it++)
152 if ((*it)->getModel()->getName() == mit->second->getName()) {
153 ((ModelMarkov*)(*it)->getModel())->adaptStateFrequency(sum_state_freq);
154 (*it)->getModel()->decomposeRateMatrix();
155 }
156 delete [] sum_state_counts;
157 }
158 }
159
setCheckpoint(Checkpoint * checkpoint)160 void PartitionModel::setCheckpoint(Checkpoint *checkpoint) {
161 ModelFactory::setCheckpoint(checkpoint);
162 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
163 for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++)
164 (*it)->getModelFactory()->setCheckpoint(checkpoint);
165 }
166
startCheckpoint()167 void PartitionModel::startCheckpoint() {
168 checkpoint->startStruct("PartitionModel");
169 }
170
saveCheckpoint()171 void PartitionModel::saveCheckpoint() {
172 startCheckpoint();
173 CKP_SAVE(linked_alpha);
174 for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
175 checkpoint->startStruct(it->first);
176 bool fixed = it->second->fixParameters(false);
177 it->second->saveCheckpoint();
178 it->second->fixParameters(fixed);
179 checkpoint->endStruct();
180 }
181 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
182 int part = 0;
183 for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++, part++) {
184 checkpoint->startStruct((*it)->aln->name);
185 (*it)->getModelFactory()->saveCheckpoint();
186 checkpoint->endStruct();
187 }
188 endCheckpoint();
189
190 CheckpointFactory::saveCheckpoint();
191 }
192
restoreCheckpoint()193 void PartitionModel::restoreCheckpoint() {
194 CheckpointFactory::restoreCheckpoint();
195 startCheckpoint();
196 CKP_RESTORE(linked_alpha);
197
198 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
199 int part = 0;
200 for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++, part++) {
201 checkpoint->startStruct((*it)->aln->name);
202 (*it)->getModelFactory()->restoreCheckpoint();
203 checkpoint->endStruct();
204 }
205
206 // restore linked models
207 for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
208 checkpoint->startStruct(it->first);
209 for (auto tit = tree->begin(); tit != tree->end(); tit++)
210 if ((*tit)->getModel()->getName() == it->first) {
211 bool fixed = (*tit)->getModel()->fixParameters(false);
212 (*tit)->getModel()->restoreCheckpoint();
213 (*tit)->getModel()->fixParameters(fixed);
214 }
215 checkpoint->endStruct();
216 }
217
218 endCheckpoint();
219 }
220
getNParameters(int brlen_type)221 int PartitionModel::getNParameters(int brlen_type) {
222 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
223 int df = 0;
224 for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++) {
225 df += (*it)->getModelFactory()->getNParameters(brlen_type);
226 }
227 if (linked_alpha > 0)
228 df ++;
229 for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
230 bool fixed = it->second->fixParameters(false);
231 df += it->second->getNDim() + it->second->getNDimFreq();
232 it->second->fixParameters(fixed);
233 }
234 return df;
235 }
236
computeFunction(double shape)237 double PartitionModel::computeFunction(double shape) {
238 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
239 double res = 0.0;
240 int ntrees = tree->size();
241 linked_alpha = shape;
242 if (tree->part_order.empty()) tree->computePartitionOrder();
243 #ifdef _OPENMP
244 #pragma omp parallel for reduction(+: res) schedule(dynamic) if(tree->num_threads > 1)
245 #endif
246 for (int j = 0; j < ntrees; j++) {
247 int i = tree->part_order[j];
248 if (tree->at(i)->getRate()->isGammaRate())
249 res += tree->at(i)->getRate()->computeFunction(shape);
250 }
251 if (res == 0.0)
252 outError("No partition has Gamma rate heterogeneity!");
253 return res;
254 }
255
optimizeLinkedAlpha(bool write_info,double gradient_epsilon)256 double PartitionModel::optimizeLinkedAlpha(bool write_info, double gradient_epsilon) {
257 if (write_info)
258 cout << "Optimizing linked gamma shape..." << endl;
259 double negative_lh;
260 double current_shape = linked_alpha;
261 double ferror, optx;
262 optx = minimizeOneDimen(site_rate->getTree()->params->min_gamma_shape, current_shape, MAX_GAMMA_SHAPE, max(gradient_epsilon, TOL_GAMMA_SHAPE), &negative_lh, &ferror);
263 double tree_lh = site_rate->getTree()->computeLikelihood();
264 if (write_info) {
265 cout << "Linked alpha across partitions: " << linked_alpha << endl;
266 cout << "Linked alpha log-likelihood: " << tree_lh << endl;
267 }
268 return tree_lh;
269
270 }
271
getNDim()272 int PartitionModel::getNDim() {
273 return model->getNDim();
274 }
275
targetFunk(double x[])276 double PartitionModel::targetFunk(double x[]) {
277 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
278
279 double res = 0;
280 int ntrees = tree->size();
281 if (tree->part_order.empty()) tree->computePartitionOrder();
282 #ifdef _OPENMP
283 #pragma omp parallel for reduction(+: res) schedule(dynamic) if(tree->num_threads > 1)
284 #endif
285 for (int j = 0; j < ntrees; j++) {
286 int i = tree->part_order[j];
287 ModelSubst *part_model = tree->at(i)->getModel();
288 if (part_model->getName() != model->getName())
289 continue;
290 bool fixed = part_model->fixParameters(false);
291 res += part_model->targetFunk(x);
292 part_model->fixParameters(fixed);
293 }
294 if (res == 0.0)
295 outError("No partition has model ", model->getName());
296 return res;
297 }
298
setVariables(double * variables)299 void PartitionModel::setVariables(double *variables) {
300 model->setVariables(variables);
301 }
302
getVariables(double * variables)303 bool PartitionModel::getVariables(double *variables) {
304 bool changed = false;
305 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
306 for (auto it = tree->begin(); it != tree->end(); it++)
307 if ((*it)->getModel()->getName() == model->getName())
308 changed |= (*it)->getModel()->getVariables(variables);
309 return changed;
310 }
311
scaleStateFreq(bool sum_one)312 void PartitionModel::scaleStateFreq(bool sum_one) {
313 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
314 for (auto it = tree->begin(); it != tree->end(); it++)
315 if ((*it)->getModel()->getName() == model->getName())
316 ((ModelMarkov*)(*it)->getModel())->scaleStateFreq(sum_one);
317 }
318
optimizeLinkedModel(bool write_info,double gradient_epsilon)319 double PartitionModel::optimizeLinkedModel(bool write_info, double gradient_epsilon) {
320 int ndim = getNDim();
321
322 // return if nothing to be optimized
323 if (ndim == 0) return 0.0;
324
325 if (write_info)
326 cout << "Optimizing linked " << model->getName() << " parameters across all partitions (" << ndim << " free parameters)" << endl;
327
328 if (verbose_mode >= VB_MAX)
329 cout << "Optimizing " << model->name << " model parameters..." << endl;
330
331 //if (freq_type == FREQ_ESTIMATE) scaleStateFreq(false);
332
333 double *variables = new double[ndim+1]; // used for BFGS numerical recipes
334 double *variables2 = new double[ndim+1]; // used for L-BFGS-B
335 double *upper_bound = new double[ndim+1];
336 double *lower_bound = new double[ndim+1];
337 bool *bound_check = new bool[ndim+1];
338 double score;
339
340
341 // by BFGS algorithm
342 setVariables(variables);
343 setVariables(variables2);
344 ((ModelMarkov*)model)->setBounds(lower_bound, upper_bound, bound_check);
345 // expand the bound for linked model
346 // for (int i = 1; i <= ndim; i++) {
347 // lower_bound[i] = MIN_RATE*0.2;
348 // upper_bound[i] = MAX_RATE*2.0;
349 // }
350 if (Params::getInstance().optimize_alg.find("BFGS-B") == string::npos)
351 score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_RATE));
352 else
353 score = -L_BFGS_B(ndim, variables+1, lower_bound+1, upper_bound+1, max(gradient_epsilon, TOL_RATE));
354
355 bool changed = getVariables(variables);
356
357 /* 2019-09-05: REMOVED due to numerical issue (NAN) with L-BFGS-B
358 // 2017-12-06: more robust optimization using 2 different routines
359 // when estimates are at boundary
360 score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_RATE));
361 bool changed = getVariables(variables);
362
363 if (model->isUnstableParameters()) {
364 // parameters at boundary, restart with L-BFGS-B with parameters2
365 double score2 = -L_BFGS_B(ndim, variables2+1, lower_bound+1, upper_bound+1, max(gradient_epsilon, TOL_RATE));
366 if (score2 > score+0.1) {
367 if (verbose_mode >= VB_MED)
368 cout << "NICE: L-BFGS-B found better parameters with LnL=" << score2 << " than BFGS LnL=" << score << endl;
369 changed = getVariables(variables2);
370 score = score2;
371 } else {
372 // otherwise, revert what BFGS found
373 changed = getVariables(variables);
374 }
375 }
376 */
377
378 // BQM 2015-09-07: normalize state_freq
379 if (model->isReversible() && model->freq_type == FREQ_ESTIMATE) {
380 scaleStateFreq(true);
381 changed = true;
382 }
383 if (changed) {
384 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
385 for (auto it = tree->begin(); it != tree->end(); it++)
386 if ((*it)->getModel()->getName() == model->getName())
387 (*it)->getModel()->decomposeRateMatrix();
388 site_rate->phylo_tree->clearAllPartialLH();
389 score = site_rate->phylo_tree->computeLikelihood();
390 }
391
392 delete [] bound_check;
393 delete [] lower_bound;
394 delete [] upper_bound;
395 delete [] variables2;
396 delete [] variables;
397
398 if (write_info) {
399 cout << "Linked-model log-likelihood: " << score << endl;
400 }
401
402 return score;
403 }
404
optimizeLinkedModels(bool write_info,double gradient_epsilon)405 double PartitionModel::optimizeLinkedModels(bool write_info, double gradient_epsilon) {
406 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
407 double tree_lh;
408 for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
409 ModelSubst *saved_model = model;
410 model = it->second;
411 PhyloSuperTree::iterator part_tree;
412 // un-fix model parameters
413 for (part_tree = tree->begin(); part_tree != tree->end(); part_tree++)
414 if ((*part_tree)->getModel()->getName() == model->getName())
415 (*part_tree)->getModel()->fixParameters(false);
416
417 // main call to optimize linked model parameters
418 tree_lh = optimizeLinkedModel(write_info, gradient_epsilon);
419
420 // fix model parameters again
421 for (part_tree = tree->begin(); part_tree != tree->end(); part_tree++)
422 if ((*part_tree)->getModel()->getName() == model->getName())
423 (*part_tree)->getModel()->fixParameters(true);
424
425 saveCheckpoint();
426 getCheckpoint()->dump();
427 model = saved_model;
428 }
429
430 return site_rate->phylo_tree->computeLikelihood();
431 }
432
reportLinkedModel(ostream & out)433 void PartitionModel::reportLinkedModel(ostream &out) {
434 if (linked_alpha > 0.0)
435 out << "Linked alpha across partitions: " << linked_alpha << endl;
436 for (auto it = linked_models.begin(); it != linked_models.end(); it++) {
437 out << "Linked model " << it->first << ": " << endl;
438 it->second->report(out);
439 }
440 }
441
isLinkedModel()442 bool PartitionModel::isLinkedModel() {
443 return Params::getInstance().link_alpha || (linked_models.size()>0);
444 }
445
optimizeParameters(int fixed_len,bool write_info,double logl_epsilon,double gradient_epsilon)446 double PartitionModel::optimizeParameters(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
447 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
448 double prev_tree_lh = -DBL_MAX, tree_lh = 0.0;
449 int ntrees = tree->size();
450
451 for (int step = 0; step < Params::getInstance().model_opt_steps; step++) {
452 tree_lh = 0.0;
453 if (tree->part_order.empty()) tree->computePartitionOrder();
454 #ifdef _OPENMP
455 #pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(tree->num_threads > 1)
456 #endif
457 for (int i = 0; i < ntrees; i++) {
458 int part = tree->part_order[i];
459 double score;
460 if (opt_gamma_invar)
461 score = tree->at(part)->getModelFactory()->optimizeParametersGammaInvar(fixed_len,
462 write_info && verbose_mode >= VB_MED,
463 logl_epsilon/min(ntrees,10), gradient_epsilon/min(ntrees,10));
464 else
465 score = tree->at(part)->getModelFactory()->optimizeParameters(fixed_len,
466 write_info && verbose_mode >= VB_MED,
467 logl_epsilon/min(ntrees,10), gradient_epsilon/min(ntrees,10));
468 tree_lh += score;
469 if (write_info)
470 #ifdef _OPENMP
471 #pragma omp critical
472 #endif
473 {
474 cout << "Partition " << tree->at(part)->aln->name
475 << " / Model: " << tree->at(part)->getModelName()
476 << " / df: " << tree->at(part)->getModelFactory()->getNParameters(fixed_len)
477 << " / LogL: " << score << endl;
478 }
479 }
480 //return ModelFactory::optimizeParameters(fixed_len, write_info);
481
482 if (!isLinkedModel())
483 break;
484
485 if (verbose_mode >= VB_MED || write_info)
486 cout << step+1 << ". Log-likelihood: " << tree_lh << endl;
487
488 if (tree->params->link_alpha) {
489 tree_lh = optimizeLinkedAlpha(write_info, gradient_epsilon);
490 }
491
492 // optimize linked models
493 if (!linked_models.empty()) {
494 double new_tree_lh = optimizeLinkedModels(write_info, gradient_epsilon);
495 ASSERT(new_tree_lh > tree_lh - 0.1);
496 tree_lh = new_tree_lh;
497 }
498
499 if (tree_lh-logl_epsilon*10 < prev_tree_lh)
500 break;
501 prev_tree_lh = tree_lh;
502 }
503
504 if (verbose_mode >= VB_MED || write_info)
505 cout << "Optimal log-likelihood: " << tree_lh << endl;
506 // write linked_models
507 if (verbose_mode <= VB_MIN && write_info) {
508 for (auto it = linked_models.begin(); it != linked_models.end(); it++)
509 it->second->writeInfo(cout);
510 }
511 return tree_lh;
512 }
513
514
optimizeParametersGammaInvar(int fixed_len,bool write_info,double logl_epsilon,double gradient_epsilon)515 double PartitionModel::optimizeParametersGammaInvar(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
516 opt_gamma_invar = true;
517 double tree_lh = optimizeParameters(fixed_len, write_info, logl_epsilon, gradient_epsilon);
518 opt_gamma_invar = false;
519 return tree_lh;
520 }
521
522
~PartitionModel()523 PartitionModel::~PartitionModel()
524 {
525 }
526
isUnstableParameters()527 bool PartitionModel::isUnstableParameters() {
528 PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
529
530 for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++)
531 if ((*it)->getModelFactory()->isUnstableParameters()) {
532 return true;
533 }
534 return false;
535 }
536