1 /***************************************************************************
2 * Copyright (C) 2009-2015 by *
3 * BUI Quang Minh <minh.bui@univie.ac.at> *
4 * Lam-Tung Nguyen <nltung@gmail.com> *
5 * *
6 * *
7 * This program is free software; you can redistribute it and/or modify *
8 * it under the terms of the GNU General Public License as published by *
9 * the Free Software Foundation; either version 2 of the License, or *
10 * (at your option) any later version. *
11 * *
12 * This program is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU General Public License for more details. *
16 * *
17 * You should have received a copy of the GNU General Public License *
18 * along with this program; if not, write to the *
19 * Free Software Foundation, Inc., *
20 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
21 ***************************************************************************/
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26 #include <iqtree_config.h>
27 #include "tree/phylotree.h"
28 #include "tree/phylosupertree.h"
29 #include "tree/phylosupertreeplen.h"
30 #include "tree/phylosupertreeunlinked.h"
31 #include "phyloanalysis.h"
32 #include "alignment/alignment.h"
33 #include "alignment/superalignment.h"
34 #include "alignment/superalignmentunlinked.h"
35 #include "tree/iqtree.h"
36 #include "tree/phylotreemixlen.h"
37 #include "model/modelmarkov.h"
38 #include "model/modeldna.h"
39 #include "model/modelpomo.h"
40 #include "nclextra/myreader.h"
41 #include "model/rateheterogeneity.h"
42 #include "model/rategamma.h"
43 #include "model/rateinvar.h"
44 #include "model/rategammainvar.h"
45 //#include "modeltest_wrapper.h"
46 #include "model/modelprotein.h"
47 #include "model/modelbin.h"
48 #include "model/modelcodon.h"
49 #include "utils/stoprule.h"
50
51 #include "tree/mtreeset.h"
52 #include "tree/mexttree.h"
53 #include "model/ratemeyerhaeseler.h"
54 #include "whtest/whtest_wrapper.h"
55 #include "model/partitionmodel.h"
56 #include "model/modelmixture.h"
57 #include "model/modelfactorymixlen.h"
58 //#include "guidedbootstrap.h"
59 #include "model/modelset.h"
60 #include "utils/timeutil.h"
61 #include "tree/upperbounds.h"
62 #include "utils/MPIHelper.h"
63 #include "timetree.h"
64
65 #ifdef USE_BOOSTER
66 extern "C" {
67 #include "booster/booster.h"
68 }
69 #endif
70
71 #ifdef IQTREE_TERRAPHAST
72 #include "terrace/terrace.h"
73 #endif
74
reportReferences(Params & params,ofstream & out)75 void reportReferences(Params ¶ms, ofstream &out) {
76
77 out << "To cite IQ-TREE please use:" << endl << endl
78 << "Bui Quang Minh, Heiko A. Schmidt, Olga Chernomor, Dominik Schrempf," << endl
79 << "Michael D. Woodhams, Arndt von Haeseler, and Robert Lanfear (2020)" << endl
80 << "IQ-TREE 2: New models and efficient methods for phylogenetic inference" << endl
81 << "in the genomic era. Mol. Biol. Evol., in press." << endl
82 << "https://doi.org/10.1093/molbev/msaa015" << endl << endl;
83
84 bool modelfinder_only = false;
85 if (params.model_name.substr(0,4) == "TEST" || params.model_name.substr(0, 2) == "MF" || params.model_name.empty()) {
86 out << "To cite ModelFinder please use: " << endl << endl
87 << "Subha Kalyaanamoorthy, Bui Quang Minh, Thomas KF Wong, Arndt von Haeseler," << endl
88 << "and Lars S Jermiin (2017) ModelFinder: Fast model selection for" << endl
89 << "accurate phylogenetic estimates. Nature Methods, 14:587–589." << endl
90 << "https://doi.org/10.1038/nmeth.4285" << endl << endl;
91 if (params.model_name.find("ONLY") != string::npos || (params.model_name.substr(0,2)=="MF" && params.model_name.substr(0,3)!="MFP"))
92 modelfinder_only = true;
93 }
94 if (posPOMO(params.model_name) != string::npos) {
95 out << "For polymorphism-aware models please cite:" << endl << endl
96 << "Dominik Schrempf, Bui Quang Minh, Nicola De Maio, Arndt von Haeseler, and Carolin Kosiol" << endl
97 << "(2016) Reversible polymorphism-aware phylogenetic models and their application to" << endl
98 << "tree inference. J. Theor. Biol., 407:362–370." << endl
99 << "https://doi.org/10.1016/j.jtbi.2016.07.042" << endl << endl;
100 }
101
102 if (params.site_freq_file || params.tree_freq_file)
103 out << "Since you used site-specific frequency model please also cite: " << endl << endl
104 << "Huai-Chun Wang, Edward Susko, Bui Quang Minh, and Andrew J. Roger (2018)" << endl
105 << "Modeling site heterogeneity with posterior mean site frequency profiles" << endl
106 << "accelerates accurate phylogenomic estimation. Syst. Biol., 67:216–235." << endl
107 << "https://doi.org/10.1093/sysbio/syx068" << endl << endl;
108
109
110 if (params.gbo_replicates)
111 out << "Since you used ultrafast bootstrap (UFBoot) please also cite: " << endl << endl
112 << "Diep Thi Hoang, Olga Chernomor, Arndt von Haeseler, Bui Quang Minh," << endl
113 << "and Le Sy Vinh (2018) UFBoot2: Improving the ultrafast bootstrap" << endl
114 << "approximation. Mol. Biol. Evol., 35:518–522." << endl
115 << "https://doi.org/10.1093/molbev/msx281" << endl << endl;
116
117 if (params.partition_file)
118 out << "Since you used partition models please also cite:" << endl << endl
119 << "Olga Chernomor, Arndt von Haeseler, and Bui Quang Minh (2016)" << endl
120 << "Terrace aware data structure for phylogenomic inference from" << endl
121 << "supermatrices. Syst. Biol., 65:997-1008." << endl
122 << "https://doi.org/10.1093/sysbio/syw037" << endl << endl;
123
124 if (params.terrace_analysis)
125 out << "Since you used terrace analysis please also cite:" << endl << endl
126 << "Biczok R, Bozsoky P, Eisenmann P, Ernst J, Ribizel T, Scholz F," << endl
127 << "Trefzer A, Weber F, Hamann M, Stamatakis A. (2018)" << endl
128 << "Two C++ libraries for counting trees on a phylogenetic" << endl
129 << "terrace. Bioinformatics 34:3399–3401." << endl
130 << "https://doi.org/10.1093/bioinformatics/bty384" << endl << endl;
131
132 if (params.dating_method == "LSD")
133 out << "Since you used least square dating (LSD) please also cite: " << endl << endl
134 << "Thu-Hien To, Matthieu Jung, Samantha Lycett, Olivier Gascuel (2016)" << endl
135 << "Fast dating using least-squares criteria and algorithms. Syst. Biol. 65:82-97." << endl
136 << "https://doi.org/10.1093/sysbio/syv068" << endl << endl;
137 }
138
reportAlignment(ofstream & out,Alignment & alignment,int nremoved_seqs)139 void reportAlignment(ofstream &out, Alignment &alignment, int nremoved_seqs) {
140 out << "Input data: " << alignment.getNSeq()+nremoved_seqs << " sequences with "
141 << alignment.getNSite() << " ";
142 switch (alignment.seq_type) {
143 case SEQ_BINARY: out << "binary"; break;
144 case SEQ_DNA: out << "nucleotide"; break;
145 case SEQ_PROTEIN: out << "amino-acid"; break;
146 case SEQ_CODON: out << "codon"; break;
147 case SEQ_MORPH: out << "morphological"; break;
148 case SEQ_POMO: out << "PoMo"; break;
149 default: out << "unknown"; break;
150 }
151 out << " sites" << endl << "Number of constant sites: "
152 << round(alignment.frac_const_sites * alignment.getNSite())
153 << " (= " << alignment.frac_const_sites * 100 << "% of all sites)" << endl
154
155 << "Number of invariant (constant or ambiguous constant) sites: "
156 << round(alignment.frac_invariant_sites * alignment.getNSite())
157 << " (= " << alignment.frac_invariant_sites * 100 << "% of all sites)" << endl
158
159 << "Number of parsimony informative sites: " << alignment.num_informative_sites << endl
160
161 << "Number of distinct site patterns: " << alignment.size() << endl
162 << endl;
163 }
164
165 /*
166 void pruneModelInfo(ModelCheckpoint &model_info, PhyloSuperTree *tree) {
167 ModelCheckpoint res_info;
168 for (vector<PartitionInfo>::iterator it = tree->part_info.begin(); it != tree->part_info.end(); it++) {
169 for (ModelCheckpoint::iterator mit = model_info.begin(); mit != model_info.end(); mit++)
170 if (mit->set_name == it->name)
171 res_info.push_back(*mit);
172 }
173 model_info = res_info;
174
175 }
176 */
reportModelSelection(ofstream & out,Params & params,ModelCheckpoint * model_info,PhyloTree * tree)177 void reportModelSelection(ofstream &out, Params ¶ms, ModelCheckpoint *model_info, PhyloTree *tree) {
178 out << "Best-fit model according to " << criterionName(params.model_test_criterion) << ": ";
179 // ModelCheckpoint::iterator it;
180 string best_model;
181 PhyloSuperTree *stree = (tree->isSuperTree()) ? ((PhyloSuperTree*)tree) : NULL;
182 if (tree->isSuperTree()) {
183 SuperAlignment *saln = (SuperAlignment*)stree->aln;
184 for (int part = 0; part != stree->size(); part++) {
185 if (part != 0)
186 out << ",";
187 out << saln->partitions[part]->model_name << ":" << saln->partitions[part]->name;
188 }
189 // string set_name = "";
190 // for (it = model_info.begin(); it != model_info.end(); it++) {
191 // if (it->set_name != set_name) {
192 // if (set_name != "")
193 // out << ",";
194 // out << it->name << ":" << it->set_name;
195 // set_name = it->set_name;
196 // }
197 // }
198 } else {
199 // out << model_info[0].name;
200 model_info->getBestModel(best_model);
201 out << best_model;
202 }
203
204 if (tree->isSuperTree()) {
205 out << endl << endl << "List of best-fit models per partition:" << endl << endl;
206 } else {
207 out << endl << endl << "List of models sorted by "
208 << ((params.model_test_criterion == MTC_BIC) ? "BIC" :
209 ((params.model_test_criterion == MTC_AIC) ? "AIC" : "AICc"))
210 << " scores: " << endl << endl;
211 }
212 if (tree->isSuperTree())
213 out << " ID ";
214 out << "Model LogL AIC w-AIC AICc w-AICc BIC w-BIC" << endl;
215 /*
216 if (is_partitioned)
217 out << "----------";
218
219 out << "----------------------------------------------------------------------------------------" << endl;
220 */
221 int setid = 1;
222 out.precision(3);
223
224 CandidateModelSet models;
225 model_info->getOrderedModels(tree, models);
226 for (auto it = models.begin(); it != models.end(); it++) {
227 if (tree->isSuperTree()) {
228 out.width(4);
229 out << right << setid << " ";
230 setid++;
231 }
232 out.width(15);
233 out << left << it->getName() << " ";
234 out.width(11);
235 out << right << it->logl << " ";
236 out.width(11);
237 out << it->AIC_score << ((it->AIC_conf) ? " + " : " - ");
238 out.unsetf(ios::fixed);
239 out.width(8);
240 out << it->AIC_weight << " ";
241 out.setf(ios::fixed);
242 out.width(11);
243 out << it->AICc_score << ((it->AICc_conf) ? " + " : " - ");
244 out.unsetf(ios::fixed);
245 out.width(8);
246 out << it->AICc_weight << " ";
247 out.setf(ios::fixed);
248 out.width(11);
249 out << it->BIC_score << ((it->BIC_conf) ? " + " : " - ");
250 out.unsetf(ios::fixed);
251 out.width(8);
252 out << it->BIC_weight;
253 out.setf(ios::fixed);
254 out << endl;
255 }
256 out.precision(4);
257
258 /* TODO
259 for (it = model_info.begin(); it != model_info.end(); it++) {
260 if (it->AIC_score == DBL_MAX) continue;
261 if (it != model_info.begin() && it->set_name != (it-1)->set_name)
262 setid++;
263 if (is_partitioned && it != model_info.begin() && it->set_name == (it-1)->set_name)
264 continue;
265 if (is_partitioned) {
266 out.width(4);
267 out << right << setid << " ";
268 }
269 out.width(15);
270 out << left << it->name << " ";
271 out.width(11);
272 out << right << it->logl << " ";
273 out.width(11);
274 out << it->AIC_score << ((it->AIC_conf) ? " + " : " - ") << it->AIC_weight << " ";
275 out.width(11);
276 out << it->AICc_score << ((it->AICc_conf) ? " + " : " - ") << it->AICc_weight << " ";
277 out.width(11);
278 out << it->BIC_score << ((it->BIC_conf) ? " + " : " - ") << it->BIC_weight;
279 out << endl;
280 }
281 */
282 out << endl;
283 out << "AIC, w-AIC : Akaike information criterion scores and weights." << endl
284 << "AICc, w-AICc : Corrected AIC scores and weights." << endl
285 << "BIC, w-BIC : Bayesian information criterion scores and weights." << endl << endl
286
287 << "Plus signs denote the 95% confidence sets." << endl
288 << "Minus signs denote significant exclusion." <<endl;
289 out << endl;
290 }
291
reportModel(ofstream & out,Alignment * aln,ModelSubst * m)292 void reportModel(ofstream &out, Alignment *aln, ModelSubst *m) {
293 int i, j, k;
294 ASSERT(aln->num_states == m->num_states);
295 double *rate_mat = new double[m->num_states * m->num_states];
296 if (!m->isSiteSpecificModel())
297 m->getRateMatrix(rate_mat);
298 else
299 ((ModelSet*)m)->front()->getRateMatrix(rate_mat);
300
301 if (m->num_states <= 4) {
302 out << "Rate parameter R:" << endl << endl;
303
304 if (m->num_states > 4)
305 out << fixed;
306 if (m->isReversible()) {
307 for (i = 0, k = 0; i < m->num_states - 1; i++)
308 for (j = i + 1; j < m->num_states; j++, k++) {
309 out << " " << aln->convertStateBackStr(i) << "-" << aln->convertStateBackStr(j) << ": "
310 << rate_mat[k];
311 if (m->num_states <= 4)
312 out << endl;
313 else if (k % 5 == 4)
314 out << endl;
315 }
316
317 } else { // non-reversible model
318 for (i = 0, k = 0; i < m->num_states; i++)
319 for (j = 0; j < m->num_states; j++)
320 if (i != j) {
321 out << " " << aln->convertStateBackStr(i) << "-" << aln->convertStateBackStr(j)
322 << ": " << rate_mat[k];
323 if (m->num_states <= 4)
324 out << endl;
325 else if (k % 5 == 4)
326 out << endl;
327 k++;
328 }
329
330 }
331
332 //if (tree.aln->num_states > 4)
333 out << endl;
334 out.unsetf(ios_base::fixed);
335 } else if (aln->seq_type == SEQ_PROTEIN && m->getNDim() > 20) {
336 ASSERT(m->num_states == 20);
337 out << "WARNING: This model has " << m->getNDim() + m->getNDimFreq() << " parameters that may be overfitting. Please use with caution!" << endl << endl;
338 double full_mat[400];
339
340 out.precision(6);
341
342 if (m->isReversible()) {
343 for (i = 0, k = 0; i < m->num_states - 1; i++)
344 for (j = i + 1; j < m->num_states; j++, k++) {
345 full_mat[i*m->num_states+j] = rate_mat[k];
346 }
347 out << "Substitution parameters (lower-diagonal) and state frequencies in PAML format (can be used as input for IQ-TREE): " << endl << endl;
348 for (i = 1; i < m->num_states; i++) {
349 for (j = 0; j < i; j++)
350 out << " " << full_mat[j*m->num_states+i];
351 out << endl;
352 }
353 } else {
354 // non-reversible model
355 m->getQMatrix(full_mat);
356 out << "Full Q matrix and state frequencies (can be used as input for IQ-TREE): " << endl << endl;
357 for (i = 0; i < m->num_states; i++) {
358 for (j = 0; j < m->num_states; j++)
359 out << " " << full_mat[i*m->num_states+j];
360 out << endl;
361 }
362 }
363 double state_freq[20];
364 m->getStateFrequency(state_freq);
365 for (i = 0; i < m->num_states; i++)
366 out << " " << state_freq[i];
367 out << endl << endl;
368 out.precision(4);
369 }
370
371 delete[] rate_mat;
372
373 if (aln->seq_type == SEQ_POMO) {
374 m->report(out);
375 return;
376 }
377
378 out << "State frequencies: ";
379 if (m->isSiteSpecificModel())
380 out << "(site specific frequencies)" << endl << endl;
381 else {
382 // 2016-11-03: commented out as this is not correct anymore
383 // if (!m->isReversible())
384 // out << "(inferred from Q matrix)" << endl;
385 // else
386 switch (m->getFreqType()) {
387 case FREQ_EMPIRICAL:
388 out << "(empirical counts from alignment)" << endl;
389 break;
390 case FREQ_ESTIMATE:
391 out << "(estimated with maximum likelihood)" << endl;
392 break;
393 case FREQ_USER_DEFINED:
394 out << ((aln->seq_type == SEQ_PROTEIN) ? "(model)" : "(user-defined)") << endl;
395 break;
396 case FREQ_EQUAL:
397 out << "(equal frequencies)" << endl;
398 break;
399 default:
400 break;
401 }
402 out << endl;
403
404 if ((m->getFreqType() != FREQ_USER_DEFINED || aln->seq_type == SEQ_DNA) && m->getFreqType() != FREQ_EQUAL) {
405 double *state_freqs = new double[m->num_states];
406 m->getStateFrequency(state_freqs);
407 int ncols=(aln->seq_type == SEQ_CODON) ? 4 : 1;
408 for (i = 0; i < m->num_states; i++) {
409 out << " pi(" << aln->convertStateBackStr(i) << ") = " << state_freqs[i];
410 if (i % ncols == ncols-1)
411 out << endl;
412 }
413 delete[] state_freqs;
414 out << endl;
415 }
416 if (m->num_states <= 4 || verbose_mode >= VB_MED) {
417 // report Q matrix
418 if (verbose_mode >= VB_MED)
419 out.precision(6);
420 double *q_mat = new double[m->num_states * m->num_states];
421 m->getQMatrix(q_mat);
422
423 out << "Rate matrix Q:" << endl << endl;
424 for (i = 0, k = 0; i < m->num_states; i++) {
425 out << " " << aln->convertStateBackStr(i);
426 for (j = 0; j < m->num_states; j++, k++) {
427 out << " ";
428 out.width(8);
429 out << q_mat[k];
430 }
431 out << endl;
432 }
433 out << endl;
434 delete[] q_mat;
435 }
436 }
437 }
438
reportModel(ofstream & out,PhyloTree & tree)439 void reportModel(ofstream &out, PhyloTree &tree) {
440 // int i, j, k;
441 int i;
442
443 if (tree.getModel()->isMixture() && !tree.getModel()->isPolymorphismAware()) {
444 out << "Mixture model of substitution: " << tree.getModelName() << endl;
445 // out << "Full name: " << tree.getModelName() << endl;
446 ModelSubst *mmodel = tree.getModel();
447 out << endl << " No Component Rate Weight Parameters" << endl;
448 i = 0;
449 int nmix = mmodel->getNMixtures();
450 for (i = 0; i < nmix; i++) {
451 ModelMarkov *m = (ModelMarkov*)mmodel->getMixtureClass(i);
452 out.width(4);
453 out << right << i+1 << " ";
454 out.width(12);
455 out << left << (m)->name << " ";
456 out.width(7);
457 out << (m)->total_num_subst << " ";
458 out.width(7);
459 out << mmodel->getMixtureWeight(i) << " " << (m)->getNameParams() << endl;
460
461 if (tree.aln->seq_type == SEQ_POMO) {
462 out << endl << "Model for mixture component " << i+1 << ": " << (m)->name << endl;
463 reportModel(out, tree.aln, m);
464 }
465 }
466 if (tree.aln->seq_type != SEQ_POMO && tree.aln->seq_type != SEQ_DNA)
467 for (i = 0; i < nmix; i++) {
468 ModelMarkov *m = (ModelMarkov*)mmodel->getMixtureClass(i);
469 if (m->getFreqType() == FREQ_EQUAL || m->getFreqType() == FREQ_USER_DEFINED)
470 continue;
471 out << endl << "Model for mixture component " << i+1 << ": " << (m)->name << endl;
472 reportModel(out, tree.aln, m);
473 }
474 out << endl;
475 } else {
476 out << "Model of substitution: " << tree.getModelName() << endl << endl;
477 reportModel(out, tree.aln, tree.getModel());
478 }
479 }
480
reportRate(ostream & out,PhyloTree & tree)481 void reportRate(ostream &out, PhyloTree &tree) {
482 int i;
483 RateHeterogeneity *rate_model = tree.getRate();
484 out << "Model of rate heterogeneity: " << rate_model->full_name << endl;
485 rate_model->writeInfo(out);
486
487 if (rate_model->getNDiscreteRate() > 1 || rate_model->getPInvar() > 0.0) {
488 out << endl << " Category Relative_rate Proportion" << endl;
489 if (rate_model->getPInvar() > 0.0)
490 out << " 0 0 " << rate_model->getPInvar()
491 << endl;
492 int cats = rate_model->getNDiscreteRate();
493 DoubleVector prop;
494 if (rate_model->getGammaShape() > 0 || rate_model->getPtnCat(0) < 0) {
495 // prop.resize(cats, (1.0 - rate_model->getPInvar()) / rate_model->getNRate());
496 prop.resize(cats);
497 for (i = 0; i < cats; i++)
498 prop[i] = rate_model->getProp(i);
499 } else {
500 prop.resize(cats, 0.0);
501 for (i = 0; i < tree.aln->getNPattern(); i++)
502 prop[rate_model->getPtnCat(i)] += tree.aln->at(i).frequency;
503 for (i = 0; i < cats; i++)
504 prop[i] /= tree.aln->getNSite();
505 }
506 for (i = 0; i < cats; i++) {
507 out << " " << i + 1 << " ";
508 out.width(14);
509 out << left << rate_model->getRate(i) << " " << prop[i];
510 out << endl;
511 }
512 if (rate_model->isGammaRate()) {
513 out << "Relative rates are computed as " << ((rate_model->isGammaRate() == GAMMA_CUT_MEDIAN) ? "MEDIAN" : "MEAN") <<
514 " of the portion of the Gamma distribution falling in the category." << endl;
515 }
516 }
517 /*
518 if (rate_model->getNDiscreteRate() > 1 || rate_model->isSiteSpecificRate())
519 out << endl << "See file " << rate_file << " for site-specific rates and categories" << endl;*/
520 out << endl;
521 }
522
reportTree(ofstream & out,Params & params,PhyloTree & tree,double tree_lh,double lh_variance,double main_tree)523 void reportTree(ofstream &out, Params ¶ms, PhyloTree &tree, double tree_lh, double lh_variance, double main_tree) {
524 int ssize = tree.getAlnNSite();
525 double epsilon = 1.0 / ssize;
526 double totalLen = tree.treeLength();
527 int df = tree.getModelFactory()->getNParameters(BRLEN_OPTIMIZE);
528 double AIC_score, AICc_score, BIC_score;
529 computeInformationScores(tree_lh, df, ssize, AIC_score, AICc_score, BIC_score);
530
531 out << "Log-likelihood of the tree: " << fixed << tree_lh;
532 if (lh_variance > 0.0)
533 out << " (s.e. " << sqrt(lh_variance) << ")";
534 out << endl;
535 out << "Unconstrained log-likelihood (without tree): " << tree.aln->computeUnconstrainedLogL() << endl;
536
537 out << "Number of free parameters (#branches + #model parameters): " << df << endl;
538 // if (ssize > df) {
539 // if (ssize > 40*df)
540 // out << "Akaike information criterion (AIC) score: " << AIC_score << endl;
541 // else
542 // out << "Corrected Akaike information criterion (AICc) score: " << AICc_score << endl;
543 //
544 // out << "Bayesian information criterion (BIC) score: " << BIC_score << endl;
545 // } else
546 out << "Akaike information criterion (AIC) score: " << AIC_score << endl;
547 out << "Corrected Akaike information criterion (AICc) score: " << AICc_score << endl;
548 out << "Bayesian information criterion (BIC) score: " << BIC_score << endl;
549
550 if (ssize <= df && main_tree) {
551
552 out << endl
553 << "**************************** WARNING ****************************" << endl
554 << "Number of parameters (K, model parameters and branch lengths): " << df << endl
555 << "Sample size (n, alignment length): " << ssize << endl << endl
556 << "Given that K>=n, the parameter estimates might be inaccurate." << endl
557 << "Thus, phylogenetic estimates should be interpreted with caution." << endl << endl
558
559 << "Ideally, it is desirable that n >> K. When selecting optimal models," << endl
560 << "1. use AIC or BIC if n > 40K;" << endl
561 << "2. use AICc or BIC if 40K >= n > K;" << endl
562 << "3. be extremely cautious if n <= K" << endl << endl
563
564 << "To improve the situation (3), consider the following options:" << endl
565 << " 1. Increase the sample size (n)" << endl
566 << " 2. Decrease the number of parameters (K) to be estimated. If" << endl
567 << " possible:" << endl
568 << " a. Remove the least important sequences from the alignment" << endl
569 << " b. Specify some of the parameter values for the substitution"<< endl
570 << " model (e.g., the nucleotide or amino acid frequencies)" << endl
571 << " c. Specify some of the parameter values for the rates-across-" << endl
572 << " sites model (e.g., the shape parameter for the discrete" << endl
573 << " Gamma distribution, the proportion of invariable sites, or" << endl
574 << " the rates of change for different rate categories under" << endl
575 << " the FreeRate model)" << endl << endl
576 << "Reference:" << endl
577 << "Burnham KR, Anderson DR (2002). Model Selection and Multimodel" << endl
578 << "Inference: A Practical Information-Theoretic Approach. Springer," << endl
579 << "New York." << endl
580 << "************************ END OF WARNING ***********************" << endl;
581 }
582 out << endl;
583
584 if (tree.aln->seq_type == SEQ_POMO) {
585 int N = tree.aln->virtual_pop_size;
586 out << "NOTE: The branch lengths of PoMo measure mutations and frequency shifts." << endl;
587 out << "To compare PoMo branch lengths to DNA substitution models use the tree length" << endl;
588 out << "measured in substitutions per site." << endl << endl;
589 out << "PoMo branch length = Substitution model branch length * N * N." << endl << endl;
590 out << "Total tree length (sum of branch lengths)" << endl;
591 out << " - measured in number of mutations and frequency shifts per site: " << totalLen << endl;
592 out << " - measured in number of substitutions per site (divided by N^2): " << totalLen / (N * N) << endl;
593 }
594 else out << "Total tree length (sum of branch lengths): " << totalLen << endl;
595
596 double totalLenInternal = tree.treeLengthInternal(epsilon);
597 double totalLenInternalP = totalLenInternal*100.0 / totalLen;
598 if (tree.aln->seq_type == SEQ_POMO) {
599 int N = tree.aln->virtual_pop_size;
600 double totLenIntSub = totalLenInternal/(N * N);
601 out << "Sum of internal branch lengths" << endl;
602 out << "- measured in mutations and frequency shifts per site: " << totalLenInternal << " (" << totalLenInternalP << "% of tree length)" << endl;
603 out << "- measured in substitutions per site: " << totLenIntSub << " (" << totalLenInternalP << "% of tree length)" << endl;
604 out << endl;
605 }
606 else {
607 out << "Sum of internal branch lengths: " << totalLenInternal << " (" << totalLenInternalP << "% of tree length)" << endl;
608 // out << "Sum of internal branch lengths divided by total tree length: "
609 // << totalLenInternal / totalLen << endl;
610 out << endl;
611 }
612
613 if (tree.isMixlen()) {
614 DoubleVector lenvec;
615 tree.treeLengths(lenvec);
616 out << "Class tree lengths: ";
617 for (int i = 0; i < lenvec.size(); i++)
618 out << " " << lenvec[i];
619 out << endl;
620 }
621
622 if (params.partition_type == TOPO_UNLINKED) {
623 out << "Tree topologies are unlinked across partitions, thus no drawing will be displayed here" << endl;
624 out << endl;
625 return;
626 }
627
628 //out << "ZERO BRANCH EPSILON = " << epsilon << endl;
629 int zero_internal_branches = tree.countZeroInternalBranches(NULL, NULL, epsilon);
630 if (zero_internal_branches > 0) {
631 //int zero_internal_branches = tree.countZeroInternalBranches(NULL, NULL, epsilon);
632 /*
633 out << "WARNING: " << zero_branches
634 << " branches of near-zero lengths (<" << epsilon << ") and should be treated with caution!"
635 << endl;
636 */
637 out << "WARNING: " << zero_internal_branches
638 << " near-zero internal branches (<" << epsilon << ") should be treated with caution"
639 << endl;
640 /*
641 cout << endl << "WARNING: " << zero_branches
642 << " branches of near-zero lengths (<" << epsilon << ") and should be treated with caution!"
643 << endl;
644 */
645 out << " Such branches are denoted by '**' in the figure below"
646 << endl << endl;
647 }
648 int long_branches = tree.countLongBranches(NULL, NULL, params.max_branch_length-0.2);
649 if (long_branches > 0) {
650 //stringstream sstr;
651 out << "WARNING: " << long_branches << " too long branches (>"
652 << params.max_branch_length-0.2 << ") should be treated with caution!" << endl;
653 //out << sstr.str();
654 //cout << sstr.str();
655 }
656
657 //<< "Total tree length: " << tree.treeLength() << endl << endl
658 tree.sortTaxa();
659 if (tree.rooted)
660 out << "NOTE: Tree is ROOTED at virtual root '" << tree.root->name << "'" << endl;
661 else
662 out << "NOTE: Tree is UNROOTED although outgroup taxon '" << tree.root->name << "' is drawn at root" << endl;
663
664 if (tree.isSuperTree() && params.partition_type == BRLEN_OPTIMIZE)
665 out << "NOTE: Branch lengths are weighted average over all partitions" << endl
666 << " (weighted by the number of sites in the partitions)" << endl;
667 if (tree.isMixlen())
668 out << "NOTE: Branch lengths are weighted average over heterotachy classes" << endl;
669
670 bool is_codon = tree.aln->seq_type == SEQ_CODON;
671 if (tree.isSuperTree()) {
672 PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
673 is_codon = true;
674 for (PhyloSuperTree::iterator sit = stree->begin(); sit != stree->end(); sit++)
675 if ((*sit)->aln->seq_type != SEQ_CODON) {
676 is_codon = false;
677 break;
678 }
679 }
680 if (is_codon)
681 out << endl << "NOTE: Branch lengths are interpreted as number of nucleotide substitutions per codon site!"
682 << endl << " Rescale them by 1/3 if you want to have #nt substitutions per nt site" << endl;
683 if (main_tree)
684 if (params.aLRT_replicates > 0 || params.gbo_replicates || (params.num_bootstrap_samples && params.compute_ml_tree)) {
685 out << "Numbers in parentheses are ";
686 if (params.aLRT_replicates > 0) {
687 out << "SH-aLRT support (%)";
688 if (params.localbp_replicates)
689 out << " / local bootstrap support (%)";
690 }
691 if (params.aLRT_test)
692 out << " / parametric aLRT support";
693 if (params.aBayes_test)
694 out << " / aBayes support";
695 if (params.num_bootstrap_samples && params.compute_ml_tree) {
696 if (params.aLRT_replicates > 0 || params.aLRT_test || params.aBayes_test)
697 out << " /";
698 out << " standard " << RESAMPLE_NAME << " support (%)";
699 }
700 if (params.gbo_replicates) {
701 if (params.aLRT_replicates > 0 || params.aLRT_test || params.aBayes_test)
702 out << " /";
703 out << " ultrafast " << RESAMPLE_NAME << " support (%)";
704 }
705 out << endl;
706 }
707 out << endl;
708
709 //tree.setExtendedFigChar();
710 tree.setRootNode(params.root, true);
711 tree.drawTree(out, WT_BR_SCALE, epsilon);
712
713 out << "Tree in newick format:";
714 if (tree.isMixlen())
715 out << " (class branch lengths are given in [...] and separated by '/' )";
716 if (tree.aln->seq_type == SEQ_POMO)
717 out << " (measured in mutations and frequency shifts)";
718 out << endl << endl;
719
720 tree.printTree(out, WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA);
721 out << endl << endl;
722
723 if (tree.aln->seq_type == SEQ_POMO) {
724 out << "Tree in newick format (measured in substitutions, see above):" << endl;
725 out << "WARNING: Only for comparison with substitution models." << endl;
726 out << " These are NOT the branch lengths inferred by PoMo." << endl << endl;
727 double len_scale_old = tree.len_scale;
728 int N = tree.aln->virtual_pop_size;
729 tree.len_scale = 1.0/(N*N);
730 tree.printTree(out, WT_BR_SCALE | WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA);
731 tree.len_scale = len_scale_old;
732 out << endl << endl;
733 }
734
735 tree.setRootNode(params.root, false);
736
737 }
738
reportCredits(ofstream & out)739 void reportCredits(ofstream &out) {
740 out << "CREDITS" << endl << "-------" << endl << endl
741 << "Some parts of the code were taken from the following packages/libraries:"
742 << endl << endl
743 << "Schmidt HA, Strimmer K, Vingron M, and von Haeseler A (2002)" << endl
744 << "TREE-PUZZLE: maximum likelihood phylogenetic analysis using quartets" << endl
745 << "and parallel computing. Bioinformatics, 18(3):502-504." << endl << endl
746
747 //<< "The source code to construct the BIONJ tree were taken from BIONJ software:"
748 //<< endl << endl
749 << "Gascuel O (1997) BIONJ: an improved version of the NJ algorithm" << endl
750 << "based on a simple model of sequence data. Mol. Bio. Evol., 14:685-695." << endl << endl
751
752 //<< "The Nexus file parser was taken from the Nexus Class Library:"
753 //<< endl << endl
754 << "Paul O. Lewis (2003) NCL: a C++ class library for interpreting data files in" << endl
755 << "NEXUS format. Bioinformatics, 19(17):2330-2331." << endl << endl
756
757 << "Mascagni M and Srinivasan A (2000) Algorithm 806: SPRNG: A Scalable Library" << endl
758 << "for Pseudorandom Number Generation. ACM Transactions on Mathematical Software," << endl
759 << "26: 436-461." << endl << endl
760
761 << "Guennebaud G, Jacob B, et al. (2010) Eigen v3. http://eigen.tuxfamily.org" << endl << endl;
762 /*
763 << "The Modeltest 3.7 source codes were taken from:" << endl << endl
764 << "David Posada and Keith A. Crandall (1998) MODELTEST: testing the model of"
765 << endl << "DNA substitution. Bioinformatics, 14(9):817-8." << endl
766 */
767 }
768
769 /***********************************************************
770 * CREATE REPORT FILE
771 ***********************************************************/
772 extern StringIntMap pllTreeCounter;
773
774 void exhaustiveSearchGAMMAInvar(Params ¶ms, IQTree &iqtree);
775
776 void searchGAMMAInvarByRestarting(IQTree &iqtree);
777
778 void computeLoglFromUserInputGAMMAInvar(Params ¶ms, IQTree &iqtree);
779
printOutfilesInfo(Params & params,IQTree & tree)780 void printOutfilesInfo(Params ¶ms, IQTree &tree) {
781
782 cout << endl << "Analysis results written to: " << endl;
783 if (!(params.suppress_output_flags & OUT_IQTREE))
784 cout<< " IQ-TREE report: " << params.out_prefix << ".iqtree"
785 << endl;
786 if (params.compute_ml_tree) {
787 if (!(params.suppress_output_flags & OUT_TREEFILE)) {
788 if (params.model_name.find("ONLY") != string::npos || (params.model_name.substr(0,2)=="MF" && params.model_name.substr(0,3)!="MFP"))
789 cout << " Tree used for ModelFinder: " << params.out_prefix << ".treefile" << endl;
790 else {
791 cout << " Maximum-likelihood tree: " << params.out_prefix << ".treefile" << endl;
792 if (params.partition_type == BRLEN_OPTIMIZE && tree.isSuperTree())
793 cout << " Partition trees: " << params.out_prefix << ".parttrees" << endl;
794 }
795 }
796 // if (params.snni && params.write_local_optimal_trees) {
797 // cout << " Locally optimal trees (" << tree.candidateTrees.getNumLocalOptTrees() << "): " << params.out_prefix << ".suboptimal_trees" << endl;
798 // }
799 }
800 if (params.num_runs > 1)
801 cout << " Trees from independent runs: " << params.out_prefix << ".runtrees" << endl;
802
803 if (!params.user_file && params.start_tree == STT_BIONJ) {
804 cout << " BIONJ tree: " << params.out_prefix << ".bionj"
805 << endl;
806 }
807 if (!params.dist_file) {
808 //cout << " Juke-Cantor distances: " << params.out_prefix << ".jcdist" << endl;
809 if (params.compute_ml_dist)
810 cout << " Likelihood distances: " << params.out_prefix
811 << ".mldist" << endl;
812 if (params.print_conaln)
813 cout << " Concatenated alignment: " << params.out_prefix
814 << ".conaln" << endl;
815 }
816 if ((params.model_name.find("TEST") != string::npos || params.model_name.substr(0,2) == "MF") && tree.isSuperTree()) {
817 cout << " Best partitioning scheme: " << params.out_prefix << ".best_scheme.nex" << endl;
818 bool raxml_format_printed = true;
819
820 for (auto it = ((SuperAlignment*)tree.aln)->partitions.begin();
821 it != ((SuperAlignment*)tree.aln)->partitions.end(); it++)
822 if (!(*it)->aln_file.empty()) {
823 raxml_format_printed = false;
824 break;
825 }
826 if (raxml_format_printed)
827 cout << " in RAxML format: " << params.out_prefix << ".best_scheme" << endl;
828 }
829 if ((tree.getRate()->getGammaShape() > 0 || params.partition_file) && params.print_site_rate)
830 cout << " Site-specific rates: " << params.out_prefix << ".rate"
831 << endl;
832
833 if ((tree.getRate()->isSiteSpecificRate() || tree.getRate()->getPtnCat(0) >= 0) && params.print_site_rate)
834 cout << " Site-rates by MH model: " << params.out_prefix << ".rate"
835 << endl;
836
837 if (params.print_site_lh)
838 cout << " Site log-likelihoods: " << params.out_prefix << ".sitelh"
839 << endl;
840
841 if (params.print_partition_lh)
842 cout << " Partition log-likelihoods: " << params.out_prefix << ".partlh"
843 << endl;
844
845 if (params.print_site_prob)
846 cout << " Site probability per rate/mix: " << params.out_prefix << ".siteprob"
847 << endl;
848
849 if (params.print_ancestral_sequence) {
850 cout << " Ancestral state: " << params.out_prefix << ".state" << endl;
851 // cout << " Ancestral sequences: " << params.out_prefix << ".aseq" << endl;
852 }
853
854 if (params.write_intermediate_trees)
855 cout << " All intermediate trees: " << params.out_prefix << ".treels"
856 << endl;
857
858 if (params.writeDistImdTrees) {
859 tree.intermediateTrees.printTrees(string("ditrees"));
860 cout << " Distinct intermediate trees: " << params.out_prefix << ".ditrees" << endl;
861 cout << " Logl of intermediate trees: " << params.out_prefix << ".ditrees_lh" << endl;
862 }
863
864 if (params.gbo_replicates) {
865 cout << endl << "Ultrafast " << RESAMPLE_NAME << " approximation results written to:" << endl;
866 if (!tree.isSuperTreeUnlinked())
867 cout << " Split support values: " << params.out_prefix << ".splits.nex" << endl
868 << " Consensus tree: " << params.out_prefix << ".contree" << endl;
869 if (params.print_ufboot_trees)
870 cout << " UFBoot trees: " << params.out_prefix << ".ufboot" << endl;
871
872 }
873
874 if (!params.treeset_file.empty()) {
875 cout << " Evaluated user trees: " << params.out_prefix << ".trees" << endl;
876
877 if (params.print_tree_lh) {
878 cout << " Tree log-likelihoods: " << params.out_prefix << ".treelh" << endl;
879 }
880 }
881 if (params.lmap_num_quartets >= 0) {
882 cout << " Likelihood mapping plot (SVG): " << params.out_prefix << ".lmap.svg" << endl;
883 cout << " Likelihood mapping plot (EPS): " << params.out_prefix << ".lmap.eps" << endl;
884 }
885 if (!(params.suppress_output_flags & OUT_LOG))
886 cout << " Screen log file: " << params.out_prefix << ".log" << endl;
887 /* if (params.model_name == "WHTEST")
888 cout <<" WH-TEST report: " << params.out_prefix << ".whtest" << endl;*/
889
890 cout << endl;
891
892 }
893
reportPhyloAnalysis(Params & params,IQTree & tree,ModelCheckpoint & model_info)894 void reportPhyloAnalysis(Params ¶ms, IQTree &tree, ModelCheckpoint &model_info) {
895 if (!MPIHelper::getInstance().isMaster()) {
896 return;
897 }
898 if (params.suppress_output_flags & OUT_IQTREE) {
899 printOutfilesInfo(params, tree);
900 return;
901 }
902
903 if (params.count_trees) {
904 // addon: print #distinct trees
905 cout << endl << "NOTE: " << pllTreeCounter.size() << " distinct trees evaluated during whole tree search" << endl;
906
907 IntVector counts;
908 for (StringIntMap::iterator i = pllTreeCounter.begin(); i != pllTreeCounter.end(); i++) {
909 if (i->second > counts.size())
910 counts.resize(i->second+1, 0);
911 counts[i->second]++;
912 }
913 for (IntVector::iterator i2 = counts.begin(); i2 != counts.end(); i2++) {
914 if (*i2 != 0) {
915 cout << "#Trees occurring " << (i2-counts.begin()) << " times: " << *i2 << endl;
916 }
917 }
918 }
919 string outfile = params.out_prefix;
920
921 outfile += ".iqtree";
922 try {
923 ofstream out;
924 out.exceptions(ios::failbit | ios::badbit);
925 out.open(outfile.c_str());
926 out << "IQ-TREE " << iqtree_VERSION_MAJOR << "." << iqtree_VERSION_MINOR
927 << iqtree_VERSION_PATCH << " built " << __DATE__ << endl
928 << endl;
929 if (params.partition_file)
930 out << "Partition file name: " << params.partition_file << endl;
931 if (params.aln_file)
932 out << "Input file name: " << params.aln_file << endl;
933
934 if (params.user_file)
935 out << "User tree file name: " << params.user_file << endl;
936 out << "Type of analysis: ";
937 bool modelfinder = params.model_name.substr(0,4)=="TEST" || params.model_name.substr(0,2) == "MF" || params.model_name.empty();
938 if (modelfinder)
939 out << "ModelFinder";
940 if (params.compute_ml_tree) {
941 if (modelfinder)
942 out << " + ";
943 out << "tree reconstruction";
944 }
945 if (params.num_bootstrap_samples > 0) {
946 if (params.compute_ml_tree)
947 out << " + ";
948 out << "non-parametric " << RESAMPLE_NAME << " (" << params.num_bootstrap_samples
949 << " replicates)";
950 }
951 if (params.gbo_replicates > 0) {
952 out << " + ultrafast " << RESAMPLE_NAME << " (" << params.gbo_replicates << " replicates)";
953 }
954 out << endl;
955 out << "Random seed number: " << params.ran_seed << endl << endl;
956 out << "REFERENCES" << endl << "----------" << endl << endl;
957 reportReferences(params, out);
958
959 out << "SEQUENCE ALIGNMENT" << endl << "------------------" << endl
960 << endl;
961 if (tree.isSuperTree()) {
962 // TODO DS: Changes may be needed here for PoMo.
963 out << "Input data: " << tree.aln->getNSeq()+tree.removed_seqs.size() << " taxa with "
964 << tree.aln->getNSite() << " partitions and "
965 << tree.getAlnNSite() << " total sites ("
966 << ((SuperAlignment*)tree.aln)->computeMissingData()*100 << "% missing data)" << endl << endl;
967
968 PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
969 int namelen = stree->getMaxPartNameLength();
970 int part;
971 out.width(max(namelen+6,10));
972 out << left << " ID Name" << " Type\tSeq\tSite\tUnique\tInfor\tInvar\tConst" << endl;
973 //out << string(namelen+54, '-') << endl;
974 part = 0;
975 for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++, part++) {
976 //out << "FOR PARTITION " << stree->part_info[part].name << ":" << endl << endl;
977 //reportAlignment(out, *((*it)->aln));
978 out.width(4);
979 out << right << part+1 << " ";
980 out.width(max(namelen,4));
981 out << left << (*it)->aln->name << " ";
982 switch ((*it)->aln->seq_type) {
983 case SEQ_BINARY: out << "BIN"; break;
984 case SEQ_CODON: out << "CODON"; break;
985 case SEQ_DNA: out << "DNA"; break;
986 case SEQ_MORPH: out << "MORPH"; break;
987 case SEQ_MULTISTATE: out << "MULTI"; break;
988 case SEQ_PROTEIN: out << "AA"; break;
989 case SEQ_POMO: out << "POMO"; break;
990 case SEQ_UNKNOWN: out << "???"; break;
991 }
992 out << "\t" << (*it)->aln->getNSeq() << "\t" << (*it)->aln->getNSite()
993 << "\t" << (*it)->aln->getNPattern() << "\t" << (*it)->aln->num_informative_sites
994 << "\t" << (*it)->getAlnNSite() - (*it)->aln->num_variant_sites
995 << "\t" << int((*it)->aln->frac_const_sites*(*it)->getAlnNSite()) << endl;
996 }
997 out << endl << "Column meanings:" << endl
998 << " Unique: Number of unique site patterns" << endl
999 << " Infor: Number of parsimony-informative sites" << endl
1000 << " Invar: Number of invariant sites" << endl
1001 << " Const: Number of constant sites (can be subset of invariant sites)" << endl << endl;
1002
1003 } else
1004 reportAlignment(out, *(tree.aln), tree.removed_seqs.size());
1005
1006 out.precision(4);
1007 out << fixed;
1008
1009 if (!model_info.empty()) {
1010 out << "ModelFinder" << endl << "-----------" << endl << endl;
1011 // if (tree.isSuperTree())
1012 // pruneModelInfo(model_info, (PhyloSuperTree*)&tree);
1013 reportModelSelection(out, params, &model_info, &tree);
1014 }
1015
1016 out << "SUBSTITUTION PROCESS" << endl << "--------------------" << endl
1017 << endl;
1018 if (tree.isSuperTree()) {
1019 if(params.partition_type == BRLEN_SCALE)
1020 out << "Edge-linked-proportional partition model with ";
1021 else if(params.partition_type == BRLEN_FIX)
1022 out << "Edge-linked-equal partition model with ";
1023 else if (params.partition_type == BRLEN_OPTIMIZE)
1024 out << "Edge-unlinked partition model with ";
1025 else
1026 out << "Topology-unlinked partition model with ";
1027
1028 if (params.model_joint)
1029 out << "joint substitution model ";
1030 else
1031 out << "separate substitution models ";
1032 if (params.link_alpha)
1033 out << "and joint gamma shape";
1034 else
1035 out << "and separate rates across sites";
1036 out << endl << endl;
1037
1038 PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
1039 PhyloSuperTree::iterator it;
1040 int part;
1041 if(params.partition_type == BRLEN_OPTIMIZE || params.partition_type == TOPO_UNLINKED)
1042 out << " ID Model TreeLen Parameters" << endl;
1043 else
1044 out << " ID Model Speed Parameters" << endl;
1045 //out << "-------------------------------------" << endl;
1046 for (it = stree->begin(), part = 0; it != stree->end(); it++, part++) {
1047 out.width(4);
1048 out << right << (part+1) << " ";
1049 out.width(14);
1050 if(params.partition_type == BRLEN_OPTIMIZE || params.partition_type == TOPO_UNLINKED)
1051 out << left << (*it)->getModelName() << " " << (*it)->treeLength() << " " << (*it)->getModelNameParams() << endl;
1052 else
1053 out << left << (*it)->getModelName() << " " << stree->part_info[part].part_rate << " " << (*it)->getModelNameParams() << endl;
1054 }
1055 out << endl;
1056 /*
1057 for (it = stree->begin(), part = 0; it != stree->end(); it++, part++) {
1058 reportModel(out, *(*it));
1059 reportRate(out, *(*it));
1060 }*/
1061 PartitionModel *part_model = (PartitionModel*)tree.getModelFactory();
1062 for (auto itm = part_model->linked_models.begin(); itm != part_model->linked_models.end(); itm++) {
1063 for (it = stree->begin(); it != stree->end(); it++)
1064 if ((*it)->getModel() == itm->second) {
1065 out << "Linked model of substitution: " << itm->second->getName() << endl << endl;
1066 bool fixed = (*it)->getModel()->fixParameters(false);
1067 reportModel(out, (*it)->aln, (*it)->getModel());
1068 (*it)->getModel()->fixParameters(fixed);
1069 break;
1070 }
1071 }
1072 } else {
1073 reportModel(out, tree);
1074 reportRate(out, tree);
1075 }
1076
1077 if (params.lmap_num_quartets >= 0) {
1078 tree.reportLikelihoodMapping(out);
1079 }
1080
1081
1082 /*
1083 out << "RATE HETEROGENEITY" << endl << "------------------" << endl
1084 << endl;
1085 if (tree.isSuperTree()) {
1086 PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
1087 int part = 0;
1088 for (PhyloSuperTree::iterator it = stree->begin();
1089 it != stree->end(); it++, part++) {
1090 out << "FOR PARTITION " << stree->part_info[part].name << ":"
1091 << endl << endl;
1092 reportRate(out, *(*it));
1093 }
1094 } else
1095 reportRate(out, tree);
1096 */
1097 // Bootstrap analysis:
1098 //Display as outgroup: a
1099
1100 if (params.model_name == "WHTEST") {
1101 out << "TEST OF MODEL HOMOGENEITY" << endl
1102 << "-------------------------" << endl << endl;
1103 out << "Delta of input data: "
1104 << params.whtest_delta << endl;
1105 out << ".95 quantile of Delta distribution: "
1106 << params.whtest_delta_quantile << endl;
1107 out << "Number of simulations performed: "
1108 << params.whtest_simulations << endl;
1109 out << "P-value: "
1110 << params.whtest_p_value << endl;
1111 if (params.whtest_p_value < 0.05) {
1112 out
1113 << "RESULT: Homogeneity assumption is rejected (p-value cutoff 0.05)"
1114 << endl;
1115 } else {
1116 out
1117 << "RESULT: Homogeneity assumption is NOT rejected (p-value cutoff 0.05)"
1118 << endl;
1119 }
1120 out << endl << "*** For this result please cite:" << endl << endl;
1121 out
1122 << "G. Weiss and A. von Haeseler (2003) Testing substitution models"
1123 << endl
1124 << "within a phylogenetic tree. Mol. Biol. Evol, 20(4):572-578"
1125 << endl << endl;
1126 }
1127
1128 if (params.num_runs > 1) {
1129 out << "MULTIPLE RUNS" << endl
1130 << "-------------" << endl << endl;
1131 out << "Run logL" << endl;
1132 DoubleVector runLnL;
1133 tree.getCheckpoint()->getVector("runLnL", runLnL);
1134 for (int run = 0; run < runLnL.size(); run++)
1135 out << run+1 << "\t" << fixed << runLnL[run] << endl;
1136 out << endl;
1137 }
1138 /*
1139 out << "TREE SEARCH" << endl << "-----------" << endl << endl
1140 << "Stopping rule: "
1141 << ((params.stop_condition == SC_STOP_PREDICT) ? "Yes" : "No")
1142 << endl << "Number of iterations: "
1143 << tree.stop_rule.getNumIterations() << endl
1144 << "Probability of deleting sequences: " << params.p_delete
1145 << endl << "Number of representative leaves: "
1146 << params.k_representative << endl
1147 << "NNI log-likelihood cutoff: " << tree.getNNICutoff() << endl
1148 << endl;
1149 */
1150 if (params.compute_ml_tree) {
1151 if (params.model_name.find("ONLY") != string::npos || (params.model_name.substr(0,2) == "MF" && params.model_name.substr(0,3) != "MFP")) {
1152 out << "TREE USED FOR ModelFinder" << endl
1153 << "-------------------------" << endl << endl;
1154 } else if (params.min_iterations == 0) {
1155 if (params.user_file)
1156 out << "USER TREE" << endl
1157 << "---------" << endl << endl;
1158 else
1159 out << "STARTING TREE" << endl
1160 << "-------------" << endl << endl;
1161 } else {
1162 out << "MAXIMUM LIKELIHOOD TREE" << endl
1163 << "-----------------------" << endl << endl;
1164 }
1165
1166 tree.setRootNode(params.root);
1167
1168 if (params.gbo_replicates) {
1169 if (tree.boot_consense_logl > tree.getBestScore() + 0.1 && !tree.isSuperTreeUnlinked()) {
1170 out << endl << "**NOTE**: Consensus tree has higher likelihood than ML tree found! Please use consensus tree below." << endl;
1171 }
1172 }
1173
1174 reportTree(out, params, tree, tree.getBestScore(), tree.logl_variance, true);
1175
1176 if (tree.isSuperTree() && verbose_mode >= VB_MED) {
1177 PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
1178 // stree->mapTrees();
1179 // int empty_branches = stree->countEmptyBranches();
1180 // if (empty_branches) {
1181 // stringstream ss;
1182 // ss << empty_branches << " branches in the overall tree with no phylogenetic information due to missing data!";
1183 // outWarning(ss.str());
1184 // }
1185
1186 int part = 0;
1187 for (PhyloSuperTree::iterator it = stree->begin();
1188 it != stree->end(); it++, part++) {
1189 out << "FOR PARTITION " << (*it)->aln->name
1190 << ":" << endl << endl;
1191 (*it)->setRootNode(params.root);
1192 // reportTree(out, params, *(*it), (*it)->computeLikelihood(), (*it)->computeLogLVariance(), false);
1193 reportTree(out, params, *(*it), stree->part_info[part].cur_score, 0.0, false);
1194 }
1195 }
1196
1197 }
1198 /*
1199 if (params.write_intermediate_trees) {
1200 out << endl << "CONSENSUS OF INTERMEDIATE TREES" << endl << "-----------------------" << endl << endl
1201 << "Number of intermediate trees: " << tree.stop_rule.getNumIterations() << endl
1202 << "Split threshold: " << params.split_threshold << endl
1203 << "Burn-in: " << params.tree_burnin << endl << endl;
1204 }*/
1205
1206 if (params.consensus_type == CT_CONSENSUS_TREE && !tree.isSuperTreeUnlinked()) {
1207 out << "CONSENSUS TREE" << endl << "--------------" << endl << endl;
1208 out << "Consensus tree is constructed from "
1209 << (params.num_bootstrap_samples ? params.num_bootstrap_samples : params.gbo_replicates)
1210 << " " << RESAMPLE_NAME << " trees";
1211 if (params.gbo_replicates || params.num_bootstrap_samples) {
1212 out << endl << "Log-likelihood of consensus tree: " << fixed << tree.boot_consense_logl;
1213 }
1214 string con_file = params.out_prefix;
1215 con_file += ".contree";
1216
1217 // -- Mon Apr 17 21:14:53 BST 2017
1218 // DONE Minh: merged correctly
1219 if (params.compute_ml_tree)
1220 out << endl << "Robinson-Foulds distance between ML tree and consensus tree: "
1221 << tree.contree_rfdist << endl;
1222 // --
1223
1224 out << endl << "Branches with support >"
1225 << floor(params.split_threshold * 1000) / 10 << "% are kept";
1226 if (params.split_threshold == 0.0)
1227 out << " (extended consensus)";
1228 if (params.split_threshold == 0.5)
1229 out << " (majority-rule consensus)";
1230 if (params.split_threshold >= 0.99)
1231 out << " (strict consensus)";
1232
1233 out << endl << "Branch lengths are optimized by maximum likelihood on original alignment" << endl;
1234 out << "Numbers in parentheses are " << RESAMPLE_NAME << " supports (%)" << endl << endl;
1235
1236 bool rooted = false;
1237 MTree contree;
1238 contree.readTree(con_file.c_str(), rooted);
1239 contree.drawTree(out, WT_BR_SCALE);
1240 out << endl << "Consensus tree in newick format: " << endl << endl;
1241 contree.printTree(out);
1242 out << endl << endl;
1243 // tree.freeNode();
1244 // tree.root = NULL;
1245 // tree.readTree(con_file.c_str(), rooted);
1246 // if (removed_seqs.size() > 0) {
1247 // tree.reinsertIdenticalSeqs(tree.aln, removed_seqs, twin_seqs);
1248 // }
1249 // tree.setAlignment(tree.aln);
1250
1251 // bug fix
1252 // if ((tree.sse == LK_EIGEN || tree.sse == LK_EIGEN_SSE) && !tree.isBifurcating()) {
1253 // cout << "NOTE: Changing to old kernel as consensus tree is multifurcating" << endl;
1254 // tree.changeLikelihoodKernel(LK_SSE);
1255 // }
1256
1257 // tree.initializeAllPartialLh();
1258 // tree.fixNegativeBranch(false);
1259 // if (tree.isSuperTree())
1260 // ((PhyloSuperTree*) &tree)->mapTrees();
1261 // tree.optimizeAllBranches();
1262 // tree.printTree(con_file.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA);
1263 // tree.sortTaxa();
1264 // tree.drawTree(out, WT_BR_SCALE);
1265 // out << endl << "Consensus tree in newick format: " << endl << endl;
1266 // tree.printResultTree(out);
1267 // out << endl << endl;
1268 }
1269 #ifdef IQTREE_TERRAPHAST
1270 if (params.terrace_analysis && params.compute_ml_tree) {
1271
1272 out << "TERRACE ANALYSIS" << endl << "----------------" << endl << endl;
1273 cout << "Running additional analysis: Phylogenetic Terraces ..."<< endl;
1274
1275 string filename = params.out_prefix;
1276 filename += ".terrace";
1277
1278 try
1279 {
1280 Terrace terrace(tree, (SuperAlignment*)(tree.aln));
1281
1282 uint64_t terrace_size = terrace.getSize();
1283
1284 if (terrace_size == 1) {
1285 out << "The tree does not lie on a terrace." << endl;
1286 } else {
1287 out << "The tree lies on a terrace of size ";
1288
1289 if (terrace_size == UINT64_MAX) {
1290 out << "at least " << terrace_size << " (integer overflow)";
1291 } else {
1292 out << terrace_size;
1293 }
1294
1295 out << endl;
1296
1297 ofstream terraceout;
1298 terraceout.open(filename.c_str());
1299
1300 terrace.printTreesCompressed(terraceout);
1301
1302 terraceout.close();
1303
1304 out << "Terrace trees written (in compressed Newick format) to " << filename << endl;
1305 }
1306 }
1307 catch (std::exception& e)
1308 {
1309 out << "ERROR: Terrace analysis using Terraphast failed: " << e.what() << endl << endl;
1310 }
1311
1312 out << endl;
1313 out << "For documentation, see the technical supplement to Biczok et al. (2018)" << endl;
1314 out << "https://doi.org/10.1093/bioinformatics/bty384";
1315
1316 out << endl << endl;
1317 cout<< "Done. Results are written in "<<params.out_prefix<<".iqtree file."<<endl;
1318 }
1319 #endif
1320 /* evaluate user trees */
1321 vector<TreeInfo> info;
1322 IntVector distinct_trees;
1323 if (!params.treeset_file.empty()) {
1324 evaluateTrees(params.treeset_file, params, &tree, info, distinct_trees);
1325 out.precision(4);
1326 out.setf(ios_base::fixed);
1327
1328 out << endl << "USER TREES" << endl << "----------" << endl << endl;
1329 out << "See " << params.out_prefix << ".trees for trees with branch lengths." << endl << endl;
1330 if (params.topotest_replicates && info.size() > 1) {
1331 if (params.do_au_test && params.topotest_replicates < 10000)
1332 out << "WARNING: Too few replicates for AU test. At least -zb 10000 for reliable results!" << endl << endl;
1333 out << "Tree logL deltaL bp-RELL p-KH p-SH ";
1334 if (params.do_weighted_test)
1335 out << "p-WKH p-WSH ";
1336 out << " c-ELW";
1337 if (params.do_au_test)
1338 out << " p-AU";
1339
1340 out << endl << "------------------------------------------------------------------";
1341 if (params.do_weighted_test)
1342 out << "------------------";
1343 if (params.do_au_test)
1344 out << "-------";
1345 out << endl;
1346 } else {
1347 out << "Tree logL deltaL" << endl;
1348 out << "-------------------------" << endl;
1349
1350 }
1351 double maxL = -DBL_MAX;
1352 int tid, orig_id;
1353 for (tid = 0; tid < info.size(); tid++)
1354 if (info[tid].logl > maxL) maxL = info[tid].logl;
1355 for (orig_id = 0, tid = 0; orig_id < distinct_trees.size(); orig_id++) {
1356 out.width(3);
1357 out << right << orig_id+1 << " ";
1358 if (distinct_trees[orig_id] >= 0) {
1359 out << " = tree " << distinct_trees[orig_id]+1 << endl;
1360 continue;
1361 }
1362 out.unsetf(ios::fixed);
1363 out.precision(10);
1364 out.width(12);
1365 out << info[tid].logl << " ";
1366 out.width(7);
1367 out.precision(5);
1368 out << maxL - info[tid].logl;
1369 if (!params.topotest_replicates || info.size() <= 1) {
1370 out << endl;
1371 tid++;
1372 continue;
1373 }
1374 out.precision(3);
1375 out << " ";
1376 out.width(6);
1377 out << info[tid].rell_bp;
1378 if (info[tid].rell_confident)
1379 out << " + ";
1380 else
1381 out << " - ";
1382 out.width(6);
1383 out << right << info[tid].kh_pvalue;
1384 if (info[tid].kh_pvalue < 0.05)
1385 out << " - ";
1386 else
1387 out << " + ";
1388 out.width(6);
1389 out << right << info[tid].sh_pvalue;
1390 if (info[tid].sh_pvalue < 0.05)
1391 out << " - ";
1392 else
1393 out << " + ";
1394
1395 if (params.do_weighted_test) {
1396 out.width(6);
1397 out << right << info[tid].wkh_pvalue;
1398 if (info[tid].wkh_pvalue < 0.05)
1399 out << " - ";
1400 else
1401 out << " + ";
1402 out.width(6);
1403 out << right << info[tid].wsh_pvalue;
1404 if (info[tid].wsh_pvalue < 0.05)
1405 out << " - ";
1406 else
1407 out << " + ";
1408 }
1409 out.width(9);
1410 out << right << info[tid].elw_value;
1411 if (info[tid].elw_confident)
1412 out << " + ";
1413 else
1414 out << " - ";
1415
1416 if (params.do_au_test) {
1417 out.width(8);
1418 out << right << info[tid].au_pvalue;
1419 if (info[tid].au_pvalue < 0.05)
1420 out << " - ";
1421 else
1422 out << " + ";
1423 }
1424 out.setf(ios::fixed);
1425
1426 out << endl;
1427 tid++;
1428 }
1429 out << endl;
1430
1431 if (params.topotest_replicates) {
1432 out << "deltaL : logL difference from the maximal logl in the set." << endl
1433 << "bp-RELL : bootstrap proportion using RELL method (Kishino et al. 1990)." << endl
1434 << "p-KH : p-value of one sided Kishino-Hasegawa test (1989)." << endl
1435 << "p-SH : p-value of Shimodaira-Hasegawa test (2000)." << endl;
1436 if (params.do_weighted_test) {
1437 out << "p-WKH : p-value of weighted KH test." << endl
1438 << "p-WSH : p-value of weighted SH test." << endl;
1439 }
1440 out << "c-ELW : Expected Likelihood Weight (Strimmer & Rambaut 2002)." << endl;
1441 if (params.do_au_test) {
1442 out << "p-AU : p-value of approximately unbiased (AU) test (Shimodaira, 2002)." << endl;
1443 }
1444 out << endl
1445 << "Plus signs denote the 95% confidence sets." << endl
1446 << "Minus signs denote significant exclusion." << endl
1447 << "All tests performed "
1448 << params.topotest_replicates << " resamplings using the RELL method."<<endl;
1449 }
1450 out << endl;
1451 }
1452
1453
1454 time_t cur_time;
1455 time(&cur_time);
1456
1457 char *date_str;
1458 date_str = ctime(&cur_time);
1459 out.unsetf(ios_base::fixed);
1460 out << "TIME STAMP" << endl << "----------" << endl << endl
1461 << "Date and time: " << date_str << "Total CPU time used: "
1462 << (double) params.run_time << " seconds (" << convert_time(params.run_time) << ")" << endl
1463 << "Total wall-clock time used: " << getRealTime() - params.start_real_time
1464 << " seconds (" << convert_time(getRealTime() - params.start_real_time) << ")" << endl << endl;
1465
1466 //reportCredits(out); // not needed, now in the manual
1467 out.close();
1468
1469 } catch (ios::failure) {
1470 outError(ERR_WRITE_OUTPUT, outfile);
1471 }
1472
1473 printOutfilesInfo(params, tree);
1474 }
1475
checkZeroDist(Alignment * aln,double * dist)1476 void checkZeroDist(Alignment *aln, double *dist) {
1477 int ntaxa = aln->getNSeq();
1478 IntVector checked;
1479 checked.resize(ntaxa, 0);
1480 int i, j;
1481 for (i = 0; i < ntaxa - 1; i++) {
1482 if (checked[i])
1483 continue;
1484 string str = "";
1485 bool first = true;
1486 for (j = i + 1; j < ntaxa; j++)
1487 if (dist[i * ntaxa + j] <= Params::getInstance().min_branch_length) {
1488 if (first)
1489 str = "ZERO distance between sequences "
1490 + aln->getSeqName(i);
1491 str += ", " + aln->getSeqName(j);
1492 checked[j] = 1;
1493 first = false;
1494 }
1495 checked[i] = 1;
1496 if (str != "")
1497 outWarning(str);
1498 }
1499 }
1500
1501
printAnalysisInfo(int model_df,IQTree & iqtree,Params & params)1502 void printAnalysisInfo(int model_df, IQTree& iqtree, Params& params) {
1503 // if (!params.raxmllib) {
1504 cout << "Model of evolution: ";
1505 if (iqtree.isSuperTree()) {
1506 cout << iqtree.getModelName() << " (" << model_df << " free parameters)" << endl;
1507 } else {
1508 cout << iqtree.getModelName() << " with ";
1509 switch (iqtree.getModel()->getFreqType()) {
1510 case FREQ_EQUAL:
1511 cout << "equal";
1512 break;
1513 case FREQ_EMPIRICAL:
1514 cout << "counted";
1515 break;
1516 case FREQ_USER_DEFINED:
1517 cout << "user-defined";
1518 break;
1519 case FREQ_ESTIMATE:
1520 cout << "optimized";
1521 break;
1522 case FREQ_CODON_1x4:
1523 cout << "counted 1x4";
1524 break;
1525 case FREQ_CODON_3x4:
1526 cout << "counted 3x4";
1527 break;
1528 case FREQ_CODON_3x4C:
1529 cout << "counted 3x4-corrected";
1530 break;
1531 case FREQ_DNA_RY:
1532 cout << "constrained A+G=C+T";
1533 break;
1534 case FREQ_DNA_WS:
1535 cout << "constrained A+T=C+G";
1536 break;
1537 case FREQ_DNA_MK:
1538 cout << "constrained A+C=G+T";
1539 break;
1540 case FREQ_DNA_1112:
1541 cout << "constrained A=C=G";
1542 break;
1543 case FREQ_DNA_1121:
1544 cout << "constrained A=C=T";
1545 break;
1546 case FREQ_DNA_1211:
1547 cout << "constrained A=G=T";
1548 break;
1549 case FREQ_DNA_2111:
1550 cout << "constrained C=G=T";
1551 break;
1552 case FREQ_DNA_1122:
1553 cout << "constrained A=C,G=T";
1554 break;
1555 case FREQ_DNA_1212:
1556 cout << "constrained A=G,C=T";
1557 break;
1558 case FREQ_DNA_1221:
1559 cout << "constrained A=T,C=G";
1560 break;
1561 case FREQ_DNA_1123:
1562 cout << "constrained A=C";
1563 break;
1564 case FREQ_DNA_1213:
1565 cout << "constrained A=G";
1566 break;
1567 case FREQ_DNA_1231:
1568 cout << "constrained A=T";
1569 break;
1570 case FREQ_DNA_2113:
1571 cout << "constrained C=G";
1572 break;
1573 case FREQ_DNA_2131:
1574 cout << "constrained C=T";
1575 break;
1576 case FREQ_DNA_2311:
1577 cout << "constrained G=T";
1578 break;
1579 default:
1580 outError("Wrong specified state frequencies");
1581 }
1582 cout << " frequencies (" << model_df << " free parameters)" << endl;
1583 }
1584 cout << "Fixed branch lengths: "
1585 << ((params.fixed_branch_length) ? "Yes" : "No") << endl;
1586
1587 if (params.min_iterations > 0) {
1588 cout << "Tree search algorithm: " << (params.snni ? "Stochastic nearest neighbor interchange" : "IQPNNI") << endl;
1589 cout << "Termination condition: ";
1590 if (params.stop_condition == SC_REAL_TIME) {
1591 cout << "after " << params.maxtime << " minutes" << endl;
1592 } else if (params.stop_condition == SC_UNSUCCESS_ITERATION) {
1593 cout << "after " << params.unsuccess_iteration << " unsuccessful iterations" << endl;
1594 } else if (params.stop_condition == SC_FIXED_ITERATION) {
1595 cout << params.min_iterations << " iterations" << endl;
1596 } else if(params.stop_condition == SC_WEIBULL) {
1597 cout << "predicted in [" << params.min_iterations << ","
1598 << params.max_iterations << "] (confidence "
1599 << params.stop_confidence << ")" << endl;
1600 } else if (params.stop_condition == SC_BOOTSTRAP_CORRELATION) {
1601 cout << "min " << params.min_correlation << " correlation coefficient" << endl;
1602 }
1603
1604 if (!params.snni) {
1605 cout << "Number of representative leaves : " << params.k_representative << endl;
1606 cout << "Probability of deleting sequences: " << iqtree.getProbDelete() << endl;
1607 cout << "Number of leaves to be deleted : " << iqtree.getDelete() << endl;
1608 cout << "Important quartets assessed on: "
1609 << ((params.iqp_assess_quartet == IQP_DISTANCE) ?
1610 "Distance" : ((params.iqp_assess_quartet == IQP_PARSIMONY) ? "Parsimony" : "Bootstrap"))
1611 << endl;
1612 }
1613 cout << "NNI assessed on: " << ((params.nni5) ? "5 branches" : "1 branch") << endl;
1614 }
1615 cout << "Phylogenetic likelihood library: " << (params.pll ? "Yes" : "No") << endl;
1616 if (params.fixed_branch_length != BRLEN_FIX)
1617 cout << "Branch length optimization method: "
1618 << ((iqtree.optimize_by_newton) ? "Newton" : "Brent") << endl;
1619 cout << "Number of Newton-Raphson steps in NNI evaluation and branch length optimization: " << NNI_MAX_NR_STEP
1620 << " / " << PLL_NEWZPERCYCLE << endl;
1621 cout << "SSE instructions: "
1622 << ((iqtree.sse) ? "Yes" : "No") << endl;
1623 cout << endl;
1624 }
1625
computeMLDist(Params & params,IQTree & iqtree,double begin_time)1626 void computeMLDist(Params& params, IQTree& iqtree, double begin_time) {
1627 double longest_dist;
1628 // stringstream best_tree_string;
1629 // iqtree.printTree(best_tree_string, WT_BR_LEN + WT_TAXON_ID);
1630 cout << "Computing ML distances based on estimated model parameters...";
1631 double *ml_dist = NULL;
1632 double *ml_var = NULL;
1633 longest_dist = iqtree.computeDist(params, iqtree.aln, ml_dist, ml_var, iqtree.dist_file);
1634 cout << " " << (getCPUTime() - begin_time) << " sec" << endl;
1635
1636 double max_genetic_dist = MAX_GENETIC_DIST;
1637 if (iqtree.aln->seq_type == SEQ_POMO) {
1638 int N = iqtree.aln->virtual_pop_size;
1639 max_genetic_dist *= N * N;
1640 }
1641 if (longest_dist > max_genetic_dist * 0.99) {
1642 outWarning("Some pairwise ML distances are too long (saturated)");
1643 //cout << "Some ML distances are too long, using old distances..." << endl;
1644 } //else
1645 {
1646 if ( !iqtree.dist_matrix ) {
1647 iqtree.dist_matrix = new double[iqtree.aln->getNSeq() * iqtree.aln->getNSeq()];
1648 }
1649 if ( !iqtree.var_matrix ) {
1650 iqtree.var_matrix = new double[iqtree.aln->getNSeq() * iqtree.aln->getNSeq()];
1651 }
1652 memmove(iqtree.dist_matrix, ml_dist,
1653 sizeof (double) * iqtree.aln->getNSeq() * iqtree.aln->getNSeq());
1654 memmove(iqtree.var_matrix, ml_var,
1655 sizeof(double) * iqtree.aln->getNSeq() * iqtree.aln->getNSeq());
1656 }
1657 delete[] ml_dist;
1658 delete[] ml_var;
1659 }
1660
computeInitialDist(Params & params,IQTree & iqtree)1661 void computeInitialDist(Params ¶ms, IQTree &iqtree) {
1662 double longest_dist;
1663 if (params.dist_file) {
1664 cout << "Reading distance matrix file " << params.dist_file << " ..." << endl;
1665 } else if (params.compute_jc_dist) {
1666 cout << "Computing Juke-Cantor distances..." << endl;
1667 } else if (params.compute_obs_dist) {
1668 cout << "Computing observed distances..." << endl;
1669 }
1670
1671 if (params.compute_jc_dist || params.compute_obs_dist || params.partition_file) {
1672 longest_dist = iqtree.computeDist(params, iqtree.aln, iqtree.dist_matrix, iqtree.var_matrix, iqtree.dist_file);
1673 checkZeroDist(iqtree.aln, iqtree.dist_matrix);
1674
1675 double max_genetic_dist = MAX_GENETIC_DIST;
1676 if (iqtree.aln->seq_type == SEQ_POMO) {
1677 int N = iqtree.aln->virtual_pop_size;
1678 max_genetic_dist *= N * N;
1679 }
1680 if (longest_dist > max_genetic_dist * 0.99) {
1681 outWarning("Some pairwise distances are too long (saturated)");
1682 }
1683 }
1684
1685 }
1686
initializeParams(Params & params,IQTree & iqtree)1687 void initializeParams(Params ¶ms, IQTree &iqtree)
1688 {
1689 // iqtree.setCurScore(-DBL_MAX);
1690 bool ok_tree = iqtree.root;
1691 if (iqtree.isSuperTreeUnlinked())
1692 ok_tree = ((PhyloSuperTree*)&iqtree)->front()->root;
1693 if (!ok_tree)
1694 {
1695 // compute initial tree
1696 iqtree.computeInitialTree(params.SSE);
1697 }
1698 ASSERT(iqtree.aln);
1699
1700 if (iqtree.aln->model_name == "WHTEST") {
1701 if (iqtree.aln->seq_type != SEQ_DNA)
1702 outError("Weiss & von Haeseler test of model homogeneity only works for DNA");
1703 iqtree.aln->model_name = "GTR+G";
1704 }
1705
1706 if (params.gbo_replicates)
1707 params.speed_conf = 1.0;
1708
1709 // TODO: check if necessary
1710 // if (iqtree.isSuperTree())
1711 // ((PhyloSuperTree*) &iqtree)->mapTrees();
1712
1713 // set parameter for the current tree
1714 // iqtree.setParams(params);
1715 }
1716
1717
pruneTaxa(Params & params,IQTree & iqtree,double * pattern_lh,NodeVector & pruned_taxa,StrVector & linked_name)1718 void pruneTaxa(Params ¶ms, IQTree &iqtree, double *pattern_lh, NodeVector &pruned_taxa, StrVector &linked_name) {
1719 int num_low_support;
1720 double mytime;
1721
1722 if (params.aLRT_threshold <= 100 && (params.aLRT_replicates > 0 || params.localbp_replicates > 0)) {
1723 mytime = getCPUTime();
1724 cout << "Testing tree branches by SH-like aLRT with " << params.aLRT_replicates << " replicates..." << endl;
1725 iqtree.setRootNode(params.root);
1726 double curScore = iqtree.getCurScore();
1727 iqtree.computePatternLikelihood(pattern_lh, &curScore);
1728 num_low_support = iqtree.testAllBranches(params.aLRT_threshold, curScore,
1729 pattern_lh, params.aLRT_replicates, params.localbp_replicates, params.aLRT_test, params.aBayes_test);
1730 iqtree.printResultTree();
1731 cout << " " << getCPUTime() - mytime << " sec." << endl;
1732 cout << num_low_support << " branches show low support values (<= " << params.aLRT_threshold << "%)" << endl;
1733
1734 //tree.drawTree(cout);
1735 cout << "Collapsing stable clades..." << endl;
1736 iqtree.collapseStableClade(params.aLRT_threshold, pruned_taxa, linked_name, iqtree.dist_matrix);
1737 cout << pruned_taxa.size() << " taxa were pruned from stable clades" << endl;
1738 }
1739
1740 if (!pruned_taxa.empty()) {
1741 cout << "Pruned alignment contains " << iqtree.aln->getNSeq()
1742 << " sequences and " << iqtree.aln->getNSite() << " sites and "
1743 << iqtree.aln->getNPattern() << " patterns" << endl;
1744 //tree.clearAllPartialLh();
1745 iqtree.initializeAllPartialLh();
1746 iqtree.clearAllPartialLH();
1747 iqtree.setCurScore(iqtree.optimizeAllBranches());
1748 //cout << "Log-likelihood after reoptimizing model parameters: " << tree.curScore << endl;
1749 // pair<int, int> nniInfo = iqtree.optimizeNNI();
1750 iqtree.optimizeNNI();
1751 cout << "Log-likelihood after optimizing partial tree: "
1752 << iqtree.getCurScore() << endl;
1753 }
1754
1755 }
1756
restoreTaxa(IQTree & iqtree,double * saved_dist_mat,NodeVector & pruned_taxa,StrVector & linked_name)1757 void restoreTaxa(IQTree &iqtree, double *saved_dist_mat, NodeVector &pruned_taxa, StrVector &linked_name) {
1758 if (!pruned_taxa.empty()) {
1759 cout << "Restoring full tree..." << endl;
1760 iqtree.restoreStableClade(iqtree.aln, pruned_taxa, linked_name);
1761 delete[] iqtree.dist_matrix;
1762 iqtree.dist_matrix = saved_dist_mat;
1763 iqtree.initializeAllPartialLh();
1764 iqtree.clearAllPartialLH();
1765 iqtree.setCurScore(iqtree.optimizeAllBranches());
1766 //cout << "Log-likelihood after reoptimizing model parameters: " << tree.curScore << endl;
1767 pair<int, int> nniInfo;
1768 nniInfo = iqtree.optimizeNNI();
1769 cout << "Log-likelihood after reoptimizing full tree: " << iqtree.getCurScore() << endl;
1770 //iqtree.setBestScore(iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.model_eps));
1771
1772 }
1773 }
runApproximateBranchLengths(Params & params,IQTree & iqtree)1774 void runApproximateBranchLengths(Params ¶ms, IQTree &iqtree) {
1775 if (!params.fixed_branch_length && params.leastSquareBranch) {
1776 cout << endl << "Computing Least Square branch lengths..." << endl;
1777 iqtree.optimizeAllBranchesLS();
1778 iqtree.clearAllPartialLH();
1779 iqtree.setCurScore(iqtree.computeLikelihood());
1780 string filename = params.out_prefix;
1781 filename += ".lstree";
1782 iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
1783 cout << "Logl of tree with LS branch lengths: " << iqtree.getCurScore() << endl;
1784 cout << "Tree with LS branch lengths written to " << filename << endl;
1785 if (params.print_branch_lengths) {
1786 if (params.manuel_analytic_approx) {
1787 cout << "Applying Manuel's analytic approximation.." << endl;
1788 iqtree.approxAllBranches();
1789 }
1790 ofstream out;
1791 filename = params.out_prefix;
1792 filename += ".lsbrlen";
1793 out.open(filename.c_str());
1794 iqtree.printBranchLengths(out);
1795 out.close();
1796 cout << "LS Branch lengths written to " << filename << endl;
1797 }
1798 cout << "Total LS tree length: " << iqtree.treeLength() << endl;
1799 }
1800
1801 if (params.pars_branch_length) {
1802 cout << endl << "Computing parsimony branch lengths..." << endl;
1803 iqtree.fixNegativeBranch(true);
1804 iqtree.clearAllPartialLH();
1805 iqtree.setCurScore(iqtree.computeLikelihood());
1806 string filename = params.out_prefix;
1807 filename += ".mptree";
1808 iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
1809 cout << "Logl of tree with MP branch lengths: " << iqtree.getCurScore() << endl;
1810 cout << "Tree with MP branch lengths written to " << filename << endl;
1811 if (params.print_branch_lengths) {
1812 ofstream out;
1813 filename = params.out_prefix;
1814 filename += ".mpbrlen";
1815 out.open(filename.c_str());
1816 iqtree.printBranchLengths(out);
1817 out.close();
1818 cout << "MP Branch lengths written to " << filename << endl;
1819 }
1820 cout << "Total MP tree length: " << iqtree.treeLength() << endl;
1821
1822 }
1823
1824 if (params.bayes_branch_length) {
1825 cout << endl << "Computing Bayesian branch lengths..." << endl;
1826 iqtree.computeAllBayesianBranchLengths();
1827 iqtree.clearAllPartialLH();
1828 iqtree.setCurScore(iqtree.computeLikelihood());
1829 string filename = params.out_prefix;
1830 filename += ".batree";
1831 iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
1832 cout << "Logl of tree with Bayesian branch lengths: " << iqtree.getCurScore() << endl;
1833 cout << "Tree with Bayesian branch lengths written to " << filename << endl;
1834 if (params.print_branch_lengths) {
1835 ofstream out;
1836 filename = params.out_prefix;
1837 filename += ".babrlen";
1838 out.open(filename.c_str());
1839 iqtree.printBranchLengths(out);
1840 out.close();
1841 cout << "Bayesian Branch lengths written to " << filename << endl;
1842 }
1843 cout << "Total Bayesian tree length: " << iqtree.treeLength() << endl;
1844
1845 }
1846
1847 }
1848
printSiteRates(IQTree & iqtree,const char * rate_file,bool bayes)1849 void printSiteRates(IQTree &iqtree, const char *rate_file, bool bayes) {
1850 try {
1851 ofstream out;
1852 out.exceptions(ios::failbit | ios::badbit);
1853 out.open(rate_file);
1854 out << "# Site-specific subtitution rates determined by ";
1855 if (bayes)
1856 out<< "empirical Bayesian method" << endl;
1857 else
1858 out<< "maximum likelihood" << endl;
1859 out << "# This file can be read in MS Excel or in R with command:" << endl
1860 << "# tab=read.table('" << rate_file << "',header=TRUE)" << endl
1861 << "# Columns are tab-separated with following meaning:" << endl;
1862 if (iqtree.isSuperTree()) {
1863 out << "# Part: Partition ID (1=" << ((PhyloSuperTree*)&iqtree)->front()->aln->name << ", etc)" << endl
1864 << "# Site: Site ID within partition (starting from 1 for each partition)" << endl;
1865 } else
1866 out << "# Site: Alignment site ID" << endl;
1867
1868 if (bayes)
1869 out << "# Rate: Posterior mean site rate weighted by posterior probability" << endl
1870 << "# Cat: Category with highest posterior (0=invariable, 1=slow, etc)" << endl
1871 << "# C_Rate: Corresponding rate of highest category" << endl;
1872 else
1873 out << "# Rate: Site rate estimated by maximum likelihood" << endl;
1874 if (iqtree.isSuperTree())
1875 out << "Part\t";
1876 out << "Site\tRate";
1877 if (bayes)
1878 out << "\tCat\tC_Rate" << endl;
1879 else
1880 out << endl;
1881 iqtree.writeSiteRates(out, bayes);
1882 out.close();
1883 } catch (ios::failure) {
1884 outError(ERR_WRITE_OUTPUT, rate_file);
1885 }
1886 cout << "Site rates printed to " << rate_file << endl;
1887 }
1888
printMiscInfo(Params & params,IQTree & iqtree,double * pattern_lh)1889 void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) {
1890 if (params.print_site_lh && !params.pll) {
1891 string site_lh_file = params.out_prefix;
1892 site_lh_file += ".sitelh";
1893 if (params.print_site_lh == WSL_SITE)
1894 printSiteLh(site_lh_file.c_str(), &iqtree, pattern_lh);
1895 else
1896 printSiteLhCategory(site_lh_file.c_str(), &iqtree, params.print_site_lh);
1897 }
1898
1899 if (params.print_partition_lh && !iqtree.isSuperTree()) {
1900 outWarning("-wpl does not work with non-partition model");
1901 params.print_partition_lh = false;
1902 }
1903 if (params.print_partition_lh && !params.pll) {
1904 string part_lh_file = (string)params.out_prefix + ".partlh";
1905 printPartitionLh(part_lh_file.c_str(), &iqtree, pattern_lh);
1906 }
1907
1908 if (params.print_site_prob && !params.pll) {
1909 printSiteProbCategory(((string)params.out_prefix + ".siteprob").c_str(), &iqtree, params.print_site_prob);
1910 }
1911
1912 if (params.print_ancestral_sequence) {
1913 printAncestralSequences(params.out_prefix, &iqtree, params.print_ancestral_sequence);
1914 }
1915
1916 if (params.print_site_state_freq != WSF_NONE && !params.site_freq_file && !params.tree_freq_file) {
1917 string site_freq_file = params.out_prefix;
1918 site_freq_file += ".sitesf";
1919 printSiteStateFreq(site_freq_file.c_str(), &iqtree);
1920 }
1921
1922 if (params.print_trees_site_posterior) {
1923 cout << "Computing mixture posterior probabilities" << endl;
1924 IntVector pattern_cat;
1925 int num_mix = iqtree.computePatternCategories(&pattern_cat);
1926 cout << num_mix << " mixture components are necessary" << endl;
1927 string site_mix_file = (string)params.out_prefix + ".sitemix";
1928 ofstream out(site_mix_file.c_str());
1929 if (!out.is_open())
1930 outError("File " + site_mix_file + " could not be opened");
1931 out << "Ptn\tFreq\tNumMix" << endl;
1932 int ptn;
1933 for (ptn = 0; ptn < pattern_cat.size(); ptn++)
1934 out << ptn << "\t" << (int)iqtree.ptn_freq[ptn] << "\t" << pattern_cat[ptn] << endl;
1935 out.close();
1936 cout << "Pattern mixtures printed to " << site_mix_file << endl;
1937
1938 site_mix_file = (string)params.out_prefix + ".sitemixall";
1939 out.open(site_mix_file.c_str());
1940 int ncat = iqtree.getRate()->getNRate();
1941 if (iqtree.getModel()->isMixture() && !iqtree.getModelFactory()->fused_mix_rate)
1942 ncat = iqtree.getModel()->getNMixtures();
1943 out << "Ptn\tFreq\tNumMix\tCat" << endl;
1944
1945 int c;
1946 for (ptn = 0; ptn < iqtree.ptn_cat_mask.size(); ptn++) {
1947 int num_cat = popcount_lauradoux((unsigned*)&iqtree.ptn_cat_mask[ptn], 2);
1948 out << ptn << "\t" << (int)iqtree.ptn_freq[ptn] << "\t" << num_cat << "\t";
1949 for (c = 0; c < ncat; c++)
1950 if (iqtree.ptn_cat_mask[ptn] & ((uint64_t)1<<c))
1951 out << "1";
1952 else
1953 out << "0";
1954 out << endl;
1955 }
1956 out.close();
1957 }
1958
1959 if (params.print_branch_lengths) {
1960 if (params.manuel_analytic_approx) {
1961 cout << "Applying Manuel's analytic approximation.." << endl;
1962 iqtree.approxAllBranches();
1963 }
1964 string brlen_file = params.out_prefix;
1965 brlen_file += ".brlen";
1966 ofstream out;
1967 out.open(brlen_file.c_str());
1968 iqtree.printBranchLengths(out);
1969 out.close();
1970 cout << "Branch lengths written to " << brlen_file << endl;
1971 }
1972
1973 if (params.write_branches) {
1974 string filename = string(params.out_prefix) + ".branches.csv";
1975 ofstream out;
1976 out.open(filename.c_str());
1977 iqtree.writeBranches(out);
1978 out.close();
1979 cout << "Branch lengths written to " << filename << endl;
1980 }
1981
1982 if (params.print_conaln && iqtree.isSuperTree()) {
1983 string str = params.out_prefix;
1984 str = params.out_prefix;
1985 str += ".conaln";
1986 iqtree.aln->printAlignment(params.aln_output_format, str.c_str());
1987 }
1988
1989 if (params.print_partition_info && iqtree.isSuperTree()) {
1990 ASSERT(params.print_conaln);
1991 string aln_file = (string)params.out_prefix + ".conaln";
1992 string partition_info = params.out_prefix;
1993 partition_info += ".partinfo.nex";
1994 ((SuperAlignment*)(iqtree.aln))->printPartition(partition_info.c_str(), aln_file.c_str());
1995 partition_info = (string)params.out_prefix + ".partitions";
1996 ((SuperAlignment*)(iqtree.aln))->printPartitionRaxml(partition_info.c_str());
1997 }
1998
1999 if (params.mvh_site_rate) {
2000 RateMeyerHaeseler *rate_mvh = new RateMeyerHaeseler(params.rate_file,
2001 &iqtree, params.rate_mh_type);
2002 cout << endl << "Computing site-specific rates by "
2003 << rate_mvh->full_name << "..." << endl;
2004 rate_mvh->runIterativeProc(params, iqtree);
2005 cout << endl << "BEST SCORE FOUND : " << iqtree.getBestScore()<< endl;
2006 string mhrate_file = params.out_prefix;
2007 mhrate_file += ".mhrate";
2008 try {
2009 ofstream out;
2010 out.exceptions(ios::failbit | ios::badbit);
2011 out.open(mhrate_file.c_str());
2012 iqtree.writeSiteRates(out, true);
2013 out.close();
2014 } catch (ios::failure) {
2015 outError(ERR_WRITE_OUTPUT, mhrate_file);
2016 }
2017
2018 if (params.print_site_lh) {
2019 string site_lh_file = params.out_prefix;
2020 site_lh_file += ".mhsitelh";
2021 printSiteLh(site_lh_file.c_str(), &iqtree);
2022 }
2023 }
2024
2025 if (params.print_site_rate & 1) {
2026 string rate_file = params.out_prefix;
2027 rate_file += ".rate";
2028 printSiteRates(iqtree, rate_file.c_str(), true);
2029 }
2030
2031 if (params.print_site_rate & 2) {
2032 string rate_file = params.out_prefix;
2033 rate_file += ".mlrate";
2034 printSiteRates(iqtree, rate_file.c_str(), false);
2035 }
2036
2037 if (params.fixed_branch_length == BRLEN_SCALE) {
2038 string filename = (string)params.out_prefix + ".blscale";
2039 iqtree.printTreeLengthScaling(filename.c_str());
2040 cout << "Scaled tree length and model parameters printed to " << filename << endl;
2041 }
2042
2043 }
2044
printFinalSearchInfo(Params & params,IQTree & iqtree,double search_cpu_time,double search_real_time)2045 void printFinalSearchInfo(Params ¶ms, IQTree &iqtree, double search_cpu_time, double search_real_time) {
2046 cout << "Total tree length: " << iqtree.treeLength() << endl;
2047
2048 if (iqtree.isSuperTree() && verbose_mode >= VB_MAX) {
2049 PhyloSuperTree *stree = (PhyloSuperTree*) &iqtree;
2050 cout << stree->evalNNIs << " NNIs evaluated from " << stree->totalNNIs << " all possible NNIs ( " <<
2051 (int)(((stree->evalNNIs+1.0)/(stree->totalNNIs+1.0))*100.0) << " %)" << endl;
2052 cout<<"Details for subtrees:"<<endl;
2053 int part = 0;
2054 for(auto it = stree->begin(); it != stree->end(); it++,part++){
2055 cout << part+1 << ". " << (*it)->aln->name << ": " << stree->part_info[part].evalNNIs<<" ( "
2056 << (int)(((stree->part_info[part].evalNNIs+1.0)/((stree->totalNNIs+1.0) / stree->size()))*100.0)
2057 << " %)" << endl;
2058 }
2059 }
2060
2061 params.run_time = (getCPUTime() - params.startCPUTime);
2062 cout << endl;
2063 cout << "Total number of iterations: " << iqtree.stop_rule.getCurIt() << endl;
2064 // cout << "Total number of partial likelihood vector computations: " << iqtree.num_partial_lh_computations << endl;
2065 cout << "CPU time used for tree search: " << search_cpu_time
2066 << " sec (" << convert_time(search_cpu_time) << ")" << endl;
2067 cout << "Wall-clock time used for tree search: " << search_real_time
2068 << " sec (" << convert_time(search_real_time) << ")" << endl;
2069 cout << "Total CPU time used: " << (double) params.run_time << " sec ("
2070 << convert_time((double) params.run_time) << ")" << endl;
2071 cout << "Total wall-clock time used: "
2072 << getRealTime() - params.start_real_time << " sec ("
2073 << convert_time(getRealTime() - params.start_real_time) << ")" << endl;
2074
2075 }
2076
printTrees(vector<string> trees,Params & params,string suffix)2077 void printTrees(vector<string> trees, Params ¶ms, string suffix) {
2078 ofstream treesOut((string(params.out_prefix) + suffix).c_str(),
2079 ofstream::out);
2080 for (vector<string>::iterator it = trees.begin(); it != trees.end(); it++) {
2081 treesOut << (*it);
2082 treesOut << endl;
2083 }
2084 treesOut.close();
2085 }
2086
2087 /************************************************************
2088 * MAIN TREE RECONSTRUCTION
2089 ***********************************************************/
2090
startTreeReconstruction(Params & params,IQTree * & iqtree,ModelCheckpoint & model_info)2091 void startTreeReconstruction(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_info) {
2092 if (params.root) {
2093 StrVector outgroup_names;
2094 convert_string_vec(params.root, outgroup_names);
2095 for (auto it = outgroup_names.begin(); it != outgroup_names.end(); it++)
2096 if (iqtree->aln->getSeqID(*it) < 0)
2097 outError("Alignment does not have specified outgroup taxon ", *it);
2098 }
2099
2100 // if (params.count_trees && pllTreeCounter == NULL)
2101 // pllTreeCounter = new StringIntMap;
2102
2103 // Temporary fix since PLL only supports DNA/Protein: switch to IQ-TREE parsimony kernel
2104 if (params.start_tree == STT_PLL_PARSIMONY) {
2105 if (iqtree->isSuperTreeUnlinked()) {
2106 params.start_tree = STT_PARSIMONY;
2107 } else if (iqtree->isSuperTree()) {
2108 PhyloSuperTree *stree = (PhyloSuperTree*)iqtree;
2109 for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++)
2110 if ((*it)->aln->seq_type != SEQ_DNA && (*it)->aln->seq_type != SEQ_PROTEIN)
2111 params.start_tree = STT_PARSIMONY;
2112 } else if (iqtree->aln->seq_type != SEQ_DNA && iqtree->aln->seq_type != SEQ_PROTEIN)
2113 params.start_tree = STT_PARSIMONY;
2114 }
2115
2116 /***************** Initialization for PLL and sNNI ******************/
2117 if (params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) {
2118 /* Initialized all data structure for PLL*/
2119 iqtree->initializePLL(params);
2120 }
2121
2122 /********************* Compute pairwise distances *******************/
2123 if (params.start_tree == STT_BIONJ || params.iqp || params.leastSquareBranch) {
2124 computeInitialDist(params, *iqtree);
2125 }
2126
2127 /******************** Pass the parameter object params to IQTree *******************/
2128 iqtree->setParams(¶ms);
2129
2130 /*************** SET UP PARAMETERS and model testing ****************/
2131
2132 // FOR TUNG: swapping the order cause bug for -m TESTLINK
2133 // iqtree.initSettings(params);
2134
2135 runModelFinder(params, *iqtree, model_info);
2136 }
2137
2138 /**
2139 optimize branch lengths of consensus tree
2140 */
optimizeConTree(Params & params,IQTree * tree)2141 void optimizeConTree(Params ¶ms, IQTree *tree) {
2142 string contree_file = string(params.out_prefix) + ".contree";
2143
2144 DoubleVector rfdist;
2145 tree->computeRFDist(contree_file.c_str(), rfdist);
2146 tree->contree_rfdist = (int)rfdist[0];
2147
2148 tree->readTreeFile(contree_file);
2149
2150 tree->initializeAllPartialLh();
2151 tree->fixNegativeBranch(false);
2152
2153 tree->boot_consense_logl = tree->optimizeAllBranches();
2154 cout << "Log-likelihood of consensus tree: " << tree->boot_consense_logl << endl;
2155 tree->setRootNode(params.root);
2156 tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
2157 tree->printTree(contree_file.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
2158 string contree = tree->getTreeString();
2159 tree->getCheckpoint()->put("contree", contree);
2160 }
2161
runTreeReconstruction(Params & params,IQTree * & iqtree)2162 void runTreeReconstruction(Params ¶ms, IQTree* &iqtree) {
2163
2164 // string dist_file;
2165 params.startCPUTime = getCPUTime();
2166 params.start_real_time = getRealTime();
2167
2168 int absent_states = 0;
2169 if (iqtree->isSuperTree()) {
2170 PhyloSuperTree *stree = (PhyloSuperTree*)iqtree;
2171 for (auto i = stree->begin(); i != stree->end(); i++)
2172 absent_states += (*i)->aln->checkAbsentStates("partition " + (*i)->aln->name);
2173 } else {
2174 absent_states = iqtree->aln->checkAbsentStates("alignment");
2175 }
2176 if (absent_states > 0) {
2177 cout << "NOTE: " << absent_states << " states (see above) are not present and thus removed from Markov process to prevent numerical problems" << endl;
2178 }
2179
2180 // Make sure that no partial likelihood of IQ-TREE is initialized when PLL is used to save memory
2181 if (params.pll) {
2182 iqtree->deleteAllPartialLh();
2183 }
2184
2185 /***************** Initialization for PLL and sNNI ******************/
2186 if ((params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) && !iqtree->isInitializedPLL()) {
2187 /* Initialized all data structure for PLL*/
2188 iqtree->initializePLL(params);
2189 }
2190
2191
2192 /********************* Compute pairwise distances *******************/
2193 if ((params.start_tree == STT_BIONJ || params.iqp || params.leastSquareBranch) && !iqtree->root) {
2194 computeInitialDist(params, *iqtree);
2195 }
2196
2197 /******************** Pass the parameter object params to IQTree *******************/
2198 iqtree->setParams(¶ms);
2199
2200 ModelsBlock *models_block = readModelsDefinition(params);
2201
2202 initializeParams(params, *iqtree);
2203
2204 if (posRateHeterotachy(iqtree->aln->model_name) != string::npos && !iqtree->isMixlen()) {
2205 // create a new instance
2206 IQTree* iqtree_new = new PhyloTreeMixlen(iqtree->aln, 0);
2207 iqtree_new->setCheckpoint(iqtree->getCheckpoint());
2208 if (!iqtree->constraintTree.empty())
2209 iqtree_new->constraintTree.readConstraint(iqtree->constraintTree);
2210 iqtree_new->removed_seqs = iqtree->removed_seqs;
2211 iqtree_new->twin_seqs = iqtree->twin_seqs;
2212 if (params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) {
2213 /* Initialized all data structure for PLL*/
2214 iqtree_new->initializePLL(params);
2215 }
2216 iqtree_new->setParams(¶ms);
2217 iqtree_new->copyPhyloTree(iqtree);
2218
2219 // replace iqtree object
2220 delete iqtree;
2221 iqtree = iqtree_new;
2222
2223 }
2224
2225 iqtree->setRootNode(params.root);
2226
2227 iqtree->restoreCheckpoint();
2228
2229 if (params.online_bootstrap && params.gbo_replicates > 0) {
2230 cout << "Generating " << params.gbo_replicates << " samples for ultrafast "
2231 << RESAMPLE_NAME << " (seed: " << params.ran_seed << ")..." << endl;
2232 }
2233
2234 iqtree->initSettings(params);
2235
2236 /*********************** INITIAL MODEL OPTIMIZATION *****************/
2237
2238
2239 if (!iqtree->getModelFactory()) {
2240 iqtree->initializeModel(params, iqtree->aln->model_name, models_block);
2241 }
2242 if (iqtree->getRate()->isHeterotachy() && !iqtree->isMixlen()) {
2243 ASSERT(0 && "Heterotachy tree not properly created");
2244 }
2245 // iqtree.restoreCheckpoint();
2246
2247 delete models_block;
2248
2249 // UpperBounds analysis. Here, to analyse the initial tree without any tree search or optimization
2250 /*
2251 if (params.upper_bound) {
2252 iqtree.setCurScore(iqtree.computeLikelihood());
2253 cout<<iqtree.getCurScore()<<endl;
2254 UpperBounds(¶ms, iqtree.aln, iqtree);
2255 exit(0);
2256 }
2257 */
2258
2259 // degree of freedom
2260 cout << endl;
2261 if (verbose_mode >= VB_MED) {
2262 cout << "ML-TREE SEARCH START WITH THE FOLLOWING PARAMETERS:" << endl;
2263 int model_df = iqtree->getModelFactory()->getNParameters(BRLEN_OPTIMIZE);
2264 printAnalysisInfo(model_df, *iqtree, params);
2265 }
2266
2267 if (!params.pll) {
2268 uint64_t total_mem = getMemorySize();
2269 if (params.lh_mem_save == LM_MEM_SAVE && params.max_mem_size > total_mem)
2270 params.max_mem_size = total_mem;
2271
2272 uint64_t mem_required = iqtree->getMemoryRequired();
2273
2274 if (mem_required >= total_mem*0.95 && !iqtree->isSuperTree()) {
2275 // switch to memory saving mode
2276 if (params.lh_mem_save != LM_MEM_SAVE) {
2277 params.max_mem_size = (total_mem*0.95)/mem_required;
2278 params.lh_mem_save = LM_MEM_SAVE;
2279 mem_required = iqtree->getMemoryRequired();
2280 cout << "NOTE: Switching to memory saving mode using " << (mem_required / 1073741824.0) << " GB ("
2281 << (mem_required*100/total_mem) << "% of normal mode)" << endl;
2282 cout << "NOTE: Use -mem option if you want to restrict RAM usage further" << endl;
2283 }
2284 if (mem_required >= total_mem) {
2285 params.lh_mem_save = LM_MEM_SAVE;
2286 params.max_mem_size = 0.0;
2287 mem_required = iqtree->getMemoryRequired();
2288 }
2289 }
2290 if (mem_required >= total_mem) {
2291 cerr << "ERROR: Your RAM is below minimum requirement of " << (mem_required / 1073741824.0) << " GB RAM" << endl;
2292 outError("Memory saving mode cannot work, switch to another computer!!!");
2293 }
2294
2295 //#if defined __APPLE__ || defined __MACH__
2296 cout << "NOTE: " << (mem_required / 1048576) << " MB RAM (" << (mem_required / 1073741824) << " GB) is required!" << endl;
2297 //#else
2298 // cout << "NOTE: " << ((double) mem_size / 1000.0) / 1000 << " MB RAM is required!" << endl;
2299 //#endif
2300 if (params.memCheck)
2301 exit(0);
2302 #ifdef BINARY32
2303 if (mem_required >= 2000000000) {
2304 outError("Memory required exceeds 2GB limit of 32-bit executable");
2305 }
2306 #endif
2307 int max_procs = countPhysicalCPUCores();
2308 if (mem_required * max_procs > total_mem * iqtree->num_threads && iqtree->num_threads > 0) {
2309 outWarning("Memory required per CPU-core (" + convertDoubleToString((double)mem_required/iqtree->num_threads/1024/1024/1024)+
2310 " GB) is higher than your computer RAM per CPU-core ("+convertIntToString(total_mem/max_procs/1024/1024/1024)+
2311 " GB), thus multiple runs may exceed RAM!");
2312 }
2313 }
2314
2315
2316 #ifdef _OPENMP
2317 if (iqtree->num_threads <= 0) {
2318 int bestThreads = iqtree->testNumThreads();
2319 omp_set_num_threads(bestThreads);
2320 params.num_threads = bestThreads;
2321 } else
2322 iqtree->warnNumThreads();
2323 #endif
2324
2325
2326 iqtree->initializeAllPartialLh();
2327 double initEpsilon = params.min_iterations == 0 ? params.modelEps : (params.modelEps*10);
2328
2329
2330 if (iqtree->getRate()->name.find("+I+G") != string::npos) {
2331 if (params.alpha_invar_file != NULL) { // COMPUTE TREE LIKELIHOOD BASED ON THE INPUT ALPHA AND P_INVAR VALUE
2332 computeLoglFromUserInputGAMMAInvar(params, *iqtree);
2333 exit(0);
2334 }
2335
2336 if (params.exh_ai) {
2337 exhaustiveSearchGAMMAInvar(params, *iqtree);
2338 exit(0);
2339 }
2340
2341 }
2342
2343 // Optimize model parameters and branch lengths using ML for the initial tree
2344 string initTree;
2345 iqtree->clearAllPartialLH();
2346
2347 iqtree->getModelFactory()->restoreCheckpoint();
2348 if (iqtree->getCheckpoint()->getBool("finishedModelInit")) {
2349 // model optimization already done: ignore this step
2350 if (!iqtree->candidateTrees.empty())
2351 iqtree->readTreeString(iqtree->getBestTrees()[0]);
2352 iqtree->setCurScore(iqtree->computeLikelihood());
2353 initTree = iqtree->getTreeString();
2354 cout << "CHECKPOINT: Model parameters restored, LogL: " << iqtree->getCurScore() << endl;
2355 } else {
2356 initTree = iqtree->optimizeModelParameters(true, initEpsilon);
2357 if (iqtree->isMixlen())
2358 initTree = ((ModelFactoryMixlen*)iqtree->getModelFactory())->sortClassesByTreeLength();
2359
2360 iqtree->saveCheckpoint();
2361 iqtree->getModelFactory()->saveCheckpoint();
2362 iqtree->getCheckpoint()->putBool("finishedModelInit", true);
2363 iqtree->getCheckpoint()->dump();
2364 // cout << "initTree: " << initTree << endl;
2365 }
2366
2367 if (params.lmap_num_quartets >= 0) {
2368 cout << endl << "Performing likelihood mapping with ";
2369 if (params.lmap_num_quartets > 0)
2370 cout << params.lmap_num_quartets;
2371 else
2372 cout << "all";
2373 cout << " quartets..." << endl;
2374 double lkmap_time = getRealTime();
2375 iqtree->doLikelihoodMapping();
2376 cout << "Likelihood mapping needed " << getRealTime()-lkmap_time << " seconds" << endl << endl;
2377 }
2378
2379 // TODO: why is this variable not used?
2380 // ANSWER: moved to doTreeSearch
2381 // bool finishedCandidateSet = iqtree.getCheckpoint()->getBool("finishedCandidateSet");
2382 bool finishedInitTree = iqtree->getCheckpoint()->getBool("finishedInitTree");
2383
2384 // now overwrite with random tree
2385 if (params.start_tree == STT_RANDOM_TREE && !finishedInitTree) {
2386 cout << "Generate random initial Yule-Harding tree..." << endl;
2387 iqtree->generateRandomTree(YULE_HARDING);
2388 iqtree->wrapperFixNegativeBranch(true);
2389 iqtree->initializeAllPartialLh();
2390 initTree = iqtree->optimizeBranches(params.brlen_num_traversal);
2391 cout << "Log-likelihood of random tree: " << iqtree->getCurScore() << endl;
2392 }
2393
2394 /****************** NOW PERFORM MAXIMUM LIKELIHOOD TREE RECONSTRUCTION ******************/
2395
2396 // Update best tree
2397 if (!finishedInitTree) {
2398 iqtree->addTreeToCandidateSet(initTree, iqtree->getCurScore(), false, MPIHelper::getInstance().getProcessID());
2399 iqtree->printResultTree();
2400 iqtree->intermediateTrees.update(iqtree->getTreeString(), iqtree->getCurScore());
2401 if (iqtree->isSuperTreeUnlinked()) {
2402 PhyloSuperTree* stree = (PhyloSuperTree*)iqtree;
2403 for (auto it = stree->begin(); it != stree->end(); it++)
2404 ((IQTree*)(*it))->addTreeToCandidateSet((*it)->getTreeString(),
2405 (*it)->getCurScore(), false, MPIHelper::getInstance().getProcessID());
2406 }
2407 }
2408
2409 if (params.min_iterations && !iqtree->isBifurcating())
2410 outError("Tree search does not work with initial multifurcating tree. Please specify `-n 0` to avoid this.");
2411
2412
2413 // Compute maximum likelihood distance
2414 // ML distance is only needed for IQP
2415 // if ( params.start_tree != STT_BIONJ && ((params.snni && !params.iqp) || params.min_iterations == 0)) {
2416 // params.compute_ml_dist = false;
2417 // }
2418 if ((params.min_iterations <= 1 || params.numInitTrees <= 1) && params.start_tree != STT_BIONJ)
2419 params.compute_ml_dist = false;
2420
2421 if ((params.user_file || params.start_tree == STT_RANDOM_TREE) && params.snni && !params.iqp) {
2422 params.compute_ml_dist = false;
2423 }
2424
2425 if (params.constraint_tree_file)
2426 params.compute_ml_dist = false;
2427
2428 if (iqtree->isSuperTreeUnlinked())
2429 params.compute_ml_dist = false;
2430
2431 //Generate BIONJ tree
2432 if (MPIHelper::getInstance().isMaster() && !iqtree->getCheckpoint()->getBool("finishedCandidateSet")) {
2433 if (!finishedInitTree && ((!params.dist_file && params.compute_ml_dist) || params.leastSquareBranch)) {
2434 computeMLDist(params, *iqtree, getCPUTime());
2435 if (!params.user_file && params.start_tree != STT_RANDOM_TREE) {
2436 // NEW 2015-08-10: always compute BIONJ tree into the candidate set
2437 iqtree->resetCurScore();
2438 double start_bionj = getRealTime();
2439 iqtree->computeBioNJ(params, iqtree->aln, iqtree->dist_file);
2440 cout << getRealTime() - start_bionj << " seconds" << endl;
2441 if (iqtree->isSuperTree())
2442 iqtree->wrapperFixNegativeBranch(true);
2443 else
2444 iqtree->wrapperFixNegativeBranch(false);
2445 iqtree->initializeAllPartialLh();
2446 if (params.start_tree == STT_BIONJ) {
2447 initTree = iqtree->optimizeModelParameters(params.min_iterations==0, initEpsilon);
2448 } else {
2449 initTree = iqtree->optimizeBranches();
2450 }
2451 cout << "Log-likelihood of BIONJ tree: " << iqtree->getCurScore() << endl;
2452 // cout << "BIONJ tree: " << iqtree->getTreeString() << endl;
2453 iqtree->candidateTrees.update(initTree, iqtree->getCurScore());
2454 }
2455 }
2456 }
2457
2458 // iqtree->saveCheckpoint();
2459
2460 double cputime_search_start = getCPUTime();
2461 double realtime_search_start = getRealTime();
2462
2463 if (params.leastSquareNNI) {
2464 iqtree->computeSubtreeDists();
2465 }
2466
2467 if (params.model_name == "WHTEST") {
2468 cout << endl << "Testing model homogeneity by Weiss & von Haeseler (2003)..." << endl;
2469 WHTest(params, *iqtree);
2470 }
2471
2472 NodeVector pruned_taxa;
2473 StrVector linked_name;
2474 double *saved_dist_mat = iqtree->dist_matrix;
2475 double *pattern_lh;
2476
2477 pattern_lh = new double[iqtree->getAlnNPattern()];
2478
2479 // prune stable taxa
2480 pruneTaxa(params, *iqtree, pattern_lh, pruned_taxa, linked_name);
2481
2482 /***************************************** DO STOCHASTIC TREE SEARCH *******************************************/
2483 if (params.min_iterations > 0 && !params.tree_spr) {
2484 iqtree->doTreeSearch();
2485 iqtree->setAlignment(iqtree->aln);
2486 } else {
2487 iqtree->candidateTrees.saveCheckpoint();
2488 /* do SPR with likelihood function */
2489 if (params.tree_spr) {
2490 //tree.optimizeSPRBranches();
2491 cout << "Doing SPR Search" << endl;
2492 cout << "Start tree.optimizeSPR()" << endl;
2493 double spr_score = iqtree->optimizeSPR();
2494 cout << "Finish tree.optimizeSPR()" << endl;
2495 //double spr_score = tree.optimizeSPR(tree.curScore, (PhyloNode*) tree.root->neighbors[0]->node);
2496 if (spr_score <= iqtree->getCurScore()) {
2497 cout << "SPR search did not found any better tree" << endl;
2498 }
2499 }
2500 }
2501
2502 // restore pruned taxa
2503 restoreTaxa(*iqtree, saved_dist_mat, pruned_taxa, linked_name);
2504
2505 double search_cpu_time = getCPUTime() - cputime_search_start;
2506 double search_real_time = getRealTime() - realtime_search_start;
2507
2508 // COMMENT THIS OUT BECAUSE IT DELETES ALL BRANCH LENGTHS OF SUBTREES!
2509 // if (iqtree.isSuperTree())
2510 // ((PhyloSuperTree*) iqtree)->mapTrees();
2511
2512 if (!MPIHelper::getInstance().isMaster()) {
2513 delete[] pattern_lh;
2514 return;
2515 }
2516
2517 if (params.snni && params.min_iterations && verbose_mode >= VB_MED) {
2518 cout << "Log-likelihoods of " << params.popSize << " best candidate trees: " << endl;
2519 iqtree->printBestScores();
2520 cout << endl;
2521 }
2522
2523 if (!params.final_model_opt) {
2524 iqtree->setCurScore(iqtree->computeLikelihood());
2525 } else if (params.min_iterations) {
2526 iqtree->readTreeString(iqtree->getBestTrees()[0]);
2527 iqtree->initializeAllPartialLh();
2528 iqtree->clearAllPartialLH();
2529 cout << "--------------------------------------------------------------------" << endl;
2530 cout << "| FINALIZING TREE SEARCH |" << endl;
2531 cout << "--------------------------------------------------------------------" << endl;
2532
2533 if (iqtree->getCheckpoint()->getBool("finishedModelFinal")) {
2534 iqtree->setCurScore(iqtree->computeLikelihood());
2535 cout << "CHECKPOINT: Final model parameters restored" << endl;
2536 } else {
2537 cout << "Performs final model parameters optimization" << endl;
2538 string tree;
2539 Params::getInstance().fixStableSplits = false;
2540 Params::getInstance().tabu = false;
2541 // why doing NNI search here?
2542 // iqtree->doNNISearch();
2543 tree = iqtree->optimizeModelParameters(true);
2544 iqtree->addTreeToCandidateSet(tree, iqtree->getCurScore(), false, MPIHelper::getInstance().getProcessID());
2545 iqtree->getCheckpoint()->putBool("finishedModelFinal", true);
2546 iqtree->saveCheckpoint();
2547 }
2548
2549 }
2550
2551 if (iqtree->isSuperTree()) {
2552 ((PhyloSuperTree*) iqtree)->computeBranchLengths();
2553 ((PhyloSuperTree*) iqtree)->printBestPartitionParams((string(params.out_prefix) + ".best_model.nex").c_str());
2554 }
2555
2556 cout << "BEST SCORE FOUND : " << iqtree->getCurScore() << endl;
2557
2558 if (params.write_candidate_trees) {
2559 printTrees(iqtree->getBestTrees(), params, ".imd_trees");
2560 }
2561
2562 if (params.pll)
2563 iqtree->inputModelPLL2IQTree();
2564
2565 /* root the tree at the first sequence */
2566 // BQM: WHY SETTING THIS ROOT NODE????
2567 // iqtree->root = iqtree->findLeafName(iqtree->aln->getSeqName(0));
2568 // assert(iqtree->root);
2569 iqtree->setRootNode(params.root);
2570
2571
2572 if (!params.pll) {
2573 iqtree->computeLikelihood(pattern_lh);
2574 // compute logl variance
2575 iqtree->logl_variance = iqtree->computeLogLVariance();
2576 }
2577
2578 printMiscInfo(params, *iqtree, pattern_lh);
2579
2580 if (params.root_test) {
2581 cout << "Testing root positions..." << endl;
2582 iqtree->testRootPosition(true, params.loglh_epsilon);
2583 }
2584
2585 /****** perform SH-aLRT test ******************/
2586 if ((params.aLRT_replicates > 0 || params.localbp_replicates > 0 || params.aLRT_test || params.aBayes_test) && !params.pll) {
2587 double mytime = getRealTime();
2588 params.aLRT_replicates = max(params.aLRT_replicates, params.localbp_replicates);
2589 cout << endl;
2590 if (params.aLRT_replicates > 0)
2591 cout << "Testing tree branches by SH-like aLRT with "
2592 << params.aLRT_replicates << " replicates..." << endl;
2593 if (params.localbp_replicates)
2594 cout << "Testing tree branches by local-BP test with " << params.localbp_replicates << " replicates..." << endl;
2595 if (params.aLRT_test)
2596 cout << "Testing tree branches by aLRT parametric test..." << endl;
2597 if (params.aBayes_test)
2598 cout << "Testing tree branches by aBayes parametric test..." << endl;
2599 iqtree->setRootNode(params.root);
2600 if (iqtree->isBifurcating()) {
2601 iqtree->testAllBranches(params.aLRT_threshold, iqtree->getCurScore(),
2602 pattern_lh, params.aLRT_replicates, params.localbp_replicates, params.aLRT_test, params.aBayes_test);
2603 cout << getRealTime() - mytime << " sec." << endl;
2604 } else {
2605 outWarning("Tree is multifurcating and such test is not applicable");
2606 params.aLRT_replicates = params.localbp_replicates = params.aLRT_test = params.aBayes_test = 0;
2607 }
2608 }
2609
2610 if (params.gbo_replicates > 0) {
2611 cout << "Creating " << RESAMPLE_NAME << " support values..." << endl;
2612 if (!params.online_bootstrap)
2613 outError("Obsolete feature");
2614 // runGuidedBootstrap(params, iqtree->aln, iqtree);
2615 else
2616 iqtree->summarizeBootstrap(params);
2617 }
2618
2619 if (params.collapse_zero_branch) {
2620 cout << "Collapsing near-zero internal branches... ";
2621 cout << iqtree->collapseInternalBranches(NULL, NULL, params.min_branch_length*4);
2622 cout << " collapsed" << endl;
2623 }
2624
2625 printFinalSearchInfo(params, *iqtree, search_cpu_time, search_real_time);
2626
2627 if (params.gbo_replicates && params.online_bootstrap && params.print_ufboot_trees)
2628 iqtree->writeUFBootTrees(params);
2629
2630 if (params.gbo_replicates && params.online_bootstrap && !iqtree->isSuperTreeUnlinked()) {
2631
2632 cout << endl << "Computing " << RESAMPLE_NAME << " consensus tree..." << endl;
2633 string splitsfile = params.out_prefix;
2634 splitsfile += ".splits.nex";
2635 double weight_threshold = (params.split_threshold<1) ? params.split_threshold : (params.gbo_replicates-1.0)/params.gbo_replicates;
2636 weight_threshold *= 100.0;
2637 computeConsensusTree(splitsfile.c_str(), 0, 1e6, -1,
2638 weight_threshold, NULL, params.out_prefix, NULL, ¶ms);
2639 // now optimize branch lengths of the consensus tree
2640 string current_tree = iqtree->getTreeString();
2641 optimizeConTree(params, iqtree);
2642 // revert the best tree
2643 iqtree->readTreeString(current_tree);
2644 }
2645 if (Params::getInstance().writeDistImdTrees) {
2646 cout << endl;
2647 cout << "Recomputing the log-likelihood of the intermediate trees ... " << endl;
2648 iqtree->intermediateTrees.recomputeLoglOfAllTrees(*iqtree);
2649 }
2650
2651 // BUG FIX: readTreeString(bestTreeString) not needed before this line
2652 iqtree->printResultTree();
2653 iqtree->saveCheckpoint();
2654
2655 if (params.upper_bound_NNI) {
2656 string out_file_UB = params.out_prefix;
2657 out_file_UB += ".UB.NNI.main";
2658 ofstream out_UB;
2659 out_UB.exceptions(ios::failbit | ios::badbit);
2660 out_UB.open((char *) out_file_UB.c_str(), std::ofstream::out | std::ofstream::app);
2661 out_UB << iqtree->leafNum << "\t" << iqtree->aln->getNSite() << "\t" << iqtree->params->upper_bound_frac << "\t"
2662 << iqtree->skippedNNIub << "\t" << iqtree->totalNNIub << "\t" << iqtree->getBestScore() << endl;
2663 //iqtree->minUB << "\t" << iqtree->meanUB/iqtree->skippedNNIub << "\t" << iqtree->maxUB << endl;
2664 out_UB.close();
2665 }
2666
2667 if (params.out_file)
2668 iqtree->printTree(params.out_file);
2669
2670 delete[] pattern_lh;
2671
2672 runApproximateBranchLengths(params, *iqtree);
2673
2674 }
2675
2676 /**********************************************************
2677 * MULTIPLE TREE RECONSTRUCTION
2678 ***********************************************************/
runMultipleTreeReconstruction(Params & params,Alignment * alignment,IQTree * tree)2679 void runMultipleTreeReconstruction(Params ¶ms, Alignment *alignment, IQTree *tree) {
2680 ModelCheckpoint *model_info = new ModelCheckpoint;
2681
2682 if (params.suppress_output_flags & OUT_TREEFILE)
2683 outError("Suppress .treefile not allowed with -runs option");
2684 string treefile_name = params.out_prefix;
2685 treefile_name += ".treefile";
2686 string runtrees_name = params.out_prefix;
2687 runtrees_name += ".runtrees";
2688 DoubleVector runLnL;
2689
2690 if (tree->getCheckpoint()->getVector("runLnL", runLnL)) {
2691 cout << endl << "CHECKPOINT: " << runLnL.size() << " independent run(s) restored" << endl;
2692 } else if (MPIHelper::getInstance().isMaster()) {
2693 // first empty the runtrees file
2694 try {
2695 ofstream tree_out;
2696 tree_out.exceptions(ios::failbit | ios::badbit);
2697 tree_out.open(runtrees_name.c_str());
2698 tree_out.close();
2699 } catch (ios::failure) {
2700 outError(ERR_WRITE_OUTPUT, runtrees_name);
2701 }
2702 }
2703
2704 double start_time = getCPUTime();
2705 double start_real_time = getRealTime();
2706
2707 int orig_seed = params.ran_seed;
2708 int run;
2709 int best_run = 0;
2710 for (run = 0; run < runLnL.size(); run++)
2711 if (runLnL[run] > runLnL[best_run])
2712 best_run = run;
2713
2714 // do multiple tree reconstruction
2715 for (run = runLnL.size(); run < params.num_runs; run++) {
2716
2717 tree->getCheckpoint()->startStruct("run" + convertIntToString(run+1));
2718
2719 params.ran_seed = orig_seed + run*1000 + MPIHelper::getInstance().getProcessID();
2720
2721 cout << endl << "---> START RUN NUMBER " << run + 1 << " (seed: " << params.ran_seed << ")" << endl;
2722
2723 tree->getCheckpoint()->put("seed", params.ran_seed);
2724
2725 // initialize random stream for replicating the run
2726
2727 int *saved_randstream = randstream;
2728 init_random(params.ran_seed);
2729
2730 IQTree *iqtree;
2731 if (alignment->isSuperAlignment()){
2732 if(params.partition_type != BRLEN_OPTIMIZE){
2733 iqtree = new PhyloSuperTreePlen((SuperAlignment*) alignment, (PhyloSuperTree*) tree);
2734 } else {
2735 iqtree = new PhyloSuperTree((SuperAlignment*) alignment, (PhyloSuperTree*) tree);
2736 }
2737 } else {
2738 // allocate heterotachy tree if neccessary
2739 int pos = posRateHeterotachy(alignment->model_name);
2740
2741 if (params.num_mixlen > 1) {
2742 iqtree = new PhyloTreeMixlen(alignment, params.num_mixlen);
2743 } else if (pos != string::npos) {
2744 iqtree = new PhyloTreeMixlen(alignment, 0);
2745 } else
2746 iqtree = new IQTree(alignment);
2747 }
2748
2749 if (!tree->constraintTree.empty()) {
2750 iqtree->constraintTree.readConstraint(tree->constraintTree);
2751 }
2752
2753 // set checkpoint
2754 iqtree->setCheckpoint(tree->getCheckpoint());
2755 iqtree->num_precision = tree->num_precision;
2756
2757 runTreeReconstruction(params, iqtree);
2758 // read in the output tree file
2759 stringstream ss;
2760 iqtree->printTree(ss);
2761 if (MPIHelper::getInstance().isMaster())
2762 try {
2763 ofstream tree_out;
2764 tree_out.exceptions(ios::failbit | ios::badbit);
2765 tree_out.open(runtrees_name.c_str(), ios_base::out | ios_base::app);
2766 tree_out.precision(10);
2767 tree_out << "[ lh=" << iqtree->getBestScore() << " ]";
2768 tree_out << ss.str() << endl;
2769 tree_out.close();
2770 } catch (ios::failure) {
2771 outError(ERR_WRITE_OUTPUT, runtrees_name);
2772 }
2773 // fix bug: set the model for original tree after testing
2774 if ((params.model_name.substr(0,4) == "TEST" || params.model_name.substr(0,2) == "MF") && tree->isSuperTree()) {
2775 PhyloSuperTree *stree = ((PhyloSuperTree*)tree);
2776 stree->part_info = ((PhyloSuperTree*)iqtree)->part_info;
2777 }
2778 runLnL.push_back(iqtree->getBestScore());
2779
2780 if (MPIHelper::getInstance().isMaster()) {
2781 if (params.num_bootstrap_samples > 0 && params.consensus_type == CT_CONSENSUS_TREE &&
2782 (run == 0 || iqtree->getBestScore() > runLnL[best_run])) {
2783 // 2017-12-08: optimize branch lengths of consensus tree
2784 // now optimize branch lengths of the consensus tree
2785 string current_tree = iqtree->getTreeString();
2786 optimizeConTree(params, iqtree);
2787 // revert the best tree
2788 iqtree->readTreeString(current_tree);
2789 iqtree->saveCheckpoint();
2790 }
2791 }
2792 if (iqtree->getBestScore() > runLnL[best_run])
2793 best_run = run;
2794
2795 if (params.num_runs == 1)
2796 reportPhyloAnalysis(params, *iqtree, *model_info);
2797 delete iqtree;
2798
2799 tree->getCheckpoint()->endStruct();
2800 // clear all checkpointed information
2801 // tree->getCheckpoint()->keepKeyPrefix("iqtree");
2802 tree->getCheckpoint()->putVector("runLnL", runLnL);
2803 // tree->getCheckpoint()->putBool("finished", false);
2804 tree->getCheckpoint()->dump(true);
2805 // restore randstream
2806 finish_random();
2807 randstream = saved_randstream;
2808 }
2809
2810 cout << endl << "---> SUMMARIZE RESULTS FROM " << runLnL.size() << " RUNS" << endl << endl;
2811
2812 cout << "Run " << best_run+1 << " gave best log-likelihood: " << runLnL[best_run] << endl;
2813
2814 // initialize tree and model strucgture
2815 ModelsBlock *models_block = readModelsDefinition(params);
2816 tree->setParams(¶ms);
2817 tree->setNumThreads(params.num_threads);
2818 if (!tree->getModelFactory()) {
2819 tree->initializeModel(params, tree->aln->model_name, models_block);
2820 }
2821 if (tree->getRate()->isHeterotachy() && !tree->isMixlen()) {
2822 ASSERT(0 && "Heterotachy tree not properly created");
2823 }
2824 delete models_block;
2825
2826 // restore the tree and model from the best run
2827 tree->getCheckpoint()->startStruct("run" + convertIntToString(best_run+1));
2828 tree->restoreCheckpoint();
2829 tree->getModelFactory()->restoreCheckpoint();
2830 tree->setCurScore(runLnL[best_run]);
2831 if (params.gbo_replicates && !tree->isSuperTreeUnlinked()) {
2832
2833 string out_file = (string)params.out_prefix + ".splits";
2834 if (params.print_splits_file) {
2835 tree->boot_splits.back()->saveFile(out_file.c_str(), IN_OTHER, true);
2836 cout << "Split supports printed to star-dot file " << out_file << endl;
2837 }
2838
2839 if (params.print_splits_nex_file) {
2840 out_file = (string)params.out_prefix + ".splits.nex";
2841 tree->boot_splits.back()->saveFile(out_file.c_str(), IN_NEXUS, false);
2842 cout << "Split supports printed to NEXUS file " << out_file << endl;
2843 }
2844
2845 // overwrite .ufboot trees
2846 if (params.print_ufboot_trees)
2847 tree->writeUFBootTrees(params);
2848
2849 // overwrite .contree
2850 string contree;
2851 if (!tree->getCheckpoint()->getString("contree", contree))
2852 ASSERT(0 && "Couldn't restore contree");
2853 string contree_file = string(params.out_prefix) + ".contree";
2854 string current_tree = tree->getTreeString();
2855 tree->readTreeString(contree);
2856 tree->setRootNode(params.root);
2857 tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
2858 tree->printTree(contree_file.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
2859 tree->readTreeString(current_tree);
2860 cout << "Consensus tree written to " << contree_file << endl;
2861 }
2862 tree->getCheckpoint()->endStruct();
2863
2864 // overwrite .treefile
2865 tree->printResultTree();
2866
2867 if (MPIHelper::getInstance().isMaster()) {
2868 cout << "Total CPU time for " << params.num_runs << " runs: " << (getCPUTime() - start_time) << " seconds." << endl;
2869 cout << "Total wall-clock time for " << params.num_runs << " runs: " << (getRealTime() - start_real_time) << " seconds." << endl << endl;
2870 }
2871 delete model_info;
2872 }
2873
computeLoglFromUserInputGAMMAInvar(Params & params,IQTree & iqtree)2874 void computeLoglFromUserInputGAMMAInvar(Params ¶ms, IQTree &iqtree) {
2875 RateHeterogeneity *site_rates = iqtree.getRate();
2876 site_rates->setFixPInvar(true);
2877 site_rates->setFixGammaShape(true);
2878 vector<double> alphas, p_invars, logl;
2879 ifstream aiFile;
2880 aiFile.open(params.alpha_invar_file, ios_base::in);
2881 if (aiFile.good()) {
2882 double alpha, p_invar;
2883 while (aiFile >> alpha >> p_invar) {
2884 alphas.push_back(alpha);
2885 p_invars.push_back(p_invar);
2886 }
2887 aiFile.close();
2888 cout << "Computing tree logl based on the alpha and p_invar values in " << params.alpha_invar_file << " ..." <<
2889 endl;
2890 } else {
2891 stringstream errMsg;
2892 errMsg << "Could not find file: " << params.alpha_invar_file;
2893 outError(errMsg.str().c_str());
2894 }
2895 string aiResultsFileName = string(params.out_prefix) + "_" + string(params.alpha_invar_file) + ".results";
2896 ofstream aiFileResults;
2897 aiFileResults.open(aiResultsFileName.c_str());
2898 aiFileResults << fixed;
2899 aiFileResults.precision(4);
2900 DoubleVector lenvec;
2901 aiFileResults << "Alpha P_Invar Logl TreeLength\n";
2902 for (int i = 0; i < alphas.size(); i++) {
2903 iqtree.saveBranchLengths(lenvec);
2904 aiFileResults << alphas.at(i) << " " << p_invars.at(i) << " ";
2905 site_rates->setGammaShape(alphas.at(i));
2906 site_rates->setPInvar(p_invars.at(i));
2907 iqtree.clearAllPartialLH();
2908 double lh = iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, false, 0.001);
2909 aiFileResults << lh << " " << iqtree.treeLength() << "\n";
2910 iqtree.restoreBranchLengths(lenvec);
2911 }
2912 aiFileResults.close();
2913 cout << "Results were written to: " << aiResultsFileName << endl;
2914 cout << "Wall clock time used: " << getRealTime() - params.start_real_time << endl;
2915 }
2916
searchGAMMAInvarByRestarting(IQTree & iqtree)2917 void searchGAMMAInvarByRestarting(IQTree &iqtree) {
2918 if (!Params::getInstance().fixed_branch_length)
2919 iqtree.setCurScore(iqtree.optimizeAllBranches(1));
2920 else
2921 iqtree.setCurScore(iqtree.computeLikelihood());
2922 RateHeterogeneity* site_rates = (iqtree.getRate());
2923 double values[] = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 };
2924 vector<double> initAlphas;
2925 if (Params::getInstance().randomAlpha) {
2926 while (initAlphas.size() < 10) {
2927 double initAlpha = random_double();
2928 initAlphas.push_back(initAlpha + iqtree.params->min_gamma_shape*2);
2929 }
2930 } else {
2931 initAlphas.assign(values, values+10);
2932 }
2933 double bestLogl = iqtree.getCurScore();
2934 double bestAlpha = 0.0;
2935 double bestPInvar = 0.0;
2936 double initPInvar = iqtree.getRate()->getPInvar();
2937
2938 /* Back up branch lengths and substitutional rates */
2939 DoubleVector lenvec;
2940 DoubleVector bestLens;
2941 iqtree.saveBranchLengths(lenvec);
2942 int numRateEntries = iqtree.getModel()->getNumRateEntries();
2943 double *rates = new double[numRateEntries];
2944 double *bestRates = new double[numRateEntries];
2945 iqtree.getModel()->getRateMatrix(rates);
2946 int numStates = iqtree.aln->num_states;
2947 double *state_freqs = new double[numStates];
2948 iqtree.getModel()->getStateFrequency(state_freqs);
2949 double *bestStateFreqs = new double[numStates];
2950
2951 for (int i = 0; i < 10; i++) {
2952 cout << endl;
2953 cout << "Testing alpha: " << initAlphas[i] << endl;
2954 // Initialize model parameters
2955 iqtree.restoreBranchLengths(lenvec);
2956 ((ModelMarkov*) iqtree.getModel())->setRateMatrix(rates);
2957 ((ModelMarkov*) iqtree.getModel())->setStateFrequency(state_freqs);
2958 iqtree.getModel()->decomposeRateMatrix();
2959 site_rates->setGammaShape(initAlphas[i]);
2960 site_rates->setPInvar(initPInvar);
2961 iqtree.clearAllPartialLH();
2962 iqtree.optimizeModelParameters(verbose_mode >= VB_MED, Params::getInstance().testAlphaEps);
2963 double estAlpha = iqtree.getRate()->getGammaShape();
2964 double estPInv = iqtree.getRate()->getPInvar();
2965 double logl = iqtree.getCurScore();
2966 cout << "Est. alpha: " << estAlpha << " / Est. pinv: " << estPInv
2967 << " / Logl: " << logl << endl;
2968
2969 if (iqtree.getCurScore() > bestLogl) {
2970 bestLogl = logl;
2971 bestAlpha = estAlpha;
2972 bestPInvar = estPInv;
2973 bestLens.clear();
2974 iqtree.saveBranchLengths(bestLens);
2975 iqtree.getModel()->getRateMatrix(bestRates);
2976 iqtree.getModel()->getStateFrequency(bestStateFreqs);
2977 }
2978 }
2979 site_rates->setGammaShape(bestAlpha);
2980 site_rates->setFixGammaShape(false);
2981 site_rates->setPInvar(bestPInvar);
2982 site_rates->setFixPInvar(false);
2983 ((ModelMarkov*) iqtree.getModel())->setRateMatrix(bestRates);
2984 ((ModelMarkov*) iqtree.getModel())->setStateFrequency(bestStateFreqs);
2985 iqtree.restoreBranchLengths(bestLens);
2986 iqtree.getModel()->decomposeRateMatrix();
2987 iqtree.clearAllPartialLH();
2988 iqtree.setCurScore(iqtree.computeLikelihood());
2989 cout << endl;
2990 cout << "Best initial alpha: " << bestAlpha << " / initial pinv: " << bestPInvar << " / ";
2991 cout << "Logl: " << iqtree.getCurScore() << endl;
2992
2993 delete [] rates;
2994 delete [] state_freqs;
2995 delete [] bestRates;
2996 delete [] bestStateFreqs;
2997 }
2998
2999 // Test alpha fom 0.1 to 15 and p_invar from 0.1 to 0.99, stepsize = 0.01
exhaustiveSearchGAMMAInvar(Params & params,IQTree & iqtree)3000 void exhaustiveSearchGAMMAInvar(Params ¶ms, IQTree &iqtree) {
3001 double alphaMin = 0.01;
3002 double alphaMax = 10.00;
3003 double p_invarMin = 0.01;
3004 double p_invarMax = 1.00;
3005 // double p_invarMax = iqtree.aln->frac_const_sites;
3006 double stepSize = 0.01;
3007 int numAlpha = (int) floor((alphaMax - alphaMin)/stepSize);
3008 int numInvar = (int) floor((p_invarMax - p_invarMin)/stepSize);
3009
3010 cout << "EVALUATING " << numAlpha*numInvar << " COMBINATIONS OF " << " alpha=" << alphaMin << ".." << alphaMax
3011 << " AND " << " p-invar=" << p_invarMin << ".." << p_invarMax
3012 << " (epsilon: " << params.modelEps << ")" << endl;
3013
3014 // vector<string> results;
3015 // results.reserve((unsigned long) (numAlpha * numInvar));
3016 DoubleVector lenvec;
3017 iqtree.saveBranchLengths(lenvec);
3018
3019 RateHeterogeneity* site_rates = (iqtree.getRate());
3020 site_rates->setFixPInvar(true);
3021 site_rates->setFixGammaShape(true);
3022
3023 string aiResultsFileName = string(params.out_prefix) + ".ai_results";
3024 ofstream aiFileResults;
3025 aiFileResults.open(aiResultsFileName.c_str());
3026 aiFileResults << fixed;
3027 aiFileResults.precision(4);
3028 aiFileResults << "alpha p_invar logl tree_len\n";
3029
3030 for (double alpha = alphaMin; alpha < alphaMax; alpha = alpha + stepSize) {
3031 cout << "alpha = " << alpha << endl;
3032 for (double p_invar = p_invarMin; p_invar < p_invarMax; p_invar = p_invar + stepSize) {
3033 site_rates->setGammaShape(alpha);
3034 site_rates->setPInvar(p_invar);
3035 iqtree.clearAllPartialLH();
3036 double lh = iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, false, params.modelEps);
3037 // stringstream ss;
3038 // ss << fixed << setprecision(2) << alpha << " " << p_invar << " " << lh << " " << iqtree.treeLength();
3039 aiFileResults << alpha << " " << p_invar << " " << lh << " " << iqtree.treeLength() << endl;
3040 //cout << ss.str() << endl;
3041 // results.push_back(ss.str());
3042 iqtree.restoreBranchLengths(lenvec);
3043 }
3044 }
3045 // for (vector<string>::iterator it = results.begin(); it != results.end(); it++) {
3046 // aiFileResults << (*it) << endl;
3047 // }
3048 aiFileResults.close();
3049 cout << "Results were written to: " << aiResultsFileName << endl;
3050 cout << "Wall clock time used: " << getRealTime() - params.start_real_time << endl;
3051 }
3052
3053 /**********************************************************
3054 * STANDARD NON-PARAMETRIC BOOTSTRAP
3055 ***********************************************************/
runStandardBootstrap(Params & params,Alignment * alignment,IQTree * tree)3056 void runStandardBootstrap(Params ¶ms, Alignment *alignment, IQTree *tree) {
3057 ModelCheckpoint *model_info = new ModelCheckpoint;
3058 StrVector removed_seqs, twin_seqs;
3059
3060 // turn off all branch tests
3061 int saved_aLRT_replicates = params.aLRT_replicates;
3062 int saved_localbp_replicates = params.localbp_replicates;
3063 bool saved_aLRT_test = params.aLRT_test;
3064 bool saved_aBayes_test = params.aBayes_test;
3065 params.aLRT_replicates = 0;
3066 params.localbp_replicates = 0;
3067 params.aLRT_test = false;
3068 params.aBayes_test = false;
3069
3070 if (params.suppress_output_flags & OUT_TREEFILE)
3071 outError("Suppress .treefile not allowed for standard bootstrap");
3072 string treefile_name = params.out_prefix;
3073 treefile_name += ".treefile";
3074 string boottrees_name = params.out_prefix;
3075 boottrees_name += ".boottrees";
3076 string bootaln_name = params.out_prefix;
3077 bootaln_name += ".bootaln";
3078 string bootlh_name = params.out_prefix;
3079 bootlh_name += ".bootlh";
3080 int bootSample = 0;
3081 if (tree->getCheckpoint()->get("bootSample", bootSample)) {
3082 cout << "CHECKPOINT: " << bootSample << " bootstrap analyses restored" << endl;
3083 } else if (MPIHelper::getInstance().isMaster()) {
3084 // first empty the boottrees file
3085 try {
3086 ofstream tree_out;
3087 tree_out.exceptions(ios::failbit | ios::badbit);
3088 tree_out.open(boottrees_name.c_str());
3089 tree_out.close();
3090 } catch (ios::failure) {
3091 outError(ERR_WRITE_OUTPUT, boottrees_name);
3092 }
3093
3094 // empty the bootaln file
3095 if (params.print_bootaln)
3096 try {
3097 ofstream tree_out;
3098 tree_out.exceptions(ios::failbit | ios::badbit);
3099 tree_out.open(bootaln_name.c_str());
3100 tree_out.close();
3101 } catch (ios::failure) {
3102 outError(ERR_WRITE_OUTPUT, bootaln_name);
3103 }
3104 }
3105
3106 double start_time = getCPUTime();
3107 double start_real_time = getRealTime();
3108
3109 startTreeReconstruction(params, tree, *model_info);
3110
3111 // 2018-06-21: bug fix: alignment might be changed by -m ...MERGE
3112 alignment = tree->aln;
3113
3114 // do bootstrap analysis
3115 for (int sample = bootSample; sample < params.num_bootstrap_samples; sample++) {
3116 cout << endl << "===> START " << RESAMPLE_NAME_UPPER << " REPLICATE NUMBER "
3117 << sample + 1 << endl << endl;
3118
3119 // 2015-12-17: initialize random stream for creating bootstrap samples
3120 // mainly so that checkpointing does not need to save bootstrap samples
3121 int *saved_randstream = randstream;
3122 init_random(params.ran_seed + sample);
3123
3124 Alignment* bootstrap_alignment;
3125 cout << "Creating " << RESAMPLE_NAME << " alignment (seed: " << params.ran_seed+sample << ")..." << endl;
3126
3127 if (alignment->isSuperAlignment())
3128 bootstrap_alignment = new SuperAlignment;
3129 else
3130 bootstrap_alignment = new Alignment;
3131 bootstrap_alignment->createBootstrapAlignment(alignment, NULL, params.bootstrap_spec);
3132
3133 // restore randstream
3134 finish_random();
3135 randstream = saved_randstream;
3136
3137 if (params.print_tree_lh && MPIHelper::getInstance().isMaster()) {
3138 double prob;
3139 bootstrap_alignment->multinomialProb(*alignment, prob);
3140 ofstream boot_lh;
3141 if (sample == 0)
3142 boot_lh.open(bootlh_name.c_str());
3143 else
3144 boot_lh.open(bootlh_name.c_str(), ios_base::out | ios_base::app);
3145 boot_lh << "0\t" << prob << endl;
3146 boot_lh.close();
3147 }
3148 IQTree *boot_tree;
3149 if (alignment->isSuperAlignment()){
3150 if(params.partition_type != BRLEN_OPTIMIZE){
3151 boot_tree = new PhyloSuperTreePlen((SuperAlignment*) bootstrap_alignment, (PhyloSuperTree*) tree);
3152 } else {
3153 boot_tree = new PhyloSuperTree((SuperAlignment*) bootstrap_alignment, (PhyloSuperTree*) tree);
3154 }
3155 } else {
3156 // allocate heterotachy tree if neccessary
3157 int pos = posRateHeterotachy(alignment->model_name);
3158
3159 if (params.num_mixlen > 1) {
3160 boot_tree = new PhyloTreeMixlen(bootstrap_alignment, params.num_mixlen);
3161 } else if (pos != string::npos) {
3162 boot_tree = new PhyloTreeMixlen(bootstrap_alignment, 0);
3163 } else
3164 boot_tree = new IQTree(bootstrap_alignment);
3165 }
3166 if (params.print_bootaln && MPIHelper::getInstance().isMaster()) {
3167 bootstrap_alignment->printAlignment(params.aln_output_format, bootaln_name.c_str(), true);
3168 }
3169
3170 if (params.print_boot_site_freq && MPIHelper::getInstance().isMaster()) {
3171 printSiteStateFreq((((string)params.out_prefix)+"."+convertIntToString(sample)+".bootsitefreq").c_str(), bootstrap_alignment);
3172 bootstrap_alignment->printAlignment(params.aln_output_format, (((string)params.out_prefix)+"."+convertIntToString(sample)+".bootaln").c_str());
3173 }
3174
3175 if (!tree->constraintTree.empty()) {
3176 boot_tree->constraintTree.readConstraint(tree->constraintTree);
3177 }
3178
3179 // set checkpoint
3180 boot_tree->setCheckpoint(tree->getCheckpoint());
3181 boot_tree->num_precision = tree->num_precision;
3182
3183 runTreeReconstruction(params, boot_tree);
3184 // read in the output tree file
3185 stringstream ss;
3186 boot_tree->printTree(ss);
3187 // try {
3188 // ifstream tree_in;
3189 // tree_in.exceptions(ios::failbit | ios::badbit);
3190 // tree_in.open(treefile_name.c_str());
3191 // tree_in >> tree_str;
3192 // tree_in.close();
3193 // } catch (ios::failure) {
3194 // outError(ERR_READ_INPUT, treefile_name);
3195 // }
3196 // write the tree into .boottrees file
3197 if (MPIHelper::getInstance().isMaster())
3198 try {
3199 ofstream tree_out;
3200 tree_out.exceptions(ios::failbit | ios::badbit);
3201 tree_out.open(boottrees_name.c_str(), ios_base::out | ios_base::app);
3202 tree_out << ss.str() << endl;
3203 tree_out.close();
3204 } catch (ios::failure) {
3205 outError(ERR_WRITE_OUTPUT, boottrees_name);
3206 }
3207 // OBSOLETE fix bug: set the model for original tree after testing
3208 // if ((params.model_name.substr(0,4) == "TEST" || params.model_name.substr(0,2) == "MF") && tree->isSuperTree()) {
3209 // PhyloSuperTree *stree = ((PhyloSuperTree*)tree);
3210 // stree->part_info = ((PhyloSuperTree*)boot_tree)->part_info;
3211 // }
3212 if (params.num_bootstrap_samples == 1)
3213 reportPhyloAnalysis(params, *boot_tree, *model_info);
3214 // WHY was the following line missing, which caused memory leak?
3215 bootstrap_alignment = boot_tree->aln;
3216 delete boot_tree;
3217 // fix bug: bootstrap_alignment might be changed
3218 delete bootstrap_alignment;
3219
3220 // clear all checkpointed information
3221 tree->getCheckpoint()->keepKeyPrefix("iqtree");
3222 tree->getCheckpoint()->put("bootSample", sample+1);
3223 tree->getCheckpoint()->putBool("finished", false);
3224 tree->getCheckpoint()->dump(true);
3225 }
3226
3227
3228 if (params.consensus_type == CT_CONSENSUS_TREE && MPIHelper::getInstance().isMaster()) {
3229
3230 cout << endl << "===> COMPUTE CONSENSUS TREE FROM " << params.num_bootstrap_samples
3231 << RESAMPLE_NAME_UPPER << " TREES" << endl << endl;
3232 string root_name = (params.root) ? params.root : alignment->getSeqName(0);
3233 const char* saved_root = params.root;
3234 params.root = root_name.c_str();
3235 computeConsensusTree(boottrees_name.c_str(), 0, 1e6, -1,
3236 params.split_threshold, NULL, params.out_prefix, NULL, ¶ms);
3237 params.root = saved_root;
3238 }
3239
3240 if (params.compute_ml_tree) {
3241 cout << endl << "===> START ANALYSIS ON THE ORIGINAL ALIGNMENT" << endl << endl;
3242 // restore branch tests
3243 params.aLRT_replicates = saved_aLRT_replicates;
3244 params.localbp_replicates = saved_localbp_replicates;
3245 params.aLRT_test = saved_aLRT_test;
3246 params.aBayes_test = saved_aBayes_test;
3247
3248 if (params.num_runs == 1)
3249 runTreeReconstruction(params, tree);
3250 else
3251 runMultipleTreeReconstruction(params, tree->aln, tree);
3252
3253 if (MPIHelper::getInstance().isMaster()) {
3254 if (params.consensus_type == CT_CONSENSUS_TREE && params.num_runs == 1) {
3255 // 2017-12-08: optimize branch lengths of consensus tree
3256 optimizeConTree(params, tree);
3257 }
3258
3259 cout << endl << "===> ASSIGN " << RESAMPLE_NAME_UPPER
3260 << " SUPPORTS TO THE TREE FROM ORIGINAL ALIGNMENT" << endl << endl;
3261 MExtTree ext_tree;
3262 assignBootstrapSupport(boottrees_name.c_str(), 0, 1e6,
3263 treefile_name.c_str(), false, treefile_name.c_str(),
3264 params.out_prefix, ext_tree, NULL, ¶ms);
3265 tree->copyTree(&ext_tree);
3266 reportPhyloAnalysis(params, *tree, *model_info);
3267 }
3268 } else if (params.consensus_type == CT_CONSENSUS_TREE && MPIHelper::getInstance().isMaster()) {
3269 int mi = params.min_iterations;
3270 STOP_CONDITION sc = params.stop_condition;
3271 params.min_iterations = 0;
3272 params.stop_condition = SC_FIXED_ITERATION;
3273 runTreeReconstruction(params, tree);
3274 params.min_iterations = mi;
3275 params.stop_condition = sc;
3276 tree->stop_rule.initialize(params);
3277 optimizeConTree(params, tree);
3278 reportPhyloAnalysis(params, *tree, *model_info);
3279 } else
3280 cout << endl;
3281
3282 #ifdef USE_BOOSTER
3283 if (params.transfer_bootstrap) {
3284 // transfer bootstrap expectation (TBE)
3285 cout << "Performing transfer bootstrap expectation..." << endl;
3286 string input_tree = (string)params.out_prefix + ".treefile";
3287 string boot_trees = (string)params.out_prefix + ".boottrees";
3288 string out_tree = (string)params.out_prefix + ".tbe.tree";
3289 string out_raw_tree = (string)params.out_prefix + ".tbe.rawtree";
3290 string stat_out = (string)params.out_prefix + ".tbe.stat";
3291 main_booster(input_tree.c_str(), boot_trees.c_str(), out_tree.c_str(),
3292 (params.transfer_bootstrap==2) ? out_raw_tree.c_str() : NULL,
3293 stat_out.c_str(), (verbose_mode >= VB_MED) ? 0 : 1);
3294 cout << "TBE tree written to " << out_tree << endl;
3295 if (params.transfer_bootstrap == 2)
3296 cout << "TBE raw tree written to " << out_raw_tree << endl;
3297 cout << "TBE statistic written to " << stat_out << endl;
3298 cout << endl;
3299 }
3300 #endif
3301
3302 if (MPIHelper::getInstance().isMaster()) {
3303 cout << "Total CPU time for " << RESAMPLE_NAME << ": " << (getCPUTime() - start_time) << " seconds." << endl;
3304 cout << "Total wall-clock time for " << RESAMPLE_NAME << ": " << (getRealTime() - start_real_time) << " seconds." << endl << endl;
3305 cout << "Non-parametric " << RESAMPLE_NAME << " results written to:" << endl;
3306 if (params.print_bootaln)
3307 cout << RESAMPLE_NAME_I << " alignments: " << params.out_prefix << ".bootaln" << endl;
3308 cout << RESAMPLE_NAME_I << " trees: " << params.out_prefix << ".boottrees" << endl;
3309 if (params.consensus_type == CT_CONSENSUS_TREE)
3310 cout << " Consensus tree: " << params.out_prefix << ".contree" << endl;
3311 cout << endl;
3312 }
3313 delete model_info;
3314 }
3315
convertAlignment(Params & params,IQTree * iqtree)3316 void convertAlignment(Params ¶ms, IQTree *iqtree) {
3317 Alignment *alignment = iqtree->aln;
3318 if (params.num_bootstrap_samples || params.print_bootaln) {
3319 // create bootstrap alignment
3320 Alignment* bootstrap_alignment;
3321 cout << "Creating " << RESAMPLE_NAME << " alignment..." << endl;
3322 if (alignment->isSuperAlignment())
3323 bootstrap_alignment = new SuperAlignment;
3324 else
3325 bootstrap_alignment = new Alignment;
3326 bootstrap_alignment->createBootstrapAlignment(alignment, NULL, params.bootstrap_spec);
3327 delete alignment;
3328 alignment = bootstrap_alignment;
3329 iqtree->aln = alignment;
3330 }
3331
3332 int exclude_sites = 0;
3333 if (params.aln_nogaps)
3334 exclude_sites += EXCLUDE_GAP;
3335 if (params.aln_no_const_sites)
3336 exclude_sites += EXCLUDE_INVAR;
3337
3338 if (alignment->isSuperAlignment()) {
3339 alignment->printAlignment(params.aln_output_format, params.aln_output, false, params.aln_site_list,
3340 exclude_sites, params.ref_seq_name);
3341 if (params.print_subaln)
3342 ((SuperAlignment*)alignment)->printSubAlignments(params);
3343 if (params.aln_output_format != IN_NEXUS) {
3344 string partition_info = string(params.aln_output) + ".nex";
3345 ((SuperAlignment*)alignment)->printPartition(partition_info.c_str(), params.aln_output);
3346 partition_info = (string)params.aln_output + ".partitions";
3347 ((SuperAlignment*)alignment)->printPartitionRaxml(partition_info.c_str());
3348 }
3349 } else if (params.gap_masked_aln) {
3350 Alignment out_aln;
3351 Alignment masked_aln(params.gap_masked_aln, params.sequence_type, params.intype, params.model_name);
3352 out_aln.createGapMaskedAlignment(&masked_aln, alignment);
3353 out_aln.printAlignment(params.aln_output_format, params.aln_output, false, params.aln_site_list,
3354 exclude_sites, params.ref_seq_name);
3355 string str = params.gap_masked_aln;
3356 str += ".sitegaps";
3357 out_aln.printSiteGaps(str.c_str());
3358 } else {
3359 alignment->printAlignment(params.aln_output_format, params.aln_output, false, params.aln_site_list,
3360 exclude_sites, params.ref_seq_name);
3361 }
3362 }
3363
3364 /**
3365 2016-08-04: compute a site frequency model for profile mixture model
3366 */
computeSiteFrequencyModel(Params & params,Alignment * alignment)3367 void computeSiteFrequencyModel(Params ¶ms, Alignment *alignment) {
3368
3369 cout << endl << "===> COMPUTING SITE FREQUENCY MODEL BASED ON TREE FILE " << params.tree_freq_file << endl;
3370 ASSERT(params.tree_freq_file);
3371 PhyloTree *tree = new PhyloTree(alignment);
3372 tree->setParams(¶ms);
3373 bool myrooted = params.is_rooted;
3374 tree->readTree(params.tree_freq_file, myrooted);
3375 tree->setAlignment(alignment);
3376 tree->setRootNode(params.root);
3377
3378 ModelsBlock *models_block = readModelsDefinition(params);
3379 tree->setModelFactory(new ModelFactory(params, alignment->model_name, tree, models_block));
3380 delete models_block;
3381 tree->setModel(tree->getModelFactory()->model);
3382 tree->setRate(tree->getModelFactory()->site_rate);
3383 tree->setLikelihoodKernel(params.SSE);
3384 tree->setNumThreads(params.num_threads);
3385
3386 if (!tree->getModel()->isMixture())
3387 outError("No mixture model was specified!");
3388 uint64_t mem_size = tree->getMemoryRequired();
3389 uint64_t total_mem = getMemorySize();
3390 cout << "NOTE: " << (mem_size / 1024) / 1024 << " MB RAM is required!" << endl;
3391 if (mem_size >= total_mem) {
3392 outError("Memory required exceeds your computer RAM size!");
3393 }
3394 #ifdef BINARY32
3395 if (mem_size >= 2000000000) {
3396 outError("Memory required exceeds 2GB limit of 32-bit executable");
3397 }
3398 #endif
3399
3400 #ifdef _OPENMP
3401 if (tree->num_threads <= 0) {
3402 int bestThreads = tree->testNumThreads();
3403 omp_set_num_threads(bestThreads);
3404 } else
3405 tree->warnNumThreads();
3406 #endif
3407
3408 tree->initializeAllPartialLh();
3409 // 2017-12-07: Increase espilon ten times (0.01 -> 0.1) to speedup PMSF computation
3410 tree->getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.modelEps*10);
3411
3412 size_t nptn = alignment->getNPattern(), nstates = alignment->num_states;
3413 double *ptn_state_freq = new double[nptn*nstates];
3414 tree->computePatternStateFreq(ptn_state_freq);
3415 alignment->site_state_freq.resize(nptn);
3416 for (size_t ptn = 0; ptn < nptn; ptn++) {
3417 double *f = new double[nstates];
3418 memcpy(f, ptn_state_freq+ptn*nstates, sizeof(double)*nstates);
3419 alignment->site_state_freq[ptn] = f;
3420 }
3421 alignment->getSitePatternIndex(alignment->site_model);
3422 printSiteStateFreq(((string)params.out_prefix+".sitefreq").c_str(), tree, ptn_state_freq);
3423 params.print_site_state_freq = WSF_NONE;
3424
3425 delete [] ptn_state_freq;
3426 delete tree;
3427
3428 cout << endl << "===> CONTINUE ANALYSIS USING THE INFERRED SITE FREQUENCY MODEL" << endl;
3429 }
3430
3431
3432 /**********************************************************
3433 * TOP-LEVEL FUNCTION
3434 ***********************************************************/
3435
newIQTree(Params & params,Alignment * alignment)3436 IQTree *newIQTree(Params ¶ms, Alignment *alignment) {
3437 IQTree *tree;
3438 if (alignment->isSuperAlignment()) {
3439 if (params.partition_type == TOPO_UNLINKED) {
3440 tree = new PhyloSuperTreeUnlinked((SuperAlignment*)alignment);
3441 } else if(params.partition_type != BRLEN_OPTIMIZE){
3442 // initialize supertree - Proportional Edges case
3443 tree = new PhyloSuperTreePlen((SuperAlignment*)alignment, params.partition_type);
3444 } else {
3445 // initialize supertree stuff if user specifies partition file with -sp option
3446 tree = new PhyloSuperTree((SuperAlignment*)alignment);
3447 }
3448 // this alignment will actually be of type SuperAlignment
3449 // alignment = tree->aln;
3450 if (((PhyloSuperTree*)tree)->rescale_codon_brlen)
3451 cout << "NOTE: Mixed codon and other data, branch lengths of codon partitions are rescaled by 3!" << endl;
3452
3453 } else {
3454 // allocate heterotachy tree if neccessary
3455 int pos = posRateHeterotachy(alignment->model_name);
3456
3457 if (params.num_mixlen > 1) {
3458 tree = new PhyloTreeMixlen(alignment, params.num_mixlen);
3459 } else if (pos != string::npos) {
3460 tree = new PhyloTreeMixlen(alignment, 0);
3461 } else
3462 tree = new IQTree(alignment);
3463 }
3464
3465 return tree;
3466 }
3467
3468 /** get ID of bad or good symtest results */
getSymTestID(vector<SymTestResult> & res,set<int> & id,bool bad_res)3469 void getSymTestID(vector<SymTestResult> &res, set<int> &id, bool bad_res) {
3470 if (bad_res) {
3471 // get significant test ID
3472 switch (Params::getInstance().symtest) {
3473 case SYMTEST_BINOM:
3474 for (auto i = res.begin(); i != res.end(); i++)
3475 if (i->pvalue_binom < Params::getInstance().symtest_pcutoff)
3476 id.insert(i - res.begin());
3477 break;
3478 case SYMTEST_MAXDIV:
3479 for (auto i = res.begin(); i != res.end(); i++)
3480 if (i->pvalue_maxdiv < Params::getInstance().symtest_pcutoff)
3481 id.insert(i - res.begin());
3482 break;
3483 default:
3484 break;
3485 }
3486 } else {
3487 // get non-significant test ID
3488 switch (Params::getInstance().symtest) {
3489 case SYMTEST_BINOM:
3490 for (auto i = res.begin(); i != res.end(); i++)
3491 if (i->pvalue_binom >= Params::getInstance().symtest_pcutoff)
3492 id.insert(i - res.begin());
3493 break;
3494 case SYMTEST_MAXDIV:
3495 for (auto i = res.begin(); i != res.end(); i++)
3496 if (i->pvalue_maxdiv >= Params::getInstance().symtest_pcutoff)
3497 id.insert(i - res.begin());
3498 break;
3499 default:
3500 break;
3501 }
3502 }
3503 }
3504
computePValueSMax(vector<SymTestResult> & sym,int start,int step)3505 double computePValueSMax(vector<SymTestResult> &sym, int start, int step) {
3506 double orig_max = sym[start].max_stat;
3507 int count = 0, num = 0;
3508 for (size_t i = start; i < sym.size(); i += step, num++)
3509 if (sym[i].max_stat >= orig_max)
3510 count++;
3511 return double(count)/num;
3512
3513 }
3514
doSymTest(Alignment * alignment,Params & params)3515 void doSymTest(Alignment *alignment, Params ¶ms) {
3516 double start_time = getRealTime();
3517 cout << "Performing matched-pair tests of symmetry...";
3518 vector<SymTestResult> sym, marsym, intsym;
3519
3520 size_t num_parts = 1;
3521 if (alignment->isSuperAlignment())
3522 num_parts = ((SuperAlignment*)alignment)->partitions.size();
3523
3524 string filename_stat = string(params.out_prefix) + ".symstat.csv";
3525 ofstream *out_stat = NULL;
3526 if (params.symtest_stat) {
3527 out_stat = new ofstream;
3528 out_stat->open(filename_stat);
3529 *out_stat
3530 << "# Statistic values for matched-pair tests of symmetry" << endl
3531 << "# This file can be read in MS Excel or in R with command:" << endl
3532 << "# dat=read.csv('" << filename_stat << "',comment.char='#')" << endl
3533 << "# Columns are comma-separated with following meanings:" << endl
3534 << "# ID: Partition ID" << endl
3535 << "# Seq1: ID of sequence 1 within partition" << endl
3536 << "# Seq1: ID of sequence 2 within partition" << endl
3537 << "# Sym: Statistic for test of symmetry" << endl
3538 << "# SymChi: Chi-square p-value for test of symmetry" << endl
3539 << "# Mar: Statistic for test of marginal symmetry" << endl
3540 << "# MarChi: Chi-square p-value for marginal test of symmetry" << endl
3541 << "# Int: Statistic for test of internal symmetry" << endl
3542 << "# MarChi: Chi-square p-value for internal test of symmetry" << endl
3543 << "ID,Seq1,Seq2,Sym,SymChi,Mar,MarChi,Int,IntChi" << endl;
3544
3545 }
3546
3547 sym.resize(num_parts*params.symtest_shuffle);
3548 marsym.resize(num_parts*params.symtest_shuffle);
3549 intsym.resize(num_parts*params.symtest_shuffle);
3550
3551 for (int i = 0; i < params.symtest_shuffle; i++) {
3552 vector<SymTestStat> *stats = NULL;
3553 if (params.symtest_stat)
3554 stats = new vector<SymTestStat>;
3555 if (i == 0) // original alignment
3556 alignment->doSymTest(i*num_parts, sym, marsym, intsym, NULL, stats);
3557 else {
3558 int *rstream;
3559 init_random(params.ran_seed+i+1, false, &rstream);
3560 alignment->doSymTest(i*num_parts, sym, marsym, intsym, rstream, stats);
3561 finish_random(rstream);
3562 }
3563 if ((i+1)*10 % params.symtest_shuffle == 0) {
3564 cout << " " << (i+1)*100 / params.symtest_shuffle << "%";
3565 cout.flush();
3566 }
3567 if (!stats)
3568 continue;
3569 for (auto it = stats->begin(); it != stats->end(); it++) {
3570 *out_stat << it->part << ',' << it->seq1 << ',' << it->seq2 << ','
3571 << it->chi2_sym << ',' << it->pval_sym << ','
3572 << it->chi2_marsym << ',' << it->pval_marsym << ','
3573 << it->chi2_intsym << ',' << it->pval_intsym << endl;
3574 }
3575 delete stats;
3576 }
3577
3578 if (out_stat) {
3579 out_stat->close();
3580 delete out_stat;
3581 }
3582
3583 if (params.symtest_shuffle > 1) {
3584 // compute p-value for s-max approach
3585 for (int part = 0; part < num_parts; part++) {
3586 sym[part].pvalue_perm = computePValueSMax(sym, part, num_parts);
3587 marsym[part].pvalue_perm = computePValueSMax(marsym, part, num_parts);
3588 intsym[part].pvalue_perm = computePValueSMax(intsym, part, num_parts);
3589 }
3590 }
3591
3592 string filename = string(params.out_prefix) + ".symtest.csv";
3593 ofstream out;
3594 out.open(filename);
3595 out << "# Matched-pair tests of symmetry" << endl
3596 << "# This file can be read in MS Excel or in R with command:" << endl
3597 << "# dat=read.csv('" << filename << "',comment.char='#')" << endl
3598 << "# Columns are comma-separated with following meanings:" << endl
3599 << "# Name: Partition name" << endl
3600 << "# SymSig: Number of significant sequence pairs by test of symmetry" << endl
3601 << "# SymNon: Number of non-significant sequence pairs by test of symmetry" << endl
3602 << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "# SymBi: P-value for binomial test of symmetry" : "# SymPval: P-value for maximum test of symmetry") << endl;
3603 if (params.symtest_shuffle > 1)
3604 out << "# SymMax: Maximum of pair statistics by test of symmetry" << endl
3605 << "# SymPerm: P-value for permutation test of symmetry" << endl;
3606
3607 out << "# MarSig: Number of significant sequence pairs by test of marginal symmetry" << endl
3608 << "# MarNon: Number of non-significant sequence pairs by test of marginal symmetry" << endl
3609 << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "# MarBi: P-value for binomial test of marginal symmetry" : "# MarPval: P-value for maximum test of marginal symmetry") << endl;
3610 if (params.symtest_shuffle > 1)
3611 out << "# MarMax: Maximum of pair statistics by test of marginal symmetry" << endl
3612 << "# MarPerm: P-value for permutation test of marginal symmetry" << endl;
3613 out << "# IntSig: Number of significant sequence pairs by test of internal symmetry" << endl
3614 << "# IntNon: Number of non-significant sequence pairs by test of internal symmetry" << endl
3615 << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "# IntBi: P-value for binomial test of symmetry" : "# IntPval: P-value for maximum test of internal symmetry") << endl;
3616 if (params.symtest_shuffle > 1)
3617 out << "# IntMax: Maximum of pair statistics by test of internal symmetry" << endl
3618 << "# IntPerm: P-value for permutation test of internal symmetry" << endl;
3619
3620 out << "Name,SymSig,SymNon," << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "SymBi" : "SymPval")
3621 << ((params.symtest_shuffle > 1) ? ",SymMax,SymPerm" : "")
3622 << ",MarSig,MarNon," << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "MarBi" : "MarPval")
3623 << ((params.symtest_shuffle > 1) ? ",MarMax,MarPerm" : "")
3624 << ",IntSig,IntNon," << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "IntBi" : "IntPval")
3625 << ((params.symtest_shuffle > 1) ? ",IntMax,IntPerm" : "") << endl;
3626
3627 if (alignment->isSuperAlignment()) {
3628 SuperAlignment *saln = (SuperAlignment*)alignment;
3629 for (int part = 0; part < saln->partitions.size(); part++)
3630 out << saln->partitions[part]->name << ','
3631 << sym[part] << ',' << marsym[part] << ',' << intsym[part] << endl;
3632 } else {
3633 out << alignment->name << ',' << sym[0] << ',' << marsym[0] << ',' << intsym[0] << endl;
3634 }
3635
3636 if (params.symtest_shuffle > 1) {
3637 for (int part = num_parts; part < sym.size(); part++) {
3638 sym[part].pvalue_perm = marsym[part].pvalue_perm = intsym[part].pvalue_perm = -1.0;
3639 out << part % num_parts << ','
3640 << sym[part] << ',' << marsym[part] << ',' << intsym[part] << endl;
3641 }
3642 // erase the rest
3643 sym.erase(sym.begin()+num_parts, sym.end());
3644 marsym.erase(marsym.begin()+num_parts, marsym.end());
3645 intsym.erase(intsym.begin()+num_parts, intsym.end());
3646 }
3647
3648 out.close();
3649 cout << " " << getRealTime() - start_time << " seconds" << endl;
3650 if (params.symtest_stat)
3651 cout << "SymTest statistics written to " << filename_stat << endl;
3652 cout << "SymTest results written to " << filename << endl;
3653
3654 // now filter out partitions
3655 if (alignment->isSuperAlignment()) {
3656 set<int> part_id;
3657 if (params.symtest_remove == 1) {
3658 // remove bad loci
3659 if (params.symtest_type == 0)
3660 getSymTestID(sym, part_id, true);
3661 else if (params.symtest_type == 1)
3662 getSymTestID(marsym, part_id, true);
3663 else
3664 getSymTestID(intsym, part_id, true);
3665 } else if (params.symtest_remove == 2) {
3666 // remove good loci
3667 if (params.symtest_type == 0)
3668 getSymTestID(sym, part_id, false);
3669 else if (params.symtest_type == 1)
3670 getSymTestID(marsym, part_id, false);
3671 else
3672 getSymTestID(intsym, part_id, false);
3673 }
3674 if (!part_id.empty()) {
3675 SuperAlignment *saln = (SuperAlignment*)alignment;
3676 cout << "Removing " << part_id.size()
3677 << ((params.symtest_remove == 1)? " bad" : " good") << " partitions (pvalue cutoff = "
3678 << params.symtest_pcutoff << ")..." << endl;
3679 if (part_id.size() < alignment->getNSite())
3680 saln->removePartitions(part_id);
3681 else
3682 outError("Can't remove all partitions");
3683 if (params.aln_output_format == IN_NEXUS) {
3684 string aln_file = (string)params.out_prefix + ((params.symtest_remove == 1)? ".good.nex" : ".bad.nex");
3685 alignment->printAlignment(params.aln_output_format, aln_file.c_str());
3686 } else {
3687 string aln_file = (string)params.out_prefix + ((params.symtest_remove == 1)? ".good.phy" : ".bad.phy");
3688 alignment->printAlignment(params.aln_output_format, aln_file.c_str());
3689 string filename = (string)params.out_prefix + ((params.symtest_remove == 2)? ".good.nex" : ".bad.nex");
3690 saln->printPartition(filename.c_str(), aln_file.c_str());
3691 }
3692 }
3693 }
3694 if (params.symtest_only)
3695 exit(EXIT_SUCCESS);
3696 }
3697
runPhyloAnalysis(Params & params,Checkpoint * checkpoint)3698 void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint) {
3699 Alignment *alignment;
3700
3701 checkpoint->putBool("finished", false);
3702 checkpoint->setDumpInterval(params.checkpoint_dump_interval);
3703
3704 /****************** read in alignment **********************/
3705 if (params.partition_file) {
3706 // Partition model analysis
3707 if (params.partition_type == TOPO_UNLINKED)
3708 alignment = new SuperAlignmentUnlinked(params);
3709 else
3710 alignment = new SuperAlignment(params);
3711 } else {
3712 alignment = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name);
3713
3714 if (params.freq_const_patterns) {
3715 int orig_nsite = alignment->getNSite();
3716 alignment->addConstPatterns(params.freq_const_patterns);
3717 cout << "INFO: " << alignment->getNSite() - orig_nsite << " const sites added into alignment" << endl;
3718 }
3719
3720 // Initialize site-frequency model
3721 if (params.tree_freq_file) {
3722 if (checkpoint->getBool("finishedSiteFreqFile")) {
3723 alignment->readSiteStateFreq(((string)params.out_prefix + ".sitefreq").c_str());
3724 params.print_site_state_freq = WSF_NONE;
3725 cout << "CHECKPOINT: Site frequency model restored" << endl;
3726 } else {
3727 computeSiteFrequencyModel(params, alignment);
3728 checkpoint->putBool("finishedSiteFreqFile", true);
3729 checkpoint->dump();
3730 }
3731 }
3732 if (params.site_freq_file) {
3733 alignment->readSiteStateFreq(params.site_freq_file);
3734 }
3735 }
3736
3737 if (params.symtest) {
3738 doSymTest(alignment, params);
3739 }
3740
3741 if (params.print_aln_info) {
3742 string site_info_file = string(params.out_prefix) + ".alninfo";
3743 alignment->printSiteInfo(site_info_file.c_str());
3744 cout << "Alignment sites statistics printed to " << site_info_file << endl;
3745 }
3746
3747 /*************** initialize tree ********************/
3748 IQTree *tree = newIQTree(params, alignment);
3749
3750 tree->setCheckpoint(checkpoint);
3751 if (params.min_branch_length <= 0.0) {
3752 params.min_branch_length = 1e-6;
3753 if (!tree->isSuperTree() && tree->getAlnNSite() >= 100000) {
3754 params.min_branch_length = 0.1 / (tree->getAlnNSite());
3755 tree->num_precision = max((int)ceil(-log10(Params::getInstance().min_branch_length))+1, 6);
3756 cout.precision(12);
3757 cout << "NOTE: minimal branch length is reduced to " << params.min_branch_length << " for long alignment" << endl;
3758 cout.precision(3);
3759 }
3760 // Increase the minimum branch length if PoMo is used.
3761 if (alignment->seq_type == SEQ_POMO) {
3762 params.min_branch_length *= alignment->virtual_pop_size * alignment->virtual_pop_size;
3763 cout.precision(12);
3764 cout << "NOTE: minimal branch length is increased to " << params.min_branch_length << " because PoMo infers number of mutations and frequency shifts" << endl;
3765 cout.precision(3);
3766 }
3767 }
3768 // Increase the minimum branch length if PoMo is used.
3769 if (alignment->seq_type == SEQ_POMO) {
3770 params.max_branch_length *= alignment->virtual_pop_size * alignment->virtual_pop_size;
3771 cout.precision(1);
3772 cout << "NOTE: maximal branch length is increased to " << params.max_branch_length << " because PoMo infers number of mutations and frequency shifts" << endl;
3773 cout.precision(3);
3774 }
3775
3776
3777 if (params.concatenate_aln) {
3778 Alignment aln(params.concatenate_aln, params.sequence_type, params.intype, params.model_name);
3779 cout << "Concatenating " << params.aln_file << " with " << params.concatenate_aln << " ..." << endl;
3780 alignment->concatenateAlignment(&aln);
3781 }
3782
3783 if (params.constraint_tree_file) {
3784 cout << "Reading constraint tree " << params.constraint_tree_file << "..." << endl;
3785 tree->constraintTree.readConstraint(params.constraint_tree_file, alignment->getSeqNames());
3786 if (params.start_tree == STT_PLL_PARSIMONY)
3787 params.start_tree = STT_PARSIMONY;
3788 else if (params.start_tree == STT_BIONJ)
3789 outError("Constraint tree does not work with -t BIONJ");
3790 if (params.num_bootstrap_samples || params.gbo_replicates)
3791 cout << "INFO: Constraint tree will be applied to ML tree and all bootstrap trees." << endl;
3792 }
3793
3794 if (params.compute_seq_identity_along_tree) {
3795 if (!params.user_file)
3796 outError("Please supply a user tree file!");
3797 tree->readTree(params.user_file, params.is_rooted);
3798 if (!tree->rooted && !params.root) {
3799 outError("Tree is unrooted, thus you have to specify a root with -o option");
3800 }
3801 tree->setAlignment(tree->aln);
3802 if (!tree->rooted)
3803 tree->setRootNode(params.root);
3804 tree->computeSeqIdentityAlongTree();
3805 if (verbose_mode >= VB_MED)
3806 tree->drawTree(cout);
3807 string out_tree = (string)params.out_prefix + ".seqident_tree";
3808 tree->printTree(out_tree.c_str());
3809 cout << "Tree with sequence identity printed to " << out_tree << endl;
3810 } else if (params.aln_output) {
3811 /************ convert alignment to other format and write to output file *************/
3812 convertAlignment(params, tree);
3813 } else if (params.gbo_replicates > 0 && params.user_file && params.second_tree) {
3814 // run one of the UFBoot analysis
3815 // runGuidedBootstrap(params, alignment, *tree);
3816 outError("Obsolete feature");
3817 } else if (params.avh_test) {
3818 // run one of the wondering test for Arndt
3819 // runAvHTest(params, alignment, *tree);
3820 outError("Obsolete feature");
3821 } else if (params.bootlh_test) {
3822 // run Arndt's plot of tree likelihoods against bootstrap alignments
3823 // runBootLhTest(params, alignment, *tree);
3824 outError("Obsolete feature");
3825 } else if (params.num_bootstrap_samples == 0) {
3826 /********************************************************************************
3827 THE MAIN MAXIMUM LIKELIHOOD TREE RECONSTRUCTION
3828 ********************************************************************************/
3829 ModelCheckpoint *model_info = new ModelCheckpoint;
3830 alignment->checkGappySeq(params.remove_empty_seq);
3831
3832 // remove identical sequences
3833 if (params.ignore_identical_seqs) {
3834 tree->removeIdenticalSeqs(params);
3835 if (tree->removed_seqs.size() > 0 && MPIHelper::getInstance().isMaster() && (params.suppress_output_flags & OUT_UNIQUESEQ) == 0) {
3836 string filename = (string)params.out_prefix + ".uniqueseq.phy";
3837 tree->aln->printAlignment(params.aln_output_format, filename.c_str());
3838 cout << endl << "For your convenience alignment with unique sequences printed to " << filename << endl;
3839 }
3840 }
3841 alignment = NULL; // from now on use tree->aln instead
3842
3843 startTreeReconstruction(params, tree, *model_info);
3844 // call main tree reconstruction
3845 if (params.num_runs == 1)
3846 runTreeReconstruction(params, tree);
3847 else
3848 runMultipleTreeReconstruction(params, tree->aln, tree);
3849
3850 if (MPIHelper::getInstance().isMaster()) {
3851 reportPhyloAnalysis(params, *tree, *model_info);
3852 }
3853
3854 // reinsert identical sequences
3855 if (tree->removed_seqs.size() > 0) {
3856 // BUG FIX: dont use reinsertIdenticalSeqs anymore
3857 tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
3858 tree->printResultTree();
3859 }
3860 delete model_info;
3861
3862 if (params.dating_method != "") {
3863 doTimeTree(tree);
3864 }
3865
3866 } else {
3867 // the classical non-parameter bootstrap (SBS)
3868 // if (params.model_name.find("LINK") != string::npos || params.model_name.find("MERGE") != string::npos)
3869 // outError("-m TESTMERGE is not allowed when doing standard bootstrap. Please first\nfind partition scheme on the original alignment and use it for bootstrap analysis");
3870 if (alignment->getNSeq() < 4)
3871 outError("It makes no sense to perform bootstrap with less than 4 sequences.");
3872 runStandardBootstrap(params, alignment, tree);
3873 }
3874
3875 // if (params.upper_bound) {
3876 // UpperBounds(¶ms, alignment, tree);
3877 // }
3878
3879 if(verbose_mode >= VB_MED){
3880 if(tree->isSuperTree() && params.partition_type != BRLEN_OPTIMIZE){
3881 ((PhyloSuperTreePlen*) tree)->printNNIcasesNUM();
3882 }
3883 }
3884 // 2015-09-22: bug fix, move this line to before deleting tree
3885 alignment = tree->aln;
3886 delete tree;
3887 // BUG FIX: alignment can be changed, should delete tree->aln instead
3888 // 2015-09-22: THIS IS STUPID: after deleting tree, one cannot access tree->aln anymore
3889 // alignment = tree->aln;
3890 delete alignment;
3891
3892 checkpoint->putBool("finished", true);
3893 checkpoint->dump(true);
3894 }
3895
3896 /**
3897 Perform separate tree reconstruction when tree topologies
3898 are unlinked between partitions
3899 */
runUnlinkedPhyloAnalysis(Params & params,Checkpoint * checkpoint)3900 void runUnlinkedPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint) {
3901 SuperAlignment *super_aln;
3902
3903 ASSERT(params.partition_file);
3904
3905 /****************** read in alignment **********************/
3906 // Partition model analysis
3907 super_aln = new SuperAlignmentUnlinked(params);
3908 PhyloSuperTree *super_tree = new PhyloSuperTree(super_aln);
3909
3910 /**** do separate tree reconstruction for each partition ***/
3911
3912 MTreeSet part_trees;
3913
3914 if (params.user_file) {
3915 // reading user tree file for all partitions
3916 bool is_rooted = false;
3917 part_trees.readTrees(params.user_file, is_rooted, 0, super_aln->partitions.size());
3918 if (is_rooted)
3919 outError("Rooted trees not allowed: ", params.user_file);
3920 if (part_trees.size() != super_aln->partitions.size())
3921 outError("User tree file does not have the same number of trees as partitions");
3922 params.user_file = NULL;
3923 }
3924
3925 ModelCheckpoint *model_info = new ModelCheckpoint;
3926 int part = 0;
3927 for (auto alnit = super_aln->partitions.begin(); alnit != super_aln->partitions.end(); alnit++, part++) {
3928
3929 checkpoint->startStruct((*alnit)->name);
3930
3931 // allocate heterotachy tree if neccessary
3932 int pos = posRateHeterotachy((*alnit)->model_name);
3933 IQTree *tree;
3934
3935 if (params.num_mixlen > 1) {
3936 tree = new PhyloTreeMixlen((*alnit), params.num_mixlen);
3937 } else if (pos != string::npos) {
3938 tree = new PhyloTreeMixlen((*alnit), 0);
3939 } else
3940 tree = new IQTree((*alnit));
3941
3942 tree->setCheckpoint(checkpoint);
3943 if (checkpoint->getBool("finished")) {
3944 tree->restoreCheckpoint();
3945 } else {
3946 if (!part_trees.empty())
3947 tree->copyTree(part_trees[part]);
3948
3949 startTreeReconstruction(params, tree, *model_info);
3950 // call main tree reconstruction
3951 if (params.num_runs == 1)
3952 runTreeReconstruction(params, tree);
3953 else
3954 runMultipleTreeReconstruction(params, tree->aln, tree);
3955 checkpoint->putBool("finished", true);
3956 checkpoint->dump();
3957 }
3958
3959 super_tree->at(part)->copyTree(tree);
3960
3961 delete tree;
3962 checkpoint->endStruct();
3963 }
3964
3965 IQTree *iqtree = super_tree;
3966 super_tree->setCheckpoint(checkpoint);
3967 startTreeReconstruction(params, iqtree, *model_info);
3968 runTreeReconstruction(params, iqtree);
3969 if (MPIHelper::getInstance().isMaster())
3970 reportPhyloAnalysis(params, *iqtree, *model_info);
3971
3972 delete super_tree;
3973 delete super_aln;
3974 delete model_info;
3975 }
3976
assignBranchSupportNew(Params & params)3977 void assignBranchSupportNew(Params ¶ms) {
3978 if (!params.user_file)
3979 outError("No target tree file provided");
3980 if (params.num_threads == 0)
3981 outError("-nt AUTO is not supported for concordance factor analysis, please specify no. cores");
3982 PhyloTree *tree;
3983 Alignment *aln = NULL;
3984 if (params.site_concordance) {
3985 if (!params.aln_file && !params.partition_file)
3986 outError("Please provide an alignment (-s) or partition file");
3987 if (params.partition_file) {
3988 params.compute_seq_composition = false;
3989 aln = new SuperAlignment(params);
3990 tree = new PhyloSuperTree((SuperAlignment*)aln);
3991 } else {
3992 aln = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name);
3993 tree = new PhyloTree;
3994 }
3995 } else {
3996 tree = new PhyloTree;
3997 }
3998 tree->setParams(¶ms);
3999
4000 cout << "Reading tree " << params.user_file << " ..." << endl;
4001 bool rooted = params.is_rooted;
4002 tree->readTree(params.user_file, rooted);
4003 cout << ((tree->rooted) ? "rooted" : "un-rooted") << " tree with "
4004 << tree->leafNum - tree->rooted << " taxa and " << tree->branchNum << " branches" << endl;
4005
4006 // 2018-12-13: move initialisation to fix rooted vs unrooted tree
4007 if (params.site_concordance) {
4008 tree->setAlignment(aln);
4009 if (tree->isSuperTree())
4010 ((PhyloSuperTree*)tree)->mapTrees();
4011 }
4012
4013 BranchVector branches;
4014 tree->getInnerBranches(branches);
4015 BranchVector::iterator brit;
4016 for (brit = branches.begin(); brit != branches.end(); brit++) {
4017 Neighbor *branch = brit->second->findNeighbor(brit->first);
4018 string label = brit->second->name;
4019 if (!label.empty())
4020 PUT_ATTR(branch, label);
4021 }
4022
4023 map<string,string> meanings;
4024
4025 if (!params.treeset_file.empty()) {
4026 bool rooted = params.is_rooted;
4027 MTreeSet trees(params.treeset_file.c_str(), rooted, params.tree_burnin, params.tree_max_count);
4028 double start_time = getRealTime();
4029 cout << "Computing gene concordance factor..." << endl;
4030 tree->computeGeneConcordance(trees, meanings);
4031 if (params.internode_certainty)
4032 tree->computeQuartetConcordance(trees);
4033 cout << getRealTime() - start_time << " sec" << endl;
4034 }
4035 if (params.site_concordance) {
4036 cout << "Computing site concordance factor..." << endl;
4037 double start_time = getRealTime();
4038 tree->computeSiteConcordance(meanings);
4039 cout << getRealTime() - start_time << " sec" << endl;
4040 delete aln;
4041 }
4042 string prefix = (params.out_prefix) ? params.out_prefix : params.user_file;
4043 string str = prefix + ".cf.tree";
4044 tree->printTree(str.c_str());
4045 cout << "Tree with concordance factors written to " << str << endl;
4046 str = prefix + ".cf.tree.nex";
4047 string filename = prefix + ".cf.stat";
4048 tree->printNexus(str, WT_BR_LEN, "See " + filename + " for branch annotation meanings." +
4049 " This file is best viewed in FigTree.");
4050 cout << "Annotated tree (best viewed in FigTree) written to " << str << endl;
4051 if (verbose_mode >= VB_DEBUG)
4052 tree->drawTree(cout);
4053 str = prefix + ".cf.branch";
4054 tree->printTree(str.c_str(), WT_BR_LEN + WT_INT_NODE + WT_NEWLINE);
4055 cout << "Tree with branch IDs written to " << str << endl;
4056 ofstream out;
4057 out.open(filename.c_str());
4058 out << "# Concordance factor statistics" << endl
4059 << "# This file can be read in MS Excel or in R with command:" << endl
4060 << "# tab=read.table('" << filename << "',header=TRUE)" << endl
4061 << "# Columns are tab-separated with following meaning:" << endl
4062 << "# ID: Branch ID" << endl;
4063 map<string,string>::iterator mit;
4064 for (mit = meanings.begin(); mit != meanings.end(); mit++)
4065 if (mit->first[0] != '*')
4066 out << "# " << mit->first << ": " << mit->second << endl;
4067 out << "# Label: Existing branch label" << endl;
4068 out << "# Length: Branch length" << endl;
4069 for (mit = meanings.begin(); mit != meanings.end(); mit++)
4070 if (mit->first[0] == '*')
4071 out << "# " << mit->first << ": " << mit->second << endl;
4072 out << "ID";
4073 for (mit = meanings.begin(); mit != meanings.end(); mit++)
4074 if (mit->first[0] != '*')
4075 out << "\t" << mit->first;
4076 out << "\tLabel\tLength" << endl;
4077 for (brit = branches.begin(); brit != branches.end(); brit++) {
4078 Neighbor *branch = brit->second->findNeighbor(brit->first);
4079 int ID = brit->second->id;
4080 out << ID;
4081 for (mit = meanings.begin(); mit != meanings.end(); mit++) {
4082 if (mit->first[0] == '*')
4083 continue; // ignore NOTES
4084 out << '\t';
4085 string val;
4086 if (branch->getAttr(mit->first, val))
4087 out << val;
4088 else
4089 out << "NA";
4090 }
4091 double length = branch->length;
4092 string label;
4093 GET_ATTR(branch, label);
4094 out << '\t' << label << '\t' << length << endl;
4095 }
4096 out.close();
4097 cout << "Concordance factors per branch printed to " << filename << endl;
4098 if (!params.site_concordance_partition)
4099 return;
4100
4101 // print concordant/discordant gene trees
4102 filename = prefix + ".cf.stat_tree";
4103 out.open(filename);
4104 out << "# Concordance factor statistics for decisive trees" << endl
4105 << "# This file can be read in MS Excel or in R with command:" << endl
4106 << "# tab2=read.table('" << filename << "',header=TRUE)" << endl
4107 << "# Columns are tab-separated with following meaning:" << endl
4108 << "# ID: Branch ID" << endl
4109 << "# TreeID: Tree ID" << endl
4110 << "# gC: 1/0 if tree is concordant/discordant with branch" << endl
4111 << "# gD1: 1/0 if NNI-1 tree is concordant/discordant with branch" << endl
4112 << "# gD2: 1/0 if NNI-2 tree is concordant/discordant with branch" << endl
4113 << "# NOTE: NA means that tree is not decisive for branch" << endl
4114 << "ID\tTreeID\tgC\tgD1\tgD2" << endl;
4115 for (brit = branches.begin(); brit != branches.end(); brit++) {
4116 Neighbor *branch = brit->second->findNeighbor(brit->first);
4117 int ID = brit->second->id;
4118 for (int part = 1; ; part++) {
4119 string gC, gD1, gD2;
4120 if (!branch->getAttr("gC" + convertIntToString(part), gC))
4121 break;
4122 branch->getAttr("gD1" + convertIntToString(part), gD1);
4123 branch->getAttr("gD2" + convertIntToString(part), gD2);
4124 out << ID << '\t' << part << '\t' << gC << '\t' << gD1 << '\t' << gD2 << endl;
4125 }
4126 }
4127 out.close();
4128 cout << "Concordance factors per branch and tree printed to " << filename << endl;
4129
4130 if (!params.site_concordance_partition || !tree->isSuperTree())
4131 return;
4132 // print partition-wise concordant/discordant sites
4133 filename = prefix + ".cf.stat_loci";
4134 out.open(filename);
4135 out << "# Concordance factor statistics for loci" << endl
4136 << "# This file can be read in MS Excel or in R with command:" << endl
4137 << "# tab2=read.table('" << filename << "',header=TRUE)" << endl
4138 << "# Columns are tab-separated with following meaning:" << endl
4139 << "# ID: Branch ID" << endl
4140 << "# PartID: Locus ID" << endl
4141 << "# sC: Number of concordant sites averaged over " << params.site_concordance << " quartets" << endl
4142 << "# sD1: Number of discordant sites for alternative quartet 1" << endl
4143 << "# sD2: Number of discordant sites for alternative quartet 2" << endl
4144 << "# NOTE: NA means that locus is not decisive for branch" << endl
4145 << "ID\tPartID\tsC\tsD1\tsD2" << endl;
4146 for (brit = branches.begin(); brit != branches.end(); brit++) {
4147 Neighbor *branch = brit->second->findNeighbor(brit->first);
4148 int ID = brit->second->id;
4149 for (int part = 1; ; part++) {
4150 string sC, sD1, sD2;
4151 if (!branch->getAttr("sC" + convertIntToString(part), sC))
4152 break;
4153 if (!branch->getAttr("sD1" + convertIntToString(part), sD1))
4154 break;
4155 if (!branch->getAttr("sD2" + convertIntToString(part), sD2))
4156 break;
4157 out << ID << '\t' << part << '\t' << sC << '\t' << sD1 << '\t' << sD2 << endl;
4158 }
4159 }
4160 out.close();
4161 cout << "Concordance factors per branch and locus printed to " << filename << endl;
4162 }
4163
4164
4165
4166 /**
4167 * assign split occurence frequencies from a set of input trees onto a target tree
4168 * NOTE: input trees must have the same taxon set
4169 * @param input_trees file containing NEWICK tree strings
4170 * @param burnin number of beginning trees to discard
4171 * @param max_count max number of trees to read in
4172 * @param target_tree the target tree
4173 * @param rooted TRUE if trees are rooted, false for unrooted trees
4174 * @param output_file file name to write output tree with assigned support values
4175 * @param out_prefix prefix of output file
4176 * @param mytree (OUT) resulting tree with support values assigned from target_tree
4177 * @param tree_weight_file file containing INTEGER weights of input trees
4178 * @param params program parameters
4179 */
assignBootstrapSupport(const char * input_trees,int burnin,int max_count,const char * target_tree,bool rooted,const char * output_tree,const char * out_prefix,MExtTree & mytree,const char * tree_weight_file,Params * params)4180 void assignBootstrapSupport(const char *input_trees, int burnin, int max_count,
4181 const char *target_tree, bool rooted, const char *output_tree,
4182 const char *out_prefix, MExtTree &mytree, const char* tree_weight_file,
4183 Params *params) {
4184 bool myrooted = rooted;
4185 // read the tree file
4186 cout << "Reading tree " << target_tree << " ..." << endl;
4187 mytree.init(target_tree, myrooted);
4188 if (mytree.rooted)
4189 cout << "rooted tree detected" << endl;
4190 else
4191 cout << "unrooted tree detected" << endl;
4192 // reindex the taxa in the tree to aphabetical names
4193 NodeVector taxa;
4194 mytree.getTaxa(taxa);
4195 sort(taxa.begin(), taxa.end(), nodenamecmp);
4196 int i = 0;
4197 for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++) {
4198 (*it)->id = i++;
4199 }
4200
4201 /*
4202 string filename = params.boot_trees;
4203 filename += ".nolen";
4204 boot_trees.printTrees(filename.c_str(), false);
4205 return;
4206 */
4207 SplitGraph sg;
4208 SplitIntMap hash_ss;
4209 // make the taxa name
4210 vector<string> taxname;
4211 taxname.resize(mytree.leafNum);
4212 mytree.getTaxaName(taxname);
4213
4214 // read the bootstrap tree file
4215 double scale = 100.0;
4216 if (params->scaling_factor > 0)
4217 scale = params->scaling_factor;
4218
4219 MTreeSet boot_trees;
4220 if (params && detectInputFile(input_trees) == IN_NEXUS) {
4221 sg.init(*params);
4222 for (SplitGraph::iterator it = sg.begin(); it != sg.end(); it++)
4223 hash_ss.insertSplit((*it), (*it)->getWeight());
4224 StrVector sgtaxname;
4225 sg.getTaxaName(sgtaxname);
4226 i = 0;
4227 for (StrVector::iterator sit = sgtaxname.begin();
4228 sit != sgtaxname.end(); sit++, i++) {
4229 Node *leaf = mytree.findLeafName(*sit);
4230 if (!leaf)
4231 outError("Tree does not contain taxon ", *sit);
4232 leaf->id = i;
4233 }
4234 scale /= sg.maxWeight();
4235 } else {
4236 myrooted = rooted;
4237 boot_trees.init(input_trees, myrooted, burnin, max_count,
4238 tree_weight_file);
4239 if (mytree.rooted != boot_trees.isRooted())
4240 outError("Target tree and tree set have different rooting");
4241 if (boot_trees.equal_taxon_set) {
4242 boot_trees.convertSplits(taxname, sg, hash_ss, SW_COUNT, -1, params->support_tag);
4243 scale /= boot_trees.sumTreeWeights();
4244 }
4245 }
4246 //sg.report(cout);
4247 if (!sg.empty()) {
4248 cout << "Rescaling split weights by " << scale << endl;
4249 if (params->scaling_factor < 0)
4250 sg.scaleWeight(scale, true);
4251 else {
4252 sg.scaleWeight(scale, false, params->numeric_precision);
4253 }
4254
4255 cout << sg.size() << " splits found" << endl;
4256 }
4257 // compute the percentage of appearance
4258 // printSplitSet(sg, hash_ss);
4259 //sg.report(cout);
4260 cout << "Creating " << RESAMPLE_NAME << " support values..." << endl;
4261 if (!sg.empty())
4262 mytree.createBootstrapSupport(taxname, boot_trees, hash_ss, params->support_tag);
4263 else {
4264 //mytree.createBootstrapSupport(boot_trees);
4265 cout << "Unequal taxon sets, rereading trees..." << endl;
4266 DoubleVector rfdist;
4267 mytree.computeRFDist(input_trees, rfdist, 1);
4268 }
4269
4270 //mytree.scaleLength(100.0/boot_trees.size(), true);
4271 string out_file;
4272 if (output_tree)
4273 out_file = output_tree;
4274 else {
4275 if (out_prefix)
4276 out_file = out_prefix;
4277 else
4278 out_file = target_tree;
4279 out_file += ".suptree";
4280 }
4281
4282 mytree.printTree(out_file.c_str());
4283 cout << "Tree with assigned support written to " << out_file
4284 << endl;
4285 /*
4286 if (out_prefix)
4287 out_file = out_prefix;
4288 else
4289 out_file = target_tree;
4290 out_file += ".supval";
4291 mytree.writeInternalNodeNames(out_file);
4292
4293 cout << "Support values written to " << out_file << endl;
4294 */
4295 }
4296
computeConsensusTree(const char * input_trees,int burnin,int max_count,double cutoff,double weight_threshold,const char * output_tree,const char * out_prefix,const char * tree_weight_file,Params * params)4297 void computeConsensusTree(const char *input_trees, int burnin, int max_count,
4298 double cutoff, double weight_threshold, const char *output_tree,
4299 const char *out_prefix, const char *tree_weight_file, Params *params) {
4300 bool rooted = false;
4301
4302 // read the bootstrap tree file
4303 /*
4304 MTreeSet boot_trees(input_trees, rooted, burnin, tree_weight_file);
4305 string first_taxname = boot_trees.front()->root->name;
4306 //if (params.root) first_taxname = params.root;
4307
4308 SplitGraph sg;
4309
4310 boot_trees.convertSplits(sg, cutoff, SW_COUNT, weight_threshold);*/
4311
4312 //sg.report(cout);
4313 SplitGraph sg;
4314 SplitIntMap hash_ss;
4315 // make the taxa name
4316 //vector<string> taxname;
4317 //taxname.resize(mytree.leafNum);
4318 //mytree.getTaxaName(taxname);
4319
4320 // read the bootstrap tree file
4321 double scale = 100.0;
4322 if (params->scaling_factor > 0)
4323 scale = params->scaling_factor;
4324
4325 MTreeSet boot_trees;
4326 if (params && detectInputFile(input_trees) == IN_NEXUS) {
4327 char *user_file = params->user_file;
4328 params->user_file = (char*) input_trees;
4329 params->split_weight_summary = SW_COUNT; // count number of splits
4330 sg.init(*params);
4331 params->user_file = user_file;
4332 for (SplitGraph::iterator it = sg.begin(); it != sg.end();)
4333 if ((*it)->getWeight() > weight_threshold) {
4334 hash_ss.insertSplit((*it), (*it)->getWeight());
4335 it++;
4336 } else {
4337 // delete the split
4338 if (it != sg.end()-1) {
4339 *(*it) = (*sg.back());
4340 }
4341 delete sg.back();
4342 sg.pop_back();
4343 }
4344 /* StrVector sgtaxname;
4345 sg.getTaxaName(sgtaxname);
4346 i = 0;
4347 for (StrVector::iterator sit = sgtaxname.begin(); sit != sgtaxname.end(); sit++, i++) {
4348 Node *leaf = mytree.findLeafName(*sit);
4349 if (!leaf) outError("Tree does not contain taxon ", *sit);
4350 leaf->id = i;
4351 }*/
4352 scale /= sg.maxWeight();
4353 } else {
4354 boot_trees.init(input_trees, rooted, burnin, max_count,
4355 tree_weight_file);
4356 boot_trees.convertSplits(sg, cutoff, SW_COUNT, weight_threshold);
4357 scale /= boot_trees.sumTreeWeights();
4358 cout << sg.size() << " splits found" << endl;
4359 }
4360 //sg.report(cout);
4361 if (verbose_mode >= VB_MED)
4362 cout << "Rescaling split weights by " << scale << endl;
4363 if (params->scaling_factor < 0)
4364 sg.scaleWeight(scale, true);
4365 else {
4366 sg.scaleWeight(scale, false, params->numeric_precision);
4367 }
4368
4369
4370
4371 //cout << "Creating greedy consensus tree..." << endl;
4372 MTree mytree;
4373 SplitGraph maxsg;
4374 sg.findMaxCompatibleSplits(maxsg);
4375
4376 if (verbose_mode >= VB_MAX)
4377 maxsg.saveFileStarDot(cout);
4378 //cout << "convert compatible split system into tree..." << endl;
4379 mytree.convertToTree(maxsg);
4380 //cout << "done" << endl;
4381 if (!mytree.rooted) {
4382 string taxname;
4383 if (params->root)
4384 taxname = params->root;
4385 else
4386 taxname = sg.getTaxa()->GetTaxonLabel(0);
4387 Node *node = mytree.findLeafName(taxname);
4388 if (node)
4389 mytree.root = node;
4390 }
4391 // mytree.scaleLength(100.0 / boot_trees.sumTreeWeights(), true);
4392
4393 // mytree.getTaxaID(maxsg.getSplitsBlock()->getCycle());
4394 //maxsg.saveFile(cout);
4395
4396 string out_file;
4397
4398 if (output_tree)
4399 out_file = output_tree;
4400 else {
4401 if (out_prefix)
4402 out_file = out_prefix;
4403 else
4404 out_file = input_trees;
4405 out_file += ".contree";
4406 }
4407
4408 // if (removed_seqs.size() > 0)
4409 // mytree.insertTaxa(removed_seqs, twin_seqs);
4410
4411 mytree.printTree(out_file.c_str(), WT_BR_CLADE);
4412 cout << "Consensus tree written to " << out_file << endl;
4413
4414 if (output_tree)
4415 out_file = output_tree;
4416 else {
4417 if (out_prefix)
4418 out_file = out_prefix;
4419 else
4420 out_file = input_trees;
4421 out_file += ".splits";
4422 }
4423
4424 //sg.scaleWeight(0.01, false, 4);
4425 if (params->print_splits_file) {
4426 sg.saveFile(out_file.c_str(), IN_OTHER, true);
4427 cout << "Non-trivial split supports printed to star-dot file " << out_file << endl;
4428 }
4429
4430 }
4431
computeConsensusNetwork(const char * input_trees,int burnin,int max_count,double cutoff,int weight_summary,double weight_threshold,const char * output_tree,const char * out_prefix,const char * tree_weight_file)4432 void computeConsensusNetwork(const char *input_trees, int burnin, int max_count,
4433 double cutoff, int weight_summary, double weight_threshold, const char *output_tree,
4434 const char *out_prefix, const char* tree_weight_file) {
4435 bool rooted = false;
4436
4437 // read the bootstrap tree file
4438 MTreeSet boot_trees(input_trees, rooted, burnin, max_count,
4439 tree_weight_file);
4440
4441 SplitGraph sg;
4442 //SplitIntMap hash_ss;
4443
4444 boot_trees.convertSplits(sg, cutoff, weight_summary, weight_threshold);
4445
4446 string out_file;
4447
4448 if (output_tree)
4449 out_file = output_tree;
4450 else {
4451 if (out_prefix)
4452 out_file = out_prefix;
4453 else
4454 out_file = input_trees;
4455 out_file += ".nex";
4456 }
4457
4458 sg.saveFile(out_file.c_str(), IN_NEXUS);
4459 cout << "Consensus network printed to " << out_file << endl;
4460
4461 if (output_tree)
4462 out_file = output_tree;
4463 else {
4464 if (out_prefix)
4465 out_file = out_prefix;
4466 else
4467 out_file = input_trees;
4468 out_file += ".splits";
4469 }
4470 if (verbose_mode >= VB_MED) {
4471 sg.saveFile(out_file.c_str(), IN_OTHER, true);
4472 cout << "Non-trivial split supports printed to star-dot file " << out_file << endl;
4473 }
4474
4475 }
4476