1 /***************************************************************************
2  *   Copyright (C) 2009-2015 by                                            *
3  *   BUI Quang Minh <minh.bui@univie.ac.at>                                *
4  *   Lam-Tung Nguyen <nltung@gmail.com>                                    *
5  *                                                                         *
6  *                                                                         *
7  *   This program is free software; you can redistribute it and/or modify  *
8  *   it under the terms of the GNU General Public License as published by  *
9  *   the Free Software Foundation; either version 2 of the License, or     *
10  *   (at your option) any later version.                                   *
11  *                                                                         *
12  *   This program is distributed in the hope that it will be useful,       *
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15  *   GNU General Public License for more details.                          *
16  *                                                                         *
17  *   You should have received a copy of the GNU General Public License     *
18  *   along with this program; if not, write to the                         *
19  *   Free Software Foundation, Inc.,                                       *
20  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
21  ***************************************************************************/
22 
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26 #include <iqtree_config.h>
27 #include "tree/phylotree.h"
28 #include "tree/phylosupertree.h"
29 #include "tree/phylosupertreeplen.h"
30 #include "tree/phylosupertreeunlinked.h"
31 #include "phyloanalysis.h"
32 #include "alignment/alignment.h"
33 #include "alignment/superalignment.h"
34 #include "alignment/superalignmentunlinked.h"
35 #include "tree/iqtree.h"
36 #include "tree/phylotreemixlen.h"
37 #include "model/modelmarkov.h"
38 #include "model/modeldna.h"
39 #include "model/modelpomo.h"
40 #include "nclextra/myreader.h"
41 #include "model/rateheterogeneity.h"
42 #include "model/rategamma.h"
43 #include "model/rateinvar.h"
44 #include "model/rategammainvar.h"
45 //#include "modeltest_wrapper.h"
46 #include "model/modelprotein.h"
47 #include "model/modelbin.h"
48 #include "model/modelcodon.h"
49 #include "utils/stoprule.h"
50 
51 #include "tree/mtreeset.h"
52 #include "tree/mexttree.h"
53 #include "model/ratemeyerhaeseler.h"
54 #include "whtest/whtest_wrapper.h"
55 #include "model/partitionmodel.h"
56 #include "model/modelmixture.h"
57 #include "model/modelfactorymixlen.h"
58 //#include "guidedbootstrap.h"
59 #include "model/modelset.h"
60 #include "utils/timeutil.h"
61 #include "tree/upperbounds.h"
62 #include "utils/MPIHelper.h"
63 #include "timetree.h"
64 
65 #ifdef USE_BOOSTER
66 extern "C" {
67 #include "booster/booster.h"
68 }
69 #endif
70 
71 #ifdef IQTREE_TERRAPHAST
72     #include "terrace/terrace.h"
73 #endif
74 
reportReferences(Params & params,ofstream & out)75 void reportReferences(Params &params, ofstream &out) {
76 
77     out << "To cite IQ-TREE please use:" << endl << endl
78     << "Bui Quang Minh, Heiko A. Schmidt, Olga Chernomor, Dominik Schrempf," << endl
79     << "Michael D. Woodhams, Arndt von Haeseler, and Robert Lanfear (2020)" << endl
80     << "IQ-TREE 2: New models and efficient methods for phylogenetic inference" << endl
81     << "in the genomic era. Mol. Biol. Evol., in press." << endl
82     << "https://doi.org/10.1093/molbev/msaa015" << endl << endl;
83 
84     bool modelfinder_only = false;
85     if (params.model_name.substr(0,4) == "TEST" || params.model_name.substr(0, 2) == "MF" || params.model_name.empty()) {
86         out << "To cite ModelFinder please use: " << endl << endl
87             << "Subha Kalyaanamoorthy, Bui Quang Minh, Thomas KF Wong, Arndt von Haeseler," << endl
88             << "and Lars S Jermiin (2017) ModelFinder: Fast model selection for" << endl
89             << "accurate phylogenetic estimates. Nature Methods, 14:587–589." << endl
90             << "https://doi.org/10.1038/nmeth.4285" << endl << endl;
91         if (params.model_name.find("ONLY") != string::npos || (params.model_name.substr(0,2)=="MF" && params.model_name.substr(0,3)!="MFP"))
92             modelfinder_only = true;
93     }
94     if (posPOMO(params.model_name) != string::npos) {
95         out << "For polymorphism-aware models please cite:" << endl << endl
96             << "Dominik Schrempf, Bui Quang Minh, Nicola De Maio, Arndt von Haeseler, and Carolin Kosiol" << endl
97             << "(2016) Reversible polymorphism-aware phylogenetic models and their application to" << endl
98             << "tree inference. J. Theor. Biol., 407:362–370." << endl
99             << "https://doi.org/10.1016/j.jtbi.2016.07.042" << endl << endl;
100     }
101 
102     if (params.site_freq_file || params.tree_freq_file)
103     out << "Since you used site-specific frequency model please also cite: " << endl << endl
104         << "Huai-Chun Wang, Edward Susko, Bui Quang Minh, and Andrew J. Roger (2018)" << endl
105         << "Modeling site heterogeneity with posterior mean site frequency profiles" << endl
106         << "accelerates accurate phylogenomic estimation. Syst. Biol., 67:216–235." << endl
107         << "https://doi.org/10.1093/sysbio/syx068" << endl << endl;
108 
109 
110     if (params.gbo_replicates)
111     out << "Since you used ultrafast bootstrap (UFBoot) please also cite: " << endl << endl
112         << "Diep Thi Hoang, Olga Chernomor, Arndt von Haeseler, Bui Quang Minh," << endl
113         << "and Le Sy Vinh (2018) UFBoot2: Improving the ultrafast bootstrap" << endl
114         << "approximation. Mol. Biol. Evol., 35:518–522." << endl
115         << "https://doi.org/10.1093/molbev/msx281" << endl << endl;
116 
117     if (params.partition_file)
118     out << "Since you used partition models please also cite:" << endl << endl
119         << "Olga Chernomor, Arndt von Haeseler, and Bui Quang Minh (2016)" << endl
120         << "Terrace aware data structure for phylogenomic inference from" << endl
121         << "supermatrices. Syst. Biol., 65:997-1008." << endl
122         << "https://doi.org/10.1093/sysbio/syw037" << endl << endl;
123 
124     if (params.terrace_analysis)
125     out << "Since you used terrace analysis please also cite:" << endl << endl
126         << "Biczok R, Bozsoky P, Eisenmann P, Ernst J, Ribizel T, Scholz F," << endl
127         << "Trefzer A, Weber F, Hamann M, Stamatakis A. (2018)" << endl
128         << "Two C++ libraries for counting trees on a phylogenetic" << endl
129         << "terrace. Bioinformatics 34:3399–3401." << endl
130         << "https://doi.org/10.1093/bioinformatics/bty384" << endl << endl;
131 
132     if (params.dating_method == "LSD")
133         out << "Since you used least square dating (LSD) please also cite: " << endl << endl
134         << "Thu-Hien To, Matthieu Jung, Samantha Lycett, Olivier Gascuel (2016)" << endl
135         << "Fast dating using least-squares criteria and algorithms. Syst. Biol. 65:82-97." << endl
136         << "https://doi.org/10.1093/sysbio/syv068" << endl << endl;
137 }
138 
reportAlignment(ofstream & out,Alignment & alignment,int nremoved_seqs)139 void reportAlignment(ofstream &out, Alignment &alignment, int nremoved_seqs) {
140     out << "Input data: " << alignment.getNSeq()+nremoved_seqs << " sequences with "
141             << alignment.getNSite() << " ";
142     switch (alignment.seq_type) {
143     case SEQ_BINARY: out << "binary"; break;
144     case SEQ_DNA: out << "nucleotide"; break;
145     case SEQ_PROTEIN: out << "amino-acid"; break;
146     case SEQ_CODON: out << "codon"; break;
147     case SEQ_MORPH: out << "morphological"; break;
148     case SEQ_POMO: out << "PoMo"; break;
149     default: out << "unknown"; break;
150     }
151     out << " sites" << endl << "Number of constant sites: "
152         << round(alignment.frac_const_sites * alignment.getNSite())
153         << " (= " << alignment.frac_const_sites * 100 << "% of all sites)" << endl
154 
155         << "Number of invariant (constant or ambiguous constant) sites: "
156         << round(alignment.frac_invariant_sites * alignment.getNSite())
157         << " (= " << alignment.frac_invariant_sites * 100 << "% of all sites)" << endl
158 
159         << "Number of parsimony informative sites: " << alignment.num_informative_sites << endl
160 
161         << "Number of distinct site patterns: " << alignment.size() << endl
162         << endl;
163 }
164 
165 /*
166 void pruneModelInfo(ModelCheckpoint &model_info, PhyloSuperTree *tree) {
167     ModelCheckpoint res_info;
168     for (vector<PartitionInfo>::iterator it = tree->part_info.begin(); it != tree->part_info.end(); it++) {
169         for (ModelCheckpoint::iterator mit = model_info.begin(); mit != model_info.end(); mit++)
170             if (mit->set_name == it->name)
171                 res_info.push_back(*mit);
172     }
173     model_info = res_info;
174 
175 }
176 */
reportModelSelection(ofstream & out,Params & params,ModelCheckpoint * model_info,PhyloTree * tree)177 void reportModelSelection(ofstream &out, Params &params, ModelCheckpoint *model_info, PhyloTree *tree) {
178     out << "Best-fit model according to " << criterionName(params.model_test_criterion) << ": ";
179 //    ModelCheckpoint::iterator it;
180     string best_model;
181     PhyloSuperTree *stree = (tree->isSuperTree()) ? ((PhyloSuperTree*)tree) : NULL;
182     if (tree->isSuperTree()) {
183         SuperAlignment *saln = (SuperAlignment*)stree->aln;
184         for (int part = 0; part != stree->size(); part++) {
185             if (part != 0)
186                 out << ",";
187             out << saln->partitions[part]->model_name << ":" << saln->partitions[part]->name;
188         }
189 //        string set_name = "";
190 //        for (it = model_info.begin(); it != model_info.end(); it++) {
191 //            if (it->set_name != set_name) {
192 //                if (set_name != "")
193 //                    out << ",";
194 //                out << it->name << ":" << it->set_name;
195 //                set_name = it->set_name;
196 //            }
197 //        }
198     } else {
199 //        out << model_info[0].name;
200         model_info->getBestModel(best_model);
201         out << best_model;
202     }
203 
204     if (tree->isSuperTree()) {
205         out << endl << endl << "List of best-fit models per partition:" << endl << endl;
206     } else {
207         out << endl << endl << "List of models sorted by "
208             << ((params.model_test_criterion == MTC_BIC) ? "BIC" :
209                 ((params.model_test_criterion == MTC_AIC) ? "AIC" : "AICc"))
210             << " scores: " << endl << endl;
211     }
212     if (tree->isSuperTree())
213         out << "  ID  ";
214     out << "Model                  LogL         AIC      w-AIC        AICc     w-AICc         BIC      w-BIC" << endl;
215     /*
216     if (is_partitioned)
217         out << "----------";
218 
219     out << "----------------------------------------------------------------------------------------" << endl;
220     */
221     int setid = 1;
222     out.precision(3);
223 
224     CandidateModelSet models;
225     model_info->getOrderedModels(tree, models);
226     for (auto it = models.begin(); it != models.end(); it++) {
227         if (tree->isSuperTree()) {
228             out.width(4);
229             out << right << setid << "  ";
230             setid++;
231         }
232         out.width(15);
233         out << left << it->getName() << " ";
234         out.width(11);
235         out << right << it->logl << " ";
236         out.width(11);
237         out    << it->AIC_score << ((it->AIC_conf) ? " + " : " - ");
238         out.unsetf(ios::fixed);
239         out.width(8);
240         out << it->AIC_weight << " ";
241         out.setf(ios::fixed);
242         out.width(11);
243         out << it->AICc_score << ((it->AICc_conf) ? " + " : " - ");
244         out.unsetf(ios::fixed);
245         out.width(8);
246         out << it->AICc_weight << " ";
247         out.setf(ios::fixed);
248         out.width(11);
249         out << it->BIC_score  << ((it->BIC_conf) ? " + " : " - ");
250         out.unsetf(ios::fixed);
251         out.width(8);
252         out << it->BIC_weight;
253         out.setf(ios::fixed);
254         out << endl;
255     }
256     out.precision(4);
257 
258     /* TODO
259     for (it = model_info.begin(); it != model_info.end(); it++) {
260         if (it->AIC_score == DBL_MAX) continue;
261         if (it != model_info.begin() && it->set_name != (it-1)->set_name)
262             setid++;
263         if (is_partitioned && it != model_info.begin() && it->set_name == (it-1)->set_name)
264             continue;
265         if (is_partitioned) {
266             out.width(4);
267             out << right << setid << "  ";
268         }
269         out.width(15);
270         out << left << it->name << " ";
271         out.width(11);
272         out << right << it->logl << " ";
273         out.width(11);
274         out    << it->AIC_score << ((it->AIC_conf) ? " + " : " - ") << it->AIC_weight << " ";
275         out.width(11);
276         out << it->AICc_score << ((it->AICc_conf) ? " + " : " - ") << it->AICc_weight << " ";
277         out.width(11);
278         out << it->BIC_score  << ((it->BIC_conf) ? " + " : " - ") << it->BIC_weight;
279         out << endl;
280     }
281     */
282     out << endl;
283     out <<  "AIC, w-AIC   : Akaike information criterion scores and weights." << endl
284          << "AICc, w-AICc : Corrected AIC scores and weights." << endl
285          << "BIC, w-BIC   : Bayesian information criterion scores and weights." << endl << endl
286 
287          << "Plus signs denote the 95% confidence sets." << endl
288          << "Minus signs denote significant exclusion." <<endl;
289     out << endl;
290 }
291 
reportModel(ofstream & out,Alignment * aln,ModelSubst * m)292 void reportModel(ofstream &out, Alignment *aln, ModelSubst *m) {
293     int i, j, k;
294     ASSERT(aln->num_states == m->num_states);
295     double *rate_mat = new double[m->num_states * m->num_states];
296     if (!m->isSiteSpecificModel())
297         m->getRateMatrix(rate_mat);
298     else
299         ((ModelSet*)m)->front()->getRateMatrix(rate_mat);
300 
301     if (m->num_states <= 4) {
302         out << "Rate parameter R:" << endl << endl;
303 
304         if (m->num_states > 4)
305             out << fixed;
306         if (m->isReversible()) {
307             for (i = 0, k = 0; i < m->num_states - 1; i++)
308                 for (j = i + 1; j < m->num_states; j++, k++) {
309                     out << "  " << aln->convertStateBackStr(i) << "-" << aln->convertStateBackStr(j) << ": "
310                             << rate_mat[k];
311                     if (m->num_states <= 4)
312                         out << endl;
313                     else if (k % 5 == 4)
314                         out << endl;
315                 }
316 
317         } else { // non-reversible model
318             for (i = 0, k = 0; i < m->num_states; i++)
319                 for (j = 0; j < m->num_states; j++)
320                     if (i != j) {
321                         out << "  " << aln->convertStateBackStr(i) << "-" << aln->convertStateBackStr(j)
322                                 << ": " << rate_mat[k];
323                         if (m->num_states <= 4)
324                             out << endl;
325                         else if (k % 5 == 4)
326                             out << endl;
327                         k++;
328                     }
329 
330         }
331 
332         //if (tree.aln->num_states > 4)
333         out << endl;
334         out.unsetf(ios_base::fixed);
335     } else if (aln->seq_type == SEQ_PROTEIN && m->getNDim() > 20) {
336         ASSERT(m->num_states == 20);
337         out << "WARNING: This model has " << m->getNDim() + m->getNDimFreq() << " parameters that may be overfitting. Please use with caution!" << endl << endl;
338         double full_mat[400];
339 
340         out.precision(6);
341 
342         if (m->isReversible()) {
343             for (i = 0, k = 0; i < m->num_states - 1; i++)
344                 for (j = i + 1; j < m->num_states; j++, k++) {
345                     full_mat[i*m->num_states+j] = rate_mat[k];
346                 }
347             out << "Substitution parameters (lower-diagonal) and state frequencies in PAML format (can be used as input for IQ-TREE): " << endl << endl;
348             for (i = 1; i < m->num_states; i++) {
349                 for (j = 0; j < i; j++)
350                     out << " " << full_mat[j*m->num_states+i];
351                 out << endl;
352             }
353         } else {
354             // non-reversible model
355             m->getQMatrix(full_mat);
356             out << "Full Q matrix and state frequencies (can be used as input for IQ-TREE): " << endl << endl;
357             for (i = 0; i < m->num_states; i++) {
358                 for (j = 0; j < m->num_states; j++)
359                     out << " " << full_mat[i*m->num_states+j];
360                 out << endl;
361             }
362         }
363         double state_freq[20];
364         m->getStateFrequency(state_freq);
365         for (i = 0; i < m->num_states; i++)
366             out << " " << state_freq[i];
367         out << endl << endl;
368         out.precision(4);
369     }
370 
371     delete[] rate_mat;
372 
373     if (aln->seq_type == SEQ_POMO) {
374         m->report(out);
375         return;
376     }
377 
378     out << "State frequencies: ";
379     if (m->isSiteSpecificModel())
380         out << "(site specific frequencies)" << endl << endl;
381     else {
382         // 2016-11-03: commented out as this is not correct anymore
383 //        if (!m->isReversible())
384 //            out << "(inferred from Q matrix)" << endl;
385 //        else
386             switch (m->getFreqType()) {
387             case FREQ_EMPIRICAL:
388                 out << "(empirical counts from alignment)" << endl;
389                 break;
390             case FREQ_ESTIMATE:
391                 out << "(estimated with maximum likelihood)" << endl;
392                 break;
393             case FREQ_USER_DEFINED:
394                 out << ((aln->seq_type == SEQ_PROTEIN) ? "(model)" : "(user-defined)") << endl;
395                 break;
396             case FREQ_EQUAL:
397                 out << "(equal frequencies)" << endl;
398                 break;
399             default:
400                 break;
401             }
402         out << endl;
403 
404         if ((m->getFreqType() != FREQ_USER_DEFINED || aln->seq_type == SEQ_DNA) && m->getFreqType() != FREQ_EQUAL) {
405             double *state_freqs = new double[m->num_states];
406             m->getStateFrequency(state_freqs);
407             int ncols=(aln->seq_type == SEQ_CODON) ? 4 : 1;
408             for (i = 0; i < m->num_states; i++) {
409                 out << "  pi(" << aln->convertStateBackStr(i) << ") = " << state_freqs[i];
410                 if (i % ncols == ncols-1)
411                     out << endl;
412             }
413             delete[] state_freqs;
414             out << endl;
415         }
416         if (m->num_states <= 4 || verbose_mode >= VB_MED) {
417             // report Q matrix
418             if (verbose_mode >= VB_MED)
419                 out.precision(6);
420             double *q_mat = new double[m->num_states * m->num_states];
421             m->getQMatrix(q_mat);
422 
423             out << "Rate matrix Q:" << endl << endl;
424             for (i = 0, k = 0; i < m->num_states; i++) {
425                 out << "  " << aln->convertStateBackStr(i);
426                 for (j = 0; j < m->num_states; j++, k++) {
427                     out << "  ";
428                     out.width(8);
429                     out << q_mat[k];
430                 }
431                 out << endl;
432             }
433             out << endl;
434             delete[] q_mat;
435         }
436     }
437 }
438 
reportModel(ofstream & out,PhyloTree & tree)439 void reportModel(ofstream &out, PhyloTree &tree) {
440 //    int i, j, k;
441     int i;
442 
443     if (tree.getModel()->isMixture() && !tree.getModel()->isPolymorphismAware()) {
444         out << "Mixture model of substitution: " << tree.getModelName() << endl;
445 //        out << "Full name: " << tree.getModelName() << endl;
446         ModelSubst *mmodel = tree.getModel();
447         out << endl << "  No  Component      Rate    Weight   Parameters" << endl;
448         i = 0;
449         int nmix = mmodel->getNMixtures();
450         for (i = 0; i < nmix; i++) {
451             ModelMarkov *m = (ModelMarkov*)mmodel->getMixtureClass(i);
452             out.width(4);
453             out << right << i+1 << "  ";
454             out.width(12);
455             out << left << (m)->name << "  ";
456             out.width(7);
457             out << (m)->total_num_subst << "  ";
458             out.width(7);
459             out << mmodel->getMixtureWeight(i) << "  " << (m)->getNameParams() << endl;
460 
461             if (tree.aln->seq_type == SEQ_POMO) {
462                 out << endl << "Model for mixture component "  << i+1 << ": " << (m)->name << endl;
463                 reportModel(out, tree.aln, m);
464             }
465         }
466         if (tree.aln->seq_type != SEQ_POMO && tree.aln->seq_type != SEQ_DNA)
467         for (i = 0; i < nmix; i++) {
468             ModelMarkov *m = (ModelMarkov*)mmodel->getMixtureClass(i);
469             if (m->getFreqType() == FREQ_EQUAL || m->getFreqType() == FREQ_USER_DEFINED)
470                 continue;
471             out << endl << "Model for mixture component "  << i+1 << ": " << (m)->name << endl;
472             reportModel(out, tree.aln, m);
473         }
474         out << endl;
475     } else {
476         out << "Model of substitution: " << tree.getModelName() << endl << endl;
477         reportModel(out, tree.aln, tree.getModel());
478     }
479 }
480 
reportRate(ostream & out,PhyloTree & tree)481 void reportRate(ostream &out, PhyloTree &tree) {
482     int i;
483     RateHeterogeneity *rate_model = tree.getRate();
484     out << "Model of rate heterogeneity: " << rate_model->full_name << endl;
485     rate_model->writeInfo(out);
486 
487     if (rate_model->getNDiscreteRate() > 1 || rate_model->getPInvar() > 0.0) {
488         out << endl << " Category  Relative_rate  Proportion" << endl;
489         if (rate_model->getPInvar() > 0.0)
490             out << "  0         0              " << rate_model->getPInvar()
491                     << endl;
492         int cats = rate_model->getNDiscreteRate();
493         DoubleVector prop;
494         if (rate_model->getGammaShape() > 0 || rate_model->getPtnCat(0) < 0) {
495 //            prop.resize(cats, (1.0 - rate_model->getPInvar()) / rate_model->getNRate());
496             prop.resize(cats);
497         for (i = 0; i < cats; i++)
498             prop[i] = rate_model->getProp(i);
499         } else {
500             prop.resize(cats, 0.0);
501             for (i = 0; i < tree.aln->getNPattern(); i++)
502                 prop[rate_model->getPtnCat(i)] += tree.aln->at(i).frequency;
503             for (i = 0; i < cats; i++)
504                 prop[i] /= tree.aln->getNSite();
505         }
506         for (i = 0; i < cats; i++) {
507             out << "  " << i + 1 << "         ";
508             out.width(14);
509             out << left << rate_model->getRate(i) << " " << prop[i];
510             out << endl;
511         }
512         if (rate_model->isGammaRate()) {
513             out << "Relative rates are computed as " << ((rate_model->isGammaRate() == GAMMA_CUT_MEDIAN) ? "MEDIAN" : "MEAN") <<
514                 " of the portion of the Gamma distribution falling in the category." << endl;
515         }
516     }
517     /*
518      if (rate_model->getNDiscreteRate() > 1 || rate_model->isSiteSpecificRate())
519      out << endl << "See file " << rate_file << " for site-specific rates and categories" << endl;*/
520     out << endl;
521 }
522 
reportTree(ofstream & out,Params & params,PhyloTree & tree,double tree_lh,double lh_variance,double main_tree)523 void reportTree(ofstream &out, Params &params, PhyloTree &tree, double tree_lh, double lh_variance, double main_tree) {
524     int ssize = tree.getAlnNSite();
525     double epsilon = 1.0 / ssize;
526     double totalLen = tree.treeLength();
527     int df = tree.getModelFactory()->getNParameters(BRLEN_OPTIMIZE);
528     double AIC_score, AICc_score, BIC_score;
529     computeInformationScores(tree_lh, df, ssize, AIC_score, AICc_score, BIC_score);
530 
531     out << "Log-likelihood of the tree: " << fixed << tree_lh;
532     if (lh_variance > 0.0)
533         out << " (s.e. " << sqrt(lh_variance) << ")";
534     out << endl;
535     out    << "Unconstrained log-likelihood (without tree): " << tree.aln->computeUnconstrainedLogL() << endl;
536 
537     out << "Number of free parameters (#branches + #model parameters): " << df << endl;
538 //    if (ssize > df) {
539 //        if (ssize > 40*df)
540 //            out    << "Akaike information criterion (AIC) score: " << AIC_score << endl;
541 //        else
542 //            out << "Corrected Akaike information criterion (AICc) score: " << AICc_score << endl;
543 //
544 //        out << "Bayesian information criterion (BIC) score: " << BIC_score << endl;
545 //    } else
546     out    << "Akaike information criterion (AIC) score: " << AIC_score << endl;
547     out << "Corrected Akaike information criterion (AICc) score: " << AICc_score << endl;
548     out << "Bayesian information criterion (BIC) score: " << BIC_score << endl;
549 
550     if (ssize <= df && main_tree) {
551 
552         out << endl
553             << "**************************** WARNING ****************************" << endl
554             << "Number of parameters (K, model parameters and branch lengths): " << df << endl
555             << "Sample size (n, alignment length): " << ssize << endl << endl
556             << "Given that K>=n, the parameter estimates might be inaccurate." << endl
557             << "Thus, phylogenetic estimates should be interpreted with caution." << endl << endl
558 
559             << "Ideally, it is desirable that n >> K. When selecting optimal models," << endl
560             << "1. use AIC or BIC if n > 40K;" << endl
561             << "2. use AICc or BIC if 40K >= n > K;" << endl
562             << "3. be extremely cautious if n <= K" << endl << endl
563 
564             << "To improve the situation (3), consider the following options:" << endl
565             << "  1. Increase the sample size (n)" << endl
566             << "  2. Decrease the number of parameters (K) to be estimated. If" << endl
567             << "     possible:" << endl
568             << "     a. Remove the least important sequences from the alignment" << endl
569             << "     b. Specify some of the parameter values for the substitution"<< endl
570             << "        model (e.g., the nucleotide or amino acid frequencies)" << endl
571             << "     c. Specify some of the parameter values for the rates-across-" << endl
572             << "        sites model (e.g., the shape parameter for the discrete" << endl
573             << "        Gamma distribution, the proportion of invariable sites, or" << endl
574             << "        the rates of change for different rate categories under" << endl
575             << "        the FreeRate model)" << endl << endl
576             << "Reference:" << endl
577             << "Burnham KR, Anderson DR (2002). Model Selection and Multimodel" << endl
578             << "Inference: A Practical Information-Theoretic Approach. Springer," << endl
579             << "New York." << endl
580             << "************************ END OF WARNING ***********************" << endl;
581     }
582     out << endl;
583 
584     if (tree.aln->seq_type == SEQ_POMO) {
585         int N = tree.aln->virtual_pop_size;
586         out << "NOTE: The branch lengths of PoMo measure mutations and frequency shifts." << endl;
587         out << "To compare PoMo branch lengths to DNA substitution models use the tree length" << endl;
588         out << "measured in substitutions per site." << endl << endl;
589         out << "PoMo branch length = Substitution model branch length * N * N." << endl << endl;
590         out << "Total tree length (sum of branch lengths)" << endl;
591         out << " - measured in number of mutations and frequency shifts per site: " << totalLen << endl;
592         out << " - measured in number of substitutions per site (divided by N^2): " << totalLen / (N * N) << endl;
593     }
594     else out << "Total tree length (sum of branch lengths): " << totalLen << endl;
595 
596     double totalLenInternal = tree.treeLengthInternal(epsilon);
597     double totalLenInternalP = totalLenInternal*100.0 / totalLen;
598     if (tree.aln->seq_type == SEQ_POMO) {
599       int N = tree.aln->virtual_pop_size;
600       double totLenIntSub = totalLenInternal/(N * N);
601         out << "Sum of internal branch lengths" << endl;
602         out << "- measured in mutations and frequency shifts per site: " << totalLenInternal << " (" << totalLenInternalP << "% of tree length)" << endl;
603         out << "- measured in substitutions per site: " << totLenIntSub << " (" << totalLenInternalP << "% of tree length)" << endl;
604         out << endl;
605     }
606     else {
607         out << "Sum of internal branch lengths: " << totalLenInternal << " (" << totalLenInternalP << "% of tree length)" << endl;
608         //    out << "Sum of internal branch lengths divided by total tree length: "
609         //            << totalLenInternal / totalLen << endl;
610         out << endl;
611     }
612 
613     if (tree.isMixlen()) {
614         DoubleVector lenvec;
615         tree.treeLengths(lenvec);
616         out << "Class tree lengths: ";
617         for (int i = 0; i < lenvec.size(); i++)
618             out << " " << lenvec[i];
619         out << endl;
620     }
621 
622     if (params.partition_type == TOPO_UNLINKED) {
623         out << "Tree topologies are unlinked across partitions, thus no drawing will be displayed here" << endl;
624         out << endl;
625         return;
626     }
627 
628     //out << "ZERO BRANCH EPSILON = " << epsilon << endl;
629     int zero_internal_branches = tree.countZeroInternalBranches(NULL, NULL, epsilon);
630     if (zero_internal_branches > 0) {
631         //int zero_internal_branches = tree.countZeroInternalBranches(NULL, NULL, epsilon);
632         /*
633         out << "WARNING: " << zero_branches
634                 << " branches of near-zero lengths (<" << epsilon << ") and should be treated with caution!"
635                 << endl;
636         */
637         out << "WARNING: " << zero_internal_branches
638                 << " near-zero internal branches (<" << epsilon << ") should be treated with caution"
639                 << endl;
640         /*
641         cout << endl << "WARNING: " << zero_branches
642                 << " branches of near-zero lengths (<" << epsilon << ") and should be treated with caution!"
643                 << endl;
644         */
645         out << "         Such branches are denoted by '**' in the figure below"
646                 << endl << endl;
647     }
648     int long_branches = tree.countLongBranches(NULL, NULL, params.max_branch_length-0.2);
649     if (long_branches > 0) {
650         //stringstream sstr;
651         out << "WARNING: " << long_branches << " too long branches (>"
652             << params.max_branch_length-0.2 << ") should be treated with caution!" << endl;
653         //out << sstr.str();
654         //cout << sstr.str();
655     }
656 
657             //<< "Total tree length: " << tree.treeLength() << endl << endl
658     tree.sortTaxa();
659     if (tree.rooted)
660         out << "NOTE: Tree is ROOTED at virtual root '" << tree.root->name << "'" << endl;
661     else
662         out << "NOTE: Tree is UNROOTED although outgroup taxon '" << tree.root->name << "' is drawn at root" << endl;
663 
664     if (tree.isSuperTree() && params.partition_type == BRLEN_OPTIMIZE)
665         out    << "NOTE: Branch lengths are weighted average over all partitions" << endl
666             << "      (weighted by the number of sites in the partitions)" << endl;
667     if (tree.isMixlen())
668         out << "NOTE: Branch lengths are weighted average over heterotachy classes" << endl;
669 
670     bool is_codon = tree.aln->seq_type == SEQ_CODON;
671     if (tree.isSuperTree()) {
672         PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
673         is_codon = true;
674         for (PhyloSuperTree::iterator sit = stree->begin(); sit != stree->end(); sit++)
675             if ((*sit)->aln->seq_type != SEQ_CODON) {
676                 is_codon = false;
677                 break;
678             }
679     }
680     if (is_codon)
681         out << endl << "NOTE: Branch lengths are interpreted as number of nucleotide substitutions per codon site!"
682                 << endl << "      Rescale them by 1/3 if you want to have #nt substitutions per nt site" << endl;
683     if (main_tree)
684     if (params.aLRT_replicates > 0 || params.gbo_replicates || (params.num_bootstrap_samples && params.compute_ml_tree)) {
685         out << "Numbers in parentheses are ";
686         if (params.aLRT_replicates > 0) {
687             out << "SH-aLRT support (%)";
688             if (params.localbp_replicates)
689                 out << " / local bootstrap support (%)";
690         }
691         if (params.aLRT_test)
692             out << " / parametric aLRT support";
693         if (params.aBayes_test)
694             out << " / aBayes support";
695         if (params.num_bootstrap_samples && params.compute_ml_tree) {
696             if (params.aLRT_replicates > 0 || params.aLRT_test || params.aBayes_test)
697                 out << " /";
698             out << " standard " << RESAMPLE_NAME << " support (%)";
699         }
700         if (params.gbo_replicates) {
701             if (params.aLRT_replicates > 0 || params.aLRT_test || params.aBayes_test)
702                 out << " /";
703             out << " ultrafast " << RESAMPLE_NAME << " support (%)";
704         }
705         out << endl;
706     }
707     out << endl;
708 
709     //tree.setExtendedFigChar();
710     tree.setRootNode(params.root, true);
711     tree.drawTree(out, WT_BR_SCALE, epsilon);
712 
713     out << "Tree in newick format:";
714     if (tree.isMixlen())
715         out << " (class branch lengths are given in [...] and separated by '/' )";
716     if (tree.aln->seq_type == SEQ_POMO)
717         out << " (measured in mutations and frequency shifts)";
718     out << endl << endl;
719 
720     tree.printTree(out, WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA);
721     out << endl << endl;
722 
723   if (tree.aln->seq_type == SEQ_POMO) {
724     out << "Tree in newick format (measured in substitutions, see above):" << endl;
725     out << "WARNING: Only for comparison with substitution models." << endl;
726     out << "         These are NOT the branch lengths inferred by PoMo." << endl << endl;
727     double len_scale_old = tree.len_scale;
728     int N = tree.aln->virtual_pop_size;
729     tree.len_scale = 1.0/(N*N);
730     tree.printTree(out, WT_BR_SCALE | WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA);
731     tree.len_scale = len_scale_old;
732     out << endl << endl;
733     }
734 
735   tree.setRootNode(params.root, false);
736 
737 }
738 
reportCredits(ofstream & out)739 void reportCredits(ofstream &out) {
740     out << "CREDITS" << endl << "-------" << endl << endl
741             << "Some parts of the code were taken from the following packages/libraries:"
742             << endl << endl
743             << "Schmidt HA, Strimmer K, Vingron M, and von Haeseler A (2002)" << endl
744             << "TREE-PUZZLE: maximum likelihood phylogenetic analysis using quartets" << endl
745             << "and parallel computing. Bioinformatics, 18(3):502-504." << endl << endl
746 
747             //<< "The source code to construct the BIONJ tree were taken from BIONJ software:"
748             //<< endl << endl
749             << "Gascuel O (1997) BIONJ: an improved version of the NJ algorithm" << endl
750             << "based on a simple model of sequence data. Mol. Bio. Evol., 14:685-695." << endl << endl
751 
752             //<< "The Nexus file parser was taken from the Nexus Class Library:"
753             //<< endl << endl
754             << "Paul O. Lewis (2003) NCL: a C++ class library for interpreting data files in" << endl
755             << "NEXUS format. Bioinformatics, 19(17):2330-2331." << endl << endl
756 
757             << "Mascagni M and Srinivasan A (2000) Algorithm 806: SPRNG: A Scalable Library" << endl
758             << "for Pseudorandom Number Generation. ACM Transactions on Mathematical Software," << endl
759             << "26: 436-461." << endl << endl
760 
761             << "Guennebaud G, Jacob B, et al. (2010) Eigen v3. http://eigen.tuxfamily.org" << endl << endl;
762             /*
763             << "The Modeltest 3.7 source codes were taken from:" << endl << endl
764             << "David Posada and Keith A. Crandall (1998) MODELTEST: testing the model of"
765             << endl << "DNA substitution. Bioinformatics, 14(9):817-8." << endl
766             */
767 }
768 
769 /***********************************************************
770  * CREATE REPORT FILE
771  ***********************************************************/
772 extern StringIntMap pllTreeCounter;
773 
774 void exhaustiveSearchGAMMAInvar(Params &params, IQTree &iqtree);
775 
776 void searchGAMMAInvarByRestarting(IQTree &iqtree);
777 
778 void computeLoglFromUserInputGAMMAInvar(Params &params, IQTree &iqtree);
779 
printOutfilesInfo(Params & params,IQTree & tree)780 void printOutfilesInfo(Params &params, IQTree &tree) {
781 
782     cout << endl << "Analysis results written to: " << endl;
783     if (!(params.suppress_output_flags & OUT_IQTREE))
784         cout<< "  IQ-TREE report:                " << params.out_prefix << ".iqtree"
785             << endl;
786     if (params.compute_ml_tree) {
787         if (!(params.suppress_output_flags & OUT_TREEFILE)) {
788             if (params.model_name.find("ONLY") != string::npos || (params.model_name.substr(0,2)=="MF" && params.model_name.substr(0,3)!="MFP"))
789                 cout << "  Tree used for ModelFinder:     " << params.out_prefix << ".treefile" << endl;
790             else {
791                 cout << "  Maximum-likelihood tree:       " << params.out_prefix << ".treefile" << endl;
792                 if (params.partition_type == BRLEN_OPTIMIZE && tree.isSuperTree())
793                     cout << "  Partition trees:               " << params.out_prefix << ".parttrees" << endl;
794             }
795         }
796 //        if (params.snni && params.write_local_optimal_trees) {
797 //            cout << "  Locally optimal trees (" << tree.candidateTrees.getNumLocalOptTrees() << "):    " << params.out_prefix << ".suboptimal_trees" << endl;
798 //        }
799     }
800     if (params.num_runs > 1)
801         cout << "  Trees from independent runs:   " << params.out_prefix << ".runtrees" << endl;
802 
803     if (!params.user_file && params.start_tree == STT_BIONJ) {
804         cout << "  BIONJ tree:                    " << params.out_prefix << ".bionj"
805                 << endl;
806     }
807     if (!params.dist_file) {
808         //cout << "  Juke-Cantor distances:    " << params.out_prefix << ".jcdist" << endl;
809         if (params.compute_ml_dist)
810         cout << "  Likelihood distances:          " << params.out_prefix
811                     << ".mldist" << endl;
812         if (params.print_conaln)
813         cout << "  Concatenated alignment:        " << params.out_prefix
814                     << ".conaln" << endl;
815     }
816     if ((params.model_name.find("TEST") != string::npos || params.model_name.substr(0,2) == "MF") && tree.isSuperTree()) {
817         cout << "  Best partitioning scheme:      " << params.out_prefix << ".best_scheme.nex" << endl;
818         bool raxml_format_printed = true;
819 
820         for (auto it = ((SuperAlignment*)tree.aln)->partitions.begin();
821                 it != ((SuperAlignment*)tree.aln)->partitions.end(); it++)
822             if (!(*it)->aln_file.empty()) {
823                 raxml_format_printed = false;
824                 break;
825             }
826         if (raxml_format_printed)
827              cout << "           in RAxML format:      " << params.out_prefix << ".best_scheme" << endl;
828     }
829     if ((tree.getRate()->getGammaShape() > 0 || params.partition_file) && params.print_site_rate)
830         cout << "  Site-specific rates:           " << params.out_prefix << ".rate"
831                 << endl;
832 
833     if ((tree.getRate()->isSiteSpecificRate() || tree.getRate()->getPtnCat(0) >= 0) && params.print_site_rate)
834         cout << "  Site-rates by MH model:        " << params.out_prefix << ".rate"
835                 << endl;
836 
837     if (params.print_site_lh)
838         cout << "  Site log-likelihoods:          " << params.out_prefix << ".sitelh"
839                 << endl;
840 
841     if (params.print_partition_lh)
842         cout << "  Partition log-likelihoods:     " << params.out_prefix << ".partlh"
843                 << endl;
844 
845     if (params.print_site_prob)
846         cout << "  Site probability per rate/mix: " << params.out_prefix << ".siteprob"
847                 << endl;
848 
849     if (params.print_ancestral_sequence) {
850         cout << "  Ancestral state:               " << params.out_prefix << ".state" << endl;
851 //        cout << "  Ancestral sequences:           " << params.out_prefix << ".aseq" << endl;
852     }
853 
854     if (params.write_intermediate_trees)
855         cout << "  All intermediate trees:        " << params.out_prefix << ".treels"
856                 << endl;
857 
858     if (params.writeDistImdTrees) {
859         tree.intermediateTrees.printTrees(string("ditrees"));
860         cout << "  Distinct intermediate trees:   " << params.out_prefix <<  ".ditrees" << endl;
861         cout << "  Logl of intermediate trees:    " << params.out_prefix <<  ".ditrees_lh" << endl;
862     }
863 
864     if (params.gbo_replicates) {
865         cout << endl << "Ultrafast " << RESAMPLE_NAME << " approximation results written to:" << endl;
866         if (!tree.isSuperTreeUnlinked())
867             cout << "  Split support values:          " << params.out_prefix << ".splits.nex" << endl
868              << "  Consensus tree:                " << params.out_prefix << ".contree" << endl;
869         if (params.print_ufboot_trees)
870         cout << "  UFBoot trees:                  " << params.out_prefix << ".ufboot" << endl;
871 
872     }
873 
874     if (!params.treeset_file.empty()) {
875         cout << "  Evaluated user trees:          " << params.out_prefix << ".trees" << endl;
876 
877         if (params.print_tree_lh) {
878         cout << "  Tree log-likelihoods:          " << params.out_prefix << ".treelh" << endl;
879         }
880     }
881     if (params.lmap_num_quartets >= 0) {
882         cout << "  Likelihood mapping plot (SVG): " << params.out_prefix << ".lmap.svg" << endl;
883         cout << "  Likelihood mapping plot (EPS): " << params.out_prefix << ".lmap.eps" << endl;
884     }
885     if (!(params.suppress_output_flags & OUT_LOG))
886         cout << "  Screen log file:               " << params.out_prefix << ".log" << endl;
887     /*    if (params.model_name == "WHTEST")
888      cout <<"  WH-TEST report:           " << params.out_prefix << ".whtest" << endl;*/
889 
890     cout << endl;
891 
892 }
893 
reportPhyloAnalysis(Params & params,IQTree & tree,ModelCheckpoint & model_info)894 void reportPhyloAnalysis(Params &params, IQTree &tree, ModelCheckpoint &model_info) {
895     if (!MPIHelper::getInstance().isMaster()) {
896         return;
897     }
898     if (params.suppress_output_flags & OUT_IQTREE) {
899         printOutfilesInfo(params, tree);
900         return;
901     }
902 
903     if (params.count_trees) {
904         // addon: print #distinct trees
905         cout << endl << "NOTE: " << pllTreeCounter.size() << " distinct trees evaluated during whole tree search" << endl;
906 
907         IntVector counts;
908         for (StringIntMap::iterator i = pllTreeCounter.begin(); i != pllTreeCounter.end(); i++) {
909             if (i->second > counts.size())
910                 counts.resize(i->second+1, 0);
911             counts[i->second]++;
912         }
913         for (IntVector::iterator i2 = counts.begin(); i2 != counts.end(); i2++) {
914             if (*i2 != 0) {
915                 cout << "#Trees occurring " << (i2-counts.begin()) << " times: " << *i2 << endl;
916             }
917         }
918     }
919     string outfile = params.out_prefix;
920 
921     outfile += ".iqtree";
922     try {
923         ofstream out;
924         out.exceptions(ios::failbit | ios::badbit);
925         out.open(outfile.c_str());
926         out << "IQ-TREE " << iqtree_VERSION_MAJOR << "." << iqtree_VERSION_MINOR
927             << iqtree_VERSION_PATCH << " built " << __DATE__ << endl
928                 << endl;
929         if (params.partition_file)
930             out << "Partition file name: " << params.partition_file << endl;
931         if (params.aln_file)
932             out << "Input file name: " << params.aln_file << endl;
933 
934         if (params.user_file)
935             out << "User tree file name: " << params.user_file << endl;
936         out << "Type of analysis: ";
937         bool modelfinder = params.model_name.substr(0,4)=="TEST" || params.model_name.substr(0,2) == "MF" || params.model_name.empty();
938         if (modelfinder)
939             out << "ModelFinder";
940         if (params.compute_ml_tree) {
941             if (modelfinder)
942                 out << " + ";
943             out << "tree reconstruction";
944         }
945         if (params.num_bootstrap_samples > 0) {
946             if (params.compute_ml_tree)
947                 out << " + ";
948             out << "non-parametric " << RESAMPLE_NAME << " (" << params.num_bootstrap_samples
949                     << " replicates)";
950         }
951         if (params.gbo_replicates > 0) {
952             out << " + ultrafast " << RESAMPLE_NAME << " (" << params.gbo_replicates << " replicates)";
953         }
954         out << endl;
955         out << "Random seed number: " << params.ran_seed << endl << endl;
956         out << "REFERENCES" << endl << "----------" << endl << endl;
957         reportReferences(params, out);
958 
959         out << "SEQUENCE ALIGNMENT" << endl << "------------------" << endl
960                 << endl;
961         if (tree.isSuperTree()) {
962       // TODO DS: Changes may be needed here for PoMo.
963             out << "Input data: " << tree.aln->getNSeq()+tree.removed_seqs.size() << " taxa with "
964                     << tree.aln->getNSite() << " partitions and "
965                     << tree.getAlnNSite() << " total sites ("
966                     << ((SuperAlignment*)tree.aln)->computeMissingData()*100 << "% missing data)" << endl << endl;
967 
968             PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
969             int namelen = stree->getMaxPartNameLength();
970             int part;
971             out.width(max(namelen+6,10));
972             out << left << "  ID  Name" << "  Type\tSeq\tSite\tUnique\tInfor\tInvar\tConst" << endl;
973             //out << string(namelen+54, '-') << endl;
974             part = 0;
975             for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++, part++) {
976                 //out << "FOR PARTITION " << stree->part_info[part].name << ":" << endl << endl;
977                 //reportAlignment(out, *((*it)->aln));
978                 out.width(4);
979                 out << right << part+1 << "  ";
980                 out.width(max(namelen,4));
981                 out << left << (*it)->aln->name << "  ";
982                 switch ((*it)->aln->seq_type) {
983                 case SEQ_BINARY: out << "BIN"; break;
984                 case SEQ_CODON: out << "CODON"; break;
985                 case SEQ_DNA: out << "DNA"; break;
986                 case SEQ_MORPH: out << "MORPH"; break;
987                 case SEQ_MULTISTATE: out << "MULTI"; break;
988                 case SEQ_PROTEIN: out << "AA"; break;
989                 case SEQ_POMO: out << "POMO"; break;
990                 case SEQ_UNKNOWN: out << "???"; break;
991                 }
992                 out << "\t" << (*it)->aln->getNSeq() << "\t" << (*it)->aln->getNSite()
993                     << "\t" << (*it)->aln->getNPattern() << "\t" << (*it)->aln->num_informative_sites
994                     << "\t" << (*it)->getAlnNSite() - (*it)->aln->num_variant_sites
995                     << "\t" << int((*it)->aln->frac_const_sites*(*it)->getAlnNSite()) << endl;
996             }
997             out << endl << "Column meanings:" << endl
998                 << "  Unique: Number of unique site patterns" << endl
999                 << "  Infor:  Number of parsimony-informative sites" << endl
1000                 << "  Invar:  Number of invariant sites" << endl
1001                 << "  Const:  Number of constant sites (can be subset of invariant sites)" << endl << endl;
1002 
1003         } else
1004             reportAlignment(out, *(tree.aln), tree.removed_seqs.size());
1005 
1006         out.precision(4);
1007         out << fixed;
1008 
1009         if (!model_info.empty()) {
1010             out << "ModelFinder" << endl << "-----------" << endl << endl;
1011 //            if (tree.isSuperTree())
1012 //                pruneModelInfo(model_info, (PhyloSuperTree*)&tree);
1013             reportModelSelection(out, params, &model_info, &tree);
1014         }
1015 
1016         out << "SUBSTITUTION PROCESS" << endl << "--------------------" << endl
1017                 << endl;
1018         if (tree.isSuperTree()) {
1019             if(params.partition_type == BRLEN_SCALE)
1020                 out << "Edge-linked-proportional partition model with ";
1021             else if(params.partition_type == BRLEN_FIX)
1022                 out << "Edge-linked-equal partition model with ";
1023             else if (params.partition_type == BRLEN_OPTIMIZE)
1024                 out << "Edge-unlinked partition model with ";
1025             else
1026                 out << "Topology-unlinked partition model with ";
1027 
1028             if (params.model_joint)
1029                 out << "joint substitution model ";
1030             else
1031                 out << "separate substitution models ";
1032             if (params.link_alpha)
1033                 out << "and joint gamma shape";
1034             else
1035                 out << "and separate rates across sites";
1036             out << endl << endl;
1037 
1038             PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
1039             PhyloSuperTree::iterator it;
1040             int part;
1041             if(params.partition_type == BRLEN_OPTIMIZE || params.partition_type == TOPO_UNLINKED)
1042                 out << "  ID  Model         TreeLen  Parameters" << endl;
1043             else
1044                 out << "  ID  Model           Speed  Parameters" << endl;
1045             //out << "-------------------------------------" << endl;
1046             for (it = stree->begin(), part = 0; it != stree->end(); it++, part++) {
1047                 out.width(4);
1048                 out << right << (part+1) << "  ";
1049                 out.width(14);
1050                 if(params.partition_type == BRLEN_OPTIMIZE || params.partition_type == TOPO_UNLINKED)
1051                     out << left << (*it)->getModelName() << " " << (*it)->treeLength() << "  " << (*it)->getModelNameParams() << endl;
1052                 else
1053                     out << left << (*it)->getModelName() << " " << stree->part_info[part].part_rate  << "  " << (*it)->getModelNameParams() << endl;
1054             }
1055             out << endl;
1056             /*
1057             for (it = stree->begin(), part = 0; it != stree->end(); it++, part++) {
1058                 reportModel(out, *(*it));
1059                 reportRate(out, *(*it));
1060             }*/
1061             PartitionModel *part_model = (PartitionModel*)tree.getModelFactory();
1062             for (auto itm = part_model->linked_models.begin(); itm != part_model->linked_models.end(); itm++) {
1063                 for (it = stree->begin(); it != stree->end(); it++)
1064                     if ((*it)->getModel() == itm->second) {
1065                         out << "Linked model of substitution: " << itm->second->getName() << endl << endl;
1066                         bool fixed = (*it)->getModel()->fixParameters(false);
1067                         reportModel(out, (*it)->aln, (*it)->getModel());
1068                         (*it)->getModel()->fixParameters(fixed);
1069                         break;
1070                     }
1071             }
1072         } else {
1073             reportModel(out, tree);
1074             reportRate(out, tree);
1075         }
1076 
1077             if (params.lmap_num_quartets >= 0) {
1078             tree.reportLikelihoodMapping(out);
1079         }
1080 
1081 
1082         /*
1083         out << "RATE HETEROGENEITY" << endl << "------------------" << endl
1084                 << endl;
1085         if (tree.isSuperTree()) {
1086             PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
1087             int part = 0;
1088             for (PhyloSuperTree::iterator it = stree->begin();
1089                     it != stree->end(); it++, part++) {
1090                 out << "FOR PARTITION " << stree->part_info[part].name << ":"
1091                         << endl << endl;
1092                 reportRate(out, *(*it));
1093             }
1094         } else
1095             reportRate(out, tree);
1096         */
1097         // Bootstrap analysis:
1098         //Display as outgroup: a
1099 
1100         if (params.model_name == "WHTEST") {
1101             out << "TEST OF MODEL HOMOGENEITY" << endl
1102                     << "-------------------------" << endl << endl;
1103             out << "Delta of input data:                 "
1104                     << params.whtest_delta << endl;
1105             out << ".95 quantile of Delta distribution:  "
1106                     << params.whtest_delta_quantile << endl;
1107             out << "Number of simulations performed:     "
1108                     << params.whtest_simulations << endl;
1109             out << "P-value:                             "
1110                     << params.whtest_p_value << endl;
1111             if (params.whtest_p_value < 0.05) {
1112                 out
1113                         << "RESULT: Homogeneity assumption is rejected (p-value cutoff 0.05)"
1114                         << endl;
1115             } else {
1116                 out
1117                         << "RESULT: Homogeneity assumption is NOT rejected (p-value cutoff 0.05)"
1118                         << endl;
1119             }
1120             out << endl << "*** For this result please cite:" << endl << endl;
1121             out
1122                     << "G. Weiss and A. von Haeseler (2003) Testing substitution models"
1123                     << endl
1124                     << "within a phylogenetic tree. Mol. Biol. Evol, 20(4):572-578"
1125                     << endl << endl;
1126         }
1127 
1128         if (params.num_runs > 1) {
1129             out << "MULTIPLE RUNS" << endl
1130                 << "-------------" << endl << endl;
1131             out << "Run     logL" << endl;
1132             DoubleVector runLnL;
1133             tree.getCheckpoint()->getVector("runLnL", runLnL);
1134             for (int run = 0; run < runLnL.size(); run++)
1135                 out << run+1 << "\t" << fixed << runLnL[run] << endl;
1136             out << endl;
1137         }
1138 /*
1139         out << "TREE SEARCH" << endl << "-----------" << endl << endl
1140                 << "Stopping rule: "
1141                 << ((params.stop_condition == SC_STOP_PREDICT) ? "Yes" : "No")
1142                 << endl << "Number of iterations: "
1143                 << tree.stop_rule.getNumIterations() << endl
1144                 << "Probability of deleting sequences: " << params.p_delete
1145                 << endl << "Number of representative leaves: "
1146                 << params.k_representative << endl
1147                 << "NNI log-likelihood cutoff: " << tree.getNNICutoff() << endl
1148                 << endl;
1149 */
1150         if (params.compute_ml_tree) {
1151             if (params.model_name.find("ONLY") != string::npos || (params.model_name.substr(0,2) == "MF" && params.model_name.substr(0,3) != "MFP")) {
1152                 out << "TREE USED FOR ModelFinder" << endl
1153                     << "-------------------------" << endl << endl;
1154             } else if (params.min_iterations == 0) {
1155                 if (params.user_file)
1156                     out << "USER TREE" << endl
1157                         << "---------" << endl << endl;
1158                 else
1159                     out << "STARTING TREE" << endl
1160                         << "-------------" << endl << endl;
1161             } else {
1162                 out << "MAXIMUM LIKELIHOOD TREE" << endl
1163                     << "-----------------------" << endl << endl;
1164             }
1165 
1166             tree.setRootNode(params.root);
1167 
1168             if (params.gbo_replicates) {
1169                 if (tree.boot_consense_logl > tree.getBestScore() + 0.1 && !tree.isSuperTreeUnlinked()) {
1170                     out << endl << "**NOTE**: Consensus tree has higher likelihood than ML tree found! Please use consensus tree below." << endl;
1171                 }
1172             }
1173 
1174             reportTree(out, params, tree, tree.getBestScore(), tree.logl_variance, true);
1175 
1176             if (tree.isSuperTree() && verbose_mode >= VB_MED) {
1177                 PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
1178 //                stree->mapTrees();
1179 //                int empty_branches = stree->countEmptyBranches();
1180 //                if (empty_branches) {
1181 //                    stringstream ss;
1182 //                    ss << empty_branches << " branches in the overall tree with no phylogenetic information due to missing data!";
1183 //                    outWarning(ss.str());
1184 //                }
1185 
1186                 int part = 0;
1187                 for (PhyloSuperTree::iterator it = stree->begin();
1188                         it != stree->end(); it++, part++) {
1189                     out << "FOR PARTITION " << (*it)->aln->name
1190                             << ":" << endl << endl;
1191                     (*it)->setRootNode(params.root);
1192 //                    reportTree(out, params, *(*it), (*it)->computeLikelihood(), (*it)->computeLogLVariance(), false);
1193                     reportTree(out, params, *(*it), stree->part_info[part].cur_score, 0.0, false);
1194                 }
1195             }
1196 
1197         }
1198         /*
1199          if (params.write_intermediate_trees) {
1200          out << endl << "CONSENSUS OF INTERMEDIATE TREES" << endl << "-----------------------" << endl << endl
1201          << "Number of intermediate trees: " << tree.stop_rule.getNumIterations() << endl
1202          << "Split threshold: " << params.split_threshold << endl
1203          << "Burn-in: " << params.tree_burnin << endl << endl;
1204          }*/
1205 
1206         if (params.consensus_type == CT_CONSENSUS_TREE && !tree.isSuperTreeUnlinked()) {
1207             out << "CONSENSUS TREE" << endl << "--------------" << endl << endl;
1208             out << "Consensus tree is constructed from "
1209                     << (params.num_bootstrap_samples ? params.num_bootstrap_samples : params.gbo_replicates)
1210                     << " " << RESAMPLE_NAME << " trees";
1211             if (params.gbo_replicates || params.num_bootstrap_samples) {
1212                 out << endl << "Log-likelihood of consensus tree: " << fixed << tree.boot_consense_logl;
1213             }
1214             string con_file = params.out_prefix;
1215             con_file += ".contree";
1216 
1217             // -- Mon Apr 17 21:14:53 BST 2017
1218             // DONE Minh: merged correctly
1219             if (params.compute_ml_tree)
1220                 out << endl << "Robinson-Foulds distance between ML tree and consensus tree: "
1221                     << tree.contree_rfdist << endl;
1222             // --
1223 
1224             out << endl << "Branches with support >"
1225                     << floor(params.split_threshold * 1000) / 10 << "% are kept";
1226             if (params.split_threshold == 0.0)
1227                 out << " (extended consensus)";
1228             if (params.split_threshold == 0.5)
1229                 out << " (majority-rule consensus)";
1230             if (params.split_threshold >= 0.99)
1231                 out << " (strict consensus)";
1232 
1233             out << endl << "Branch lengths are optimized by maximum likelihood on original alignment" << endl;
1234             out << "Numbers in parentheses are " << RESAMPLE_NAME << " supports (%)" << endl << endl;
1235 
1236             bool rooted = false;
1237             MTree contree;
1238             contree.readTree(con_file.c_str(), rooted);
1239             contree.drawTree(out, WT_BR_SCALE);
1240             out << endl << "Consensus tree in newick format: " << endl << endl;
1241             contree.printTree(out);
1242             out << endl << endl;
1243 //            tree.freeNode();
1244 //            tree.root = NULL;
1245 //            tree.readTree(con_file.c_str(), rooted);
1246 //            if (removed_seqs.size() > 0) {
1247 //                tree.reinsertIdenticalSeqs(tree.aln, removed_seqs, twin_seqs);
1248 //            }
1249 //            tree.setAlignment(tree.aln);
1250 
1251             // bug fix
1252 //            if ((tree.sse == LK_EIGEN || tree.sse == LK_EIGEN_SSE) && !tree.isBifurcating()) {
1253 //                cout << "NOTE: Changing to old kernel as consensus tree is multifurcating" << endl;
1254 //                tree.changeLikelihoodKernel(LK_SSE);
1255 //            }
1256 
1257 //            tree.initializeAllPartialLh();
1258 //            tree.fixNegativeBranch(false);
1259 //            if (tree.isSuperTree())
1260 //                ((PhyloSuperTree*) &tree)->mapTrees();
1261 //            tree.optimizeAllBranches();
1262 //            tree.printTree(con_file.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA);
1263 //            tree.sortTaxa();
1264 //            tree.drawTree(out, WT_BR_SCALE);
1265 //            out << endl << "Consensus tree in newick format: " << endl << endl;
1266 //            tree.printResultTree(out);
1267 //            out << endl << endl;
1268         }
1269 #ifdef IQTREE_TERRAPHAST
1270         if (params.terrace_analysis && params.compute_ml_tree) {
1271 
1272             out << "TERRACE ANALYSIS" << endl << "----------------" << endl << endl;
1273             cout << "Running additional analysis: Phylogenetic Terraces ..."<< endl;
1274 
1275             string filename = params.out_prefix;
1276             filename += ".terrace";
1277 
1278             try
1279             {
1280                 Terrace terrace(tree, (SuperAlignment*)(tree.aln));
1281 
1282                 uint64_t terrace_size = terrace.getSize();
1283 
1284                 if (terrace_size == 1) {
1285                     out << "The tree does not lie on a terrace." << endl;
1286                 } else {
1287                     out << "The tree lies on a terrace of size ";
1288 
1289                     if (terrace_size == UINT64_MAX) {
1290                         out << "at least " << terrace_size << " (integer overflow)";
1291                     } else {
1292                         out << terrace_size;
1293                     }
1294 
1295                     out << endl;
1296 
1297                     ofstream terraceout;
1298                     terraceout.open(filename.c_str());
1299 
1300                     terrace.printTreesCompressed(terraceout);
1301 
1302                     terraceout.close();
1303 
1304                     out << "Terrace trees written (in compressed Newick format) to " << filename << endl;
1305                 }
1306             }
1307             catch (std::exception& e)
1308             {
1309                 out << "ERROR: Terrace analysis using Terraphast failed: " << e.what() << endl << endl;
1310             }
1311 
1312             out << endl;
1313             out << "For documentation, see the technical supplement to Biczok et al. (2018)" << endl;
1314             out << "https://doi.org/10.1093/bioinformatics/bty384";
1315 
1316             out << endl << endl;
1317             cout<< "Done. Results are written in "<<params.out_prefix<<".iqtree file."<<endl;
1318         }
1319 #endif
1320         /* evaluate user trees */
1321         vector<TreeInfo> info;
1322         IntVector distinct_trees;
1323         if (!params.treeset_file.empty()) {
1324             evaluateTrees(params.treeset_file, params, &tree, info, distinct_trees);
1325             out.precision(4);
1326             out.setf(ios_base::fixed);
1327 
1328 			out << endl << "USER TREES" << endl << "----------" << endl << endl;
1329 			out << "See " << params.out_prefix << ".trees for trees with branch lengths." << endl << endl;
1330 			if (params.topotest_replicates && info.size() > 1) {
1331                 if (params.do_au_test && params.topotest_replicates < 10000)
1332                     out << "WARNING: Too few replicates for AU test. At least -zb 10000 for reliable results!" << endl << endl;
1333                 out << "Tree      logL    deltaL  bp-RELL    p-KH     p-SH    ";
1334 				if (params.do_weighted_test)
1335 					out << "p-WKH    p-WSH    ";
1336                 out << "   c-ELW";
1337                 if (params.do_au_test)
1338                     out << "       p-AU";
1339 
1340                 out << endl << "------------------------------------------------------------------";
1341                 if (params.do_weighted_test)
1342                     out << "------------------";
1343                 if (params.do_au_test)
1344                     out << "-------";
1345                 out << endl;
1346 			} else {
1347 				out << "Tree      logL    deltaL" << endl;
1348 				out << "-------------------------" << endl;
1349 
1350 			}
1351 			double maxL = -DBL_MAX;
1352 			int tid, orig_id;
1353 			for (tid = 0; tid < info.size(); tid++)
1354 				if (info[tid].logl > maxL) maxL = info[tid].logl;
1355 			for (orig_id = 0, tid = 0; orig_id < distinct_trees.size(); orig_id++) {
1356 				out.width(3);
1357 				out << right << orig_id+1 << " ";
1358 				if (distinct_trees[orig_id] >= 0) {
1359 					out << " = tree " << distinct_trees[orig_id]+1 << endl;
1360 					continue;
1361 				}
1362                 out.unsetf(ios::fixed);
1363 				out.precision(10);
1364 				out.width(12);
1365 				out << info[tid].logl << " ";
1366 				out.width(7);
1367                 out.precision(5);
1368 				out << maxL - info[tid].logl;
1369 				if (!params.topotest_replicates || info.size() <= 1) {
1370 					out << endl;
1371 					tid++;
1372 					continue;
1373 				}
1374 				out.precision(3);
1375 				out << "  ";
1376 				out.width(6);
1377 				out << info[tid].rell_bp;
1378 				if (info[tid].rell_confident)
1379 					out << " + ";
1380 				else
1381 					out << " - ";
1382 				out.width(6);
1383 				out << right << info[tid].kh_pvalue;
1384 				if (info[tid].kh_pvalue < 0.05)
1385 					out << " - ";
1386 				else
1387 					out << " + ";
1388 				out.width(6);
1389 				out << right << info[tid].sh_pvalue;
1390 				if (info[tid].sh_pvalue < 0.05)
1391 					out << " - ";
1392 				else
1393 					out << " + ";
1394 
1395 				if (params.do_weighted_test) {
1396 					out.width(6);
1397 					out << right << info[tid].wkh_pvalue;
1398 					if (info[tid].wkh_pvalue < 0.05)
1399 						out << " - ";
1400 					else
1401 						out << " + ";
1402 					out.width(6);
1403 					out << right << info[tid].wsh_pvalue;
1404 					if (info[tid].wsh_pvalue < 0.05)
1405 						out << " - ";
1406 					else
1407 						out << " + ";
1408 				}
1409 				out.width(9);
1410 				out << right << info[tid].elw_value;
1411 				if (info[tid].elw_confident)
1412 					out << " + ";
1413 				else
1414 					out << " - ";
1415 
1416                 if (params.do_au_test) {
1417                     out.width(8);
1418                     out << right << info[tid].au_pvalue;
1419                     if (info[tid].au_pvalue < 0.05)
1420                         out << " - ";
1421                     else
1422                         out << " + ";
1423                 }
1424                 out.setf(ios::fixed);
1425 
1426                 out << endl;
1427                 tid++;
1428             }
1429             out << endl;
1430 
1431             if (params.topotest_replicates) {
1432                 out <<  "deltaL  : logL difference from the maximal logl in the set." << endl
1433                      << "bp-RELL : bootstrap proportion using RELL method (Kishino et al. 1990)." << endl
1434                      << "p-KH    : p-value of one sided Kishino-Hasegawa test (1989)." << endl
1435                      << "p-SH    : p-value of Shimodaira-Hasegawa test (2000)." << endl;
1436                 if (params.do_weighted_test) {
1437                     out << "p-WKH   : p-value of weighted KH test." << endl
1438                      << "p-WSH   : p-value of weighted SH test." << endl;
1439                 }
1440                 out     << "c-ELW   : Expected Likelihood Weight (Strimmer & Rambaut 2002)." << endl;
1441                 if (params.do_au_test) {
1442                     out << "p-AU    : p-value of approximately unbiased (AU) test (Shimodaira, 2002)." << endl;
1443                 }
1444                 out  << endl
1445                      << "Plus signs denote the 95% confidence sets." << endl
1446                      << "Minus signs denote significant exclusion."  << endl
1447                      << "All tests performed "
1448                      << params.topotest_replicates << " resamplings using the RELL method."<<endl;
1449             }
1450             out << endl;
1451         }
1452 
1453 
1454         time_t cur_time;
1455         time(&cur_time);
1456 
1457         char *date_str;
1458         date_str = ctime(&cur_time);
1459         out.unsetf(ios_base::fixed);
1460         out << "TIME STAMP" << endl << "----------" << endl << endl
1461                 << "Date and time: " << date_str << "Total CPU time used: "
1462                 << (double) params.run_time << " seconds (" << convert_time(params.run_time) << ")" << endl
1463                 << "Total wall-clock time used: " << getRealTime() - params.start_real_time
1464                 << " seconds (" << convert_time(getRealTime() - params.start_real_time) << ")" << endl << endl;
1465 
1466         //reportCredits(out); // not needed, now in the manual
1467         out.close();
1468 
1469     } catch (ios::failure) {
1470         outError(ERR_WRITE_OUTPUT, outfile);
1471     }
1472 
1473     printOutfilesInfo(params, tree);
1474 }
1475 
checkZeroDist(Alignment * aln,double * dist)1476 void checkZeroDist(Alignment *aln, double *dist) {
1477     int ntaxa = aln->getNSeq();
1478     IntVector checked;
1479     checked.resize(ntaxa, 0);
1480     int i, j;
1481     for (i = 0; i < ntaxa - 1; i++) {
1482         if (checked[i])
1483             continue;
1484         string str = "";
1485         bool first = true;
1486         for (j = i + 1; j < ntaxa; j++)
1487             if (dist[i * ntaxa + j] <= Params::getInstance().min_branch_length) {
1488                 if (first)
1489                     str = "ZERO distance between sequences "
1490                             + aln->getSeqName(i);
1491                 str += ", " + aln->getSeqName(j);
1492                 checked[j] = 1;
1493                 first = false;
1494             }
1495         checked[i] = 1;
1496         if (str != "")
1497             outWarning(str);
1498     }
1499 }
1500 
1501 
printAnalysisInfo(int model_df,IQTree & iqtree,Params & params)1502 void printAnalysisInfo(int model_df, IQTree& iqtree, Params& params) {
1503 //    if (!params.raxmllib) {
1504     cout << "Model of evolution: ";
1505     if (iqtree.isSuperTree()) {
1506         cout << iqtree.getModelName() << " (" << model_df << " free parameters)" << endl;
1507     } else {
1508         cout << iqtree.getModelName() << " with ";
1509         switch (iqtree.getModel()->getFreqType()) {
1510         case FREQ_EQUAL:
1511             cout << "equal";
1512             break;
1513         case FREQ_EMPIRICAL:
1514             cout << "counted";
1515             break;
1516         case FREQ_USER_DEFINED:
1517             cout << "user-defined";
1518             break;
1519         case FREQ_ESTIMATE:
1520             cout << "optimized";
1521             break;
1522         case FREQ_CODON_1x4:
1523             cout << "counted 1x4";
1524             break;
1525         case FREQ_CODON_3x4:
1526             cout << "counted 3x4";
1527             break;
1528         case FREQ_CODON_3x4C:
1529             cout << "counted 3x4-corrected";
1530             break;
1531         case FREQ_DNA_RY:
1532             cout << "constrained A+G=C+T";
1533             break;
1534         case FREQ_DNA_WS:
1535             cout << "constrained A+T=C+G";
1536             break;
1537         case FREQ_DNA_MK:
1538             cout << "constrained A+C=G+T";
1539             break;
1540         case FREQ_DNA_1112:
1541             cout << "constrained A=C=G";
1542             break;
1543         case FREQ_DNA_1121:
1544             cout << "constrained A=C=T";
1545             break;
1546         case FREQ_DNA_1211:
1547             cout << "constrained A=G=T";
1548             break;
1549         case FREQ_DNA_2111:
1550             cout << "constrained C=G=T";
1551             break;
1552         case FREQ_DNA_1122:
1553             cout << "constrained A=C,G=T";
1554             break;
1555         case FREQ_DNA_1212:
1556             cout << "constrained A=G,C=T";
1557             break;
1558         case FREQ_DNA_1221:
1559             cout << "constrained A=T,C=G";
1560             break;
1561         case FREQ_DNA_1123:
1562             cout << "constrained A=C";
1563             break;
1564         case FREQ_DNA_1213:
1565             cout << "constrained A=G";
1566             break;
1567         case FREQ_DNA_1231:
1568             cout << "constrained A=T";
1569             break;
1570         case FREQ_DNA_2113:
1571             cout << "constrained C=G";
1572             break;
1573         case FREQ_DNA_2131:
1574             cout << "constrained C=T";
1575             break;
1576         case FREQ_DNA_2311:
1577             cout << "constrained G=T";
1578             break;
1579         default:
1580             outError("Wrong specified state frequencies");
1581         }
1582         cout << " frequencies (" << model_df << " free parameters)" << endl;
1583     }
1584     cout << "Fixed branch lengths: "
1585             << ((params.fixed_branch_length) ? "Yes" : "No") << endl;
1586 
1587     if (params.min_iterations > 0) {
1588         cout << "Tree search algorithm: " << (params.snni ? "Stochastic nearest neighbor interchange" : "IQPNNI") << endl;
1589         cout << "Termination condition: ";
1590         if (params.stop_condition == SC_REAL_TIME) {
1591             cout << "after " << params.maxtime << " minutes" << endl;
1592         } else if (params.stop_condition == SC_UNSUCCESS_ITERATION) {
1593             cout << "after " << params.unsuccess_iteration << " unsuccessful iterations" << endl;
1594         } else if (params.stop_condition == SC_FIXED_ITERATION) {
1595                 cout << params.min_iterations << " iterations" << endl;
1596         } else if(params.stop_condition == SC_WEIBULL) {
1597                 cout << "predicted in [" << params.min_iterations << ","
1598                         << params.max_iterations << "] (confidence "
1599                         << params.stop_confidence << ")" << endl;
1600         } else if (params.stop_condition == SC_BOOTSTRAP_CORRELATION) {
1601             cout << "min " << params.min_correlation << " correlation coefficient" << endl;
1602         }
1603 
1604         if (!params.snni) {
1605             cout << "Number of representative leaves  : " << params.k_representative << endl;
1606             cout << "Probability of deleting sequences: " << iqtree.getProbDelete() << endl;
1607             cout << "Number of leaves to be deleted   : " << iqtree.getDelete() << endl;
1608             cout << "Important quartets assessed on: "
1609                     << ((params.iqp_assess_quartet == IQP_DISTANCE) ?
1610                             "Distance" : ((params.iqp_assess_quartet == IQP_PARSIMONY) ? "Parsimony" : "Bootstrap"))
1611                     << endl;
1612         }
1613         cout << "NNI assessed on: " << ((params.nni5) ? "5 branches" : "1 branch") << endl;
1614     }
1615     cout << "Phylogenetic likelihood library: " << (params.pll ? "Yes" : "No") << endl;
1616     if (params.fixed_branch_length != BRLEN_FIX)
1617         cout << "Branch length optimization method: "
1618             << ((iqtree.optimize_by_newton) ? "Newton" : "Brent") << endl;
1619     cout << "Number of Newton-Raphson steps in NNI evaluation and branch length optimization: " << NNI_MAX_NR_STEP
1620             << " / " << PLL_NEWZPERCYCLE << endl;
1621     cout << "SSE instructions: "
1622             << ((iqtree.sse) ? "Yes" : "No") << endl;
1623     cout << endl;
1624 }
1625 
computeMLDist(Params & params,IQTree & iqtree,double begin_time)1626 void computeMLDist(Params& params, IQTree& iqtree, double begin_time) {
1627     double longest_dist;
1628 //    stringstream best_tree_string;
1629 //    iqtree.printTree(best_tree_string, WT_BR_LEN + WT_TAXON_ID);
1630     cout << "Computing ML distances based on estimated model parameters...";
1631     double *ml_dist = NULL;
1632     double *ml_var = NULL;
1633     longest_dist = iqtree.computeDist(params, iqtree.aln, ml_dist, ml_var, iqtree.dist_file);
1634     cout << " " << (getCPUTime() - begin_time) << " sec" << endl;
1635 
1636     double max_genetic_dist = MAX_GENETIC_DIST;
1637     if (iqtree.aln->seq_type == SEQ_POMO) {
1638         int N = iqtree.aln->virtual_pop_size;
1639         max_genetic_dist *= N * N;
1640     }
1641     if (longest_dist > max_genetic_dist * 0.99) {
1642         outWarning("Some pairwise ML distances are too long (saturated)");
1643         //cout << "Some ML distances are too long, using old distances..." << endl;
1644     } //else
1645     {
1646         if ( !iqtree.dist_matrix ) {
1647             iqtree.dist_matrix = new double[iqtree.aln->getNSeq() * iqtree.aln->getNSeq()];
1648         }
1649         if ( !iqtree.var_matrix ) {
1650             iqtree.var_matrix = new double[iqtree.aln->getNSeq() * iqtree.aln->getNSeq()];
1651         }
1652         memmove(iqtree.dist_matrix, ml_dist,
1653                 sizeof (double) * iqtree.aln->getNSeq() * iqtree.aln->getNSeq());
1654         memmove(iqtree.var_matrix, ml_var,
1655                 sizeof(double) * iqtree.aln->getNSeq() * iqtree.aln->getNSeq());
1656     }
1657     delete[] ml_dist;
1658     delete[] ml_var;
1659 }
1660 
computeInitialDist(Params & params,IQTree & iqtree)1661 void computeInitialDist(Params &params, IQTree &iqtree) {
1662     double longest_dist;
1663     if (params.dist_file) {
1664         cout << "Reading distance matrix file " << params.dist_file << " ..." << endl;
1665     } else if (params.compute_jc_dist) {
1666         cout << "Computing Juke-Cantor distances..." << endl;
1667     } else if (params.compute_obs_dist) {
1668         cout << "Computing observed distances..." << endl;
1669     }
1670 
1671     if (params.compute_jc_dist || params.compute_obs_dist || params.partition_file) {
1672         longest_dist = iqtree.computeDist(params, iqtree.aln, iqtree.dist_matrix, iqtree.var_matrix, iqtree.dist_file);
1673         checkZeroDist(iqtree.aln, iqtree.dist_matrix);
1674 
1675         double max_genetic_dist = MAX_GENETIC_DIST;
1676         if (iqtree.aln->seq_type == SEQ_POMO) {
1677             int N = iqtree.aln->virtual_pop_size;
1678             max_genetic_dist *= N * N;
1679         }
1680         if (longest_dist > max_genetic_dist * 0.99) {
1681             outWarning("Some pairwise distances are too long (saturated)");
1682         }
1683     }
1684 
1685 }
1686 
initializeParams(Params & params,IQTree & iqtree)1687 void initializeParams(Params &params, IQTree &iqtree)
1688 {
1689 //    iqtree.setCurScore(-DBL_MAX);
1690     bool ok_tree = iqtree.root;
1691     if (iqtree.isSuperTreeUnlinked())
1692         ok_tree = ((PhyloSuperTree*)&iqtree)->front()->root;
1693     if (!ok_tree)
1694     {
1695         // compute initial tree
1696         iqtree.computeInitialTree(params.SSE);
1697     }
1698     ASSERT(iqtree.aln);
1699 
1700     if (iqtree.aln->model_name == "WHTEST") {
1701         if (iqtree.aln->seq_type != SEQ_DNA)
1702             outError("Weiss & von Haeseler test of model homogeneity only works for DNA");
1703         iqtree.aln->model_name = "GTR+G";
1704     }
1705 
1706     if (params.gbo_replicates)
1707         params.speed_conf = 1.0;
1708 
1709     // TODO: check if necessary
1710 //    if (iqtree.isSuperTree())
1711 //        ((PhyloSuperTree*) &iqtree)->mapTrees();
1712 
1713     // set parameter for the current tree
1714 //    iqtree.setParams(params);
1715 }
1716 
1717 
pruneTaxa(Params & params,IQTree & iqtree,double * pattern_lh,NodeVector & pruned_taxa,StrVector & linked_name)1718 void pruneTaxa(Params &params, IQTree &iqtree, double *pattern_lh, NodeVector &pruned_taxa, StrVector &linked_name) {
1719     int num_low_support;
1720     double mytime;
1721 
1722     if (params.aLRT_threshold <= 100 && (params.aLRT_replicates > 0 || params.localbp_replicates > 0)) {
1723         mytime = getCPUTime();
1724         cout << "Testing tree branches by SH-like aLRT with " << params.aLRT_replicates << " replicates..." << endl;
1725         iqtree.setRootNode(params.root);
1726         double curScore =  iqtree.getCurScore();
1727         iqtree.computePatternLikelihood(pattern_lh, &curScore);
1728         num_low_support = iqtree.testAllBranches(params.aLRT_threshold, curScore,
1729                 pattern_lh, params.aLRT_replicates, params.localbp_replicates, params.aLRT_test, params.aBayes_test);
1730         iqtree.printResultTree();
1731         cout << "  " << getCPUTime() - mytime << " sec." << endl;
1732         cout << num_low_support << " branches show low support values (<= " << params.aLRT_threshold << "%)" << endl;
1733 
1734         //tree.drawTree(cout);
1735         cout << "Collapsing stable clades..." << endl;
1736         iqtree.collapseStableClade(params.aLRT_threshold, pruned_taxa, linked_name, iqtree.dist_matrix);
1737         cout << pruned_taxa.size() << " taxa were pruned from stable clades" << endl;
1738     }
1739 
1740     if (!pruned_taxa.empty()) {
1741         cout << "Pruned alignment contains " << iqtree.aln->getNSeq()
1742                 << " sequences and " << iqtree.aln->getNSite() << " sites and "
1743                 << iqtree.aln->getNPattern() << " patterns" << endl;
1744         //tree.clearAllPartialLh();
1745         iqtree.initializeAllPartialLh();
1746         iqtree.clearAllPartialLH();
1747         iqtree.setCurScore(iqtree.optimizeAllBranches());
1748         //cout << "Log-likelihood    after reoptimizing model parameters: " << tree.curScore << endl;
1749 //        pair<int, int> nniInfo = iqtree.optimizeNNI();
1750         iqtree.optimizeNNI();
1751         cout << "Log-likelihood after optimizing partial tree: "
1752                 << iqtree.getCurScore() << endl;
1753     }
1754 
1755 }
1756 
restoreTaxa(IQTree & iqtree,double * saved_dist_mat,NodeVector & pruned_taxa,StrVector & linked_name)1757 void restoreTaxa(IQTree &iqtree, double *saved_dist_mat, NodeVector &pruned_taxa, StrVector &linked_name) {
1758     if (!pruned_taxa.empty()) {
1759         cout << "Restoring full tree..." << endl;
1760         iqtree.restoreStableClade(iqtree.aln, pruned_taxa, linked_name);
1761         delete[] iqtree.dist_matrix;
1762         iqtree.dist_matrix = saved_dist_mat;
1763         iqtree.initializeAllPartialLh();
1764         iqtree.clearAllPartialLH();
1765         iqtree.setCurScore(iqtree.optimizeAllBranches());
1766         //cout << "Log-likelihood    after reoptimizing model parameters: " << tree.curScore << endl;
1767         pair<int, int> nniInfo;
1768         nniInfo = iqtree.optimizeNNI();
1769         cout << "Log-likelihood    after reoptimizing full tree: " << iqtree.getCurScore() << endl;
1770         //iqtree.setBestScore(iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.model_eps));
1771 
1772     }
1773 }
runApproximateBranchLengths(Params & params,IQTree & iqtree)1774 void runApproximateBranchLengths(Params &params, IQTree &iqtree) {
1775     if (!params.fixed_branch_length && params.leastSquareBranch) {
1776         cout << endl << "Computing Least Square branch lengths..." << endl;
1777         iqtree.optimizeAllBranchesLS();
1778         iqtree.clearAllPartialLH();
1779         iqtree.setCurScore(iqtree.computeLikelihood());
1780         string filename = params.out_prefix;
1781         filename += ".lstree";
1782         iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
1783         cout << "Logl of tree with LS branch lengths: " << iqtree.getCurScore() << endl;
1784         cout << "Tree with LS branch lengths written to " << filename << endl;
1785         if (params.print_branch_lengths) {
1786             if (params.manuel_analytic_approx) {
1787                 cout << "Applying Manuel's analytic approximation.." << endl;
1788                 iqtree.approxAllBranches();
1789             }
1790             ofstream out;
1791             filename = params.out_prefix;
1792             filename += ".lsbrlen";
1793             out.open(filename.c_str());
1794             iqtree.printBranchLengths(out);
1795             out.close();
1796             cout << "LS Branch lengths written to " << filename << endl;
1797         }
1798         cout << "Total LS tree length: " << iqtree.treeLength() << endl;
1799     }
1800 
1801     if (params.pars_branch_length) {
1802         cout << endl << "Computing parsimony branch lengths..." << endl;
1803         iqtree.fixNegativeBranch(true);
1804         iqtree.clearAllPartialLH();
1805         iqtree.setCurScore(iqtree.computeLikelihood());
1806         string filename = params.out_prefix;
1807         filename += ".mptree";
1808         iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
1809         cout << "Logl of tree with MP branch lengths: " << iqtree.getCurScore() << endl;
1810         cout << "Tree with MP branch lengths written to " << filename << endl;
1811         if (params.print_branch_lengths) {
1812             ofstream out;
1813             filename = params.out_prefix;
1814             filename += ".mpbrlen";
1815             out.open(filename.c_str());
1816             iqtree.printBranchLengths(out);
1817             out.close();
1818             cout << "MP Branch lengths written to " << filename << endl;
1819         }
1820         cout << "Total MP tree length: " << iqtree.treeLength() << endl;
1821 
1822     }
1823 
1824     if (params.bayes_branch_length) {
1825         cout << endl << "Computing Bayesian branch lengths..." << endl;
1826         iqtree.computeAllBayesianBranchLengths();
1827         iqtree.clearAllPartialLH();
1828         iqtree.setCurScore(iqtree.computeLikelihood());
1829         string filename = params.out_prefix;
1830         filename += ".batree";
1831         iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
1832         cout << "Logl of tree with Bayesian branch lengths: " << iqtree.getCurScore() << endl;
1833         cout << "Tree with Bayesian branch lengths written to " << filename << endl;
1834         if (params.print_branch_lengths) {
1835             ofstream out;
1836             filename = params.out_prefix;
1837             filename += ".babrlen";
1838             out.open(filename.c_str());
1839             iqtree.printBranchLengths(out);
1840             out.close();
1841             cout << "Bayesian Branch lengths written to " << filename << endl;
1842         }
1843         cout << "Total Bayesian tree length: " << iqtree.treeLength() << endl;
1844 
1845     }
1846 
1847 }
1848 
printSiteRates(IQTree & iqtree,const char * rate_file,bool bayes)1849 void printSiteRates(IQTree &iqtree, const char *rate_file, bool bayes) {
1850     try {
1851         ofstream out;
1852         out.exceptions(ios::failbit | ios::badbit);
1853         out.open(rate_file);
1854         out << "# Site-specific subtitution rates determined by ";
1855         if (bayes)
1856             out<< "empirical Bayesian method" << endl;
1857         else
1858             out<< "maximum likelihood" << endl;
1859         out << "# This file can be read in MS Excel or in R with command:" << endl
1860         << "#   tab=read.table('" <<  rate_file << "',header=TRUE)" << endl
1861         << "# Columns are tab-separated with following meaning:" << endl;
1862         if (iqtree.isSuperTree()) {
1863             out << "#   Part:   Partition ID (1=" << ((PhyloSuperTree*)&iqtree)->front()->aln->name << ", etc)" << endl
1864             << "#   Site:   Site ID within partition (starting from 1 for each partition)" << endl;
1865         } else
1866             out << "#   Site:   Alignment site ID" << endl;
1867 
1868         if (bayes)
1869             out << "#   Rate:   Posterior mean site rate weighted by posterior probability" << endl
1870                 << "#   Cat:    Category with highest posterior (0=invariable, 1=slow, etc)" << endl
1871                 << "#   C_Rate: Corresponding rate of highest category" << endl;
1872         else
1873             out << "#   Rate:   Site rate estimated by maximum likelihood" << endl;
1874         if (iqtree.isSuperTree())
1875             out << "Part\t";
1876         out << "Site\tRate";
1877         if (bayes)
1878             out << "\tCat\tC_Rate" << endl;
1879         else
1880             out << endl;
1881         iqtree.writeSiteRates(out, bayes);
1882         out.close();
1883     } catch (ios::failure) {
1884         outError(ERR_WRITE_OUTPUT, rate_file);
1885     }
1886     cout << "Site rates printed to " << rate_file << endl;
1887 }
1888 
printMiscInfo(Params & params,IQTree & iqtree,double * pattern_lh)1889 void printMiscInfo(Params &params, IQTree &iqtree, double *pattern_lh) {
1890     if (params.print_site_lh && !params.pll) {
1891         string site_lh_file = params.out_prefix;
1892         site_lh_file += ".sitelh";
1893         if (params.print_site_lh == WSL_SITE)
1894             printSiteLh(site_lh_file.c_str(), &iqtree, pattern_lh);
1895         else
1896             printSiteLhCategory(site_lh_file.c_str(), &iqtree, params.print_site_lh);
1897     }
1898 
1899     if (params.print_partition_lh && !iqtree.isSuperTree()) {
1900         outWarning("-wpl does not work with non-partition model");
1901         params.print_partition_lh = false;
1902     }
1903     if (params.print_partition_lh && !params.pll) {
1904         string part_lh_file = (string)params.out_prefix + ".partlh";
1905         printPartitionLh(part_lh_file.c_str(), &iqtree, pattern_lh);
1906     }
1907 
1908     if (params.print_site_prob && !params.pll) {
1909         printSiteProbCategory(((string)params.out_prefix + ".siteprob").c_str(), &iqtree, params.print_site_prob);
1910     }
1911 
1912     if (params.print_ancestral_sequence) {
1913         printAncestralSequences(params.out_prefix, &iqtree, params.print_ancestral_sequence);
1914     }
1915 
1916     if (params.print_site_state_freq != WSF_NONE && !params.site_freq_file && !params.tree_freq_file) {
1917         string site_freq_file = params.out_prefix;
1918         site_freq_file += ".sitesf";
1919         printSiteStateFreq(site_freq_file.c_str(), &iqtree);
1920     }
1921 
1922     if (params.print_trees_site_posterior) {
1923         cout << "Computing mixture posterior probabilities" << endl;
1924         IntVector pattern_cat;
1925         int num_mix = iqtree.computePatternCategories(&pattern_cat);
1926         cout << num_mix << " mixture components are necessary" << endl;
1927         string site_mix_file = (string)params.out_prefix + ".sitemix";
1928         ofstream out(site_mix_file.c_str());
1929         if (!out.is_open())
1930             outError("File " + site_mix_file + " could not be opened");
1931         out << "Ptn\tFreq\tNumMix" << endl;
1932         int ptn;
1933         for (ptn = 0; ptn < pattern_cat.size(); ptn++)
1934             out << ptn << "\t" << (int)iqtree.ptn_freq[ptn] << "\t" << pattern_cat[ptn] << endl;
1935         out.close();
1936         cout << "Pattern mixtures printed to " << site_mix_file << endl;
1937 
1938         site_mix_file = (string)params.out_prefix + ".sitemixall";
1939         out.open(site_mix_file.c_str());
1940         int ncat = iqtree.getRate()->getNRate();
1941         if (iqtree.getModel()->isMixture() && !iqtree.getModelFactory()->fused_mix_rate)
1942             ncat = iqtree.getModel()->getNMixtures();
1943         out << "Ptn\tFreq\tNumMix\tCat" << endl;
1944 
1945         int c;
1946         for (ptn = 0; ptn < iqtree.ptn_cat_mask.size(); ptn++) {
1947             int num_cat = popcount_lauradoux((unsigned*)&iqtree.ptn_cat_mask[ptn], 2);
1948             out << ptn << "\t" << (int)iqtree.ptn_freq[ptn] << "\t" << num_cat << "\t";
1949             for (c = 0; c < ncat; c++)
1950                 if (iqtree.ptn_cat_mask[ptn] & ((uint64_t)1<<c))
1951                     out << "1";
1952                 else
1953                     out << "0";
1954             out << endl;
1955         }
1956         out.close();
1957     }
1958 
1959     if (params.print_branch_lengths) {
1960         if (params.manuel_analytic_approx) {
1961             cout << "Applying Manuel's analytic approximation.." << endl;
1962             iqtree.approxAllBranches();
1963         }
1964         string brlen_file = params.out_prefix;
1965         brlen_file += ".brlen";
1966         ofstream out;
1967         out.open(brlen_file.c_str());
1968         iqtree.printBranchLengths(out);
1969         out.close();
1970         cout << "Branch lengths written to " << brlen_file << endl;
1971     }
1972 
1973     if (params.write_branches) {
1974         string filename = string(params.out_prefix) + ".branches.csv";
1975         ofstream out;
1976         out.open(filename.c_str());
1977         iqtree.writeBranches(out);
1978         out.close();
1979         cout << "Branch lengths written to " << filename << endl;
1980     }
1981 
1982     if (params.print_conaln && iqtree.isSuperTree()) {
1983         string str = params.out_prefix;
1984         str = params.out_prefix;
1985         str += ".conaln";
1986         iqtree.aln->printAlignment(params.aln_output_format, str.c_str());
1987     }
1988 
1989     if (params.print_partition_info && iqtree.isSuperTree()) {
1990         ASSERT(params.print_conaln);
1991         string aln_file = (string)params.out_prefix + ".conaln";
1992         string partition_info = params.out_prefix;
1993         partition_info += ".partinfo.nex";
1994         ((SuperAlignment*)(iqtree.aln))->printPartition(partition_info.c_str(), aln_file.c_str());
1995         partition_info = (string)params.out_prefix + ".partitions";
1996         ((SuperAlignment*)(iqtree.aln))->printPartitionRaxml(partition_info.c_str());
1997     }
1998 
1999     if (params.mvh_site_rate) {
2000         RateMeyerHaeseler *rate_mvh = new RateMeyerHaeseler(params.rate_file,
2001                 &iqtree, params.rate_mh_type);
2002         cout << endl << "Computing site-specific rates by "
2003                 << rate_mvh->full_name << "..." << endl;
2004         rate_mvh->runIterativeProc(params, iqtree);
2005         cout << endl << "BEST SCORE FOUND : " << iqtree.getBestScore()<< endl;
2006         string mhrate_file = params.out_prefix;
2007         mhrate_file += ".mhrate";
2008         try {
2009             ofstream out;
2010             out.exceptions(ios::failbit | ios::badbit);
2011             out.open(mhrate_file.c_str());
2012             iqtree.writeSiteRates(out, true);
2013             out.close();
2014         } catch (ios::failure) {
2015             outError(ERR_WRITE_OUTPUT, mhrate_file);
2016         }
2017 
2018         if (params.print_site_lh) {
2019             string site_lh_file = params.out_prefix;
2020             site_lh_file += ".mhsitelh";
2021             printSiteLh(site_lh_file.c_str(), &iqtree);
2022         }
2023     }
2024 
2025     if (params.print_site_rate & 1) {
2026         string rate_file = params.out_prefix;
2027         rate_file += ".rate";
2028         printSiteRates(iqtree, rate_file.c_str(), true);
2029     }
2030 
2031     if (params.print_site_rate & 2) {
2032         string rate_file = params.out_prefix;
2033         rate_file += ".mlrate";
2034         printSiteRates(iqtree, rate_file.c_str(), false);
2035     }
2036 
2037     if (params.fixed_branch_length == BRLEN_SCALE) {
2038         string filename = (string)params.out_prefix + ".blscale";
2039         iqtree.printTreeLengthScaling(filename.c_str());
2040         cout << "Scaled tree length and model parameters printed to " << filename << endl;
2041     }
2042 
2043 }
2044 
printFinalSearchInfo(Params & params,IQTree & iqtree,double search_cpu_time,double search_real_time)2045 void printFinalSearchInfo(Params &params, IQTree &iqtree, double search_cpu_time, double search_real_time) {
2046     cout << "Total tree length: " << iqtree.treeLength() << endl;
2047 
2048     if (iqtree.isSuperTree() && verbose_mode >= VB_MAX) {
2049         PhyloSuperTree *stree = (PhyloSuperTree*) &iqtree;
2050         cout << stree->evalNNIs << " NNIs evaluated from " << stree->totalNNIs << " all possible NNIs ( " <<
2051                 (int)(((stree->evalNNIs+1.0)/(stree->totalNNIs+1.0))*100.0) << " %)" << endl;
2052         cout<<"Details for subtrees:"<<endl;
2053         int part = 0;
2054         for(auto it = stree->begin(); it != stree->end(); it++,part++){
2055             cout << part+1 << ". " << (*it)->aln->name << ": " << stree->part_info[part].evalNNIs<<" ( "
2056                 << (int)(((stree->part_info[part].evalNNIs+1.0)/((stree->totalNNIs+1.0) / stree->size()))*100.0)
2057                 << " %)" << endl;
2058         }
2059     }
2060 
2061     params.run_time = (getCPUTime() - params.startCPUTime);
2062     cout << endl;
2063     cout << "Total number of iterations: " << iqtree.stop_rule.getCurIt() << endl;
2064 //    cout << "Total number of partial likelihood vector computations: " << iqtree.num_partial_lh_computations << endl;
2065     cout << "CPU time used for tree search: " << search_cpu_time
2066             << " sec (" << convert_time(search_cpu_time) << ")" << endl;
2067     cout << "Wall-clock time used for tree search: " << search_real_time
2068             << " sec (" << convert_time(search_real_time) << ")" << endl;
2069     cout << "Total CPU time used: " << (double) params.run_time << " sec ("
2070             << convert_time((double) params.run_time) << ")" << endl;
2071     cout << "Total wall-clock time used: "
2072             << getRealTime() - params.start_real_time << " sec ("
2073             << convert_time(getRealTime() - params.start_real_time) << ")" << endl;
2074 
2075 }
2076 
printTrees(vector<string> trees,Params & params,string suffix)2077 void printTrees(vector<string> trees, Params &params, string suffix) {
2078     ofstream treesOut((string(params.out_prefix) + suffix).c_str(),
2079             ofstream::out);
2080     for (vector<string>::iterator it = trees.begin(); it != trees.end(); it++) {
2081         treesOut << (*it);
2082         treesOut << endl;
2083     }
2084     treesOut.close();
2085 }
2086 
2087 /************************************************************
2088  *  MAIN TREE RECONSTRUCTION
2089  ***********************************************************/
2090 
startTreeReconstruction(Params & params,IQTree * & iqtree,ModelCheckpoint & model_info)2091 void startTreeReconstruction(Params &params, IQTree* &iqtree, ModelCheckpoint &model_info) {
2092     if (params.root) {
2093         StrVector outgroup_names;
2094         convert_string_vec(params.root, outgroup_names);
2095         for (auto it = outgroup_names.begin(); it != outgroup_names.end(); it++)
2096             if (iqtree->aln->getSeqID(*it) < 0)
2097                 outError("Alignment does not have specified outgroup taxon ", *it);
2098     }
2099 
2100 //    if (params.count_trees && pllTreeCounter == NULL)
2101 //        pllTreeCounter = new StringIntMap;
2102 
2103     // Temporary fix since PLL only supports DNA/Protein: switch to IQ-TREE parsimony kernel
2104     if (params.start_tree == STT_PLL_PARSIMONY) {
2105         if (iqtree->isSuperTreeUnlinked()) {
2106             params.start_tree = STT_PARSIMONY;
2107         } else if (iqtree->isSuperTree()) {
2108             PhyloSuperTree *stree = (PhyloSuperTree*)iqtree;
2109             for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++)
2110                 if ((*it)->aln->seq_type != SEQ_DNA && (*it)->aln->seq_type != SEQ_PROTEIN)
2111                     params.start_tree = STT_PARSIMONY;
2112         } else if (iqtree->aln->seq_type != SEQ_DNA && iqtree->aln->seq_type != SEQ_PROTEIN)
2113             params.start_tree = STT_PARSIMONY;
2114     }
2115 
2116     /***************** Initialization for PLL and sNNI ******************/
2117     if (params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) {
2118         /* Initialized all data structure for PLL*/
2119         iqtree->initializePLL(params);
2120     }
2121 
2122     /********************* Compute pairwise distances *******************/
2123     if (params.start_tree == STT_BIONJ || params.iqp || params.leastSquareBranch) {
2124         computeInitialDist(params, *iqtree);
2125     }
2126 
2127     /******************** Pass the parameter object params to IQTree *******************/
2128     iqtree->setParams(&params);
2129 
2130     /*************** SET UP PARAMETERS and model testing ****************/
2131 
2132        // FOR TUNG: swapping the order cause bug for -m TESTLINK
2133 //    iqtree.initSettings(params);
2134 
2135     runModelFinder(params, *iqtree, model_info);
2136 }
2137 
2138 /**
2139  optimize branch lengths of consensus tree
2140  */
optimizeConTree(Params & params,IQTree * tree)2141 void optimizeConTree(Params &params, IQTree *tree) {
2142     string contree_file = string(params.out_prefix) + ".contree";
2143 
2144     DoubleVector rfdist;
2145     tree->computeRFDist(contree_file.c_str(), rfdist);
2146     tree->contree_rfdist = (int)rfdist[0];
2147 
2148     tree->readTreeFile(contree_file);
2149 
2150     tree->initializeAllPartialLh();
2151     tree->fixNegativeBranch(false);
2152 
2153     tree->boot_consense_logl = tree->optimizeAllBranches();
2154     cout << "Log-likelihood of consensus tree: " << tree->boot_consense_logl << endl;
2155     tree->setRootNode(params.root);
2156     tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
2157     tree->printTree(contree_file.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
2158     string contree = tree->getTreeString();
2159     tree->getCheckpoint()->put("contree", contree);
2160 }
2161 
runTreeReconstruction(Params & params,IQTree * & iqtree)2162 void runTreeReconstruction(Params &params, IQTree* &iqtree) {
2163 
2164     //    string dist_file;
2165     params.startCPUTime = getCPUTime();
2166     params.start_real_time = getRealTime();
2167 
2168     int absent_states = 0;
2169     if (iqtree->isSuperTree()) {
2170         PhyloSuperTree *stree = (PhyloSuperTree*)iqtree;
2171         for (auto i = stree->begin(); i != stree->end(); i++)
2172             absent_states += (*i)->aln->checkAbsentStates("partition " + (*i)->aln->name);
2173     } else {
2174         absent_states = iqtree->aln->checkAbsentStates("alignment");
2175     }
2176     if (absent_states > 0) {
2177         cout << "NOTE: " << absent_states << " states (see above) are not present and thus removed from Markov process to prevent numerical problems" << endl;
2178     }
2179 
2180     // Make sure that no partial likelihood of IQ-TREE is initialized when PLL is used to save memory
2181     if (params.pll) {
2182         iqtree->deleteAllPartialLh();
2183     }
2184 
2185     /***************** Initialization for PLL and sNNI ******************/
2186     if ((params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) && !iqtree->isInitializedPLL()) {
2187         /* Initialized all data structure for PLL*/
2188         iqtree->initializePLL(params);
2189     }
2190 
2191 
2192     /********************* Compute pairwise distances *******************/
2193     if ((params.start_tree == STT_BIONJ || params.iqp || params.leastSquareBranch) && !iqtree->root) {
2194         computeInitialDist(params, *iqtree);
2195     }
2196 
2197     /******************** Pass the parameter object params to IQTree *******************/
2198     iqtree->setParams(&params);
2199 
2200     ModelsBlock *models_block = readModelsDefinition(params);
2201 
2202     initializeParams(params, *iqtree);
2203 
2204     if (posRateHeterotachy(iqtree->aln->model_name) != string::npos && !iqtree->isMixlen()) {
2205         // create a new instance
2206         IQTree* iqtree_new = new PhyloTreeMixlen(iqtree->aln, 0);
2207         iqtree_new->setCheckpoint(iqtree->getCheckpoint());
2208         if (!iqtree->constraintTree.empty())
2209             iqtree_new->constraintTree.readConstraint(iqtree->constraintTree);
2210         iqtree_new->removed_seqs = iqtree->removed_seqs;
2211         iqtree_new->twin_seqs = iqtree->twin_seqs;
2212         if (params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) {
2213             /* Initialized all data structure for PLL*/
2214             iqtree_new->initializePLL(params);
2215         }
2216         iqtree_new->setParams(&params);
2217         iqtree_new->copyPhyloTree(iqtree);
2218 
2219         // replace iqtree object
2220         delete iqtree;
2221         iqtree = iqtree_new;
2222 
2223     }
2224 
2225     iqtree->setRootNode(params.root);
2226 
2227     iqtree->restoreCheckpoint();
2228 
2229     if (params.online_bootstrap && params.gbo_replicates > 0) {
2230         cout << "Generating " << params.gbo_replicates << " samples for ultrafast "
2231         << RESAMPLE_NAME << " (seed: " << params.ran_seed << ")..." << endl;
2232     }
2233 
2234     iqtree->initSettings(params);
2235 
2236     /*********************** INITIAL MODEL OPTIMIZATION *****************/
2237 
2238 
2239     if (!iqtree->getModelFactory()) {
2240         iqtree->initializeModel(params, iqtree->aln->model_name, models_block);
2241     }
2242     if (iqtree->getRate()->isHeterotachy() && !iqtree->isMixlen()) {
2243         ASSERT(0 && "Heterotachy tree not properly created");
2244     }
2245 //    iqtree.restoreCheckpoint();
2246 
2247     delete models_block;
2248 
2249     // UpperBounds analysis. Here, to analyse the initial tree without any tree search or optimization
2250     /*
2251     if (params.upper_bound) {
2252         iqtree.setCurScore(iqtree.computeLikelihood());
2253         cout<<iqtree.getCurScore()<<endl;
2254         UpperBounds(&params, iqtree.aln, iqtree);
2255         exit(0);
2256     }
2257     */
2258 
2259     // degree of freedom
2260     cout << endl;
2261     if (verbose_mode >= VB_MED) {
2262         cout << "ML-TREE SEARCH START WITH THE FOLLOWING PARAMETERS:" << endl;
2263         int model_df = iqtree->getModelFactory()->getNParameters(BRLEN_OPTIMIZE);
2264         printAnalysisInfo(model_df, *iqtree, params);
2265     }
2266 
2267     if (!params.pll) {
2268         uint64_t total_mem = getMemorySize();
2269         if (params.lh_mem_save == LM_MEM_SAVE && params.max_mem_size > total_mem)
2270             params.max_mem_size = total_mem;
2271 
2272         uint64_t mem_required = iqtree->getMemoryRequired();
2273 
2274         if (mem_required >= total_mem*0.95 && !iqtree->isSuperTree()) {
2275             // switch to memory saving mode
2276             if (params.lh_mem_save != LM_MEM_SAVE) {
2277                 params.max_mem_size = (total_mem*0.95)/mem_required;
2278                 params.lh_mem_save = LM_MEM_SAVE;
2279                 mem_required = iqtree->getMemoryRequired();
2280                 cout << "NOTE: Switching to memory saving mode using " << (mem_required / 1073741824.0) << " GB ("
2281                     <<  (mem_required*100/total_mem) << "% of normal mode)" << endl;
2282                 cout << "NOTE: Use -mem option if you want to restrict RAM usage further" << endl;
2283             }
2284             if (mem_required >= total_mem) {
2285                 params.lh_mem_save = LM_MEM_SAVE;
2286                 params.max_mem_size = 0.0;
2287                 mem_required = iqtree->getMemoryRequired();
2288             }
2289         }
2290         if (mem_required >= total_mem) {
2291             cerr << "ERROR: Your RAM is below minimum requirement of " << (mem_required / 1073741824.0) << " GB RAM" << endl;
2292             outError("Memory saving mode cannot work, switch to another computer!!!");
2293         }
2294 
2295 //#if defined __APPLE__ || defined __MACH__
2296         cout << "NOTE: " << (mem_required / 1048576) << " MB RAM (" << (mem_required / 1073741824) << " GB) is required!" << endl;
2297 //#else
2298 //        cout << "NOTE: " << ((double) mem_size / 1000.0) / 1000 << " MB RAM is required!" << endl;
2299 //#endif
2300         if (params.memCheck)
2301             exit(0);
2302 #ifdef BINARY32
2303         if (mem_required >= 2000000000) {
2304             outError("Memory required exceeds 2GB limit of 32-bit executable");
2305         }
2306 #endif
2307         int max_procs = countPhysicalCPUCores();
2308         if (mem_required * max_procs > total_mem * iqtree->num_threads && iqtree->num_threads > 0) {
2309             outWarning("Memory required per CPU-core (" + convertDoubleToString((double)mem_required/iqtree->num_threads/1024/1024/1024)+
2310             " GB) is higher than your computer RAM per CPU-core ("+convertIntToString(total_mem/max_procs/1024/1024/1024)+
2311             " GB), thus multiple runs may exceed RAM!");
2312         }
2313     }
2314 
2315 
2316 #ifdef _OPENMP
2317     if (iqtree->num_threads <= 0) {
2318         int bestThreads = iqtree->testNumThreads();
2319         omp_set_num_threads(bestThreads);
2320         params.num_threads = bestThreads;
2321     } else
2322         iqtree->warnNumThreads();
2323 #endif
2324 
2325 
2326     iqtree->initializeAllPartialLh();
2327     double initEpsilon = params.min_iterations == 0 ? params.modelEps : (params.modelEps*10);
2328 
2329 
2330     if (iqtree->getRate()->name.find("+I+G") != string::npos) {
2331         if (params.alpha_invar_file != NULL) { // COMPUTE TREE LIKELIHOOD BASED ON THE INPUT ALPHA AND P_INVAR VALUE
2332             computeLoglFromUserInputGAMMAInvar(params, *iqtree);
2333             exit(0);
2334         }
2335 
2336         if (params.exh_ai) {
2337             exhaustiveSearchGAMMAInvar(params, *iqtree);
2338             exit(0);
2339         }
2340 
2341     }
2342 
2343     // Optimize model parameters and branch lengths using ML for the initial tree
2344     string initTree;
2345     iqtree->clearAllPartialLH();
2346 
2347     iqtree->getModelFactory()->restoreCheckpoint();
2348     if (iqtree->getCheckpoint()->getBool("finishedModelInit")) {
2349         // model optimization already done: ignore this step
2350         if (!iqtree->candidateTrees.empty())
2351             iqtree->readTreeString(iqtree->getBestTrees()[0]);
2352         iqtree->setCurScore(iqtree->computeLikelihood());
2353         initTree = iqtree->getTreeString();
2354         cout << "CHECKPOINT: Model parameters restored, LogL: " << iqtree->getCurScore() << endl;
2355     } else {
2356         initTree = iqtree->optimizeModelParameters(true, initEpsilon);
2357         if (iqtree->isMixlen())
2358             initTree = ((ModelFactoryMixlen*)iqtree->getModelFactory())->sortClassesByTreeLength();
2359 
2360         iqtree->saveCheckpoint();
2361         iqtree->getModelFactory()->saveCheckpoint();
2362         iqtree->getCheckpoint()->putBool("finishedModelInit", true);
2363         iqtree->getCheckpoint()->dump();
2364 //        cout << "initTree: " << initTree << endl;
2365     }
2366 
2367     if (params.lmap_num_quartets >= 0) {
2368         cout << endl << "Performing likelihood mapping with ";
2369         if (params.lmap_num_quartets > 0)
2370             cout << params.lmap_num_quartets;
2371         else
2372             cout << "all";
2373         cout << " quartets..." << endl;
2374         double lkmap_time = getRealTime();
2375         iqtree->doLikelihoodMapping();
2376         cout << "Likelihood mapping needed " << getRealTime()-lkmap_time << " seconds" << endl << endl;
2377     }
2378 
2379     // TODO: why is this variable not used?
2380     // ANSWER: moved to doTreeSearch
2381 //    bool finishedCandidateSet = iqtree.getCheckpoint()->getBool("finishedCandidateSet");
2382     bool finishedInitTree = iqtree->getCheckpoint()->getBool("finishedInitTree");
2383 
2384     // now overwrite with random tree
2385     if (params.start_tree == STT_RANDOM_TREE && !finishedInitTree) {
2386         cout << "Generate random initial Yule-Harding tree..." << endl;
2387         iqtree->generateRandomTree(YULE_HARDING);
2388         iqtree->wrapperFixNegativeBranch(true);
2389         iqtree->initializeAllPartialLh();
2390         initTree = iqtree->optimizeBranches(params.brlen_num_traversal);
2391         cout << "Log-likelihood of random tree: " << iqtree->getCurScore() << endl;
2392     }
2393 
2394     /****************** NOW PERFORM MAXIMUM LIKELIHOOD TREE RECONSTRUCTION ******************/
2395 
2396     // Update best tree
2397     if (!finishedInitTree) {
2398         iqtree->addTreeToCandidateSet(initTree, iqtree->getCurScore(), false, MPIHelper::getInstance().getProcessID());
2399         iqtree->printResultTree();
2400         iqtree->intermediateTrees.update(iqtree->getTreeString(), iqtree->getCurScore());
2401         if (iqtree->isSuperTreeUnlinked()) {
2402             PhyloSuperTree* stree = (PhyloSuperTree*)iqtree;
2403             for (auto it = stree->begin(); it != stree->end(); it++)
2404                 ((IQTree*)(*it))->addTreeToCandidateSet((*it)->getTreeString(),
2405                     (*it)->getCurScore(), false, MPIHelper::getInstance().getProcessID());
2406         }
2407     }
2408 
2409     if (params.min_iterations && !iqtree->isBifurcating())
2410         outError("Tree search does not work with initial multifurcating tree. Please specify `-n 0` to avoid this.");
2411 
2412 
2413     // Compute maximum likelihood distance
2414     // ML distance is only needed for IQP
2415 //    if ( params.start_tree != STT_BIONJ && ((params.snni && !params.iqp) || params.min_iterations == 0)) {
2416 //        params.compute_ml_dist = false;
2417 //    }
2418     if ((params.min_iterations <= 1 || params.numInitTrees <= 1) && params.start_tree != STT_BIONJ)
2419         params.compute_ml_dist = false;
2420 
2421     if ((params.user_file || params.start_tree == STT_RANDOM_TREE) && params.snni && !params.iqp) {
2422         params.compute_ml_dist = false;
2423     }
2424 
2425     if (params.constraint_tree_file)
2426         params.compute_ml_dist = false;
2427 
2428     if (iqtree->isSuperTreeUnlinked())
2429         params.compute_ml_dist = false;
2430 
2431     //Generate BIONJ tree
2432     if (MPIHelper::getInstance().isMaster() && !iqtree->getCheckpoint()->getBool("finishedCandidateSet")) {
2433         if (!finishedInitTree && ((!params.dist_file && params.compute_ml_dist) || params.leastSquareBranch)) {
2434             computeMLDist(params, *iqtree, getCPUTime());
2435             if (!params.user_file && params.start_tree != STT_RANDOM_TREE) {
2436                 // NEW 2015-08-10: always compute BIONJ tree into the candidate set
2437                 iqtree->resetCurScore();
2438                 double start_bionj = getRealTime();
2439                 iqtree->computeBioNJ(params, iqtree->aln, iqtree->dist_file);
2440                 cout << getRealTime() - start_bionj << " seconds" << endl;
2441                 if (iqtree->isSuperTree())
2442                     iqtree->wrapperFixNegativeBranch(true);
2443                 else
2444                     iqtree->wrapperFixNegativeBranch(false);
2445                 iqtree->initializeAllPartialLh();
2446                 if (params.start_tree == STT_BIONJ) {
2447                     initTree = iqtree->optimizeModelParameters(params.min_iterations==0, initEpsilon);
2448                 } else {
2449                     initTree = iqtree->optimizeBranches();
2450                 }
2451                 cout << "Log-likelihood of BIONJ tree: " << iqtree->getCurScore() << endl;
2452 //                cout << "BIONJ tree: " << iqtree->getTreeString() << endl;
2453                 iqtree->candidateTrees.update(initTree, iqtree->getCurScore());
2454             }
2455         }
2456     }
2457 
2458 //    iqtree->saveCheckpoint();
2459 
2460     double cputime_search_start = getCPUTime();
2461     double realtime_search_start = getRealTime();
2462 
2463     if (params.leastSquareNNI) {
2464         iqtree->computeSubtreeDists();
2465     }
2466 
2467     if (params.model_name == "WHTEST") {
2468         cout << endl << "Testing model homogeneity by Weiss & von Haeseler (2003)..." << endl;
2469         WHTest(params, *iqtree);
2470     }
2471 
2472     NodeVector pruned_taxa;
2473     StrVector linked_name;
2474     double *saved_dist_mat = iqtree->dist_matrix;
2475     double *pattern_lh;
2476 
2477     pattern_lh = new double[iqtree->getAlnNPattern()];
2478 
2479     // prune stable taxa
2480     pruneTaxa(params, *iqtree, pattern_lh, pruned_taxa, linked_name);
2481 
2482     /***************************************** DO STOCHASTIC TREE SEARCH *******************************************/
2483     if (params.min_iterations > 0 && !params.tree_spr) {
2484         iqtree->doTreeSearch();
2485         iqtree->setAlignment(iqtree->aln);
2486     } else {
2487         iqtree->candidateTrees.saveCheckpoint();
2488         /* do SPR with likelihood function */
2489         if (params.tree_spr) {
2490             //tree.optimizeSPRBranches();
2491             cout << "Doing SPR Search" << endl;
2492             cout << "Start tree.optimizeSPR()" << endl;
2493             double spr_score = iqtree->optimizeSPR();
2494             cout << "Finish tree.optimizeSPR()" << endl;
2495             //double spr_score = tree.optimizeSPR(tree.curScore, (PhyloNode*) tree.root->neighbors[0]->node);
2496             if (spr_score <= iqtree->getCurScore()) {
2497                 cout << "SPR search did not found any better tree" << endl;
2498             }
2499         }
2500     }
2501 
2502     // restore pruned taxa
2503     restoreTaxa(*iqtree, saved_dist_mat, pruned_taxa, linked_name);
2504 
2505     double search_cpu_time = getCPUTime() - cputime_search_start;
2506     double search_real_time = getRealTime() - realtime_search_start;
2507 
2508     // COMMENT THIS OUT BECAUSE IT DELETES ALL BRANCH LENGTHS OF SUBTREES!
2509 //    if (iqtree.isSuperTree())
2510 //            ((PhyloSuperTree*) iqtree)->mapTrees();
2511 
2512     if (!MPIHelper::getInstance().isMaster()) {
2513         delete[] pattern_lh;
2514         return;
2515     }
2516 
2517     if (params.snni && params.min_iterations && verbose_mode >= VB_MED) {
2518         cout << "Log-likelihoods of " << params.popSize << " best candidate trees: " << endl;
2519         iqtree->printBestScores();
2520         cout << endl;
2521     }
2522 
2523     if (!params.final_model_opt) {
2524         iqtree->setCurScore(iqtree->computeLikelihood());
2525     } else if (params.min_iterations) {
2526         iqtree->readTreeString(iqtree->getBestTrees()[0]);
2527         iqtree->initializeAllPartialLh();
2528         iqtree->clearAllPartialLH();
2529         cout << "--------------------------------------------------------------------" << endl;
2530         cout << "|                    FINALIZING TREE SEARCH                        |" << endl;
2531         cout << "--------------------------------------------------------------------" << endl;
2532 
2533         if (iqtree->getCheckpoint()->getBool("finishedModelFinal")) {
2534             iqtree->setCurScore(iqtree->computeLikelihood());
2535             cout << "CHECKPOINT: Final model parameters restored" << endl;
2536         } else {
2537             cout << "Performs final model parameters optimization" << endl;
2538             string tree;
2539             Params::getInstance().fixStableSplits = false;
2540             Params::getInstance().tabu = false;
2541             // why doing NNI search here?
2542 //            iqtree->doNNISearch();
2543             tree = iqtree->optimizeModelParameters(true);
2544             iqtree->addTreeToCandidateSet(tree, iqtree->getCurScore(), false, MPIHelper::getInstance().getProcessID());
2545             iqtree->getCheckpoint()->putBool("finishedModelFinal", true);
2546             iqtree->saveCheckpoint();
2547         }
2548 
2549     }
2550 
2551     if (iqtree->isSuperTree()) {
2552         ((PhyloSuperTree*) iqtree)->computeBranchLengths();
2553         ((PhyloSuperTree*) iqtree)->printBestPartitionParams((string(params.out_prefix) + ".best_model.nex").c_str());
2554     }
2555 
2556     cout << "BEST SCORE FOUND : " << iqtree->getCurScore() << endl;
2557 
2558     if (params.write_candidate_trees) {
2559         printTrees(iqtree->getBestTrees(), params, ".imd_trees");
2560     }
2561 
2562     if (params.pll)
2563         iqtree->inputModelPLL2IQTree();
2564 
2565     /* root the tree at the first sequence */
2566     // BQM: WHY SETTING THIS ROOT NODE????
2567 //    iqtree->root = iqtree->findLeafName(iqtree->aln->getSeqName(0));
2568 //    assert(iqtree->root);
2569     iqtree->setRootNode(params.root);
2570 
2571 
2572     if (!params.pll) {
2573         iqtree->computeLikelihood(pattern_lh);
2574         // compute logl variance
2575         iqtree->logl_variance = iqtree->computeLogLVariance();
2576     }
2577 
2578     printMiscInfo(params, *iqtree, pattern_lh);
2579 
2580     if (params.root_test) {
2581         cout << "Testing root positions..." << endl;
2582         iqtree->testRootPosition(true, params.loglh_epsilon);
2583     }
2584 
2585     /****** perform SH-aLRT test ******************/
2586     if ((params.aLRT_replicates > 0 || params.localbp_replicates > 0 || params.aLRT_test || params.aBayes_test) && !params.pll) {
2587         double mytime = getRealTime();
2588         params.aLRT_replicates = max(params.aLRT_replicates, params.localbp_replicates);
2589         cout << endl;
2590         if (params.aLRT_replicates > 0)
2591             cout << "Testing tree branches by SH-like aLRT with "
2592                 << params.aLRT_replicates << " replicates..." << endl;
2593         if (params.localbp_replicates)
2594             cout << "Testing tree branches by local-BP test with " << params.localbp_replicates << " replicates..." << endl;
2595         if (params.aLRT_test)
2596             cout << "Testing tree branches by aLRT parametric test..." << endl;
2597         if (params.aBayes_test)
2598             cout << "Testing tree branches by aBayes parametric test..." << endl;
2599         iqtree->setRootNode(params.root);
2600         if (iqtree->isBifurcating()) {
2601             iqtree->testAllBranches(params.aLRT_threshold, iqtree->getCurScore(),
2602                     pattern_lh, params.aLRT_replicates, params.localbp_replicates, params.aLRT_test, params.aBayes_test);
2603             cout << getRealTime() - mytime << " sec." << endl;
2604         } else {
2605             outWarning("Tree is multifurcating and such test is not applicable");
2606             params.aLRT_replicates = params.localbp_replicates = params.aLRT_test = params.aBayes_test = 0;
2607         }
2608     }
2609 
2610     if (params.gbo_replicates > 0) {
2611         cout << "Creating " << RESAMPLE_NAME << " support values..." << endl;
2612         if (!params.online_bootstrap)
2613             outError("Obsolete feature");
2614 //            runGuidedBootstrap(params, iqtree->aln, iqtree);
2615         else
2616             iqtree->summarizeBootstrap(params);
2617     }
2618 
2619     if (params.collapse_zero_branch) {
2620         cout << "Collapsing near-zero internal branches... ";
2621         cout << iqtree->collapseInternalBranches(NULL, NULL, params.min_branch_length*4);
2622         cout << " collapsed" << endl;
2623     }
2624 
2625     printFinalSearchInfo(params, *iqtree, search_cpu_time, search_real_time);
2626 
2627     if (params.gbo_replicates && params.online_bootstrap && params.print_ufboot_trees)
2628         iqtree->writeUFBootTrees(params);
2629 
2630     if (params.gbo_replicates && params.online_bootstrap && !iqtree->isSuperTreeUnlinked()) {
2631 
2632         cout << endl << "Computing " << RESAMPLE_NAME << " consensus tree..." << endl;
2633         string splitsfile = params.out_prefix;
2634         splitsfile += ".splits.nex";
2635         double weight_threshold = (params.split_threshold<1) ? params.split_threshold : (params.gbo_replicates-1.0)/params.gbo_replicates;
2636         weight_threshold *= 100.0;
2637         computeConsensusTree(splitsfile.c_str(), 0, 1e6, -1,
2638                              weight_threshold, NULL, params.out_prefix, NULL, &params);
2639         // now optimize branch lengths of the consensus tree
2640         string current_tree = iqtree->getTreeString();
2641         optimizeConTree(params, iqtree);
2642         // revert the best tree
2643         iqtree->readTreeString(current_tree);
2644     }
2645     if (Params::getInstance().writeDistImdTrees) {
2646         cout << endl;
2647         cout << "Recomputing the log-likelihood of the intermediate trees ... " << endl;
2648         iqtree->intermediateTrees.recomputeLoglOfAllTrees(*iqtree);
2649     }
2650 
2651     // BUG FIX: readTreeString(bestTreeString) not needed before this line
2652     iqtree->printResultTree();
2653     iqtree->saveCheckpoint();
2654 
2655     if (params.upper_bound_NNI) {
2656         string out_file_UB = params.out_prefix;
2657         out_file_UB += ".UB.NNI.main";
2658         ofstream out_UB;
2659         out_UB.exceptions(ios::failbit | ios::badbit);
2660         out_UB.open((char *) out_file_UB.c_str(), std::ofstream::out | std::ofstream::app);
2661         out_UB << iqtree->leafNum << "\t" << iqtree->aln->getNSite() << "\t" << iqtree->params->upper_bound_frac << "\t"
2662         << iqtree->skippedNNIub << "\t" << iqtree->totalNNIub << "\t" << iqtree->getBestScore() << endl;
2663         //iqtree->minUB << "\t" << iqtree->meanUB/iqtree->skippedNNIub << "\t" << iqtree->maxUB << endl;
2664         out_UB.close();
2665     }
2666 
2667     if (params.out_file)
2668         iqtree->printTree(params.out_file);
2669 
2670     delete[] pattern_lh;
2671 
2672     runApproximateBranchLengths(params, *iqtree);
2673 
2674 }
2675 
2676 /**********************************************************
2677  * MULTIPLE TREE RECONSTRUCTION
2678  ***********************************************************/
runMultipleTreeReconstruction(Params & params,Alignment * alignment,IQTree * tree)2679 void runMultipleTreeReconstruction(Params &params, Alignment *alignment, IQTree *tree) {
2680     ModelCheckpoint *model_info = new ModelCheckpoint;
2681 
2682     if (params.suppress_output_flags & OUT_TREEFILE)
2683         outError("Suppress .treefile not allowed with -runs option");
2684     string treefile_name = params.out_prefix;
2685     treefile_name += ".treefile";
2686     string runtrees_name = params.out_prefix;
2687     runtrees_name += ".runtrees";
2688     DoubleVector runLnL;
2689 
2690     if (tree->getCheckpoint()->getVector("runLnL", runLnL)) {
2691         cout << endl << "CHECKPOINT: " << runLnL.size() << " independent run(s) restored" << endl;
2692     } else if (MPIHelper::getInstance().isMaster()) {
2693         // first empty the runtrees file
2694         try {
2695             ofstream tree_out;
2696             tree_out.exceptions(ios::failbit | ios::badbit);
2697             tree_out.open(runtrees_name.c_str());
2698             tree_out.close();
2699         } catch (ios::failure) {
2700             outError(ERR_WRITE_OUTPUT, runtrees_name);
2701         }
2702     }
2703 
2704     double start_time = getCPUTime();
2705     double start_real_time = getRealTime();
2706 
2707     int orig_seed = params.ran_seed;
2708     int run;
2709     int best_run = 0;
2710     for (run = 0; run < runLnL.size(); run++)
2711         if (runLnL[run] > runLnL[best_run])
2712             best_run = run;
2713 
2714     // do multiple tree reconstruction
2715     for (run = runLnL.size(); run < params.num_runs; run++) {
2716 
2717         tree->getCheckpoint()->startStruct("run" + convertIntToString(run+1));
2718 
2719         params.ran_seed = orig_seed + run*1000 + MPIHelper::getInstance().getProcessID();
2720 
2721         cout << endl << "---> START RUN NUMBER " << run + 1 << " (seed: " << params.ran_seed << ")" << endl;
2722 
2723         tree->getCheckpoint()->put("seed", params.ran_seed);
2724 
2725         // initialize random stream for replicating the run
2726 
2727         int *saved_randstream = randstream;
2728         init_random(params.ran_seed);
2729 
2730         IQTree *iqtree;
2731         if (alignment->isSuperAlignment()){
2732             if(params.partition_type != BRLEN_OPTIMIZE){
2733                 iqtree = new PhyloSuperTreePlen((SuperAlignment*) alignment, (PhyloSuperTree*) tree);
2734             } else {
2735                 iqtree = new PhyloSuperTree((SuperAlignment*) alignment, (PhyloSuperTree*) tree);
2736             }
2737         } else {
2738             // allocate heterotachy tree if neccessary
2739             int pos = posRateHeterotachy(alignment->model_name);
2740 
2741             if (params.num_mixlen > 1) {
2742                 iqtree = new PhyloTreeMixlen(alignment, params.num_mixlen);
2743             } else if (pos != string::npos) {
2744                 iqtree = new PhyloTreeMixlen(alignment, 0);
2745             } else
2746                 iqtree = new IQTree(alignment);
2747         }
2748 
2749         if (!tree->constraintTree.empty()) {
2750             iqtree->constraintTree.readConstraint(tree->constraintTree);
2751         }
2752 
2753         // set checkpoint
2754         iqtree->setCheckpoint(tree->getCheckpoint());
2755         iqtree->num_precision = tree->num_precision;
2756 
2757         runTreeReconstruction(params, iqtree);
2758         // read in the output tree file
2759         stringstream ss;
2760         iqtree->printTree(ss);
2761         if (MPIHelper::getInstance().isMaster())
2762             try {
2763                 ofstream tree_out;
2764                 tree_out.exceptions(ios::failbit | ios::badbit);
2765                 tree_out.open(runtrees_name.c_str(), ios_base::out | ios_base::app);
2766                 tree_out.precision(10);
2767                 tree_out << "[ lh=" << iqtree->getBestScore() << " ]";
2768                 tree_out << ss.str() << endl;
2769                 tree_out.close();
2770             } catch (ios::failure) {
2771                 outError(ERR_WRITE_OUTPUT, runtrees_name);
2772             }
2773         // fix bug: set the model for original tree after testing
2774         if ((params.model_name.substr(0,4) == "TEST" || params.model_name.substr(0,2) == "MF") && tree->isSuperTree()) {
2775             PhyloSuperTree *stree = ((PhyloSuperTree*)tree);
2776             stree->part_info =  ((PhyloSuperTree*)iqtree)->part_info;
2777         }
2778         runLnL.push_back(iqtree->getBestScore());
2779 
2780         if (MPIHelper::getInstance().isMaster()) {
2781             if (params.num_bootstrap_samples > 0 && params.consensus_type == CT_CONSENSUS_TREE &&
2782                 (run == 0 || iqtree->getBestScore() > runLnL[best_run])) {
2783                 // 2017-12-08: optimize branch lengths of consensus tree
2784                 // now optimize branch lengths of the consensus tree
2785                 string current_tree = iqtree->getTreeString();
2786                 optimizeConTree(params, iqtree);
2787                 // revert the best tree
2788                 iqtree->readTreeString(current_tree);
2789                 iqtree->saveCheckpoint();
2790             }
2791         }
2792         if (iqtree->getBestScore() > runLnL[best_run])
2793             best_run = run;
2794 
2795         if (params.num_runs == 1)
2796             reportPhyloAnalysis(params, *iqtree, *model_info);
2797         delete iqtree;
2798 
2799         tree->getCheckpoint()->endStruct();
2800         // clear all checkpointed information
2801 //        tree->getCheckpoint()->keepKeyPrefix("iqtree");
2802         tree->getCheckpoint()->putVector("runLnL", runLnL);
2803 //        tree->getCheckpoint()->putBool("finished", false);
2804         tree->getCheckpoint()->dump(true);
2805         // restore randstream
2806         finish_random();
2807         randstream = saved_randstream;
2808     }
2809 
2810     cout << endl << "---> SUMMARIZE RESULTS FROM " << runLnL.size() << " RUNS" << endl << endl;
2811 
2812     cout << "Run " << best_run+1 <<  " gave best log-likelihood: " << runLnL[best_run] << endl;
2813 
2814     // initialize tree and model strucgture
2815     ModelsBlock *models_block = readModelsDefinition(params);
2816     tree->setParams(&params);
2817     tree->setNumThreads(params.num_threads);
2818     if (!tree->getModelFactory()) {
2819         tree->initializeModel(params, tree->aln->model_name, models_block);
2820     }
2821     if (tree->getRate()->isHeterotachy() && !tree->isMixlen()) {
2822         ASSERT(0 && "Heterotachy tree not properly created");
2823     }
2824     delete models_block;
2825 
2826     // restore the tree and model from the best run
2827     tree->getCheckpoint()->startStruct("run" + convertIntToString(best_run+1));
2828     tree->restoreCheckpoint();
2829     tree->getModelFactory()->restoreCheckpoint();
2830     tree->setCurScore(runLnL[best_run]);
2831     if (params.gbo_replicates && !tree->isSuperTreeUnlinked()) {
2832 
2833         string out_file = (string)params.out_prefix + ".splits";
2834         if (params.print_splits_file) {
2835             tree->boot_splits.back()->saveFile(out_file.c_str(), IN_OTHER, true);
2836             cout << "Split supports printed to star-dot file " << out_file << endl;
2837         }
2838 
2839         if (params.print_splits_nex_file) {
2840             out_file = (string)params.out_prefix + ".splits.nex";
2841             tree->boot_splits.back()->saveFile(out_file.c_str(), IN_NEXUS, false);
2842             cout << "Split supports printed to NEXUS file " << out_file << endl;
2843         }
2844 
2845         // overwrite .ufboot trees
2846         if (params.print_ufboot_trees)
2847             tree->writeUFBootTrees(params);
2848 
2849         // overwrite .contree
2850         string contree;
2851         if (!tree->getCheckpoint()->getString("contree", contree))
2852             ASSERT(0 && "Couldn't restore contree");
2853         string contree_file = string(params.out_prefix) + ".contree";
2854         string current_tree = tree->getTreeString();
2855         tree->readTreeString(contree);
2856         tree->setRootNode(params.root);
2857         tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
2858         tree->printTree(contree_file.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
2859         tree->readTreeString(current_tree);
2860         cout << "Consensus tree written to " << contree_file << endl;
2861     }
2862     tree->getCheckpoint()->endStruct();
2863 
2864     // overwrite .treefile
2865     tree->printResultTree();
2866 
2867     if (MPIHelper::getInstance().isMaster()) {
2868         cout << "Total CPU time for " << params.num_runs << " runs: " << (getCPUTime() - start_time) << " seconds." << endl;
2869         cout << "Total wall-clock time for " << params.num_runs << " runs: " << (getRealTime() - start_real_time) << " seconds." << endl << endl;
2870     }
2871     delete model_info;
2872 }
2873 
computeLoglFromUserInputGAMMAInvar(Params & params,IQTree & iqtree)2874 void computeLoglFromUserInputGAMMAInvar(Params &params, IQTree &iqtree) {
2875     RateHeterogeneity *site_rates = iqtree.getRate();
2876     site_rates->setFixPInvar(true);
2877     site_rates->setFixGammaShape(true);
2878     vector<double> alphas, p_invars, logl;
2879     ifstream aiFile;
2880     aiFile.open(params.alpha_invar_file, ios_base::in);
2881     if (aiFile.good()) {
2882         double alpha, p_invar;
2883         while (aiFile >> alpha >> p_invar) {
2884             alphas.push_back(alpha);
2885             p_invars.push_back(p_invar);
2886         }
2887         aiFile.close();
2888         cout << "Computing tree logl based on the alpha and p_invar values in " << params.alpha_invar_file << " ..." <<
2889         endl;
2890     } else {
2891         stringstream errMsg;
2892         errMsg << "Could not find file: " << params.alpha_invar_file;
2893         outError(errMsg.str().c_str());
2894     }
2895     string aiResultsFileName = string(params.out_prefix) + "_" + string(params.alpha_invar_file) + ".results";
2896     ofstream aiFileResults;
2897     aiFileResults.open(aiResultsFileName.c_str());
2898     aiFileResults << fixed;
2899     aiFileResults.precision(4);
2900     DoubleVector lenvec;
2901     aiFileResults << "Alpha P_Invar Logl TreeLength\n";
2902     for (int i = 0; i < alphas.size(); i++) {
2903         iqtree.saveBranchLengths(lenvec);
2904         aiFileResults << alphas.at(i) << " " << p_invars.at(i) << " ";
2905         site_rates->setGammaShape(alphas.at(i));
2906         site_rates->setPInvar(p_invars.at(i));
2907         iqtree.clearAllPartialLH();
2908         double lh = iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, false, 0.001);
2909         aiFileResults << lh << " " << iqtree.treeLength() << "\n";
2910         iqtree.restoreBranchLengths(lenvec);
2911     }
2912     aiFileResults.close();
2913     cout << "Results were written to: " << aiResultsFileName << endl;
2914     cout << "Wall clock time used: " << getRealTime() - params.start_real_time << endl;
2915 }
2916 
searchGAMMAInvarByRestarting(IQTree & iqtree)2917 void searchGAMMAInvarByRestarting(IQTree &iqtree) {
2918     if (!Params::getInstance().fixed_branch_length)
2919         iqtree.setCurScore(iqtree.optimizeAllBranches(1));
2920     else
2921         iqtree.setCurScore(iqtree.computeLikelihood());
2922     RateHeterogeneity* site_rates = (iqtree.getRate());
2923     double values[] = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 };
2924     vector<double> initAlphas;
2925     if (Params::getInstance().randomAlpha) {
2926         while (initAlphas.size() < 10) {
2927             double initAlpha = random_double();
2928             initAlphas.push_back(initAlpha + iqtree.params->min_gamma_shape*2);
2929         }
2930     } else {
2931         initAlphas.assign(values, values+10);
2932     }
2933     double bestLogl = iqtree.getCurScore();
2934     double bestAlpha = 0.0;
2935     double bestPInvar = 0.0;
2936     double initPInvar = iqtree.getRate()->getPInvar();
2937 
2938     /* Back up branch lengths and substitutional rates */
2939     DoubleVector lenvec;
2940     DoubleVector bestLens;
2941     iqtree.saveBranchLengths(lenvec);
2942     int numRateEntries = iqtree.getModel()->getNumRateEntries();
2943     double *rates = new double[numRateEntries];
2944     double *bestRates = new double[numRateEntries];
2945     iqtree.getModel()->getRateMatrix(rates);
2946     int numStates = iqtree.aln->num_states;
2947     double *state_freqs = new double[numStates];
2948     iqtree.getModel()->getStateFrequency(state_freqs);
2949     double *bestStateFreqs =  new double[numStates];
2950 
2951     for (int i = 0; i < 10; i++) {
2952         cout << endl;
2953         cout << "Testing alpha: " << initAlphas[i] << endl;
2954         // Initialize model parameters
2955         iqtree.restoreBranchLengths(lenvec);
2956         ((ModelMarkov*) iqtree.getModel())->setRateMatrix(rates);
2957         ((ModelMarkov*) iqtree.getModel())->setStateFrequency(state_freqs);
2958         iqtree.getModel()->decomposeRateMatrix();
2959         site_rates->setGammaShape(initAlphas[i]);
2960         site_rates->setPInvar(initPInvar);
2961         iqtree.clearAllPartialLH();
2962         iqtree.optimizeModelParameters(verbose_mode >= VB_MED, Params::getInstance().testAlphaEps);
2963         double estAlpha = iqtree.getRate()->getGammaShape();
2964         double estPInv = iqtree.getRate()->getPInvar();
2965         double logl = iqtree.getCurScore();
2966         cout << "Est. alpha: " << estAlpha << " / Est. pinv: " << estPInv
2967         << " / Logl: " << logl << endl;
2968 
2969         if (iqtree.getCurScore() > bestLogl) {
2970             bestLogl = logl;
2971             bestAlpha = estAlpha;
2972             bestPInvar = estPInv;
2973             bestLens.clear();
2974             iqtree.saveBranchLengths(bestLens);
2975             iqtree.getModel()->getRateMatrix(bestRates);
2976             iqtree.getModel()->getStateFrequency(bestStateFreqs);
2977         }
2978     }
2979     site_rates->setGammaShape(bestAlpha);
2980     site_rates->setFixGammaShape(false);
2981     site_rates->setPInvar(bestPInvar);
2982     site_rates->setFixPInvar(false);
2983     ((ModelMarkov*) iqtree.getModel())->setRateMatrix(bestRates);
2984     ((ModelMarkov*) iqtree.getModel())->setStateFrequency(bestStateFreqs);
2985     iqtree.restoreBranchLengths(bestLens);
2986     iqtree.getModel()->decomposeRateMatrix();
2987     iqtree.clearAllPartialLH();
2988     iqtree.setCurScore(iqtree.computeLikelihood());
2989     cout << endl;
2990     cout << "Best initial alpha: " << bestAlpha << " / initial pinv: " << bestPInvar << " / ";
2991     cout << "Logl: " << iqtree.getCurScore() << endl;
2992 
2993     delete [] rates;
2994     delete [] state_freqs;
2995     delete [] bestRates;
2996     delete [] bestStateFreqs;
2997 }
2998 
2999 // Test alpha fom 0.1 to 15 and p_invar from 0.1 to 0.99, stepsize = 0.01
exhaustiveSearchGAMMAInvar(Params & params,IQTree & iqtree)3000 void exhaustiveSearchGAMMAInvar(Params &params, IQTree &iqtree) {
3001     double alphaMin = 0.01;
3002     double alphaMax = 10.00;
3003     double p_invarMin = 0.01;
3004     double p_invarMax = 1.00;
3005 //    double p_invarMax = iqtree.aln->frac_const_sites;
3006     double stepSize = 0.01;
3007     int numAlpha = (int) floor((alphaMax - alphaMin)/stepSize);
3008     int numInvar = (int) floor((p_invarMax - p_invarMin)/stepSize);
3009 
3010     cout << "EVALUATING " << numAlpha*numInvar << " COMBINATIONS OF " << " alpha=" << alphaMin << ".." << alphaMax
3011          << " AND " << " p-invar=" << p_invarMin << ".." << p_invarMax
3012          << " (epsilon: " << params.modelEps << ")" << endl;
3013 
3014 //    vector<string> results;
3015 //    results.reserve((unsigned long) (numAlpha * numInvar));
3016     DoubleVector lenvec;
3017     iqtree.saveBranchLengths(lenvec);
3018 
3019     RateHeterogeneity* site_rates = (iqtree.getRate());
3020     site_rates->setFixPInvar(true);
3021     site_rates->setFixGammaShape(true);
3022 
3023     string aiResultsFileName = string(params.out_prefix) + ".ai_results";
3024     ofstream aiFileResults;
3025     aiFileResults.open(aiResultsFileName.c_str());
3026     aiFileResults << fixed;
3027     aiFileResults.precision(4);
3028     aiFileResults << "alpha p_invar logl tree_len\n";
3029 
3030     for (double alpha = alphaMin; alpha < alphaMax; alpha = alpha + stepSize) {
3031         cout << "alpha = " << alpha << endl;
3032         for (double p_invar = p_invarMin; p_invar < p_invarMax; p_invar = p_invar + stepSize) {
3033             site_rates->setGammaShape(alpha);
3034             site_rates->setPInvar(p_invar);
3035             iqtree.clearAllPartialLH();
3036             double lh = iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, false, params.modelEps);
3037 //            stringstream ss;
3038 //            ss << fixed << setprecision(2) << alpha << " " << p_invar << " " << lh << " " << iqtree.treeLength();
3039             aiFileResults << alpha << " " << p_invar << " " << lh << " " << iqtree.treeLength() << endl;
3040             //cout << ss.str() << endl;
3041 //            results.push_back(ss.str());
3042             iqtree.restoreBranchLengths(lenvec);
3043         }
3044     }
3045 //    for (vector<string>::iterator it = results.begin(); it != results.end(); it++) {
3046 //                aiFileResults << (*it) << endl;
3047 //            }
3048     aiFileResults.close();
3049     cout << "Results were written to: " << aiResultsFileName << endl;
3050     cout << "Wall clock time used: " << getRealTime() - params.start_real_time << endl;
3051 }
3052 
3053 /**********************************************************
3054  * STANDARD NON-PARAMETRIC BOOTSTRAP
3055  ***********************************************************/
runStandardBootstrap(Params & params,Alignment * alignment,IQTree * tree)3056 void runStandardBootstrap(Params &params, Alignment *alignment, IQTree *tree) {
3057     ModelCheckpoint *model_info = new ModelCheckpoint;
3058     StrVector removed_seqs, twin_seqs;
3059 
3060     // turn off all branch tests
3061     int saved_aLRT_replicates = params.aLRT_replicates;
3062     int saved_localbp_replicates = params.localbp_replicates;
3063     bool saved_aLRT_test = params.aLRT_test;
3064     bool saved_aBayes_test = params.aBayes_test;
3065     params.aLRT_replicates = 0;
3066     params.localbp_replicates = 0;
3067     params.aLRT_test = false;
3068     params.aBayes_test = false;
3069 
3070     if (params.suppress_output_flags & OUT_TREEFILE)
3071         outError("Suppress .treefile not allowed for standard bootstrap");
3072     string treefile_name = params.out_prefix;
3073     treefile_name += ".treefile";
3074     string boottrees_name = params.out_prefix;
3075     boottrees_name += ".boottrees";
3076     string bootaln_name = params.out_prefix;
3077     bootaln_name += ".bootaln";
3078     string bootlh_name = params.out_prefix;
3079     bootlh_name += ".bootlh";
3080     int bootSample = 0;
3081     if (tree->getCheckpoint()->get("bootSample", bootSample)) {
3082         cout << "CHECKPOINT: " << bootSample << " bootstrap analyses restored" << endl;
3083     } else if (MPIHelper::getInstance().isMaster()) {
3084         // first empty the boottrees file
3085         try {
3086             ofstream tree_out;
3087             tree_out.exceptions(ios::failbit | ios::badbit);
3088             tree_out.open(boottrees_name.c_str());
3089             tree_out.close();
3090         } catch (ios::failure) {
3091             outError(ERR_WRITE_OUTPUT, boottrees_name);
3092         }
3093 
3094         // empty the bootaln file
3095         if (params.print_bootaln)
3096         try {
3097             ofstream tree_out;
3098             tree_out.exceptions(ios::failbit | ios::badbit);
3099             tree_out.open(bootaln_name.c_str());
3100             tree_out.close();
3101         } catch (ios::failure) {
3102             outError(ERR_WRITE_OUTPUT, bootaln_name);
3103         }
3104     }
3105 
3106     double start_time = getCPUTime();
3107     double start_real_time = getRealTime();
3108 
3109     startTreeReconstruction(params, tree, *model_info);
3110 
3111     // 2018-06-21: bug fix: alignment might be changed by -m ...MERGE
3112     alignment = tree->aln;
3113 
3114     // do bootstrap analysis
3115     for (int sample = bootSample; sample < params.num_bootstrap_samples; sample++) {
3116         cout << endl << "===> START " << RESAMPLE_NAME_UPPER << " REPLICATE NUMBER "
3117                 << sample + 1 << endl << endl;
3118 
3119         // 2015-12-17: initialize random stream for creating bootstrap samples
3120         // mainly so that checkpointing does not need to save bootstrap samples
3121         int *saved_randstream = randstream;
3122         init_random(params.ran_seed + sample);
3123 
3124         Alignment* bootstrap_alignment;
3125         cout << "Creating " << RESAMPLE_NAME << " alignment (seed: " << params.ran_seed+sample << ")..." << endl;
3126 
3127         if (alignment->isSuperAlignment())
3128             bootstrap_alignment = new SuperAlignment;
3129         else
3130             bootstrap_alignment = new Alignment;
3131         bootstrap_alignment->createBootstrapAlignment(alignment, NULL, params.bootstrap_spec);
3132 
3133         // restore randstream
3134         finish_random();
3135         randstream = saved_randstream;
3136 
3137         if (params.print_tree_lh && MPIHelper::getInstance().isMaster()) {
3138             double prob;
3139             bootstrap_alignment->multinomialProb(*alignment, prob);
3140             ofstream boot_lh;
3141             if (sample == 0)
3142                 boot_lh.open(bootlh_name.c_str());
3143             else
3144                 boot_lh.open(bootlh_name.c_str(), ios_base::out | ios_base::app);
3145             boot_lh << "0\t" << prob << endl;
3146             boot_lh.close();
3147         }
3148         IQTree *boot_tree;
3149         if (alignment->isSuperAlignment()){
3150             if(params.partition_type != BRLEN_OPTIMIZE){
3151                 boot_tree = new PhyloSuperTreePlen((SuperAlignment*) bootstrap_alignment, (PhyloSuperTree*) tree);
3152             } else {
3153                 boot_tree = new PhyloSuperTree((SuperAlignment*) bootstrap_alignment, (PhyloSuperTree*) tree);
3154             }
3155         } else {
3156             // allocate heterotachy tree if neccessary
3157             int pos = posRateHeterotachy(alignment->model_name);
3158 
3159             if (params.num_mixlen > 1) {
3160                 boot_tree = new PhyloTreeMixlen(bootstrap_alignment, params.num_mixlen);
3161             } else if (pos != string::npos) {
3162                 boot_tree = new PhyloTreeMixlen(bootstrap_alignment, 0);
3163             } else
3164                 boot_tree = new IQTree(bootstrap_alignment);
3165         }
3166         if (params.print_bootaln && MPIHelper::getInstance().isMaster()) {
3167             bootstrap_alignment->printAlignment(params.aln_output_format, bootaln_name.c_str(), true);
3168         }
3169 
3170         if (params.print_boot_site_freq && MPIHelper::getInstance().isMaster()) {
3171             printSiteStateFreq((((string)params.out_prefix)+"."+convertIntToString(sample)+".bootsitefreq").c_str(), bootstrap_alignment);
3172                 bootstrap_alignment->printAlignment(params.aln_output_format, (((string)params.out_prefix)+"."+convertIntToString(sample)+".bootaln").c_str());
3173         }
3174 
3175         if (!tree->constraintTree.empty()) {
3176             boot_tree->constraintTree.readConstraint(tree->constraintTree);
3177         }
3178 
3179         // set checkpoint
3180         boot_tree->setCheckpoint(tree->getCheckpoint());
3181         boot_tree->num_precision = tree->num_precision;
3182 
3183         runTreeReconstruction(params, boot_tree);
3184         // read in the output tree file
3185         stringstream ss;
3186         boot_tree->printTree(ss);
3187 //        try {
3188 //            ifstream tree_in;
3189 //            tree_in.exceptions(ios::failbit | ios::badbit);
3190 //            tree_in.open(treefile_name.c_str());
3191 //            tree_in >> tree_str;
3192 //            tree_in.close();
3193 //        } catch (ios::failure) {
3194 //            outError(ERR_READ_INPUT, treefile_name);
3195 //        }
3196         // write the tree into .boottrees file
3197         if (MPIHelper::getInstance().isMaster())
3198         try {
3199             ofstream tree_out;
3200             tree_out.exceptions(ios::failbit | ios::badbit);
3201             tree_out.open(boottrees_name.c_str(), ios_base::out | ios_base::app);
3202             tree_out << ss.str() << endl;
3203             tree_out.close();
3204         } catch (ios::failure) {
3205             outError(ERR_WRITE_OUTPUT, boottrees_name);
3206         }
3207         // OBSOLETE fix bug: set the model for original tree after testing
3208 //        if ((params.model_name.substr(0,4) == "TEST" || params.model_name.substr(0,2) == "MF") && tree->isSuperTree()) {
3209 //            PhyloSuperTree *stree = ((PhyloSuperTree*)tree);
3210 //            stree->part_info =  ((PhyloSuperTree*)boot_tree)->part_info;
3211 //        }
3212         if (params.num_bootstrap_samples == 1)
3213             reportPhyloAnalysis(params, *boot_tree, *model_info);
3214         // WHY was the following line missing, which caused memory leak?
3215         bootstrap_alignment = boot_tree->aln;
3216         delete boot_tree;
3217         // fix bug: bootstrap_alignment might be changed
3218         delete bootstrap_alignment;
3219 
3220         // clear all checkpointed information
3221         tree->getCheckpoint()->keepKeyPrefix("iqtree");
3222         tree->getCheckpoint()->put("bootSample", sample+1);
3223         tree->getCheckpoint()->putBool("finished", false);
3224         tree->getCheckpoint()->dump(true);
3225     }
3226 
3227 
3228     if (params.consensus_type == CT_CONSENSUS_TREE && MPIHelper::getInstance().isMaster()) {
3229 
3230         cout << endl << "===> COMPUTE CONSENSUS TREE FROM " << params.num_bootstrap_samples
3231             << RESAMPLE_NAME_UPPER << " TREES" << endl << endl;
3232         string root_name = (params.root) ? params.root : alignment->getSeqName(0);
3233         const char* saved_root = params.root;
3234         params.root = root_name.c_str();
3235         computeConsensusTree(boottrees_name.c_str(), 0, 1e6, -1,
3236                 params.split_threshold, NULL, params.out_prefix, NULL, &params);
3237         params.root = saved_root;
3238     }
3239 
3240     if (params.compute_ml_tree) {
3241         cout << endl << "===> START ANALYSIS ON THE ORIGINAL ALIGNMENT" << endl << endl;
3242         // restore branch tests
3243         params.aLRT_replicates = saved_aLRT_replicates;
3244         params.localbp_replicates = saved_localbp_replicates;
3245         params.aLRT_test = saved_aLRT_test;
3246         params.aBayes_test = saved_aBayes_test;
3247 
3248         if (params.num_runs == 1)
3249             runTreeReconstruction(params, tree);
3250         else
3251             runMultipleTreeReconstruction(params, tree->aln, tree);
3252 
3253         if (MPIHelper::getInstance().isMaster()) {
3254             if (params.consensus_type == CT_CONSENSUS_TREE && params.num_runs == 1) {
3255                 // 2017-12-08: optimize branch lengths of consensus tree
3256                 optimizeConTree(params, tree);
3257             }
3258 
3259             cout << endl << "===> ASSIGN " << RESAMPLE_NAME_UPPER
3260                 << " SUPPORTS TO THE TREE FROM ORIGINAL ALIGNMENT" << endl << endl;
3261             MExtTree ext_tree;
3262             assignBootstrapSupport(boottrees_name.c_str(), 0, 1e6,
3263                     treefile_name.c_str(), false, treefile_name.c_str(),
3264                     params.out_prefix, ext_tree, NULL, &params);
3265             tree->copyTree(&ext_tree);
3266             reportPhyloAnalysis(params, *tree, *model_info);
3267         }
3268     } else if (params.consensus_type == CT_CONSENSUS_TREE && MPIHelper::getInstance().isMaster()) {
3269         int mi = params.min_iterations;
3270         STOP_CONDITION sc = params.stop_condition;
3271         params.min_iterations = 0;
3272         params.stop_condition = SC_FIXED_ITERATION;
3273         runTreeReconstruction(params, tree);
3274         params.min_iterations = mi;
3275         params.stop_condition = sc;
3276         tree->stop_rule.initialize(params);
3277         optimizeConTree(params, tree);
3278         reportPhyloAnalysis(params, *tree, *model_info);
3279     } else
3280         cout << endl;
3281 
3282 #ifdef USE_BOOSTER
3283     if (params.transfer_bootstrap) {
3284         // transfer bootstrap expectation (TBE)
3285         cout << "Performing transfer bootstrap expectation..." << endl;
3286         string input_tree = (string)params.out_prefix + ".treefile";
3287         string boot_trees = (string)params.out_prefix + ".boottrees";
3288         string out_tree = (string)params.out_prefix + ".tbe.tree";
3289         string out_raw_tree = (string)params.out_prefix + ".tbe.rawtree";
3290         string stat_out = (string)params.out_prefix + ".tbe.stat";
3291         main_booster(input_tree.c_str(), boot_trees.c_str(), out_tree.c_str(),
3292                      (params.transfer_bootstrap==2) ? out_raw_tree.c_str() : NULL,
3293                      stat_out.c_str(), (verbose_mode >= VB_MED) ? 0 : 1);
3294         cout << "TBE tree written to " << out_tree << endl;
3295         if (params.transfer_bootstrap == 2)
3296             cout << "TBE raw tree written to " << out_raw_tree << endl;
3297         cout << "TBE statistic written to " << stat_out << endl;
3298         cout << endl;
3299     }
3300 #endif
3301 
3302     if (MPIHelper::getInstance().isMaster()) {
3303         cout << "Total CPU time for " << RESAMPLE_NAME << ": " << (getCPUTime() - start_time) << " seconds." << endl;
3304     cout << "Total wall-clock time for " << RESAMPLE_NAME << ": " << (getRealTime() - start_real_time) << " seconds." << endl << endl;
3305     cout << "Non-parametric " << RESAMPLE_NAME << " results written to:" << endl;
3306     if (params.print_bootaln)
3307         cout << RESAMPLE_NAME_I << " alignments:     " << params.out_prefix << ".bootaln" << endl;
3308     cout << RESAMPLE_NAME_I << " trees:          " << params.out_prefix << ".boottrees" << endl;
3309     if (params.consensus_type == CT_CONSENSUS_TREE)
3310         cout << "  Consensus tree:           " << params.out_prefix << ".contree" << endl;
3311     cout << endl;
3312     }
3313     delete model_info;
3314 }
3315 
convertAlignment(Params & params,IQTree * iqtree)3316 void convertAlignment(Params &params, IQTree *iqtree) {
3317     Alignment *alignment = iqtree->aln;
3318     if (params.num_bootstrap_samples || params.print_bootaln) {
3319         // create bootstrap alignment
3320         Alignment* bootstrap_alignment;
3321         cout << "Creating " << RESAMPLE_NAME << " alignment..." << endl;
3322         if (alignment->isSuperAlignment())
3323             bootstrap_alignment = new SuperAlignment;
3324         else
3325             bootstrap_alignment = new Alignment;
3326         bootstrap_alignment->createBootstrapAlignment(alignment, NULL, params.bootstrap_spec);
3327         delete alignment;
3328         alignment = bootstrap_alignment;
3329         iqtree->aln = alignment;
3330     }
3331 
3332     int exclude_sites = 0;
3333     if (params.aln_nogaps)
3334         exclude_sites += EXCLUDE_GAP;
3335     if (params.aln_no_const_sites)
3336         exclude_sites += EXCLUDE_INVAR;
3337 
3338     if (alignment->isSuperAlignment()) {
3339         alignment->printAlignment(params.aln_output_format, params.aln_output, false, params.aln_site_list,
3340                                   exclude_sites, params.ref_seq_name);
3341         if (params.print_subaln)
3342             ((SuperAlignment*)alignment)->printSubAlignments(params);
3343         if (params.aln_output_format != IN_NEXUS) {
3344             string partition_info = string(params.aln_output) + ".nex";
3345             ((SuperAlignment*)alignment)->printPartition(partition_info.c_str(), params.aln_output);
3346             partition_info = (string)params.aln_output + ".partitions";
3347             ((SuperAlignment*)alignment)->printPartitionRaxml(partition_info.c_str());
3348         }
3349     } else if (params.gap_masked_aln) {
3350         Alignment out_aln;
3351         Alignment masked_aln(params.gap_masked_aln, params.sequence_type, params.intype, params.model_name);
3352         out_aln.createGapMaskedAlignment(&masked_aln, alignment);
3353         out_aln.printAlignment(params.aln_output_format, params.aln_output, false, params.aln_site_list,
3354                 exclude_sites, params.ref_seq_name);
3355         string str = params.gap_masked_aln;
3356         str += ".sitegaps";
3357         out_aln.printSiteGaps(str.c_str());
3358     } else  {
3359         alignment->printAlignment(params.aln_output_format, params.aln_output, false, params.aln_site_list,
3360                 exclude_sites, params.ref_seq_name);
3361     }
3362 }
3363 
3364 /**
3365     2016-08-04: compute a site frequency model for profile mixture model
3366 */
computeSiteFrequencyModel(Params & params,Alignment * alignment)3367 void computeSiteFrequencyModel(Params &params, Alignment *alignment) {
3368 
3369     cout << endl << "===> COMPUTING SITE FREQUENCY MODEL BASED ON TREE FILE " << params.tree_freq_file << endl;
3370     ASSERT(params.tree_freq_file);
3371     PhyloTree *tree = new PhyloTree(alignment);
3372     tree->setParams(&params);
3373     bool myrooted = params.is_rooted;
3374     tree->readTree(params.tree_freq_file, myrooted);
3375     tree->setAlignment(alignment);
3376     tree->setRootNode(params.root);
3377 
3378     ModelsBlock *models_block = readModelsDefinition(params);
3379     tree->setModelFactory(new ModelFactory(params, alignment->model_name, tree, models_block));
3380     delete models_block;
3381     tree->setModel(tree->getModelFactory()->model);
3382     tree->setRate(tree->getModelFactory()->site_rate);
3383     tree->setLikelihoodKernel(params.SSE);
3384     tree->setNumThreads(params.num_threads);
3385 
3386     if (!tree->getModel()->isMixture())
3387         outError("No mixture model was specified!");
3388     uint64_t mem_size = tree->getMemoryRequired();
3389     uint64_t total_mem = getMemorySize();
3390     cout << "NOTE: " << (mem_size / 1024) / 1024 << " MB RAM is required!" << endl;
3391     if (mem_size >= total_mem) {
3392         outError("Memory required exceeds your computer RAM size!");
3393     }
3394 #ifdef BINARY32
3395     if (mem_size >= 2000000000) {
3396         outError("Memory required exceeds 2GB limit of 32-bit executable");
3397     }
3398 #endif
3399 
3400 #ifdef _OPENMP
3401     if (tree->num_threads <= 0) {
3402         int bestThreads = tree->testNumThreads();
3403         omp_set_num_threads(bestThreads);
3404     } else
3405         tree->warnNumThreads();
3406 #endif
3407 
3408     tree->initializeAllPartialLh();
3409     // 2017-12-07: Increase espilon ten times (0.01 -> 0.1) to speedup PMSF computation
3410     tree->getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.modelEps*10);
3411 
3412     size_t nptn = alignment->getNPattern(), nstates = alignment->num_states;
3413     double *ptn_state_freq = new double[nptn*nstates];
3414     tree->computePatternStateFreq(ptn_state_freq);
3415     alignment->site_state_freq.resize(nptn);
3416     for (size_t ptn = 0; ptn < nptn; ptn++) {
3417         double *f = new double[nstates];
3418         memcpy(f, ptn_state_freq+ptn*nstates, sizeof(double)*nstates);
3419         alignment->site_state_freq[ptn] = f;
3420     }
3421     alignment->getSitePatternIndex(alignment->site_model);
3422     printSiteStateFreq(((string)params.out_prefix+".sitefreq").c_str(), tree, ptn_state_freq);
3423     params.print_site_state_freq = WSF_NONE;
3424 
3425     delete [] ptn_state_freq;
3426     delete tree;
3427 
3428     cout << endl << "===> CONTINUE ANALYSIS USING THE INFERRED SITE FREQUENCY MODEL" << endl;
3429 }
3430 
3431 
3432 /**********************************************************
3433  * TOP-LEVEL FUNCTION
3434  ***********************************************************/
3435 
newIQTree(Params & params,Alignment * alignment)3436 IQTree *newIQTree(Params &params, Alignment *alignment) {
3437     IQTree *tree;
3438     if (alignment->isSuperAlignment()) {
3439         if (params.partition_type == TOPO_UNLINKED) {
3440             tree = new PhyloSuperTreeUnlinked((SuperAlignment*)alignment);
3441         } else if(params.partition_type != BRLEN_OPTIMIZE){
3442             // initialize supertree - Proportional Edges case
3443             tree = new PhyloSuperTreePlen((SuperAlignment*)alignment, params.partition_type);
3444         } else {
3445             // initialize supertree stuff if user specifies partition file with -sp option
3446             tree = new PhyloSuperTree((SuperAlignment*)alignment);
3447         }
3448         // this alignment will actually be of type SuperAlignment
3449         //        alignment = tree->aln;
3450         if (((PhyloSuperTree*)tree)->rescale_codon_brlen)
3451             cout << "NOTE: Mixed codon and other data, branch lengths of codon partitions are rescaled by 3!" << endl;
3452 
3453     } else {
3454         // allocate heterotachy tree if neccessary
3455         int pos = posRateHeterotachy(alignment->model_name);
3456 
3457         if (params.num_mixlen > 1) {
3458             tree = new PhyloTreeMixlen(alignment, params.num_mixlen);
3459         } else if (pos != string::npos) {
3460             tree = new PhyloTreeMixlen(alignment, 0);
3461         } else
3462             tree = new IQTree(alignment);
3463     }
3464 
3465     return tree;
3466 }
3467 
3468 /** get ID of bad or good symtest results */
getSymTestID(vector<SymTestResult> & res,set<int> & id,bool bad_res)3469 void getSymTestID(vector<SymTestResult> &res, set<int> &id, bool bad_res) {
3470     if (bad_res) {
3471         // get significant test ID
3472         switch (Params::getInstance().symtest) {
3473             case SYMTEST_BINOM:
3474                 for (auto i = res.begin(); i != res.end(); i++)
3475                     if (i->pvalue_binom < Params::getInstance().symtest_pcutoff)
3476                         id.insert(i - res.begin());
3477                 break;
3478             case SYMTEST_MAXDIV:
3479                 for (auto i = res.begin(); i != res.end(); i++)
3480                     if (i->pvalue_maxdiv < Params::getInstance().symtest_pcutoff)
3481                         id.insert(i - res.begin());
3482                 break;
3483             default:
3484                 break;
3485         }
3486     } else {
3487         // get non-significant test ID
3488         switch (Params::getInstance().symtest) {
3489             case SYMTEST_BINOM:
3490                 for (auto i = res.begin(); i != res.end(); i++)
3491                     if (i->pvalue_binom >= Params::getInstance().symtest_pcutoff)
3492                         id.insert(i - res.begin());
3493                 break;
3494             case SYMTEST_MAXDIV:
3495                 for (auto i = res.begin(); i != res.end(); i++)
3496                     if (i->pvalue_maxdiv >= Params::getInstance().symtest_pcutoff)
3497                         id.insert(i - res.begin());
3498                 break;
3499             default:
3500                 break;
3501         }
3502     }
3503 }
3504 
computePValueSMax(vector<SymTestResult> & sym,int start,int step)3505 double computePValueSMax(vector<SymTestResult> &sym, int start, int step) {
3506     double orig_max = sym[start].max_stat;
3507     int count = 0, num = 0;
3508     for (size_t i = start; i < sym.size(); i += step, num++)
3509         if (sym[i].max_stat >= orig_max)
3510             count++;
3511     return double(count)/num;
3512 
3513 }
3514 
doSymTest(Alignment * alignment,Params & params)3515 void doSymTest(Alignment *alignment, Params &params) {
3516     double start_time = getRealTime();
3517     cout << "Performing matched-pair tests of symmetry...";
3518     vector<SymTestResult> sym, marsym, intsym;
3519 
3520     size_t num_parts = 1;
3521     if (alignment->isSuperAlignment())
3522         num_parts = ((SuperAlignment*)alignment)->partitions.size();
3523 
3524     string filename_stat = string(params.out_prefix) + ".symstat.csv";
3525     ofstream *out_stat = NULL;
3526     if (params.symtest_stat) {
3527         out_stat = new ofstream;
3528         out_stat->open(filename_stat);
3529         *out_stat
3530         << "# Statistic values for matched-pair tests of symmetry" << endl
3531         << "# This file can be read in MS Excel or in R with command:" << endl
3532         << "#    dat=read.csv('" <<  filename_stat << "',comment.char='#')" << endl
3533         << "# Columns are comma-separated with following meanings:" << endl
3534         << "#    ID:     Partition ID" << endl
3535         << "#    Seq1:   ID of sequence 1 within partition" << endl
3536         << "#    Seq1:   ID of sequence 2 within partition" << endl
3537         << "#    Sym:    Statistic for test of symmetry" << endl
3538         << "#    SymChi: Chi-square p-value for test of symmetry" << endl
3539         << "#    Mar:    Statistic for test of marginal symmetry" << endl
3540         << "#    MarChi: Chi-square p-value for marginal test of symmetry" << endl
3541         << "#    Int:    Statistic for test of internal symmetry" << endl
3542         << "#    MarChi: Chi-square p-value for internal test of symmetry" << endl
3543         << "ID,Seq1,Seq2,Sym,SymChi,Mar,MarChi,Int,IntChi" << endl;
3544 
3545     }
3546 
3547     sym.resize(num_parts*params.symtest_shuffle);
3548     marsym.resize(num_parts*params.symtest_shuffle);
3549     intsym.resize(num_parts*params.symtest_shuffle);
3550 
3551     for (int i = 0; i < params.symtest_shuffle; i++) {
3552         vector<SymTestStat> *stats = NULL;
3553         if (params.symtest_stat)
3554             stats = new vector<SymTestStat>;
3555         if (i == 0) // original alignment
3556             alignment->doSymTest(i*num_parts, sym, marsym, intsym, NULL, stats);
3557         else {
3558             int *rstream;
3559             init_random(params.ran_seed+i+1, false, &rstream);
3560             alignment->doSymTest(i*num_parts, sym, marsym, intsym, rstream, stats);
3561             finish_random(rstream);
3562         }
3563         if ((i+1)*10 % params.symtest_shuffle == 0) {
3564             cout << " " << (i+1)*100 / params.symtest_shuffle << "%";
3565             cout.flush();
3566         }
3567         if (!stats)
3568             continue;
3569         for (auto it = stats->begin(); it != stats->end(); it++) {
3570             *out_stat << it->part << ',' << it->seq1 << ',' << it->seq2 << ','
3571             << it->chi2_sym << ',' << it->pval_sym << ','
3572             << it->chi2_marsym << ',' << it->pval_marsym << ','
3573             << it->chi2_intsym << ',' << it->pval_intsym << endl;
3574         }
3575         delete stats;
3576     }
3577 
3578     if (out_stat) {
3579         out_stat->close();
3580         delete out_stat;
3581     }
3582 
3583     if (params.symtest_shuffle > 1) {
3584         // compute p-value for s-max approach
3585         for (int part = 0; part < num_parts; part++) {
3586             sym[part].pvalue_perm = computePValueSMax(sym, part, num_parts);
3587             marsym[part].pvalue_perm = computePValueSMax(marsym, part, num_parts);
3588             intsym[part].pvalue_perm = computePValueSMax(intsym, part, num_parts);
3589         }
3590     }
3591 
3592     string filename = string(params.out_prefix) + ".symtest.csv";
3593     ofstream out;
3594     out.open(filename);
3595     out << "# Matched-pair tests of symmetry" << endl
3596     << "# This file can be read in MS Excel or in R with command:" << endl
3597     << "#    dat=read.csv('" <<  filename << "',comment.char='#')" << endl
3598     << "# Columns are comma-separated with following meanings:" << endl
3599     << "#    Name:    Partition name" << endl
3600     << "#    SymSig:  Number of significant sequence pairs by test of symmetry" << endl
3601     << "#    SymNon:  Number of non-significant sequence pairs by test of symmetry" << endl
3602     << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "#    SymBi:   P-value for binomial test of symmetry" : "#    SymPval: P-value for maximum test of symmetry") << endl;
3603     if (params.symtest_shuffle > 1)
3604         out << "#    SymMax:  Maximum of pair statistics by test of symmetry" << endl
3605             << "#    SymPerm: P-value for permutation test of symmetry" << endl;
3606 
3607     out << "#    MarSig:  Number of significant sequence pairs by test of marginal symmetry" << endl
3608     << "#    MarNon:  Number of non-significant sequence pairs by test of marginal symmetry" << endl
3609     << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "#    MarBi:   P-value for binomial test of marginal symmetry" : "#    MarPval: P-value for maximum test of marginal symmetry") << endl;
3610     if (params.symtest_shuffle > 1)
3611         out << "#    MarMax:  Maximum of pair statistics by test of marginal symmetry" << endl
3612             << "#    MarPerm: P-value for permutation test of marginal symmetry" << endl;
3613     out << "#    IntSig:  Number of significant sequence pairs by test of internal symmetry" << endl
3614     << "#    IntNon:  Number of non-significant sequence pairs by test of internal symmetry" << endl
3615     << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "#    IntBi:   P-value for binomial test of symmetry" : "#    IntPval: P-value for maximum test of internal symmetry") << endl;
3616     if (params.symtest_shuffle > 1)
3617         out << "#    IntMax:  Maximum of pair statistics by test of internal symmetry" << endl
3618         << "#    IntPerm: P-value for permutation test of internal symmetry" << endl;
3619 
3620     out << "Name,SymSig,SymNon," << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "SymBi" : "SymPval")
3621         << ((params.symtest_shuffle > 1) ? ",SymMax,SymPerm" : "")
3622         << ",MarSig,MarNon," << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "MarBi" : "MarPval")
3623         << ((params.symtest_shuffle > 1) ? ",MarMax,MarPerm" : "")
3624         << ",IntSig,IntNon," << ((Params::getInstance().symtest == SYMTEST_BINOM) ? "IntBi" : "IntPval")
3625         << ((params.symtest_shuffle > 1) ? ",IntMax,IntPerm" : "") << endl;
3626 
3627     if (alignment->isSuperAlignment()) {
3628         SuperAlignment *saln = (SuperAlignment*)alignment;
3629         for (int part = 0; part < saln->partitions.size(); part++)
3630             out << saln->partitions[part]->name << ','
3631                 << sym[part] << ',' << marsym[part] << ','  << intsym[part] << endl;
3632     } else {
3633         out << alignment->name << ',' << sym[0] << ',' << marsym[0] << ','  << intsym[0] << endl;
3634     }
3635 
3636     if (params.symtest_shuffle > 1) {
3637         for (int part = num_parts; part < sym.size(); part++) {
3638             sym[part].pvalue_perm = marsym[part].pvalue_perm = intsym[part].pvalue_perm = -1.0;
3639             out << part % num_parts << ','
3640             << sym[part] << ',' << marsym[part] << ','  << intsym[part] << endl;
3641         }
3642         // erase the rest
3643         sym.erase(sym.begin()+num_parts, sym.end());
3644         marsym.erase(marsym.begin()+num_parts, marsym.end());
3645         intsym.erase(intsym.begin()+num_parts, intsym.end());
3646     }
3647 
3648     out.close();
3649     cout << " " << getRealTime() - start_time << " seconds" << endl;
3650     if (params.symtest_stat)
3651         cout << "SymTest statistics written to " << filename_stat << endl;
3652     cout << "SymTest results written to " << filename << endl;
3653 
3654     // now filter out partitions
3655     if (alignment->isSuperAlignment()) {
3656         set<int> part_id;
3657         if (params.symtest_remove == 1) {
3658             // remove bad loci
3659             if (params.symtest_type == 0)
3660                 getSymTestID(sym, part_id, true);
3661             else if (params.symtest_type == 1)
3662                 getSymTestID(marsym, part_id, true);
3663             else
3664                 getSymTestID(intsym, part_id, true);
3665         } else if (params.symtest_remove == 2) {
3666             // remove good loci
3667             if (params.symtest_type == 0)
3668                 getSymTestID(sym, part_id, false);
3669             else if (params.symtest_type == 1)
3670                 getSymTestID(marsym, part_id, false);
3671             else
3672                 getSymTestID(intsym, part_id, false);
3673         }
3674         if (!part_id.empty()) {
3675             SuperAlignment *saln = (SuperAlignment*)alignment;
3676             cout << "Removing " << part_id.size()
3677             << ((params.symtest_remove == 1)? " bad" : " good") << " partitions (pvalue cutoff = "
3678             << params.symtest_pcutoff << ")..." << endl;
3679             if (part_id.size() < alignment->getNSite())
3680                 saln->removePartitions(part_id);
3681             else
3682                 outError("Can't remove all partitions");
3683             if (params.aln_output_format == IN_NEXUS) {
3684                 string aln_file = (string)params.out_prefix + ((params.symtest_remove == 1)? ".good.nex" : ".bad.nex");
3685                 alignment->printAlignment(params.aln_output_format, aln_file.c_str());
3686             } else {
3687                 string aln_file = (string)params.out_prefix + ((params.symtest_remove == 1)? ".good.phy" : ".bad.phy");
3688                 alignment->printAlignment(params.aln_output_format, aln_file.c_str());
3689                 string filename = (string)params.out_prefix + ((params.symtest_remove == 2)? ".good.nex" : ".bad.nex");
3690                 saln->printPartition(filename.c_str(), aln_file.c_str());
3691             }
3692         }
3693     }
3694     if (params.symtest_only)
3695         exit(EXIT_SUCCESS);
3696 }
3697 
runPhyloAnalysis(Params & params,Checkpoint * checkpoint)3698 void runPhyloAnalysis(Params &params, Checkpoint *checkpoint) {
3699     Alignment *alignment;
3700 
3701     checkpoint->putBool("finished", false);
3702     checkpoint->setDumpInterval(params.checkpoint_dump_interval);
3703 
3704     /****************** read in alignment **********************/
3705     if (params.partition_file) {
3706         // Partition model analysis
3707         if (params.partition_type == TOPO_UNLINKED)
3708             alignment = new SuperAlignmentUnlinked(params);
3709         else
3710             alignment = new SuperAlignment(params);
3711     } else {
3712         alignment = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name);
3713 
3714         if (params.freq_const_patterns) {
3715             int orig_nsite = alignment->getNSite();
3716             alignment->addConstPatterns(params.freq_const_patterns);
3717             cout << "INFO: " << alignment->getNSite() - orig_nsite << " const sites added into alignment" << endl;
3718         }
3719 
3720         // Initialize site-frequency model
3721         if (params.tree_freq_file) {
3722             if (checkpoint->getBool("finishedSiteFreqFile")) {
3723                 alignment->readSiteStateFreq(((string)params.out_prefix + ".sitefreq").c_str());
3724                 params.print_site_state_freq = WSF_NONE;
3725                 cout << "CHECKPOINT: Site frequency model restored" << endl;
3726             } else {
3727                 computeSiteFrequencyModel(params, alignment);
3728                 checkpoint->putBool("finishedSiteFreqFile", true);
3729                 checkpoint->dump();
3730             }
3731         }
3732         if (params.site_freq_file) {
3733             alignment->readSiteStateFreq(params.site_freq_file);
3734         }
3735     }
3736 
3737     if (params.symtest) {
3738         doSymTest(alignment, params);
3739     }
3740 
3741     if (params.print_aln_info) {
3742         string site_info_file = string(params.out_prefix) + ".alninfo";
3743         alignment->printSiteInfo(site_info_file.c_str());
3744         cout << "Alignment sites statistics printed to " << site_info_file << endl;
3745     }
3746 
3747     /*************** initialize tree ********************/
3748     IQTree *tree = newIQTree(params, alignment);
3749 
3750     tree->setCheckpoint(checkpoint);
3751     if (params.min_branch_length <= 0.0) {
3752         params.min_branch_length = 1e-6;
3753         if (!tree->isSuperTree() && tree->getAlnNSite() >= 100000) {
3754             params.min_branch_length = 0.1 / (tree->getAlnNSite());
3755             tree->num_precision = max((int)ceil(-log10(Params::getInstance().min_branch_length))+1, 6);
3756             cout.precision(12);
3757             cout << "NOTE: minimal branch length is reduced to " << params.min_branch_length << " for long alignment" << endl;
3758             cout.precision(3);
3759         }
3760         // Increase the minimum branch length if PoMo is used.
3761         if (alignment->seq_type == SEQ_POMO) {
3762             params.min_branch_length *= alignment->virtual_pop_size * alignment->virtual_pop_size;
3763             cout.precision(12);
3764             cout << "NOTE: minimal branch length is increased to " << params.min_branch_length << " because PoMo infers number of mutations and frequency shifts" << endl;
3765             cout.precision(3);
3766         }
3767     }
3768     // Increase the minimum branch length if PoMo is used.
3769     if (alignment->seq_type == SEQ_POMO) {
3770         params.max_branch_length *= alignment->virtual_pop_size * alignment->virtual_pop_size;
3771         cout.precision(1);
3772         cout << "NOTE: maximal branch length is increased to " << params.max_branch_length << " because PoMo infers number of mutations and frequency shifts" << endl;
3773         cout.precision(3);
3774     }
3775 
3776 
3777     if (params.concatenate_aln) {
3778         Alignment aln(params.concatenate_aln, params.sequence_type, params.intype, params.model_name);
3779         cout << "Concatenating " << params.aln_file << " with " << params.concatenate_aln << " ..." << endl;
3780         alignment->concatenateAlignment(&aln);
3781     }
3782 
3783     if (params.constraint_tree_file) {
3784         cout << "Reading constraint tree " << params.constraint_tree_file << "..." << endl;
3785         tree->constraintTree.readConstraint(params.constraint_tree_file, alignment->getSeqNames());
3786         if (params.start_tree == STT_PLL_PARSIMONY)
3787             params.start_tree = STT_PARSIMONY;
3788         else if (params.start_tree == STT_BIONJ)
3789             outError("Constraint tree does not work with -t BIONJ");
3790         if (params.num_bootstrap_samples || params.gbo_replicates)
3791             cout << "INFO: Constraint tree will be applied to ML tree and all bootstrap trees." << endl;
3792     }
3793 
3794     if (params.compute_seq_identity_along_tree) {
3795         if (!params.user_file)
3796             outError("Please supply a user tree file!");
3797         tree->readTree(params.user_file, params.is_rooted);
3798         if (!tree->rooted && !params.root) {
3799             outError("Tree is unrooted, thus you have to specify a root with -o option");
3800         }
3801         tree->setAlignment(tree->aln);
3802         if (!tree->rooted)
3803             tree->setRootNode(params.root);
3804         tree->computeSeqIdentityAlongTree();
3805         if (verbose_mode >= VB_MED)
3806             tree->drawTree(cout);
3807         string out_tree = (string)params.out_prefix + ".seqident_tree";
3808         tree->printTree(out_tree.c_str());
3809         cout << "Tree with sequence identity printed to " << out_tree << endl;
3810     } else if (params.aln_output) {
3811         /************ convert alignment to other format and write to output file *************/
3812         convertAlignment(params, tree);
3813     } else if (params.gbo_replicates > 0 && params.user_file && params.second_tree) {
3814         // run one of the UFBoot analysis
3815 //        runGuidedBootstrap(params, alignment, *tree);
3816         outError("Obsolete feature");
3817     } else if (params.avh_test) {
3818         // run one of the wondering test for Arndt
3819 //        runAvHTest(params, alignment, *tree);
3820         outError("Obsolete feature");
3821     } else if (params.bootlh_test) {
3822         // run Arndt's plot of tree likelihoods against bootstrap alignments
3823 //        runBootLhTest(params, alignment, *tree);
3824         outError("Obsolete feature");
3825     } else if (params.num_bootstrap_samples == 0) {
3826     /********************************************************************************
3827                     THE MAIN MAXIMUM LIKELIHOOD TREE RECONSTRUCTION
3828      ********************************************************************************/
3829         ModelCheckpoint *model_info = new ModelCheckpoint;
3830         alignment->checkGappySeq(params.remove_empty_seq);
3831 
3832         // remove identical sequences
3833         if (params.ignore_identical_seqs) {
3834             tree->removeIdenticalSeqs(params);
3835             if (tree->removed_seqs.size() > 0 && MPIHelper::getInstance().isMaster() && (params.suppress_output_flags & OUT_UNIQUESEQ) == 0) {
3836                 string filename = (string)params.out_prefix + ".uniqueseq.phy";
3837                 tree->aln->printAlignment(params.aln_output_format, filename.c_str());
3838                 cout << endl << "For your convenience alignment with unique sequences printed to " << filename << endl;
3839             }
3840         }
3841         alignment = NULL; // from now on use tree->aln instead
3842 
3843         startTreeReconstruction(params, tree, *model_info);
3844         // call main tree reconstruction
3845         if (params.num_runs == 1)
3846             runTreeReconstruction(params, tree);
3847         else
3848             runMultipleTreeReconstruction(params, tree->aln, tree);
3849 
3850         if (MPIHelper::getInstance().isMaster()) {
3851             reportPhyloAnalysis(params, *tree, *model_info);
3852         }
3853 
3854         // reinsert identical sequences
3855         if (tree->removed_seqs.size() > 0) {
3856             // BUG FIX: dont use reinsertIdenticalSeqs anymore
3857             tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
3858             tree->printResultTree();
3859         }
3860         delete model_info;
3861 
3862         if (params.dating_method != "") {
3863             doTimeTree(tree);
3864         }
3865 
3866     } else {
3867         // the classical non-parameter bootstrap (SBS)
3868 //        if (params.model_name.find("LINK") != string::npos || params.model_name.find("MERGE") != string::npos)
3869 //            outError("-m TESTMERGE is not allowed when doing standard bootstrap. Please first\nfind partition scheme on the original alignment and use it for bootstrap analysis");
3870         if (alignment->getNSeq() < 4)
3871             outError("It makes no sense to perform bootstrap with less than 4 sequences.");
3872         runStandardBootstrap(params, alignment, tree);
3873     }
3874 
3875 //    if (params.upper_bound) {
3876 //            UpperBounds(&params, alignment, tree);
3877 //    }
3878 
3879     if(verbose_mode >= VB_MED){
3880         if(tree->isSuperTree() && params.partition_type != BRLEN_OPTIMIZE){
3881             ((PhyloSuperTreePlen*) tree)->printNNIcasesNUM();
3882         }
3883     }
3884     // 2015-09-22: bug fix, move this line to before deleting tree
3885     alignment = tree->aln;
3886     delete tree;
3887     // BUG FIX: alignment can be changed, should delete tree->aln instead
3888     // 2015-09-22: THIS IS STUPID: after deleting tree, one cannot access tree->aln anymore
3889 //    alignment = tree->aln;
3890     delete alignment;
3891 
3892     checkpoint->putBool("finished", true);
3893     checkpoint->dump(true);
3894 }
3895 
3896 /**
3897     Perform separate tree reconstruction when tree topologies
3898     are unlinked between partitions
3899  */
runUnlinkedPhyloAnalysis(Params & params,Checkpoint * checkpoint)3900 void runUnlinkedPhyloAnalysis(Params &params, Checkpoint *checkpoint) {
3901     SuperAlignment *super_aln;
3902 
3903     ASSERT(params.partition_file);
3904 
3905     /****************** read in alignment **********************/
3906     // Partition model analysis
3907     super_aln = new SuperAlignmentUnlinked(params);
3908     PhyloSuperTree *super_tree = new PhyloSuperTree(super_aln);
3909 
3910     /**** do separate tree reconstruction for each partition ***/
3911 
3912     MTreeSet part_trees;
3913 
3914     if (params.user_file) {
3915         // reading user tree file for all partitions
3916         bool is_rooted = false;
3917         part_trees.readTrees(params.user_file, is_rooted, 0, super_aln->partitions.size());
3918         if (is_rooted)
3919             outError("Rooted trees not allowed: ", params.user_file);
3920         if (part_trees.size() != super_aln->partitions.size())
3921             outError("User tree file does not have the same number of trees as partitions");
3922         params.user_file = NULL;
3923     }
3924 
3925     ModelCheckpoint *model_info = new ModelCheckpoint;
3926     int part = 0;
3927     for (auto alnit = super_aln->partitions.begin(); alnit != super_aln->partitions.end(); alnit++, part++) {
3928 
3929         checkpoint->startStruct((*alnit)->name);
3930 
3931         // allocate heterotachy tree if neccessary
3932         int pos = posRateHeterotachy((*alnit)->model_name);
3933         IQTree *tree;
3934 
3935         if (params.num_mixlen > 1) {
3936             tree = new PhyloTreeMixlen((*alnit), params.num_mixlen);
3937         } else if (pos != string::npos) {
3938             tree = new PhyloTreeMixlen((*alnit), 0);
3939         } else
3940             tree = new IQTree((*alnit));
3941 
3942         tree->setCheckpoint(checkpoint);
3943         if (checkpoint->getBool("finished")) {
3944             tree->restoreCheckpoint();
3945         } else {
3946             if (!part_trees.empty())
3947                 tree->copyTree(part_trees[part]);
3948 
3949             startTreeReconstruction(params, tree, *model_info);
3950             // call main tree reconstruction
3951             if (params.num_runs == 1)
3952                 runTreeReconstruction(params, tree);
3953             else
3954                 runMultipleTreeReconstruction(params, tree->aln, tree);
3955             checkpoint->putBool("finished", true);
3956             checkpoint->dump();
3957         }
3958 
3959         super_tree->at(part)->copyTree(tree);
3960 
3961         delete tree;
3962         checkpoint->endStruct();
3963     }
3964 
3965     IQTree *iqtree = super_tree;
3966     super_tree->setCheckpoint(checkpoint);
3967     startTreeReconstruction(params, iqtree, *model_info);
3968     runTreeReconstruction(params, iqtree);
3969     if (MPIHelper::getInstance().isMaster())
3970         reportPhyloAnalysis(params, *iqtree, *model_info);
3971 
3972     delete super_tree;
3973     delete super_aln;
3974     delete model_info;
3975 }
3976 
assignBranchSupportNew(Params & params)3977 void assignBranchSupportNew(Params &params) {
3978     if (!params.user_file)
3979         outError("No target tree file provided");
3980     if (params.num_threads == 0)
3981         outError("-nt AUTO is not supported for concordance factor analysis, please specify no. cores");
3982     PhyloTree *tree;
3983     Alignment *aln = NULL;
3984     if (params.site_concordance) {
3985         if (!params.aln_file && !params.partition_file)
3986             outError("Please provide an alignment (-s) or partition file");
3987         if (params.partition_file) {
3988             params.compute_seq_composition = false;
3989             aln = new SuperAlignment(params);
3990             tree = new PhyloSuperTree((SuperAlignment*)aln);
3991         } else {
3992             aln = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name);
3993             tree = new PhyloTree;
3994         }
3995     } else {
3996         tree = new PhyloTree;
3997     }
3998     tree->setParams(&params);
3999 
4000     cout << "Reading tree " << params.user_file << " ..." << endl;
4001     bool rooted = params.is_rooted;
4002     tree->readTree(params.user_file, rooted);
4003     cout << ((tree->rooted) ? "rooted" : "un-rooted") << " tree with "
4004         << tree->leafNum - tree->rooted << " taxa and " << tree->branchNum << " branches" << endl;
4005 
4006     // 2018-12-13: move initialisation to fix rooted vs unrooted tree
4007     if (params.site_concordance) {
4008         tree->setAlignment(aln);
4009         if (tree->isSuperTree())
4010             ((PhyloSuperTree*)tree)->mapTrees();
4011     }
4012 
4013     BranchVector branches;
4014     tree->getInnerBranches(branches);
4015     BranchVector::iterator brit;
4016     for (brit = branches.begin(); brit != branches.end(); brit++) {
4017         Neighbor *branch = brit->second->findNeighbor(brit->first);
4018         string label = brit->second->name;
4019         if (!label.empty())
4020             PUT_ATTR(branch, label);
4021     }
4022 
4023     map<string,string> meanings;
4024 
4025     if (!params.treeset_file.empty()) {
4026         bool rooted = params.is_rooted;
4027         MTreeSet trees(params.treeset_file.c_str(), rooted, params.tree_burnin, params.tree_max_count);
4028         double start_time = getRealTime();
4029         cout << "Computing gene concordance factor..." << endl;
4030         tree->computeGeneConcordance(trees, meanings);
4031         if (params.internode_certainty)
4032             tree->computeQuartetConcordance(trees);
4033         cout << getRealTime() - start_time << " sec" << endl;
4034     }
4035     if (params.site_concordance) {
4036         cout << "Computing site concordance factor..." << endl;
4037         double start_time = getRealTime();
4038         tree->computeSiteConcordance(meanings);
4039         cout << getRealTime() - start_time << " sec" << endl;
4040         delete aln;
4041     }
4042     string prefix = (params.out_prefix) ? params.out_prefix : params.user_file;
4043     string str = prefix + ".cf.tree";
4044     tree->printTree(str.c_str());
4045     cout << "Tree with concordance factors written to " << str << endl;
4046     str = prefix + ".cf.tree.nex";
4047     string filename = prefix + ".cf.stat";
4048     tree->printNexus(str, WT_BR_LEN, "See " + filename + " for branch annotation meanings." +
4049                      " This file is best viewed in FigTree.");
4050     cout << "Annotated tree (best viewed in FigTree) written to " << str << endl;
4051     if (verbose_mode >= VB_DEBUG)
4052         tree->drawTree(cout);
4053     str = prefix + ".cf.branch";
4054     tree->printTree(str.c_str(), WT_BR_LEN + WT_INT_NODE + WT_NEWLINE);
4055     cout << "Tree with branch IDs written to " << str << endl;
4056     ofstream out;
4057     out.open(filename.c_str());
4058     out << "# Concordance factor statistics" << endl
4059         << "# This file can be read in MS Excel or in R with command:" << endl
4060         << "#   tab=read.table('" <<  filename << "',header=TRUE)" << endl
4061         << "# Columns are tab-separated with following meaning:" << endl
4062         << "#   ID: Branch ID" << endl;
4063     map<string,string>::iterator mit;
4064     for (mit = meanings.begin(); mit != meanings.end(); mit++)
4065         if (mit->first[0] != '*')
4066             out << "#   " << mit->first << ": " << mit->second << endl;
4067     out << "#   Label: Existing branch label" << endl;
4068     out << "#   Length: Branch length" << endl;
4069     for (mit = meanings.begin(); mit != meanings.end(); mit++)
4070         if (mit->first[0] == '*')
4071             out << "# " << mit->first << ": " << mit->second << endl;
4072     out << "ID";
4073     for (mit = meanings.begin(); mit != meanings.end(); mit++)
4074         if (mit->first[0] != '*')
4075             out << "\t" << mit->first;
4076     out << "\tLabel\tLength" << endl;
4077     for (brit = branches.begin(); brit != branches.end(); brit++) {
4078         Neighbor *branch = brit->second->findNeighbor(brit->first);
4079         int ID = brit->second->id;
4080         out << ID;
4081         for (mit = meanings.begin(); mit != meanings.end(); mit++) {
4082             if (mit->first[0] == '*')
4083                 continue; // ignore NOTES
4084             out << '\t';
4085             string val;
4086             if (branch->getAttr(mit->first, val))
4087                 out << val;
4088             else
4089                 out << "NA";
4090         }
4091         double length = branch->length;
4092         string label;
4093         GET_ATTR(branch, label);
4094         out << '\t' << label << '\t' << length << endl;
4095     }
4096     out.close();
4097     cout << "Concordance factors per branch printed to " << filename << endl;
4098     if (!params.site_concordance_partition)
4099         return;
4100 
4101     // print concordant/discordant gene trees
4102     filename = prefix + ".cf.stat_tree";
4103     out.open(filename);
4104     out << "# Concordance factor statistics for decisive trees" << endl
4105     << "# This file can be read in MS Excel or in R with command:" << endl
4106     << "#   tab2=read.table('" <<  filename << "',header=TRUE)" << endl
4107     << "# Columns are tab-separated with following meaning:" << endl
4108     << "#   ID: Branch ID" << endl
4109     << "#   TreeID: Tree ID" << endl
4110     << "#   gC: 1/0 if tree is concordant/discordant with branch" << endl
4111     << "#   gD1: 1/0 if NNI-1 tree is concordant/discordant with branch" << endl
4112     << "#   gD2: 1/0 if NNI-2 tree is concordant/discordant with branch" << endl
4113     << "# NOTE: NA means that tree is not decisive for branch" << endl
4114     << "ID\tTreeID\tgC\tgD1\tgD2" << endl;
4115     for (brit = branches.begin(); brit != branches.end(); brit++) {
4116         Neighbor *branch = brit->second->findNeighbor(brit->first);
4117         int ID = brit->second->id;
4118         for (int part = 1; ; part++) {
4119             string gC, gD1, gD2;
4120             if (!branch->getAttr("gC" + convertIntToString(part), gC))
4121                 break;
4122             branch->getAttr("gD1" + convertIntToString(part), gD1);
4123             branch->getAttr("gD2" + convertIntToString(part), gD2);
4124             out << ID << '\t' << part << '\t' << gC << '\t' << gD1 << '\t' << gD2 << endl;
4125         }
4126     }
4127     out.close();
4128     cout << "Concordance factors per branch and tree printed to " << filename << endl;
4129 
4130     if (!params.site_concordance_partition || !tree->isSuperTree())
4131         return;
4132     // print partition-wise concordant/discordant sites
4133     filename = prefix + ".cf.stat_loci";
4134     out.open(filename);
4135     out << "# Concordance factor statistics for loci" << endl
4136     << "# This file can be read in MS Excel or in R with command:" << endl
4137     << "#   tab2=read.table('" <<  filename << "',header=TRUE)" << endl
4138     << "# Columns are tab-separated with following meaning:" << endl
4139     << "#   ID: Branch ID" << endl
4140     << "#   PartID: Locus ID" << endl
4141     << "#   sC: Number of concordant sites averaged over " << params.site_concordance << " quartets" << endl
4142     << "#   sD1: Number of discordant sites for alternative quartet 1" << endl
4143     << "#   sD2: Number of discordant sites for alternative quartet 2" << endl
4144     << "# NOTE: NA means that locus is not decisive for branch" << endl
4145     << "ID\tPartID\tsC\tsD1\tsD2" << endl;
4146     for (brit = branches.begin(); brit != branches.end(); brit++) {
4147         Neighbor *branch = brit->second->findNeighbor(brit->first);
4148         int ID = brit->second->id;
4149         for (int part = 1; ; part++) {
4150             string sC, sD1, sD2;
4151             if (!branch->getAttr("sC" + convertIntToString(part), sC))
4152                 break;
4153             if (!branch->getAttr("sD1" + convertIntToString(part), sD1))
4154                 break;
4155             if (!branch->getAttr("sD2" + convertIntToString(part), sD2))
4156                 break;
4157             out << ID << '\t' << part << '\t' << sC << '\t' << sD1 << '\t' << sD2 << endl;
4158         }
4159     }
4160     out.close();
4161     cout << "Concordance factors per branch and locus printed to " << filename << endl;
4162 }
4163 
4164 
4165 
4166 /**
4167  * assign split occurence frequencies from a set of input trees onto a target tree
4168  * NOTE: input trees must have the same taxon set
4169  * @param input_trees file containing NEWICK tree strings
4170  * @param burnin number of beginning trees to discard
4171  * @param max_count max number of trees to read in
4172  * @param target_tree the target tree
4173  * @param rooted TRUE if trees are rooted, false for unrooted trees
4174  * @param output_file file name to write output tree with assigned support values
4175  * @param out_prefix prefix of output file
4176  * @param mytree (OUT) resulting tree with support values assigned from target_tree
4177  * @param tree_weight_file file containing INTEGER weights of input trees
4178  * @param params program parameters
4179  */
assignBootstrapSupport(const char * input_trees,int burnin,int max_count,const char * target_tree,bool rooted,const char * output_tree,const char * out_prefix,MExtTree & mytree,const char * tree_weight_file,Params * params)4180 void assignBootstrapSupport(const char *input_trees, int burnin, int max_count,
4181         const char *target_tree, bool rooted, const char *output_tree,
4182         const char *out_prefix, MExtTree &mytree, const char* tree_weight_file,
4183         Params *params) {
4184     bool myrooted = rooted;
4185     // read the tree file
4186     cout << "Reading tree " << target_tree << " ..." << endl;
4187     mytree.init(target_tree, myrooted);
4188     if (mytree.rooted)
4189         cout << "rooted tree detected" << endl;
4190     else
4191         cout << "unrooted tree detected" << endl;
4192     // reindex the taxa in the tree to aphabetical names
4193     NodeVector taxa;
4194     mytree.getTaxa(taxa);
4195     sort(taxa.begin(), taxa.end(), nodenamecmp);
4196     int i = 0;
4197     for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++) {
4198         (*it)->id = i++;
4199     }
4200 
4201     /*
4202      string filename = params.boot_trees;
4203      filename += ".nolen";
4204      boot_trees.printTrees(filename.c_str(), false);
4205      return;
4206      */
4207     SplitGraph sg;
4208     SplitIntMap hash_ss;
4209     // make the taxa name
4210     vector<string> taxname;
4211     taxname.resize(mytree.leafNum);
4212     mytree.getTaxaName(taxname);
4213 
4214     // read the bootstrap tree file
4215     double scale = 100.0;
4216     if (params->scaling_factor > 0)
4217         scale = params->scaling_factor;
4218 
4219     MTreeSet boot_trees;
4220     if (params && detectInputFile(input_trees) == IN_NEXUS) {
4221         sg.init(*params);
4222         for (SplitGraph::iterator it = sg.begin(); it != sg.end(); it++)
4223             hash_ss.insertSplit((*it), (*it)->getWeight());
4224         StrVector sgtaxname;
4225         sg.getTaxaName(sgtaxname);
4226         i = 0;
4227         for (StrVector::iterator sit = sgtaxname.begin();
4228                 sit != sgtaxname.end(); sit++, i++) {
4229             Node *leaf = mytree.findLeafName(*sit);
4230             if (!leaf)
4231                 outError("Tree does not contain taxon ", *sit);
4232             leaf->id = i;
4233         }
4234         scale /= sg.maxWeight();
4235     } else {
4236         myrooted = rooted;
4237         boot_trees.init(input_trees, myrooted, burnin, max_count,
4238                         tree_weight_file);
4239         if (mytree.rooted != boot_trees.isRooted())
4240             outError("Target tree and tree set have different rooting");
4241         if (boot_trees.equal_taxon_set) {
4242             boot_trees.convertSplits(taxname, sg, hash_ss, SW_COUNT, -1, params->support_tag);
4243             scale /= boot_trees.sumTreeWeights();
4244         }
4245     }
4246     //sg.report(cout);
4247     if (!sg.empty()) {
4248         cout << "Rescaling split weights by " << scale << endl;
4249         if (params->scaling_factor < 0)
4250             sg.scaleWeight(scale, true);
4251         else {
4252             sg.scaleWeight(scale, false, params->numeric_precision);
4253         }
4254 
4255         cout << sg.size() << " splits found" << endl;
4256     }
4257     // compute the percentage of appearance
4258     //    printSplitSet(sg, hash_ss);
4259     //sg.report(cout);
4260     cout << "Creating " << RESAMPLE_NAME << " support values..." << endl;
4261     if (!sg.empty())
4262         mytree.createBootstrapSupport(taxname, boot_trees, hash_ss, params->support_tag);
4263     else {
4264         //mytree.createBootstrapSupport(boot_trees);
4265         cout << "Unequal taxon sets, rereading trees..." << endl;
4266         DoubleVector rfdist;
4267         mytree.computeRFDist(input_trees, rfdist, 1);
4268     }
4269 
4270     //mytree.scaleLength(100.0/boot_trees.size(), true);
4271     string out_file;
4272     if (output_tree)
4273         out_file = output_tree;
4274     else {
4275         if (out_prefix)
4276             out_file = out_prefix;
4277         else
4278             out_file = target_tree;
4279         out_file += ".suptree";
4280     }
4281 
4282     mytree.printTree(out_file.c_str());
4283     cout << "Tree with assigned support written to " << out_file
4284             << endl;
4285     /*
4286     if (out_prefix)
4287         out_file = out_prefix;
4288     else
4289         out_file = target_tree;
4290     out_file += ".supval";
4291     mytree.writeInternalNodeNames(out_file);
4292 
4293     cout << "Support values written to " << out_file << endl;
4294     */
4295 }
4296 
computeConsensusTree(const char * input_trees,int burnin,int max_count,double cutoff,double weight_threshold,const char * output_tree,const char * out_prefix,const char * tree_weight_file,Params * params)4297 void computeConsensusTree(const char *input_trees, int burnin, int max_count,
4298         double cutoff, double weight_threshold, const char *output_tree,
4299         const char *out_prefix, const char *tree_weight_file, Params *params) {
4300     bool rooted = false;
4301 
4302     // read the bootstrap tree file
4303     /*
4304      MTreeSet boot_trees(input_trees, rooted, burnin, tree_weight_file);
4305      string first_taxname = boot_trees.front()->root->name;
4306      //if (params.root) first_taxname = params.root;
4307 
4308      SplitGraph sg;
4309 
4310      boot_trees.convertSplits(sg, cutoff, SW_COUNT, weight_threshold);*/
4311 
4312     //sg.report(cout);
4313     SplitGraph sg;
4314     SplitIntMap hash_ss;
4315     // make the taxa name
4316     //vector<string> taxname;
4317     //taxname.resize(mytree.leafNum);
4318     //mytree.getTaxaName(taxname);
4319 
4320     // read the bootstrap tree file
4321     double scale = 100.0;
4322     if (params->scaling_factor > 0)
4323         scale = params->scaling_factor;
4324 
4325     MTreeSet boot_trees;
4326     if (params && detectInputFile(input_trees) == IN_NEXUS) {
4327         char *user_file = params->user_file;
4328         params->user_file = (char*) input_trees;
4329         params->split_weight_summary = SW_COUNT; // count number of splits
4330         sg.init(*params);
4331         params->user_file = user_file;
4332         for (SplitGraph::iterator it = sg.begin(); it != sg.end();)
4333             if ((*it)->getWeight() > weight_threshold) {
4334                 hash_ss.insertSplit((*it), (*it)->getWeight());
4335                 it++;
4336             } else {
4337                 // delete the split
4338                 if (it != sg.end()-1) {
4339                     *(*it) = (*sg.back());
4340                 }
4341                 delete sg.back();
4342                 sg.pop_back();
4343             }
4344         /*        StrVector sgtaxname;
4345          sg.getTaxaName(sgtaxname);
4346          i = 0;
4347          for (StrVector::iterator sit = sgtaxname.begin(); sit != sgtaxname.end(); sit++, i++) {
4348          Node *leaf = mytree.findLeafName(*sit);
4349          if (!leaf) outError("Tree does not contain taxon ", *sit);
4350          leaf->id = i;
4351          }*/
4352         scale /= sg.maxWeight();
4353     } else {
4354         boot_trees.init(input_trees, rooted, burnin, max_count,
4355                 tree_weight_file);
4356         boot_trees.convertSplits(sg, cutoff, SW_COUNT, weight_threshold);
4357         scale /= boot_trees.sumTreeWeights();
4358         cout << sg.size() << " splits found" << endl;
4359     }
4360     //sg.report(cout);
4361     if (verbose_mode >= VB_MED)
4362         cout << "Rescaling split weights by " << scale << endl;
4363     if (params->scaling_factor < 0)
4364         sg.scaleWeight(scale, true);
4365     else {
4366         sg.scaleWeight(scale, false, params->numeric_precision);
4367     }
4368 
4369 
4370 
4371     //cout << "Creating greedy consensus tree..." << endl;
4372     MTree mytree;
4373     SplitGraph maxsg;
4374     sg.findMaxCompatibleSplits(maxsg);
4375 
4376     if (verbose_mode >= VB_MAX)
4377         maxsg.saveFileStarDot(cout);
4378     //cout << "convert compatible split system into tree..." << endl;
4379     mytree.convertToTree(maxsg);
4380     //cout << "done" << endl;
4381     if (!mytree.rooted) {
4382         string taxname;
4383         if (params->root)
4384             taxname = params->root;
4385         else
4386             taxname = sg.getTaxa()->GetTaxonLabel(0);
4387         Node *node = mytree.findLeafName(taxname);
4388         if (node)
4389             mytree.root = node;
4390     }
4391     // mytree.scaleLength(100.0 / boot_trees.sumTreeWeights(), true);
4392 
4393     // mytree.getTaxaID(maxsg.getSplitsBlock()->getCycle());
4394     //maxsg.saveFile(cout);
4395 
4396     string out_file;
4397 
4398     if (output_tree)
4399         out_file = output_tree;
4400     else {
4401         if (out_prefix)
4402             out_file = out_prefix;
4403         else
4404             out_file = input_trees;
4405         out_file += ".contree";
4406     }
4407 
4408 //    if (removed_seqs.size() > 0)
4409 //        mytree.insertTaxa(removed_seqs, twin_seqs);
4410 
4411     mytree.printTree(out_file.c_str(), WT_BR_CLADE);
4412     cout << "Consensus tree written to " << out_file << endl;
4413 
4414     if (output_tree)
4415         out_file = output_tree;
4416     else {
4417         if (out_prefix)
4418             out_file = out_prefix;
4419         else
4420             out_file = input_trees;
4421         out_file += ".splits";
4422     }
4423 
4424     //sg.scaleWeight(0.01, false, 4);
4425     if (params->print_splits_file) {
4426         sg.saveFile(out_file.c_str(), IN_OTHER, true);
4427         cout << "Non-trivial split supports printed to star-dot file " << out_file << endl;
4428     }
4429 
4430 }
4431 
computeConsensusNetwork(const char * input_trees,int burnin,int max_count,double cutoff,int weight_summary,double weight_threshold,const char * output_tree,const char * out_prefix,const char * tree_weight_file)4432 void computeConsensusNetwork(const char *input_trees, int burnin, int max_count,
4433         double cutoff, int weight_summary, double weight_threshold, const char *output_tree,
4434         const char *out_prefix, const char* tree_weight_file) {
4435     bool rooted = false;
4436 
4437     // read the bootstrap tree file
4438     MTreeSet boot_trees(input_trees, rooted, burnin, max_count,
4439             tree_weight_file);
4440 
4441     SplitGraph sg;
4442     //SplitIntMap hash_ss;
4443 
4444     boot_trees.convertSplits(sg, cutoff, weight_summary, weight_threshold);
4445 
4446     string out_file;
4447 
4448     if (output_tree)
4449         out_file = output_tree;
4450     else {
4451         if (out_prefix)
4452             out_file = out_prefix;
4453         else
4454             out_file = input_trees;
4455         out_file += ".nex";
4456     }
4457 
4458     sg.saveFile(out_file.c_str(), IN_NEXUS);
4459     cout << "Consensus network printed to " << out_file << endl;
4460 
4461     if (output_tree)
4462         out_file = output_tree;
4463     else {
4464         if (out_prefix)
4465             out_file = out_prefix;
4466         else
4467             out_file = input_trees;
4468         out_file += ".splits";
4469     }
4470     if (verbose_mode >= VB_MED) {
4471         sg.saveFile(out_file.c_str(), IN_OTHER, true);
4472         cout << "Non-trivial split supports printed to star-dot file " << out_file << endl;
4473     }
4474 
4475 }
4476