1 /***************************************************************************
2  *   Copyright (C) 2009-2015 by                                            *
3  *   BUI Quang Minh <minh.bui@univie.ac.at>                                *
4  *   Lam-Tung Nguyen <nltung@gmail.com>                                    *
5  *                                                                         *
6  *                                                                         *
7  *   This program is free software; you can redistribute it and/or modify  *
8  *   it under the terms of the GNU General Public License as published by  *
9  *   the Free Software Foundation; either version 2 of the License, or     *
10  *   (at your option) any later version.                                   *
11  *                                                                         *
12  *   This program is distributed in the hope that it will be useful,       *
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15  *   GNU General Public License for more details.                          *
16  *                                                                         *
17  *   You should have received a copy of the GNU General Public License     *
18  *   along with this program; if not, write to the                         *
19  *   Free Software Foundation, Inc.,                                       *
20  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
21  ***************************************************************************/
22 #include "phylotree.h"
23 #include "utils/bionj.h"
24 //#include "rateheterogeneity.h"
25 #include "alignment/alignmentpairwise.h"
26 #include <algorithm>
27 #include <limits>
28 #include "utils/timeutil.h"
29 #include "utils/pllnni.h"
30 #include "phylosupertree.h"
31 #include "phylosupertreeplen.h"
32 #include "upperbounds.h"
33 #include "utils/MPIHelper.h"
34 #include "model/modelmixture.h"
35 #include "phylonodemixlen.h"
36 #include "phylotreemixlen.h"
37 
38 const int LH_MIN_CONST = 1;
39 
40 //const static int BINARY_SCALE = floor(log2(1/SCALING_THRESHOLD));
41 //const static double LOG_BINARY_SCALE = -(log(2) * BINARY_SCALE);
42 
43 /****************************************************************************
44  SPRMoves class
45  ****************************************************************************/
46 
add(PhyloNode * prune_node,PhyloNode * prune_dad,PhyloNode * regraft_node,PhyloNode * regraft_dad,double score)47 void SPRMoves::add(PhyloNode *prune_node, PhyloNode *prune_dad, PhyloNode *regraft_node, PhyloNode *regraft_dad,
48         double score) {
49     if (size() >= MAX_SPR_MOVES && score <= rbegin()->score)
50         return;
51     if (size() >= MAX_SPR_MOVES) {
52         iterator it = end();
53         it--;
54         erase(it);
55     }
56     SPRMove spr;
57     spr.prune_node = prune_node;
58     spr.prune_dad = prune_dad;
59     spr.regraft_node = regraft_node;
60     spr.regraft_dad = regraft_dad;
61     spr.score = score;
62     insert(spr);
63 }
64 
65 /****************************************************************************
66  PhyloTree class
67  ****************************************************************************/
68 
PhyloTree()69 PhyloTree::PhyloTree() : MTree(), CheckpointFactory() {
70     init();
71 }
72 
init()73 void PhyloTree::init() {
74     aln = NULL;
75     model = NULL;
76     site_rate = NULL;
77     optimize_by_newton = true;
78     central_partial_lh = NULL;
79     nni_partial_lh = NULL;
80     tip_partial_lh = NULL;
81     tip_partial_pars = NULL;
82     tip_partial_lh_computed = 0;
83     ptn_freq_computed = false;
84     central_scale_num = NULL;
85     nni_scale_num = NULL;
86     central_partial_pars = NULL;
87     cost_matrix = NULL;
88     model_factory = NULL;
89     discard_saturated_site = true;
90     _pattern_lh = NULL;
91     _pattern_lh_cat = NULL;
92     _pattern_lh_cat_state = NULL;
93     //root_state = STATE_UNKNOWN;
94     root_state = 126;
95     theta_all = NULL;
96     buffer_scale_all = NULL;
97     buffer_partial_lh = NULL;
98     ptn_freq = NULL;
99     ptn_freq_pars = NULL;
100     ptn_invar = NULL;
101     subTreeDistComputed = false;
102     dist_matrix = NULL;
103     var_matrix = NULL;
104     params = NULL;
105     setLikelihoodKernel(LK_SSE2);  // FOR TUNG: you forgot to initialize this variable!
106     setNumThreads(1);
107     num_threads = 0;
108     max_lh_slots = 0;
109     save_all_trees = 0;
110     nodeBranchDists = NULL;
111     // FOR: upper bounds
112     mlCheck = 0;
113     skippedNNIub = 0;
114     totalNNIub = 0;
115     minStateFreq = 0.0;
116     //minUB = 0.0;
117     //meanUB = 0.0;
118     //maxUB = 0.0;
119     pllInst = NULL;
120     pllAlignment = NULL;
121     pllPartitions = NULL;
122 //    lhComputed = false;
123     curScore = -DBL_MAX;
124     root = NULL;
125     params = NULL;
126     current_scaling = 1.0;
127     is_opt_scaling = false;
128     num_partial_lh_computations = 0;
129     vector_size = 0;
130     safe_numeric = false;
131 }
132 
PhyloTree(Alignment * aln)133 PhyloTree::PhyloTree(Alignment *aln) : MTree(), CheckpointFactory() {
134     init();
135     this->aln = aln;
136 }
137 
PhyloTree(string & treeString,Alignment * aln,bool isRooted)138 PhyloTree::PhyloTree(string& treeString, Alignment* aln, bool isRooted) : MTree() {
139     stringstream str;
140     str << treeString;
141     str.seekg(0, ios::beg);
142     freeNode();
143     readTree(str, isRooted);
144     setAlignment(aln);
145 }
146 
startCheckpoint()147 void PhyloTree::startCheckpoint() {
148     checkpoint->startStruct("PhyloTree");
149 }
150 
151 
saveCheckpoint()152 void PhyloTree::saveCheckpoint() {
153     startCheckpoint();
154 //    StrVector leafNames;
155 //    getTaxaName(leafNames);
156 //    CKP_VECTOR_SAVE(leafNames);
157     string newick = PhyloTree::getTreeString();
158     CKP_SAVE(newick);
159 //    CKP_SAVE(curScore);
160     endCheckpoint();
161     CheckpointFactory::saveCheckpoint();
162 }
163 
restoreCheckpoint()164 void PhyloTree::restoreCheckpoint() {
165     CheckpointFactory::restoreCheckpoint();
166     startCheckpoint();
167 //    StrVector leafNames;
168 //    if (CKP_VECTOR_RESTORE(leafNames)) {
169 //        if (leafNames.size() +(int)rooted != leafNum)
170 //            outError("Alignment mismatched from checkpoint!");
171 //
172 //        StrVector taxname;
173 //        getTaxaName(taxname);
174 //        for (int i = 0; i < leafNames.size(); i++)
175 //            if (taxname[i] != leafNames[i])
176 //                outError("Sequence name " + taxname[i] + " mismatched from checkpoint");
177 //    }
178     string newick;
179 //    CKP_RESTORE(curScore);
180     if (CKP_RESTORE(newick))
181         PhyloTree::readTreeString(newick);
182     endCheckpoint();
183 }
184 
discardSaturatedSite(bool val)185 void PhyloTree::discardSaturatedSite(bool val) {
186     discard_saturated_site = val;
187 }
188 
myPartitionsDestroy(partitionList * pl)189 void myPartitionsDestroy(partitionList *pl) {
190     int i;
191     for (i = 0; i < pl->numberOfPartitions; i++) {
192         rax_free(pl->partitionData[i]->partitionName);
193         rax_free(pl->partitionData[i]);
194     }
195     rax_free(pl->partitionData);
196     rax_free(pl);
197 }
198 
~PhyloTree()199 PhyloTree::~PhyloTree() {
200     if (nni_scale_num)
201         aligned_free(nni_scale_num);
202     nni_scale_num = NULL;
203     if (nni_partial_lh)
204         aligned_free(nni_partial_lh);
205     nni_partial_lh = NULL;
206     if (central_partial_lh)
207         aligned_free(central_partial_lh);
208     central_partial_lh = NULL;
209     if (central_scale_num)
210         aligned_free(central_scale_num);
211     central_scale_num = NULL;
212 
213     if (central_partial_pars)
214         aligned_free(central_partial_pars);
215     central_partial_pars = NULL;
216 
217     if(cost_matrix){
218         aligned_free(cost_matrix);
219         cost_matrix = NULL;
220     }
221 
222     if (model_factory)
223         delete model_factory;
224     model_factory = NULL;
225     if (model)
226         delete model;
227     model = NULL;
228     if (site_rate)
229         delete site_rate;
230     site_rate = NULL;
231     if (_pattern_lh_cat)
232         aligned_free(_pattern_lh_cat);
233     _pattern_lh_cat = NULL;
234     if (_pattern_lh)
235         aligned_free(_pattern_lh);
236     _pattern_lh = NULL;
237     //if (state_freqs)
238     //    delete [] state_freqs;
239     if (theta_all)
240         aligned_free(theta_all);
241     theta_all = NULL;
242     if (buffer_scale_all)
243         aligned_free(buffer_scale_all);
244     buffer_scale_all = NULL;
245     if (buffer_partial_lh)
246         aligned_free(buffer_partial_lh);
247     buffer_partial_lh = NULL;
248     if (ptn_freq)
249         aligned_free(ptn_freq);
250     ptn_freq = NULL;
251     if (ptn_freq_pars)
252         aligned_free(ptn_freq_pars);
253     ptn_freq_pars = NULL;
254     ptn_freq_computed = false;
255     if (ptn_invar)
256         aligned_free(ptn_invar);
257     ptn_invar = NULL;
258     if (dist_matrix)
259         delete[] dist_matrix;
260     dist_matrix = NULL;
261 
262     if (var_matrix)
263         delete[] var_matrix;
264     var_matrix = NULL;
265 
266     if (pllPartitions)
267         myPartitionsDestroy(pllPartitions);
268     if (pllAlignment)
269         pllAlignmentDataDestroy(pllAlignment);
270     if (pllInst)
271         pllDestroyInstance(pllInst);
272 
273     pllPartitions = NULL;
274     pllAlignment = NULL;
275     pllInst = NULL;
276 
277 }
278 
readTree(const char * infile,bool & is_rooted)279 void PhyloTree::readTree(const char *infile, bool &is_rooted) {
280     MTree::readTree(infile, is_rooted);
281     // 2015-10-14: has to reset this pointer when read in
282     current_it = current_it_back = NULL;
283     if (rooted && root)
284         computeBranchDirection();
285 }
286 
readTree(istream & in,bool & is_rooted)287 void PhyloTree::readTree(istream &in, bool &is_rooted) {
288     MTree::readTree(in, is_rooted);
289     // 2015-10-14: has to reset this pointer when read in
290     current_it = current_it_back = NULL;
291     // remove taxa if necessary
292     if (removed_seqs.size() > 0)
293         removeTaxa(removed_seqs);
294 
295     // collapse any internal node of degree 2
296     NodeVector nodes;
297     getInternalNodes(nodes);
298     int num_collapsed = 0;
299     for (NodeVector::iterator it = nodes.begin(); it != nodes.end(); it++)
300         if ((*it)->degree() == 2) {
301             Node *left = (*it)->neighbors[0]->node;
302             Node *right = (*it)->neighbors[1]->node;
303             double len = (*it)->neighbors[0]->length+(*it)->neighbors[1]->length;
304             left->updateNeighbor((*it), right, len);
305             right->updateNeighbor((*it), left, len);
306             delete (*it);
307             num_collapsed++;
308             if (verbose_mode >= VB_MED)
309                 cout << "Node of degree 2 collapsed" << endl;
310         }
311     if (num_collapsed)
312         initializeTree();
313     if (rooted)
314         computeBranchDirection();
315 }
316 
assignLeafNames(Node * node,Node * dad)317 void PhyloTree::assignLeafNames(Node *node, Node *dad) {
318     if (!node)
319         node = root;
320     if (node->isLeaf()) {
321         if (rooted && node == root) {
322             ASSERT(node->id == leafNum-1);
323             root->name = ROOT_NAME;
324         } else {
325             node->id = atoi(node->name.c_str());
326             node->name = aln->getSeqName(node->id);
327         }
328         ASSERT(node->id >= 0 && node->id < leafNum);
329     }
330     FOR_NEIGHBOR_IT(node, dad, it)assignLeafNames((*it)->node, node);
331 }
332 
copyTree(MTree * tree)333 void PhyloTree::copyTree(MTree *tree) {
334     MTree::copyTree(tree);
335     if (!aln)
336         return;
337     // reset the ID with alignment
338     setAlignment(aln);
339 }
340 
copyTree(MTree * tree,string & taxa_set)341 void PhyloTree::copyTree(MTree *tree, string &taxa_set) {
342     MTree::copyTree(tree, taxa_set);
343     if (rooted)
344         computeBranchDirection();
345     if (!aln)
346         return;
347     // reset the ID with alignment
348     setAlignment(aln);
349 }
350 
copyPhyloTree(PhyloTree * tree)351 void PhyloTree::copyPhyloTree(PhyloTree *tree) {
352     MTree::copyTree(tree);
353     if (!tree->aln)
354         return;
355     setAlignment(tree->aln);
356 }
357 
copyPhyloTreeMixlen(PhyloTree * tree,int mix)358 void PhyloTree::copyPhyloTreeMixlen(PhyloTree *tree, int mix) {
359     if (tree->isMixlen()) {
360         ((PhyloTreeMixlen*)tree)->cur_mixture = mix;
361     }
362     copyPhyloTree(tree);
363     if (tree->isMixlen()) {
364         ((PhyloTreeMixlen*)tree)->cur_mixture = -1;
365     }
366 }
367 
setAlignment(Alignment * alignment)368 void PhyloTree::setAlignment(Alignment *alignment) {
369     aln = alignment;
370     bool err = false;
371     int nseq = aln->getNSeq();
372     for (int seq = 0; seq < nseq; seq++) {
373         string seq_name = aln->getSeqName(seq);
374         Node *node = findLeafName(seq_name);
375         if (!node) {
376             string str = "Alignment sequence ";
377             str += seq_name;
378             str += " does not appear in the tree";
379             err = true;
380             outError(str, false);
381         } else {
382             ASSERT(node->isLeaf());
383             node->id = seq;
384         }
385     }
386     if (err) {
387         printTree(cout, WT_NEWLINE);
388         outError("Tree taxa and alignment sequence do not match (see above)");
389     }
390     if (rooted) {
391         ASSERT(root->name == ROOT_NAME);
392         root->id = nseq;
393     }
394     StrVector taxname;
395     getTaxaName(taxname);
396     for (StrVector::iterator it = taxname.begin(); it != taxname.end(); it++)
397         if ((*it) != ROOT_NAME && alignment->getSeqID(*it) < 0) {
398             outError((string)"Tree taxon " + (*it) + " does not appear in the alignment", false);
399             err = true;
400         }
401     if (err) outError("Tree taxa and alignment sequence do not match (see above)");
402 }
403 
setRootNode(const char * my_root,bool multi_taxa)404 void PhyloTree::setRootNode(const char *my_root, bool multi_taxa) {
405     if (rooted) {
406         computeBranchDirection();
407         return;
408     }
409 
410     if (!my_root) {
411         root = findNodeName(aln->getSeqName(0));
412         ASSERT(root);
413         return;
414     }
415 
416     if (strchr(my_root, ',') == NULL) {
417         string root_name = my_root;
418         root = findNodeName(root_name);
419         ASSERT(root);
420         return;
421     }
422 
423     // my_root is a list of taxa
424     StrVector taxa;
425     convert_string_vec(my_root, taxa);
426     root = findNodeName(taxa[0]);
427     ASSERT(root);
428     if (!multi_taxa) {
429         return;
430     }
431     unordered_set<string> taxa_set;
432     for (auto it = taxa.begin(); it != taxa.end(); it++)
433         taxa_set.insert(*it);
434     pair<Node*,Neighbor*> res = {NULL, NULL};
435     findNodeNames(taxa_set, res, root->neighbors[0]->node, root);
436     if (res.first)
437         root = res.first;
438     else
439         outWarning("Branch separating outgroup is not found");
440 }
441 
442 //void PhyloTree::setParams(Params* params) {
443 //    this->params = params;
444 //}
445 
readTreeString(const string & tree_string)446 void PhyloTree::readTreeString(const string &tree_string) {
447     stringstream str(tree_string);
448     freeNode();
449 
450     // bug fix 2016-04-14: in case taxon name happens to be ID
451     MTree::readTree(str, rooted);
452 
453     assignLeafNames();
454     setRootNode(Params::getInstance().root);
455 
456     if (isSuperTree()) {
457         ((PhyloSuperTree*) this)->mapTrees();
458     }
459     if (Params::getInstance().pll) {
460         pllReadNewick(getTreeString());
461     }
462     resetCurScore();
463     if (Params::getInstance().fixStableSplits || Params::getInstance().adaptPertubation) {
464         buildNodeSplit();
465     }
466     current_it = current_it_back = NULL;
467 }
468 
readTreeStringSeqName(const string & tree_string)469 void PhyloTree::readTreeStringSeqName(const string &tree_string) {
470     stringstream str(tree_string);
471 //    str(tree_string);
472 //    str.seekg(0, ios::beg);
473     freeNode();
474     if (rooted) {
475         rooted = false;
476         readTree(str, rooted);
477         if (!rooted)
478             convertToRooted();
479     } else {
480         readTree(str, rooted);
481     }
482 //    assignLeafNames();
483     setAlignment(aln);
484     setRootNode(params->root);
485 
486     if (isSuperTree()) {
487         ((PhyloSuperTree*) this)->mapTrees();
488     }
489     if (params->pll) {
490         pllReadNewick(getTreeString());
491     }
492     resetCurScore();
493 //    lhComputed = false;
494     if (params->fixStableSplits) {
495         buildNodeSplit();
496     }
497     current_it = current_it_back = NULL;
498 }
499 
wrapperFixNegativeBranch(bool force_change)500 int PhyloTree::wrapperFixNegativeBranch(bool force_change) {
501     // Initialize branch lengths for the parsimony tree
502     initializeAllPartialPars();
503     clearAllPartialLH();
504     int numFixed = fixNegativeBranch(force_change);
505     if (params->pll) {
506         pllReadNewick(getTreeString());
507     }
508     resetCurScore();
509     if (verbose_mode >= VB_MAX)
510         printTree(cout);
511 //    lhComputed = false;
512     return numFixed;
513 }
514 
pllReadNewick(string newickTree)515 void PhyloTree::pllReadNewick(string newickTree) {
516     pllNewickTree *newick = pllNewickParseString(newickTree.c_str());
517     pllTreeInitTopologyNewick(pllInst, newick, PLL_FALSE);
518     pllNewickParseDestroy(&newick);
519 }
520 
readTreeFile(const string & file_name)521 void PhyloTree::readTreeFile(const string &file_name) {
522     ifstream str;
523     str.open(file_name.c_str());
524 //    str << tree_string;
525 //    str.seekg(0, ios::beg);
526     freeNode();
527     if (rooted) {
528         rooted = false;
529         readTree(str, rooted);
530         if (!rooted)
531             convertToRooted();
532     } else
533         readTree(str, rooted);
534     setAlignment(aln);
535     if (isSuperTree()) {
536         ((PhyloSuperTree*) this)->mapTrees();
537     } else {
538         clearAllPartialLH();
539     }
540     str.close();
541     current_it = current_it_back = NULL;
542 }
543 
getTreeString()544 string PhyloTree::getTreeString() {
545     stringstream tree_stream;
546     setRootNode(params->root);
547     printTree(tree_stream, WT_TAXON_ID + WT_BR_LEN + WT_SORT_TAXA);
548     return tree_stream.str();
549 }
550 
getTopologyString(bool printBranchLength)551 string PhyloTree::getTopologyString(bool printBranchLength) {
552     stringstream tree_stream;
553     // important: to make topology string unique
554     setRootNode(params->root);
555     //printTree(tree_stream, WT_TAXON_ID + WT_SORT_TAXA);
556     if (printBranchLength) {
557         printTree(tree_stream, WT_SORT_TAXA + WT_BR_LEN + WT_TAXON_ID);
558     } else {
559         printTree(tree_stream, WT_SORT_TAXA);
560     }
561     return tree_stream.str();
562 }
563 
rollBack(istream & best_tree_string)564 void PhyloTree::rollBack(istream &best_tree_string) {
565     best_tree_string.seekg(0, ios::beg);
566     freeNode();
567     readTree(best_tree_string, rooted);
568     assignLeafNames();
569     initializeAllPartialLh();
570     clearAllPartialLH();
571 }
572 
setModel(ModelSubst * amodel)573 void PhyloTree::setModel(ModelSubst *amodel) {
574     model = amodel;
575     //state_freqs = new double[numStates];
576     //model->getStateFrequency(state_freqs);
577 }
578 
setModelFactory(ModelFactory * model_fac)579 void PhyloTree::setModelFactory(ModelFactory *model_fac) {
580     model_factory = model_fac;
581     if (model_fac) {
582         model = model_factory->model;
583         site_rate = model_factory->site_rate;
584         if (!isSuperTree()) {
585             setLikelihoodKernel(sse);
586             setNumThreads(num_threads);
587         }
588     } else {
589         model = NULL;
590         site_rate = NULL;
591     }
592 }
593 
setRate(RateHeterogeneity * rate)594 void PhyloTree::setRate(RateHeterogeneity *rate) {
595     site_rate = rate;
596     if (!rate)
597         return;
598     //numCat = site_rate->getNRate();
599     //if (aln) {
600     //    block = aln->num_states * numCat;
601     //    lh_size = aln->size() * block;
602     //}
603 }
604 
getRate()605 RateHeterogeneity *PhyloTree::getRate() {
606     return site_rate;
607 }
608 
newNode(int node_id,const char * node_name)609 Node* PhyloTree::newNode(int node_id, const char* node_name) {
610     return (Node*) (new PhyloNode(node_id, node_name));
611 }
612 
newNode(int node_id,int node_name)613 Node* PhyloTree::newNode(int node_id, int node_name) {
614     return (Node*) (new PhyloNode(node_id, node_name));
615 }
616 
clearAllPartialLH(bool make_null)617 void PhyloTree::clearAllPartialLH(bool make_null) {
618     if (!root)
619         return;
620     ((PhyloNode*) root->neighbors[0]->node)->clearAllPartialLh(make_null, (PhyloNode*) root);
621     tip_partial_lh_computed = 0;
622     // 2015-10-14: has to reset this pointer when read in
623     current_it = current_it_back = NULL;
624 }
625 
626 /*
627 void PhyloTree::computeAllPartialLh(PhyloNode *node, PhyloNode *dad) {
628     if (!node) node = (PhyloNode*)root;
629     FOR_NEIGHBOR_IT(node, dad, it) {
630         if ((((PhyloNeighbor*)*it)->partial_lh_computed & 1) == 0)
631             computePartialLikelihood((PhyloNeighbor*)*it, node);
632         PhyloNeighbor *rev = (PhyloNeighbor*) (*it)->node->findNeighbor(node);
633         if ((rev->partial_lh_computed & 1) == 0)
634             computePartialLikelihood(rev, (PhyloNode*)(*it)->node);
635         computeAllPartialLh((PhyloNode*)(*it)->node, node);
636     }
637 }
638 */
639 
getASCName(ASCType ASC_type)640 string getASCName(ASCType ASC_type) {
641     switch (ASC_type) {
642         case ASC_NONE:
643             return "";
644         case ASC_VARIANT:
645             return "+ASC";
646         case ASC_VARIANT_MISSING:
647             return "+ASC_MIS";
648         case ASC_INFORMATIVE:
649             return "+ASC_INF";
650         case ASC_INFORMATIVE_MISSING:
651             return "+ASC_INF_MIS";
652     }
653 }
654 
getSubstName()655 string PhyloTree::getSubstName() {
656     return model->getName() + getASCName(model_factory->getASC());
657 }
658 
getRateName()659 string PhyloTree::getRateName() {
660     if (model_factory->fused_mix_rate) {
661         return "*" + site_rate->name.substr(1);
662     } else {
663         return site_rate->name;
664     }
665 }
666 
getModelName()667 string PhyloTree::getModelName() {
668     return getSubstName() + getRateName();
669 }
670 
getModelNameParams()671 string PhyloTree::getModelNameParams() {
672     string name = model->getNameParams();
673     name += getASCName(model_factory->getASC());
674     string rate_name = site_rate->getNameParams();
675 
676     if (model_factory->fused_mix_rate) {
677         name += "*" + rate_name.substr(1);
678     } else {
679         name += rate_name;
680     }
681 
682     return name;
683 }
684 
saveBranchLengths(DoubleVector & lenvec,int startid,PhyloNode * node,PhyloNode * dad)685 void PhyloTree::saveBranchLengths(DoubleVector &lenvec, int startid, PhyloNode *node, PhyloNode *dad) {
686     if (!node) {
687         node = (PhyloNode*) root;
688         ASSERT(branchNum == nodeNum-1);
689         if (lenvec.empty()) lenvec.resize(branchNum*getMixlen() + startid);
690     }
691     FOR_NEIGHBOR_IT(node, dad, it){
692         (*it)->getLength(lenvec, (*it)->id*getMixlen() + startid);
693         PhyloTree::saveBranchLengths(lenvec, startid, (PhyloNode*) (*it)->node, node);
694     }
695 }
696 
restoreBranchLengths(DoubleVector & lenvec,int startid,PhyloNode * node,PhyloNode * dad)697 void PhyloTree::restoreBranchLengths(DoubleVector &lenvec, int startid, PhyloNode *node, PhyloNode *dad) {
698     if (!node) {
699         node = (PhyloNode*) root;
700         ASSERT(!lenvec.empty());
701     }
702     FOR_NEIGHBOR_IT(node, dad, it){
703         (*it)->setLength(lenvec, (*it)->id*getMixlen() + startid, getMixlen());
704         (*it)->node->findNeighbor(node)->setLength(lenvec, (*it)->id*getMixlen() + startid, getMixlen());
705         PhyloTree::restoreBranchLengths(lenvec, startid, (PhyloNode*) (*it)->node, node);
706     }
707 }
708 
709 
710 /****************************************************************************
711  Parsimony function
712  ****************************************************************************/
713 
714 /*
715  double PhyloTree::computeCorrectedParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad) {
716  //    double corrected_bran = 0;
717  //    int parbran;
718  //    int parscore = computeParsimonyBranch(node21_it, node2, &parbran);
719  //    if (site_rate->getGammaShape() != 0) {
720  //        corrected_bran = (aln->num_states - 1.0) / aln->num_states
721  //                * site_rate->getGammaShape()
722  //                * (pow( 1.0 - aln->num_states / (aln->num_states - 1.0) * ((double) parbran / aln->getNSite()),
723  //                        -1.0 / site_rate->getGammaShape()) - 1.0);
724  //    } else {
725  //        corrected_bran = -((aln->num_states - 1.0) / aln->num_states)
726  //                * log(1.0 - (aln->num_states / (aln->num_states - 1.0)) * ((double) parbran / aln->getNSite()));
727  //    }
728  //    return corrected_bran;
729  }
730  */
initializeAllPartialPars()731 void PhyloTree::initializeAllPartialPars() {
732     if (!ptn_freq_pars)
733         ptn_freq_pars = aligned_alloc<UINT>(get_safe_upper_limit_float(getAlnNPattern()));
734     int index = 0;
735     initializeAllPartialPars(index);
736     clearAllPartialLH();
737     //assert(index == (nodeNum - 1)*2);
738 }
739 
initializeAllPartialPars(int & index,PhyloNode * node,PhyloNode * dad)740 void PhyloTree::initializeAllPartialPars(int &index, PhyloNode *node, PhyloNode *dad) {
741     size_t pars_block_size = getBitsBlockSize();
742     if (!node) {
743         node = (PhyloNode*) root;
744         // allocate the big central partial pars memory
745         if (!central_partial_pars) {
746             uint64_t tip_partial_pars_size = get_safe_upper_limit_float(aln->num_states * (aln->STATE_UNKNOWN+1));
747             size_t memsize = (aln->getNSeq()) * 4 * pars_block_size + tip_partial_pars_size;
748             if (verbose_mode >= VB_MAX)
749                 cout << "Allocating " << memsize * sizeof(UINT) << " bytes for partial parsimony vectors" << endl;
750             central_partial_pars = aligned_alloc<UINT>(memsize);
751             if (!central_partial_pars)
752                 outError("Not enough memory for partial parsimony vectors");
753             tip_partial_pars = central_partial_pars + (aln->getNSeq()) * 4 * pars_block_size;
754         }
755         index = 0;
756     }
757     if (dad) {
758         // make memory alignment 16
759         // assign a region in central_partial_lh to both Neihgbors (dad->node, and node->dad)
760         PhyloNeighbor *nei = (PhyloNeighbor*) node->findNeighbor(dad);
761         nei->partial_pars = central_partial_pars + (index * pars_block_size);
762         nei = (PhyloNeighbor*) dad->findNeighbor(node);
763         nei->partial_pars = central_partial_pars + ((index + 1) * pars_block_size);
764         index += 2;
765         //assert(index < nodeNum * 2 - 1);
766     }
767     FOR_NEIGHBOR_IT(node, dad, it)initializeAllPartialPars(index, (PhyloNode*) (*it)->node, node);
768 }
769 
770 #ifdef __AVX512KNL
771 #define SIMD_BITS 512
772 #else
773 #define SIMD_BITS 256
774 #endif
775 
776 
getBitsBlockSize()777 size_t PhyloTree::getBitsBlockSize() {
778     // reserve the last entry for parsimony score
779 //    return (aln->num_states * aln->size() + UINT_BITS - 1) / UINT_BITS + 1;
780     if (cost_matrix) {
781         return get_safe_upper_limit_float(aln->size() * aln->num_states);
782     }
783     size_t len = aln->getMaxNumStates() * ((max(aln->size(), (size_t)aln->num_variant_sites) + SIMD_BITS - 1) / UINT_BITS) + 4;
784 #ifdef __AVX512KNL
785     len = ((len+15)/16)*16;
786 #else
787     len = ((len+7)/8)*8;
788 #endif
789     return len;
790 }
791 
newBitsBlock()792 UINT *PhyloTree::newBitsBlock() {
793     return aligned_alloc<UINT>(getBitsBlockSize());
794 }
795 
796 
computePartialParsimony(PhyloNeighbor * dad_branch,PhyloNode * dad)797 void PhyloTree::computePartialParsimony(PhyloNeighbor *dad_branch, PhyloNode *dad) {
798     (this->*computePartialParsimonyPointer)(dad_branch, dad);
799 }
800 
computeReversePartialParsimony(PhyloNode * node,PhyloNode * dad)801 void PhyloTree::computeReversePartialParsimony(PhyloNode *node, PhyloNode *dad) {
802     PhyloNeighbor *node_nei = (PhyloNeighbor*)node->findNeighbor(dad);
803     ASSERT(node_nei);
804     computePartialParsimony(node_nei, node);
805     for (NeighborVec::iterator it = node->neighbors.begin(); it != node->neighbors.end(); it ++)
806         if ((*it)->node != dad)
807             computeReversePartialParsimony((PhyloNode*)(*it)->node, node);
808 
809 }
810 
811 
computeParsimonyBranch(PhyloNeighbor * dad_branch,PhyloNode * dad,int * branch_subst)812 int PhyloTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
813     return (this->*computeParsimonyBranchPointer)(dad_branch, dad, branch_subst);
814 }
815 
816 
computeParsimony()817 int PhyloTree::computeParsimony() {
818     return computeParsimonyBranch((PhyloNeighbor*) root->neighbors[0], (PhyloNode*) root);
819 }
820 
821 
822 /****************************************************************************
823  likelihood function
824  ****************************************************************************/
825 
getBufferPartialLhSize()826 size_t PhyloTree::getBufferPartialLhSize() {
827     const size_t VECTOR_SIZE = 8; // TODO, adjusted
828     // 2017-12-13: make sure that num_threads was already set
829     ASSERT(num_threads > 0);
830     size_t ncat_mix = site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
831     size_t block = model->num_states * ncat_mix;
832     size_t buffer_size = 0;
833 
834     // buffer for traversal_info.echildren and partial_lh_leaves
835     if (!Params::getInstance().buffer_mem_save) {
836         buffer_size += get_safe_upper_limit(block * model->num_states * 2) * aln->getNSeq();
837         buffer_size += get_safe_upper_limit(block *(aln->STATE_UNKNOWN+1)) * aln->getNSeq();
838     }
839 
840     buffer_size += get_safe_upper_limit(block *(aln->STATE_UNKNOWN+1));
841     buffer_size += (block*2+model->num_states)*VECTOR_SIZE*num_threads;
842 
843     // always more buffer for non-rev kernel, in case switching between kernels
844     buffer_size += get_safe_upper_limit(block)*(aln->STATE_UNKNOWN+1)*2;
845     buffer_size += block*2*VECTOR_SIZE*num_threads;
846     buffer_size += get_safe_upper_limit(3*block*model->num_states);
847 
848     if (isMixlen()) {
849         size_t nmix = max(getMixlen(), getRate()->getNRate());
850         buffer_size += nmix*(nmix+1)*VECTOR_SIZE + (nmix+3)*nmix*VECTOR_SIZE*num_threads;
851     }
852     return buffer_size;
853 }
854 
initializeAllPartialLh()855 void PhyloTree::initializeAllPartialLh() {
856     int index, indexlh;
857     int numStates = model->num_states;
858     // Minh's question: why getAlnNSite() but not getAlnNPattern() ?
859     //size_t mem_size = ((getAlnNSite() % 2) == 0) ? getAlnNSite() : (getAlnNSite() + 1);
860     // extra #numStates for ascertainment bias correction
861     size_t mem_size = get_safe_upper_limit(getAlnNPattern()) + max(get_safe_upper_limit(numStates),
862         get_safe_upper_limit(model_factory->unobserved_ptns.size()));
863 
864     size_t block_size = mem_size * numStates * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
865     // make sure _pattern_lh size is divisible by 4 (e.g., 9->12, 14->16)
866     if (!_pattern_lh)
867         _pattern_lh = aligned_alloc<double>(mem_size);
868     if (!_pattern_lh_cat)
869         _pattern_lh_cat = aligned_alloc<double>(mem_size * site_rate->getNDiscreteRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures()));
870     if (!theta_all)
871         theta_all = aligned_alloc<double>(block_size);
872     if (!buffer_scale_all)
873         buffer_scale_all = aligned_alloc<double>(mem_size);
874     if (!buffer_partial_lh) {
875         buffer_partial_lh = aligned_alloc<double>(getBufferPartialLhSize());
876     }
877     if (!ptn_freq) {
878         ptn_freq = aligned_alloc<double>(mem_size);
879         ptn_freq_computed = false;
880     }
881     if (!ptn_freq_pars)
882         ptn_freq_pars = aligned_alloc<UINT>(mem_size);
883     if (!ptn_invar)
884         ptn_invar = aligned_alloc<double>(mem_size);
885     initializeAllPartialLh(index, indexlh);
886     if (params->lh_mem_save == LM_MEM_SAVE)
887         mem_slots.init(this, max_lh_slots);
888 
889     ASSERT(index == (nodeNum - 1) * 2);
890     if (params->lh_mem_save == LM_PER_NODE) {
891         ASSERT(indexlh == nodeNum-leafNum);
892     }
893 
894     clearAllPartialLH();
895 
896 }
897 
deleteAllPartialLh()898 void PhyloTree::deleteAllPartialLh() {
899 
900     if (central_partial_lh) {
901         aligned_free(central_partial_lh);
902     }
903     if (central_scale_num) {
904         aligned_free(central_scale_num);
905     }
906     if (central_partial_pars)
907         aligned_free(central_partial_pars);
908 
909     if (nni_scale_num)
910         aligned_free(nni_scale_num);
911     nni_scale_num = NULL;
912     if (nni_partial_lh)
913         aligned_free(nni_partial_lh);
914     nni_partial_lh = NULL;
915 
916     if (ptn_invar)
917         aligned_free(ptn_invar);
918     if (ptn_freq)
919         aligned_free(ptn_freq);
920     if (ptn_freq_pars)
921         aligned_free(ptn_freq_pars);
922     if (theta_all)
923         aligned_free(theta_all);
924     if (buffer_scale_all)
925         aligned_free(buffer_scale_all);
926     if (buffer_partial_lh)
927         aligned_free(buffer_partial_lh);
928     if (_pattern_lh_cat)
929         aligned_free(_pattern_lh_cat);
930     if (_pattern_lh)
931         aligned_free(_pattern_lh);
932     central_partial_lh = NULL;
933     central_scale_num = NULL;
934     central_partial_pars = NULL;
935 
936     ptn_invar = NULL;
937     ptn_freq = NULL;
938     ptn_freq_pars = NULL;
939     ptn_freq_computed = false;
940     theta_all = NULL;
941     buffer_scale_all = NULL;
942     buffer_partial_lh = NULL;
943     _pattern_lh_cat = NULL;
944     _pattern_lh = NULL;
945 
946     tip_partial_lh = NULL;
947     tip_partial_pars = NULL;
948 
949     clearAllPartialLH();
950 }
951 
getMemoryRequired(size_t ncategory,bool full_mem)952 uint64_t PhyloTree::getMemoryRequired(size_t ncategory, bool full_mem) {
953     // +num_states for ascertainment bias correction
954     int64_t nptn = get_safe_upper_limit(aln->getNPattern()) + get_safe_upper_limit(aln->num_states);
955     if (model_factory)
956         nptn = get_safe_upper_limit(aln->getNPattern()) + max(get_safe_upper_limit(aln->num_states), get_safe_upper_limit(model_factory->unobserved_ptns.size()));
957     int64_t scale_block_size = nptn;
958     if (site_rate)
959         scale_block_size *= site_rate->getNRate();
960     else
961         scale_block_size *= ncategory;
962     if (model && !model_factory->fused_mix_rate)
963         scale_block_size *= model->getNMixtures();
964 
965     int64_t block_size = scale_block_size * aln->num_states;
966 
967     int64_t mem_size;
968     // memory to tip_partial_lh
969     if (model)
970         mem_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures() * sizeof(double);
971     else
972         mem_size = aln->num_states * (aln->STATE_UNKNOWN+1) * sizeof(double);
973 
974     // memory for UFBoot
975     if (params->gbo_replicates)
976         mem_size += params->gbo_replicates*nptn*sizeof(BootValType);
977 
978     // memory for model
979     if (model)
980         mem_size += model->getMemoryRequired();
981 
982     int64_t lh_scale_size = block_size * sizeof(double) + scale_block_size * sizeof(UBYTE);
983 
984     max_lh_slots = leafNum-2;
985 
986     if (!full_mem && params->lh_mem_save == LM_MEM_SAVE) {
987         int64_t min_lh_slots = log2(leafNum)+LH_MIN_CONST;
988         if (params->max_mem_size == 0.0) {
989             max_lh_slots = min_lh_slots;
990         } else if (params->max_mem_size <= 1) {
991             max_lh_slots = floor(params->max_mem_size*(leafNum-2));
992         } else {
993             int64_t rest_mem = params->max_mem_size - mem_size;
994 
995             // include 2 blocks for nni_partial_lh
996             max_lh_slots = rest_mem / lh_scale_size - 2;
997 
998             // RAM over requirement, reset to LM_PER_NODE
999             if (max_lh_slots > leafNum-2)
1000                 max_lh_slots = leafNum-2;
1001         }
1002         if (max_lh_slots < min_lh_slots) {
1003             cout << "WARNING: Too low -mem, automatically increased to " << (mem_size + (min_lh_slots+2)*lh_scale_size)/1048576.0 << " MB" << endl;
1004             max_lh_slots = min_lh_slots;
1005         }
1006     }
1007 
1008     // also count MEM for nni_partial_lh
1009     mem_size += (max_lh_slots+2) * lh_scale_size;
1010 
1011 
1012     return mem_size;
1013 }
1014 
getMemoryRequiredThreaded(size_t ncategory,bool full_mem)1015 uint64_t PhyloTree::getMemoryRequiredThreaded(size_t ncategory, bool full_mem) {
1016     return getMemoryRequired(ncategory, full_mem);
1017 }
1018 
getMemoryRequired(uint64_t & partial_lh_entries,uint64_t & scale_num_entries,uint64_t & partial_pars_entries)1019 void PhyloTree::getMemoryRequired(uint64_t &partial_lh_entries, uint64_t &scale_num_entries, uint64_t &partial_pars_entries) {
1020     // +num_states for ascertainment bias correction
1021     uint64_t block_size = get_safe_upper_limit(aln->getNPattern()) + get_safe_upper_limit(aln->num_states);
1022     if (model_factory)
1023         block_size = get_safe_upper_limit(aln->getNPattern()) + max(get_safe_upper_limit(aln->num_states), get_safe_upper_limit(model_factory->unobserved_ptns.size()));
1024     size_t scale_size = block_size;
1025     block_size = block_size * aln->num_states;
1026     if (site_rate) {
1027         block_size *= site_rate->getNRate();
1028         scale_size *= site_rate->getNRate();
1029     }
1030     if (model && !model_factory->fused_mix_rate) {
1031         block_size *= model->getNMixtures();
1032         scale_size *= model->getNMixtures();
1033     }
1034 
1035     uint64_t tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures();
1036     uint64_t tip_partial_pars_size = aln->num_states * (aln->STATE_UNKNOWN+1);
1037 
1038     // TODO mem save
1039     partial_lh_entries = ((uint64_t)leafNum - 2) * (uint64_t) block_size + 4 + tip_partial_lh_size;
1040     scale_num_entries = (leafNum - 2) * scale_size;
1041 
1042     size_t pars_block_size = getBitsBlockSize();
1043     partial_pars_entries = (leafNum - 1) * 4 * pars_block_size + tip_partial_pars_size;
1044 }
1045 
initializeAllPartialLh(int & index,int & indexlh,PhyloNode * node,PhyloNode * dad)1046 void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node, PhyloNode *dad) {
1047     uint64_t pars_block_size = getBitsBlockSize();
1048     // +num_states for ascertainment bias correction
1049     size_t nptn = get_safe_upper_limit(aln->size())+ max(get_safe_upper_limit(aln->num_states), get_safe_upper_limit(model_factory->unobserved_ptns.size()));
1050     uint64_t block_size;
1051     uint64_t scale_block_size = nptn * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
1052     block_size = scale_block_size * model->num_states;
1053 
1054     if (!node) {
1055         node = (PhyloNode*) root;
1056         // allocate the big central partial likelihoods memory
1057 //        size_t IT_NUM = (params->nni5) ? 6 : 2;
1058         size_t IT_NUM = 2;
1059         if (!nni_partial_lh) {
1060             // allocate memory only once!
1061             nni_partial_lh = aligned_alloc<double>(IT_NUM*block_size);
1062             nni_scale_num = aligned_alloc<UBYTE>(IT_NUM*scale_block_size);
1063         }
1064 
1065 
1066         if (!central_partial_lh) {
1067             uint64_t tip_partial_lh_size = get_safe_upper_limit(aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures());
1068             if (model->isSiteSpecificModel())
1069                 tip_partial_lh_size = get_safe_upper_limit(aln->size()) * model->num_states * leafNum;
1070 
1071             if (max_lh_slots == 0)
1072                 getMemoryRequired();
1073 
1074             uint64_t mem_size = (uint64_t)max_lh_slots * block_size + 4 + tip_partial_lh_size;
1075 
1076             if (verbose_mode >= VB_MAX)
1077                 cout << "Allocating " << mem_size * sizeof(double) << " bytes for partial likelihood vectors" << endl;
1078             try {
1079                 central_partial_lh = aligned_alloc<double>(mem_size);
1080             } catch (std::bad_alloc &ba) {
1081                 outError("Not enough memory for partial likelihood vectors (bad_alloc)");
1082             }
1083             if (!central_partial_lh)
1084                 outError("Not enough memory for partial likelihood vectors");
1085         }
1086 
1087         // now always assign tip_partial_lh
1088         if (params->lh_mem_save == LM_PER_NODE) {
1089             tip_partial_lh = central_partial_lh + ((nodeNum - leafNum)*block_size);
1090         } else {
1091             tip_partial_lh = central_partial_lh + (max_lh_slots*block_size);
1092         }
1093 
1094         if (!central_scale_num) {
1095             uint64_t mem_size = max_lh_slots * scale_block_size;
1096 
1097             if (verbose_mode >= VB_MAX)
1098                 cout << "Allocating " << mem_size * sizeof(UBYTE) << " bytes for scale num vectors" << endl;
1099             try {
1100                 central_scale_num = aligned_alloc<UBYTE>(mem_size);
1101             } catch (std::bad_alloc &ba) {
1102                 outError("Not enough memory for scale num vectors (bad_alloc)");
1103             }
1104             if (!central_scale_num)
1105                 outError("Not enough memory for scale num vectors");
1106         }
1107 
1108         if (!central_partial_pars) {
1109             uint64_t tip_partial_pars_size = get_safe_upper_limit_float(aln->num_states * (aln->STATE_UNKNOWN+1));
1110             uint64_t mem_size = (leafNum - 1) * 4 * pars_block_size + tip_partial_pars_size;
1111             if (verbose_mode >= VB_MAX)
1112                 cout << "Allocating " << mem_size * sizeof(UINT)
1113                         << " bytes for partial parsimony vectors" << endl;
1114             try {
1115                 central_partial_pars = aligned_alloc<UINT>(mem_size);
1116             } catch (std::bad_alloc &ba) {
1117                 outError("Not enough memory for partial parsimony vectors (bad_alloc)");
1118             }
1119             if (!central_partial_pars)
1120                 outError("Not enough memory for partial parsimony vectors");
1121             tip_partial_pars = central_partial_pars + ((leafNum - 1) * 4 * pars_block_size);
1122         }
1123         index = 0;
1124         indexlh = 0;
1125     }
1126     if (dad) {
1127         // assign a region in central_partial_lh to both Neihgbors (dad->node, and node->dad)
1128         PhyloNeighbor *nei = (PhyloNeighbor*) node->findNeighbor(dad);
1129         PhyloNeighbor *nei2 = (PhyloNeighbor*) dad->findNeighbor(node);
1130 
1131         // first initialize partial_pars
1132         nei->partial_pars = central_partial_pars + (index * pars_block_size);
1133         index++;
1134         nei2->partial_pars = central_partial_pars + (index * pars_block_size);
1135         index ++;
1136         ASSERT(index < nodeNum * 2 - 1);
1137 
1138         // now initialize partial_lh and scale_num
1139         if (params->lh_mem_save == LM_PER_NODE) {
1140             if (!node->isLeaf()) { // only allocate memory to internal node
1141                 nei->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
1142                 nei->scale_num = NULL;
1143                 nei2->scale_num = central_scale_num + ((indexlh) * scale_block_size);
1144                 nei2->partial_lh = central_partial_lh + (indexlh * block_size);
1145                 indexlh++;
1146             } else {
1147                 nei->partial_lh = NULL;
1148                 nei->scale_num = NULL;
1149                 nei2->scale_num = NULL;
1150                 nei2->partial_lh = NULL;
1151             }
1152         } else {
1153             nei->partial_lh = NULL;
1154             nei->scale_num = NULL;
1155             nei2->scale_num = NULL;
1156             nei2->partial_lh = NULL;
1157         }
1158 
1159         // zero memory to allocate contiguous chunk of memory
1160 //        if (nei->partial_lh)
1161 //            memset(nei->partial_lh, 0, block_size*sizeof(double));
1162 //        if (nei2->partial_lh)
1163 //            memset(nei2->partial_lh, 0, block_size*sizeof(double));
1164 
1165 //        if (model->isSiteSpecificModel() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
1166 //            // allocate tip memory for this model
1167 //            if (node->isLeaf()) {
1168 //                nei2->partial_lh = tip_partial_lh + (node->id * tip_block_size);
1169 //            }
1170 //            if (dad->isLeaf()) {
1171 //                nei->partial_lh = tip_partial_lh + (dad->id * tip_block_size);
1172 //            }
1173 //        }
1174     }
1175     FOR_NEIGHBOR_IT(node, dad, it) initializeAllPartialLh(index, indexlh, (PhyloNode*) (*it)->node, node);
1176 }
1177 
newPartialLh()1178 double *PhyloTree::newPartialLh() {
1179     return aligned_alloc<double>(getPartialLhSize());
1180 }
1181 
getPartialLhSize()1182 size_t PhyloTree::getPartialLhSize() {
1183     // +num_states for ascertainment bias correction
1184     size_t block_size = get_safe_upper_limit(aln->size())+max(get_safe_upper_limit(aln->num_states),
1185         get_safe_upper_limit(model_factory->unobserved_ptns.size()));
1186     block_size *= model->num_states * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
1187     return block_size;
1188 }
1189 
getPartialLhBytes()1190 size_t PhyloTree::getPartialLhBytes() {
1191     // +num_states for ascertainment bias correction
1192     return getPartialLhSize() * sizeof(double);
1193 }
1194 
getScaleNumSize()1195 size_t PhyloTree::getScaleNumSize() {
1196     size_t block_size = get_safe_upper_limit(aln->size())+max(get_safe_upper_limit(aln->num_states),
1197         get_safe_upper_limit(model_factory->unobserved_ptns.size()));
1198     return (block_size) * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
1199 }
1200 
getScaleNumBytes()1201 size_t PhyloTree::getScaleNumBytes() {
1202     return getScaleNumSize()*sizeof(UBYTE);
1203 }
1204 
newScaleNum()1205 UBYTE *PhyloTree::newScaleNum() {
1206     return aligned_alloc<UBYTE>(getScaleNumSize());
1207 }
1208 
findFirstFarLeaf(Node * node,Node * dad=NULL)1209 Node *findFirstFarLeaf(Node *node, Node *dad = NULL) {
1210 
1211     do {
1212         FOR_NEIGHBOR_IT(node, dad, it) {
1213             dad = node;
1214             node = (*it)->node;
1215             break;
1216         }
1217     } while (!node->isLeaf());
1218     return node;
1219 
1220 }
1221 
computeLikelihood(double * pattern_lh)1222 double PhyloTree::computeLikelihood(double *pattern_lh) {
1223     ASSERT(model);
1224     ASSERT(site_rate);
1225     ASSERT(root->isLeaf());
1226     if (!current_it) {
1227         Node *leaf = findFarthestLeaf();
1228         current_it = (PhyloNeighbor*)leaf->neighbors[0];
1229         current_it_back = (PhyloNeighbor*)current_it->node->findNeighbor(leaf);
1230 //        PhyloNeighbor *nei = ((PhyloNeighbor*) root->neighbors[0]);
1231 //        current_it = nei;
1232 //        assert(current_it);
1233 //        current_it_back = (PhyloNeighbor*) nei->node->findNeighbor(root);
1234 //        assert(current_it_back);
1235     }
1236     double score;
1237 //    string root_name = ROOT_NAME;
1238 //    Node *vroot = findLeafName(root_name);
1239 //    if (root_state != aln->STATE_UNKNOWN && vroot) {
1240 //        if (verbose_mode >= VB_DEBUG)
1241 //            cout << __func__ << " HIT ROOT STATE " << endl;
1242 //        score = computeLikelihoodRooted((PhyloNeighbor*) vroot->neighbors[0], (PhyloNode*) vroot);
1243 //    } else {
1244         score = computeLikelihoodBranch(current_it, (PhyloNode*) current_it_back->node);
1245 //    }
1246     if (pattern_lh)
1247         memmove(pattern_lh, _pattern_lh, aln->size() * sizeof(double));
1248 
1249     if (pattern_lh && current_it->lh_scale_factor < 0.0) {
1250         int nptn = aln->getNPattern();
1251         //double check_score = 0.0;
1252         for (int i = 0; i < nptn; i++) {
1253             pattern_lh[i] += max(current_it->scale_num[i], UBYTE(0)) * LOG_SCALING_THRESHOLD;
1254             //check_score += (pattern_lh[i] * (aln->at(i).frequency));
1255         }
1256         /*       if (fabs(score - check_score) > 1e-6) {
1257          cout << "score = " << score << " check_score = " << check_score << endl;
1258          outError("Scaling error ", __func__);
1259          }*/
1260     }
1261     curScore = score;
1262     return score;
1263 }
1264 
1265 //double PhyloTree::computeLikelihoodRooted(PhyloNeighbor *dad_branch, PhyloNode *dad) {
1266 //    double score = computeLikelihoodBranchNaive(dad_branch, dad);
1267 //    if (verbose_mode >= VB_DEBUG) {
1268 //        printTransMatrices(dad_branch->node, dad);
1269 //        /*
1270 //         FOR_NEIGHBOR_IT(dad_branch->node, dad, it) {
1271 //         PhyloNeighbor *pit = (PhyloNeighbor*)(*it);
1272 //         cout << pit->node->name << "\t" << pit->partial_lh[0] << endl;
1273 //
1274 //         }*/
1275 //    }
1276 //    double* state_freq = new double[aln->num_states];
1277 //    model->getStateFrequency(state_freq);
1278 //    score -= log(state_freq[(int) root_state]);
1279 //    delete[] state_freq;
1280 //    return score;
1281 //}
1282 
getNumLhCat(SiteLoglType wsl)1283 int PhyloTree::getNumLhCat(SiteLoglType wsl) {
1284     int ncat = 0;
1285     switch (wsl) {
1286     case WSL_NONE: ASSERT(0 && "is not WSL_NONE"); return 0;
1287     case WSL_SITE: ASSERT(0 && "is not WSL_SITE"); return 0;
1288     case WSL_MIXTURE_RATECAT:
1289         ncat = getRate()->getNDiscreteRate();
1290         if (getModel()->isMixture() && !getModelFactory()->fused_mix_rate)
1291             ncat *= getModel()->getNMixtures();
1292         return ncat;
1293     case WSL_RATECAT:
1294         return getRate()->getNDiscreteRate();
1295     case WSL_MIXTURE:
1296         return getModel()->getNMixtures();
1297     }
1298 }
1299 
transformPatternLhCat()1300 void PhyloTree::transformPatternLhCat() {
1301     if (vector_size == 1)
1302         return;
1303 
1304     size_t nptn = ((aln->size()+vector_size-1)/vector_size)*vector_size;
1305 //    size_t nstates = aln->num_states;
1306     size_t ncat = site_rate->getNRate();
1307     if (!model_factory->fused_mix_rate) ncat *= model->getNMixtures();
1308 
1309     double *mem = aligned_alloc<double>(nptn*ncat);
1310     memcpy(mem, _pattern_lh_cat, nptn*ncat*sizeof(double));
1311     double *memptr = mem;
1312 
1313     size_t ptn, cat, i;
1314     for (ptn = 0; ptn < nptn; ptn+=vector_size) {
1315         double *lh_cat_ptr = &_pattern_lh_cat[ptn*ncat];
1316         for (cat = 0; cat < ncat; cat++) {
1317             for (i = 0; i < vector_size; i++)
1318                 lh_cat_ptr[i*ncat+cat] = memptr[i];
1319             memptr += vector_size;
1320         }
1321     }
1322     aligned_free(mem);
1323 }
1324 
computePatternLhCat(SiteLoglType wsl)1325 double PhyloTree::computePatternLhCat(SiteLoglType wsl) {
1326     if (!current_it) {
1327         Node *leaf = findFirstFarLeaf(root);
1328         current_it = (PhyloNeighbor*)leaf->neighbors[0];
1329         current_it_back = (PhyloNeighbor*)current_it->node->findNeighbor(leaf);
1330     }
1331 
1332     double score;
1333 
1334     score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1335     // TODO: SIMD aware
1336     transformPatternLhCat();
1337     /*
1338     if (getModel()->isSiteSpecificModel()) {
1339         score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1340     } else if (!getModel()->isMixture())
1341         score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1342     else if (getModelFactory()->fused_mix_rate)
1343         score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1344     else {
1345         score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1346     */
1347     if (!getModel()->isSiteSpecificModel() && getModel()->isMixture() && !getModelFactory()->fused_mix_rate) {
1348         if (wsl == WSL_MIXTURE || wsl == WSL_RATECAT) {
1349             double *lh_cat = _pattern_lh_cat;
1350             double *lh_res = _pattern_lh_cat;
1351             size_t ptn, nptn = aln->getNPattern();
1352             size_t m, nmixture = getModel()->getNMixtures();
1353             size_t c, ncat = getRate()->getNRate();
1354             if (wsl == WSL_MIXTURE && ncat > 1) {
1355                 // transform to lh per mixture class
1356                 for (ptn = 0; ptn < nptn; ptn++) {
1357                     for (m = 0; m < nmixture; m++) {
1358                         double lh = lh_cat[0];
1359                         for (c = 1; c < ncat; c++)
1360                             lh += lh_cat[c];
1361                         lh_res[m] = lh;
1362                         lh_cat += ncat;
1363                     }
1364                     lh_res += nmixture;
1365                 }
1366             } else if (wsl == WSL_RATECAT && nmixture > 1) {
1367                 // transform to lh per rate category
1368                 for (ptn = 0; ptn < nptn; ptn++) {
1369                     if (lh_res != lh_cat)
1370                         memcpy(lh_res, lh_cat, ncat*sizeof(double));
1371                     lh_cat += ncat;
1372                     for (m = 1; m < nmixture; m++) {
1373                         for (c = 0; c < ncat; c++)
1374                             lh_res[c] += lh_cat[c];
1375                         lh_cat += ncat;
1376                     }
1377                     lh_res += ncat;
1378                 }
1379             }
1380         }
1381     }
1382 
1383     return score;
1384 }
1385 
computePatternStateFreq(double * ptn_state_freq)1386 void PhyloTree::computePatternStateFreq(double *ptn_state_freq) {
1387     ASSERT(getModel()->isMixture());
1388     computePatternLhCat(WSL_MIXTURE);
1389     double *lh_cat = _pattern_lh_cat;
1390     size_t ptn, nptn = getAlnNPattern(), m, nmixture = getModel()->getNMixtures();
1391     double *ptn_freq = ptn_state_freq;
1392     size_t state, nstates = aln->num_states;
1393 //    ModelMixture *models = (ModelMixture*)model;
1394 
1395     if (params->print_site_state_freq == WSF_POSTERIOR_MEAN) {
1396         cout << "Computing posterior mean site frequencies...." << endl;
1397         // loop over all site-patterns
1398         for (ptn = 0; ptn < nptn; ptn++) {
1399 
1400             // first compute posterior for each mixture component
1401             double sum_lh = 0.0;
1402             for (m = 0; m < nmixture; m++) {
1403                 sum_lh += lh_cat[m];
1404             }
1405             sum_lh = 1.0/sum_lh;
1406             for (m = 0; m < nmixture; m++) {
1407                 lh_cat[m] *= sum_lh;
1408             }
1409 
1410             // now compute state frequencies
1411             for (state = 0; state < nstates; state++) {
1412                 double freq = 0;
1413                 for (m = 0; m < nmixture; m++)
1414                     freq += model->getMixtureClass(m)->state_freq[state] * lh_cat[m];
1415                 ptn_freq[state] = freq;
1416             }
1417 
1418             // increase the pointers
1419             lh_cat += nmixture;
1420             ptn_freq += nstates;
1421         }
1422     } else if (params->print_site_state_freq == WSF_POSTERIOR_MAX) {
1423         cout << "Computing posterior max site frequencies...." << endl;
1424         // loop over all site-patterns
1425         for (ptn = 0; ptn < nptn; ptn++) {
1426 
1427             // first compute posterior for each mixture component
1428             size_t max_comp = 0;
1429             for (m = 1; m < nmixture; m++)
1430                 if (lh_cat[m] > lh_cat[max_comp]) {
1431                     max_comp = m;
1432                 }
1433 
1434             // now compute state frequencies
1435             memcpy(ptn_freq, model->getMixtureClass(max_comp)->state_freq, nstates*sizeof(double));
1436 
1437             // increase the pointers
1438             lh_cat += nmixture;
1439             ptn_freq += nstates;
1440         }
1441     }
1442 }
1443 
1444 
1445 
computePatternLikelihood(double * ptn_lh,double * cur_logl,double * ptn_lh_cat,SiteLoglType wsl)1446 void PhyloTree::computePatternLikelihood(double *ptn_lh, double *cur_logl, double *ptn_lh_cat, SiteLoglType wsl) {
1447     /*    if (!dad_branch) {
1448      dad_branch = (PhyloNeighbor*) root->neighbors[0];
1449      dad = (PhyloNode*) root;
1450      }*/
1451     int nptn = aln->getNPattern();
1452     int i;
1453     int ncat = getNumLhCat(wsl);
1454     if (ptn_lh_cat) {
1455         // Right now only Naive version store _pattern_lh_cat!
1456         computePatternLhCat(wsl);
1457     }
1458 
1459     double sum_scaling = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
1460     //double sum_scaling = 0.0;
1461     if (sum_scaling < 0.0) {
1462         if (current_it->lh_scale_factor == 0.0) {
1463             for (i = 0; i < nptn; i++) {
1464                 ptn_lh[i] = _pattern_lh[i] + (max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
1465             }
1466         } else if (current_it_back->lh_scale_factor == 0.0){
1467             for (i = 0; i < nptn; i++) {
1468                 ptn_lh[i] = _pattern_lh[i] + (max(UBYTE(0), current_it->scale_num[i])) * LOG_SCALING_THRESHOLD;
1469             }
1470         } else {
1471             for (i = 0; i < nptn; i++) {
1472                 ptn_lh[i] = _pattern_lh[i] + (max(UBYTE(0), current_it->scale_num[i]) +
1473                     max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
1474             }
1475         }
1476     } else {
1477         memmove(ptn_lh, _pattern_lh, nptn * sizeof(double));
1478     }
1479 
1480     if (!ptn_lh_cat)
1481         return;
1482 
1483     /*
1484     if (ptn_lh_cat && model->isSiteSpecificModel()) {
1485         int offset = 0;
1486         if (sum_scaling == 0.0) {
1487             int nptncat = nptn * ncat;
1488             for (i = 0; i < nptncat; i++) {
1489                 ptn_lh_cat[i] = log(_pattern_lh_cat[i]);
1490             }
1491         } else if (current_it->lh_scale_factor == 0.0) {
1492             for (i = 0; i < nptn; i++) {
1493                 double scale = (max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
1494                 for (int j = 0; j < ncat; j++, offset++)
1495                     ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
1496             }
1497         } else if (current_it_back->lh_scale_factor == 0.0) {
1498             for (i = 0; i < nptn; i++) {
1499                 double scale = (max(UBYTE(0), current_it->scale_num[i])) * LOG_SCALING_THRESHOLD;
1500                 for (int j = 0; j < ncat; j++, offset++)
1501                     ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
1502             }
1503         } else {
1504             for (i = 0; i < nptn; i++) {
1505                 double scale = (max(UBYTE(0), current_it->scale_num[i]) +
1506                         max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
1507                 for (int j = 0; j < ncat; j++, offset++)
1508                     ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
1509             }
1510         }
1511         return;
1512     }
1513     */
1514 
1515     // New kernel
1516     int ptn;
1517     PhyloNeighbor *nei1 = current_it;
1518     PhyloNeighbor *nei2 = current_it_back;
1519     if (!nei1->node->isLeaf() && nei2->node->isLeaf()) {
1520         // exchange
1521         PhyloNeighbor *tmp = nei1;
1522         nei1 = nei2;
1523         nei2 = tmp;
1524     }
1525     if (nei1->node->isLeaf()) {
1526         // external branch
1527         double *lh_cat = _pattern_lh_cat;
1528         double *out_lh_cat = ptn_lh_cat;
1529         UBYTE *nei2_scale = nei2->scale_num;
1530         if (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling) {
1531             // per-category scaling
1532             for (ptn = 0; ptn < nptn; ptn++) {
1533                 for (i = 0; i < ncat; i++) {
1534                     out_lh_cat[i] = log(lh_cat[i]) + nei2_scale[i] * LOG_SCALING_THRESHOLD;
1535                 }
1536                 lh_cat += ncat;
1537                 out_lh_cat += ncat;
1538                 nei2_scale += ncat;
1539             }
1540         } else {
1541             // normal scaling
1542             for (ptn = 0; ptn < nptn; ptn++) {
1543                 double scale = nei2_scale[ptn] * LOG_SCALING_THRESHOLD;
1544                 for (i = 0; i < ncat; i++)
1545                     out_lh_cat[i] = log(lh_cat[i]) + scale;
1546                 lh_cat += ncat;
1547                 out_lh_cat += ncat;
1548             }
1549         }
1550     } else {
1551         // internal branch
1552         double *lh_cat = _pattern_lh_cat;
1553         double *out_lh_cat = ptn_lh_cat;
1554         UBYTE *nei1_scale = nei1->scale_num;
1555         UBYTE *nei2_scale = nei2->scale_num;
1556         if (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling) {
1557             // per-category scaling
1558             for (ptn = 0; ptn < nptn; ptn++) {
1559                 for (i = 0; i < ncat; i++) {
1560                     out_lh_cat[i] = log(lh_cat[i]) + (nei1_scale[i]+nei2_scale[i]) * LOG_SCALING_THRESHOLD;
1561                 }
1562                 lh_cat += ncat;
1563                 out_lh_cat += ncat;
1564                 nei1_scale += ncat;
1565                 nei2_scale += ncat;
1566             }
1567         } else {
1568             // normal scaling
1569             for (ptn = 0; ptn < nptn; ptn++) {
1570                 double scale = (nei1_scale[ptn] + nei2_scale[ptn]) * LOG_SCALING_THRESHOLD;
1571                 for (i = 0; i < ncat; i++)
1572                     out_lh_cat[i] = log(lh_cat[i]) + scale;
1573                 lh_cat += ncat;
1574                 out_lh_cat += ncat;
1575             }
1576         }
1577     }
1578 
1579 //    if (cur_logl) {
1580 //        double check_score = 0.0;
1581 //        for (int i = 0; i < nptn; i++) {
1582 //            check_score += (ptn_lh[i] * (aln->at(i).frequency));
1583 //        }
1584 //        if (fabs(check_score - *cur_logl) > 0.01) {
1585 //            cout << *cur_logl << " " << check_score << endl;
1586 //            assert(0);
1587 //        }
1588 //    }
1589     //double score = computeLikelihoodBranch(dad_branch, dad, pattern_lh);
1590     //return score;
1591 }
1592 
computePatternProbabilityCategory(double * ptn_prob_cat,SiteLoglType wsl)1593 void PhyloTree::computePatternProbabilityCategory(double *ptn_prob_cat, SiteLoglType wsl) {
1594     /*    if (!dad_branch) {
1595      dad_branch = (PhyloNeighbor*) root->neighbors[0];
1596      dad = (PhyloNode*) root;
1597      }*/
1598     size_t ptn, nptn = aln->getNPattern();
1599     size_t cat, ncat = getNumLhCat(wsl);
1600     // Right now only Naive version store _pattern_lh_cat!
1601     computePatternLhCat(wsl);
1602 
1603     memcpy(ptn_prob_cat, _pattern_lh_cat, sizeof(double)*nptn*ncat);
1604 
1605     for (ptn = 0; ptn < nptn; ptn++) {
1606         double *lh_cat = ptn_prob_cat + ptn*ncat;
1607         double sum = lh_cat[0];
1608         for (cat = 1; cat < ncat; cat++)
1609             sum += lh_cat[cat];
1610         sum = 1.0/sum;
1611         for (cat = 0; cat < ncat; cat++)
1612             lh_cat[cat] *= sum;
1613     }
1614 }
1615 
computePatternCategories(IntVector * pattern_ncat)1616 int PhyloTree::computePatternCategories(IntVector *pattern_ncat) {
1617     if (sse != LK_386) {
1618         // compute _pattern_lh_cat
1619         computePatternLhCat(WSL_MIXTURE_RATECAT);
1620     }
1621 
1622     size_t npattern = aln->getNPattern();
1623     size_t ncat = getRate()->getNRate();
1624     size_t nmixture;
1625     if (getModel()->isMixture() && !getModelFactory()->fused_mix_rate)
1626         nmixture = getModel()->getNMixtures();
1627     else
1628         nmixture = ncat;
1629     size_t ptn, m, c;
1630     if (pattern_ncat)
1631         pattern_ncat->resize(npattern);
1632     if (ptn_cat_mask.empty())
1633         ptn_cat_mask.resize(npattern, 0);
1634 
1635     size_t num_best_mixture = 0;
1636     ASSERT(ncat < sizeof(uint64_t)*8 && nmixture < sizeof(uint64_t)*8);
1637 
1638     double *lh_cat = _pattern_lh_cat;
1639 //    double *cat_prob = new double[ncat];
1640     double *lh_mixture = new double[nmixture];
1641     double *sorted_lh_mixture = new double[nmixture];
1642     int *id_mixture = new int[nmixture];
1643 
1644 //    for (c = 0; c < ncat; c++)
1645 //        cat_prob[c] = getRate()->getProp(c);
1646 
1647 //    cout << "Ptn\tFreq\tNumMix\tBestMix" << endl;
1648     size_t sum_nmix = 0;
1649     for (ptn = 0; ptn < npattern; ptn++) {
1650         double sum_prob = 0.0, acc_prob = 0.0;
1651         memset(lh_mixture, 0, nmixture*sizeof(double));
1652         if (getModel()->isMixture() && !getModelFactory()->fused_mix_rate) {
1653             for (m = 0; m < nmixture; m++) {
1654                 for (c = 0; c < ncat; c++) {
1655 //                    lh_mixture[m] += lh_cat[c] * cat_prob[c];
1656                     lh_mixture[m] += lh_cat[c];
1657                 }
1658 //                lh_mixture[m] *= prop[m];
1659                 sum_prob += lh_mixture[m];
1660                 lh_cat += ncat;
1661                 id_mixture[m] = m;
1662             }
1663         } else {
1664             for (m = 0; m < nmixture; m++) {
1665 //                lh_mixture[m] = lh_cat[m] * prop[m];
1666                 lh_mixture[m] = lh_cat[m];
1667                 sum_prob += lh_mixture[m];
1668                 id_mixture[m] = m;
1669             }
1670             lh_cat += nmixture;
1671         }
1672         sum_prob = 1.0 / sum_prob;
1673         for (m = 0; m < nmixture; m++) {
1674             lh_mixture[m] *= sum_prob;
1675             sorted_lh_mixture[m] = -lh_mixture[m];
1676         }
1677         quicksort(sorted_lh_mixture, 0, m-1, id_mixture);
1678         for (m = 0; m < nmixture && acc_prob <= 0.99; m++) {
1679             acc_prob -= sorted_lh_mixture[m];
1680             ptn_cat_mask[ptn] |= (uint64_t)1 << id_mixture[m];
1681         }
1682         if (m > num_best_mixture)
1683             num_best_mixture = m;
1684         sum_nmix += m;
1685         if (pattern_ncat)
1686             (*pattern_ncat)[ptn] = m;
1687 
1688         if (verbose_mode >= VB_MED) {
1689             cout << ptn << "\t" << (int)ptn_freq[ptn] << "\t" << m << "\t" << id_mixture[0];
1690             for (c = 0; c < m; c++)
1691                 cout  << "\t" << id_mixture[c] << "\t" << -sorted_lh_mixture[c];
1692             cout << endl;
1693         }
1694     }
1695 //    cout << 100*(double(sum_nmix)/nmixture)/npattern << "% computation necessary" << endl;
1696     delete [] id_mixture;
1697     delete [] sorted_lh_mixture;
1698     delete [] lh_mixture;
1699 //    delete [] cat_prob;
1700     return num_best_mixture;
1701 }
1702 
computeLogLVariance(double * ptn_lh,double tree_lh)1703 double PhyloTree::computeLogLVariance(double *ptn_lh, double tree_lh) {
1704     int i;
1705     int nptn = getAlnNPattern();
1706     int nsite = getAlnNSite();
1707     double *pattern_lh = ptn_lh;
1708     if (!ptn_lh) {
1709         pattern_lh = new double[nptn];
1710         computePatternLikelihood(pattern_lh);
1711     }
1712     IntVector pattern_freq;
1713     aln->getPatternFreq(pattern_freq);
1714     if (tree_lh == 0.0) {
1715         for (i = 0; i < nptn; i++)
1716             tree_lh += pattern_lh[i] * pattern_freq[i];
1717     }
1718     double avg_site_lh = tree_lh / nsite;
1719     double variance = 0.0;
1720     for (i = 0; i < nptn; i++) {
1721         double diff = (pattern_lh[i] - avg_site_lh);
1722         variance += diff * diff * pattern_freq[i];
1723     }
1724     if (!ptn_lh)
1725         delete[] pattern_lh;
1726     if (nsite <= 1)
1727         return 0.0;
1728     return variance * ((double) nsite / (nsite - 1.0));
1729 }
1730 
computeLogLDiffVariance(double * pattern_lh_other,double * ptn_lh)1731 double PhyloTree::computeLogLDiffVariance(double *pattern_lh_other, double *ptn_lh) {
1732     int i;
1733     int nptn = getAlnNPattern();
1734     int nsite = getAlnNSite();
1735     double *pattern_lh = ptn_lh;
1736     if (!ptn_lh) {
1737         pattern_lh = new double[nptn];
1738         computePatternLikelihood(pattern_lh);
1739     }
1740     IntVector pattern_freq;
1741     aln->getPatternFreq(pattern_freq);
1742 
1743     double avg_site_lh_diff = 0.0;
1744     for (i = 0; i < nptn; i++)
1745         avg_site_lh_diff += (pattern_lh[i] - pattern_lh_other[i]) * pattern_freq[i];
1746     avg_site_lh_diff /= nsite;
1747     double variance = 0.0;
1748     for (i = 0; i < nptn; i++) {
1749         double diff = (pattern_lh[i] - pattern_lh_other[i] - avg_site_lh_diff);
1750         variance += diff * diff * pattern_freq[i];
1751     }
1752     if (!ptn_lh)
1753         delete[] pattern_lh;
1754     if (nsite <= 1)
1755         return 0.0;
1756     return variance * ((double) nsite / (nsite - 1.0));
1757 }
1758 
computeLogLDiffVariance(PhyloTree * other_tree,double * pattern_lh)1759 double PhyloTree::computeLogLDiffVariance(PhyloTree *other_tree, double *pattern_lh) {
1760     double *pattern_lh_other = new double[getAlnNPattern()];
1761     other_tree->computePatternLikelihood(pattern_lh_other);
1762     // BUG FIX found by Xcode analyze (use of memory after it is freed)
1763 //    delete[] pattern_lh_other;
1764     double res = computeLogLDiffVariance(pattern_lh_other, pattern_lh);
1765     delete[] pattern_lh_other;
1766     return res;
1767 }
1768 
getUnmarkedNodes(PhyloNodeVector & unmarkedNodes,PhyloNode * node,PhyloNode * dad)1769 void PhyloTree::getUnmarkedNodes(PhyloNodeVector& unmarkedNodes, PhyloNode* node, PhyloNode* dad) {
1770     if (!node) {
1771         node = (PhyloNode*) root;
1772     }
1773 
1774     if (markedNodeList.find(node->id) == markedNodeList.end()) {
1775         int numUnmarkedNei = 0;
1776         for (NeighborVec::iterator it = (node)->neighbors.begin(); it != (node)->neighbors.end(); it++) {
1777             if (markedNodeList.find((*it)->node->id) == markedNodeList.end())
1778                 numUnmarkedNei++;
1779         }
1780         if (numUnmarkedNei == 1)
1781             unmarkedNodes.push_back(node);
1782     }
1783 
1784     FOR_NEIGHBOR_IT(node, dad, it){
1785     getUnmarkedNodes(unmarkedNodes, (PhyloNode*) (*it)->node, node);
1786 }
1787 }
1788 
optimizeOneBranchLS(PhyloNode * node1,PhyloNode * node2)1789 double PhyloTree::optimizeOneBranchLS(PhyloNode *node1, PhyloNode *node2) {
1790     if (!subTreeDistComputed) {
1791         if (params->ls_var_type == WLS_PAUPLIN) {
1792             computeNodeBranchDists();
1793             for (int i = 0; i < leafNum; i++)
1794                 for (int j = 0; j < leafNum; j++)
1795                     var_matrix[i*leafNum+j] = pow(2.0,nodeBranchDists[i*nodeNum+j]);
1796         }
1797         computeSubtreeDists();
1798     }
1799     double A, B, C, D;
1800     A = B = C = D = 0;
1801     PhyloNode *nodeA = NULL, *nodeB = NULL, *nodeC = NULL, *nodeD = NULL;
1802     double lsBranch;
1803 
1804     // One of the node is a leaf
1805     if (node1->isLeaf() || node2->isLeaf()) {
1806         if (node1->isLeaf()) {
1807             // nodeA and nodeB are children of node2
1808             FOR_NEIGHBOR_IT(node2, node1, it){
1809                 if (A == 0) {
1810                     A = getNumTaxa((*it)->node, node2);
1811                     nodeA = (PhyloNode*) (*it)->node;
1812                 } else {
1813                     B = getNumTaxa((*it)->node, node2);
1814                     nodeB = (PhyloNode*) (*it)->node;
1815                 }
1816             }
1817             // nodeC is now node1
1818             nodeC = node1;
1819         } else {
1820             // nodeA and nodeB are children of node1
1821             FOR_NEIGHBOR_IT(node1, node2, it) {
1822                 if (A == 0) {
1823                     A = getNumTaxa((*it)->node, node1);
1824                     nodeA = (PhyloNode*) (*it)->node;
1825                 } else {
1826                     B = getNumTaxa((*it)->node, node1);
1827                     nodeB = (PhyloNode*) (*it)->node;
1828                 }
1829             }
1830             // nodeC is now node1
1831             nodeC = node2;
1832         }
1833         ASSERT(A != 0);
1834         ASSERT(B != 0);
1835         string keyAC = getBranchID(nodeA, nodeC);
1836         ASSERT(subTreeDists.count(keyAC));
1837         double distAC = subTreeDists[keyAC];
1838         double weightAC = subTreeWeights[keyAC];
1839         string keyBC = getBranchID(nodeB, nodeC);
1840         ASSERT(subTreeDists.count(keyBC));
1841         double distBC = subTreeDists[keyBC];
1842         double weightBC = subTreeWeights[keyBC];
1843         string keyAB = getBranchID(nodeA, nodeB);
1844         ASSERT(subTreeDists.count(keyAB));
1845         double distAB = subTreeDists[keyAB];
1846         double weightAB = subTreeWeights[keyAB];
1847         if (params->ls_var_type == OLS/* || params->ls_var_type == FIRST_TAYLOR || params->ls_var_type == FITCH_MARGOLIASH
1848                 || params->ls_var_type == SECOND_TAYLOR*/) {
1849             lsBranch = 0.5 * (distAC / A + distBC / B - distAB / (A * B));
1850         } /*else if (params->ls_var_type == PAUPLIN) {
1851             // TODO: Chua test bao gio
1852             outError("Paulin formula not supported yet");
1853             lsBranch = 0.5 * (distAC + distBC) - 0.5 * distAB;
1854         }*/ else {
1855             // weighted least square
1856             lsBranch = 0.5*(distAC/weightAC + distBC/weightBC - distAB/weightAB);
1857         }
1858     } else { // Both node are internal node
1859         FOR_NEIGHBOR_IT(node1, node2, it) {
1860             if (A == 0) {
1861                 A = getNumTaxa((*it)->node, node1);
1862                 nodeA = (PhyloNode*) (*it)->node;
1863             } else {
1864                 B = getNumTaxa((*it)->node, node1);
1865                 nodeB = (PhyloNode*) (*it)->node;
1866             }
1867         }
1868 
1869         FOR_NEIGHBOR_IT(node2, node1, it) {
1870             if (C == 0) {
1871                 C = getNumTaxa((*it)->node, node2);
1872                 nodeC = (PhyloNode*) (*it)->node;
1873             } else {
1874                 D = getNumTaxa((*it)->node, node2);
1875                 nodeD = (PhyloNode*) (*it)->node;
1876             }
1877         }
1878 
1879         string keyAC = getBranchID(nodeA, nodeC);
1880         ASSERT(subTreeDists.count(keyAC));
1881         double distAC = subTreeDists[keyAC];
1882         double weightAC = subTreeWeights[keyAC];
1883 
1884         string keyBD = getBranchID(nodeB, nodeD);
1885         ASSERT(subTreeDists.count(keyBD));
1886         double distBD = subTreeDists[keyBD];
1887         double weightBD = subTreeWeights[keyBD];
1888 
1889         string keyBC = getBranchID(nodeB, nodeC);
1890         ASSERT(subTreeDists.count(keyBC));
1891         double distBC = subTreeDists[keyBC];
1892         double weightBC = subTreeWeights[keyBC];
1893 
1894         string keyAD = getBranchID(nodeA, nodeD);
1895         ASSERT(subTreeDists.count(keyAD));
1896         double distAD = subTreeDists[keyAD];
1897         double weightAD = subTreeWeights[keyAD];
1898 
1899         string keyAB = getBranchID(nodeA, nodeB);
1900         ASSERT(subTreeDists.count(keyAB));
1901         double distAB = subTreeDists[keyAB];
1902         double weightAB = subTreeWeights[keyAB];
1903 
1904         string keyCD = getBranchID(nodeC, nodeD);
1905         ASSERT(subTreeDists.count(keyCD));
1906         double distCD = subTreeDists[keyCD];
1907         double weightCD = subTreeWeights[keyCD];
1908 
1909         /*if (params->ls_var_type == PAUPLIN) {
1910             // this distance has a typo as also seen in Mihaescu & Pachter 2008
1911             //lsBranch = 0.25 * (distAC + distBD + distAD + distBC) - 0.5 * (distAB - distCD);
1912             outError("Paulin formula not supported yet");
1913             lsBranch = 0.25 * (distAC + distBD + distAD + distBC) - 0.5 * (distAB + distCD);
1914         } else*/ if (params->ls_var_type == OLS) {
1915             double gamma = (B * C + A * D) / ((A + B)*(C + D));
1916             lsBranch = 0.5 * (gamma * (distAC / (A * C) + distBD / (B * D))
1917                     + (1 - gamma) * (distBC / (B * C) + distAD / (A * D))
1918                     - distAB / (A * B) - distCD / (C * D));
1919         } else {
1920             // weighted least square
1921             double K = 1.0/weightAC + 1.0/weightBD + 1.0/weightAD + 1.0/weightBC;
1922             lsBranch =
1923                     ((distAC/weightAC+distBD/weightBD)*(weightAD+weightBC)/(weightAD*weightBC)+
1924                     (distAD/weightAD+distBC/weightBC)*(weightAC+weightBD)/(weightAC*weightBD))/K
1925                     - distAB/weightAB - distCD/weightCD;
1926             lsBranch = 0.5*lsBranch;
1927         }
1928     }
1929     return lsBranch;
1930 }
1931 
updateSubtreeDists(NNIMove & nnimove)1932 void PhyloTree::updateSubtreeDists(NNIMove &nnimove) {
1933     ASSERT(subTreeDistComputed);
1934     PhyloNode *nodeA = NULL, *nodeB = NULL, *nodeC = NULL, *nodeD = NULL;
1935     PhyloNode *node1 = nnimove.node1;
1936     PhyloNode *node2 = nnimove.node2;
1937     NeighborVec::iterator node1Nei_it = nnimove.node1Nei_it;
1938     NeighborVec::iterator node2Nei_it = nnimove.node2Nei_it;
1939     Neighbor *node1Nei = *(node1Nei_it);
1940     Neighbor *node2Nei = *(node2Nei_it);
1941 
1942     // ((A,C),(B,D))
1943     // C and D are the 2 subtree that get swapped
1944     FOR_NEIGHBOR_IT(node1, node2, it) {
1945         if ((*it)->id != node1Nei->id) {
1946             nodeA = (PhyloNode*) (*it)->node;
1947         } else {
1948             nodeC = (PhyloNode*) (*it)->node;
1949         }
1950     }
1951 
1952     ASSERT(nodeA);
1953     ASSERT(nodeC);
1954 
1955     FOR_NEIGHBOR_IT(node2, node1, it) {
1956         if ((*it)->id != node2Nei->id) {
1957             nodeB = (PhyloNode*) (*it)->node;
1958         } else {
1959             nodeD = (PhyloNode*) (*it)->node;
1960         }
1961     }
1962 
1963     ASSERT(nodeB);
1964     ASSERT(nodeD);
1965 
1966     NodeVector nodeListA, nodeListB, nodeListC, nodeListD;
1967     getAllNodesInSubtree(nodeA, node1, nodeListA);
1968     getAllNodesInSubtree(nodeC, node1, nodeListC);
1969     getAllNodesInSubtree(nodeB, node2, nodeListB);
1970     getAllNodesInSubtree(nodeD, node2, nodeListD);
1971 
1972     for (NodeVector::iterator it = nodeListA.begin(); it != nodeListA.end(); ++it) {
1973         string key = getBranchID((*it), node2);
1974         double distB = subTreeDists.find(getBranchID((*it), nodeB))->second;
1975         double distD = subTreeDists.find(getBranchID((*it), nodeD))->second;
1976         double newDist = distB + distD;
1977         StringDoubleMap::iterator dist_it = subTreeDists.find(key);
1978         ASSERT(dist_it != subTreeDists.end());
1979         dist_it->second = newDist;
1980     }
1981 
1982     for (NodeVector::iterator it = nodeListB.begin(); it != nodeListB.end(); ++it) {
1983         string key = getBranchID((*it), node1);
1984         double distC = subTreeDists.find(getBranchID((*it), nodeC))->second;
1985         double distA = subTreeDists.find(getBranchID((*it), nodeA))->second;
1986         double newDist = distC + distA;
1987         StringDoubleMap::iterator dist_it = subTreeDists.find(key);
1988         ASSERT(dist_it != subTreeDists.end());
1989         dist_it->second = newDist;
1990     }
1991 
1992     for (NodeVector::iterator it = nodeListC.begin(); it != nodeListC.end(); ++it) {
1993         string key = getBranchID((*it), node2);
1994         double distD = subTreeDists.find(getBranchID((*it), nodeD))->second;
1995         double distB = subTreeDists.find(getBranchID((*it), nodeB))->second;
1996         double newDist = distD + distB;
1997         StringDoubleMap::iterator dist_it = subTreeDists.find(key);
1998         ASSERT(dist_it != subTreeDists.end());
1999         dist_it->second = newDist;
2000     }
2001 
2002     for (NodeVector::iterator it = nodeListD.begin(); it != nodeListD.end(); ++it) {
2003         string key = getBranchID((*it), node1);
2004         double distA = subTreeDists.find(getBranchID((*it), nodeA))->second;
2005         double distC = subTreeDists.find(getBranchID((*it), nodeC))->second;
2006         double newDist = distA + distC;
2007         StringDoubleMap::iterator dist_it = subTreeDists.find(key);
2008         ASSERT(dist_it != subTreeDists.end());
2009         dist_it->second = newDist;
2010     }
2011 
2012     double distAB = subTreeDists.find(getBranchID(nodeA, nodeB))->second;
2013     double distAD = subTreeDists.find(getBranchID(nodeA, nodeD))->second;
2014     double distCB = subTreeDists.find(getBranchID(nodeC, nodeB))->second;
2015     double distCD = subTreeDists.find(getBranchID(nodeC, nodeD))->second;
2016 
2017     subTreeDists.find(getBranchID(node1, node2))->second = distAB + distAD + distCB + distCD;
2018 
2019 }
2020 
computeSubtreeDists()2021 void PhyloTree::computeSubtreeDists() {
2022     PhyloNodeVector unmarkedNodes;
2023     subTreeDists.clear();
2024     subTreeWeights.clear();
2025     do {
2026         // Generate a list of unmarked node that is adjacent to exactly one unmarked nodes
2027         // Here we will work up the tree in a bottom up manner
2028         unmarkedNodes.clear();
2029         getUnmarkedNodes(unmarkedNodes);
2030         if (unmarkedNodes.size() == 0)
2031             break;
2032 
2033         for (PhyloNodeVector::iterator it = unmarkedNodes.begin(); it != unmarkedNodes.end(); ++it) {
2034             // if the node is an internal node then all of its child nodes should be marked
2035             // source_nei1 and source_nei2 are the 2 marked child node
2036             // nextNode is the other node, used for traversal
2037             PhyloNode* source_nei1 = NULL;
2038             PhyloNode* source_nei2 = NULL;
2039             PhyloNode* nextNode;
2040             if (!(*it)->isLeaf()) {
2041                 // select the 2 marked child nodes
2042                 for (NeighborVec::iterator it2 = (*it)->neighbors.begin(); it2 != (*it)->neighbors.end(); ++it2) {
2043                     if (markedNodeList.find((*it2)->node->id) != markedNodeList.end()) {
2044                         if (!source_nei1) {
2045                             source_nei1 = (PhyloNode*) (*it2)->node;
2046                         } else {
2047                             source_nei2 = (PhyloNode*) (*it2)->node;
2048                         }
2049                     } else {
2050                         nextNode = (PhyloNode*) (*it2)->node;
2051                     }
2052                 }
2053                 ASSERT(source_nei1);
2054                 ASSERT(source_nei2);
2055             } else {
2056                 nextNode = (PhyloNode*) (*it)->neighbors[0]->node;
2057             }
2058             // warning: 'nextNode' may be used uninitialized in this function
2059             computeAllSubtreeDistForOneNode((*it), source_nei1, source_nei2, (*it), nextNode);
2060             markedNodeList.insert(IntPhyloNodeMap::value_type((*it)->id, (*it)));
2061         }
2062     } while (true);
2063     markedNodeList.clear();
2064     subTreeDistComputed = true;
2065 }
2066 
computeAllSubtreeDistForOneNode(PhyloNode * source,PhyloNode * source_nei1,PhyloNode * source_nei2,PhyloNode * node,PhyloNode * dad)2067 void PhyloTree::computeAllSubtreeDistForOneNode(PhyloNode* source, PhyloNode* source_nei1, PhyloNode* source_nei2,
2068         PhyloNode* node, PhyloNode* dad) {
2069     string key = getBranchID(source, dad);
2070     double dist, weight;
2071     if (markedNodeList.find(dad->id) != markedNodeList.end()) {
2072         return;
2073     } else if (source->isLeaf() && dad->isLeaf()) {
2074         ASSERT(dist_matrix);
2075         int nseq = aln->getNSeq();
2076         if (params->ls_var_type == OLS) {
2077             dist = dist_matrix[dad->id * nseq + source->id];
2078             weight = 1.0;
2079         } else {
2080             // this will take into account variances, also work for OLS since var = 1
2081             weight = 1.0/var_matrix[dad->id * nseq + source->id];
2082             dist = dist_matrix[dad->id * nseq + source->id] * weight;
2083         }
2084         subTreeDists.insert(StringDoubleMap::value_type(key, dist));
2085         subTreeWeights.insert(StringDoubleMap::value_type(key, weight));
2086     } else if (!source->isLeaf() && dad->isLeaf()) {
2087         ASSERT(source_nei1);
2088         ASSERT(source_nei2);
2089         string key1 = getBranchID(source_nei1, dad);
2090         ASSERT(subTreeDists.find(key1) == subTreeDists.end());
2091         double dist1 = subTreeDists.find(key1)->second;
2092         double weight1 = subTreeWeights.find(key1)->second;
2093         string key2 = getBranchID(source_nei2, dad);
2094         ASSERT(subTreeDists.find(key2) == subTreeDists.end());
2095         double dist2 = subTreeDists.find(key2)->second;
2096         double weight2 = subTreeWeights.find(key2)->second;
2097         dist = dist1 + dist2;
2098         weight = weight1 + weight2;
2099         subTreeDists.insert(StringDoubleMap::value_type(key, dist));
2100         subTreeWeights.insert(StringDoubleMap::value_type(key, weight));
2101     } else {
2102         PhyloNode* dad_nei1 = NULL;
2103         PhyloNode* dad_nei2 = NULL;
2104         for (NeighborVec::iterator it = dad->neighbors.begin(); it != dad->neighbors.end(); ++it) {
2105             if ((*it)->node != node) {
2106                 if (!dad_nei1) {
2107                     dad_nei1 = (PhyloNode*) (*it)->node;
2108                 } else {
2109                     dad_nei2 = (PhyloNode*) (*it)->node;
2110                 }
2111             }
2112         }
2113         ASSERT(dad_nei1);
2114         ASSERT(dad_nei2);
2115         computeAllSubtreeDistForOneNode(source, source_nei1, source_nei2, dad, dad_nei1);
2116         computeAllSubtreeDistForOneNode(source, source_nei1, source_nei2, dad, dad_nei2);
2117         string key1 = getBranchID(source, dad_nei1);
2118         string key2 = getBranchID(source, dad_nei2);
2119         ASSERT(subTreeDists.find(key1) != subTreeDists.end());
2120         ASSERT(subTreeDists.find(key2) != subTreeDists.end());
2121         double dist1 = subTreeDists.find(key1)->second;
2122         double weight1 = subTreeWeights.find(key1)->second;
2123         double dist2 = subTreeDists.find(key2)->second;
2124         double weight2 = subTreeWeights.find(key2)->second;
2125         dist = dist1 + dist2;
2126         weight = weight1 + weight2;
2127         subTreeDists.insert(StringDoubleMap::value_type(key, dist));
2128         subTreeWeights.insert(StringDoubleMap::value_type(key, weight));
2129     }
2130 }
2131 
computeNodeBranchDists(Node * node,Node * dad)2132 set<int> PhyloTree::computeNodeBranchDists(Node *node, Node *dad) {
2133     set<int>::iterator i, j;
2134     if (!nodeBranchDists) {
2135         cout << "nodeNum = " << nodeNum << endl;
2136         nodeBranchDists = new int[nodeNum*nodeNum];
2137     }
2138     if (!node) {
2139         memset(nodeBranchDists, 0, sizeof(int)*nodeNum*nodeNum);
2140         ASSERT(root->isLeaf());
2141         dad = root;
2142         node = dad->neighbors[0]->node;
2143         set<int> res = computeNodeBranchDists(node, dad);
2144         for (i = res.begin(); i != res.end(); i++)
2145             nodeBranchDists[(*i)*nodeNum + dad->id] = nodeBranchDists[(dad->id)*nodeNum + (*i)] =
2146                 nodeBranchDists[(*i)*nodeNum + node->id] + 1;
2147         // sanity check that all distances are filled
2148         for (int x = 0; x < nodeNum; x++)
2149             for (int y = 0; y < nodeNum; y++)
2150                 if (x != y)
2151                     ASSERT(nodeBranchDists[x*nodeNum+y] != 0);
2152                 else
2153                     ASSERT(nodeBranchDists[x*nodeNum+y] == 0);
2154         return res;
2155     }
2156     if (node->isLeaf()) {
2157         set<int> res;
2158         res.insert(node->id);
2159         return res;
2160     }
2161     ASSERT(node->degree() == 3);
2162     Node *left = NULL, *right = NULL;
2163     FOR_NEIGHBOR_IT(node, dad, it) {
2164         if (!left) left = (*it)->node; else right = (*it)->node;
2165     }
2166     set<int> resl = computeNodeBranchDists(left, node);
2167     set<int> resr = computeNodeBranchDists(right, node);
2168     for (i = resl.begin(); i != resl.end(); i++)
2169         nodeBranchDists[(*i)*nodeNum + node->id] = nodeBranchDists[(node->id)*nodeNum + (*i)] =
2170             nodeBranchDists[(*i)*nodeNum + left->id] + 1;
2171     for (i = resr.begin(); i != resr.end(); i++)
2172         nodeBranchDists[(*i)*nodeNum + node->id] = nodeBranchDists[(node->id)*nodeNum + (*i)] =
2173             nodeBranchDists[(*i)*nodeNum + right->id] + 1;
2174     for (i = resl.begin(); i != resl.end(); i++)
2175         for (j = resr.begin(); j != resr.end(); j++)
2176             nodeBranchDists[(*i)*nodeNum + (*j)] = nodeBranchDists[(*j)*nodeNum+(*i)] =
2177                 nodeBranchDists[(*i)*nodeNum+node->id]+nodeBranchDists[(*j)*nodeNum+node->id];
2178     resl.insert(resr.begin(), resr.end());
2179     resl.insert(node->id);
2180     return resl;
2181 }
2182 
2183 
2184 /*
2185     b0: initial guess for the maximum
2186 */
approxOneBranch(PhyloNode * node,PhyloNode * dad,double b0)2187 double PhyloTree::approxOneBranch(PhyloNode *node, PhyloNode *dad, double b0) {
2188     double b_max, ddl, b1, b2, std, seqlen;
2189     double t1, t3, t5, t11, t18, t21, t26, t29, t30, t32, t44, t46, t48;
2190     double beps = 1/DBL_MAX;
2191 
2192     /* TODO: insert call to get sequence length */
2193     seqlen = getAlnNSite();
2194 
2195     /* use a robust first order approximation to the variance */
2196     std = sqrt(b0/seqlen);
2197 
2198     /* determine neighbour points */
2199     b1 = b0 - std;
2200     if (b1<=0) b1 = beps; /* only happens for b<=1 with small seq. len. */
2201     b2 = b0 + std;
2202 
2203     /* TODO: insert calls to log-likelihood function */
2204     PhyloNeighbor *dad_nei = (PhyloNeighbor*)(dad->findNeighbor(node));
2205     PhyloNeighbor *node_nei = (PhyloNeighbor*)(node->findNeighbor(dad));
2206     double old_len = dad_nei->length;
2207     dad_nei->length = node_nei->length = b0;
2208     double l0 = computeLikelihoodBranch(dad_nei, dad);
2209     dad_nei->length = node_nei->length = b1;
2210     double l1 = computeLikelihoodBranch(dad_nei, dad);
2211     dad_nei->length = node_nei->length = b2;
2212     double l2 = computeLikelihoodBranch(dad_nei, dad);
2213     dad_nei->length = node_nei->length = old_len;
2214 
2215     t1 = sqrt(b0);
2216     t3 = sqrt(b2);
2217     t5 = sqrt(b1);
2218     t11 = pow(-t1*l2+t3*l0+t5*l2+t1*l1-t5*l0-t3*l1,2.0);
2219     t18 = -b0*l2+b2*l0+b1*l2+b0*l1-b1*l0-b2*l1;
2220     t21 = t1-t5;
2221     t26 = -t1*t3+t1*t5+b2-t5*t3;
2222     t29 = t18*t18;
2223     t30 = 1/t11;
2224     t32 = sqrt(t29*t30);
2225     ddl = -2.0*t11/t18/t21/t26/t32;
2226 
2227     if (ddl > 0) {
2228         /* the analytic extremum is a minimum,
2229            so the maximum is at the lower bound */
2230         b_max = 0;
2231     } else {
2232         t44 = pow(-t1*b2+t5*b2-t5*b0+t3*b0-t3*b1+t1*b1,2.0);
2233         t46 = t21*t21;
2234         t48 = t26*t26;
2235         b_max = t29*t44/t46/t48*t30/4.0;
2236     }
2237 
2238     return(b_max);
2239 }
2240 
approxAllBranches(PhyloNode * node,PhyloNode * dad)2241 void PhyloTree::approxAllBranches(PhyloNode *node, PhyloNode *dad) {
2242     if (!node) {
2243         node = (PhyloNode*) root;
2244     }
2245 
2246     if (dad) {
2247         PhyloNeighbor *node_dad_nei = (PhyloNeighbor*) node->findNeighbor(dad);
2248         PhyloNeighbor *dad_node_nei = (PhyloNeighbor*) dad->findNeighbor(node);
2249         double len = approxOneBranch(node, dad, dad_node_nei->length);
2250         node_dad_nei->length = len;
2251         dad_node_nei->length = len;
2252     }
2253 
2254     for (NeighborVec::iterator it = (node)->neighbors.begin(); it != (node)->neighbors.end(); it++)
2255         if ((*it)->node != (dad)) {
2256             approxAllBranches((PhyloNode*) (*it)->node, node);
2257         }
2258 }
2259 
2260 /*
2261  void PhyloTree::computeAllSubtreeDists(PhyloNode* node, PhyloNode* dad) {
2262  if (!node) {
2263  node = (PhyloNode*) root;
2264  }
2265 
2266  if (dad) {
2267  // This function compute all pairwise subtree distance between subtree rooted at dad and others
2268 
2269  computeSubtreeDists(node, dad);
2270  }
2271 
2272  FOR_NEIGHBOR_IT(node, dad, it) {
2273 
2274  computeAllSubtreeDists((PhyloNode*) (*it)->node, node);
2275  }
2276  }
2277 
2278  void PhyloTree::computeSubtreeDists(PhyloNode* node, PhyloNode* dad) {
2279  // if both nodes are leaf then it is trivial, just retrieve the values from the distance matrix
2280  if (dad->isLeaf() && node->isLeaf()) {
2281  string key = nodePair2String(dad, node);
2282  assert(dist_matrix);
2283  int nseq = aln->getNSeq();
2284  double dist = dist_matrix[dad->id * nseq + node->id];
2285  interSubtreeDistances.insert(StringDoubleMap::value_type(key, dist));
2286  } else if (!dad->isLeaf() && node->isLeaf()) {
2287 
2288  FOR_NEIGHBOR_IT(node, dad, it) {
2289 
2290  computeSubtreeDists(dad, (PhyloNode*) (*it)->node);
2291  }
2292 
2293  }
2294  }
2295  */
2296 
computeBayesianBranchLength(PhyloNeighbor * dad_branch,PhyloNode * dad)2297 double PhyloTree::computeBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad) {
2298     double obsLen = 0.0;
2299     PhyloNode *node = (PhyloNode*) dad_branch->node;
2300     PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
2301     ASSERT(node_branch);
2302     /*
2303      if (node->isLeaf() || dad->isLeaf()) {
2304      return -1.0;
2305      }*/
2306      // TODO
2307 //    if ((dad_branch->partial_lh_computed & 1) == 0)
2308 //        computePartialLikelihood(dad_branch, dad);
2309 //    if ((node_branch->partial_lh_computed & 1) == 0)
2310 //        computePartialLikelihood(node_branch, node);
2311     // now combine likelihood at the branch
2312     int nstates = aln->num_states;
2313     int numCat = site_rate->getNRate();
2314     size_t block = numCat * nstates;
2315     size_t nptn = aln->size();
2316     size_t ptn;
2317     int cat, state;
2318     double *tmp_state_freq = new double[nstates];
2319     double *tmp_anscentral_state_prob1 = new double[nstates];
2320     double *tmp_anscentral_state_prob2 = new double[nstates];
2321 
2322     //computeLikelihoodBranchNaive(dad_branch, dad, NULL, tmp_ptn_rates);
2323     //double sum_rates = 0.0;
2324     //for (ptn = 0; ptn < nptn; ptn++)
2325     //    sum_rates += tmp_ptn_rates[ptn] * aln->at(ptn).frequency;
2326     //cout << "sum_rates = " << sum_rates << endl;
2327 
2328     model->getStateFrequency(tmp_state_freq);
2329 
2330     for (ptn = 0; ptn < nptn; ptn++) {
2331         // Compute the probability of each state for the current site
2332         double sum_prob1 = 0.0, sum_prob2 = 0.0;
2333         size_t offset = ptn * block;
2334         double *partial_lh_site = node_branch->partial_lh + (offset);
2335         double *partial_lh_child = dad_branch->partial_lh + (offset);
2336         for (state = 0; state < nstates; state++) {
2337             tmp_anscentral_state_prob1[state] = 0.0;
2338             tmp_anscentral_state_prob2[state] = 0.0;
2339             for (cat = 0; cat < numCat; cat++) {
2340                 tmp_anscentral_state_prob1[state] += partial_lh_site[nstates * cat + state];
2341                 tmp_anscentral_state_prob2[state] += partial_lh_child[nstates * cat + state];
2342             }
2343             tmp_anscentral_state_prob1[state] *= tmp_state_freq[state];
2344             tmp_anscentral_state_prob2[state] *= tmp_state_freq[state];
2345             sum_prob1 += tmp_anscentral_state_prob1[state];
2346             sum_prob2 += tmp_anscentral_state_prob2[state];
2347         }
2348         bool sameState = false;
2349         int state1 = 0, state2 = 0;
2350         double cutoff = 1.0/nstates;
2351         for (state = 0; state < nstates; state++) {
2352             tmp_anscentral_state_prob1[state] /= sum_prob1;
2353             tmp_anscentral_state_prob2[state] /= sum_prob2;
2354             if (tmp_anscentral_state_prob1[state] > tmp_anscentral_state_prob1[state1])
2355                 state1 = state;
2356             if (tmp_anscentral_state_prob2[state] > tmp_anscentral_state_prob2[state2])
2357                 state2 = state;
2358             if (tmp_anscentral_state_prob1[state] > cutoff && tmp_anscentral_state_prob2[state] > cutoff)
2359                 sameState = true;
2360         }
2361         sameState = sameState || (state1 == state2);
2362         if (!sameState) {
2363             obsLen += aln->at(ptn).frequency;
2364         }
2365 
2366     }
2367     obsLen /= getAlnNSite();
2368     if (obsLen < params->min_branch_length)
2369         obsLen = params->min_branch_length;
2370     delete[] tmp_anscentral_state_prob2;
2371     delete[] tmp_anscentral_state_prob1;
2372     delete[] tmp_state_freq;
2373 
2374     return obsLen;
2375 }
2376 
correctBranchLengthF81(double observedBran,double alpha)2377 double PhyloTree::correctBranchLengthF81(double observedBran, double alpha) {
2378     if (!model)
2379         return JukesCantorCorrection(observedBran, alpha);
2380     double H = 0.0;
2381     double correctedBranLen;
2382     for (int i = 0; i < model->num_states; i++) {
2383         H += model->state_freq[i] * (1 - model->state_freq[i]);
2384     }
2385     observedBran = 1.0 - observedBran / H;
2386     // no gamma
2387     if (observedBran <= 0.0)
2388         return params->max_branch_length;
2389 
2390     if (alpha <= 0.0) {
2391         correctedBranLen = -H * log(observedBran);
2392     } else {
2393         //if (verbose_mode >= VB_MAX) cout << "alpha: " << alpha << endl;
2394 
2395         correctedBranLen = H * alpha * (pow(observedBran, -1 / alpha) - 1);
2396     }
2397 
2398     // Branch lengths under PoMo are #events, which is ~N^2 * #substitutions
2399     if (aln->seq_type == SEQ_POMO)
2400         correctedBranLen *= aln->virtual_pop_size * aln->virtual_pop_size;
2401 
2402     if (correctedBranLen < params->min_branch_length)
2403         correctedBranLen = params->min_branch_length;
2404     if (correctedBranLen > params->max_branch_length)
2405         correctedBranLen = params->max_branch_length;
2406 
2407     return correctedBranLen;
2408 }
2409 
computeCorrectedBayesianBranchLength(PhyloNeighbor * dad_branch,PhyloNode * dad)2410 double PhyloTree::computeCorrectedBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad) {
2411     double observedBran = computeBayesianBranchLength(dad_branch, dad);
2412     return correctBranchLengthF81(observedBran, site_rate->getGammaShape());
2413 }
2414 
computeAllBayesianBranchLengths(Node * node,Node * dad)2415 void PhyloTree::computeAllBayesianBranchLengths(Node *node, Node *dad) {
2416 
2417     if (!node)
2418         node = root;
2419 
2420     FOR_NEIGHBOR_IT(node, dad, it){
2421         double branch_length = computeBayesianBranchLength((PhyloNeighbor*) (*it), (PhyloNode*) node);
2422         (*it)->length = branch_length;
2423         // set the backward branch length
2424         (*it)->node->findNeighbor(node)->length = (*it)->length;
2425         computeAllBayesianBranchLengths((*it)->node, node);
2426     }
2427 }
2428 
2429 
computeLikelihoodZeroBranch(PhyloNeighbor * dad_branch,PhyloNode * dad)2430 double PhyloTree::computeLikelihoodZeroBranch(PhyloNeighbor *dad_branch, PhyloNode *dad) {
2431     double lh_zero_branch;
2432     double saved_len = dad_branch->length;
2433     PhyloNeighbor *node_branch = (PhyloNeighbor*) dad_branch->node->findNeighbor(dad);
2434     dad_branch->length = 0.0;
2435     node_branch->length = 0.0;
2436     lh_zero_branch = computeLikelihoodBranch(dad_branch, dad);
2437     // restore branch length
2438     dad_branch->length = saved_len;
2439     node_branch->length = saved_len;
2440 
2441     return lh_zero_branch;
2442 }
2443 
2444 
2445 /****************************************************************************
2446  Branch length optimization by maximum likelihood
2447  ****************************************************************************/
2448 
2449 //const double MIN_TREE_LENGTH_SCALE = 0.001;
2450 //const double MAX_TREE_LENGTH_SCALE = 100.0;
2451 const double TOL_TREE_LENGTH_SCALE = 0.001;
2452 
2453 
optimizeTreeLengthScaling(double min_scaling,double & scaling,double max_scaling,double gradient_epsilon)2454 double PhyloTree::optimizeTreeLengthScaling(double min_scaling, double &scaling, double max_scaling, double gradient_epsilon) {
2455     is_opt_scaling = true;
2456     current_scaling = scaling;
2457     double negative_lh, ferror;
2458     // 2018-08-20: make sure that max and min branch lengths do not go over bounds
2459     vector<DoubleVector> brlens;
2460     brlens.resize(branchNum);
2461     getBranchLengths(brlens);
2462     double min_brlen = params->max_branch_length, max_brlen = params->min_branch_length;
2463     for (auto brlenvec = brlens.begin(); brlenvec != brlens.end(); brlenvec++) {
2464         for (auto brlen = brlenvec->begin(); brlen != brlenvec->end(); brlen++) {
2465             max_brlen = max(max_brlen, *brlen);
2466             min_brlen = min(min_brlen, *brlen);
2467         }
2468     }
2469     if (min_brlen <= 0.0) min_brlen = params->min_branch_length;
2470     if (max_scaling > 10.0*params->max_branch_length / max_brlen)
2471         max_scaling = 10.0*params->max_branch_length / max_brlen;
2472     if (min_scaling < 0.1*params->min_branch_length / min_brlen)
2473         min_scaling = 0.1*params->min_branch_length / min_brlen;
2474 
2475     scaling = minimizeOneDimen(min(scaling, min_scaling), scaling, max(max_scaling, scaling), max(TOL_TREE_LENGTH_SCALE, gradient_epsilon), &negative_lh, &ferror);
2476     if (scaling != current_scaling) {
2477         scaleLength(scaling / current_scaling);
2478         current_scaling = scaling;
2479         clearAllPartialLH();
2480     }
2481     is_opt_scaling = false;
2482     return computeLikelihood();
2483 }
2484 
printTreeLengthScaling(const char * filename)2485 void PhyloTree::printTreeLengthScaling(const char *filename) {
2486 //    double treescale = 1.0;
2487 //
2488 //    cout << "Optimizing tree length scaling ..." << endl;
2489 //
2490 //    double lh = optimizeTreeLengthScaling(MIN_BRLEN_SCALE, treescale, MAX_BRLEN_SCALE, 0.001);
2491 //
2492 //    cout << "treescale: " << treescale << " / LogL: " << lh << endl;
2493 
2494     Checkpoint *saved_checkpoint = getModelFactory()->getCheckpoint();
2495     Checkpoint *new_checkpoint = new Checkpoint;
2496     new_checkpoint->setFileName(filename);
2497     new_checkpoint->setCompression(false);
2498     new_checkpoint->setHeader("IQ-TREE scaled tree length and model parameters");
2499     new_checkpoint->put("treelength", treeLength());
2500     saved_checkpoint->put("treelength", treeLength()); // also put treelength into current checkpoint
2501 
2502     getModelFactory()->setCheckpoint(new_checkpoint);
2503     getModelFactory()->saveCheckpoint();
2504     new_checkpoint->dump();
2505 
2506     getModelFactory()->setCheckpoint(saved_checkpoint);
2507 }
2508 
computeFunction(double value)2509 double PhyloTree::computeFunction(double value) {
2510     if (!is_opt_scaling) {
2511         current_it->length = value;
2512         current_it_back->length = value;
2513         return -computeLikelihoodBranch(current_it, (PhyloNode*) current_it_back->node);
2514     } else {
2515         if (value != current_scaling) {
2516             scaleLength(value / current_scaling);
2517             current_scaling = value;
2518             clearAllPartialLH();
2519         }
2520         return -computeLikelihood();
2521     }
2522 }
2523 
computeFuncDerv(double value,double & df,double & ddf)2524 void PhyloTree::computeFuncDerv(double value, double &df, double &ddf) {
2525     current_it->length = value;
2526     current_it_back->length = value;
2527 //    double lh;
2528 //    lh = -computeLikelihoodDerv(current_it, (PhyloNode*) current_it_back->node, df, ddf);
2529     computeLikelihoodDerv(current_it, (PhyloNode*) current_it_back->node, &df, &ddf);
2530 
2531     df = -df;
2532     ddf = -ddf;
2533 
2534 //    return lh;
2535 }
2536 
optimizePatternRates(DoubleVector & pattern_rates)2537 void PhyloTree::optimizePatternRates(DoubleVector &pattern_rates) {
2538     size_t nptn = aln->getNPattern();
2539     pattern_rates.resize(nptn, 1.0);
2540 #pragma omp parallel for
2541     for (size_t ptn = 0; ptn < nptn; ptn++) {
2542         Alignment *paln = new Alignment;
2543         IntVector ptn_id;
2544         ptn_id.push_back(ptn);
2545         paln->extractPatterns(aln, ptn_id);
2546         PhyloTree *tree = new PhyloTree;
2547         tree->copyPhyloTree(this);
2548         tree->setParams(params);
2549         tree->setAlignment(paln);
2550         tree->sse = sse;
2551         tree->num_threads = 1;
2552         // initialize model
2553         tree->setModelFactory(getModelFactory());
2554 
2555         // main optimization
2556         tree->optimizeTreeLengthScaling(MIN_SITE_RATE, pattern_rates[ptn], MAX_SITE_RATE, 0.0001);
2557 
2558         tree->setModelFactory(NULL);
2559 
2560         delete tree;
2561         delete paln;
2562     }
2563 }
2564 
getNBranchParameters(int brlen_type)2565 int PhyloTree::getNBranchParameters(int brlen_type) {
2566     if (params->fixed_branch_length || brlen_type == BRLEN_FIX)
2567         return 0;
2568 
2569     int df = 0;
2570 
2571     if (brlen_type == BRLEN_OPTIMIZE) {
2572         df = branchNum - (int)rooted;
2573     // If model is Lie-Markov, and is in fact time reversible, one of the
2574     // degrees of freedom is illusary. (Of the two edges coming from the
2575     // root, only sum of their lenghts affects likelihood.)
2576     // So correct for this. Without this correction, K2P and RY2.2b
2577     // would not be synonymous, for example.
2578 
2579 //    string className(typeid(*model).name());
2580 //    if (className.find("ModelLieMarkov")!=string::npos && model->isReversible())
2581 //        df--;
2582 
2583         // BQM 2017-04-28, alternatively, check if there is a virtual_root and model is reversible
2584         if (rooted && model && model->isReversible())
2585             df--;
2586 
2587     } else if (brlen_type == BRLEN_SCALE)
2588         df = 1;
2589     return df;
2590 }
2591 
optimizeOneBranch(PhyloNode * node1,PhyloNode * node2,bool clearLH,int maxNRStep)2592 void PhyloTree::optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, bool clearLH, int maxNRStep) {
2593 
2594     if (rooted && (node1 == root || node2 == root))
2595         return; // does not optimize virtual branch from root
2596 
2597     double negative_lh;
2598     current_it = (PhyloNeighbor*) node1->findNeighbor(node2);
2599     ASSERT(current_it);
2600     current_it_back = (PhyloNeighbor*) node2->findNeighbor(node1);
2601     ASSERT(current_it_back);
2602 
2603     double current_len = current_it->length;
2604     double ferror, optx;
2605     ASSERT(current_len >= 0.0);
2606     theta_computed = false;
2607 //    mem_slots.cleanup();
2608     if (optimize_by_newton) {
2609         // Newton-Raphson method
2610         optx = minimizeNewton(params->min_branch_length, current_len, params->max_branch_length, params->min_branch_length, negative_lh, maxNRStep);
2611         if (verbose_mode >= VB_DEBUG) {
2612             cout << "minimizeNewton logl: " << computeLikelihoodFromBuffer() << endl;
2613         }
2614         if (optx > params->max_branch_length*0.95 && !isSuperTree()) {
2615             // newton raphson diverged, reset
2616             double opt_lh = computeLikelihoodFromBuffer();
2617             current_it->length = current_len;
2618             current_it_back->length = current_len;
2619             double orig_lh = computeLikelihoodFromBuffer();
2620             if (orig_lh > opt_lh) {
2621                 optx = current_len;
2622             }
2623         }
2624     }    else {
2625         // Brent method
2626         optx = minimizeOneDimen(params->min_branch_length, current_len, params->max_branch_length, params->min_branch_length, &negative_lh, &ferror);
2627         if (verbose_mode >= VB_MAX) {
2628             cout << "minimizeBrent logl: " << -negative_lh << endl;
2629         }
2630     }
2631 
2632     current_it->length = optx;
2633     current_it_back->length = optx;
2634     //curScore = -negative_lh;
2635 
2636     if (clearLH && current_len != optx) {
2637         node1->clearReversePartialLh(node2);
2638         node2->clearReversePartialLh(node1);
2639     }
2640 
2641 //    return -negative_lh;
2642 }
2643 
optimizeChildBranches(PhyloNode * node,PhyloNode * dad)2644 double PhyloTree::optimizeChildBranches(PhyloNode *node, PhyloNode *dad) {
2645 
2646 //    double tree_lh = 0.0;
2647 
2648     FOR_NEIGHBOR_DECLARE(node, dad, it){
2649 
2650 //    tree_lh = optimizeOneBranch((PhyloNode*) node, (PhyloNode*) (*it)->node);
2651         optimizeOneBranch((PhyloNode*) node, (PhyloNode*) (*it)->node);
2652     }
2653     return computeLikelihoodFromBuffer();
2654 //    return tree_lh;
2655 }
2656 
optimizeAllBranchesLS(PhyloNode * node,PhyloNode * dad)2657 void PhyloTree::optimizeAllBranchesLS(PhyloNode *node, PhyloNode *dad) {
2658     if (!node) {
2659         node = (PhyloNode*) root;
2660     }
2661 
2662     if (dad) {
2663         double lsBran = optimizeOneBranchLS(node, dad);
2664         PhyloNeighbor *node_dad_nei = (PhyloNeighbor*) node->findNeighbor(dad);
2665         PhyloNeighbor *dad_node_nei = (PhyloNeighbor*) dad->findNeighbor(node);
2666         node_dad_nei->length = lsBran;
2667         dad_node_nei->length = lsBran;
2668     }
2669 
2670     for (NeighborVec::iterator it = (node)->neighbors.begin(); it != (node)->neighbors.end(); it++)
2671         if ((*it)->node != (dad)) {
2672             optimizeAllBranchesLS((PhyloNode*) (*it)->node, node);
2673         }
2674 }
2675 
optimizeAllBranches(PhyloNode * node,PhyloNode * dad,int maxNRStep)2676 void PhyloTree::optimizeAllBranches(PhyloNode *node, PhyloNode *dad, int maxNRStep) {
2677 //    double tree_lh = -DBL_MAX;
2678 
2679     if (!node) node = (PhyloNode*)root;
2680 
2681     for (NeighborVec::iterator it = (node)->neighbors.begin(); it != (node)->neighbors.end(); it++)
2682         if ((*it)->node != (dad)) {
2683             optimizeAllBranches((PhyloNode*) (*it)->node, node, maxNRStep);
2684         }
2685     if (dad)
2686         optimizeOneBranch(node, dad, true, maxNRStep); // BQM 2014-02-24: true was missing
2687 
2688 //    return tree_lh;
2689 }
2690 
computeBestTraversal(NodeVector & nodes,NodeVector & nodes2)2691 void PhyloTree::computeBestTraversal(NodeVector &nodes, NodeVector &nodes2) {
2692     Node *farleaf = findFarthestLeaf();
2693 //    Node *farleaf = root;
2694 
2695     // double call to farthest leaf to find the longest path on the tree
2696     findFarthestLeaf(farleaf);
2697     if (verbose_mode >= VB_MAX)
2698         cout << "Tree diameter: " << farleaf->height << endl;
2699     getPreOrderBranches(nodes, nodes2, farleaf);
2700 }
2701 
optimizeAllBranches(int my_iterations,double tolerance,int maxNRStep)2702 double PhyloTree::optimizeAllBranches(int my_iterations, double tolerance, int maxNRStep) {
2703     if (verbose_mode >= VB_MAX)
2704         cout << "Optimizing branch lengths (max " << my_iterations << " loops)..." << endl;
2705 
2706     NodeVector nodes, nodes2;
2707     computeBestTraversal(nodes, nodes2);
2708 
2709     double tree_lh = computeLikelihoodBranch((PhyloNeighbor*)nodes[0]->findNeighbor(nodes2[0]), (PhyloNode*)nodes[0]);
2710 
2711     if (verbose_mode >= VB_MAX) {
2712         cout << "Initial tree log-likelihood: " << tree_lh << endl;
2713     }
2714     DoubleVector lenvec;
2715     //cout << tree_lh << endl;
2716     for (int i = 0; i < my_iterations; i++) {
2717 //        string string_brlen = getTreeString();
2718         saveBranchLengths(lenvec);
2719 //        if (verbose_mode >= VB_DEBUG) {
2720 //            printTree(cout, WT_BR_LEN+WT_NEWLINE);
2721 //        }
2722 
2723         for (int j = 0; j < nodes.size(); j++) {
2724             optimizeOneBranch((PhyloNode*)nodes[j], (PhyloNode*)nodes2[j]);
2725             if (verbose_mode >= VB_MAX) {
2726                 cout << "Branch " << nodes[j]->id << " " << nodes2[j]->id << ": " << computeLikelihoodFromBuffer() << endl;
2727             }
2728         }
2729 
2730 //        if (i == 0)
2731 //            optimizeOneBranch((PhyloNode*)nodes[0], (PhyloNode*)nodes2[0]);
2732 //        if (i % 2 == 0) {
2733 //            for (int j = 1; j < nodes.size(); j++)
2734 //                optimizeOneBranch((PhyloNode*)nodes[j], (PhyloNode*)nodes2[j]);
2735 //        } else {
2736 //            for (int j = nodes.size()-2; j >= 0; j--)
2737 //                optimizeOneBranch((PhyloNode*)nodes[j], (PhyloNode*)nodes2[j]);
2738 //        }
2739 
2740 //            optimizeAllBranches((PhyloNode*) root, NULL, maxNRStep);
2741 
2742         double new_tree_lh = computeLikelihoodFromBuffer();
2743         //cout<<"After opt  log-lh = "<<new_tree_lh<<endl;
2744 
2745         if (verbose_mode >= VB_MAX) {
2746             cout << "Likelihood after iteration " << i + 1 << " : ";
2747             cout << new_tree_lh << endl;
2748         }
2749 
2750 //        if (verbose_mode >= VB_DEBUG) {
2751 //            printTree(cout, WT_BR_LEN+WT_NEWLINE);
2752 //        }
2753 
2754 //        if (new_tree_lh < tree_lh - 10.0) { // make sure that the new tree likelihood never decreases too much
2755 //            cout << "ERROR: Branch length optimization failed as log-likelihood decreases too much: " << tree_lh << "  --> " << new_tree_lh << endl;
2756 //            getModel()->writeInfo(cout);
2757 //            getRate()->writeInfo(cout);
2758 //            assert(new_tree_lh >= tree_lh - 10.0);
2759 //        }
2760 
2761 
2762         if (new_tree_lh < tree_lh - tolerance*0.1) {
2763             // IN RARE CASE: tree log-likelihood decreases, revert the branch length and stop
2764             if (verbose_mode >= VB_MED)
2765                 cout << "NOTE: Restoring branch lengths as tree log-likelihood decreases after branch length optimization: "
2766                     << tree_lh << " -> " << new_tree_lh << endl;
2767 
2768             clearAllPartialLH();
2769             restoreBranchLengths(lenvec);
2770 
2771             //clearAllPartialLH();
2772 //            readTreeString(string_brlen);
2773             double max_delta_lh = 1.0;
2774             // Increase max delta with PoMo because log likelihood is very much lower.
2775             if (aln->seq_type == SEQ_POMO) max_delta_lh = 3.0;
2776             new_tree_lh = computeLikelihood();
2777             if (fabs(new_tree_lh-tree_lh) > max_delta_lh) {
2778                 printTree(cout);
2779                 cout << endl;
2780                 cout << "new_tree_lh: " << new_tree_lh << "   tree_lh: " << tree_lh << endl;
2781             }
2782             ASSERT(fabs(new_tree_lh-tree_lh) < max_delta_lh);
2783             return new_tree_lh;
2784         }
2785 
2786         // only return if the new_tree_lh >= tree_lh! (in rare case that likelihood decreases, continue the loop)
2787         if (tree_lh <= new_tree_lh && new_tree_lh <= tree_lh + tolerance) {
2788             curScore = new_tree_lh;
2789             return new_tree_lh;
2790         }
2791         tree_lh = new_tree_lh;
2792     }
2793     curScore = tree_lh;
2794     return tree_lh;
2795 }
2796 
moveRoot(Node * node1,Node * node2)2797 void PhyloTree::moveRoot(Node *node1, Node *node2) {
2798     // unplug root from tree
2799     Node *root_dad = root->neighbors[0]->node;
2800     Node *root_nei1 = NULL, *root_nei2 = NULL;
2801     double len = 0.0;
2802     FOR_NEIGHBOR_IT(root_dad, root, it) {
2803         if (!root_nei1)
2804             root_nei1 = (*it)->node;
2805         else if (!root_nei2)
2806             root_nei2 = (*it)->node;
2807         else
2808             outError("Cannot move multifurcating root branch");
2809         len += (*it)->length;
2810     }
2811     root_nei1->updateNeighbor(root_dad, root_nei2, len);
2812     root_nei2->updateNeighbor(root_dad, root_nei1, len);
2813 
2814     // plug root to new branch
2815     len = node1->findNeighbor(node2)->length / 2.0;
2816     root_dad->updateNeighbor(root_nei1, node1, len);
2817     node1->updateNeighbor(node2, root_dad, len);
2818     root_dad->updateNeighbor(root_nei2, node2, len);
2819     node2->updateNeighbor(node1, root_dad, len);
2820 
2821     if (isSuperTree()) {
2822         ((PhyloSuperTree*) this)->mapTrees();
2823     }
2824     if (Params::getInstance().pll) {
2825         pllReadNewick(getTreeString());
2826     }
2827     resetCurScore();
2828     if (Params::getInstance().fixStableSplits || Params::getInstance().adaptPertubation) {
2829         buildNodeSplit();
2830     }
2831     current_it = current_it_back = NULL;
2832     clearBranchDirection();
2833     computeBranchDirection();
2834 }
2835 
optimizeRootPosition(int root_dist,bool write_info,double logl_epsilon)2836 double PhyloTree::optimizeRootPosition(int root_dist, bool write_info, double logl_epsilon) {
2837     if (!rooted)
2838         return curScore;
2839 
2840     NodeVector nodes1, nodes2;
2841     getBranches(root_dist+1, nodes1, nodes2);
2842     int i;
2843     Node *root_dad = root->neighbors[0]->node;
2844 
2845     double best_score = curScore;
2846     string best_tree = getTreeString();
2847 
2848     StrVector trees;
2849 
2850     // ignore branches directly descended from root branch
2851     for (i = 0; i != nodes1.size(); )
2852         if (nodes1[i] == root_dad || nodes2[i] == root_dad) {
2853             nodes1[i] = nodes1[nodes1.size()-1];
2854             nodes2[i] = nodes2[nodes2.size()-1];
2855             nodes1.pop_back();
2856             nodes2.pop_back();
2857         } else {
2858             i++;
2859         }
2860 
2861     // get all trees
2862     for (i = 0; i != nodes1.size(); i++) {
2863         moveRoot(nodes1[i], nodes2[i]);
2864         trees.push_back(getTreeString());
2865     }
2866 
2867     // optimize branch lengths for all trees
2868     for (auto t = trees.begin(); t != trees.end(); t++) {
2869         readTreeString(*t);
2870         optimizeAllBranches(100, logl_epsilon);
2871         if (verbose_mode >= VB_MED) {
2872             cout << "Root pos " << (t - trees.begin())+1 << ": " << curScore << endl;
2873             if (verbose_mode >= VB_DEBUG)
2874                 drawTree(cout);
2875         }
2876         if (curScore > best_score + logl_epsilon) {
2877             if (verbose_mode >= VB_MED || write_info)
2878                 cout << "Better root: " << curScore << endl;
2879             best_score = curScore;
2880             best_tree = getTreeString();
2881         }
2882     }
2883 
2884     readTreeString(best_tree);
2885     curScore = computeLikelihood();
2886 
2887     ASSERT(fabs(curScore-best_score) < logl_epsilon);
2888 
2889     return curScore;
2890 }
2891 
testRootPosition(bool write_info,double logl_epsilon)2892 double PhyloTree::testRootPosition(bool write_info, double logl_epsilon) {
2893     if (!rooted)
2894         return curScore;
2895 
2896     BranchVector branches;
2897     getBranches(branches);
2898     int i;
2899     Node *root_nei = root->neighbors[0]->node;
2900     ASSERT(root_nei->degree() == 3);
2901     Branch root_br = {NULL, NULL};
2902     FOR_NEIGHBOR_IT(root_nei, root, it) {
2903         if (root_br.first == NULL)
2904             root_br.first = (*it)->node;
2905         else
2906             root_br.second = (*it)->node;
2907     }
2908 
2909     double best_score = curScore, orig_score = curScore;
2910 
2911     multimap<double, string> logl_trees;
2912 
2913     // ignore branches directly descended from root branch
2914     for (i = 0; i != branches.size(); )
2915         if (branches[i].first == root_nei || branches[i].second == root_nei) {
2916             branches[i] = branches[branches.size()-1];
2917             branches.pop_back();
2918         } else {
2919             i++;
2920         }
2921     branches.push_back(root_br);
2922 
2923     // get all trees
2924 //    for (i = 0; i != nodes1.size(); i++) {
2925 //        moveRoot(nodes1[i], nodes2[i]);
2926 //        trees.push_back(getTreeString());
2927 //    }
2928 //
2929     // optimize branch lengths for all trees
2930     for (i = 0; i != branches.size(); i++) {
2931 //    for (auto t = trees.begin()+1; t != trees.end(); t++) {
2932         moveRoot(branches[i].first, branches[i].second);
2933 //        readTreeString(*t);
2934         optimizeAllBranches(100, logl_epsilon);
2935         stringstream ss;
2936         printTree(ss);
2937         logl_trees.insert({curScore, ss.str()});
2938         if (verbose_mode >= VB_MED) {
2939             cout << "Root pos " << i+1 << ": " << curScore << endl;
2940             if (verbose_mode >= VB_DEBUG)
2941                 drawTree(cout);
2942         }
2943         if (curScore > best_score + logl_epsilon) {
2944             if (verbose_mode >= VB_MED || write_info)
2945                 cout << "Better root: " << curScore << endl;
2946             best_score = curScore;
2947         }
2948     }
2949 
2950 //    readTreeString(best_tree);
2951 //    curScore = computeLikelihood();
2952 
2953     ASSERT(curScore > orig_score - 0.1);
2954     if (curScore > orig_score)
2955         cout << "UPDATE BEST SCORE: " << curScore << endl;
2956 
2957     ofstream out;
2958     string out_file = (string)params->out_prefix + ".rooted_trees";
2959     out.open(out_file);
2960     out.precision(10);
2961     for (auto lt = logl_trees.rbegin(); lt != logl_trees.rend(); lt++) {
2962         out << "[ lh=" << lt->first << " ]" << lt->second << endl;
2963     }
2964     out.close();
2965     cout << "Rooted trees with log-likelihoods printed to " << out_file << endl;
2966     if (params->treeset_file.empty())
2967         params->treeset_file = out_file;
2968 
2969     // convert logL to weight based on the best score
2970 //    ASSERT(logLs.size() == nodes1.size());
2971 //    for (i = 0; i < logLs.size(); i++) {
2972 //        double weight = exp(logLs[i] - best_score);
2973 //        nodes1[i]->name = convertDoubleToString(weight);
2974 //    }
2975 
2976     return curScore;
2977 }
2978 
2979 /****************************************************************************
2980  Stepwise addition (greedy) by maximum likelihood
2981  ****************************************************************************/
2982 
addTaxonML(Node * added_node,Node * & target_node,Node * & target_dad,Node * node,Node * dad)2983 double PhyloTree::addTaxonML(Node *added_node, Node* &target_node, Node* &target_dad, Node *node, Node *dad) {
2984 
2985     Neighbor *dad_nei = dad->findNeighbor(node);
2986 
2987     // now insert the new node in the middle of the branch node-dad
2988     double len = dad_nei->length;
2989     node->updateNeighbor(dad, added_node, len / 2.0);
2990     dad->updateNeighbor(node, added_node, len / 2.0);
2991     added_node->updateNeighbor((Node*) 1, node, len / 2.0);
2992     added_node->updateNeighbor((Node*) 2, dad, len / 2.0);
2993     // compute the likelihood
2994     clearAllPartialLH();
2995     double best_score = optimizeChildBranches((PhyloNode*) added_node);
2996     target_node = node;
2997     target_dad = dad;
2998     // remove the added node
2999     node->updateNeighbor(added_node, dad, len);
3000     dad->updateNeighbor(added_node, node, len);
3001     added_node->updateNeighbor(node, (Node*) 1, len);
3002     added_node->updateNeighbor(dad, (Node*) 2, len);
3003 
3004     // now tranverse the tree downwards
3005 
3006     FOR_NEIGHBOR_IT(node, dad, it){
3007     Node *target_node2;
3008     Node *target_dad2;
3009     double score = addTaxonML(added_node, target_node2, target_dad2, (*it)->node, node);
3010     if (score > best_score) {
3011 
3012         best_score = score;
3013         target_node = target_node2;
3014         target_dad = target_dad2;
3015     }
3016 }
3017     return best_score;
3018 }
3019 
growTreeML(Alignment * alignment)3020 void PhyloTree::growTreeML(Alignment *alignment) {
3021 
3022     cout << "Stepwise addition using ML..." << endl;
3023     aln = alignment;
3024     int size = aln->getNSeq();
3025     if (size < 3)
3026         outError(ERR_FEW_TAXA);
3027 
3028     root = newNode();
3029     Node * new_taxon;
3030 
3031     // create initial tree with 3 taxa
3032     for (leafNum = 0; leafNum < 3; leafNum++) {
3033         cout << "Add " << aln->getSeqName(leafNum) << " to the tree" << endl;
3034         new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
3035         root->addNeighbor(new_taxon, 1.0);
3036         new_taxon->addNeighbor(root, 1.0);
3037     }
3038     root = findNodeID(0);
3039     optimizeAllBranches();
3040 
3041     // stepwise adding the next taxon
3042     for (leafNum = 3; leafNum < size; leafNum++) {
3043 
3044         cout << "Add " << aln->getSeqName(leafNum) << " to the tree" << endl;
3045         // allocate a new taxon and a new ajedcent internal node
3046         new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
3047         Node *added_node = newNode();
3048         added_node->addNeighbor(new_taxon, 1.0);
3049         new_taxon->addNeighbor(added_node, 1.0);
3050 
3051         // preserve two neighbors
3052         added_node->addNeighbor((Node*) 1, 1.0);
3053         added_node->addNeighbor((Node*) 2, 1.0);
3054 
3055         Node *target_node = NULL;
3056         Node *target_dad = NULL;
3057         addTaxonML(added_node, target_node, target_dad, root->neighbors[0]->node, root);
3058         // now insert the new node in the middle of the branch node-dad
3059         double len = target_dad->findNeighbor(target_node)->length;
3060         target_node->updateNeighbor(target_dad, added_node, len / 2.0);
3061         target_dad->updateNeighbor(target_node, added_node, len / 2.0);
3062         added_node->updateNeighbor((Node*) 1, target_node, len / 2.0);
3063         added_node->updateNeighbor((Node*) 2, target_dad, len / 2.0);
3064         // compute the likelihood
3065         clearAllPartialLH();
3066         optimizeAllBranches();
3067         //optimizeNNI();
3068     }
3069 
3070     nodeNum = 2 * leafNum - 2;
3071 }
3072 
3073 /****************************************************************************
3074  Distance function
3075  ****************************************************************************/
3076 
computeDist(int seq1,int seq2,double initial_dist,double & d2l)3077 double PhyloTree::computeDist(int seq1, int seq2, double initial_dist, double &d2l) {
3078     // if no model or site rate is specified, return JC distance
3079     if (initial_dist == 0.0) {
3080         if (params->compute_obs_dist)
3081             return (initial_dist = aln->computeObsDist(seq1, seq2));
3082         else
3083             initial_dist = aln->computeDist(seq1, seq2);
3084     }
3085     if (!model_factory || !site_rate)
3086         return initial_dist; // MANUEL: here no d2l is return
3087 
3088     // now optimize the distance based on the model and site rate
3089     AlignmentPairwise *aln_pair = new AlignmentPairwise(this, seq1, seq2);
3090 
3091     double dist = aln_pair->optimizeDist(initial_dist, d2l);
3092     delete aln_pair;
3093     return dist;
3094 }
3095 
computeDist(int seq1,int seq2,double initial_dist)3096 double PhyloTree::computeDist(int seq1, int seq2, double initial_dist) {
3097     double var;
3098     return computeDist(seq1, seq2, initial_dist, var);
3099 }
3100 
correctDist(double * dist_mat)3101 double PhyloTree::correctDist(double *dist_mat) {
3102     int i, j, k, pos;
3103     int n = aln->getNSeq();
3104     int nsqr = n * n;
3105     // use Floyd algorithm to find shortest path between all pairs of taxa
3106     for (k = 0; k < n; k++)
3107         for (i = 0, pos = 0; i < n; i++)
3108             for (j = 0; j < n; j++, pos++) {
3109                 double tmp = dist_mat[i * n + k] + dist_mat[k * n + j];
3110                 if (dist_mat[pos] > tmp)
3111                     dist_mat[pos] = tmp;
3112             }
3113     double longest_dist = 0.0;
3114     for (i = 0; i < nsqr; i++)
3115         if (dist_mat[i] > longest_dist)
3116             longest_dist = dist_mat[i];
3117 
3118     return longest_dist;
3119 }
3120 
computeDist(double * dist_mat,double * var_mat)3121 double PhyloTree::computeDist(double *dist_mat, double *var_mat) {
3122     int nseqs = aln->getNSeq();
3123     int pos = 0;
3124     int num_pairs = nseqs * (nseqs - 1) / 2;
3125     double longest_dist = 0.0;
3126     int *row_id = new int[num_pairs];
3127     int *col_id = new int[num_pairs];
3128 
3129     row_id[0] = 0;
3130     col_id[0] = 1;
3131     for (pos = 1; pos < num_pairs; pos++) {
3132         row_id[pos] = row_id[pos - 1];
3133         col_id[pos] = col_id[pos - 1] + 1;
3134         if (col_id[pos] >= nseqs) {
3135             row_id[pos]++;
3136             col_id[pos] = row_id[pos] + 1;
3137         }
3138     }
3139     // compute the upper-triangle of distance matrix
3140 #ifdef _OPENMP
3141 #pragma omp parallel for private(pos)
3142 #endif
3143 
3144     for (pos = 0; pos < num_pairs; pos++) {
3145         int seq1 = row_id[pos];
3146         int seq2 = col_id[pos];
3147         double d2l; // moved here for thread-safe (OpenMP)
3148         int sym_pos = seq1 * nseqs + seq2;
3149         dist_mat[sym_pos] = computeDist(seq1, seq2, dist_mat[sym_pos], d2l);
3150         if (params->ls_var_type == OLS)
3151             var_mat[sym_pos] = 1.0;
3152         else if (params->ls_var_type == WLS_PAUPLIN)
3153             var_mat[sym_pos] = 0.0;
3154         else if (params->ls_var_type == WLS_FIRST_TAYLOR)
3155             var_mat[sym_pos] = dist_mat[sym_pos];
3156         else if (params->ls_var_type == WLS_FITCH_MARGOLIASH)
3157             var_mat[sym_pos] = dist_mat[sym_pos] * dist_mat[sym_pos];
3158         else if (params->ls_var_type == WLS_SECOND_TAYLOR)
3159             var_mat[sym_pos] = -1.0 / d2l;
3160     }
3161 
3162     // copy upper-triangle into lower-triangle and set diagonal = 0
3163     for (int seq1 = 0; seq1 < nseqs; seq1++)
3164         for (int seq2 = 0; seq2 <= seq1; seq2++) {
3165             pos = seq1 * nseqs + seq2;
3166             if (seq1 == seq2) {
3167                 dist_mat[pos] = 0.0;
3168                 var_mat[pos] = 0.0;
3169             }
3170             else {
3171                 dist_mat[pos] = dist_mat[seq2 * nseqs + seq1];
3172                 var_mat[pos] = var_mat[seq2 * nseqs + seq1];
3173             }
3174             if (dist_mat[pos] > longest_dist)
3175                 longest_dist = dist_mat[pos];
3176         }
3177     delete[] col_id;
3178     delete[] row_id;
3179 
3180     /*
3181      if (longest_dist > MAX_GENETIC_DIST * 0.99)
3182      outWarning("Some distances are saturated. Please check your alignment again");*/
3183     // NOTE: Bionj does handle long distances already (thanks Manuel)
3184     //return correctDist(dist_mat);
3185     return longest_dist;
3186 }
3187 
computeDist(Params & params,Alignment * alignment,double * & dist_mat,double * & var_mat,string & dist_file)3188 double PhyloTree::computeDist(Params &params, Alignment *alignment, double* &dist_mat, double* &var_mat,
3189         string &dist_file) {
3190     this->params = &params;
3191     double longest_dist = 0.0;
3192     aln = alignment;
3193     dist_file = params.out_prefix;
3194     if (!model_factory) {
3195         if (params.compute_obs_dist)
3196             dist_file += ".obsdist";
3197         else
3198             //dist_file += ".jcdist"; // too many files, I decided to discard .jcdist
3199             dist_file += ".mldist";
3200     } else
3201         dist_file += ".mldist";
3202 
3203     if (!dist_mat) {
3204         dist_mat = new double[alignment->getNSeq() * alignment->getNSeq()];
3205         memset(dist_mat, 0, sizeof(double) * alignment->getNSeq() * alignment->getNSeq());
3206         var_mat = new double[alignment->getNSeq() * alignment->getNSeq()];
3207         // BUG!
3208         //memset(var_mat, 1, sizeof(double) * alignment->getNSeq() * alignment->getNSeq());
3209         int nseq = alignment->getNSeq();
3210         for (int i = 0; i < nseq; i++)
3211             for (int j = 0; j < nseq; j++)
3212                 var_mat[i*nseq+j] = 1.0;
3213     }
3214     if (!params.dist_file) {
3215         longest_dist = computeDist(dist_mat, var_mat);
3216         alignment->printDist(dist_file.c_str(), dist_mat);
3217     } else {
3218         longest_dist = alignment->readDist(params.dist_file, dist_mat);
3219         dist_file = params.dist_file;
3220     }
3221     return longest_dist;
3222 }
3223 
computeObsDist(double * dist_mat)3224 double PhyloTree::computeObsDist(double *dist_mat) {
3225     int nseqs = aln->getNSeq();
3226     int pos = 0;
3227     double longest_dist = 0.0;
3228     for (int seq1 = 0; seq1 < nseqs; seq1++)
3229         for (int seq2 = 0; seq2 < nseqs; seq2++, pos++) {
3230             if (seq1 == seq2)
3231                 dist_mat[pos] = 0.0;
3232             else if (seq2 > seq1) {
3233                 dist_mat[pos] = aln->computeObsDist(seq1, seq2);
3234             } else
3235                 dist_mat[pos] = dist_mat[seq2 * nseqs + seq1];
3236 
3237             if (dist_mat[pos] > longest_dist)
3238                 longest_dist = dist_mat[pos];
3239         }
3240     return longest_dist;
3241     //return correctDist(dist_mat);
3242 }
3243 
computeObsDist(Params & params,Alignment * alignment,double * & dist_mat,string & dist_file)3244 double PhyloTree::computeObsDist(Params &params, Alignment *alignment, double* &dist_mat, string &dist_file) {
3245     double longest_dist = 0.0;
3246     aln = alignment;
3247     dist_file = params.out_prefix;
3248     dist_file += ".obsdist";
3249 
3250     if (!dist_mat) {
3251         dist_mat = new double[alignment->getNSeq() * alignment->getNSeq()];
3252         memset(dist_mat, 0, sizeof(double) * alignment->getNSeq() * alignment->getNSeq());
3253     }
3254     longest_dist = computeObsDist(dist_mat);
3255     alignment->printDist(dist_file.c_str(), dist_mat);
3256 
3257     return longest_dist;
3258 }
3259 
3260 /****************************************************************************
3261  compute BioNJ tree, a more accurate extension of Neighbor-Joining
3262  ****************************************************************************/
3263 
computeBioNJ(Params & params,Alignment * alignment,string & dist_file)3264 void PhyloTree::computeBioNJ(Params &params, Alignment *alignment, string &dist_file) {
3265     string bionj_file = params.out_prefix;
3266     bionj_file += ".bionj";
3267     cout << "Computing BIONJ tree..." << endl;
3268     BioNj bionj;
3269     bionj.create(dist_file.c_str(), bionj_file.c_str());
3270 //    bool my_rooted = false;
3271     bool non_empty_tree = (root != NULL);
3272 //    if (root)
3273 //        freeNode();
3274     readTreeFile(bionj_file.c_str());
3275 
3276 
3277     if (non_empty_tree) {
3278         initializeAllPartialLh();
3279     }
3280 //    setAlignment(alignment);
3281 }
3282 
setNegativeBranch(bool force,double newlen,Node * node,Node * dad)3283 int PhyloTree::setNegativeBranch(bool force, double newlen, Node *node, Node *dad) {
3284     if (!node) node = root;
3285     int fixed = 0;
3286 
3287     FOR_NEIGHBOR_IT(node, dad, it) {
3288         if ((*it)->length < 0.0 || force) { // negative branch length detected
3289             (*it)->length = newlen;
3290             // set the backward branch length
3291             (*it)->node->findNeighbor(node)->length = (*it)->length;
3292             fixed++;
3293         }
3294         fixed += setNegativeBranch(force, newlen, (*it)->node, node);
3295     }
3296     return fixed;
3297 }
3298 
fixOneNegativeBranch(double branch_length,Neighbor * dad_branch,Node * dad)3299 void PhyloTree::fixOneNegativeBranch(double branch_length, Neighbor *dad_branch, Node *dad) {
3300     dad_branch->length = branch_length;
3301     // set the backward branch length
3302     dad_branch->node->findNeighbor(dad)->length = branch_length;
3303 }
3304 
fixNegativeBranch(bool force,Node * node,Node * dad)3305 int PhyloTree::fixNegativeBranch(bool force, Node *node, Node *dad) {
3306 
3307     // 2019-02-05: fix crash when no variant sites found
3308     if (aln->num_variant_sites == 0)
3309         return setNegativeBranch(force, params->min_branch_length, root, NULL);
3310 
3311     if (!node) {
3312         node = root;
3313         // 2015-11-30: if not bifurcating, initialize unknown branch lengths with 0.1
3314         if (!isBifurcating())
3315             return setNegativeBranch(force, 0.1, root, NULL);
3316     }
3317     int fixed = 0;
3318 
3319     if (force && !cost_matrix)
3320         return setParsimonyBranchLengths();
3321 
3322     double alpha = (site_rate) ? site_rate->getGammaShape() : 1.0;
3323 
3324     FOR_NEIGHBOR_IT(node, dad, it){
3325     if ((*it)->length < 0.0 || force) { // negative branch length detected
3326         int branch_subst;
3327         int pars_score = computeParsimonyBranch((PhyloNeighbor*) (*it), (PhyloNode*) node, &branch_subst);
3328         // first compute the observed parsimony distance
3329         double branch_length = (branch_subst > 0) ? ((double) branch_subst / getAlnNSite()) : (1.0 / getAlnNSite());
3330 
3331         branch_length = correctBranchLengthF81(branch_length, alpha);
3332 
3333 //        if (verbose_mode >= VB_DEBUG)
3334 //            cout << "Negative branch length " << (*it)->length << " was set to ";
3335         //(*it)->length = fixed_length;
3336         //(*it)->length = random_double()+0.1;
3337         fixOneNegativeBranch(branch_length, (*it), node);
3338         if (verbose_mode >= VB_DEBUG)
3339             cout << branch_length << " parsimony = " << pars_score << endl;
3340         fixed++;
3341     }
3342     if ((*it)->length <= 0.0 && (!rooted || node != root)) {
3343         (*it)->length = params->min_branch_length;
3344         (*it)->node->findNeighbor(node)->length = (*it)->length;
3345     }
3346     fixed += fixNegativeBranch(force, (*it)->node, node);
3347 }
3348     return fixed;
3349 }
3350 
3351 //int PhyloTree::assignRandomBranchLengths(bool force, Node *node, Node *dad) {
3352 //
3353 //    if (!node)
3354 //        node = root;
3355 //    int fixed = 0;
3356 //
3357 //    FOR_NEIGHBOR_IT(node, dad, it){
3358 //        if ((*it)->length < 0.0 || force) { // negative branch length detected
3359 //            if (verbose_mode >= VB_DEBUG)
3360 //            cout << "Negative branch length " << (*it)->length << " was set to ";
3361 //            (*it)->length = random_double() + 0.1;
3362 //            if (verbose_mode >= VB_DEBUG)
3363 //            cout << (*it)->length << endl;
3364 //            // set the backward branch length
3365 //            (*it)->node->findNeighbor(node)->length = (*it)->length;
3366 //            fixed++;
3367 //        }
3368 //        if ((*it)->length <= 0.0) {
3369 //            (*it)->length = 1e-6;
3370 //            (*it)->node->findNeighbor(node)->length = (*it)->length;
3371 //        }
3372 //        fixed += assignRandomBranchLengths(force, (*it)->node, node);
3373 //    }
3374 //    return fixed;
3375 //}
3376 
3377 /****************************************************************************
3378  Nearest Neighbor Interchange by maximum likelihood
3379  ****************************************************************************/
3380 
3381 /*
3382 void PhyloTree::doOneRandomNNI(Branch branch) {
3383     assert(isInnerBranch(branch.first, branch.second));
3384 
3385     if (((PhyloNeighbor*)branch.first->findNeighbor(branch.second))->direction == TOWARD_ROOT) {
3386         // swap node1 and node2 if the direction is not right, only for nonreversible models
3387         Node *tmp = branch.first;
3388         branch.first = branch.second;
3389         branch.second = tmp;
3390     }
3391 
3392     NNIMove nni;
3393     nni.node1 = (PhyloNode*) branch.first;
3394     nni.node2 = (PhyloNode*) branch.second;
3395     FOR_NEIGHBOR_IT(branch.first, branch.second, node1NeiIt)
3396     if (((PhyloNeighbor*)*node1NeiIt)->direction != TOWARD_ROOT)
3397     {
3398         nni.node1Nei_it = node1NeiIt;
3399         break;
3400     }
3401     int randInt = random_int(branch.second->neighbors.size()-1);
3402     int cnt = 0;
3403     FOR_NEIGHBOR_IT(branch.second, branch.first, node2NeiIt) {
3404         // if this loop, is it sure that direction is away from root because node1->node2 is away from root
3405         if (cnt == randInt) {
3406             nni.node2Nei_it = node2NeiIt;
3407             break;
3408         } else {
3409             cnt++;
3410         }
3411     }
3412     assert(*nni.node1Nei_it != NULL && *nni.node2Nei_it != NULL);
3413     assert(((PhyloNeighbor*)*nni.node1Nei_it)->direction != TOWARD_ROOT && ((PhyloNeighbor*)*nni.node2Nei_it)->direction != TOWARD_ROOT);
3414 
3415     if (constraintTree.isCompatible(nni))
3416         doNNI(nni, true);
3417 }
3418 */
3419 
getRandomNNI(Branch & branch)3420 NNIMove PhyloTree::getRandomNNI(Branch &branch) {
3421     ASSERT(isInnerBranch(branch.first, branch.second));
3422     // for rooted tree
3423     if (((PhyloNeighbor*)branch.first->findNeighbor(branch.second))->direction == TOWARD_ROOT) {
3424         // swap node1 and node2 if the direction is not right, only for nonreversible models
3425         Node *tmp = branch.first;
3426         branch.first = branch.second;
3427         branch.second = tmp;
3428     }
3429     NNIMove nni;
3430     nni.node1 = (PhyloNode*) branch.first;
3431     nni.node2 = (PhyloNode*) branch.second;
3432 
3433     FOR_NEIGHBOR_IT(branch.first, branch.second, node1NeiIt)
3434         if (((PhyloNeighbor*)*node1NeiIt)->direction != TOWARD_ROOT) {
3435             nni.node1Nei_it = node1NeiIt;
3436             break;
3437         }
3438     int randInt = random_int(branch.second->neighbors.size()-1);
3439     int cnt = 0;
3440     FOR_NEIGHBOR_IT(branch.second, branch.first, node2NeiIt) {
3441         // if this loop, is it sure that direction is away from root because node1->node2 is away from root
3442         if (cnt == randInt) {
3443             nni.node2Nei_it = node2NeiIt;
3444             break;
3445         } else {
3446             cnt++;
3447         }
3448     }
3449     ASSERT(*nni.node1Nei_it != NULL && *nni.node2Nei_it != NULL);
3450     ASSERT(((PhyloNeighbor*)*nni.node1Nei_it)->direction != TOWARD_ROOT && ((PhyloNeighbor*)*nni.node2Nei_it)->direction != TOWARD_ROOT);
3451     nni.newloglh = 0.0;
3452     return nni;
3453 }
3454 
doNNI(NNIMove & move,bool clearLH)3455 void PhyloTree::doNNI(NNIMove &move, bool clearLH) {
3456     PhyloNode *node1 = move.node1;
3457     PhyloNode *node2 = move.node2;
3458     NeighborVec::iterator node1Nei_it = move.node1Nei_it;
3459     NeighborVec::iterator node2Nei_it = move.node2Nei_it;
3460     Neighbor *node1Nei = *(node1Nei_it);
3461     Neighbor *node2Nei = *(node2Nei_it);
3462 
3463     // TODO MINH
3464     /*    Node *nodeA = node1Nei->node;
3465      Node *nodeB = node2Nei->node;
3466 
3467      NeighborVec::iterator nodeA_it = nodeA->findNeighborIt(node1);
3468      NeighborVec::iterator nodeB_it = nodeB->findNeighborIt(node2);
3469      Neighbor *nodeANei = *(nodeA_it);
3470      Neighbor *nodeBNei = *(nodeB_it);
3471      *node1Nei_it = node2Nei;
3472      *nodeB_it = nodeANei;
3473      *node2Nei_it = node1Nei;
3474      *nodeA_it = nodeBNei;*/
3475     // END TODO MINH
3476     ASSERT(node1->degree() == 3 && node2->degree() == 3);
3477 
3478     PhyloNeighbor *node12_it = (PhyloNeighbor*) node1->findNeighbor(node2); // return neighbor of node1 which points to node 2
3479     PhyloNeighbor *node21_it = (PhyloNeighbor*) node2->findNeighbor(node1); // return neighbor of node2 which points to node 1
3480 
3481     // reorient partial_lh before swap
3482     if (!isSuperTree()) {
3483         reorientPartialLh(node12_it, node1);
3484         reorientPartialLh(node21_it, node2);
3485     }
3486 
3487     // do the NNI swap
3488     node1->updateNeighbor(node1Nei_it, node2Nei);
3489     node2Nei->node->updateNeighbor(node2, node1);
3490 
3491     node2->updateNeighbor(node2Nei_it, node1Nei);
3492     node1Nei->node->updateNeighbor(node1, node2);
3493 
3494     // BQM check branch ID
3495     /*
3496      if (node1->findNeighbor(nodeB)->id != nodeB->findNeighbor(node1)->id) {
3497      cout << node1->findNeighbor(nodeB)->id << "<->" << nodeB->findNeighbor(node1)->id << endl;
3498      cout << node1->id << "," << nodeB->id << endl;
3499      outError("Wrong ID");
3500      }
3501      if (node2->findNeighbor(nodeA)->id != nodeA->findNeighbor(node2)->id) {
3502      cout << node2->findNeighbor(nodeA)->id << "<->" << nodeA->findNeighbor(node2)->id << endl;
3503      cout << node2->id << "," << nodeA->id << endl;
3504      outError("Wrong ID");
3505      }*/
3506 
3507     PhyloNeighbor *nei12 = (PhyloNeighbor*) node1->findNeighbor(node2); // return neighbor of node1 which points to node 2
3508     PhyloNeighbor *nei21 = (PhyloNeighbor*) node2->findNeighbor(node1); // return neighbor of node2 which points to node 1
3509 
3510     if (clearLH) {
3511         // clear partial likelihood vector
3512         nei12->clearPartialLh();
3513         nei21->clearPartialLh();
3514         nei12->size = nei21->size = 0;
3515 
3516         node2->clearReversePartialLh(node1);
3517         node1->clearReversePartialLh(node2);
3518         //if (params->nni5Branches)
3519         //    clearAllPartialLH();
3520     }
3521 
3522     if (params->leastSquareNNI) {
3523         updateSubtreeDists(move);
3524     }
3525 
3526     // update split store in node
3527     if (nei12->split != NULL || nei21->split != NULL) {
3528         delete nei12->split;
3529         nei12->split = new Split(leafNum);
3530         delete nei21->split;
3531         nei21->split = new Split(leafNum);
3532 
3533         FOR_NEIGHBOR_IT(nei12->node, node1, it)
3534                 *(nei12->split) += *((*it)->split);
3535 
3536         FOR_NEIGHBOR_IT(nei21->node, node2, it)
3537                 *(nei21->split) += *((*it)->split);
3538     }
3539 }
3540 
changeNNIBrans(NNIMove & nnimove)3541 void PhyloTree::changeNNIBrans(NNIMove &nnimove) {
3542     PhyloNode *node1 = nnimove.node1;
3543     PhyloNode *node2 = nnimove.node2;
3544     PhyloNeighbor *node1_node2_nei = (PhyloNeighbor*) node1->findNeighbor(node2);
3545     PhyloNeighbor *node2_node1_nei = (PhyloNeighbor*) node2->findNeighbor(node1);
3546     node1_node2_nei->setLength(nnimove.newLen[0]);
3547     node2_node1_nei->setLength(nnimove.newLen[0]);
3548     if (params->nni5) {
3549         int i = 1;
3550         Neighbor* nei;
3551         Neighbor* nei_back;
3552         NeighborVec::iterator it;
3553         FOR_NEIGHBOR(node1, node2, it)
3554         {
3555             nei = (*it)->node->findNeighbor(node1);
3556             nei_back = (node1)->findNeighbor((*it)->node);
3557             nei->setLength(nnimove.newLen[i]);
3558             nei_back->setLength(nnimove.newLen[i]);
3559             i++;
3560         }
3561         FOR_NEIGHBOR(node2, node1, it)
3562         {
3563             nei = (*it)->node->findNeighbor(node2);
3564             nei_back = (node2)->findNeighbor((*it)->node);
3565             nei->setLength(nnimove.newLen[i]);
3566             nei_back->setLength(nnimove.newLen[i]);
3567             i++;
3568         }
3569     }
3570 }
3571 
getBestNNIForBran(PhyloNode * node1,PhyloNode * node2,NNIMove * nniMoves)3572 NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove* nniMoves) {
3573 
3574     ASSERT(!node1->isLeaf() && !node2->isLeaf());
3575     ASSERT(node1->degree() == 3 && node2->degree() == 3);
3576 
3577     if (((PhyloNeighbor*)node1->findNeighbor(node2))->direction == TOWARD_ROOT) {
3578         // swap node1 and node2 if the direction is not right, only for nonreversible models
3579         PhyloNode *tmp = node1;
3580         node1 = node2;
3581         node2 = tmp;
3582     }
3583 
3584     int IT_NUM = (params->nni5) ? 6 : 2;
3585     size_t partial_lh_size = getPartialLhBytes()/sizeof(double);
3586     size_t scale_num_size = getScaleNumBytes()/sizeof(UBYTE);
3587 
3588 
3589     // Upper Bounds ---------------
3590     /*
3591     if(params->upper_bound_NNI){
3592         totalNNIub += 2;
3593         NNIMove resMove;
3594         resMove = getBestNNIForBranUB(node1,node2,this);
3595         // if UB is smaller than the current likelihood, then we don't recompute the likelihood of the swapped topology.
3596         // Otherwise, follow the normal procedure: evaluate NNIs and compute the likelihood.
3597 
3598         // here, we skip NNI is its UB n times worse than the curLikelihood
3599         if( resMove.newloglh < (1+params->upper_bound_frac)*this->curScore){
3600             //cout << "Skipping Likelihood evaluation of NNIs for this branch :) ........................"<<endl;
3601             return resMove;
3602         }
3603     }
3604     */
3605 
3606     //-----------------------------
3607 
3608     NeighborVec::iterator it;
3609 
3610     NeighborVec::iterator saved_it[6];
3611     int id = 0;
3612 
3613     saved_it[id++] = node1->findNeighborIt(node2);
3614     saved_it[id++] = node2->findNeighborIt(node1);
3615 
3616     if (params->nni5) {
3617         FOR_NEIGHBOR(node1, node2, it)
3618             saved_it[id++] = (*it)->node->findNeighborIt(node1);
3619 
3620         FOR_NEIGHBOR(node2, node1, it)
3621             saved_it[id++] = (*it)->node->findNeighborIt(node2);
3622     }
3623     ASSERT(id == IT_NUM);
3624 
3625     if (!params->nni5) {
3626         reorientPartialLh((PhyloNeighbor*)node1->findNeighbor(node2), node1);
3627         reorientPartialLh((PhyloNeighbor*)node2->findNeighbor(node1), node2);
3628     }
3629 
3630     Neighbor *saved_nei[6];
3631     int mem_id = 0;
3632     // save Neighbor and allocate new Neighbor pointer
3633     for (id = 0; id < IT_NUM; id++) {
3634         saved_nei[id] = (*saved_it[id]);
3635         *saved_it[id] = saved_nei[id]->newNeighbor();
3636 
3637         if (((PhyloNeighbor*)saved_nei[id])->partial_lh) {
3638             ((PhyloNeighbor*) (*saved_it[id]))->partial_lh = nni_partial_lh + mem_id*partial_lh_size;
3639             ((PhyloNeighbor*) (*saved_it[id]))->scale_num = nni_scale_num + mem_id*scale_num_size;
3640             mem_id++;
3641             mem_slots.addSpecialNei((PhyloNeighbor*)*saved_it[id]);
3642         }
3643 //        ((PhyloNeighbor*) (*saved_it[id]))->scale_num = newScaleNum();
3644     }
3645     if (params->nni5)
3646         ASSERT(mem_id == 2);
3647 
3648     // get the Neighbor again since it is replaced for saving purpose
3649     PhyloNeighbor* node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
3650     PhyloNeighbor* node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
3651 
3652     int cnt;
3653 
3654     //NNIMove nniMoves[2];
3655     bool newNNIMoves = false;
3656     if (!nniMoves) {
3657         //   Initialize the 2 NNI moves
3658         newNNIMoves = true;
3659         nniMoves = new NNIMove[2];
3660         nniMoves[0].ptnlh = nniMoves[1].ptnlh = NULL;
3661         nniMoves[0].node1 = NULL;
3662 
3663     }
3664 
3665     if (nniMoves[0].node1) {
3666         // assuming that node1Nei_it and node2Nei_it is defined in nniMoves structure
3667         for (cnt = 0; cnt < 2; cnt++) {
3668             // sanity check
3669             if (!node1->findNeighbor((*nniMoves[cnt].node1Nei_it)->node)) outError(__func__);
3670             if (!node2->findNeighbor((*nniMoves[cnt].node2Nei_it)->node)) outError(__func__);
3671         }
3672     } else {
3673         cnt = 0;
3674         FOR_NEIGHBOR_IT(node1, node2, node1_it)
3675         if (((PhyloNeighbor*)*node1_it)->direction != TOWARD_ROOT)
3676         {
3677             cnt = 0;
3678             FOR_NEIGHBOR_IT(node2, node1, node2_it) {
3679                 //   Initialize the 2 NNI moves
3680                 nniMoves[cnt].node1Nei_it = node1_it;
3681                 nniMoves[cnt].node2Nei_it = node2_it;
3682                 cnt++;
3683             }
3684             break;
3685         }
3686         ASSERT(cnt == 2);
3687     }
3688 
3689     // Initialize node1 and node2 in nniMoves
3690     nniMoves[0].node1 = nniMoves[1].node1 = node1;
3691     nniMoves[0].node2 = nniMoves[1].node2 = node2;
3692     nniMoves[0].newloglh = nniMoves[1].newloglh = -DBL_MAX;
3693 
3694     double backupScore = curScore;
3695 
3696     for (cnt = 0; cnt < 2; cnt++) if (constraintTree.isCompatible(nniMoves[cnt]))
3697     {
3698         // do the NNI swap
3699         NeighborVec::iterator node1_it = nniMoves[cnt].node1Nei_it;
3700         NeighborVec::iterator node2_it = nniMoves[cnt].node2Nei_it;
3701         Neighbor *node1_nei = *node1_it;
3702         Neighbor *node2_nei = *node2_it;
3703 
3704         // reorient partial_lh before swap
3705         reorientPartialLh(node12_it, node1);
3706         reorientPartialLh(node21_it, node2);
3707 
3708         node1->updateNeighbor(node1_it, node2_nei);
3709         node2_nei->node->updateNeighbor(node2, node1);
3710 
3711         node2->updateNeighbor(node2_it, node1_nei);
3712         node1_nei->node->updateNeighbor(node1, node2);
3713 
3714         if (params->lh_mem_save == LM_MEM_SAVE) {
3715             // reset subtree size to change traversal order
3716             for (id = 0; id < IT_NUM; id++)
3717                 ((PhyloNeighbor*)*saved_it[id])->size = 0;
3718         }
3719 
3720         int nni5_num_eval = max(params->nni5_num_eval, getMixlen());
3721 
3722         for (int step = 0; step < nni5_num_eval; step++) {
3723 
3724 
3725         // clear partial likelihood vector
3726         node12_it->clearPartialLh();
3727         node21_it->clearPartialLh();
3728 
3729         // compute the score of the swapped topology
3730 //        double saved_len = node1_nei->length;
3731 
3732         // BUG FIX: commented out following (reported by Stephen Crotty)
3733 //        optimizeOneBranch(node1, node2, false, NNI_MAX_NR_STEP);
3734 //        node1->findNeighbor(node2)->getLength(nniMoves[cnt].newLen[0]);
3735 
3736         int i=1;
3737         if (params->nni5) {
3738             FOR_NEIGHBOR(node1, node2, it)
3739             {
3740                 ((PhyloNeighbor*) (*it)->node->findNeighbor(node1))->clearPartialLh();
3741                 optimizeOneBranch(node1, (PhyloNode*) (*it)->node, false, NNI_MAX_NR_STEP);
3742                 node1->findNeighbor((*it)->node)->getLength(nniMoves[cnt].newLen[i]);
3743                 i++;
3744             }
3745             node21_it->clearPartialLh();
3746         }
3747 
3748         optimizeOneBranch(node1, node2, false, NNI_MAX_NR_STEP);
3749         node1->findNeighbor(node2)->getLength(nniMoves[cnt].newLen[0]);
3750 
3751         if (params->nni5) {
3752             FOR_NEIGHBOR(node2, node1, it)
3753             {
3754                 ((PhyloNeighbor*) (*it)->node->findNeighbor(node2))->clearPartialLh();
3755                 optimizeOneBranch(node2, (PhyloNode*) (*it)->node, false, NNI_MAX_NR_STEP);
3756                 //node2_lastnei = (PhyloNeighbor*) (*it);
3757                 node2->findNeighbor((*it)->node)->getLength(nniMoves[cnt].newLen[i]);
3758                 i++;
3759             }
3760             node12_it->clearPartialLh();
3761         }
3762         }
3763         double score = computeLikelihoodFromBuffer();
3764         if (verbose_mode >= VB_DEBUG)
3765             cout << "NNI " << node1->id << " - " << node2->id << ": " << score << endl;
3766         nniMoves[cnt].newloglh = score;
3767         // compute the pattern likelihoods if wanted
3768         if (nniMoves[cnt].ptnlh)
3769             computePatternLikelihood(nniMoves[cnt].ptnlh, &score);
3770 
3771         if (save_all_trees == 2) {
3772             saveCurrentTree(score); // BQM: for new bootstrap
3773         }
3774 
3775         // reorient partial_lh before swap
3776         reorientPartialLh(node12_it, node1);
3777         reorientPartialLh(node21_it, node2);
3778 
3779         // else, swap back, also recover the branch lengths
3780         node1->updateNeighbor(node1_it, node1_nei);
3781         node1_nei->node->updateNeighbor(node2, node1);
3782         node2->updateNeighbor(node2_it, node2_nei);
3783         node2_nei->node->updateNeighbor(node1, node2);
3784         // ONLY FOR CHECKING WITH OLGA's PLEN MODEL
3785         //node1_nei->length = node2_nei->length = saved_len;
3786     }
3787 
3788      // restore the Neighbor*
3789      for (id = IT_NUM-1; id >= 0; id--) {
3790 //         aligned_free(((PhyloNeighbor*) *saved_it[id])->scale_num);
3791          //delete[] ((PhyloNeighbor*) *saved_it[id])->partial_lh;
3792 //         if (((PhyloNeighbor*)saved_nei[id])->partial_lh) {
3793 //            if (saved_nei[id]->node == node1)
3794 //                mem_slots.restore(node21_it, (PhyloNeighbor*)saved_nei[id]);
3795 //            else
3796 //                mem_slots.restore(node12_it, (PhyloNeighbor*)saved_nei[id]);
3797 //         }
3798          if (*saved_it[id] == current_it) current_it = (PhyloNeighbor*) saved_nei[id];
3799          if (*saved_it[id] == current_it_back) current_it_back = (PhyloNeighbor*) saved_nei[id];
3800 
3801          delete (*saved_it[id]);
3802          (*saved_it[id]) = saved_nei[id];
3803      }
3804 
3805     mem_slots.eraseSpecialNei();
3806 
3807 //     aligned_free(new_partial_lh);
3808 
3809      // restore the length of 4 branches around node1, node2
3810      FOR_NEIGHBOR(node1, node2, it)
3811          (*it)->setLength((*it)->node->findNeighbor(node1));
3812      FOR_NEIGHBOR(node2, node1, it)
3813          (*it)->setLength((*it)->node->findNeighbor(node2));
3814 
3815      // restore curScore
3816      curScore = backupScore;
3817 
3818      NNIMove res;
3819      if (nniMoves[0].newloglh > nniMoves[1].newloglh) {
3820          res = nniMoves[0];
3821      } else {
3822          res = nniMoves[1];
3823      }
3824     if (newNNIMoves) {
3825         delete [] nniMoves;
3826     }
3827     return res;
3828 }
3829 
3830 
3831 /****************************************************************************
3832  Subtree Pruning and Regrafting by maximum likelihood
3833  ****************************************************************************/
3834 
optimizeSPR_old(double cur_score,PhyloNode * node,PhyloNode * dad)3835 double PhyloTree::optimizeSPR_old(double cur_score, PhyloNode *node, PhyloNode *dad) {
3836     if (!node)
3837         node = (PhyloNode*) root;
3838     PhyloNeighbor * dad1_nei = NULL;
3839     PhyloNeighbor * dad2_nei = NULL;
3840     PhyloNode * sibling1 = NULL;
3841     PhyloNode * sibling2 = NULL;
3842     double sibling1_len = 0.0, sibling2_len = 0.0;
3843 
3844     if (dad && !dad->isLeaf()) {
3845 
3846         ASSERT(dad->degree() == 3);
3847         // assign the sibling of node, with respect to dad
3848 
3849         FOR_NEIGHBOR_DECLARE(dad, node, it) {
3850             if (!sibling1) {
3851                 dad1_nei = (PhyloNeighbor*) (*it);
3852                 sibling1 = (PhyloNode*) (*it)->node;
3853                 sibling1_len = (*it)->length;
3854             } else {
3855 
3856                 dad2_nei = (PhyloNeighbor*) (*it);
3857                 sibling2 = (PhyloNode*) (*it)->node;
3858                 sibling2_len = (*it)->length;
3859             }
3860         }
3861         // remove the subtree leading to node
3862         double sum_len = sibling1_len + sibling2_len;
3863         sibling1->updateNeighbor(dad, sibling2, sum_len);
3864         sibling2->updateNeighbor(dad, sibling1, sum_len);
3865         PhyloNeighbor* sibling1_nei = (PhyloNeighbor*) sibling1->findNeighbor(sibling2);
3866         PhyloNeighbor* sibling2_nei = (PhyloNeighbor*) sibling2->findNeighbor(sibling1);
3867         sibling1_nei->clearPartialLh();
3868         sibling2_nei->clearPartialLh();
3869 
3870         // now try to move the subtree to somewhere else
3871         vector<PhyloNeighbor*> spr_path;
3872 
3873         FOR_NEIGHBOR(sibling1, sibling2, it)
3874         {
3875             spr_path.push_back(sibling1_nei);
3876             double score = swapSPR_old(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling1,
3877                     spr_path);
3878             // if likelihood score improves, return
3879             if (score > cur_score)
3880 
3881                 return score;
3882             spr_path.pop_back();
3883         }
3884 
3885         FOR_NEIGHBOR(sibling2, sibling1, it)
3886         {
3887             spr_path.push_back(sibling2_nei);
3888             double score = swapSPR_old(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling2,
3889                     spr_path);
3890             // if likelihood score improves, return
3891             if (score > cur_score)
3892 
3893                 return score;
3894             spr_path.pop_back();
3895         }
3896         // if likelihood does not imporve, swap back
3897         sibling1->updateNeighbor(sibling2, dad, sibling1_len);
3898         sibling2->updateNeighbor(sibling1, dad, sibling2_len);
3899         dad1_nei->node = sibling1;
3900         dad1_nei->length = sibling1_len;
3901         dad2_nei->node = sibling2;
3902         dad2_nei->length = sibling2_len;
3903         clearAllPartialLH();
3904     }
3905 
3906     FOR_NEIGHBOR_IT(node, dad, it){
3907     double score = optimizeSPR_old(cur_score, (PhyloNode*) (*it)->node, node);
3908 
3909     if (score > cur_score) return score;
3910 }
3911     return cur_score;
3912 }
3913 
3914 /**
3915  move the subtree (dad1-node1) to the branch (dad2-node2)
3916  */
swapSPR_old(double cur_score,int cur_depth,PhyloNode * node1,PhyloNode * dad1,PhyloNode * orig_node1,PhyloNode * orig_node2,PhyloNode * node2,PhyloNode * dad2,vector<PhyloNeighbor * > & spr_path)3917 double PhyloTree::swapSPR_old(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1, PhyloNode *orig_node1,
3918         PhyloNode *orig_node2, PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path) {
3919     PhyloNeighbor *node1_nei = (PhyloNeighbor*) node1->findNeighbor(dad1);
3920     PhyloNeighbor *dad1_nei = (PhyloNeighbor*) dad1->findNeighbor(node1);
3921     double node1_dad1_len = node1_nei->length;
3922     PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
3923 
3924     if (dad2) {
3925         // now, connect (node1-dad1) to the branch (node2-dad2)
3926 
3927         bool first = true;
3928         PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
3929         PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
3930         double len2 = node2_nei->length;
3931 
3932         FOR_NEIGHBOR_IT(dad1, node1, it){
3933         if (first) {
3934             (*it)->node = dad2;
3935             (*it)->length = len2 / 2;
3936             dad2->updateNeighbor(node2, dad1, len2 / 2);
3937             first = false;
3938         } else {
3939             (*it)->node = node2;
3940             (*it)->length = len2 / 2;
3941             node2->updateNeighbor(dad2, dad1, len2 / 2);
3942         }
3943         ((PhyloNeighbor*) (*it))->clearPartialLh();
3944     }
3945         node2_nei->clearPartialLh();
3946         dad2_nei->clearPartialLh();
3947         node1_nei->clearPartialLh();
3948         vector<PhyloNeighbor*>::iterator it2;
3949         for (it2 = spr_path.begin(); it2 != spr_path.end(); it2++)
3950             (*it2)->clearPartialLh();
3951         clearAllPartialLH();
3952         // optimize relevant branches
3953         double score;
3954 
3955         /* testing different branch optimization */
3956         optimizeOneBranch(node1, dad1);
3957         score = computeLikelihoodFromBuffer();
3958         //score = optimizeOneBranch(dad2, dad1);
3959         //score = optimizeOneBranch(node2, dad1);
3960         /*
3961          PhyloNode *cur_node = dad2;
3962          for (int i = spr_path.size()-1; i >= 0; i--) {
3963          score = optimizeOneBranch(cur_node, (PhyloNode*)spr_path[i]->node);
3964          cur_node = (PhyloNode*)spr_path[i]->node;
3965          }
3966          */
3967         //score = optimizeAllBranches(dad1);
3968         // if score improves, return
3969         if (score > cur_score)
3970             return score;
3971         // else, swap back
3972         node2->updateNeighbor(dad1, dad2, len2);
3973         dad2->updateNeighbor(dad1, node2, len2);
3974         node2_nei->clearPartialLh();
3975         dad2_nei->clearPartialLh();
3976         node1_nei->length = node1_dad1_len;
3977         dad1_nei->length = node1_dad1_len;
3978 
3979         // add to candiate SPR moves
3980         spr_moves.add(node1, dad1, node2, dad2, score);
3981     }
3982     if (cur_depth >= spr_radius)
3983 
3984         return cur_score;
3985     spr_path.push_back(node2_nei);
3986 
3987     FOR_NEIGHBOR_IT(node2, dad2, it){
3988     double score = swapSPR(cur_score, cur_depth + 1, node1, dad1, orig_node1, orig_node2, (PhyloNode*) (*it)->node, node2, spr_path);
3989     if (score > cur_score) return score;
3990 }
3991     spr_path.pop_back();
3992 
3993     return cur_score;
3994 
3995 }
3996 
optimizeSPR(double cur_score,PhyloNode * node,PhyloNode * dad)3997 double PhyloTree::optimizeSPR(double cur_score, PhyloNode *node, PhyloNode *dad) {
3998     if (!node)
3999         node = (PhyloNode*) root;
4000     PhyloNeighbor * dad1_nei = NULL;
4001     PhyloNeighbor * dad2_nei = NULL;
4002     PhyloNode * sibling1 = NULL;
4003     PhyloNode * sibling2 = NULL;
4004     double sibling1_len = 0.0, sibling2_len = 0.0;
4005 
4006     if (dad && !dad->isLeaf()) {
4007 
4008         ASSERT(dad->degree() == 3);
4009         // assign the sibling of node, with respect to dad
4010 
4011         FOR_NEIGHBOR_DECLARE(dad, node, it) {
4012             if (!sibling1) {
4013                 dad1_nei = (PhyloNeighbor*) (*it);
4014                 sibling1 = (PhyloNode*) (*it)->node;
4015                 sibling1_len = (*it)->length;
4016             } else {
4017 
4018                 dad2_nei = (PhyloNeighbor*) (*it);
4019                 sibling2 = (PhyloNode*) (*it)->node;
4020                 sibling2_len = (*it)->length;
4021             }
4022         }
4023         // remove the subtree leading to node
4024         double sum_len = sibling1_len + sibling2_len;
4025         sibling1->updateNeighbor(dad, sibling2, sum_len);
4026         sibling2->updateNeighbor(dad, sibling1, sum_len);
4027         PhyloNeighbor* sibling1_nei = (PhyloNeighbor*) sibling1->findNeighbor(sibling2);
4028         PhyloNeighbor* sibling2_nei = (PhyloNeighbor*) sibling2->findNeighbor(sibling1);
4029         // save partial likelihood
4030         double* sibling1_partial_lh = sibling1_nei->partial_lh;
4031         double* sibling2_partial_lh = sibling2_nei->partial_lh;
4032         sibling1_nei->partial_lh = newPartialLh();
4033         sibling2_nei->partial_lh = newPartialLh();
4034         sibling1_nei->clearPartialLh();
4035         sibling2_nei->clearPartialLh();
4036 
4037         // now try to move the subtree to somewhere else
4038         vector<PhyloNeighbor*> spr_path;
4039 
4040         FOR_NEIGHBOR(sibling1, sibling2, it)
4041         {
4042             spr_path.push_back(sibling1_nei);
4043             double score = swapSPR(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling1,
4044                     spr_path);
4045             // if likelihood score improves, return
4046             if (score > cur_score) {
4047                 cout << "cur_score = " << cur_score << endl;
4048                 cout << "Found new BETTER SCORE by SPR: " << score << endl;
4049 
4050                 return score;
4051             }
4052             spr_path.pop_back();
4053         }
4054 
4055         FOR_NEIGHBOR(sibling2, sibling1, it)
4056         {
4057             spr_path.push_back(sibling2_nei);
4058             double score = swapSPR(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling2,
4059                     spr_path);
4060             // if likelihood score improves, return
4061             if (score > cur_score) {
4062                 cout << "cur_score = " << cur_score << endl;
4063                 cout << "Found new BETTER SCORE by SPR: " << score << endl;
4064 
4065                 return score;
4066             }
4067             spr_path.pop_back();
4068         }
4069         // if likelihood does not imporve, swap back
4070         sibling1->updateNeighbor(sibling2, dad, sibling1_len);
4071         sibling2->updateNeighbor(sibling1, dad, sibling2_len);
4072         dad1_nei->node = sibling1;
4073         dad1_nei->length = sibling1_len;
4074         dad2_nei->node = sibling2;
4075         dad2_nei->length = sibling2_len;
4076         delete[] sibling1_nei->partial_lh;
4077         delete[] sibling2_nei->partial_lh;
4078         sibling1_nei->partial_lh = sibling1_partial_lh;
4079         sibling2_nei->partial_lh = sibling2_partial_lh;
4080         //clearAllPartialLH();
4081 
4082     }
4083 
4084     FOR_NEIGHBOR_IT(node, dad, it){
4085     double score = optimizeSPR(cur_score, (PhyloNode*) (*it)->node, node);
4086 
4087     if (score > cur_score) return score;
4088 }
4089     return cur_score;
4090 }
4091 
4092 /**
4093  move the subtree (dad1-node1) to the branch (dad2-node2)
4094  */
swapSPR(double cur_score,int cur_depth,PhyloNode * node1,PhyloNode * dad1,PhyloNode * orig_node1,PhyloNode * orig_node2,PhyloNode * node2,PhyloNode * dad2,vector<PhyloNeighbor * > & spr_path)4095 double PhyloTree::swapSPR(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1, PhyloNode *orig_node1,
4096         PhyloNode *orig_node2, PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path) {
4097 
4098     PhyloNeighbor *node1_nei = (PhyloNeighbor*) node1->findNeighbor(dad1);
4099     PhyloNeighbor *dad1_nei = (PhyloNeighbor*) dad1->findNeighbor(node1);
4100     double node1_dad1_len = node1_nei->length;
4101     PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
4102     PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
4103 
4104     //double* node1dad1_lh_save = node1_nei->partial_lh;
4105     //double* dad1node1_lh_save = dad1_nei->partial_lh;
4106     //double node1dad1_scale = node1_nei->lh_scale_factor;
4107     //double dad1node1_scale = dad1_nei->lh_scale_factor;
4108 
4109     double* node2dad2_lh_save = node2_nei->partial_lh;
4110     double* dad2node2_lh_save = dad2_nei->partial_lh;
4111     double node2dad2_scale = node2_nei->lh_scale_factor;
4112     double dad2node_scale = dad2_nei->lh_scale_factor;
4113 
4114     double len2 = node2_nei->length;
4115     double newLen2 = sqrt(len2);
4116 
4117     if (dad2 && cur_depth >= SPR_DEPTH) {
4118         // now, connect (node1-dad1) to the branch (node2-dad2)
4119 
4120         bool first = true;
4121         //PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
4122         //PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
4123         //double len2 = node2_nei->length;
4124 
4125         FOR_NEIGHBOR_IT(dad1, node1, it){
4126         // Finding new 2 neighbors for dad1 that are not node1
4127         if (first) {
4128             (*it)->node = dad2;
4129             //(*it)->length = len2 / 2;
4130             (*it)->length = newLen2;
4131             dad2->updateNeighbor(node2, dad1, newLen2);
4132             first = false;
4133         } else {
4134             (*it)->node = node2;
4135             (*it)->length = newLen2;
4136             node2->updateNeighbor(dad2, dad1, newLen2);
4137         }
4138         // clear all partial likelihood leading from
4139         // dad1 to the new neighbors
4140         ((PhyloNeighbor*) (*it))->clearPartialLh();
4141     }
4142 
4143     // clear partial likelihood from node2 to dad1
4144         node2_nei->clearPartialLh();
4145         // clear partial likelihood from dad2 to dad1
4146         dad2_nei->clearPartialLh();
4147         // clear partial likelihood from dad1 to node1
4148         node1_nei->clearPartialLh();
4149 
4150         // set new legnth as suggested by Alexis
4151         node1_nei->length = 0.9;
4152         dad1_nei->length = 0.9;
4153 
4154         //Save the partial likelihood from the removal point to the insertion point
4155         vector<PhyloNeighbor*>::iterator it2;
4156         vector<double*> saved_partial_lhs(spr_path.size());
4157         for (it2 = spr_path.begin(); it2 != spr_path.end(); it2++) {
4158             saved_partial_lhs.push_back((*it2)->partial_lh);
4159             (*it2)->partial_lh = newPartialLh();
4160             (*it2)->clearPartialLh();
4161         }
4162 
4163         // optimize relevant branches
4164         double score;
4165 
4166         /* testing different branch optimization */
4167         optimizeOneBranch(node1, dad1);
4168         optimizeOneBranch(dad2, dad1);
4169         optimizeOneBranch(node2, dad1);
4170         optimizeOneBranch(orig_node1, orig_node2);
4171         score = computeLikelihoodFromBuffer();
4172 
4173         /*
4174          PhyloNode *cur_node = dad2;
4175          for (int i = spr_path.size()-1; i >= 0; i--) {
4176          score = optimizeOneBranch(cur_node, (PhyloNode*)spr_path[i]->node);
4177          cur_node = (PhyloNode*)spr_path[i]->node;
4178          }
4179          */
4180         //score = optimizeAllBranches(dad1);
4181         // if score improves, return
4182         if (score > cur_score) {
4183             cout << score << endl;
4184             return score;
4185         }
4186 
4187         // else, swap back
4188         node2->updateNeighbor(dad1, dad2, len2);
4189         dad2->updateNeighbor(dad1, node2, len2);
4190         //node2_nei->clearPartialLh();
4191         //dad2_nei->clearPartialLh();
4192         // restore partial likelihood vectors
4193         node2_nei->partial_lh = node2dad2_lh_save;
4194         node2_nei->lh_scale_factor = node2dad2_scale;
4195         dad2_nei->partial_lh = dad2node2_lh_save;
4196         dad2_nei->lh_scale_factor = dad2node_scale;
4197         node2_nei->length = len2;
4198         dad2_nei->length = len2;
4199         node1_nei->length = node1_dad1_len;
4200         dad1_nei->length = node1_dad1_len;
4201         int index = 0;
4202         for (it2 = spr_path.begin(); it2 != spr_path.end(); it2++) {
4203             delete[] (*it2)->partial_lh;
4204             (*it2)->partial_lh = saved_partial_lhs.at(index);
4205             (*it2)->unclearPartialLh();
4206             index++;
4207         }
4208 
4209         // add to candiate SPR moves
4210         // Tung : why adding negative SPR move ?
4211         spr_moves.add(node1, dad1, node2, dad2, score);
4212     }
4213     if (cur_depth >= spr_radius)
4214 
4215         return cur_score;
4216     spr_path.push_back(node2_nei);
4217 
4218     FOR_NEIGHBOR_IT(node2, dad2, it){
4219     double score = swapSPR(cur_score, cur_depth + 1, node1, dad1, orig_node1, orig_node2, (PhyloNode*) (*it)->node, node2, spr_path);
4220     if (score > cur_score) return score;
4221 }
4222     spr_path.pop_back();
4223 
4224     return cur_score;
4225 }
4226 
assessSPRMove(double cur_score,const SPRMove & spr)4227 double PhyloTree::assessSPRMove(double cur_score, const SPRMove &spr) {
4228 
4229     PhyloNode *dad = spr.prune_dad;
4230     PhyloNode *node = spr.prune_node;
4231     PhyloNode *dad2 = spr.regraft_dad;
4232     PhyloNode *node2 = spr.regraft_node;
4233 
4234     PhyloNeighbor *dad_nei1 = NULL;
4235     PhyloNeighbor *dad_nei2 = NULL;
4236     PhyloNode *sibling1 = NULL;
4237     PhyloNode *sibling2 = NULL;
4238     double sibling1_len = 0.0, sibling2_len = 0.0;
4239 
4240     PhyloNeighbor *node1_nei = (PhyloNeighbor*) node->findNeighbor(dad);
4241     PhyloNeighbor *dad1_nei = (PhyloNeighbor*) dad->findNeighbor(node);
4242     double node1_dad1_len = node1_nei->length;
4243 
4244     // assign the sibling of node, with respect to dad
4245 
4246     FOR_NEIGHBOR_DECLARE(dad, node, it) {
4247         if (!sibling1) {
4248             dad_nei1 = (PhyloNeighbor*) (*it);
4249             sibling1 = (PhyloNode*) (*it)->node;
4250             sibling1_len = (*it)->length;
4251         } else {
4252 
4253             dad_nei2 = (PhyloNeighbor*) (*it);
4254             sibling2 = (PhyloNode*) (*it)->node;
4255             sibling2_len = (*it)->length;
4256         }
4257     }
4258     // remove the subtree leading to node
4259     double sum_len = sibling1_len + sibling2_len;
4260     sibling1->updateNeighbor(dad, sibling2, sum_len);
4261     sibling2->updateNeighbor(dad, sibling1, sum_len);
4262     // now try to move the subtree to somewhere else
4263 
4264     bool first = true;
4265     PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
4266     //PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
4267     double len2 = node2_nei->length;
4268 
4269     FOR_NEIGHBOR(dad, node, it)
4270     {
4271         if (first) {
4272             (*it)->node = dad2;
4273             (*it)->length = len2 / 2;
4274             dad2->updateNeighbor(node2, dad, len2 / 2);
4275             first = false;
4276         } else {
4277             (*it)->node = node2;
4278             (*it)->length = len2 / 2;
4279             node2->updateNeighbor(dad2, dad, len2 / 2);
4280         }
4281         ((PhyloNeighbor*) (*it))->clearPartialLh();
4282     }
4283 
4284     clearAllPartialLH();
4285     // optimize branches
4286     double score;
4287     optimizeAllBranches(dad);
4288     score = computeLikelihoodBranch((PhyloNeighbor*)dad->neighbors.back(), dad);
4289 
4290     // if score improves, return
4291     if (score > cur_score)
4292         return score;
4293     // else, swap back
4294     node2->updateNeighbor(dad, dad2, len2);
4295     dad2->updateNeighbor(dad, node2, len2);
4296 
4297     node1_nei->length = node1_dad1_len;
4298     dad1_nei->length = node1_dad1_len;
4299 
4300     sibling1->updateNeighbor(sibling2, dad, sibling1_len);
4301     sibling2->updateNeighbor(sibling1, dad, sibling2_len);
4302     dad_nei1->node = sibling1;
4303     dad_nei1->length = sibling1_len;
4304     dad_nei2->node = sibling2;
4305     dad_nei2->length = sibling2_len;
4306     clearAllPartialLH();
4307 
4308     return cur_score;
4309 
4310 }
4311 
optimizeSPR()4312 double PhyloTree::optimizeSPR() {
4313     double cur_score = computeLikelihood();
4314     //spr_radius = leafNum / 5;
4315     spr_radius = 10;
4316     for (int i = 0; i < 100; i++) {
4317         cout << "i = " << i << endl;
4318         spr_moves.clear();
4319         double score = optimizeSPR_old(cur_score, (PhyloNode*) root->neighbors[0]->node);
4320         clearAllPartialLH();
4321         // why this?
4322         if (score <= cur_score) {
4323             for (SPRMoves::iterator it = spr_moves.begin(); it != spr_moves.end(); it++) {
4324                 //cout << (*it).score << endl;
4325                 score = assessSPRMove(cur_score, *it);
4326                 // if likelihood score improves, apply to SPR
4327                 if (score > cur_score)
4328                     break;
4329             }
4330             if (score <= cur_score)
4331                 break;
4332         } else {
4333 
4334             cur_score = optimizeAllBranches();
4335             cout << "SPR " << i + 1 << " : " << cur_score << endl;
4336             cur_score = score;
4337         }
4338     }
4339     return cur_score;
4340     //return optimizeAllBranches();
4341 }
4342 
optimizeSPRBranches()4343 double PhyloTree::optimizeSPRBranches() {
4344     cout << "Search with Subtree Pruning and Regrafting (SPR) using ML..." << endl;
4345     double cur_score = computeLikelihood();
4346     for (int i = 0; i < 100; i++) {
4347         double score = optimizeSPR();
4348         if (score <= cur_score + TOL_LIKELIHOOD)
4349 
4350             break;
4351         cur_score = score;
4352     }
4353     return cur_score;
4354 }
4355 
pruneSubtree(PhyloNode * node,PhyloNode * dad,PruningInfo & info)4356 void PhyloTree::pruneSubtree(PhyloNode *node, PhyloNode *dad, PruningInfo &info) {
4357 
4358     bool first = true;
4359     info.node = node;
4360     info.dad = dad;
4361 
4362     FOR_NEIGHBOR_IT(dad, node, it){
4363     if (first) {
4364         info.dad_it_left = it;
4365         info.dad_nei_left = (*it);
4366         info.dad_lh_left = ((PhyloNeighbor*) (*it))->partial_lh;
4367         info.left_node = (*it)->node;
4368         info.left_len = (*it)->length;
4369         first = false;
4370     } else {
4371 
4372         info.dad_it_right = it;
4373         info.dad_nei_right = (*it);
4374         info.dad_lh_right = ((PhyloNeighbor*) (*it))->partial_lh;
4375         info.right_node = (*it)->node;
4376         info.right_len = (*it)->length;
4377     }
4378 }
4379     info.left_it = info.left_node->findNeighborIt(dad);
4380     info.right_it = info.right_node->findNeighborIt(dad);
4381     info.left_nei = (*info.left_it);
4382     info.right_nei = (*info.right_it);
4383 
4384     info.left_node->updateNeighbor(info.left_it, info.dad_nei_right);
4385     info.right_node->updateNeighbor(info.right_it, info.dad_nei_left);
4386     ((PhyloNeighbor*) info.dad_nei_right)->partial_lh = newPartialLh();
4387     ((PhyloNeighbor*) info.dad_nei_left)->partial_lh = newPartialLh();
4388 }
4389 
regraftSubtree(PruningInfo & info,PhyloNode * in_node,PhyloNode * in_dad)4390 void PhyloTree::regraftSubtree(PruningInfo &info, PhyloNode *in_node, PhyloNode *in_dad) {
4391 
4392     NeighborVec::iterator in_node_it = in_node->findNeighborIt(in_dad);
4393     NeighborVec::iterator in_dad_it = in_dad->findNeighborIt(in_node);
4394     Neighbor *in_dad_nei = (*in_dad_it);
4395     Neighbor *in_node_nei = (*in_node_it);
4396     //double in_len = in_dad_nei->length;
4397     info.dad->updateNeighbor(info.dad_it_right, in_dad_nei);
4398     info.dad->updateNeighbor(info.dad_it_left, in_node_nei);
4399     // SOMETHING NEED TO BE DONE
4400     //in_dad->updateNeighbor(in_dad_it,
4401 
4402 }
4403 
4404 /****************************************************************************
4405  Approximate Likelihood Ratio Test with SH-like interpretation
4406  ****************************************************************************/
4407 
4408 /*void PhyloTree::computeNNIPatternLh(double cur_lh, double &lh2, double *pattern_lh2, double &lh3, double *pattern_lh3,
4409         PhyloNode *node1, PhyloNode *node2) {
4410 
4411     assert(node1->degree() == 3 && node2->degree() == 3);
4412 
4413     // recompute pattern scaling factors if necessary
4414     PhyloNeighbor *node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
4415     PhyloNeighbor *node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
4416     NeighborVec::iterator it;
4417     const int IT_NUM = 6;
4418 
4419     NeighborVec::iterator saved_it[IT_NUM];
4420     int id = 0;
4421 
4422     FOR_NEIGHBOR(node1, node2, it)
4423     {
4424         saved_it[id++] = (*it)->node->findNeighborIt(node1);
4425     } else {
4426 
4427         saved_it[id++] = it;
4428     }
4429 
4430     FOR_NEIGHBOR(node2, node1, it)
4431     {
4432         saved_it[id++] = (*it)->node->findNeighborIt(node2);
4433     } else {
4434         saved_it[id++] = it;
4435     }
4436     assert(id == IT_NUM);
4437 
4438     Neighbor * saved_nei[IT_NUM];
4439     // save Neighbor and allocate new Neighbor pointer
4440     for (id = 0; id < IT_NUM; id++) {
4441         saved_nei[id] = (*saved_it[id]);
4442         // NOTE BUG DOWN HERE!
4443         *saved_it[id] = new PhyloNeighbor(saved_nei[id]->node, saved_nei[id]->length); // BUG for PhyloSuperTree!
4444         ((PhyloNeighbor*) (*saved_it[id]))->partial_lh = newPartialLh();
4445         ((PhyloNeighbor*) (*saved_it[id]))->scale_num = newScaleNum();
4446     }
4447 
4448     // get the Neighbor again since it is replaced for saving purpose
4449     node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
4450     node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
4451 
4452     // PhyloNeighbor *node2_lastnei = NULL;
4453 
4454     // save the first found neighbor of node 1 (excluding node2) in node1_it
4455     FOR_NEIGHBOR_DECLARE(node1, node2, node1_it)
4456 
4457         break;
4458     Neighbor *node1_nei = *node1_it;
4459 
4460     bool first = true;
4461 
4462     FOR_NEIGHBOR_IT(node2, node1, node2_it) {
4463         // do the NNI swap
4464         Neighbor *node2_nei = *node2_it;
4465         node1->updateNeighbor(node1_it, node2_nei);
4466         node2_nei->node->updateNeighbor(node2, node1);
4467 
4468         node2->updateNeighbor(node2_it, node1_nei);
4469         node1_nei->node->updateNeighbor(node1, node2);
4470 
4471         // re-optimize five adjacent branches
4472         double old_score = -INFINITY, new_score = old_score;
4473 
4474         // clear partial likelihood vector
4475         node12_it->clearPartialLh();
4476         node21_it->clearPartialLh();
4477         int i;
4478         for (i = 0; i < 2; i++) {
4479 
4480             new_score = optimizeOneBranch(node1, node2, false);
4481 
4482             FOR_NEIGHBOR(node1, node2, it) {
4483                 //for (id = 0; id < IT_NUM; id++)
4484                 //((PhyloNeighbor*)(*saved_it[id]))->clearPartialLh();
4485                 ((PhyloNeighbor*) (*it)->node->findNeighbor(node1))->clearPartialLh();
4486                 new_score = optimizeOneBranch(node1, (PhyloNode*) (*it)->node, false);
4487             }
4488 
4489             node21_it->clearPartialLh();
4490 
4491             FOR_NEIGHBOR(node2, node1, it) {
4492                 //for (id = 0; id < IT_NUM; id++)
4493                 //((PhyloNeighbor*)(*saved_it[id]))->clearPartialLh();
4494                 ((PhyloNeighbor*) (*it)->node->findNeighbor(node2))->clearPartialLh();
4495                 new_score = optimizeOneBranch(node2, (PhyloNode*) (*it)->node, false);
4496                 //node2_lastnei = (PhyloNeighbor*) (*it);
4497             }
4498             node12_it->clearPartialLh();
4499             if (new_score < old_score + TOL_LIKELIHOOD) break;
4500             old_score = new_score;
4501         }
4502         saveCurrentTree(new_score); // BQM: for new bootstrap
4503 
4504         //new_score = optimizeOneBranch(node1, node2, false);
4505         if (new_score > cur_lh + TOL_LIKELIHOOD)
4506         cout << "Alternative NNI shows better likelihood " << new_score << " > " << cur_lh << endl;
4507         double *result_lh;
4508         if (first) {
4509             result_lh = pattern_lh2;
4510             lh2 = new_score;
4511         } else {
4512             result_lh = pattern_lh3;
4513             lh3 = new_score;
4514         }
4515         old_score = new_score;
4516         computePatternLikelihood(result_lh);
4517         // swap back and recover the branch lengths
4518         node1->updateNeighbor(node1_it, node1_nei);
4519         node1_nei->node->updateNeighbor(node2, node1);
4520         node2->updateNeighbor(node2_it, node2_nei);
4521         node2_nei->node->updateNeighbor(node1, node2);
4522         first = false;
4523     }
4524 
4525 // restore the Neighbor*
4526     for (id = 0; id < IT_NUM; id++) {
4527 
4528         delete[] ((PhyloNeighbor*) *saved_it[id])->scale_num;
4529         delete[] ((PhyloNeighbor*) *saved_it[id])->partial_lh;
4530         delete (*saved_it[id]);
4531         (*saved_it[id]) = saved_nei[id];
4532     }
4533 
4534     // restore the length of 4 branches around node1, node2
4535     FOR_NEIGHBOR(node1, node2, it)
4536         (*it)->length = (*it)->node->findNeighbor(node1)->length;
4537     FOR_NEIGHBOR(node2, node1, it)
4538         (*it)->length = (*it)->node->findNeighbor(node2)->length;
4539 }*/
4540 
computeNNIPatternLh(double cur_lh,double & lh2,double * pattern_lh2,double & lh3,double * pattern_lh3,PhyloNode * node1,PhyloNode * node2)4541 void PhyloTree::computeNNIPatternLh(double cur_lh, double &lh2, double *pattern_lh2, double &lh3, double *pattern_lh3,
4542         PhyloNode *node1, PhyloNode *node2) {
4543     NNIMove nniMoves[2];
4544     nniMoves[0].ptnlh = pattern_lh2;
4545     nniMoves[1].ptnlh = pattern_lh3;
4546     bool nni5 = params->nni5;
4547     params->nni5 = true; // always optimize 5 branches for accurate SH-aLRT
4548     nniMoves[0].node1 = nniMoves[1].node1 = NULL;
4549     nniMoves[0].node2 = nniMoves[1].node2 = NULL;
4550     getBestNNIForBran(node1, node2, nniMoves);
4551     params->nni5 = nni5;
4552     lh2 = nniMoves[0].newloglh;
4553     lh3 = nniMoves[1].newloglh;
4554     if (max(lh2,lh3) > cur_lh + TOL_LIKELIHOOD)
4555         cout << "Alternative NNI shows better log-likelihood " << max(lh2,lh3) << " > " << cur_lh << endl;
4556 }
4557 
resampleLh(double ** pat_lh,double * lh_new,int * rstream)4558 void PhyloTree::resampleLh(double **pat_lh, double *lh_new, int *rstream) {
4559     //int nsite = getAlnNSite();
4560     int nptn = getAlnNPattern();
4561     memset(lh_new, 0, sizeof(double) * 3);
4562     int i;
4563     int *boot_freq = aligned_alloc<int>(getAlnNPattern());
4564     aln->createBootstrapAlignment(boot_freq, params->bootstrap_spec, rstream);
4565     for (i = 0; i < nptn; i++) {
4566 
4567         lh_new[0] += boot_freq[i] * pat_lh[0][i];
4568         lh_new[1] += boot_freq[i] * pat_lh[1][i];
4569         lh_new[2] += boot_freq[i] * pat_lh[2][i];
4570     }
4571     aligned_free(boot_freq);
4572 }
4573 
4574 /*********************************************************/
4575 /** THIS FUNCTION IS TAKEN FROM PHYML source code alrt.c
4576 * Convert an aLRT statistic to a none parametric support
4577 * param in: the statistic
4578 */
4579 
Statistics_To_Probabilities(double in)4580 double Statistics_To_Probabilities(double in)
4581 {
4582   double rough_value=0.0;
4583   double a=0.0;
4584   double b=0.0;
4585   double fa=0.0;
4586   double fb=0.0;
4587 
4588   if(in>=0.000000393 && in<0.00000157)
4589     {
4590       a=0.000000393;
4591       b=0.00000157;
4592       fa=0.0005;
4593       fb=0.001;
4594     }
4595   else if(in>=0.00000157 && in<0.0000393)
4596     {
4597       a=0.00000157;
4598       b=0.0000393;
4599       fa=0.001;
4600       fb=0.005;
4601     }
4602   else if(in>=0.0000393 && in<0.000157)
4603     {
4604       a=0.0000393;
4605       b=0.000157;
4606       fa=0.005;
4607       fb=0.01;
4608     }
4609   else if(in>=0.000157 && in<0.000982)
4610     {
4611       a=0.000157;
4612       b=0.000982;
4613       fa=0.01;
4614       fb=0.025;
4615     }
4616   else if(in>0.000982 && in<0.00393)
4617     {
4618       a=0.000982;
4619       b=0.00393;
4620       fa=0.025;
4621       fb=0.05;
4622     }
4623   else if(in>=0.00393 && in<0.0158)
4624     {
4625       a=0.00393;
4626       b=0.0158;
4627       fa=0.05;
4628       fb=0.1;
4629     }
4630   else if(in>=0.0158 && in<0.0642)
4631     {
4632       a=0.0158;
4633       b=0.0642;
4634       fa=0.1;
4635       fb=0.2;
4636     }
4637   else if(in>=0.0642 && in<0.148)
4638     {
4639       a=0.0642;
4640       b=0.148;
4641       fa=0.2;
4642       fb=0.3;
4643     }
4644   else if(in>=0.148 && in<0.275)
4645     {
4646       a=0.148;
4647       b=0.275;
4648       fa=0.3;
4649       fb=0.4;
4650     }
4651   else if(in>=0.275 && in<0.455)
4652     {
4653       a=0.275;
4654       b=0.455;
4655       fa=0.4;
4656       fb=0.5;
4657     }
4658   else if(in>=0.455 && in<0.708)
4659     {
4660       a=0.455;
4661       b=0.708;
4662       fa=0.5;
4663       fb=0.6;
4664     }
4665   else if(in>=0.708 && in<1.074)
4666     {
4667       a=0.708;
4668       b=1.074;
4669       fa=0.6;
4670       fb=0.7;
4671     }
4672   else if(in>=1.074 && in<1.642)
4673     {
4674       a=1.074;
4675       b=1.642;
4676       fa=0.7;
4677       fb=0.8;
4678     }
4679   else if(in>=1.642 && in<2.706)
4680     {
4681       a=1.642;
4682       b=2.706;
4683       fa=0.8;
4684       fb=0.9;
4685     }
4686   else if(in>=2.706 && in<3.841)
4687     {
4688       a=2.706;
4689       b=3.841;
4690       fa=0.9;
4691       fb=0.95;
4692     }
4693   else if(in>=3.841 && in<5.024)
4694     {
4695       a=3.841;
4696       b=5.024;
4697       fa=0.95;
4698       fb=0.975;
4699     }
4700   else if(in>=5.024 && in<6.635)
4701     {
4702       a=5.024;
4703       b=6.635;
4704       fa=0.975;
4705       fb=0.99;
4706     }
4707   else if(in>=6.635 && in<7.879)
4708     {
4709       a=6.635;
4710       b=7.879;
4711       fa=0.99;
4712       fb=0.995;
4713     }
4714   else if(in>=7.879 && in<10.828)
4715     {
4716       a=7.879;
4717       b=10.828;
4718       fa=0.995;
4719       fb=0.999;
4720     }
4721   else if(in>=10.828 && in<12.116)
4722     {
4723       a=10.828;
4724       b=12.116;
4725       fa=0.999;
4726       fb=0.9995;
4727     }
4728   if (in>=12.116)
4729     {
4730       rough_value=0.9999;
4731     }
4732   else if(in<0.000000393)
4733     {
4734       rough_value=0.0001;
4735     }
4736   else
4737     {
4738       rough_value=(b-in)/(b-a)*fa + (in - a)/(b-a)*fb;
4739     }
4740   rough_value=rough_value+(1.0-rough_value)/2.0;
4741   rough_value=rough_value*rough_value*rough_value;
4742   return rough_value;
4743 }
4744 
4745 // Implementation of testBranch follows Guindon et al. (2010)
4746 
testOneBranch(double best_score,double * pattern_lh,int reps,int lbp_reps,PhyloNode * node1,PhyloNode * node2,double & lbp_support,double & aLRT_support,double & aBayes_support)4747 double PhyloTree::testOneBranch(double best_score, double *pattern_lh, int reps, int lbp_reps,
4748         PhyloNode *node1, PhyloNode *node2, double &lbp_support, double &aLRT_support, double &aBayes_support) {
4749     const int NUM_NNI = 3;
4750     double lh[NUM_NNI];
4751     double *pat_lh[NUM_NNI];
4752     lh[0] = best_score;
4753     pat_lh[0] = pattern_lh;
4754     int nptn = getAlnNPattern();
4755     pat_lh[1] = new double[nptn];
4756     pat_lh[2] = new double[nptn];
4757     int tmp = save_all_trees;
4758     save_all_trees = 0;
4759     computeNNIPatternLh(best_score, lh[1], pat_lh[1], lh[2], pat_lh[2], node1, node2);
4760     save_all_trees = tmp;
4761     double aLRT;
4762     if (lh[1] > lh[2])
4763         aLRT = (lh[0] - lh[1]);
4764     else
4765         aLRT = (lh[0] - lh[2]);
4766 
4767     // compute parametric aLRT test support
4768     double aLRT_stat = 2*aLRT;
4769     aLRT_support = 0.0;
4770     if (aLRT_stat >= 0) {
4771         aLRT_support = Statistics_To_Probabilities(aLRT_stat);
4772     }
4773 
4774     aBayes_support = 1.0 / (1.0 + exp(lh[1]-lh[0]) + exp(lh[2]-lh[0]));
4775 
4776     int SH_aLRT_support = 0;
4777     int lbp_support_int = 0;
4778 
4779     int times = max(reps, lbp_reps);
4780 
4781     if (max(lh[1],lh[2]) == -DBL_MAX) {
4782         SH_aLRT_support = times;
4783         outWarning("Branch where both NNIs violate constraint tree will show 100% SH-aLRT support");
4784     } else
4785 #ifdef _OPENMP
4786 #pragma omp parallel
4787     {
4788         int *rstream;
4789         init_random(params->ran_seed + omp_get_thread_num(), false, &rstream);
4790 #pragma omp for reduction(+: lbp_support_int, SH_aLRT_support)
4791 #endif
4792     for (int i = 0; i < times; i++) {
4793         double lh_new[NUM_NNI];
4794         // resampling estimated log-likelihood (RELL)
4795 #ifdef _OPENMP
4796         resampleLh(pat_lh, lh_new, rstream);
4797 #else
4798         resampleLh(pat_lh, lh_new, randstream);
4799 #endif
4800         if (lh_new[0] > lh_new[1] && lh_new[0] > lh_new[2])
4801             lbp_support_int++;
4802         double cs[NUM_NNI], cs_best, cs_2nd_best;
4803         cs[0] = lh_new[0] - lh[0];
4804         cs[1] = lh_new[1] - lh[1];
4805         cs[2] = lh_new[2] - lh[2];
4806         if (cs[0] >= cs[1] && cs[0] >= cs[2]) {
4807             cs_best = cs[0];
4808             if (cs[1] > cs[2])
4809                 cs_2nd_best = cs[1];
4810             else
4811                 cs_2nd_best = cs[2];
4812         } else if (cs[1] >= cs[2]) {
4813             cs_best = cs[1];
4814             if (cs[0] > cs[2])
4815                 cs_2nd_best = cs[0];
4816             else
4817                 cs_2nd_best = cs[2];
4818         } else {
4819             cs_best = cs[2];
4820             if (cs[0] > cs[1])
4821                 cs_2nd_best = cs[0];
4822             else
4823                 cs_2nd_best = cs[1];
4824         }
4825         if (aLRT > (cs_best - cs_2nd_best) + 0.05)
4826             SH_aLRT_support++;
4827     }
4828 #ifdef _OPENMP
4829     finish_random(rstream);
4830     }
4831 #endif
4832     delete[] pat_lh[2];
4833     delete[] pat_lh[1];
4834 
4835     lbp_support = 0.0;
4836     if (times > 0)
4837         lbp_support = ((double)lbp_support_int) / times;
4838 
4839     if (times > 0)
4840         return ((double) SH_aLRT_support) / times;
4841     else
4842         return 0.0;
4843 }
4844 
testAllBranches(int threshold,double best_score,double * pattern_lh,int reps,int lbp_reps,bool aLRT_test,bool aBayes_test,PhyloNode * node,PhyloNode * dad)4845 int PhyloTree::testAllBranches(int threshold, double best_score, double *pattern_lh, int reps, int lbp_reps, bool aLRT_test, bool aBayes_test,
4846         PhyloNode *node, PhyloNode *dad) {
4847     int num_low_support = 0;
4848     if (!node) {
4849         node = (PhyloNode*) root;
4850         root->neighbors[0]->node->name = "";
4851         if (isSuperTree()) {
4852             int tmp = save_all_trees;
4853             save_all_trees = 2;
4854             bool nni5 = params->nni5;
4855             params->nni5 = true; // always optimize 5 branches for accurate SH-aLRT
4856             initPartitionInfo();
4857             params->nni5 = nni5;
4858             save_all_trees = tmp;
4859         }
4860     }
4861     if (dad && !node->isLeaf() && !dad->isLeaf()) {
4862         double lbp_support, aLRT_support, aBayes_support;
4863         double SH_aLRT_support = (testOneBranch(best_score, pattern_lh, reps, lbp_reps,
4864             node, dad, lbp_support, aLRT_support, aBayes_support) * 100);
4865         ostringstream ss;
4866         ss.precision(3);
4867         ss << node->name;
4868         if (!node->name.empty())
4869             ss << "/";
4870         if (reps)
4871             ss << SH_aLRT_support;
4872         if (lbp_reps)
4873             ss << "/" << lbp_support * 100;
4874         if (aLRT_test)
4875             ss << "/" << aLRT_support;
4876         if (aBayes_test)
4877             ss << "/" << aBayes_support;
4878         node->name = ss.str();
4879         if (SH_aLRT_support < threshold)
4880             num_low_support = 1;
4881         if (((PhyloNeighbor*) node->findNeighbor(dad))->partial_pars) {
4882             ((PhyloNeighbor*) node->findNeighbor(dad))->partial_pars[0] = round(SH_aLRT_support);
4883             ((PhyloNeighbor*) dad->findNeighbor(node))->partial_pars[0] = round(SH_aLRT_support);
4884         }
4885     }
4886     FOR_NEIGHBOR_IT(node, dad, it)
4887         num_low_support += testAllBranches(threshold, best_score, pattern_lh, reps, lbp_reps, aLRT_test, aBayes_test, (PhyloNode*) (*it)->node, node);
4888 
4889     return num_low_support;
4890 }
4891 
4892 /****************************************************************************
4893  Collapse stable (highly supported) clades by one representative
4894  ****************************************************************************/
4895 
deleteLeaf(Node * leaf)4896 void PhyloTree::deleteLeaf(Node *leaf) {
4897 
4898     Node *near_node = leaf->neighbors[0]->node;
4899     ASSERT(leaf->isLeaf() && near_node->degree() == 3);
4900     Node *node1 = NULL;
4901     Node *node2 = NULL;
4902     double sum_len = 0.0;
4903 
4904     FOR_NEIGHBOR_IT(near_node, leaf, it){
4905     sum_len += (*it)->length;
4906     if (!node1)
4907     node1 = (*it)->node;
4908 
4909     else
4910     node2 = (*it)->node;
4911 }
4912 // make sure that the returned node1 and node2 are correct
4913     ASSERT(node1 && node2);
4914     // update the neighbor
4915     node1->updateNeighbor(near_node, node2, sum_len);
4916     node2->updateNeighbor(near_node, node1, sum_len);
4917 }
4918 
reinsertLeaf(Node * leaf,Node * node,Node * dad)4919 void PhyloTree::reinsertLeaf(Node *leaf, Node *node, Node *dad) {
4920 
4921     bool first = true;
4922     Node *adjacent_node = leaf->neighbors[0]->node;
4923     Neighbor *nei = node->findNeighbor(dad);
4924     //double len = nei->length;
4925     double len = max(nei->length, params->min_branch_length * 2);
4926     // to avoid too small branch length when reinserting leaf
4927 
4928     FOR_NEIGHBOR_IT(adjacent_node, leaf, it){
4929         if (first) {
4930             (*it)->node = node;
4931             (*it)->length = len / 2;
4932             node->updateNeighbor(dad, adjacent_node, len / 2);
4933         } else {
4934             (*it)->node = dad;
4935             (*it)->length = len / 2;
4936             dad->updateNeighbor(node, adjacent_node, len / 2);
4937         }
4938         first = false;
4939     }
4940 }
4941 
isSupportedNode(PhyloNode * node,int min_support)4942 bool PhyloTree::isSupportedNode(PhyloNode* node, int min_support) {
4943     FOR_NEIGHBOR_IT(node, NULL, it)if (!(*it)->node->isLeaf())
4944     if (((PhyloNeighbor*) * it)->partial_pars[0] < min_support) {
4945 
4946         return false;
4947     }
4948     return true;
4949 }
4950 
collapseStableClade(int min_support,NodeVector & pruned_taxa,StrVector & linked_name,double * & dist_mat)4951 int PhyloTree::collapseStableClade(int min_support, NodeVector &pruned_taxa, StrVector &linked_name,
4952         double* &dist_mat) {
4953     NodeVector taxa;
4954     NodeVector::iterator tax_it;
4955     StrVector::iterator linked_it;
4956     getTaxa(taxa);
4957     IntVector linked_taxid;
4958     linked_taxid.resize(leafNum, -1);
4959     int num_pruned_taxa; // global num of pruned taxa
4960     int ntaxa = leafNum;
4961     do {
4962         num_pruned_taxa = 0;
4963         for (tax_it = taxa.begin(); tax_it != taxa.end(); tax_it++)
4964             if (linked_taxid[(*tax_it)->id] < 0) {
4965                 Node *taxon = (*tax_it);
4966                 PhyloNode *near_node = (PhyloNode*) taxon->neighbors[0]->node;
4967                 Node *adj_taxon = NULL;
4968                 FOR_NEIGHBOR_DECLARE(near_node, taxon, it)
4969                     if ((*it)->node->isLeaf()) {
4970                         adj_taxon = (*it)->node;
4971                         break;
4972                     }
4973                 // if it is not a cherry
4974                 if (!adj_taxon)
4975                     continue;
4976                 ASSERT(linked_taxid[adj_taxon->id] < 0);
4977                 PhyloNeighbor * near_nei = NULL;
4978                 FOR_NEIGHBOR(near_node, taxon, it)
4979                     if ((*it)->node != adj_taxon) {
4980                         near_nei = (PhyloNeighbor*) (*it);
4981                         break;
4982                     }
4983                 ASSERT(near_nei);
4984                 // continue if the cherry is not stable, or distance between two taxa is near ZERO
4985                 if (!isSupportedNode((PhyloNode*) near_nei->node, min_support)
4986                         && dist_mat[taxon->id * ntaxa + adj_taxon->id] > 2e-6)
4987                     continue;
4988                 // now do the taxon pruning
4989                 Node * pruned_taxon = taxon, *stayed_taxon = adj_taxon;
4990                 // prune the taxon that is far away
4991                 if (adj_taxon->neighbors[0]->length > taxon->neighbors[0]->length) {
4992                     pruned_taxon = adj_taxon;
4993                     stayed_taxon = taxon;
4994                 }
4995                 deleteLeaf(pruned_taxon);
4996                 linked_taxid[pruned_taxon->id] = stayed_taxon->id;
4997                 pruned_taxa.push_back(pruned_taxon);
4998                 linked_name.push_back(stayed_taxon->name);
4999                 num_pruned_taxa++;
5000                 // do not prune more than n-4 taxa
5001                 if (pruned_taxa.size() >= ntaxa - 4)
5002                     break;
5003             }
5004     } while (num_pruned_taxa && pruned_taxa.size() < ntaxa - 4);
5005 
5006     if (pruned_taxa.empty())
5007         return 0;
5008 
5009     if (verbose_mode >= VB_MED)
5010         for (tax_it = pruned_taxa.begin(), linked_it = linked_name.begin(); tax_it != pruned_taxa.end();
5011                 tax_it++, linked_it++)
5012             cout << "Delete " << (*tax_it)->name << " from " << (*linked_it) << endl;
5013 
5014     // set root to the first taxon which was not deleted
5015     for (tax_it = taxa.begin(); tax_it != taxa.end(); tax_it++)
5016         if (linked_taxid[(*tax_it)->id] < 0) {
5017             root = (*tax_it);
5018             break;
5019         }
5020     // extract the sub alignment
5021     IntVector stayed_id;
5022     int i, j;
5023     for (i = 0; i < taxa.size(); i++)
5024         if (linked_taxid[i] < 0)
5025             stayed_id.push_back(i);
5026     ASSERT(stayed_id.size() + pruned_taxa.size() == leafNum);
5027     Alignment * pruned_aln = new Alignment();
5028     pruned_aln->extractSubAlignment(aln, stayed_id, 2); // at least 2 informative characters
5029     nodeNum = leafNum = stayed_id.size();
5030     initializeTree();
5031     setAlignment(pruned_aln);
5032 
5033     double *pruned_dist = new double[leafNum * leafNum];
5034     for (i = 0; i < leafNum; i++)
5035         for (j = 0; j < leafNum; j++)
5036             pruned_dist[i * leafNum + j] = dist_mat[stayed_id[i] * ntaxa + stayed_id[j]];
5037     dist_mat = pruned_dist;
5038 
5039     return pruned_taxa.size();
5040 }
5041 
restoreStableClade(Alignment * original_aln,NodeVector & pruned_taxa,StrVector & linked_name)5042 int PhyloTree::restoreStableClade(Alignment *original_aln, NodeVector &pruned_taxa, StrVector &linked_name) {
5043     //int num_inserted_taxa;
5044     NodeVector::reverse_iterator tax_it;
5045     StrVector::reverse_iterator linked_it;
5046     tax_it = pruned_taxa.rbegin();
5047     linked_it = linked_name.rbegin();
5048     for (; tax_it != pruned_taxa.rend(); tax_it++, linked_it++) {
5049         //cout << "Reinsert " << (*tax_it)->name << " to " << (*linked_it) << endl;
5050         Node *linked_taxon = findNodeName((*linked_it));
5051         ASSERT(linked_taxon);
5052         ASSERT(linked_taxon->isLeaf());
5053         leafNum++;
5054         reinsertLeaf((*tax_it), linked_taxon, linked_taxon->neighbors[0]->node);
5055     }
5056     ASSERT(leafNum == original_aln->getNSeq());
5057     nodeNum = leafNum;
5058     initializeTree();
5059     setAlignment(original_aln);
5060     setRootNode(params->root);
5061     //if (verbose_mode >= VB_MED) drawTree(cout);
5062 
5063     return 0;
5064 }
5065 
checkEqualScalingFactor(double & sum_scaling,PhyloNode * node,PhyloNode * dad)5066 bool PhyloTree::checkEqualScalingFactor(double &sum_scaling, PhyloNode *node, PhyloNode *dad) {
5067     if (!node)
5068         node = (PhyloNode*) root;
5069     if (dad) {
5070         double scaling = ((PhyloNeighbor*) node->findNeighbor(dad))->lh_scale_factor
5071                 + ((PhyloNeighbor*) dad->findNeighbor(node))->lh_scale_factor;
5072         if (sum_scaling > 0)
5073             sum_scaling = scaling;
5074         if (fabs(sum_scaling - scaling) > 1e-6) {
5075             cout << sum_scaling << " " << scaling << endl;
5076             return false;
5077         }
5078     }
5079     FOR_NEIGHBOR_IT(node, dad, it)if (!checkEqualScalingFactor(sum_scaling, (PhyloNode*) (*it)->node, node)) return false;
5080 
5081     return true;
5082 }
5083 
randomizeNeighbors(Node * node,Node * dad)5084 void PhyloTree::randomizeNeighbors(Node *node, Node *dad) {
5085 
5086     if (!node)
5087         node = root;
5088     FOR_NEIGHBOR_IT(node, dad, it)randomizeNeighbors((*it)->node, node);
5089 
5090     my_random_shuffle(node->neighbors.begin(), node->neighbors.end());
5091 }
5092 
printTransMatrices(Node * node,Node * dad)5093 void PhyloTree::printTransMatrices(Node *node, Node *dad) {
5094     if (!node)
5095         node = root;
5096     int nstates = aln->num_states;
5097 
5098     if (dad) {
5099         double *trans_cat = new double[nstates * nstates];
5100         model_factory->computeTransMatrix(dad->findNeighbor(node)->length * site_rate->getRate(0), trans_cat);
5101         cout << "Transition matrix " << dad->name << " to " << node->name << endl;
5102         for (int i = 0; i < nstates; i++) {
5103             for (int j = 0; j < nstates; j++) {
5104                 cout << "\t" << trans_cat[i * nstates + j];
5105             }
5106             cout << endl;
5107         }
5108         delete[] trans_cat;
5109     }
5110     FOR_NEIGHBOR_IT(node, dad, it)printTransMatrices((*it)->node, node);
5111 }
5112 
removeIdenticalSeqs(Params & params)5113 void PhyloTree::removeIdenticalSeqs(Params &params) {
5114     // commented out because it also works for SuperAlignment now!
5115     Alignment *new_aln;
5116     // 2017-03-31: always keep two identical sequences no matter if -bb or not, to avoid conflict between 2 subsequent runs
5117     if (params.root)
5118         new_aln = aln->removeIdenticalSeq((string)params.root, true, removed_seqs, twin_seqs);
5119     else
5120         new_aln = aln->removeIdenticalSeq("", true, removed_seqs, twin_seqs);
5121     if (removed_seqs.size() > 0) {
5122         cout << "NOTE: " << removed_seqs.size() << " identical sequences (see below) will be ignored for subsequent analysis" << endl;
5123         for (int i = 0; i < removed_seqs.size(); i++) {
5124             cout << "NOTE: " << removed_seqs[i] << " (identical to " << twin_seqs[i] << ") is ignored but added at the end" << endl;
5125         }
5126         delete aln;
5127         aln = new_aln;
5128     }
5129     if (!constraintTree.empty()) {
5130         int count = constraintTree.removeTaxa(removed_seqs);
5131         if (count)
5132             cout << count << " taxa removed from constraint tree" << endl;
5133     }
5134 }
5135 
reinsertIdenticalSeqs(Alignment * orig_aln)5136 void PhyloTree::reinsertIdenticalSeqs(Alignment *orig_aln) {
5137     if (removed_seqs.empty()) return;
5138 
5139     insertTaxa(removed_seqs, twin_seqs);
5140     setAlignment(orig_aln);
5141     // delete all partial_lh, which will be automatically recreated later
5142     deleteAllPartialLh();
5143     clearAllPartialLH();
5144 }
5145 
computeSeqIdentityAlongTree(Split & sp,Node * node,Node * dad)5146 void PhyloTree::computeSeqIdentityAlongTree(Split &sp, Node *node, Node *dad) {
5147     ASSERT(node && !node->isLeaf());
5148     // recursive
5149     FOR_NEIGHBOR_IT(node, dad, it) {
5150         if ((*it)->node->isLeaf()) {
5151             sp.addTaxon((*it)->node->id);
5152         } else {
5153             Split newsp(leafNum);
5154             computeSeqIdentityAlongTree(newsp, (*it)->node, node);
5155             sp += newsp;
5156         }
5157     }
5158     if (!dad) return;
5159     // now going along alignment to compute seq identity
5160     int ident = 0, nseqs = aln->getNSeq();
5161     for (Alignment::iterator it = aln->begin(); it != aln->end(); it++) {
5162         char state = aln->STATE_UNKNOWN;
5163         bool is_const = true;
5164         for (int i = 0; i != nseqs; i++)
5165             if ((*it)[i] < aln->num_states && sp.containTaxon(i)) {
5166                 if (state < aln->num_states && state != (*it)[i]) {
5167                     is_const = false;
5168                     break;
5169                 }
5170                 state = (*it)[i];
5171             }
5172         if (is_const)
5173             ident += it->frequency;
5174     }
5175     ident = (ident*100)/aln->getNSite();
5176     if (node->name == "")
5177         node->name = convertIntToString(ident);
5178     else
5179         node->name += "/" + convertIntToString(ident);
5180 }
5181 
computeSeqIdentityAlongTree()5182 void PhyloTree::computeSeqIdentityAlongTree() {
5183     Split sp(leafNum);
5184     if (root->isLeaf())
5185         computeSeqIdentityAlongTree(sp, root->neighbors[0]->node);
5186     else
5187         computeSeqIdentityAlongTree(sp, root);
5188 }
5189 
generateRandomTree(TreeGenType tree_type)5190 void PhyloTree::generateRandomTree(TreeGenType tree_type) {
5191     if (!constraintTree.empty() && tree_type != YULE_HARDING)
5192         outError("Only Yule-Harding ramdom tree supported with constraint tree");
5193     ASSERT(aln);
5194     int orig_size = params->sub_size;
5195     params->sub_size = aln->getNSeq();
5196     MExtTree ext_tree;
5197     if (constraintTree.empty()) {
5198         switch (tree_type) {
5199         case YULE_HARDING:
5200             ext_tree.generateYuleHarding(*params);
5201             break;
5202         case UNIFORM:
5203             ext_tree.generateUniform(params->sub_size);
5204             break;
5205         case CATERPILLAR:
5206             ext_tree.generateCaterpillar(params->sub_size);
5207             break;
5208         case BALANCED:
5209             ext_tree.generateBalanced(params->sub_size);
5210             break;
5211         case STAR_TREE:
5212             ext_tree.generateStarTree(*params);
5213             break;
5214         default:
5215             break;
5216         }
5217         NodeVector taxa;
5218         ext_tree.getTaxa(taxa);
5219         ASSERT(taxa.size() == aln->getNSeq());
5220         for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++)
5221             (*it)->name = aln->getSeqName((*it)->id);
5222     } else {
5223         ext_tree.generateConstrainedYuleHarding(*params, &constraintTree, aln->getSeqNames());
5224     }
5225     params->sub_size = orig_size;
5226     stringstream str;
5227     ext_tree.printTree(str);
5228     PhyloTree::readTreeStringSeqName(str.str());
5229 }
5230 
computeBranchDirection(PhyloNode * node,PhyloNode * dad)5231 void PhyloTree::computeBranchDirection(PhyloNode *node, PhyloNode *dad) {
5232     if (!node) {
5233         node = (PhyloNode*)root;
5234     }
5235     if (dad)
5236         ((PhyloNeighbor*)node->findNeighbor(dad))->direction = TOWARD_ROOT;
5237     FOR_NEIGHBOR_IT(node, dad, it) {
5238         // do not update if direction was already computed
5239         ASSERT(((PhyloNeighbor*)*it)->direction != TOWARD_ROOT);
5240         if (((PhyloNeighbor*)*it)->direction != UNDEFINED_DIRECTION)
5241             continue;
5242         // otherwise undefined.
5243         ((PhyloNeighbor*)*it)->direction = AWAYFROM_ROOT;
5244         computeBranchDirection((PhyloNode*)(*it)->node, node);
5245     }
5246 }
5247 
clearBranchDirection(PhyloNode * node,PhyloNode * dad)5248 void PhyloTree::clearBranchDirection(PhyloNode *node, PhyloNode *dad) {
5249     if (!node)
5250         node = (PhyloNode*)root;
5251     if (dad)
5252         ((PhyloNeighbor*)node->findNeighbor(dad))->direction = UNDEFINED_DIRECTION;
5253     FOR_NEIGHBOR_IT(node, dad, it) {
5254         ((PhyloNeighbor*)*it)->direction = UNDEFINED_DIRECTION;
5255         clearBranchDirection((PhyloNode*)(*it)->node, node);
5256     }
5257 
5258 }
5259 
5260 /*
5261 void PhyloTree::sortNeighborBySubtreeSize(PhyloNode *node, PhyloNode *dad) {
5262 
5263     // already sorted, return
5264     PhyloNeighbor *nei = (PhyloNeighbor*)dad->findNeighbor(node);
5265     if (nei->size >= 1)
5266         return;
5267 
5268     if (dad && node->isLeaf()) {
5269         nei->size = 1;
5270         return;
5271     }
5272 
5273     nei->size = 0;
5274     FOR_NEIGHBOR_DECLARE(node, dad, it) {
5275         sortNeighborBySubtreeSize((PhyloNode*)(*it)->node, node);
5276         nei->size += ((PhyloNeighbor*)*it)->size;
5277     }
5278 
5279     // sort neighbors in descending order of sub-tree size
5280     FOR_NEIGHBOR(node, dad, it)
5281         for (NeighborVec::iterator it2 = it+1; it2 != node->neighbors.end(); it2++)
5282             if ((*it2)->node != dad && ((PhyloNeighbor*)*it)->size < ((PhyloNeighbor*)*it2)->size) {
5283                 Neighbor *nei;
5284                 nei = *it;
5285                 *it = *it2;
5286                 *it2 = nei;
5287             }
5288 }
5289 */
5290 
convertToRooted()5291 void PhyloTree::convertToRooted() {
5292     ASSERT(leafNum == aln->getNSeq());
5293     Node *node, *dad;
5294     double node_len, dad_len;
5295     if (params->root) {
5296         string name = params->root;
5297         node = findNodeName(name);
5298         if (!node)
5299             outError("Cannot find leaf with name " + name);
5300         ASSERT(node->isLeaf());
5301         dad = node->neighbors[0]->node;
5302         node_len = dad_len = node->neighbors[0]->length*0.5;
5303     } else {
5304         //midpoint rooting
5305         Node *node1, *node2;
5306         double longest = root->longestPath2(node1, node2);
5307         longest *= 0.5;
5308         double curlen = 0.0;
5309         for (; node1 != node2 && curlen + node1->highestNei->length < longest; node1 = node1->highestNei->node) {
5310             curlen += node1->highestNei->length;
5311         }
5312         // place root on node1->heigestNei
5313         node = node1;
5314         dad = node1->highestNei->node;
5315         node_len = longest - curlen;
5316         dad_len = node1->highestNei->length - node_len;
5317         ASSERT(dad_len >= 0.0);
5318         /*
5319         // place root to the longest branch
5320         NodeVector nodes1, nodes2;
5321         getBranches(nodes1, nodes2);
5322         double max_length = -1.0;
5323         for (int i = 0; i < nodes1.size(); i++)
5324             if (max_length < nodes1[i]->findNeighbor(nodes2[i])->length) {
5325                 max_length = nodes1[i]->findNeighbor(nodes2[i])->length;
5326                 node = nodes1[i];
5327                 dad = nodes2[i];
5328             }
5329          */
5330     }
5331     rooted = true;
5332     root = newNode(leafNum, ROOT_NAME);
5333     Node *root_int = newNode();
5334     root->addNeighbor(root_int, 0.0);
5335     root_int->addNeighbor(root, 0.0);
5336     leafNum++;
5337     //double newlen = node->findNeighbor(dad)->length/2.0;
5338     node->updateNeighbor(dad, root_int, node_len);
5339     root_int->addNeighbor(node, node_len);
5340     dad->updateNeighbor(node, root_int, dad_len);
5341     root_int->addNeighbor(dad, dad_len);
5342     initializeTree();
5343     computeBranchDirection();
5344     current_it = current_it_back = NULL;
5345 }
5346 
convertToUnrooted()5347 void PhyloTree::convertToUnrooted() {
5348     ASSERT(rooted);
5349     if (aln)
5350         ASSERT(leafNum == aln->getNSeq()+1);
5351     ASSERT(root);
5352     ASSERT(root->isLeaf() && root->id == leafNum-1);
5353     Node *node = root->neighbors[0]->node;
5354     Node *taxon = findFirstTaxon();
5355 
5356     rooted = false;
5357     leafNum--;
5358 
5359     // delete root node
5360     if (node->degree() == 3) {
5361         // delete and join adjacent branches
5362         Node *node1 = NULL, *node2 = NULL;
5363         double len = 0.0;
5364         FOR_NEIGHBOR_IT(node, root, it) {
5365             if (!node1) node1 = (*it)->node; else node2 = (*it)->node;
5366             len += (*it)->length;
5367         }
5368         node1->updateNeighbor(node, node2, len);
5369         node2->updateNeighbor(node, node1, len);
5370         delete node;
5371     } else {
5372         // only delete root node
5373         auto it = node->findNeighborIt(root);
5374         delete *it;
5375         node->neighbors.erase(it);
5376 
5377     }
5378 
5379     delete root;
5380     // set a temporary taxon so that tree traversal works
5381     root = taxon;
5382 
5383     if (params)
5384         setRootNode(params->root);
5385 
5386     initializeTree();
5387     clearBranchDirection();
5388 //    computeBranchDirection();
5389 }
5390 
reorientPartialLh(PhyloNeighbor * dad_branch,Node * dad)5391 void PhyloTree::reorientPartialLh(PhyloNeighbor* dad_branch, Node *dad) {
5392     ASSERT(!isSuperTree());
5393     if (dad_branch->partial_lh)
5394         return;
5395     Node * node = dad_branch->node;
5396     FOR_NEIGHBOR_IT(node, dad, it) {
5397         PhyloNeighbor *backnei = (PhyloNeighbor*)(*it)->node->findNeighbor(node);
5398         if (backnei->partial_lh) {
5399             mem_slots.takeover(dad_branch, backnei);
5400             break;
5401         }
5402     }
5403     if (params->lh_mem_save == LM_PER_NODE)
5404         ASSERT(dad_branch->partial_lh && "partial_lh is not re-oriented");
5405 }
5406 
5407 /****************************************************************************
5408         helper functions for computing tree traversal
5409  ****************************************************************************/
5410 
computeTraversalInfo(PhyloNeighbor * dad_branch,PhyloNode * dad,double * & buffer)5411 bool PhyloTree::computeTraversalInfo(PhyloNeighbor *dad_branch, PhyloNode *dad, double* &buffer) {
5412 
5413     size_t nstates = aln->num_states;
5414     PhyloNode *node = (PhyloNode*)dad_branch->node;
5415 
5416     if ((dad_branch->partial_lh_computed & 1) || node->isLeaf()) {
5417         return mem_slots.lock(dad_branch);
5418     }
5419 
5420 
5421     size_t num_leaves = 0;
5422     bool locked[node->degree()];
5423     memset(locked, 0, node->degree());
5424 
5425     // sort neighbor in desceding size order
5426     NeighborVec neivec = node->neighbors;
5427     NeighborVec::iterator it, i2;
5428     for (it = neivec.begin(); it != neivec.end(); it++)
5429         for (i2 = it+1; i2 != neivec.end(); i2++)
5430             if (((PhyloNeighbor*)*it)->size < ((PhyloNeighbor*)*i2)->size) {
5431                 Neighbor *nei = *it;
5432                 *it = *i2;
5433                 *i2 = nei;
5434             }
5435 
5436 
5437     // recursive
5438     for (it = neivec.begin(); it != neivec.end(); it++)
5439         if ((*it)->node != dad) {
5440             locked[it - neivec.begin()] = computeTraversalInfo((PhyloNeighbor*)(*it), node, buffer);
5441             if ((*it)->node->isLeaf())
5442                 num_leaves++;
5443         }
5444     dad_branch->partial_lh_computed |= 1;
5445 
5446     // prepare information for this branch
5447     TraversalInfo info(dad_branch, dad);
5448     info.echildren = info.partial_lh_leaves = NULL;
5449 
5450     // re-orient partial_lh
5451     reorientPartialLh(dad_branch, dad);
5452 
5453     if (!dad_branch->partial_lh || mem_slots.locked(dad_branch)) {
5454         // still no free entry found, memory saving technique
5455         int slot_id = mem_slots.allocate(dad_branch);
5456         if (slot_id < 0) {
5457             cout << "traversal order:";
5458             for (auto it = traversal_info.begin(); it != traversal_info.end(); it++) {
5459                 it->dad_branch->node->name = convertIntToString(it->dad_branch->size);
5460                 cout << "  ";
5461                 if (it->dad->isLeaf())
5462                     cout << it->dad->name;
5463                 else
5464                     cout << it->dad->id;
5465                 cout << "->";
5466                 if (it->dad_branch->node->isLeaf())
5467                     cout << it->dad_branch->node->name;
5468                 else
5469                     cout << it->dad_branch->node->id;
5470                 if (params->lh_mem_save == LM_MEM_SAVE) {
5471                     if (it->dad_branch->partial_lh_computed)
5472                         cout << " [";
5473                     else
5474                         cout << " (";
5475                     cout << mem_slots.findNei(it->dad_branch) - mem_slots.begin();
5476                     if (it->dad_branch->partial_lh_computed)
5477                         cout << "]";
5478                     else
5479                         cout << ")";
5480                 }
5481             }
5482             cout << endl;
5483             drawTree(cout);
5484             ASSERT(0 && "No free/unlocked mem slot found!");
5485         }
5486     } else
5487         mem_slots.update(dad_branch);
5488 
5489         if (verbose_mode >= VB_MED && params->lh_mem_save == LM_MEM_SAVE) {
5490             int slot_id = mem_slots.findNei(dad_branch) - mem_slots.begin();
5491             node->name = convertIntToString(slot_id);
5492             cout << "Branch " << dad->id << "-" << node->id << " assigned slot " << slot_id << endl;
5493         }
5494 
5495     if (params->lh_mem_save == LM_MEM_SAVE) {
5496         for (it = neivec.begin(); it != neivec.end(); it++)
5497             if ((*it)->node != dad) {
5498                 if (!(*it)->node->isLeaf() && locked[it-neivec.begin()])
5499                     mem_slots.unlock((PhyloNeighbor*)*it);
5500             }
5501     }
5502 
5503     if (!model->isSiteSpecificModel() && !Params::getInstance().buffer_mem_save) {
5504         //------- normal model -----
5505         info.echildren = buffer;
5506         size_t block = nstates * ((model_factory->fused_mix_rate) ? site_rate->getNRate() : site_rate->getNRate()*model->getNMixtures());
5507         buffer += get_safe_upper_limit(block*nstates*(node->degree()-1));
5508         if (num_leaves) {
5509             info.partial_lh_leaves = buffer;
5510             buffer += get_safe_upper_limit((aln->STATE_UNKNOWN+1)*block*num_leaves);
5511         }
5512     }
5513 
5514     traversal_info.push_back(info);
5515     return mem_slots.lock(dad_branch);
5516 }
5517 
writeSiteLh(ostream & out,SiteLoglType wsl,int partid)5518 void PhyloTree::writeSiteLh(ostream &out, SiteLoglType wsl, int partid) {
5519     // error checking
5520     if (!getModel()->isMixture()) {
5521         if (wsl != WSL_RATECAT) {
5522             outWarning("Switch now to '-wslr' as it is the only option for non-mixture model");
5523             wsl = WSL_RATECAT;
5524         }
5525     } else {
5526         // mixture model
5527         if (wsl == WSL_MIXTURE_RATECAT && getModelFactory()->fused_mix_rate) {
5528             outWarning("-wslmr is not suitable for fused mixture model, switch now to -wslm");
5529             wsl = WSL_MIXTURE;
5530         }
5531     }
5532     size_t i, nsites = getAlnNSite();
5533     size_t ncat = getNumLhCat(wsl);
5534     double *pattern_lh, *pattern_lh_cat;
5535     pattern_lh = aligned_alloc<double>(getAlnNPattern());
5536     pattern_lh_cat = aligned_alloc<double>(getAlnNPattern()*ncat);
5537     computePatternLikelihood(pattern_lh, NULL, pattern_lh_cat, wsl);
5538     for (i = 0; i < nsites; i++) {
5539         if (partid >= 0)
5540             out << partid << "\t";
5541         size_t ptn = aln->getPatternID(i);
5542         out << i+1 << "\t" << pattern_lh[ptn];
5543         for (int j = 0; j < ncat; j++) {
5544             out << "\t" << pattern_lh_cat[ptn*ncat+j];
5545         }
5546         out << endl;
5547     }
5548     aligned_free(pattern_lh_cat);
5549     aligned_free(pattern_lh);
5550 }
5551 
writeSiteRates(ostream & out,bool bayes,int partid)5552 void PhyloTree::writeSiteRates(ostream &out, bool bayes, int partid) {
5553     DoubleVector pattern_rates;
5554     IntVector pattern_cat;
5555     int ncategory = 1;
5556 
5557     if (bayes)
5558         ncategory = site_rate->computePatternRates(pattern_rates, pattern_cat);
5559     else
5560         optimizePatternRates(pattern_rates);
5561 
5562     if (pattern_rates.empty()) return;
5563     int nsite = aln->getNSite();
5564     int i;
5565 
5566     out.setf(ios::fixed,ios::floatfield);
5567     out.precision(5);
5568     //cout << __func__ << endl;
5569     IntVector count;
5570     count.resize(ncategory, 0);
5571     for (i = 0; i < nsite; i++) {
5572         int ptn = aln->getPatternID(i);
5573         if (partid >= 0)
5574             out << partid << "\t";
5575         out << i+1 << "\t";
5576         if (pattern_rates[ptn] >= MAX_SITE_RATE) out << "100.0"; else out << pattern_rates[ptn];
5577         //cout << i << " "<< ptn << " " << pattern_cat[ptn] << endl;
5578         if (!pattern_cat.empty()) {
5579             int site_cat;
5580             double cat_rate;
5581             if (site_rate->getPInvar() == 0.0) {
5582                 site_cat = pattern_cat[ptn]+1;
5583                 cat_rate = site_rate->getRate(pattern_cat[ptn]);
5584             } else {
5585                 site_cat = pattern_cat[ptn];
5586                 if (site_cat == 0)
5587                     cat_rate = 0.0;
5588                 else
5589                     cat_rate = site_rate->getRate(pattern_cat[ptn]-1);
5590             }
5591             out << "\t" << site_cat << "\t" << cat_rate;
5592             count[pattern_cat[ptn]]++;
5593         }
5594         out << endl;
5595     }
5596     if (bayes) {
5597         cout << "Empirical proportions for each category:";
5598         for (i = 0; i < count.size(); i++)
5599             cout << " " << ((double)count[i])/nsite;
5600         cout << endl;
5601     }
5602 }
5603 
writeBranches(ostream & out)5604 void PhyloTree::writeBranches(ostream &out) {
5605     outError("Please only use this feature with partition model");
5606 }
5607 
5608