1 /***************************************************************************
2 * Copyright (C) 2009-2015 by *
3 * BUI Quang Minh <minh.bui@univie.ac.at> *
4 * Lam-Tung Nguyen <nltung@gmail.com> *
5 * *
6 * *
7 * This program is free software; you can redistribute it and/or modify *
8 * it under the terms of the GNU General Public License as published by *
9 * the Free Software Foundation; either version 2 of the License, or *
10 * (at your option) any later version. *
11 * *
12 * This program is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU General Public License for more details. *
16 * *
17 * You should have received a copy of the GNU General Public License *
18 * along with this program; if not, write to the *
19 * Free Software Foundation, Inc., *
20 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
21 ***************************************************************************/
22 #include "phylotree.h"
23 #include "utils/bionj.h"
24 //#include "rateheterogeneity.h"
25 #include "alignment/alignmentpairwise.h"
26 #include <algorithm>
27 #include <limits>
28 #include "utils/timeutil.h"
29 #include "utils/pllnni.h"
30 #include "phylosupertree.h"
31 #include "phylosupertreeplen.h"
32 #include "upperbounds.h"
33 #include "utils/MPIHelper.h"
34 #include "model/modelmixture.h"
35 #include "phylonodemixlen.h"
36 #include "phylotreemixlen.h"
37
38 const int LH_MIN_CONST = 1;
39
40 //const static int BINARY_SCALE = floor(log2(1/SCALING_THRESHOLD));
41 //const static double LOG_BINARY_SCALE = -(log(2) * BINARY_SCALE);
42
43 /****************************************************************************
44 SPRMoves class
45 ****************************************************************************/
46
add(PhyloNode * prune_node,PhyloNode * prune_dad,PhyloNode * regraft_node,PhyloNode * regraft_dad,double score)47 void SPRMoves::add(PhyloNode *prune_node, PhyloNode *prune_dad, PhyloNode *regraft_node, PhyloNode *regraft_dad,
48 double score) {
49 if (size() >= MAX_SPR_MOVES && score <= rbegin()->score)
50 return;
51 if (size() >= MAX_SPR_MOVES) {
52 iterator it = end();
53 it--;
54 erase(it);
55 }
56 SPRMove spr;
57 spr.prune_node = prune_node;
58 spr.prune_dad = prune_dad;
59 spr.regraft_node = regraft_node;
60 spr.regraft_dad = regraft_dad;
61 spr.score = score;
62 insert(spr);
63 }
64
65 /****************************************************************************
66 PhyloTree class
67 ****************************************************************************/
68
PhyloTree()69 PhyloTree::PhyloTree() : MTree(), CheckpointFactory() {
70 init();
71 }
72
init()73 void PhyloTree::init() {
74 aln = NULL;
75 model = NULL;
76 site_rate = NULL;
77 optimize_by_newton = true;
78 central_partial_lh = NULL;
79 nni_partial_lh = NULL;
80 tip_partial_lh = NULL;
81 tip_partial_pars = NULL;
82 tip_partial_lh_computed = 0;
83 ptn_freq_computed = false;
84 central_scale_num = NULL;
85 nni_scale_num = NULL;
86 central_partial_pars = NULL;
87 cost_matrix = NULL;
88 model_factory = NULL;
89 discard_saturated_site = true;
90 _pattern_lh = NULL;
91 _pattern_lh_cat = NULL;
92 _pattern_lh_cat_state = NULL;
93 //root_state = STATE_UNKNOWN;
94 root_state = 126;
95 theta_all = NULL;
96 buffer_scale_all = NULL;
97 buffer_partial_lh = NULL;
98 ptn_freq = NULL;
99 ptn_freq_pars = NULL;
100 ptn_invar = NULL;
101 subTreeDistComputed = false;
102 dist_matrix = NULL;
103 var_matrix = NULL;
104 params = NULL;
105 setLikelihoodKernel(LK_SSE2); // FOR TUNG: you forgot to initialize this variable!
106 setNumThreads(1);
107 num_threads = 0;
108 max_lh_slots = 0;
109 save_all_trees = 0;
110 nodeBranchDists = NULL;
111 // FOR: upper bounds
112 mlCheck = 0;
113 skippedNNIub = 0;
114 totalNNIub = 0;
115 minStateFreq = 0.0;
116 //minUB = 0.0;
117 //meanUB = 0.0;
118 //maxUB = 0.0;
119 pllInst = NULL;
120 pllAlignment = NULL;
121 pllPartitions = NULL;
122 // lhComputed = false;
123 curScore = -DBL_MAX;
124 root = NULL;
125 params = NULL;
126 current_scaling = 1.0;
127 is_opt_scaling = false;
128 num_partial_lh_computations = 0;
129 vector_size = 0;
130 safe_numeric = false;
131 }
132
PhyloTree(Alignment * aln)133 PhyloTree::PhyloTree(Alignment *aln) : MTree(), CheckpointFactory() {
134 init();
135 this->aln = aln;
136 }
137
PhyloTree(string & treeString,Alignment * aln,bool isRooted)138 PhyloTree::PhyloTree(string& treeString, Alignment* aln, bool isRooted) : MTree() {
139 stringstream str;
140 str << treeString;
141 str.seekg(0, ios::beg);
142 freeNode();
143 readTree(str, isRooted);
144 setAlignment(aln);
145 }
146
startCheckpoint()147 void PhyloTree::startCheckpoint() {
148 checkpoint->startStruct("PhyloTree");
149 }
150
151
saveCheckpoint()152 void PhyloTree::saveCheckpoint() {
153 startCheckpoint();
154 // StrVector leafNames;
155 // getTaxaName(leafNames);
156 // CKP_VECTOR_SAVE(leafNames);
157 string newick = PhyloTree::getTreeString();
158 CKP_SAVE(newick);
159 // CKP_SAVE(curScore);
160 endCheckpoint();
161 CheckpointFactory::saveCheckpoint();
162 }
163
restoreCheckpoint()164 void PhyloTree::restoreCheckpoint() {
165 CheckpointFactory::restoreCheckpoint();
166 startCheckpoint();
167 // StrVector leafNames;
168 // if (CKP_VECTOR_RESTORE(leafNames)) {
169 // if (leafNames.size() +(int)rooted != leafNum)
170 // outError("Alignment mismatched from checkpoint!");
171 //
172 // StrVector taxname;
173 // getTaxaName(taxname);
174 // for (int i = 0; i < leafNames.size(); i++)
175 // if (taxname[i] != leafNames[i])
176 // outError("Sequence name " + taxname[i] + " mismatched from checkpoint");
177 // }
178 string newick;
179 // CKP_RESTORE(curScore);
180 if (CKP_RESTORE(newick))
181 PhyloTree::readTreeString(newick);
182 endCheckpoint();
183 }
184
discardSaturatedSite(bool val)185 void PhyloTree::discardSaturatedSite(bool val) {
186 discard_saturated_site = val;
187 }
188
myPartitionsDestroy(partitionList * pl)189 void myPartitionsDestroy(partitionList *pl) {
190 int i;
191 for (i = 0; i < pl->numberOfPartitions; i++) {
192 rax_free(pl->partitionData[i]->partitionName);
193 rax_free(pl->partitionData[i]);
194 }
195 rax_free(pl->partitionData);
196 rax_free(pl);
197 }
198
~PhyloTree()199 PhyloTree::~PhyloTree() {
200 if (nni_scale_num)
201 aligned_free(nni_scale_num);
202 nni_scale_num = NULL;
203 if (nni_partial_lh)
204 aligned_free(nni_partial_lh);
205 nni_partial_lh = NULL;
206 if (central_partial_lh)
207 aligned_free(central_partial_lh);
208 central_partial_lh = NULL;
209 if (central_scale_num)
210 aligned_free(central_scale_num);
211 central_scale_num = NULL;
212
213 if (central_partial_pars)
214 aligned_free(central_partial_pars);
215 central_partial_pars = NULL;
216
217 if(cost_matrix){
218 aligned_free(cost_matrix);
219 cost_matrix = NULL;
220 }
221
222 if (model_factory)
223 delete model_factory;
224 model_factory = NULL;
225 if (model)
226 delete model;
227 model = NULL;
228 if (site_rate)
229 delete site_rate;
230 site_rate = NULL;
231 if (_pattern_lh_cat)
232 aligned_free(_pattern_lh_cat);
233 _pattern_lh_cat = NULL;
234 if (_pattern_lh)
235 aligned_free(_pattern_lh);
236 _pattern_lh = NULL;
237 //if (state_freqs)
238 // delete [] state_freqs;
239 if (theta_all)
240 aligned_free(theta_all);
241 theta_all = NULL;
242 if (buffer_scale_all)
243 aligned_free(buffer_scale_all);
244 buffer_scale_all = NULL;
245 if (buffer_partial_lh)
246 aligned_free(buffer_partial_lh);
247 buffer_partial_lh = NULL;
248 if (ptn_freq)
249 aligned_free(ptn_freq);
250 ptn_freq = NULL;
251 if (ptn_freq_pars)
252 aligned_free(ptn_freq_pars);
253 ptn_freq_pars = NULL;
254 ptn_freq_computed = false;
255 if (ptn_invar)
256 aligned_free(ptn_invar);
257 ptn_invar = NULL;
258 if (dist_matrix)
259 delete[] dist_matrix;
260 dist_matrix = NULL;
261
262 if (var_matrix)
263 delete[] var_matrix;
264 var_matrix = NULL;
265
266 if (pllPartitions)
267 myPartitionsDestroy(pllPartitions);
268 if (pllAlignment)
269 pllAlignmentDataDestroy(pllAlignment);
270 if (pllInst)
271 pllDestroyInstance(pllInst);
272
273 pllPartitions = NULL;
274 pllAlignment = NULL;
275 pllInst = NULL;
276
277 }
278
readTree(const char * infile,bool & is_rooted)279 void PhyloTree::readTree(const char *infile, bool &is_rooted) {
280 MTree::readTree(infile, is_rooted);
281 // 2015-10-14: has to reset this pointer when read in
282 current_it = current_it_back = NULL;
283 if (rooted && root)
284 computeBranchDirection();
285 }
286
readTree(istream & in,bool & is_rooted)287 void PhyloTree::readTree(istream &in, bool &is_rooted) {
288 MTree::readTree(in, is_rooted);
289 // 2015-10-14: has to reset this pointer when read in
290 current_it = current_it_back = NULL;
291 // remove taxa if necessary
292 if (removed_seqs.size() > 0)
293 removeTaxa(removed_seqs);
294
295 // collapse any internal node of degree 2
296 NodeVector nodes;
297 getInternalNodes(nodes);
298 int num_collapsed = 0;
299 for (NodeVector::iterator it = nodes.begin(); it != nodes.end(); it++)
300 if ((*it)->degree() == 2) {
301 Node *left = (*it)->neighbors[0]->node;
302 Node *right = (*it)->neighbors[1]->node;
303 double len = (*it)->neighbors[0]->length+(*it)->neighbors[1]->length;
304 left->updateNeighbor((*it), right, len);
305 right->updateNeighbor((*it), left, len);
306 delete (*it);
307 num_collapsed++;
308 if (verbose_mode >= VB_MED)
309 cout << "Node of degree 2 collapsed" << endl;
310 }
311 if (num_collapsed)
312 initializeTree();
313 if (rooted)
314 computeBranchDirection();
315 }
316
assignLeafNames(Node * node,Node * dad)317 void PhyloTree::assignLeafNames(Node *node, Node *dad) {
318 if (!node)
319 node = root;
320 if (node->isLeaf()) {
321 if (rooted && node == root) {
322 ASSERT(node->id == leafNum-1);
323 root->name = ROOT_NAME;
324 } else {
325 node->id = atoi(node->name.c_str());
326 node->name = aln->getSeqName(node->id);
327 }
328 ASSERT(node->id >= 0 && node->id < leafNum);
329 }
330 FOR_NEIGHBOR_IT(node, dad, it)assignLeafNames((*it)->node, node);
331 }
332
copyTree(MTree * tree)333 void PhyloTree::copyTree(MTree *tree) {
334 MTree::copyTree(tree);
335 if (!aln)
336 return;
337 // reset the ID with alignment
338 setAlignment(aln);
339 }
340
copyTree(MTree * tree,string & taxa_set)341 void PhyloTree::copyTree(MTree *tree, string &taxa_set) {
342 MTree::copyTree(tree, taxa_set);
343 if (rooted)
344 computeBranchDirection();
345 if (!aln)
346 return;
347 // reset the ID with alignment
348 setAlignment(aln);
349 }
350
copyPhyloTree(PhyloTree * tree)351 void PhyloTree::copyPhyloTree(PhyloTree *tree) {
352 MTree::copyTree(tree);
353 if (!tree->aln)
354 return;
355 setAlignment(tree->aln);
356 }
357
copyPhyloTreeMixlen(PhyloTree * tree,int mix)358 void PhyloTree::copyPhyloTreeMixlen(PhyloTree *tree, int mix) {
359 if (tree->isMixlen()) {
360 ((PhyloTreeMixlen*)tree)->cur_mixture = mix;
361 }
362 copyPhyloTree(tree);
363 if (tree->isMixlen()) {
364 ((PhyloTreeMixlen*)tree)->cur_mixture = -1;
365 }
366 }
367
setAlignment(Alignment * alignment)368 void PhyloTree::setAlignment(Alignment *alignment) {
369 aln = alignment;
370 bool err = false;
371 int nseq = aln->getNSeq();
372 for (int seq = 0; seq < nseq; seq++) {
373 string seq_name = aln->getSeqName(seq);
374 Node *node = findLeafName(seq_name);
375 if (!node) {
376 string str = "Alignment sequence ";
377 str += seq_name;
378 str += " does not appear in the tree";
379 err = true;
380 outError(str, false);
381 } else {
382 ASSERT(node->isLeaf());
383 node->id = seq;
384 }
385 }
386 if (err) {
387 printTree(cout, WT_NEWLINE);
388 outError("Tree taxa and alignment sequence do not match (see above)");
389 }
390 if (rooted) {
391 ASSERT(root->name == ROOT_NAME);
392 root->id = nseq;
393 }
394 StrVector taxname;
395 getTaxaName(taxname);
396 for (StrVector::iterator it = taxname.begin(); it != taxname.end(); it++)
397 if ((*it) != ROOT_NAME && alignment->getSeqID(*it) < 0) {
398 outError((string)"Tree taxon " + (*it) + " does not appear in the alignment", false);
399 err = true;
400 }
401 if (err) outError("Tree taxa and alignment sequence do not match (see above)");
402 }
403
setRootNode(const char * my_root,bool multi_taxa)404 void PhyloTree::setRootNode(const char *my_root, bool multi_taxa) {
405 if (rooted) {
406 computeBranchDirection();
407 return;
408 }
409
410 if (!my_root) {
411 root = findNodeName(aln->getSeqName(0));
412 ASSERT(root);
413 return;
414 }
415
416 if (strchr(my_root, ',') == NULL) {
417 string root_name = my_root;
418 root = findNodeName(root_name);
419 ASSERT(root);
420 return;
421 }
422
423 // my_root is a list of taxa
424 StrVector taxa;
425 convert_string_vec(my_root, taxa);
426 root = findNodeName(taxa[0]);
427 ASSERT(root);
428 if (!multi_taxa) {
429 return;
430 }
431 unordered_set<string> taxa_set;
432 for (auto it = taxa.begin(); it != taxa.end(); it++)
433 taxa_set.insert(*it);
434 pair<Node*,Neighbor*> res = {NULL, NULL};
435 findNodeNames(taxa_set, res, root->neighbors[0]->node, root);
436 if (res.first)
437 root = res.first;
438 else
439 outWarning("Branch separating outgroup is not found");
440 }
441
442 //void PhyloTree::setParams(Params* params) {
443 // this->params = params;
444 //}
445
readTreeString(const string & tree_string)446 void PhyloTree::readTreeString(const string &tree_string) {
447 stringstream str(tree_string);
448 freeNode();
449
450 // bug fix 2016-04-14: in case taxon name happens to be ID
451 MTree::readTree(str, rooted);
452
453 assignLeafNames();
454 setRootNode(Params::getInstance().root);
455
456 if (isSuperTree()) {
457 ((PhyloSuperTree*) this)->mapTrees();
458 }
459 if (Params::getInstance().pll) {
460 pllReadNewick(getTreeString());
461 }
462 resetCurScore();
463 if (Params::getInstance().fixStableSplits || Params::getInstance().adaptPertubation) {
464 buildNodeSplit();
465 }
466 current_it = current_it_back = NULL;
467 }
468
readTreeStringSeqName(const string & tree_string)469 void PhyloTree::readTreeStringSeqName(const string &tree_string) {
470 stringstream str(tree_string);
471 // str(tree_string);
472 // str.seekg(0, ios::beg);
473 freeNode();
474 if (rooted) {
475 rooted = false;
476 readTree(str, rooted);
477 if (!rooted)
478 convertToRooted();
479 } else {
480 readTree(str, rooted);
481 }
482 // assignLeafNames();
483 setAlignment(aln);
484 setRootNode(params->root);
485
486 if (isSuperTree()) {
487 ((PhyloSuperTree*) this)->mapTrees();
488 }
489 if (params->pll) {
490 pllReadNewick(getTreeString());
491 }
492 resetCurScore();
493 // lhComputed = false;
494 if (params->fixStableSplits) {
495 buildNodeSplit();
496 }
497 current_it = current_it_back = NULL;
498 }
499
wrapperFixNegativeBranch(bool force_change)500 int PhyloTree::wrapperFixNegativeBranch(bool force_change) {
501 // Initialize branch lengths for the parsimony tree
502 initializeAllPartialPars();
503 clearAllPartialLH();
504 int numFixed = fixNegativeBranch(force_change);
505 if (params->pll) {
506 pllReadNewick(getTreeString());
507 }
508 resetCurScore();
509 if (verbose_mode >= VB_MAX)
510 printTree(cout);
511 // lhComputed = false;
512 return numFixed;
513 }
514
pllReadNewick(string newickTree)515 void PhyloTree::pllReadNewick(string newickTree) {
516 pllNewickTree *newick = pllNewickParseString(newickTree.c_str());
517 pllTreeInitTopologyNewick(pllInst, newick, PLL_FALSE);
518 pllNewickParseDestroy(&newick);
519 }
520
readTreeFile(const string & file_name)521 void PhyloTree::readTreeFile(const string &file_name) {
522 ifstream str;
523 str.open(file_name.c_str());
524 // str << tree_string;
525 // str.seekg(0, ios::beg);
526 freeNode();
527 if (rooted) {
528 rooted = false;
529 readTree(str, rooted);
530 if (!rooted)
531 convertToRooted();
532 } else
533 readTree(str, rooted);
534 setAlignment(aln);
535 if (isSuperTree()) {
536 ((PhyloSuperTree*) this)->mapTrees();
537 } else {
538 clearAllPartialLH();
539 }
540 str.close();
541 current_it = current_it_back = NULL;
542 }
543
getTreeString()544 string PhyloTree::getTreeString() {
545 stringstream tree_stream;
546 setRootNode(params->root);
547 printTree(tree_stream, WT_TAXON_ID + WT_BR_LEN + WT_SORT_TAXA);
548 return tree_stream.str();
549 }
550
getTopologyString(bool printBranchLength)551 string PhyloTree::getTopologyString(bool printBranchLength) {
552 stringstream tree_stream;
553 // important: to make topology string unique
554 setRootNode(params->root);
555 //printTree(tree_stream, WT_TAXON_ID + WT_SORT_TAXA);
556 if (printBranchLength) {
557 printTree(tree_stream, WT_SORT_TAXA + WT_BR_LEN + WT_TAXON_ID);
558 } else {
559 printTree(tree_stream, WT_SORT_TAXA);
560 }
561 return tree_stream.str();
562 }
563
rollBack(istream & best_tree_string)564 void PhyloTree::rollBack(istream &best_tree_string) {
565 best_tree_string.seekg(0, ios::beg);
566 freeNode();
567 readTree(best_tree_string, rooted);
568 assignLeafNames();
569 initializeAllPartialLh();
570 clearAllPartialLH();
571 }
572
setModel(ModelSubst * amodel)573 void PhyloTree::setModel(ModelSubst *amodel) {
574 model = amodel;
575 //state_freqs = new double[numStates];
576 //model->getStateFrequency(state_freqs);
577 }
578
setModelFactory(ModelFactory * model_fac)579 void PhyloTree::setModelFactory(ModelFactory *model_fac) {
580 model_factory = model_fac;
581 if (model_fac) {
582 model = model_factory->model;
583 site_rate = model_factory->site_rate;
584 if (!isSuperTree()) {
585 setLikelihoodKernel(sse);
586 setNumThreads(num_threads);
587 }
588 } else {
589 model = NULL;
590 site_rate = NULL;
591 }
592 }
593
setRate(RateHeterogeneity * rate)594 void PhyloTree::setRate(RateHeterogeneity *rate) {
595 site_rate = rate;
596 if (!rate)
597 return;
598 //numCat = site_rate->getNRate();
599 //if (aln) {
600 // block = aln->num_states * numCat;
601 // lh_size = aln->size() * block;
602 //}
603 }
604
getRate()605 RateHeterogeneity *PhyloTree::getRate() {
606 return site_rate;
607 }
608
newNode(int node_id,const char * node_name)609 Node* PhyloTree::newNode(int node_id, const char* node_name) {
610 return (Node*) (new PhyloNode(node_id, node_name));
611 }
612
newNode(int node_id,int node_name)613 Node* PhyloTree::newNode(int node_id, int node_name) {
614 return (Node*) (new PhyloNode(node_id, node_name));
615 }
616
clearAllPartialLH(bool make_null)617 void PhyloTree::clearAllPartialLH(bool make_null) {
618 if (!root)
619 return;
620 ((PhyloNode*) root->neighbors[0]->node)->clearAllPartialLh(make_null, (PhyloNode*) root);
621 tip_partial_lh_computed = 0;
622 // 2015-10-14: has to reset this pointer when read in
623 current_it = current_it_back = NULL;
624 }
625
626 /*
627 void PhyloTree::computeAllPartialLh(PhyloNode *node, PhyloNode *dad) {
628 if (!node) node = (PhyloNode*)root;
629 FOR_NEIGHBOR_IT(node, dad, it) {
630 if ((((PhyloNeighbor*)*it)->partial_lh_computed & 1) == 0)
631 computePartialLikelihood((PhyloNeighbor*)*it, node);
632 PhyloNeighbor *rev = (PhyloNeighbor*) (*it)->node->findNeighbor(node);
633 if ((rev->partial_lh_computed & 1) == 0)
634 computePartialLikelihood(rev, (PhyloNode*)(*it)->node);
635 computeAllPartialLh((PhyloNode*)(*it)->node, node);
636 }
637 }
638 */
639
getASCName(ASCType ASC_type)640 string getASCName(ASCType ASC_type) {
641 switch (ASC_type) {
642 case ASC_NONE:
643 return "";
644 case ASC_VARIANT:
645 return "+ASC";
646 case ASC_VARIANT_MISSING:
647 return "+ASC_MIS";
648 case ASC_INFORMATIVE:
649 return "+ASC_INF";
650 case ASC_INFORMATIVE_MISSING:
651 return "+ASC_INF_MIS";
652 }
653 }
654
getSubstName()655 string PhyloTree::getSubstName() {
656 return model->getName() + getASCName(model_factory->getASC());
657 }
658
getRateName()659 string PhyloTree::getRateName() {
660 if (model_factory->fused_mix_rate) {
661 return "*" + site_rate->name.substr(1);
662 } else {
663 return site_rate->name;
664 }
665 }
666
getModelName()667 string PhyloTree::getModelName() {
668 return getSubstName() + getRateName();
669 }
670
getModelNameParams()671 string PhyloTree::getModelNameParams() {
672 string name = model->getNameParams();
673 name += getASCName(model_factory->getASC());
674 string rate_name = site_rate->getNameParams();
675
676 if (model_factory->fused_mix_rate) {
677 name += "*" + rate_name.substr(1);
678 } else {
679 name += rate_name;
680 }
681
682 return name;
683 }
684
saveBranchLengths(DoubleVector & lenvec,int startid,PhyloNode * node,PhyloNode * dad)685 void PhyloTree::saveBranchLengths(DoubleVector &lenvec, int startid, PhyloNode *node, PhyloNode *dad) {
686 if (!node) {
687 node = (PhyloNode*) root;
688 ASSERT(branchNum == nodeNum-1);
689 if (lenvec.empty()) lenvec.resize(branchNum*getMixlen() + startid);
690 }
691 FOR_NEIGHBOR_IT(node, dad, it){
692 (*it)->getLength(lenvec, (*it)->id*getMixlen() + startid);
693 PhyloTree::saveBranchLengths(lenvec, startid, (PhyloNode*) (*it)->node, node);
694 }
695 }
696
restoreBranchLengths(DoubleVector & lenvec,int startid,PhyloNode * node,PhyloNode * dad)697 void PhyloTree::restoreBranchLengths(DoubleVector &lenvec, int startid, PhyloNode *node, PhyloNode *dad) {
698 if (!node) {
699 node = (PhyloNode*) root;
700 ASSERT(!lenvec.empty());
701 }
702 FOR_NEIGHBOR_IT(node, dad, it){
703 (*it)->setLength(lenvec, (*it)->id*getMixlen() + startid, getMixlen());
704 (*it)->node->findNeighbor(node)->setLength(lenvec, (*it)->id*getMixlen() + startid, getMixlen());
705 PhyloTree::restoreBranchLengths(lenvec, startid, (PhyloNode*) (*it)->node, node);
706 }
707 }
708
709
710 /****************************************************************************
711 Parsimony function
712 ****************************************************************************/
713
714 /*
715 double PhyloTree::computeCorrectedParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad) {
716 // double corrected_bran = 0;
717 // int parbran;
718 // int parscore = computeParsimonyBranch(node21_it, node2, &parbran);
719 // if (site_rate->getGammaShape() != 0) {
720 // corrected_bran = (aln->num_states - 1.0) / aln->num_states
721 // * site_rate->getGammaShape()
722 // * (pow( 1.0 - aln->num_states / (aln->num_states - 1.0) * ((double) parbran / aln->getNSite()),
723 // -1.0 / site_rate->getGammaShape()) - 1.0);
724 // } else {
725 // corrected_bran = -((aln->num_states - 1.0) / aln->num_states)
726 // * log(1.0 - (aln->num_states / (aln->num_states - 1.0)) * ((double) parbran / aln->getNSite()));
727 // }
728 // return corrected_bran;
729 }
730 */
initializeAllPartialPars()731 void PhyloTree::initializeAllPartialPars() {
732 if (!ptn_freq_pars)
733 ptn_freq_pars = aligned_alloc<UINT>(get_safe_upper_limit_float(getAlnNPattern()));
734 int index = 0;
735 initializeAllPartialPars(index);
736 clearAllPartialLH();
737 //assert(index == (nodeNum - 1)*2);
738 }
739
initializeAllPartialPars(int & index,PhyloNode * node,PhyloNode * dad)740 void PhyloTree::initializeAllPartialPars(int &index, PhyloNode *node, PhyloNode *dad) {
741 size_t pars_block_size = getBitsBlockSize();
742 if (!node) {
743 node = (PhyloNode*) root;
744 // allocate the big central partial pars memory
745 if (!central_partial_pars) {
746 uint64_t tip_partial_pars_size = get_safe_upper_limit_float(aln->num_states * (aln->STATE_UNKNOWN+1));
747 size_t memsize = (aln->getNSeq()) * 4 * pars_block_size + tip_partial_pars_size;
748 if (verbose_mode >= VB_MAX)
749 cout << "Allocating " << memsize * sizeof(UINT) << " bytes for partial parsimony vectors" << endl;
750 central_partial_pars = aligned_alloc<UINT>(memsize);
751 if (!central_partial_pars)
752 outError("Not enough memory for partial parsimony vectors");
753 tip_partial_pars = central_partial_pars + (aln->getNSeq()) * 4 * pars_block_size;
754 }
755 index = 0;
756 }
757 if (dad) {
758 // make memory alignment 16
759 // assign a region in central_partial_lh to both Neihgbors (dad->node, and node->dad)
760 PhyloNeighbor *nei = (PhyloNeighbor*) node->findNeighbor(dad);
761 nei->partial_pars = central_partial_pars + (index * pars_block_size);
762 nei = (PhyloNeighbor*) dad->findNeighbor(node);
763 nei->partial_pars = central_partial_pars + ((index + 1) * pars_block_size);
764 index += 2;
765 //assert(index < nodeNum * 2 - 1);
766 }
767 FOR_NEIGHBOR_IT(node, dad, it)initializeAllPartialPars(index, (PhyloNode*) (*it)->node, node);
768 }
769
770 #ifdef __AVX512KNL
771 #define SIMD_BITS 512
772 #else
773 #define SIMD_BITS 256
774 #endif
775
776
getBitsBlockSize()777 size_t PhyloTree::getBitsBlockSize() {
778 // reserve the last entry for parsimony score
779 // return (aln->num_states * aln->size() + UINT_BITS - 1) / UINT_BITS + 1;
780 if (cost_matrix) {
781 return get_safe_upper_limit_float(aln->size() * aln->num_states);
782 }
783 size_t len = aln->getMaxNumStates() * ((max(aln->size(), (size_t)aln->num_variant_sites) + SIMD_BITS - 1) / UINT_BITS) + 4;
784 #ifdef __AVX512KNL
785 len = ((len+15)/16)*16;
786 #else
787 len = ((len+7)/8)*8;
788 #endif
789 return len;
790 }
791
newBitsBlock()792 UINT *PhyloTree::newBitsBlock() {
793 return aligned_alloc<UINT>(getBitsBlockSize());
794 }
795
796
computePartialParsimony(PhyloNeighbor * dad_branch,PhyloNode * dad)797 void PhyloTree::computePartialParsimony(PhyloNeighbor *dad_branch, PhyloNode *dad) {
798 (this->*computePartialParsimonyPointer)(dad_branch, dad);
799 }
800
computeReversePartialParsimony(PhyloNode * node,PhyloNode * dad)801 void PhyloTree::computeReversePartialParsimony(PhyloNode *node, PhyloNode *dad) {
802 PhyloNeighbor *node_nei = (PhyloNeighbor*)node->findNeighbor(dad);
803 ASSERT(node_nei);
804 computePartialParsimony(node_nei, node);
805 for (NeighborVec::iterator it = node->neighbors.begin(); it != node->neighbors.end(); it ++)
806 if ((*it)->node != dad)
807 computeReversePartialParsimony((PhyloNode*)(*it)->node, node);
808
809 }
810
811
computeParsimonyBranch(PhyloNeighbor * dad_branch,PhyloNode * dad,int * branch_subst)812 int PhyloTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
813 return (this->*computeParsimonyBranchPointer)(dad_branch, dad, branch_subst);
814 }
815
816
computeParsimony()817 int PhyloTree::computeParsimony() {
818 return computeParsimonyBranch((PhyloNeighbor*) root->neighbors[0], (PhyloNode*) root);
819 }
820
821
822 /****************************************************************************
823 likelihood function
824 ****************************************************************************/
825
getBufferPartialLhSize()826 size_t PhyloTree::getBufferPartialLhSize() {
827 const size_t VECTOR_SIZE = 8; // TODO, adjusted
828 // 2017-12-13: make sure that num_threads was already set
829 ASSERT(num_threads > 0);
830 size_t ncat_mix = site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
831 size_t block = model->num_states * ncat_mix;
832 size_t buffer_size = 0;
833
834 // buffer for traversal_info.echildren and partial_lh_leaves
835 if (!Params::getInstance().buffer_mem_save) {
836 buffer_size += get_safe_upper_limit(block * model->num_states * 2) * aln->getNSeq();
837 buffer_size += get_safe_upper_limit(block *(aln->STATE_UNKNOWN+1)) * aln->getNSeq();
838 }
839
840 buffer_size += get_safe_upper_limit(block *(aln->STATE_UNKNOWN+1));
841 buffer_size += (block*2+model->num_states)*VECTOR_SIZE*num_threads;
842
843 // always more buffer for non-rev kernel, in case switching between kernels
844 buffer_size += get_safe_upper_limit(block)*(aln->STATE_UNKNOWN+1)*2;
845 buffer_size += block*2*VECTOR_SIZE*num_threads;
846 buffer_size += get_safe_upper_limit(3*block*model->num_states);
847
848 if (isMixlen()) {
849 size_t nmix = max(getMixlen(), getRate()->getNRate());
850 buffer_size += nmix*(nmix+1)*VECTOR_SIZE + (nmix+3)*nmix*VECTOR_SIZE*num_threads;
851 }
852 return buffer_size;
853 }
854
initializeAllPartialLh()855 void PhyloTree::initializeAllPartialLh() {
856 int index, indexlh;
857 int numStates = model->num_states;
858 // Minh's question: why getAlnNSite() but not getAlnNPattern() ?
859 //size_t mem_size = ((getAlnNSite() % 2) == 0) ? getAlnNSite() : (getAlnNSite() + 1);
860 // extra #numStates for ascertainment bias correction
861 size_t mem_size = get_safe_upper_limit(getAlnNPattern()) + max(get_safe_upper_limit(numStates),
862 get_safe_upper_limit(model_factory->unobserved_ptns.size()));
863
864 size_t block_size = mem_size * numStates * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
865 // make sure _pattern_lh size is divisible by 4 (e.g., 9->12, 14->16)
866 if (!_pattern_lh)
867 _pattern_lh = aligned_alloc<double>(mem_size);
868 if (!_pattern_lh_cat)
869 _pattern_lh_cat = aligned_alloc<double>(mem_size * site_rate->getNDiscreteRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures()));
870 if (!theta_all)
871 theta_all = aligned_alloc<double>(block_size);
872 if (!buffer_scale_all)
873 buffer_scale_all = aligned_alloc<double>(mem_size);
874 if (!buffer_partial_lh) {
875 buffer_partial_lh = aligned_alloc<double>(getBufferPartialLhSize());
876 }
877 if (!ptn_freq) {
878 ptn_freq = aligned_alloc<double>(mem_size);
879 ptn_freq_computed = false;
880 }
881 if (!ptn_freq_pars)
882 ptn_freq_pars = aligned_alloc<UINT>(mem_size);
883 if (!ptn_invar)
884 ptn_invar = aligned_alloc<double>(mem_size);
885 initializeAllPartialLh(index, indexlh);
886 if (params->lh_mem_save == LM_MEM_SAVE)
887 mem_slots.init(this, max_lh_slots);
888
889 ASSERT(index == (nodeNum - 1) * 2);
890 if (params->lh_mem_save == LM_PER_NODE) {
891 ASSERT(indexlh == nodeNum-leafNum);
892 }
893
894 clearAllPartialLH();
895
896 }
897
deleteAllPartialLh()898 void PhyloTree::deleteAllPartialLh() {
899
900 if (central_partial_lh) {
901 aligned_free(central_partial_lh);
902 }
903 if (central_scale_num) {
904 aligned_free(central_scale_num);
905 }
906 if (central_partial_pars)
907 aligned_free(central_partial_pars);
908
909 if (nni_scale_num)
910 aligned_free(nni_scale_num);
911 nni_scale_num = NULL;
912 if (nni_partial_lh)
913 aligned_free(nni_partial_lh);
914 nni_partial_lh = NULL;
915
916 if (ptn_invar)
917 aligned_free(ptn_invar);
918 if (ptn_freq)
919 aligned_free(ptn_freq);
920 if (ptn_freq_pars)
921 aligned_free(ptn_freq_pars);
922 if (theta_all)
923 aligned_free(theta_all);
924 if (buffer_scale_all)
925 aligned_free(buffer_scale_all);
926 if (buffer_partial_lh)
927 aligned_free(buffer_partial_lh);
928 if (_pattern_lh_cat)
929 aligned_free(_pattern_lh_cat);
930 if (_pattern_lh)
931 aligned_free(_pattern_lh);
932 central_partial_lh = NULL;
933 central_scale_num = NULL;
934 central_partial_pars = NULL;
935
936 ptn_invar = NULL;
937 ptn_freq = NULL;
938 ptn_freq_pars = NULL;
939 ptn_freq_computed = false;
940 theta_all = NULL;
941 buffer_scale_all = NULL;
942 buffer_partial_lh = NULL;
943 _pattern_lh_cat = NULL;
944 _pattern_lh = NULL;
945
946 tip_partial_lh = NULL;
947 tip_partial_pars = NULL;
948
949 clearAllPartialLH();
950 }
951
getMemoryRequired(size_t ncategory,bool full_mem)952 uint64_t PhyloTree::getMemoryRequired(size_t ncategory, bool full_mem) {
953 // +num_states for ascertainment bias correction
954 int64_t nptn = get_safe_upper_limit(aln->getNPattern()) + get_safe_upper_limit(aln->num_states);
955 if (model_factory)
956 nptn = get_safe_upper_limit(aln->getNPattern()) + max(get_safe_upper_limit(aln->num_states), get_safe_upper_limit(model_factory->unobserved_ptns.size()));
957 int64_t scale_block_size = nptn;
958 if (site_rate)
959 scale_block_size *= site_rate->getNRate();
960 else
961 scale_block_size *= ncategory;
962 if (model && !model_factory->fused_mix_rate)
963 scale_block_size *= model->getNMixtures();
964
965 int64_t block_size = scale_block_size * aln->num_states;
966
967 int64_t mem_size;
968 // memory to tip_partial_lh
969 if (model)
970 mem_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures() * sizeof(double);
971 else
972 mem_size = aln->num_states * (aln->STATE_UNKNOWN+1) * sizeof(double);
973
974 // memory for UFBoot
975 if (params->gbo_replicates)
976 mem_size += params->gbo_replicates*nptn*sizeof(BootValType);
977
978 // memory for model
979 if (model)
980 mem_size += model->getMemoryRequired();
981
982 int64_t lh_scale_size = block_size * sizeof(double) + scale_block_size * sizeof(UBYTE);
983
984 max_lh_slots = leafNum-2;
985
986 if (!full_mem && params->lh_mem_save == LM_MEM_SAVE) {
987 int64_t min_lh_slots = log2(leafNum)+LH_MIN_CONST;
988 if (params->max_mem_size == 0.0) {
989 max_lh_slots = min_lh_slots;
990 } else if (params->max_mem_size <= 1) {
991 max_lh_slots = floor(params->max_mem_size*(leafNum-2));
992 } else {
993 int64_t rest_mem = params->max_mem_size - mem_size;
994
995 // include 2 blocks for nni_partial_lh
996 max_lh_slots = rest_mem / lh_scale_size - 2;
997
998 // RAM over requirement, reset to LM_PER_NODE
999 if (max_lh_slots > leafNum-2)
1000 max_lh_slots = leafNum-2;
1001 }
1002 if (max_lh_slots < min_lh_slots) {
1003 cout << "WARNING: Too low -mem, automatically increased to " << (mem_size + (min_lh_slots+2)*lh_scale_size)/1048576.0 << " MB" << endl;
1004 max_lh_slots = min_lh_slots;
1005 }
1006 }
1007
1008 // also count MEM for nni_partial_lh
1009 mem_size += (max_lh_slots+2) * lh_scale_size;
1010
1011
1012 return mem_size;
1013 }
1014
getMemoryRequiredThreaded(size_t ncategory,bool full_mem)1015 uint64_t PhyloTree::getMemoryRequiredThreaded(size_t ncategory, bool full_mem) {
1016 return getMemoryRequired(ncategory, full_mem);
1017 }
1018
getMemoryRequired(uint64_t & partial_lh_entries,uint64_t & scale_num_entries,uint64_t & partial_pars_entries)1019 void PhyloTree::getMemoryRequired(uint64_t &partial_lh_entries, uint64_t &scale_num_entries, uint64_t &partial_pars_entries) {
1020 // +num_states for ascertainment bias correction
1021 uint64_t block_size = get_safe_upper_limit(aln->getNPattern()) + get_safe_upper_limit(aln->num_states);
1022 if (model_factory)
1023 block_size = get_safe_upper_limit(aln->getNPattern()) + max(get_safe_upper_limit(aln->num_states), get_safe_upper_limit(model_factory->unobserved_ptns.size()));
1024 size_t scale_size = block_size;
1025 block_size = block_size * aln->num_states;
1026 if (site_rate) {
1027 block_size *= site_rate->getNRate();
1028 scale_size *= site_rate->getNRate();
1029 }
1030 if (model && !model_factory->fused_mix_rate) {
1031 block_size *= model->getNMixtures();
1032 scale_size *= model->getNMixtures();
1033 }
1034
1035 uint64_t tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures();
1036 uint64_t tip_partial_pars_size = aln->num_states * (aln->STATE_UNKNOWN+1);
1037
1038 // TODO mem save
1039 partial_lh_entries = ((uint64_t)leafNum - 2) * (uint64_t) block_size + 4 + tip_partial_lh_size;
1040 scale_num_entries = (leafNum - 2) * scale_size;
1041
1042 size_t pars_block_size = getBitsBlockSize();
1043 partial_pars_entries = (leafNum - 1) * 4 * pars_block_size + tip_partial_pars_size;
1044 }
1045
initializeAllPartialLh(int & index,int & indexlh,PhyloNode * node,PhyloNode * dad)1046 void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node, PhyloNode *dad) {
1047 uint64_t pars_block_size = getBitsBlockSize();
1048 // +num_states for ascertainment bias correction
1049 size_t nptn = get_safe_upper_limit(aln->size())+ max(get_safe_upper_limit(aln->num_states), get_safe_upper_limit(model_factory->unobserved_ptns.size()));
1050 uint64_t block_size;
1051 uint64_t scale_block_size = nptn * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
1052 block_size = scale_block_size * model->num_states;
1053
1054 if (!node) {
1055 node = (PhyloNode*) root;
1056 // allocate the big central partial likelihoods memory
1057 // size_t IT_NUM = (params->nni5) ? 6 : 2;
1058 size_t IT_NUM = 2;
1059 if (!nni_partial_lh) {
1060 // allocate memory only once!
1061 nni_partial_lh = aligned_alloc<double>(IT_NUM*block_size);
1062 nni_scale_num = aligned_alloc<UBYTE>(IT_NUM*scale_block_size);
1063 }
1064
1065
1066 if (!central_partial_lh) {
1067 uint64_t tip_partial_lh_size = get_safe_upper_limit(aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures());
1068 if (model->isSiteSpecificModel())
1069 tip_partial_lh_size = get_safe_upper_limit(aln->size()) * model->num_states * leafNum;
1070
1071 if (max_lh_slots == 0)
1072 getMemoryRequired();
1073
1074 uint64_t mem_size = (uint64_t)max_lh_slots * block_size + 4 + tip_partial_lh_size;
1075
1076 if (verbose_mode >= VB_MAX)
1077 cout << "Allocating " << mem_size * sizeof(double) << " bytes for partial likelihood vectors" << endl;
1078 try {
1079 central_partial_lh = aligned_alloc<double>(mem_size);
1080 } catch (std::bad_alloc &ba) {
1081 outError("Not enough memory for partial likelihood vectors (bad_alloc)");
1082 }
1083 if (!central_partial_lh)
1084 outError("Not enough memory for partial likelihood vectors");
1085 }
1086
1087 // now always assign tip_partial_lh
1088 if (params->lh_mem_save == LM_PER_NODE) {
1089 tip_partial_lh = central_partial_lh + ((nodeNum - leafNum)*block_size);
1090 } else {
1091 tip_partial_lh = central_partial_lh + (max_lh_slots*block_size);
1092 }
1093
1094 if (!central_scale_num) {
1095 uint64_t mem_size = max_lh_slots * scale_block_size;
1096
1097 if (verbose_mode >= VB_MAX)
1098 cout << "Allocating " << mem_size * sizeof(UBYTE) << " bytes for scale num vectors" << endl;
1099 try {
1100 central_scale_num = aligned_alloc<UBYTE>(mem_size);
1101 } catch (std::bad_alloc &ba) {
1102 outError("Not enough memory for scale num vectors (bad_alloc)");
1103 }
1104 if (!central_scale_num)
1105 outError("Not enough memory for scale num vectors");
1106 }
1107
1108 if (!central_partial_pars) {
1109 uint64_t tip_partial_pars_size = get_safe_upper_limit_float(aln->num_states * (aln->STATE_UNKNOWN+1));
1110 uint64_t mem_size = (leafNum - 1) * 4 * pars_block_size + tip_partial_pars_size;
1111 if (verbose_mode >= VB_MAX)
1112 cout << "Allocating " << mem_size * sizeof(UINT)
1113 << " bytes for partial parsimony vectors" << endl;
1114 try {
1115 central_partial_pars = aligned_alloc<UINT>(mem_size);
1116 } catch (std::bad_alloc &ba) {
1117 outError("Not enough memory for partial parsimony vectors (bad_alloc)");
1118 }
1119 if (!central_partial_pars)
1120 outError("Not enough memory for partial parsimony vectors");
1121 tip_partial_pars = central_partial_pars + ((leafNum - 1) * 4 * pars_block_size);
1122 }
1123 index = 0;
1124 indexlh = 0;
1125 }
1126 if (dad) {
1127 // assign a region in central_partial_lh to both Neihgbors (dad->node, and node->dad)
1128 PhyloNeighbor *nei = (PhyloNeighbor*) node->findNeighbor(dad);
1129 PhyloNeighbor *nei2 = (PhyloNeighbor*) dad->findNeighbor(node);
1130
1131 // first initialize partial_pars
1132 nei->partial_pars = central_partial_pars + (index * pars_block_size);
1133 index++;
1134 nei2->partial_pars = central_partial_pars + (index * pars_block_size);
1135 index ++;
1136 ASSERT(index < nodeNum * 2 - 1);
1137
1138 // now initialize partial_lh and scale_num
1139 if (params->lh_mem_save == LM_PER_NODE) {
1140 if (!node->isLeaf()) { // only allocate memory to internal node
1141 nei->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
1142 nei->scale_num = NULL;
1143 nei2->scale_num = central_scale_num + ((indexlh) * scale_block_size);
1144 nei2->partial_lh = central_partial_lh + (indexlh * block_size);
1145 indexlh++;
1146 } else {
1147 nei->partial_lh = NULL;
1148 nei->scale_num = NULL;
1149 nei2->scale_num = NULL;
1150 nei2->partial_lh = NULL;
1151 }
1152 } else {
1153 nei->partial_lh = NULL;
1154 nei->scale_num = NULL;
1155 nei2->scale_num = NULL;
1156 nei2->partial_lh = NULL;
1157 }
1158
1159 // zero memory to allocate contiguous chunk of memory
1160 // if (nei->partial_lh)
1161 // memset(nei->partial_lh, 0, block_size*sizeof(double));
1162 // if (nei2->partial_lh)
1163 // memset(nei2->partial_lh, 0, block_size*sizeof(double));
1164
1165 // if (model->isSiteSpecificModel() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
1166 // // allocate tip memory for this model
1167 // if (node->isLeaf()) {
1168 // nei2->partial_lh = tip_partial_lh + (node->id * tip_block_size);
1169 // }
1170 // if (dad->isLeaf()) {
1171 // nei->partial_lh = tip_partial_lh + (dad->id * tip_block_size);
1172 // }
1173 // }
1174 }
1175 FOR_NEIGHBOR_IT(node, dad, it) initializeAllPartialLh(index, indexlh, (PhyloNode*) (*it)->node, node);
1176 }
1177
newPartialLh()1178 double *PhyloTree::newPartialLh() {
1179 return aligned_alloc<double>(getPartialLhSize());
1180 }
1181
getPartialLhSize()1182 size_t PhyloTree::getPartialLhSize() {
1183 // +num_states for ascertainment bias correction
1184 size_t block_size = get_safe_upper_limit(aln->size())+max(get_safe_upper_limit(aln->num_states),
1185 get_safe_upper_limit(model_factory->unobserved_ptns.size()));
1186 block_size *= model->num_states * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
1187 return block_size;
1188 }
1189
getPartialLhBytes()1190 size_t PhyloTree::getPartialLhBytes() {
1191 // +num_states for ascertainment bias correction
1192 return getPartialLhSize() * sizeof(double);
1193 }
1194
getScaleNumSize()1195 size_t PhyloTree::getScaleNumSize() {
1196 size_t block_size = get_safe_upper_limit(aln->size())+max(get_safe_upper_limit(aln->num_states),
1197 get_safe_upper_limit(model_factory->unobserved_ptns.size()));
1198 return (block_size) * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
1199 }
1200
getScaleNumBytes()1201 size_t PhyloTree::getScaleNumBytes() {
1202 return getScaleNumSize()*sizeof(UBYTE);
1203 }
1204
newScaleNum()1205 UBYTE *PhyloTree::newScaleNum() {
1206 return aligned_alloc<UBYTE>(getScaleNumSize());
1207 }
1208
findFirstFarLeaf(Node * node,Node * dad=NULL)1209 Node *findFirstFarLeaf(Node *node, Node *dad = NULL) {
1210
1211 do {
1212 FOR_NEIGHBOR_IT(node, dad, it) {
1213 dad = node;
1214 node = (*it)->node;
1215 break;
1216 }
1217 } while (!node->isLeaf());
1218 return node;
1219
1220 }
1221
computeLikelihood(double * pattern_lh)1222 double PhyloTree::computeLikelihood(double *pattern_lh) {
1223 ASSERT(model);
1224 ASSERT(site_rate);
1225 ASSERT(root->isLeaf());
1226 if (!current_it) {
1227 Node *leaf = findFarthestLeaf();
1228 current_it = (PhyloNeighbor*)leaf->neighbors[0];
1229 current_it_back = (PhyloNeighbor*)current_it->node->findNeighbor(leaf);
1230 // PhyloNeighbor *nei = ((PhyloNeighbor*) root->neighbors[0]);
1231 // current_it = nei;
1232 // assert(current_it);
1233 // current_it_back = (PhyloNeighbor*) nei->node->findNeighbor(root);
1234 // assert(current_it_back);
1235 }
1236 double score;
1237 // string root_name = ROOT_NAME;
1238 // Node *vroot = findLeafName(root_name);
1239 // if (root_state != aln->STATE_UNKNOWN && vroot) {
1240 // if (verbose_mode >= VB_DEBUG)
1241 // cout << __func__ << " HIT ROOT STATE " << endl;
1242 // score = computeLikelihoodRooted((PhyloNeighbor*) vroot->neighbors[0], (PhyloNode*) vroot);
1243 // } else {
1244 score = computeLikelihoodBranch(current_it, (PhyloNode*) current_it_back->node);
1245 // }
1246 if (pattern_lh)
1247 memmove(pattern_lh, _pattern_lh, aln->size() * sizeof(double));
1248
1249 if (pattern_lh && current_it->lh_scale_factor < 0.0) {
1250 int nptn = aln->getNPattern();
1251 //double check_score = 0.0;
1252 for (int i = 0; i < nptn; i++) {
1253 pattern_lh[i] += max(current_it->scale_num[i], UBYTE(0)) * LOG_SCALING_THRESHOLD;
1254 //check_score += (pattern_lh[i] * (aln->at(i).frequency));
1255 }
1256 /* if (fabs(score - check_score) > 1e-6) {
1257 cout << "score = " << score << " check_score = " << check_score << endl;
1258 outError("Scaling error ", __func__);
1259 }*/
1260 }
1261 curScore = score;
1262 return score;
1263 }
1264
1265 //double PhyloTree::computeLikelihoodRooted(PhyloNeighbor *dad_branch, PhyloNode *dad) {
1266 // double score = computeLikelihoodBranchNaive(dad_branch, dad);
1267 // if (verbose_mode >= VB_DEBUG) {
1268 // printTransMatrices(dad_branch->node, dad);
1269 // /*
1270 // FOR_NEIGHBOR_IT(dad_branch->node, dad, it) {
1271 // PhyloNeighbor *pit = (PhyloNeighbor*)(*it);
1272 // cout << pit->node->name << "\t" << pit->partial_lh[0] << endl;
1273 //
1274 // }*/
1275 // }
1276 // double* state_freq = new double[aln->num_states];
1277 // model->getStateFrequency(state_freq);
1278 // score -= log(state_freq[(int) root_state]);
1279 // delete[] state_freq;
1280 // return score;
1281 //}
1282
getNumLhCat(SiteLoglType wsl)1283 int PhyloTree::getNumLhCat(SiteLoglType wsl) {
1284 int ncat = 0;
1285 switch (wsl) {
1286 case WSL_NONE: ASSERT(0 && "is not WSL_NONE"); return 0;
1287 case WSL_SITE: ASSERT(0 && "is not WSL_SITE"); return 0;
1288 case WSL_MIXTURE_RATECAT:
1289 ncat = getRate()->getNDiscreteRate();
1290 if (getModel()->isMixture() && !getModelFactory()->fused_mix_rate)
1291 ncat *= getModel()->getNMixtures();
1292 return ncat;
1293 case WSL_RATECAT:
1294 return getRate()->getNDiscreteRate();
1295 case WSL_MIXTURE:
1296 return getModel()->getNMixtures();
1297 }
1298 }
1299
transformPatternLhCat()1300 void PhyloTree::transformPatternLhCat() {
1301 if (vector_size == 1)
1302 return;
1303
1304 size_t nptn = ((aln->size()+vector_size-1)/vector_size)*vector_size;
1305 // size_t nstates = aln->num_states;
1306 size_t ncat = site_rate->getNRate();
1307 if (!model_factory->fused_mix_rate) ncat *= model->getNMixtures();
1308
1309 double *mem = aligned_alloc<double>(nptn*ncat);
1310 memcpy(mem, _pattern_lh_cat, nptn*ncat*sizeof(double));
1311 double *memptr = mem;
1312
1313 size_t ptn, cat, i;
1314 for (ptn = 0; ptn < nptn; ptn+=vector_size) {
1315 double *lh_cat_ptr = &_pattern_lh_cat[ptn*ncat];
1316 for (cat = 0; cat < ncat; cat++) {
1317 for (i = 0; i < vector_size; i++)
1318 lh_cat_ptr[i*ncat+cat] = memptr[i];
1319 memptr += vector_size;
1320 }
1321 }
1322 aligned_free(mem);
1323 }
1324
computePatternLhCat(SiteLoglType wsl)1325 double PhyloTree::computePatternLhCat(SiteLoglType wsl) {
1326 if (!current_it) {
1327 Node *leaf = findFirstFarLeaf(root);
1328 current_it = (PhyloNeighbor*)leaf->neighbors[0];
1329 current_it_back = (PhyloNeighbor*)current_it->node->findNeighbor(leaf);
1330 }
1331
1332 double score;
1333
1334 score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1335 // TODO: SIMD aware
1336 transformPatternLhCat();
1337 /*
1338 if (getModel()->isSiteSpecificModel()) {
1339 score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1340 } else if (!getModel()->isMixture())
1341 score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1342 else if (getModelFactory()->fused_mix_rate)
1343 score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1344 else {
1345 score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
1346 */
1347 if (!getModel()->isSiteSpecificModel() && getModel()->isMixture() && !getModelFactory()->fused_mix_rate) {
1348 if (wsl == WSL_MIXTURE || wsl == WSL_RATECAT) {
1349 double *lh_cat = _pattern_lh_cat;
1350 double *lh_res = _pattern_lh_cat;
1351 size_t ptn, nptn = aln->getNPattern();
1352 size_t m, nmixture = getModel()->getNMixtures();
1353 size_t c, ncat = getRate()->getNRate();
1354 if (wsl == WSL_MIXTURE && ncat > 1) {
1355 // transform to lh per mixture class
1356 for (ptn = 0; ptn < nptn; ptn++) {
1357 for (m = 0; m < nmixture; m++) {
1358 double lh = lh_cat[0];
1359 for (c = 1; c < ncat; c++)
1360 lh += lh_cat[c];
1361 lh_res[m] = lh;
1362 lh_cat += ncat;
1363 }
1364 lh_res += nmixture;
1365 }
1366 } else if (wsl == WSL_RATECAT && nmixture > 1) {
1367 // transform to lh per rate category
1368 for (ptn = 0; ptn < nptn; ptn++) {
1369 if (lh_res != lh_cat)
1370 memcpy(lh_res, lh_cat, ncat*sizeof(double));
1371 lh_cat += ncat;
1372 for (m = 1; m < nmixture; m++) {
1373 for (c = 0; c < ncat; c++)
1374 lh_res[c] += lh_cat[c];
1375 lh_cat += ncat;
1376 }
1377 lh_res += ncat;
1378 }
1379 }
1380 }
1381 }
1382
1383 return score;
1384 }
1385
computePatternStateFreq(double * ptn_state_freq)1386 void PhyloTree::computePatternStateFreq(double *ptn_state_freq) {
1387 ASSERT(getModel()->isMixture());
1388 computePatternLhCat(WSL_MIXTURE);
1389 double *lh_cat = _pattern_lh_cat;
1390 size_t ptn, nptn = getAlnNPattern(), m, nmixture = getModel()->getNMixtures();
1391 double *ptn_freq = ptn_state_freq;
1392 size_t state, nstates = aln->num_states;
1393 // ModelMixture *models = (ModelMixture*)model;
1394
1395 if (params->print_site_state_freq == WSF_POSTERIOR_MEAN) {
1396 cout << "Computing posterior mean site frequencies...." << endl;
1397 // loop over all site-patterns
1398 for (ptn = 0; ptn < nptn; ptn++) {
1399
1400 // first compute posterior for each mixture component
1401 double sum_lh = 0.0;
1402 for (m = 0; m < nmixture; m++) {
1403 sum_lh += lh_cat[m];
1404 }
1405 sum_lh = 1.0/sum_lh;
1406 for (m = 0; m < nmixture; m++) {
1407 lh_cat[m] *= sum_lh;
1408 }
1409
1410 // now compute state frequencies
1411 for (state = 0; state < nstates; state++) {
1412 double freq = 0;
1413 for (m = 0; m < nmixture; m++)
1414 freq += model->getMixtureClass(m)->state_freq[state] * lh_cat[m];
1415 ptn_freq[state] = freq;
1416 }
1417
1418 // increase the pointers
1419 lh_cat += nmixture;
1420 ptn_freq += nstates;
1421 }
1422 } else if (params->print_site_state_freq == WSF_POSTERIOR_MAX) {
1423 cout << "Computing posterior max site frequencies...." << endl;
1424 // loop over all site-patterns
1425 for (ptn = 0; ptn < nptn; ptn++) {
1426
1427 // first compute posterior for each mixture component
1428 size_t max_comp = 0;
1429 for (m = 1; m < nmixture; m++)
1430 if (lh_cat[m] > lh_cat[max_comp]) {
1431 max_comp = m;
1432 }
1433
1434 // now compute state frequencies
1435 memcpy(ptn_freq, model->getMixtureClass(max_comp)->state_freq, nstates*sizeof(double));
1436
1437 // increase the pointers
1438 lh_cat += nmixture;
1439 ptn_freq += nstates;
1440 }
1441 }
1442 }
1443
1444
1445
computePatternLikelihood(double * ptn_lh,double * cur_logl,double * ptn_lh_cat,SiteLoglType wsl)1446 void PhyloTree::computePatternLikelihood(double *ptn_lh, double *cur_logl, double *ptn_lh_cat, SiteLoglType wsl) {
1447 /* if (!dad_branch) {
1448 dad_branch = (PhyloNeighbor*) root->neighbors[0];
1449 dad = (PhyloNode*) root;
1450 }*/
1451 int nptn = aln->getNPattern();
1452 int i;
1453 int ncat = getNumLhCat(wsl);
1454 if (ptn_lh_cat) {
1455 // Right now only Naive version store _pattern_lh_cat!
1456 computePatternLhCat(wsl);
1457 }
1458
1459 double sum_scaling = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
1460 //double sum_scaling = 0.0;
1461 if (sum_scaling < 0.0) {
1462 if (current_it->lh_scale_factor == 0.0) {
1463 for (i = 0; i < nptn; i++) {
1464 ptn_lh[i] = _pattern_lh[i] + (max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
1465 }
1466 } else if (current_it_back->lh_scale_factor == 0.0){
1467 for (i = 0; i < nptn; i++) {
1468 ptn_lh[i] = _pattern_lh[i] + (max(UBYTE(0), current_it->scale_num[i])) * LOG_SCALING_THRESHOLD;
1469 }
1470 } else {
1471 for (i = 0; i < nptn; i++) {
1472 ptn_lh[i] = _pattern_lh[i] + (max(UBYTE(0), current_it->scale_num[i]) +
1473 max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
1474 }
1475 }
1476 } else {
1477 memmove(ptn_lh, _pattern_lh, nptn * sizeof(double));
1478 }
1479
1480 if (!ptn_lh_cat)
1481 return;
1482
1483 /*
1484 if (ptn_lh_cat && model->isSiteSpecificModel()) {
1485 int offset = 0;
1486 if (sum_scaling == 0.0) {
1487 int nptncat = nptn * ncat;
1488 for (i = 0; i < nptncat; i++) {
1489 ptn_lh_cat[i] = log(_pattern_lh_cat[i]);
1490 }
1491 } else if (current_it->lh_scale_factor == 0.0) {
1492 for (i = 0; i < nptn; i++) {
1493 double scale = (max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
1494 for (int j = 0; j < ncat; j++, offset++)
1495 ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
1496 }
1497 } else if (current_it_back->lh_scale_factor == 0.0) {
1498 for (i = 0; i < nptn; i++) {
1499 double scale = (max(UBYTE(0), current_it->scale_num[i])) * LOG_SCALING_THRESHOLD;
1500 for (int j = 0; j < ncat; j++, offset++)
1501 ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
1502 }
1503 } else {
1504 for (i = 0; i < nptn; i++) {
1505 double scale = (max(UBYTE(0), current_it->scale_num[i]) +
1506 max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
1507 for (int j = 0; j < ncat; j++, offset++)
1508 ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
1509 }
1510 }
1511 return;
1512 }
1513 */
1514
1515 // New kernel
1516 int ptn;
1517 PhyloNeighbor *nei1 = current_it;
1518 PhyloNeighbor *nei2 = current_it_back;
1519 if (!nei1->node->isLeaf() && nei2->node->isLeaf()) {
1520 // exchange
1521 PhyloNeighbor *tmp = nei1;
1522 nei1 = nei2;
1523 nei2 = tmp;
1524 }
1525 if (nei1->node->isLeaf()) {
1526 // external branch
1527 double *lh_cat = _pattern_lh_cat;
1528 double *out_lh_cat = ptn_lh_cat;
1529 UBYTE *nei2_scale = nei2->scale_num;
1530 if (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling) {
1531 // per-category scaling
1532 for (ptn = 0; ptn < nptn; ptn++) {
1533 for (i = 0; i < ncat; i++) {
1534 out_lh_cat[i] = log(lh_cat[i]) + nei2_scale[i] * LOG_SCALING_THRESHOLD;
1535 }
1536 lh_cat += ncat;
1537 out_lh_cat += ncat;
1538 nei2_scale += ncat;
1539 }
1540 } else {
1541 // normal scaling
1542 for (ptn = 0; ptn < nptn; ptn++) {
1543 double scale = nei2_scale[ptn] * LOG_SCALING_THRESHOLD;
1544 for (i = 0; i < ncat; i++)
1545 out_lh_cat[i] = log(lh_cat[i]) + scale;
1546 lh_cat += ncat;
1547 out_lh_cat += ncat;
1548 }
1549 }
1550 } else {
1551 // internal branch
1552 double *lh_cat = _pattern_lh_cat;
1553 double *out_lh_cat = ptn_lh_cat;
1554 UBYTE *nei1_scale = nei1->scale_num;
1555 UBYTE *nei2_scale = nei2->scale_num;
1556 if (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling) {
1557 // per-category scaling
1558 for (ptn = 0; ptn < nptn; ptn++) {
1559 for (i = 0; i < ncat; i++) {
1560 out_lh_cat[i] = log(lh_cat[i]) + (nei1_scale[i]+nei2_scale[i]) * LOG_SCALING_THRESHOLD;
1561 }
1562 lh_cat += ncat;
1563 out_lh_cat += ncat;
1564 nei1_scale += ncat;
1565 nei2_scale += ncat;
1566 }
1567 } else {
1568 // normal scaling
1569 for (ptn = 0; ptn < nptn; ptn++) {
1570 double scale = (nei1_scale[ptn] + nei2_scale[ptn]) * LOG_SCALING_THRESHOLD;
1571 for (i = 0; i < ncat; i++)
1572 out_lh_cat[i] = log(lh_cat[i]) + scale;
1573 lh_cat += ncat;
1574 out_lh_cat += ncat;
1575 }
1576 }
1577 }
1578
1579 // if (cur_logl) {
1580 // double check_score = 0.0;
1581 // for (int i = 0; i < nptn; i++) {
1582 // check_score += (ptn_lh[i] * (aln->at(i).frequency));
1583 // }
1584 // if (fabs(check_score - *cur_logl) > 0.01) {
1585 // cout << *cur_logl << " " << check_score << endl;
1586 // assert(0);
1587 // }
1588 // }
1589 //double score = computeLikelihoodBranch(dad_branch, dad, pattern_lh);
1590 //return score;
1591 }
1592
computePatternProbabilityCategory(double * ptn_prob_cat,SiteLoglType wsl)1593 void PhyloTree::computePatternProbabilityCategory(double *ptn_prob_cat, SiteLoglType wsl) {
1594 /* if (!dad_branch) {
1595 dad_branch = (PhyloNeighbor*) root->neighbors[0];
1596 dad = (PhyloNode*) root;
1597 }*/
1598 size_t ptn, nptn = aln->getNPattern();
1599 size_t cat, ncat = getNumLhCat(wsl);
1600 // Right now only Naive version store _pattern_lh_cat!
1601 computePatternLhCat(wsl);
1602
1603 memcpy(ptn_prob_cat, _pattern_lh_cat, sizeof(double)*nptn*ncat);
1604
1605 for (ptn = 0; ptn < nptn; ptn++) {
1606 double *lh_cat = ptn_prob_cat + ptn*ncat;
1607 double sum = lh_cat[0];
1608 for (cat = 1; cat < ncat; cat++)
1609 sum += lh_cat[cat];
1610 sum = 1.0/sum;
1611 for (cat = 0; cat < ncat; cat++)
1612 lh_cat[cat] *= sum;
1613 }
1614 }
1615
computePatternCategories(IntVector * pattern_ncat)1616 int PhyloTree::computePatternCategories(IntVector *pattern_ncat) {
1617 if (sse != LK_386) {
1618 // compute _pattern_lh_cat
1619 computePatternLhCat(WSL_MIXTURE_RATECAT);
1620 }
1621
1622 size_t npattern = aln->getNPattern();
1623 size_t ncat = getRate()->getNRate();
1624 size_t nmixture;
1625 if (getModel()->isMixture() && !getModelFactory()->fused_mix_rate)
1626 nmixture = getModel()->getNMixtures();
1627 else
1628 nmixture = ncat;
1629 size_t ptn, m, c;
1630 if (pattern_ncat)
1631 pattern_ncat->resize(npattern);
1632 if (ptn_cat_mask.empty())
1633 ptn_cat_mask.resize(npattern, 0);
1634
1635 size_t num_best_mixture = 0;
1636 ASSERT(ncat < sizeof(uint64_t)*8 && nmixture < sizeof(uint64_t)*8);
1637
1638 double *lh_cat = _pattern_lh_cat;
1639 // double *cat_prob = new double[ncat];
1640 double *lh_mixture = new double[nmixture];
1641 double *sorted_lh_mixture = new double[nmixture];
1642 int *id_mixture = new int[nmixture];
1643
1644 // for (c = 0; c < ncat; c++)
1645 // cat_prob[c] = getRate()->getProp(c);
1646
1647 // cout << "Ptn\tFreq\tNumMix\tBestMix" << endl;
1648 size_t sum_nmix = 0;
1649 for (ptn = 0; ptn < npattern; ptn++) {
1650 double sum_prob = 0.0, acc_prob = 0.0;
1651 memset(lh_mixture, 0, nmixture*sizeof(double));
1652 if (getModel()->isMixture() && !getModelFactory()->fused_mix_rate) {
1653 for (m = 0; m < nmixture; m++) {
1654 for (c = 0; c < ncat; c++) {
1655 // lh_mixture[m] += lh_cat[c] * cat_prob[c];
1656 lh_mixture[m] += lh_cat[c];
1657 }
1658 // lh_mixture[m] *= prop[m];
1659 sum_prob += lh_mixture[m];
1660 lh_cat += ncat;
1661 id_mixture[m] = m;
1662 }
1663 } else {
1664 for (m = 0; m < nmixture; m++) {
1665 // lh_mixture[m] = lh_cat[m] * prop[m];
1666 lh_mixture[m] = lh_cat[m];
1667 sum_prob += lh_mixture[m];
1668 id_mixture[m] = m;
1669 }
1670 lh_cat += nmixture;
1671 }
1672 sum_prob = 1.0 / sum_prob;
1673 for (m = 0; m < nmixture; m++) {
1674 lh_mixture[m] *= sum_prob;
1675 sorted_lh_mixture[m] = -lh_mixture[m];
1676 }
1677 quicksort(sorted_lh_mixture, 0, m-1, id_mixture);
1678 for (m = 0; m < nmixture && acc_prob <= 0.99; m++) {
1679 acc_prob -= sorted_lh_mixture[m];
1680 ptn_cat_mask[ptn] |= (uint64_t)1 << id_mixture[m];
1681 }
1682 if (m > num_best_mixture)
1683 num_best_mixture = m;
1684 sum_nmix += m;
1685 if (pattern_ncat)
1686 (*pattern_ncat)[ptn] = m;
1687
1688 if (verbose_mode >= VB_MED) {
1689 cout << ptn << "\t" << (int)ptn_freq[ptn] << "\t" << m << "\t" << id_mixture[0];
1690 for (c = 0; c < m; c++)
1691 cout << "\t" << id_mixture[c] << "\t" << -sorted_lh_mixture[c];
1692 cout << endl;
1693 }
1694 }
1695 // cout << 100*(double(sum_nmix)/nmixture)/npattern << "% computation necessary" << endl;
1696 delete [] id_mixture;
1697 delete [] sorted_lh_mixture;
1698 delete [] lh_mixture;
1699 // delete [] cat_prob;
1700 return num_best_mixture;
1701 }
1702
computeLogLVariance(double * ptn_lh,double tree_lh)1703 double PhyloTree::computeLogLVariance(double *ptn_lh, double tree_lh) {
1704 int i;
1705 int nptn = getAlnNPattern();
1706 int nsite = getAlnNSite();
1707 double *pattern_lh = ptn_lh;
1708 if (!ptn_lh) {
1709 pattern_lh = new double[nptn];
1710 computePatternLikelihood(pattern_lh);
1711 }
1712 IntVector pattern_freq;
1713 aln->getPatternFreq(pattern_freq);
1714 if (tree_lh == 0.0) {
1715 for (i = 0; i < nptn; i++)
1716 tree_lh += pattern_lh[i] * pattern_freq[i];
1717 }
1718 double avg_site_lh = tree_lh / nsite;
1719 double variance = 0.0;
1720 for (i = 0; i < nptn; i++) {
1721 double diff = (pattern_lh[i] - avg_site_lh);
1722 variance += diff * diff * pattern_freq[i];
1723 }
1724 if (!ptn_lh)
1725 delete[] pattern_lh;
1726 if (nsite <= 1)
1727 return 0.0;
1728 return variance * ((double) nsite / (nsite - 1.0));
1729 }
1730
computeLogLDiffVariance(double * pattern_lh_other,double * ptn_lh)1731 double PhyloTree::computeLogLDiffVariance(double *pattern_lh_other, double *ptn_lh) {
1732 int i;
1733 int nptn = getAlnNPattern();
1734 int nsite = getAlnNSite();
1735 double *pattern_lh = ptn_lh;
1736 if (!ptn_lh) {
1737 pattern_lh = new double[nptn];
1738 computePatternLikelihood(pattern_lh);
1739 }
1740 IntVector pattern_freq;
1741 aln->getPatternFreq(pattern_freq);
1742
1743 double avg_site_lh_diff = 0.0;
1744 for (i = 0; i < nptn; i++)
1745 avg_site_lh_diff += (pattern_lh[i] - pattern_lh_other[i]) * pattern_freq[i];
1746 avg_site_lh_diff /= nsite;
1747 double variance = 0.0;
1748 for (i = 0; i < nptn; i++) {
1749 double diff = (pattern_lh[i] - pattern_lh_other[i] - avg_site_lh_diff);
1750 variance += diff * diff * pattern_freq[i];
1751 }
1752 if (!ptn_lh)
1753 delete[] pattern_lh;
1754 if (nsite <= 1)
1755 return 0.0;
1756 return variance * ((double) nsite / (nsite - 1.0));
1757 }
1758
computeLogLDiffVariance(PhyloTree * other_tree,double * pattern_lh)1759 double PhyloTree::computeLogLDiffVariance(PhyloTree *other_tree, double *pattern_lh) {
1760 double *pattern_lh_other = new double[getAlnNPattern()];
1761 other_tree->computePatternLikelihood(pattern_lh_other);
1762 // BUG FIX found by Xcode analyze (use of memory after it is freed)
1763 // delete[] pattern_lh_other;
1764 double res = computeLogLDiffVariance(pattern_lh_other, pattern_lh);
1765 delete[] pattern_lh_other;
1766 return res;
1767 }
1768
getUnmarkedNodes(PhyloNodeVector & unmarkedNodes,PhyloNode * node,PhyloNode * dad)1769 void PhyloTree::getUnmarkedNodes(PhyloNodeVector& unmarkedNodes, PhyloNode* node, PhyloNode* dad) {
1770 if (!node) {
1771 node = (PhyloNode*) root;
1772 }
1773
1774 if (markedNodeList.find(node->id) == markedNodeList.end()) {
1775 int numUnmarkedNei = 0;
1776 for (NeighborVec::iterator it = (node)->neighbors.begin(); it != (node)->neighbors.end(); it++) {
1777 if (markedNodeList.find((*it)->node->id) == markedNodeList.end())
1778 numUnmarkedNei++;
1779 }
1780 if (numUnmarkedNei == 1)
1781 unmarkedNodes.push_back(node);
1782 }
1783
1784 FOR_NEIGHBOR_IT(node, dad, it){
1785 getUnmarkedNodes(unmarkedNodes, (PhyloNode*) (*it)->node, node);
1786 }
1787 }
1788
optimizeOneBranchLS(PhyloNode * node1,PhyloNode * node2)1789 double PhyloTree::optimizeOneBranchLS(PhyloNode *node1, PhyloNode *node2) {
1790 if (!subTreeDistComputed) {
1791 if (params->ls_var_type == WLS_PAUPLIN) {
1792 computeNodeBranchDists();
1793 for (int i = 0; i < leafNum; i++)
1794 for (int j = 0; j < leafNum; j++)
1795 var_matrix[i*leafNum+j] = pow(2.0,nodeBranchDists[i*nodeNum+j]);
1796 }
1797 computeSubtreeDists();
1798 }
1799 double A, B, C, D;
1800 A = B = C = D = 0;
1801 PhyloNode *nodeA = NULL, *nodeB = NULL, *nodeC = NULL, *nodeD = NULL;
1802 double lsBranch;
1803
1804 // One of the node is a leaf
1805 if (node1->isLeaf() || node2->isLeaf()) {
1806 if (node1->isLeaf()) {
1807 // nodeA and nodeB are children of node2
1808 FOR_NEIGHBOR_IT(node2, node1, it){
1809 if (A == 0) {
1810 A = getNumTaxa((*it)->node, node2);
1811 nodeA = (PhyloNode*) (*it)->node;
1812 } else {
1813 B = getNumTaxa((*it)->node, node2);
1814 nodeB = (PhyloNode*) (*it)->node;
1815 }
1816 }
1817 // nodeC is now node1
1818 nodeC = node1;
1819 } else {
1820 // nodeA and nodeB are children of node1
1821 FOR_NEIGHBOR_IT(node1, node2, it) {
1822 if (A == 0) {
1823 A = getNumTaxa((*it)->node, node1);
1824 nodeA = (PhyloNode*) (*it)->node;
1825 } else {
1826 B = getNumTaxa((*it)->node, node1);
1827 nodeB = (PhyloNode*) (*it)->node;
1828 }
1829 }
1830 // nodeC is now node1
1831 nodeC = node2;
1832 }
1833 ASSERT(A != 0);
1834 ASSERT(B != 0);
1835 string keyAC = getBranchID(nodeA, nodeC);
1836 ASSERT(subTreeDists.count(keyAC));
1837 double distAC = subTreeDists[keyAC];
1838 double weightAC = subTreeWeights[keyAC];
1839 string keyBC = getBranchID(nodeB, nodeC);
1840 ASSERT(subTreeDists.count(keyBC));
1841 double distBC = subTreeDists[keyBC];
1842 double weightBC = subTreeWeights[keyBC];
1843 string keyAB = getBranchID(nodeA, nodeB);
1844 ASSERT(subTreeDists.count(keyAB));
1845 double distAB = subTreeDists[keyAB];
1846 double weightAB = subTreeWeights[keyAB];
1847 if (params->ls_var_type == OLS/* || params->ls_var_type == FIRST_TAYLOR || params->ls_var_type == FITCH_MARGOLIASH
1848 || params->ls_var_type == SECOND_TAYLOR*/) {
1849 lsBranch = 0.5 * (distAC / A + distBC / B - distAB / (A * B));
1850 } /*else if (params->ls_var_type == PAUPLIN) {
1851 // TODO: Chua test bao gio
1852 outError("Paulin formula not supported yet");
1853 lsBranch = 0.5 * (distAC + distBC) - 0.5 * distAB;
1854 }*/ else {
1855 // weighted least square
1856 lsBranch = 0.5*(distAC/weightAC + distBC/weightBC - distAB/weightAB);
1857 }
1858 } else { // Both node are internal node
1859 FOR_NEIGHBOR_IT(node1, node2, it) {
1860 if (A == 0) {
1861 A = getNumTaxa((*it)->node, node1);
1862 nodeA = (PhyloNode*) (*it)->node;
1863 } else {
1864 B = getNumTaxa((*it)->node, node1);
1865 nodeB = (PhyloNode*) (*it)->node;
1866 }
1867 }
1868
1869 FOR_NEIGHBOR_IT(node2, node1, it) {
1870 if (C == 0) {
1871 C = getNumTaxa((*it)->node, node2);
1872 nodeC = (PhyloNode*) (*it)->node;
1873 } else {
1874 D = getNumTaxa((*it)->node, node2);
1875 nodeD = (PhyloNode*) (*it)->node;
1876 }
1877 }
1878
1879 string keyAC = getBranchID(nodeA, nodeC);
1880 ASSERT(subTreeDists.count(keyAC));
1881 double distAC = subTreeDists[keyAC];
1882 double weightAC = subTreeWeights[keyAC];
1883
1884 string keyBD = getBranchID(nodeB, nodeD);
1885 ASSERT(subTreeDists.count(keyBD));
1886 double distBD = subTreeDists[keyBD];
1887 double weightBD = subTreeWeights[keyBD];
1888
1889 string keyBC = getBranchID(nodeB, nodeC);
1890 ASSERT(subTreeDists.count(keyBC));
1891 double distBC = subTreeDists[keyBC];
1892 double weightBC = subTreeWeights[keyBC];
1893
1894 string keyAD = getBranchID(nodeA, nodeD);
1895 ASSERT(subTreeDists.count(keyAD));
1896 double distAD = subTreeDists[keyAD];
1897 double weightAD = subTreeWeights[keyAD];
1898
1899 string keyAB = getBranchID(nodeA, nodeB);
1900 ASSERT(subTreeDists.count(keyAB));
1901 double distAB = subTreeDists[keyAB];
1902 double weightAB = subTreeWeights[keyAB];
1903
1904 string keyCD = getBranchID(nodeC, nodeD);
1905 ASSERT(subTreeDists.count(keyCD));
1906 double distCD = subTreeDists[keyCD];
1907 double weightCD = subTreeWeights[keyCD];
1908
1909 /*if (params->ls_var_type == PAUPLIN) {
1910 // this distance has a typo as also seen in Mihaescu & Pachter 2008
1911 //lsBranch = 0.25 * (distAC + distBD + distAD + distBC) - 0.5 * (distAB - distCD);
1912 outError("Paulin formula not supported yet");
1913 lsBranch = 0.25 * (distAC + distBD + distAD + distBC) - 0.5 * (distAB + distCD);
1914 } else*/ if (params->ls_var_type == OLS) {
1915 double gamma = (B * C + A * D) / ((A + B)*(C + D));
1916 lsBranch = 0.5 * (gamma * (distAC / (A * C) + distBD / (B * D))
1917 + (1 - gamma) * (distBC / (B * C) + distAD / (A * D))
1918 - distAB / (A * B) - distCD / (C * D));
1919 } else {
1920 // weighted least square
1921 double K = 1.0/weightAC + 1.0/weightBD + 1.0/weightAD + 1.0/weightBC;
1922 lsBranch =
1923 ((distAC/weightAC+distBD/weightBD)*(weightAD+weightBC)/(weightAD*weightBC)+
1924 (distAD/weightAD+distBC/weightBC)*(weightAC+weightBD)/(weightAC*weightBD))/K
1925 - distAB/weightAB - distCD/weightCD;
1926 lsBranch = 0.5*lsBranch;
1927 }
1928 }
1929 return lsBranch;
1930 }
1931
updateSubtreeDists(NNIMove & nnimove)1932 void PhyloTree::updateSubtreeDists(NNIMove &nnimove) {
1933 ASSERT(subTreeDistComputed);
1934 PhyloNode *nodeA = NULL, *nodeB = NULL, *nodeC = NULL, *nodeD = NULL;
1935 PhyloNode *node1 = nnimove.node1;
1936 PhyloNode *node2 = nnimove.node2;
1937 NeighborVec::iterator node1Nei_it = nnimove.node1Nei_it;
1938 NeighborVec::iterator node2Nei_it = nnimove.node2Nei_it;
1939 Neighbor *node1Nei = *(node1Nei_it);
1940 Neighbor *node2Nei = *(node2Nei_it);
1941
1942 // ((A,C),(B,D))
1943 // C and D are the 2 subtree that get swapped
1944 FOR_NEIGHBOR_IT(node1, node2, it) {
1945 if ((*it)->id != node1Nei->id) {
1946 nodeA = (PhyloNode*) (*it)->node;
1947 } else {
1948 nodeC = (PhyloNode*) (*it)->node;
1949 }
1950 }
1951
1952 ASSERT(nodeA);
1953 ASSERT(nodeC);
1954
1955 FOR_NEIGHBOR_IT(node2, node1, it) {
1956 if ((*it)->id != node2Nei->id) {
1957 nodeB = (PhyloNode*) (*it)->node;
1958 } else {
1959 nodeD = (PhyloNode*) (*it)->node;
1960 }
1961 }
1962
1963 ASSERT(nodeB);
1964 ASSERT(nodeD);
1965
1966 NodeVector nodeListA, nodeListB, nodeListC, nodeListD;
1967 getAllNodesInSubtree(nodeA, node1, nodeListA);
1968 getAllNodesInSubtree(nodeC, node1, nodeListC);
1969 getAllNodesInSubtree(nodeB, node2, nodeListB);
1970 getAllNodesInSubtree(nodeD, node2, nodeListD);
1971
1972 for (NodeVector::iterator it = nodeListA.begin(); it != nodeListA.end(); ++it) {
1973 string key = getBranchID((*it), node2);
1974 double distB = subTreeDists.find(getBranchID((*it), nodeB))->second;
1975 double distD = subTreeDists.find(getBranchID((*it), nodeD))->second;
1976 double newDist = distB + distD;
1977 StringDoubleMap::iterator dist_it = subTreeDists.find(key);
1978 ASSERT(dist_it != subTreeDists.end());
1979 dist_it->second = newDist;
1980 }
1981
1982 for (NodeVector::iterator it = nodeListB.begin(); it != nodeListB.end(); ++it) {
1983 string key = getBranchID((*it), node1);
1984 double distC = subTreeDists.find(getBranchID((*it), nodeC))->second;
1985 double distA = subTreeDists.find(getBranchID((*it), nodeA))->second;
1986 double newDist = distC + distA;
1987 StringDoubleMap::iterator dist_it = subTreeDists.find(key);
1988 ASSERT(dist_it != subTreeDists.end());
1989 dist_it->second = newDist;
1990 }
1991
1992 for (NodeVector::iterator it = nodeListC.begin(); it != nodeListC.end(); ++it) {
1993 string key = getBranchID((*it), node2);
1994 double distD = subTreeDists.find(getBranchID((*it), nodeD))->second;
1995 double distB = subTreeDists.find(getBranchID((*it), nodeB))->second;
1996 double newDist = distD + distB;
1997 StringDoubleMap::iterator dist_it = subTreeDists.find(key);
1998 ASSERT(dist_it != subTreeDists.end());
1999 dist_it->second = newDist;
2000 }
2001
2002 for (NodeVector::iterator it = nodeListD.begin(); it != nodeListD.end(); ++it) {
2003 string key = getBranchID((*it), node1);
2004 double distA = subTreeDists.find(getBranchID((*it), nodeA))->second;
2005 double distC = subTreeDists.find(getBranchID((*it), nodeC))->second;
2006 double newDist = distA + distC;
2007 StringDoubleMap::iterator dist_it = subTreeDists.find(key);
2008 ASSERT(dist_it != subTreeDists.end());
2009 dist_it->second = newDist;
2010 }
2011
2012 double distAB = subTreeDists.find(getBranchID(nodeA, nodeB))->second;
2013 double distAD = subTreeDists.find(getBranchID(nodeA, nodeD))->second;
2014 double distCB = subTreeDists.find(getBranchID(nodeC, nodeB))->second;
2015 double distCD = subTreeDists.find(getBranchID(nodeC, nodeD))->second;
2016
2017 subTreeDists.find(getBranchID(node1, node2))->second = distAB + distAD + distCB + distCD;
2018
2019 }
2020
computeSubtreeDists()2021 void PhyloTree::computeSubtreeDists() {
2022 PhyloNodeVector unmarkedNodes;
2023 subTreeDists.clear();
2024 subTreeWeights.clear();
2025 do {
2026 // Generate a list of unmarked node that is adjacent to exactly one unmarked nodes
2027 // Here we will work up the tree in a bottom up manner
2028 unmarkedNodes.clear();
2029 getUnmarkedNodes(unmarkedNodes);
2030 if (unmarkedNodes.size() == 0)
2031 break;
2032
2033 for (PhyloNodeVector::iterator it = unmarkedNodes.begin(); it != unmarkedNodes.end(); ++it) {
2034 // if the node is an internal node then all of its child nodes should be marked
2035 // source_nei1 and source_nei2 are the 2 marked child node
2036 // nextNode is the other node, used for traversal
2037 PhyloNode* source_nei1 = NULL;
2038 PhyloNode* source_nei2 = NULL;
2039 PhyloNode* nextNode;
2040 if (!(*it)->isLeaf()) {
2041 // select the 2 marked child nodes
2042 for (NeighborVec::iterator it2 = (*it)->neighbors.begin(); it2 != (*it)->neighbors.end(); ++it2) {
2043 if (markedNodeList.find((*it2)->node->id) != markedNodeList.end()) {
2044 if (!source_nei1) {
2045 source_nei1 = (PhyloNode*) (*it2)->node;
2046 } else {
2047 source_nei2 = (PhyloNode*) (*it2)->node;
2048 }
2049 } else {
2050 nextNode = (PhyloNode*) (*it2)->node;
2051 }
2052 }
2053 ASSERT(source_nei1);
2054 ASSERT(source_nei2);
2055 } else {
2056 nextNode = (PhyloNode*) (*it)->neighbors[0]->node;
2057 }
2058 // warning: 'nextNode' may be used uninitialized in this function
2059 computeAllSubtreeDistForOneNode((*it), source_nei1, source_nei2, (*it), nextNode);
2060 markedNodeList.insert(IntPhyloNodeMap::value_type((*it)->id, (*it)));
2061 }
2062 } while (true);
2063 markedNodeList.clear();
2064 subTreeDistComputed = true;
2065 }
2066
computeAllSubtreeDistForOneNode(PhyloNode * source,PhyloNode * source_nei1,PhyloNode * source_nei2,PhyloNode * node,PhyloNode * dad)2067 void PhyloTree::computeAllSubtreeDistForOneNode(PhyloNode* source, PhyloNode* source_nei1, PhyloNode* source_nei2,
2068 PhyloNode* node, PhyloNode* dad) {
2069 string key = getBranchID(source, dad);
2070 double dist, weight;
2071 if (markedNodeList.find(dad->id) != markedNodeList.end()) {
2072 return;
2073 } else if (source->isLeaf() && dad->isLeaf()) {
2074 ASSERT(dist_matrix);
2075 int nseq = aln->getNSeq();
2076 if (params->ls_var_type == OLS) {
2077 dist = dist_matrix[dad->id * nseq + source->id];
2078 weight = 1.0;
2079 } else {
2080 // this will take into account variances, also work for OLS since var = 1
2081 weight = 1.0/var_matrix[dad->id * nseq + source->id];
2082 dist = dist_matrix[dad->id * nseq + source->id] * weight;
2083 }
2084 subTreeDists.insert(StringDoubleMap::value_type(key, dist));
2085 subTreeWeights.insert(StringDoubleMap::value_type(key, weight));
2086 } else if (!source->isLeaf() && dad->isLeaf()) {
2087 ASSERT(source_nei1);
2088 ASSERT(source_nei2);
2089 string key1 = getBranchID(source_nei1, dad);
2090 ASSERT(subTreeDists.find(key1) == subTreeDists.end());
2091 double dist1 = subTreeDists.find(key1)->second;
2092 double weight1 = subTreeWeights.find(key1)->second;
2093 string key2 = getBranchID(source_nei2, dad);
2094 ASSERT(subTreeDists.find(key2) == subTreeDists.end());
2095 double dist2 = subTreeDists.find(key2)->second;
2096 double weight2 = subTreeWeights.find(key2)->second;
2097 dist = dist1 + dist2;
2098 weight = weight1 + weight2;
2099 subTreeDists.insert(StringDoubleMap::value_type(key, dist));
2100 subTreeWeights.insert(StringDoubleMap::value_type(key, weight));
2101 } else {
2102 PhyloNode* dad_nei1 = NULL;
2103 PhyloNode* dad_nei2 = NULL;
2104 for (NeighborVec::iterator it = dad->neighbors.begin(); it != dad->neighbors.end(); ++it) {
2105 if ((*it)->node != node) {
2106 if (!dad_nei1) {
2107 dad_nei1 = (PhyloNode*) (*it)->node;
2108 } else {
2109 dad_nei2 = (PhyloNode*) (*it)->node;
2110 }
2111 }
2112 }
2113 ASSERT(dad_nei1);
2114 ASSERT(dad_nei2);
2115 computeAllSubtreeDistForOneNode(source, source_nei1, source_nei2, dad, dad_nei1);
2116 computeAllSubtreeDistForOneNode(source, source_nei1, source_nei2, dad, dad_nei2);
2117 string key1 = getBranchID(source, dad_nei1);
2118 string key2 = getBranchID(source, dad_nei2);
2119 ASSERT(subTreeDists.find(key1) != subTreeDists.end());
2120 ASSERT(subTreeDists.find(key2) != subTreeDists.end());
2121 double dist1 = subTreeDists.find(key1)->second;
2122 double weight1 = subTreeWeights.find(key1)->second;
2123 double dist2 = subTreeDists.find(key2)->second;
2124 double weight2 = subTreeWeights.find(key2)->second;
2125 dist = dist1 + dist2;
2126 weight = weight1 + weight2;
2127 subTreeDists.insert(StringDoubleMap::value_type(key, dist));
2128 subTreeWeights.insert(StringDoubleMap::value_type(key, weight));
2129 }
2130 }
2131
computeNodeBranchDists(Node * node,Node * dad)2132 set<int> PhyloTree::computeNodeBranchDists(Node *node, Node *dad) {
2133 set<int>::iterator i, j;
2134 if (!nodeBranchDists) {
2135 cout << "nodeNum = " << nodeNum << endl;
2136 nodeBranchDists = new int[nodeNum*nodeNum];
2137 }
2138 if (!node) {
2139 memset(nodeBranchDists, 0, sizeof(int)*nodeNum*nodeNum);
2140 ASSERT(root->isLeaf());
2141 dad = root;
2142 node = dad->neighbors[0]->node;
2143 set<int> res = computeNodeBranchDists(node, dad);
2144 for (i = res.begin(); i != res.end(); i++)
2145 nodeBranchDists[(*i)*nodeNum + dad->id] = nodeBranchDists[(dad->id)*nodeNum + (*i)] =
2146 nodeBranchDists[(*i)*nodeNum + node->id] + 1;
2147 // sanity check that all distances are filled
2148 for (int x = 0; x < nodeNum; x++)
2149 for (int y = 0; y < nodeNum; y++)
2150 if (x != y)
2151 ASSERT(nodeBranchDists[x*nodeNum+y] != 0);
2152 else
2153 ASSERT(nodeBranchDists[x*nodeNum+y] == 0);
2154 return res;
2155 }
2156 if (node->isLeaf()) {
2157 set<int> res;
2158 res.insert(node->id);
2159 return res;
2160 }
2161 ASSERT(node->degree() == 3);
2162 Node *left = NULL, *right = NULL;
2163 FOR_NEIGHBOR_IT(node, dad, it) {
2164 if (!left) left = (*it)->node; else right = (*it)->node;
2165 }
2166 set<int> resl = computeNodeBranchDists(left, node);
2167 set<int> resr = computeNodeBranchDists(right, node);
2168 for (i = resl.begin(); i != resl.end(); i++)
2169 nodeBranchDists[(*i)*nodeNum + node->id] = nodeBranchDists[(node->id)*nodeNum + (*i)] =
2170 nodeBranchDists[(*i)*nodeNum + left->id] + 1;
2171 for (i = resr.begin(); i != resr.end(); i++)
2172 nodeBranchDists[(*i)*nodeNum + node->id] = nodeBranchDists[(node->id)*nodeNum + (*i)] =
2173 nodeBranchDists[(*i)*nodeNum + right->id] + 1;
2174 for (i = resl.begin(); i != resl.end(); i++)
2175 for (j = resr.begin(); j != resr.end(); j++)
2176 nodeBranchDists[(*i)*nodeNum + (*j)] = nodeBranchDists[(*j)*nodeNum+(*i)] =
2177 nodeBranchDists[(*i)*nodeNum+node->id]+nodeBranchDists[(*j)*nodeNum+node->id];
2178 resl.insert(resr.begin(), resr.end());
2179 resl.insert(node->id);
2180 return resl;
2181 }
2182
2183
2184 /*
2185 b0: initial guess for the maximum
2186 */
approxOneBranch(PhyloNode * node,PhyloNode * dad,double b0)2187 double PhyloTree::approxOneBranch(PhyloNode *node, PhyloNode *dad, double b0) {
2188 double b_max, ddl, b1, b2, std, seqlen;
2189 double t1, t3, t5, t11, t18, t21, t26, t29, t30, t32, t44, t46, t48;
2190 double beps = 1/DBL_MAX;
2191
2192 /* TODO: insert call to get sequence length */
2193 seqlen = getAlnNSite();
2194
2195 /* use a robust first order approximation to the variance */
2196 std = sqrt(b0/seqlen);
2197
2198 /* determine neighbour points */
2199 b1 = b0 - std;
2200 if (b1<=0) b1 = beps; /* only happens for b<=1 with small seq. len. */
2201 b2 = b0 + std;
2202
2203 /* TODO: insert calls to log-likelihood function */
2204 PhyloNeighbor *dad_nei = (PhyloNeighbor*)(dad->findNeighbor(node));
2205 PhyloNeighbor *node_nei = (PhyloNeighbor*)(node->findNeighbor(dad));
2206 double old_len = dad_nei->length;
2207 dad_nei->length = node_nei->length = b0;
2208 double l0 = computeLikelihoodBranch(dad_nei, dad);
2209 dad_nei->length = node_nei->length = b1;
2210 double l1 = computeLikelihoodBranch(dad_nei, dad);
2211 dad_nei->length = node_nei->length = b2;
2212 double l2 = computeLikelihoodBranch(dad_nei, dad);
2213 dad_nei->length = node_nei->length = old_len;
2214
2215 t1 = sqrt(b0);
2216 t3 = sqrt(b2);
2217 t5 = sqrt(b1);
2218 t11 = pow(-t1*l2+t3*l0+t5*l2+t1*l1-t5*l0-t3*l1,2.0);
2219 t18 = -b0*l2+b2*l0+b1*l2+b0*l1-b1*l0-b2*l1;
2220 t21 = t1-t5;
2221 t26 = -t1*t3+t1*t5+b2-t5*t3;
2222 t29 = t18*t18;
2223 t30 = 1/t11;
2224 t32 = sqrt(t29*t30);
2225 ddl = -2.0*t11/t18/t21/t26/t32;
2226
2227 if (ddl > 0) {
2228 /* the analytic extremum is a minimum,
2229 so the maximum is at the lower bound */
2230 b_max = 0;
2231 } else {
2232 t44 = pow(-t1*b2+t5*b2-t5*b0+t3*b0-t3*b1+t1*b1,2.0);
2233 t46 = t21*t21;
2234 t48 = t26*t26;
2235 b_max = t29*t44/t46/t48*t30/4.0;
2236 }
2237
2238 return(b_max);
2239 }
2240
approxAllBranches(PhyloNode * node,PhyloNode * dad)2241 void PhyloTree::approxAllBranches(PhyloNode *node, PhyloNode *dad) {
2242 if (!node) {
2243 node = (PhyloNode*) root;
2244 }
2245
2246 if (dad) {
2247 PhyloNeighbor *node_dad_nei = (PhyloNeighbor*) node->findNeighbor(dad);
2248 PhyloNeighbor *dad_node_nei = (PhyloNeighbor*) dad->findNeighbor(node);
2249 double len = approxOneBranch(node, dad, dad_node_nei->length);
2250 node_dad_nei->length = len;
2251 dad_node_nei->length = len;
2252 }
2253
2254 for (NeighborVec::iterator it = (node)->neighbors.begin(); it != (node)->neighbors.end(); it++)
2255 if ((*it)->node != (dad)) {
2256 approxAllBranches((PhyloNode*) (*it)->node, node);
2257 }
2258 }
2259
2260 /*
2261 void PhyloTree::computeAllSubtreeDists(PhyloNode* node, PhyloNode* dad) {
2262 if (!node) {
2263 node = (PhyloNode*) root;
2264 }
2265
2266 if (dad) {
2267 // This function compute all pairwise subtree distance between subtree rooted at dad and others
2268
2269 computeSubtreeDists(node, dad);
2270 }
2271
2272 FOR_NEIGHBOR_IT(node, dad, it) {
2273
2274 computeAllSubtreeDists((PhyloNode*) (*it)->node, node);
2275 }
2276 }
2277
2278 void PhyloTree::computeSubtreeDists(PhyloNode* node, PhyloNode* dad) {
2279 // if both nodes are leaf then it is trivial, just retrieve the values from the distance matrix
2280 if (dad->isLeaf() && node->isLeaf()) {
2281 string key = nodePair2String(dad, node);
2282 assert(dist_matrix);
2283 int nseq = aln->getNSeq();
2284 double dist = dist_matrix[dad->id * nseq + node->id];
2285 interSubtreeDistances.insert(StringDoubleMap::value_type(key, dist));
2286 } else if (!dad->isLeaf() && node->isLeaf()) {
2287
2288 FOR_NEIGHBOR_IT(node, dad, it) {
2289
2290 computeSubtreeDists(dad, (PhyloNode*) (*it)->node);
2291 }
2292
2293 }
2294 }
2295 */
2296
computeBayesianBranchLength(PhyloNeighbor * dad_branch,PhyloNode * dad)2297 double PhyloTree::computeBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad) {
2298 double obsLen = 0.0;
2299 PhyloNode *node = (PhyloNode*) dad_branch->node;
2300 PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
2301 ASSERT(node_branch);
2302 /*
2303 if (node->isLeaf() || dad->isLeaf()) {
2304 return -1.0;
2305 }*/
2306 // TODO
2307 // if ((dad_branch->partial_lh_computed & 1) == 0)
2308 // computePartialLikelihood(dad_branch, dad);
2309 // if ((node_branch->partial_lh_computed & 1) == 0)
2310 // computePartialLikelihood(node_branch, node);
2311 // now combine likelihood at the branch
2312 int nstates = aln->num_states;
2313 int numCat = site_rate->getNRate();
2314 size_t block = numCat * nstates;
2315 size_t nptn = aln->size();
2316 size_t ptn;
2317 int cat, state;
2318 double *tmp_state_freq = new double[nstates];
2319 double *tmp_anscentral_state_prob1 = new double[nstates];
2320 double *tmp_anscentral_state_prob2 = new double[nstates];
2321
2322 //computeLikelihoodBranchNaive(dad_branch, dad, NULL, tmp_ptn_rates);
2323 //double sum_rates = 0.0;
2324 //for (ptn = 0; ptn < nptn; ptn++)
2325 // sum_rates += tmp_ptn_rates[ptn] * aln->at(ptn).frequency;
2326 //cout << "sum_rates = " << sum_rates << endl;
2327
2328 model->getStateFrequency(tmp_state_freq);
2329
2330 for (ptn = 0; ptn < nptn; ptn++) {
2331 // Compute the probability of each state for the current site
2332 double sum_prob1 = 0.0, sum_prob2 = 0.0;
2333 size_t offset = ptn * block;
2334 double *partial_lh_site = node_branch->partial_lh + (offset);
2335 double *partial_lh_child = dad_branch->partial_lh + (offset);
2336 for (state = 0; state < nstates; state++) {
2337 tmp_anscentral_state_prob1[state] = 0.0;
2338 tmp_anscentral_state_prob2[state] = 0.0;
2339 for (cat = 0; cat < numCat; cat++) {
2340 tmp_anscentral_state_prob1[state] += partial_lh_site[nstates * cat + state];
2341 tmp_anscentral_state_prob2[state] += partial_lh_child[nstates * cat + state];
2342 }
2343 tmp_anscentral_state_prob1[state] *= tmp_state_freq[state];
2344 tmp_anscentral_state_prob2[state] *= tmp_state_freq[state];
2345 sum_prob1 += tmp_anscentral_state_prob1[state];
2346 sum_prob2 += tmp_anscentral_state_prob2[state];
2347 }
2348 bool sameState = false;
2349 int state1 = 0, state2 = 0;
2350 double cutoff = 1.0/nstates;
2351 for (state = 0; state < nstates; state++) {
2352 tmp_anscentral_state_prob1[state] /= sum_prob1;
2353 tmp_anscentral_state_prob2[state] /= sum_prob2;
2354 if (tmp_anscentral_state_prob1[state] > tmp_anscentral_state_prob1[state1])
2355 state1 = state;
2356 if (tmp_anscentral_state_prob2[state] > tmp_anscentral_state_prob2[state2])
2357 state2 = state;
2358 if (tmp_anscentral_state_prob1[state] > cutoff && tmp_anscentral_state_prob2[state] > cutoff)
2359 sameState = true;
2360 }
2361 sameState = sameState || (state1 == state2);
2362 if (!sameState) {
2363 obsLen += aln->at(ptn).frequency;
2364 }
2365
2366 }
2367 obsLen /= getAlnNSite();
2368 if (obsLen < params->min_branch_length)
2369 obsLen = params->min_branch_length;
2370 delete[] tmp_anscentral_state_prob2;
2371 delete[] tmp_anscentral_state_prob1;
2372 delete[] tmp_state_freq;
2373
2374 return obsLen;
2375 }
2376
correctBranchLengthF81(double observedBran,double alpha)2377 double PhyloTree::correctBranchLengthF81(double observedBran, double alpha) {
2378 if (!model)
2379 return JukesCantorCorrection(observedBran, alpha);
2380 double H = 0.0;
2381 double correctedBranLen;
2382 for (int i = 0; i < model->num_states; i++) {
2383 H += model->state_freq[i] * (1 - model->state_freq[i]);
2384 }
2385 observedBran = 1.0 - observedBran / H;
2386 // no gamma
2387 if (observedBran <= 0.0)
2388 return params->max_branch_length;
2389
2390 if (alpha <= 0.0) {
2391 correctedBranLen = -H * log(observedBran);
2392 } else {
2393 //if (verbose_mode >= VB_MAX) cout << "alpha: " << alpha << endl;
2394
2395 correctedBranLen = H * alpha * (pow(observedBran, -1 / alpha) - 1);
2396 }
2397
2398 // Branch lengths under PoMo are #events, which is ~N^2 * #substitutions
2399 if (aln->seq_type == SEQ_POMO)
2400 correctedBranLen *= aln->virtual_pop_size * aln->virtual_pop_size;
2401
2402 if (correctedBranLen < params->min_branch_length)
2403 correctedBranLen = params->min_branch_length;
2404 if (correctedBranLen > params->max_branch_length)
2405 correctedBranLen = params->max_branch_length;
2406
2407 return correctedBranLen;
2408 }
2409
computeCorrectedBayesianBranchLength(PhyloNeighbor * dad_branch,PhyloNode * dad)2410 double PhyloTree::computeCorrectedBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad) {
2411 double observedBran = computeBayesianBranchLength(dad_branch, dad);
2412 return correctBranchLengthF81(observedBran, site_rate->getGammaShape());
2413 }
2414
computeAllBayesianBranchLengths(Node * node,Node * dad)2415 void PhyloTree::computeAllBayesianBranchLengths(Node *node, Node *dad) {
2416
2417 if (!node)
2418 node = root;
2419
2420 FOR_NEIGHBOR_IT(node, dad, it){
2421 double branch_length = computeBayesianBranchLength((PhyloNeighbor*) (*it), (PhyloNode*) node);
2422 (*it)->length = branch_length;
2423 // set the backward branch length
2424 (*it)->node->findNeighbor(node)->length = (*it)->length;
2425 computeAllBayesianBranchLengths((*it)->node, node);
2426 }
2427 }
2428
2429
computeLikelihoodZeroBranch(PhyloNeighbor * dad_branch,PhyloNode * dad)2430 double PhyloTree::computeLikelihoodZeroBranch(PhyloNeighbor *dad_branch, PhyloNode *dad) {
2431 double lh_zero_branch;
2432 double saved_len = dad_branch->length;
2433 PhyloNeighbor *node_branch = (PhyloNeighbor*) dad_branch->node->findNeighbor(dad);
2434 dad_branch->length = 0.0;
2435 node_branch->length = 0.0;
2436 lh_zero_branch = computeLikelihoodBranch(dad_branch, dad);
2437 // restore branch length
2438 dad_branch->length = saved_len;
2439 node_branch->length = saved_len;
2440
2441 return lh_zero_branch;
2442 }
2443
2444
2445 /****************************************************************************
2446 Branch length optimization by maximum likelihood
2447 ****************************************************************************/
2448
2449 //const double MIN_TREE_LENGTH_SCALE = 0.001;
2450 //const double MAX_TREE_LENGTH_SCALE = 100.0;
2451 const double TOL_TREE_LENGTH_SCALE = 0.001;
2452
2453
optimizeTreeLengthScaling(double min_scaling,double & scaling,double max_scaling,double gradient_epsilon)2454 double PhyloTree::optimizeTreeLengthScaling(double min_scaling, double &scaling, double max_scaling, double gradient_epsilon) {
2455 is_opt_scaling = true;
2456 current_scaling = scaling;
2457 double negative_lh, ferror;
2458 // 2018-08-20: make sure that max and min branch lengths do not go over bounds
2459 vector<DoubleVector> brlens;
2460 brlens.resize(branchNum);
2461 getBranchLengths(brlens);
2462 double min_brlen = params->max_branch_length, max_brlen = params->min_branch_length;
2463 for (auto brlenvec = brlens.begin(); brlenvec != brlens.end(); brlenvec++) {
2464 for (auto brlen = brlenvec->begin(); brlen != brlenvec->end(); brlen++) {
2465 max_brlen = max(max_brlen, *brlen);
2466 min_brlen = min(min_brlen, *brlen);
2467 }
2468 }
2469 if (min_brlen <= 0.0) min_brlen = params->min_branch_length;
2470 if (max_scaling > 10.0*params->max_branch_length / max_brlen)
2471 max_scaling = 10.0*params->max_branch_length / max_brlen;
2472 if (min_scaling < 0.1*params->min_branch_length / min_brlen)
2473 min_scaling = 0.1*params->min_branch_length / min_brlen;
2474
2475 scaling = minimizeOneDimen(min(scaling, min_scaling), scaling, max(max_scaling, scaling), max(TOL_TREE_LENGTH_SCALE, gradient_epsilon), &negative_lh, &ferror);
2476 if (scaling != current_scaling) {
2477 scaleLength(scaling / current_scaling);
2478 current_scaling = scaling;
2479 clearAllPartialLH();
2480 }
2481 is_opt_scaling = false;
2482 return computeLikelihood();
2483 }
2484
printTreeLengthScaling(const char * filename)2485 void PhyloTree::printTreeLengthScaling(const char *filename) {
2486 // double treescale = 1.0;
2487 //
2488 // cout << "Optimizing tree length scaling ..." << endl;
2489 //
2490 // double lh = optimizeTreeLengthScaling(MIN_BRLEN_SCALE, treescale, MAX_BRLEN_SCALE, 0.001);
2491 //
2492 // cout << "treescale: " << treescale << " / LogL: " << lh << endl;
2493
2494 Checkpoint *saved_checkpoint = getModelFactory()->getCheckpoint();
2495 Checkpoint *new_checkpoint = new Checkpoint;
2496 new_checkpoint->setFileName(filename);
2497 new_checkpoint->setCompression(false);
2498 new_checkpoint->setHeader("IQ-TREE scaled tree length and model parameters");
2499 new_checkpoint->put("treelength", treeLength());
2500 saved_checkpoint->put("treelength", treeLength()); // also put treelength into current checkpoint
2501
2502 getModelFactory()->setCheckpoint(new_checkpoint);
2503 getModelFactory()->saveCheckpoint();
2504 new_checkpoint->dump();
2505
2506 getModelFactory()->setCheckpoint(saved_checkpoint);
2507 }
2508
computeFunction(double value)2509 double PhyloTree::computeFunction(double value) {
2510 if (!is_opt_scaling) {
2511 current_it->length = value;
2512 current_it_back->length = value;
2513 return -computeLikelihoodBranch(current_it, (PhyloNode*) current_it_back->node);
2514 } else {
2515 if (value != current_scaling) {
2516 scaleLength(value / current_scaling);
2517 current_scaling = value;
2518 clearAllPartialLH();
2519 }
2520 return -computeLikelihood();
2521 }
2522 }
2523
computeFuncDerv(double value,double & df,double & ddf)2524 void PhyloTree::computeFuncDerv(double value, double &df, double &ddf) {
2525 current_it->length = value;
2526 current_it_back->length = value;
2527 // double lh;
2528 // lh = -computeLikelihoodDerv(current_it, (PhyloNode*) current_it_back->node, df, ddf);
2529 computeLikelihoodDerv(current_it, (PhyloNode*) current_it_back->node, &df, &ddf);
2530
2531 df = -df;
2532 ddf = -ddf;
2533
2534 // return lh;
2535 }
2536
optimizePatternRates(DoubleVector & pattern_rates)2537 void PhyloTree::optimizePatternRates(DoubleVector &pattern_rates) {
2538 size_t nptn = aln->getNPattern();
2539 pattern_rates.resize(nptn, 1.0);
2540 #pragma omp parallel for
2541 for (size_t ptn = 0; ptn < nptn; ptn++) {
2542 Alignment *paln = new Alignment;
2543 IntVector ptn_id;
2544 ptn_id.push_back(ptn);
2545 paln->extractPatterns(aln, ptn_id);
2546 PhyloTree *tree = new PhyloTree;
2547 tree->copyPhyloTree(this);
2548 tree->setParams(params);
2549 tree->setAlignment(paln);
2550 tree->sse = sse;
2551 tree->num_threads = 1;
2552 // initialize model
2553 tree->setModelFactory(getModelFactory());
2554
2555 // main optimization
2556 tree->optimizeTreeLengthScaling(MIN_SITE_RATE, pattern_rates[ptn], MAX_SITE_RATE, 0.0001);
2557
2558 tree->setModelFactory(NULL);
2559
2560 delete tree;
2561 delete paln;
2562 }
2563 }
2564
getNBranchParameters(int brlen_type)2565 int PhyloTree::getNBranchParameters(int brlen_type) {
2566 if (params->fixed_branch_length || brlen_type == BRLEN_FIX)
2567 return 0;
2568
2569 int df = 0;
2570
2571 if (brlen_type == BRLEN_OPTIMIZE) {
2572 df = branchNum - (int)rooted;
2573 // If model is Lie-Markov, and is in fact time reversible, one of the
2574 // degrees of freedom is illusary. (Of the two edges coming from the
2575 // root, only sum of their lenghts affects likelihood.)
2576 // So correct for this. Without this correction, K2P and RY2.2b
2577 // would not be synonymous, for example.
2578
2579 // string className(typeid(*model).name());
2580 // if (className.find("ModelLieMarkov")!=string::npos && model->isReversible())
2581 // df--;
2582
2583 // BQM 2017-04-28, alternatively, check if there is a virtual_root and model is reversible
2584 if (rooted && model && model->isReversible())
2585 df--;
2586
2587 } else if (brlen_type == BRLEN_SCALE)
2588 df = 1;
2589 return df;
2590 }
2591
optimizeOneBranch(PhyloNode * node1,PhyloNode * node2,bool clearLH,int maxNRStep)2592 void PhyloTree::optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, bool clearLH, int maxNRStep) {
2593
2594 if (rooted && (node1 == root || node2 == root))
2595 return; // does not optimize virtual branch from root
2596
2597 double negative_lh;
2598 current_it = (PhyloNeighbor*) node1->findNeighbor(node2);
2599 ASSERT(current_it);
2600 current_it_back = (PhyloNeighbor*) node2->findNeighbor(node1);
2601 ASSERT(current_it_back);
2602
2603 double current_len = current_it->length;
2604 double ferror, optx;
2605 ASSERT(current_len >= 0.0);
2606 theta_computed = false;
2607 // mem_slots.cleanup();
2608 if (optimize_by_newton) {
2609 // Newton-Raphson method
2610 optx = minimizeNewton(params->min_branch_length, current_len, params->max_branch_length, params->min_branch_length, negative_lh, maxNRStep);
2611 if (verbose_mode >= VB_DEBUG) {
2612 cout << "minimizeNewton logl: " << computeLikelihoodFromBuffer() << endl;
2613 }
2614 if (optx > params->max_branch_length*0.95 && !isSuperTree()) {
2615 // newton raphson diverged, reset
2616 double opt_lh = computeLikelihoodFromBuffer();
2617 current_it->length = current_len;
2618 current_it_back->length = current_len;
2619 double orig_lh = computeLikelihoodFromBuffer();
2620 if (orig_lh > opt_lh) {
2621 optx = current_len;
2622 }
2623 }
2624 } else {
2625 // Brent method
2626 optx = minimizeOneDimen(params->min_branch_length, current_len, params->max_branch_length, params->min_branch_length, &negative_lh, &ferror);
2627 if (verbose_mode >= VB_MAX) {
2628 cout << "minimizeBrent logl: " << -negative_lh << endl;
2629 }
2630 }
2631
2632 current_it->length = optx;
2633 current_it_back->length = optx;
2634 //curScore = -negative_lh;
2635
2636 if (clearLH && current_len != optx) {
2637 node1->clearReversePartialLh(node2);
2638 node2->clearReversePartialLh(node1);
2639 }
2640
2641 // return -negative_lh;
2642 }
2643
optimizeChildBranches(PhyloNode * node,PhyloNode * dad)2644 double PhyloTree::optimizeChildBranches(PhyloNode *node, PhyloNode *dad) {
2645
2646 // double tree_lh = 0.0;
2647
2648 FOR_NEIGHBOR_DECLARE(node, dad, it){
2649
2650 // tree_lh = optimizeOneBranch((PhyloNode*) node, (PhyloNode*) (*it)->node);
2651 optimizeOneBranch((PhyloNode*) node, (PhyloNode*) (*it)->node);
2652 }
2653 return computeLikelihoodFromBuffer();
2654 // return tree_lh;
2655 }
2656
optimizeAllBranchesLS(PhyloNode * node,PhyloNode * dad)2657 void PhyloTree::optimizeAllBranchesLS(PhyloNode *node, PhyloNode *dad) {
2658 if (!node) {
2659 node = (PhyloNode*) root;
2660 }
2661
2662 if (dad) {
2663 double lsBran = optimizeOneBranchLS(node, dad);
2664 PhyloNeighbor *node_dad_nei = (PhyloNeighbor*) node->findNeighbor(dad);
2665 PhyloNeighbor *dad_node_nei = (PhyloNeighbor*) dad->findNeighbor(node);
2666 node_dad_nei->length = lsBran;
2667 dad_node_nei->length = lsBran;
2668 }
2669
2670 for (NeighborVec::iterator it = (node)->neighbors.begin(); it != (node)->neighbors.end(); it++)
2671 if ((*it)->node != (dad)) {
2672 optimizeAllBranchesLS((PhyloNode*) (*it)->node, node);
2673 }
2674 }
2675
optimizeAllBranches(PhyloNode * node,PhyloNode * dad,int maxNRStep)2676 void PhyloTree::optimizeAllBranches(PhyloNode *node, PhyloNode *dad, int maxNRStep) {
2677 // double tree_lh = -DBL_MAX;
2678
2679 if (!node) node = (PhyloNode*)root;
2680
2681 for (NeighborVec::iterator it = (node)->neighbors.begin(); it != (node)->neighbors.end(); it++)
2682 if ((*it)->node != (dad)) {
2683 optimizeAllBranches((PhyloNode*) (*it)->node, node, maxNRStep);
2684 }
2685 if (dad)
2686 optimizeOneBranch(node, dad, true, maxNRStep); // BQM 2014-02-24: true was missing
2687
2688 // return tree_lh;
2689 }
2690
computeBestTraversal(NodeVector & nodes,NodeVector & nodes2)2691 void PhyloTree::computeBestTraversal(NodeVector &nodes, NodeVector &nodes2) {
2692 Node *farleaf = findFarthestLeaf();
2693 // Node *farleaf = root;
2694
2695 // double call to farthest leaf to find the longest path on the tree
2696 findFarthestLeaf(farleaf);
2697 if (verbose_mode >= VB_MAX)
2698 cout << "Tree diameter: " << farleaf->height << endl;
2699 getPreOrderBranches(nodes, nodes2, farleaf);
2700 }
2701
optimizeAllBranches(int my_iterations,double tolerance,int maxNRStep)2702 double PhyloTree::optimizeAllBranches(int my_iterations, double tolerance, int maxNRStep) {
2703 if (verbose_mode >= VB_MAX)
2704 cout << "Optimizing branch lengths (max " << my_iterations << " loops)..." << endl;
2705
2706 NodeVector nodes, nodes2;
2707 computeBestTraversal(nodes, nodes2);
2708
2709 double tree_lh = computeLikelihoodBranch((PhyloNeighbor*)nodes[0]->findNeighbor(nodes2[0]), (PhyloNode*)nodes[0]);
2710
2711 if (verbose_mode >= VB_MAX) {
2712 cout << "Initial tree log-likelihood: " << tree_lh << endl;
2713 }
2714 DoubleVector lenvec;
2715 //cout << tree_lh << endl;
2716 for (int i = 0; i < my_iterations; i++) {
2717 // string string_brlen = getTreeString();
2718 saveBranchLengths(lenvec);
2719 // if (verbose_mode >= VB_DEBUG) {
2720 // printTree(cout, WT_BR_LEN+WT_NEWLINE);
2721 // }
2722
2723 for (int j = 0; j < nodes.size(); j++) {
2724 optimizeOneBranch((PhyloNode*)nodes[j], (PhyloNode*)nodes2[j]);
2725 if (verbose_mode >= VB_MAX) {
2726 cout << "Branch " << nodes[j]->id << " " << nodes2[j]->id << ": " << computeLikelihoodFromBuffer() << endl;
2727 }
2728 }
2729
2730 // if (i == 0)
2731 // optimizeOneBranch((PhyloNode*)nodes[0], (PhyloNode*)nodes2[0]);
2732 // if (i % 2 == 0) {
2733 // for (int j = 1; j < nodes.size(); j++)
2734 // optimizeOneBranch((PhyloNode*)nodes[j], (PhyloNode*)nodes2[j]);
2735 // } else {
2736 // for (int j = nodes.size()-2; j >= 0; j--)
2737 // optimizeOneBranch((PhyloNode*)nodes[j], (PhyloNode*)nodes2[j]);
2738 // }
2739
2740 // optimizeAllBranches((PhyloNode*) root, NULL, maxNRStep);
2741
2742 double new_tree_lh = computeLikelihoodFromBuffer();
2743 //cout<<"After opt log-lh = "<<new_tree_lh<<endl;
2744
2745 if (verbose_mode >= VB_MAX) {
2746 cout << "Likelihood after iteration " << i + 1 << " : ";
2747 cout << new_tree_lh << endl;
2748 }
2749
2750 // if (verbose_mode >= VB_DEBUG) {
2751 // printTree(cout, WT_BR_LEN+WT_NEWLINE);
2752 // }
2753
2754 // if (new_tree_lh < tree_lh - 10.0) { // make sure that the new tree likelihood never decreases too much
2755 // cout << "ERROR: Branch length optimization failed as log-likelihood decreases too much: " << tree_lh << " --> " << new_tree_lh << endl;
2756 // getModel()->writeInfo(cout);
2757 // getRate()->writeInfo(cout);
2758 // assert(new_tree_lh >= tree_lh - 10.0);
2759 // }
2760
2761
2762 if (new_tree_lh < tree_lh - tolerance*0.1) {
2763 // IN RARE CASE: tree log-likelihood decreases, revert the branch length and stop
2764 if (verbose_mode >= VB_MED)
2765 cout << "NOTE: Restoring branch lengths as tree log-likelihood decreases after branch length optimization: "
2766 << tree_lh << " -> " << new_tree_lh << endl;
2767
2768 clearAllPartialLH();
2769 restoreBranchLengths(lenvec);
2770
2771 //clearAllPartialLH();
2772 // readTreeString(string_brlen);
2773 double max_delta_lh = 1.0;
2774 // Increase max delta with PoMo because log likelihood is very much lower.
2775 if (aln->seq_type == SEQ_POMO) max_delta_lh = 3.0;
2776 new_tree_lh = computeLikelihood();
2777 if (fabs(new_tree_lh-tree_lh) > max_delta_lh) {
2778 printTree(cout);
2779 cout << endl;
2780 cout << "new_tree_lh: " << new_tree_lh << " tree_lh: " << tree_lh << endl;
2781 }
2782 ASSERT(fabs(new_tree_lh-tree_lh) < max_delta_lh);
2783 return new_tree_lh;
2784 }
2785
2786 // only return if the new_tree_lh >= tree_lh! (in rare case that likelihood decreases, continue the loop)
2787 if (tree_lh <= new_tree_lh && new_tree_lh <= tree_lh + tolerance) {
2788 curScore = new_tree_lh;
2789 return new_tree_lh;
2790 }
2791 tree_lh = new_tree_lh;
2792 }
2793 curScore = tree_lh;
2794 return tree_lh;
2795 }
2796
moveRoot(Node * node1,Node * node2)2797 void PhyloTree::moveRoot(Node *node1, Node *node2) {
2798 // unplug root from tree
2799 Node *root_dad = root->neighbors[0]->node;
2800 Node *root_nei1 = NULL, *root_nei2 = NULL;
2801 double len = 0.0;
2802 FOR_NEIGHBOR_IT(root_dad, root, it) {
2803 if (!root_nei1)
2804 root_nei1 = (*it)->node;
2805 else if (!root_nei2)
2806 root_nei2 = (*it)->node;
2807 else
2808 outError("Cannot move multifurcating root branch");
2809 len += (*it)->length;
2810 }
2811 root_nei1->updateNeighbor(root_dad, root_nei2, len);
2812 root_nei2->updateNeighbor(root_dad, root_nei1, len);
2813
2814 // plug root to new branch
2815 len = node1->findNeighbor(node2)->length / 2.0;
2816 root_dad->updateNeighbor(root_nei1, node1, len);
2817 node1->updateNeighbor(node2, root_dad, len);
2818 root_dad->updateNeighbor(root_nei2, node2, len);
2819 node2->updateNeighbor(node1, root_dad, len);
2820
2821 if (isSuperTree()) {
2822 ((PhyloSuperTree*) this)->mapTrees();
2823 }
2824 if (Params::getInstance().pll) {
2825 pllReadNewick(getTreeString());
2826 }
2827 resetCurScore();
2828 if (Params::getInstance().fixStableSplits || Params::getInstance().adaptPertubation) {
2829 buildNodeSplit();
2830 }
2831 current_it = current_it_back = NULL;
2832 clearBranchDirection();
2833 computeBranchDirection();
2834 }
2835
optimizeRootPosition(int root_dist,bool write_info,double logl_epsilon)2836 double PhyloTree::optimizeRootPosition(int root_dist, bool write_info, double logl_epsilon) {
2837 if (!rooted)
2838 return curScore;
2839
2840 NodeVector nodes1, nodes2;
2841 getBranches(root_dist+1, nodes1, nodes2);
2842 int i;
2843 Node *root_dad = root->neighbors[0]->node;
2844
2845 double best_score = curScore;
2846 string best_tree = getTreeString();
2847
2848 StrVector trees;
2849
2850 // ignore branches directly descended from root branch
2851 for (i = 0; i != nodes1.size(); )
2852 if (nodes1[i] == root_dad || nodes2[i] == root_dad) {
2853 nodes1[i] = nodes1[nodes1.size()-1];
2854 nodes2[i] = nodes2[nodes2.size()-1];
2855 nodes1.pop_back();
2856 nodes2.pop_back();
2857 } else {
2858 i++;
2859 }
2860
2861 // get all trees
2862 for (i = 0; i != nodes1.size(); i++) {
2863 moveRoot(nodes1[i], nodes2[i]);
2864 trees.push_back(getTreeString());
2865 }
2866
2867 // optimize branch lengths for all trees
2868 for (auto t = trees.begin(); t != trees.end(); t++) {
2869 readTreeString(*t);
2870 optimizeAllBranches(100, logl_epsilon);
2871 if (verbose_mode >= VB_MED) {
2872 cout << "Root pos " << (t - trees.begin())+1 << ": " << curScore << endl;
2873 if (verbose_mode >= VB_DEBUG)
2874 drawTree(cout);
2875 }
2876 if (curScore > best_score + logl_epsilon) {
2877 if (verbose_mode >= VB_MED || write_info)
2878 cout << "Better root: " << curScore << endl;
2879 best_score = curScore;
2880 best_tree = getTreeString();
2881 }
2882 }
2883
2884 readTreeString(best_tree);
2885 curScore = computeLikelihood();
2886
2887 ASSERT(fabs(curScore-best_score) < logl_epsilon);
2888
2889 return curScore;
2890 }
2891
testRootPosition(bool write_info,double logl_epsilon)2892 double PhyloTree::testRootPosition(bool write_info, double logl_epsilon) {
2893 if (!rooted)
2894 return curScore;
2895
2896 BranchVector branches;
2897 getBranches(branches);
2898 int i;
2899 Node *root_nei = root->neighbors[0]->node;
2900 ASSERT(root_nei->degree() == 3);
2901 Branch root_br = {NULL, NULL};
2902 FOR_NEIGHBOR_IT(root_nei, root, it) {
2903 if (root_br.first == NULL)
2904 root_br.first = (*it)->node;
2905 else
2906 root_br.second = (*it)->node;
2907 }
2908
2909 double best_score = curScore, orig_score = curScore;
2910
2911 multimap<double, string> logl_trees;
2912
2913 // ignore branches directly descended from root branch
2914 for (i = 0; i != branches.size(); )
2915 if (branches[i].first == root_nei || branches[i].second == root_nei) {
2916 branches[i] = branches[branches.size()-1];
2917 branches.pop_back();
2918 } else {
2919 i++;
2920 }
2921 branches.push_back(root_br);
2922
2923 // get all trees
2924 // for (i = 0; i != nodes1.size(); i++) {
2925 // moveRoot(nodes1[i], nodes2[i]);
2926 // trees.push_back(getTreeString());
2927 // }
2928 //
2929 // optimize branch lengths for all trees
2930 for (i = 0; i != branches.size(); i++) {
2931 // for (auto t = trees.begin()+1; t != trees.end(); t++) {
2932 moveRoot(branches[i].first, branches[i].second);
2933 // readTreeString(*t);
2934 optimizeAllBranches(100, logl_epsilon);
2935 stringstream ss;
2936 printTree(ss);
2937 logl_trees.insert({curScore, ss.str()});
2938 if (verbose_mode >= VB_MED) {
2939 cout << "Root pos " << i+1 << ": " << curScore << endl;
2940 if (verbose_mode >= VB_DEBUG)
2941 drawTree(cout);
2942 }
2943 if (curScore > best_score + logl_epsilon) {
2944 if (verbose_mode >= VB_MED || write_info)
2945 cout << "Better root: " << curScore << endl;
2946 best_score = curScore;
2947 }
2948 }
2949
2950 // readTreeString(best_tree);
2951 // curScore = computeLikelihood();
2952
2953 ASSERT(curScore > orig_score - 0.1);
2954 if (curScore > orig_score)
2955 cout << "UPDATE BEST SCORE: " << curScore << endl;
2956
2957 ofstream out;
2958 string out_file = (string)params->out_prefix + ".rooted_trees";
2959 out.open(out_file);
2960 out.precision(10);
2961 for (auto lt = logl_trees.rbegin(); lt != logl_trees.rend(); lt++) {
2962 out << "[ lh=" << lt->first << " ]" << lt->second << endl;
2963 }
2964 out.close();
2965 cout << "Rooted trees with log-likelihoods printed to " << out_file << endl;
2966 if (params->treeset_file.empty())
2967 params->treeset_file = out_file;
2968
2969 // convert logL to weight based on the best score
2970 // ASSERT(logLs.size() == nodes1.size());
2971 // for (i = 0; i < logLs.size(); i++) {
2972 // double weight = exp(logLs[i] - best_score);
2973 // nodes1[i]->name = convertDoubleToString(weight);
2974 // }
2975
2976 return curScore;
2977 }
2978
2979 /****************************************************************************
2980 Stepwise addition (greedy) by maximum likelihood
2981 ****************************************************************************/
2982
addTaxonML(Node * added_node,Node * & target_node,Node * & target_dad,Node * node,Node * dad)2983 double PhyloTree::addTaxonML(Node *added_node, Node* &target_node, Node* &target_dad, Node *node, Node *dad) {
2984
2985 Neighbor *dad_nei = dad->findNeighbor(node);
2986
2987 // now insert the new node in the middle of the branch node-dad
2988 double len = dad_nei->length;
2989 node->updateNeighbor(dad, added_node, len / 2.0);
2990 dad->updateNeighbor(node, added_node, len / 2.0);
2991 added_node->updateNeighbor((Node*) 1, node, len / 2.0);
2992 added_node->updateNeighbor((Node*) 2, dad, len / 2.0);
2993 // compute the likelihood
2994 clearAllPartialLH();
2995 double best_score = optimizeChildBranches((PhyloNode*) added_node);
2996 target_node = node;
2997 target_dad = dad;
2998 // remove the added node
2999 node->updateNeighbor(added_node, dad, len);
3000 dad->updateNeighbor(added_node, node, len);
3001 added_node->updateNeighbor(node, (Node*) 1, len);
3002 added_node->updateNeighbor(dad, (Node*) 2, len);
3003
3004 // now tranverse the tree downwards
3005
3006 FOR_NEIGHBOR_IT(node, dad, it){
3007 Node *target_node2;
3008 Node *target_dad2;
3009 double score = addTaxonML(added_node, target_node2, target_dad2, (*it)->node, node);
3010 if (score > best_score) {
3011
3012 best_score = score;
3013 target_node = target_node2;
3014 target_dad = target_dad2;
3015 }
3016 }
3017 return best_score;
3018 }
3019
growTreeML(Alignment * alignment)3020 void PhyloTree::growTreeML(Alignment *alignment) {
3021
3022 cout << "Stepwise addition using ML..." << endl;
3023 aln = alignment;
3024 int size = aln->getNSeq();
3025 if (size < 3)
3026 outError(ERR_FEW_TAXA);
3027
3028 root = newNode();
3029 Node * new_taxon;
3030
3031 // create initial tree with 3 taxa
3032 for (leafNum = 0; leafNum < 3; leafNum++) {
3033 cout << "Add " << aln->getSeqName(leafNum) << " to the tree" << endl;
3034 new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
3035 root->addNeighbor(new_taxon, 1.0);
3036 new_taxon->addNeighbor(root, 1.0);
3037 }
3038 root = findNodeID(0);
3039 optimizeAllBranches();
3040
3041 // stepwise adding the next taxon
3042 for (leafNum = 3; leafNum < size; leafNum++) {
3043
3044 cout << "Add " << aln->getSeqName(leafNum) << " to the tree" << endl;
3045 // allocate a new taxon and a new ajedcent internal node
3046 new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
3047 Node *added_node = newNode();
3048 added_node->addNeighbor(new_taxon, 1.0);
3049 new_taxon->addNeighbor(added_node, 1.0);
3050
3051 // preserve two neighbors
3052 added_node->addNeighbor((Node*) 1, 1.0);
3053 added_node->addNeighbor((Node*) 2, 1.0);
3054
3055 Node *target_node = NULL;
3056 Node *target_dad = NULL;
3057 addTaxonML(added_node, target_node, target_dad, root->neighbors[0]->node, root);
3058 // now insert the new node in the middle of the branch node-dad
3059 double len = target_dad->findNeighbor(target_node)->length;
3060 target_node->updateNeighbor(target_dad, added_node, len / 2.0);
3061 target_dad->updateNeighbor(target_node, added_node, len / 2.0);
3062 added_node->updateNeighbor((Node*) 1, target_node, len / 2.0);
3063 added_node->updateNeighbor((Node*) 2, target_dad, len / 2.0);
3064 // compute the likelihood
3065 clearAllPartialLH();
3066 optimizeAllBranches();
3067 //optimizeNNI();
3068 }
3069
3070 nodeNum = 2 * leafNum - 2;
3071 }
3072
3073 /****************************************************************************
3074 Distance function
3075 ****************************************************************************/
3076
computeDist(int seq1,int seq2,double initial_dist,double & d2l)3077 double PhyloTree::computeDist(int seq1, int seq2, double initial_dist, double &d2l) {
3078 // if no model or site rate is specified, return JC distance
3079 if (initial_dist == 0.0) {
3080 if (params->compute_obs_dist)
3081 return (initial_dist = aln->computeObsDist(seq1, seq2));
3082 else
3083 initial_dist = aln->computeDist(seq1, seq2);
3084 }
3085 if (!model_factory || !site_rate)
3086 return initial_dist; // MANUEL: here no d2l is return
3087
3088 // now optimize the distance based on the model and site rate
3089 AlignmentPairwise *aln_pair = new AlignmentPairwise(this, seq1, seq2);
3090
3091 double dist = aln_pair->optimizeDist(initial_dist, d2l);
3092 delete aln_pair;
3093 return dist;
3094 }
3095
computeDist(int seq1,int seq2,double initial_dist)3096 double PhyloTree::computeDist(int seq1, int seq2, double initial_dist) {
3097 double var;
3098 return computeDist(seq1, seq2, initial_dist, var);
3099 }
3100
correctDist(double * dist_mat)3101 double PhyloTree::correctDist(double *dist_mat) {
3102 int i, j, k, pos;
3103 int n = aln->getNSeq();
3104 int nsqr = n * n;
3105 // use Floyd algorithm to find shortest path between all pairs of taxa
3106 for (k = 0; k < n; k++)
3107 for (i = 0, pos = 0; i < n; i++)
3108 for (j = 0; j < n; j++, pos++) {
3109 double tmp = dist_mat[i * n + k] + dist_mat[k * n + j];
3110 if (dist_mat[pos] > tmp)
3111 dist_mat[pos] = tmp;
3112 }
3113 double longest_dist = 0.0;
3114 for (i = 0; i < nsqr; i++)
3115 if (dist_mat[i] > longest_dist)
3116 longest_dist = dist_mat[i];
3117
3118 return longest_dist;
3119 }
3120
computeDist(double * dist_mat,double * var_mat)3121 double PhyloTree::computeDist(double *dist_mat, double *var_mat) {
3122 int nseqs = aln->getNSeq();
3123 int pos = 0;
3124 int num_pairs = nseqs * (nseqs - 1) / 2;
3125 double longest_dist = 0.0;
3126 int *row_id = new int[num_pairs];
3127 int *col_id = new int[num_pairs];
3128
3129 row_id[0] = 0;
3130 col_id[0] = 1;
3131 for (pos = 1; pos < num_pairs; pos++) {
3132 row_id[pos] = row_id[pos - 1];
3133 col_id[pos] = col_id[pos - 1] + 1;
3134 if (col_id[pos] >= nseqs) {
3135 row_id[pos]++;
3136 col_id[pos] = row_id[pos] + 1;
3137 }
3138 }
3139 // compute the upper-triangle of distance matrix
3140 #ifdef _OPENMP
3141 #pragma omp parallel for private(pos)
3142 #endif
3143
3144 for (pos = 0; pos < num_pairs; pos++) {
3145 int seq1 = row_id[pos];
3146 int seq2 = col_id[pos];
3147 double d2l; // moved here for thread-safe (OpenMP)
3148 int sym_pos = seq1 * nseqs + seq2;
3149 dist_mat[sym_pos] = computeDist(seq1, seq2, dist_mat[sym_pos], d2l);
3150 if (params->ls_var_type == OLS)
3151 var_mat[sym_pos] = 1.0;
3152 else if (params->ls_var_type == WLS_PAUPLIN)
3153 var_mat[sym_pos] = 0.0;
3154 else if (params->ls_var_type == WLS_FIRST_TAYLOR)
3155 var_mat[sym_pos] = dist_mat[sym_pos];
3156 else if (params->ls_var_type == WLS_FITCH_MARGOLIASH)
3157 var_mat[sym_pos] = dist_mat[sym_pos] * dist_mat[sym_pos];
3158 else if (params->ls_var_type == WLS_SECOND_TAYLOR)
3159 var_mat[sym_pos] = -1.0 / d2l;
3160 }
3161
3162 // copy upper-triangle into lower-triangle and set diagonal = 0
3163 for (int seq1 = 0; seq1 < nseqs; seq1++)
3164 for (int seq2 = 0; seq2 <= seq1; seq2++) {
3165 pos = seq1 * nseqs + seq2;
3166 if (seq1 == seq2) {
3167 dist_mat[pos] = 0.0;
3168 var_mat[pos] = 0.0;
3169 }
3170 else {
3171 dist_mat[pos] = dist_mat[seq2 * nseqs + seq1];
3172 var_mat[pos] = var_mat[seq2 * nseqs + seq1];
3173 }
3174 if (dist_mat[pos] > longest_dist)
3175 longest_dist = dist_mat[pos];
3176 }
3177 delete[] col_id;
3178 delete[] row_id;
3179
3180 /*
3181 if (longest_dist > MAX_GENETIC_DIST * 0.99)
3182 outWarning("Some distances are saturated. Please check your alignment again");*/
3183 // NOTE: Bionj does handle long distances already (thanks Manuel)
3184 //return correctDist(dist_mat);
3185 return longest_dist;
3186 }
3187
computeDist(Params & params,Alignment * alignment,double * & dist_mat,double * & var_mat,string & dist_file)3188 double PhyloTree::computeDist(Params ¶ms, Alignment *alignment, double* &dist_mat, double* &var_mat,
3189 string &dist_file) {
3190 this->params = ¶ms;
3191 double longest_dist = 0.0;
3192 aln = alignment;
3193 dist_file = params.out_prefix;
3194 if (!model_factory) {
3195 if (params.compute_obs_dist)
3196 dist_file += ".obsdist";
3197 else
3198 //dist_file += ".jcdist"; // too many files, I decided to discard .jcdist
3199 dist_file += ".mldist";
3200 } else
3201 dist_file += ".mldist";
3202
3203 if (!dist_mat) {
3204 dist_mat = new double[alignment->getNSeq() * alignment->getNSeq()];
3205 memset(dist_mat, 0, sizeof(double) * alignment->getNSeq() * alignment->getNSeq());
3206 var_mat = new double[alignment->getNSeq() * alignment->getNSeq()];
3207 // BUG!
3208 //memset(var_mat, 1, sizeof(double) * alignment->getNSeq() * alignment->getNSeq());
3209 int nseq = alignment->getNSeq();
3210 for (int i = 0; i < nseq; i++)
3211 for (int j = 0; j < nseq; j++)
3212 var_mat[i*nseq+j] = 1.0;
3213 }
3214 if (!params.dist_file) {
3215 longest_dist = computeDist(dist_mat, var_mat);
3216 alignment->printDist(dist_file.c_str(), dist_mat);
3217 } else {
3218 longest_dist = alignment->readDist(params.dist_file, dist_mat);
3219 dist_file = params.dist_file;
3220 }
3221 return longest_dist;
3222 }
3223
computeObsDist(double * dist_mat)3224 double PhyloTree::computeObsDist(double *dist_mat) {
3225 int nseqs = aln->getNSeq();
3226 int pos = 0;
3227 double longest_dist = 0.0;
3228 for (int seq1 = 0; seq1 < nseqs; seq1++)
3229 for (int seq2 = 0; seq2 < nseqs; seq2++, pos++) {
3230 if (seq1 == seq2)
3231 dist_mat[pos] = 0.0;
3232 else if (seq2 > seq1) {
3233 dist_mat[pos] = aln->computeObsDist(seq1, seq2);
3234 } else
3235 dist_mat[pos] = dist_mat[seq2 * nseqs + seq1];
3236
3237 if (dist_mat[pos] > longest_dist)
3238 longest_dist = dist_mat[pos];
3239 }
3240 return longest_dist;
3241 //return correctDist(dist_mat);
3242 }
3243
computeObsDist(Params & params,Alignment * alignment,double * & dist_mat,string & dist_file)3244 double PhyloTree::computeObsDist(Params ¶ms, Alignment *alignment, double* &dist_mat, string &dist_file) {
3245 double longest_dist = 0.0;
3246 aln = alignment;
3247 dist_file = params.out_prefix;
3248 dist_file += ".obsdist";
3249
3250 if (!dist_mat) {
3251 dist_mat = new double[alignment->getNSeq() * alignment->getNSeq()];
3252 memset(dist_mat, 0, sizeof(double) * alignment->getNSeq() * alignment->getNSeq());
3253 }
3254 longest_dist = computeObsDist(dist_mat);
3255 alignment->printDist(dist_file.c_str(), dist_mat);
3256
3257 return longest_dist;
3258 }
3259
3260 /****************************************************************************
3261 compute BioNJ tree, a more accurate extension of Neighbor-Joining
3262 ****************************************************************************/
3263
computeBioNJ(Params & params,Alignment * alignment,string & dist_file)3264 void PhyloTree::computeBioNJ(Params ¶ms, Alignment *alignment, string &dist_file) {
3265 string bionj_file = params.out_prefix;
3266 bionj_file += ".bionj";
3267 cout << "Computing BIONJ tree..." << endl;
3268 BioNj bionj;
3269 bionj.create(dist_file.c_str(), bionj_file.c_str());
3270 // bool my_rooted = false;
3271 bool non_empty_tree = (root != NULL);
3272 // if (root)
3273 // freeNode();
3274 readTreeFile(bionj_file.c_str());
3275
3276
3277 if (non_empty_tree) {
3278 initializeAllPartialLh();
3279 }
3280 // setAlignment(alignment);
3281 }
3282
setNegativeBranch(bool force,double newlen,Node * node,Node * dad)3283 int PhyloTree::setNegativeBranch(bool force, double newlen, Node *node, Node *dad) {
3284 if (!node) node = root;
3285 int fixed = 0;
3286
3287 FOR_NEIGHBOR_IT(node, dad, it) {
3288 if ((*it)->length < 0.0 || force) { // negative branch length detected
3289 (*it)->length = newlen;
3290 // set the backward branch length
3291 (*it)->node->findNeighbor(node)->length = (*it)->length;
3292 fixed++;
3293 }
3294 fixed += setNegativeBranch(force, newlen, (*it)->node, node);
3295 }
3296 return fixed;
3297 }
3298
fixOneNegativeBranch(double branch_length,Neighbor * dad_branch,Node * dad)3299 void PhyloTree::fixOneNegativeBranch(double branch_length, Neighbor *dad_branch, Node *dad) {
3300 dad_branch->length = branch_length;
3301 // set the backward branch length
3302 dad_branch->node->findNeighbor(dad)->length = branch_length;
3303 }
3304
fixNegativeBranch(bool force,Node * node,Node * dad)3305 int PhyloTree::fixNegativeBranch(bool force, Node *node, Node *dad) {
3306
3307 // 2019-02-05: fix crash when no variant sites found
3308 if (aln->num_variant_sites == 0)
3309 return setNegativeBranch(force, params->min_branch_length, root, NULL);
3310
3311 if (!node) {
3312 node = root;
3313 // 2015-11-30: if not bifurcating, initialize unknown branch lengths with 0.1
3314 if (!isBifurcating())
3315 return setNegativeBranch(force, 0.1, root, NULL);
3316 }
3317 int fixed = 0;
3318
3319 if (force && !cost_matrix)
3320 return setParsimonyBranchLengths();
3321
3322 double alpha = (site_rate) ? site_rate->getGammaShape() : 1.0;
3323
3324 FOR_NEIGHBOR_IT(node, dad, it){
3325 if ((*it)->length < 0.0 || force) { // negative branch length detected
3326 int branch_subst;
3327 int pars_score = computeParsimonyBranch((PhyloNeighbor*) (*it), (PhyloNode*) node, &branch_subst);
3328 // first compute the observed parsimony distance
3329 double branch_length = (branch_subst > 0) ? ((double) branch_subst / getAlnNSite()) : (1.0 / getAlnNSite());
3330
3331 branch_length = correctBranchLengthF81(branch_length, alpha);
3332
3333 // if (verbose_mode >= VB_DEBUG)
3334 // cout << "Negative branch length " << (*it)->length << " was set to ";
3335 //(*it)->length = fixed_length;
3336 //(*it)->length = random_double()+0.1;
3337 fixOneNegativeBranch(branch_length, (*it), node);
3338 if (verbose_mode >= VB_DEBUG)
3339 cout << branch_length << " parsimony = " << pars_score << endl;
3340 fixed++;
3341 }
3342 if ((*it)->length <= 0.0 && (!rooted || node != root)) {
3343 (*it)->length = params->min_branch_length;
3344 (*it)->node->findNeighbor(node)->length = (*it)->length;
3345 }
3346 fixed += fixNegativeBranch(force, (*it)->node, node);
3347 }
3348 return fixed;
3349 }
3350
3351 //int PhyloTree::assignRandomBranchLengths(bool force, Node *node, Node *dad) {
3352 //
3353 // if (!node)
3354 // node = root;
3355 // int fixed = 0;
3356 //
3357 // FOR_NEIGHBOR_IT(node, dad, it){
3358 // if ((*it)->length < 0.0 || force) { // negative branch length detected
3359 // if (verbose_mode >= VB_DEBUG)
3360 // cout << "Negative branch length " << (*it)->length << " was set to ";
3361 // (*it)->length = random_double() + 0.1;
3362 // if (verbose_mode >= VB_DEBUG)
3363 // cout << (*it)->length << endl;
3364 // // set the backward branch length
3365 // (*it)->node->findNeighbor(node)->length = (*it)->length;
3366 // fixed++;
3367 // }
3368 // if ((*it)->length <= 0.0) {
3369 // (*it)->length = 1e-6;
3370 // (*it)->node->findNeighbor(node)->length = (*it)->length;
3371 // }
3372 // fixed += assignRandomBranchLengths(force, (*it)->node, node);
3373 // }
3374 // return fixed;
3375 //}
3376
3377 /****************************************************************************
3378 Nearest Neighbor Interchange by maximum likelihood
3379 ****************************************************************************/
3380
3381 /*
3382 void PhyloTree::doOneRandomNNI(Branch branch) {
3383 assert(isInnerBranch(branch.first, branch.second));
3384
3385 if (((PhyloNeighbor*)branch.first->findNeighbor(branch.second))->direction == TOWARD_ROOT) {
3386 // swap node1 and node2 if the direction is not right, only for nonreversible models
3387 Node *tmp = branch.first;
3388 branch.first = branch.second;
3389 branch.second = tmp;
3390 }
3391
3392 NNIMove nni;
3393 nni.node1 = (PhyloNode*) branch.first;
3394 nni.node2 = (PhyloNode*) branch.second;
3395 FOR_NEIGHBOR_IT(branch.first, branch.second, node1NeiIt)
3396 if (((PhyloNeighbor*)*node1NeiIt)->direction != TOWARD_ROOT)
3397 {
3398 nni.node1Nei_it = node1NeiIt;
3399 break;
3400 }
3401 int randInt = random_int(branch.second->neighbors.size()-1);
3402 int cnt = 0;
3403 FOR_NEIGHBOR_IT(branch.second, branch.first, node2NeiIt) {
3404 // if this loop, is it sure that direction is away from root because node1->node2 is away from root
3405 if (cnt == randInt) {
3406 nni.node2Nei_it = node2NeiIt;
3407 break;
3408 } else {
3409 cnt++;
3410 }
3411 }
3412 assert(*nni.node1Nei_it != NULL && *nni.node2Nei_it != NULL);
3413 assert(((PhyloNeighbor*)*nni.node1Nei_it)->direction != TOWARD_ROOT && ((PhyloNeighbor*)*nni.node2Nei_it)->direction != TOWARD_ROOT);
3414
3415 if (constraintTree.isCompatible(nni))
3416 doNNI(nni, true);
3417 }
3418 */
3419
getRandomNNI(Branch & branch)3420 NNIMove PhyloTree::getRandomNNI(Branch &branch) {
3421 ASSERT(isInnerBranch(branch.first, branch.second));
3422 // for rooted tree
3423 if (((PhyloNeighbor*)branch.first->findNeighbor(branch.second))->direction == TOWARD_ROOT) {
3424 // swap node1 and node2 if the direction is not right, only for nonreversible models
3425 Node *tmp = branch.first;
3426 branch.first = branch.second;
3427 branch.second = tmp;
3428 }
3429 NNIMove nni;
3430 nni.node1 = (PhyloNode*) branch.first;
3431 nni.node2 = (PhyloNode*) branch.second;
3432
3433 FOR_NEIGHBOR_IT(branch.first, branch.second, node1NeiIt)
3434 if (((PhyloNeighbor*)*node1NeiIt)->direction != TOWARD_ROOT) {
3435 nni.node1Nei_it = node1NeiIt;
3436 break;
3437 }
3438 int randInt = random_int(branch.second->neighbors.size()-1);
3439 int cnt = 0;
3440 FOR_NEIGHBOR_IT(branch.second, branch.first, node2NeiIt) {
3441 // if this loop, is it sure that direction is away from root because node1->node2 is away from root
3442 if (cnt == randInt) {
3443 nni.node2Nei_it = node2NeiIt;
3444 break;
3445 } else {
3446 cnt++;
3447 }
3448 }
3449 ASSERT(*nni.node1Nei_it != NULL && *nni.node2Nei_it != NULL);
3450 ASSERT(((PhyloNeighbor*)*nni.node1Nei_it)->direction != TOWARD_ROOT && ((PhyloNeighbor*)*nni.node2Nei_it)->direction != TOWARD_ROOT);
3451 nni.newloglh = 0.0;
3452 return nni;
3453 }
3454
doNNI(NNIMove & move,bool clearLH)3455 void PhyloTree::doNNI(NNIMove &move, bool clearLH) {
3456 PhyloNode *node1 = move.node1;
3457 PhyloNode *node2 = move.node2;
3458 NeighborVec::iterator node1Nei_it = move.node1Nei_it;
3459 NeighborVec::iterator node2Nei_it = move.node2Nei_it;
3460 Neighbor *node1Nei = *(node1Nei_it);
3461 Neighbor *node2Nei = *(node2Nei_it);
3462
3463 // TODO MINH
3464 /* Node *nodeA = node1Nei->node;
3465 Node *nodeB = node2Nei->node;
3466
3467 NeighborVec::iterator nodeA_it = nodeA->findNeighborIt(node1);
3468 NeighborVec::iterator nodeB_it = nodeB->findNeighborIt(node2);
3469 Neighbor *nodeANei = *(nodeA_it);
3470 Neighbor *nodeBNei = *(nodeB_it);
3471 *node1Nei_it = node2Nei;
3472 *nodeB_it = nodeANei;
3473 *node2Nei_it = node1Nei;
3474 *nodeA_it = nodeBNei;*/
3475 // END TODO MINH
3476 ASSERT(node1->degree() == 3 && node2->degree() == 3);
3477
3478 PhyloNeighbor *node12_it = (PhyloNeighbor*) node1->findNeighbor(node2); // return neighbor of node1 which points to node 2
3479 PhyloNeighbor *node21_it = (PhyloNeighbor*) node2->findNeighbor(node1); // return neighbor of node2 which points to node 1
3480
3481 // reorient partial_lh before swap
3482 if (!isSuperTree()) {
3483 reorientPartialLh(node12_it, node1);
3484 reorientPartialLh(node21_it, node2);
3485 }
3486
3487 // do the NNI swap
3488 node1->updateNeighbor(node1Nei_it, node2Nei);
3489 node2Nei->node->updateNeighbor(node2, node1);
3490
3491 node2->updateNeighbor(node2Nei_it, node1Nei);
3492 node1Nei->node->updateNeighbor(node1, node2);
3493
3494 // BQM check branch ID
3495 /*
3496 if (node1->findNeighbor(nodeB)->id != nodeB->findNeighbor(node1)->id) {
3497 cout << node1->findNeighbor(nodeB)->id << "<->" << nodeB->findNeighbor(node1)->id << endl;
3498 cout << node1->id << "," << nodeB->id << endl;
3499 outError("Wrong ID");
3500 }
3501 if (node2->findNeighbor(nodeA)->id != nodeA->findNeighbor(node2)->id) {
3502 cout << node2->findNeighbor(nodeA)->id << "<->" << nodeA->findNeighbor(node2)->id << endl;
3503 cout << node2->id << "," << nodeA->id << endl;
3504 outError("Wrong ID");
3505 }*/
3506
3507 PhyloNeighbor *nei12 = (PhyloNeighbor*) node1->findNeighbor(node2); // return neighbor of node1 which points to node 2
3508 PhyloNeighbor *nei21 = (PhyloNeighbor*) node2->findNeighbor(node1); // return neighbor of node2 which points to node 1
3509
3510 if (clearLH) {
3511 // clear partial likelihood vector
3512 nei12->clearPartialLh();
3513 nei21->clearPartialLh();
3514 nei12->size = nei21->size = 0;
3515
3516 node2->clearReversePartialLh(node1);
3517 node1->clearReversePartialLh(node2);
3518 //if (params->nni5Branches)
3519 // clearAllPartialLH();
3520 }
3521
3522 if (params->leastSquareNNI) {
3523 updateSubtreeDists(move);
3524 }
3525
3526 // update split store in node
3527 if (nei12->split != NULL || nei21->split != NULL) {
3528 delete nei12->split;
3529 nei12->split = new Split(leafNum);
3530 delete nei21->split;
3531 nei21->split = new Split(leafNum);
3532
3533 FOR_NEIGHBOR_IT(nei12->node, node1, it)
3534 *(nei12->split) += *((*it)->split);
3535
3536 FOR_NEIGHBOR_IT(nei21->node, node2, it)
3537 *(nei21->split) += *((*it)->split);
3538 }
3539 }
3540
changeNNIBrans(NNIMove & nnimove)3541 void PhyloTree::changeNNIBrans(NNIMove &nnimove) {
3542 PhyloNode *node1 = nnimove.node1;
3543 PhyloNode *node2 = nnimove.node2;
3544 PhyloNeighbor *node1_node2_nei = (PhyloNeighbor*) node1->findNeighbor(node2);
3545 PhyloNeighbor *node2_node1_nei = (PhyloNeighbor*) node2->findNeighbor(node1);
3546 node1_node2_nei->setLength(nnimove.newLen[0]);
3547 node2_node1_nei->setLength(nnimove.newLen[0]);
3548 if (params->nni5) {
3549 int i = 1;
3550 Neighbor* nei;
3551 Neighbor* nei_back;
3552 NeighborVec::iterator it;
3553 FOR_NEIGHBOR(node1, node2, it)
3554 {
3555 nei = (*it)->node->findNeighbor(node1);
3556 nei_back = (node1)->findNeighbor((*it)->node);
3557 nei->setLength(nnimove.newLen[i]);
3558 nei_back->setLength(nnimove.newLen[i]);
3559 i++;
3560 }
3561 FOR_NEIGHBOR(node2, node1, it)
3562 {
3563 nei = (*it)->node->findNeighbor(node2);
3564 nei_back = (node2)->findNeighbor((*it)->node);
3565 nei->setLength(nnimove.newLen[i]);
3566 nei_back->setLength(nnimove.newLen[i]);
3567 i++;
3568 }
3569 }
3570 }
3571
getBestNNIForBran(PhyloNode * node1,PhyloNode * node2,NNIMove * nniMoves)3572 NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove* nniMoves) {
3573
3574 ASSERT(!node1->isLeaf() && !node2->isLeaf());
3575 ASSERT(node1->degree() == 3 && node2->degree() == 3);
3576
3577 if (((PhyloNeighbor*)node1->findNeighbor(node2))->direction == TOWARD_ROOT) {
3578 // swap node1 and node2 if the direction is not right, only for nonreversible models
3579 PhyloNode *tmp = node1;
3580 node1 = node2;
3581 node2 = tmp;
3582 }
3583
3584 int IT_NUM = (params->nni5) ? 6 : 2;
3585 size_t partial_lh_size = getPartialLhBytes()/sizeof(double);
3586 size_t scale_num_size = getScaleNumBytes()/sizeof(UBYTE);
3587
3588
3589 // Upper Bounds ---------------
3590 /*
3591 if(params->upper_bound_NNI){
3592 totalNNIub += 2;
3593 NNIMove resMove;
3594 resMove = getBestNNIForBranUB(node1,node2,this);
3595 // if UB is smaller than the current likelihood, then we don't recompute the likelihood of the swapped topology.
3596 // Otherwise, follow the normal procedure: evaluate NNIs and compute the likelihood.
3597
3598 // here, we skip NNI is its UB n times worse than the curLikelihood
3599 if( resMove.newloglh < (1+params->upper_bound_frac)*this->curScore){
3600 //cout << "Skipping Likelihood evaluation of NNIs for this branch :) ........................"<<endl;
3601 return resMove;
3602 }
3603 }
3604 */
3605
3606 //-----------------------------
3607
3608 NeighborVec::iterator it;
3609
3610 NeighborVec::iterator saved_it[6];
3611 int id = 0;
3612
3613 saved_it[id++] = node1->findNeighborIt(node2);
3614 saved_it[id++] = node2->findNeighborIt(node1);
3615
3616 if (params->nni5) {
3617 FOR_NEIGHBOR(node1, node2, it)
3618 saved_it[id++] = (*it)->node->findNeighborIt(node1);
3619
3620 FOR_NEIGHBOR(node2, node1, it)
3621 saved_it[id++] = (*it)->node->findNeighborIt(node2);
3622 }
3623 ASSERT(id == IT_NUM);
3624
3625 if (!params->nni5) {
3626 reorientPartialLh((PhyloNeighbor*)node1->findNeighbor(node2), node1);
3627 reorientPartialLh((PhyloNeighbor*)node2->findNeighbor(node1), node2);
3628 }
3629
3630 Neighbor *saved_nei[6];
3631 int mem_id = 0;
3632 // save Neighbor and allocate new Neighbor pointer
3633 for (id = 0; id < IT_NUM; id++) {
3634 saved_nei[id] = (*saved_it[id]);
3635 *saved_it[id] = saved_nei[id]->newNeighbor();
3636
3637 if (((PhyloNeighbor*)saved_nei[id])->partial_lh) {
3638 ((PhyloNeighbor*) (*saved_it[id]))->partial_lh = nni_partial_lh + mem_id*partial_lh_size;
3639 ((PhyloNeighbor*) (*saved_it[id]))->scale_num = nni_scale_num + mem_id*scale_num_size;
3640 mem_id++;
3641 mem_slots.addSpecialNei((PhyloNeighbor*)*saved_it[id]);
3642 }
3643 // ((PhyloNeighbor*) (*saved_it[id]))->scale_num = newScaleNum();
3644 }
3645 if (params->nni5)
3646 ASSERT(mem_id == 2);
3647
3648 // get the Neighbor again since it is replaced for saving purpose
3649 PhyloNeighbor* node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
3650 PhyloNeighbor* node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
3651
3652 int cnt;
3653
3654 //NNIMove nniMoves[2];
3655 bool newNNIMoves = false;
3656 if (!nniMoves) {
3657 // Initialize the 2 NNI moves
3658 newNNIMoves = true;
3659 nniMoves = new NNIMove[2];
3660 nniMoves[0].ptnlh = nniMoves[1].ptnlh = NULL;
3661 nniMoves[0].node1 = NULL;
3662
3663 }
3664
3665 if (nniMoves[0].node1) {
3666 // assuming that node1Nei_it and node2Nei_it is defined in nniMoves structure
3667 for (cnt = 0; cnt < 2; cnt++) {
3668 // sanity check
3669 if (!node1->findNeighbor((*nniMoves[cnt].node1Nei_it)->node)) outError(__func__);
3670 if (!node2->findNeighbor((*nniMoves[cnt].node2Nei_it)->node)) outError(__func__);
3671 }
3672 } else {
3673 cnt = 0;
3674 FOR_NEIGHBOR_IT(node1, node2, node1_it)
3675 if (((PhyloNeighbor*)*node1_it)->direction != TOWARD_ROOT)
3676 {
3677 cnt = 0;
3678 FOR_NEIGHBOR_IT(node2, node1, node2_it) {
3679 // Initialize the 2 NNI moves
3680 nniMoves[cnt].node1Nei_it = node1_it;
3681 nniMoves[cnt].node2Nei_it = node2_it;
3682 cnt++;
3683 }
3684 break;
3685 }
3686 ASSERT(cnt == 2);
3687 }
3688
3689 // Initialize node1 and node2 in nniMoves
3690 nniMoves[0].node1 = nniMoves[1].node1 = node1;
3691 nniMoves[0].node2 = nniMoves[1].node2 = node2;
3692 nniMoves[0].newloglh = nniMoves[1].newloglh = -DBL_MAX;
3693
3694 double backupScore = curScore;
3695
3696 for (cnt = 0; cnt < 2; cnt++) if (constraintTree.isCompatible(nniMoves[cnt]))
3697 {
3698 // do the NNI swap
3699 NeighborVec::iterator node1_it = nniMoves[cnt].node1Nei_it;
3700 NeighborVec::iterator node2_it = nniMoves[cnt].node2Nei_it;
3701 Neighbor *node1_nei = *node1_it;
3702 Neighbor *node2_nei = *node2_it;
3703
3704 // reorient partial_lh before swap
3705 reorientPartialLh(node12_it, node1);
3706 reorientPartialLh(node21_it, node2);
3707
3708 node1->updateNeighbor(node1_it, node2_nei);
3709 node2_nei->node->updateNeighbor(node2, node1);
3710
3711 node2->updateNeighbor(node2_it, node1_nei);
3712 node1_nei->node->updateNeighbor(node1, node2);
3713
3714 if (params->lh_mem_save == LM_MEM_SAVE) {
3715 // reset subtree size to change traversal order
3716 for (id = 0; id < IT_NUM; id++)
3717 ((PhyloNeighbor*)*saved_it[id])->size = 0;
3718 }
3719
3720 int nni5_num_eval = max(params->nni5_num_eval, getMixlen());
3721
3722 for (int step = 0; step < nni5_num_eval; step++) {
3723
3724
3725 // clear partial likelihood vector
3726 node12_it->clearPartialLh();
3727 node21_it->clearPartialLh();
3728
3729 // compute the score of the swapped topology
3730 // double saved_len = node1_nei->length;
3731
3732 // BUG FIX: commented out following (reported by Stephen Crotty)
3733 // optimizeOneBranch(node1, node2, false, NNI_MAX_NR_STEP);
3734 // node1->findNeighbor(node2)->getLength(nniMoves[cnt].newLen[0]);
3735
3736 int i=1;
3737 if (params->nni5) {
3738 FOR_NEIGHBOR(node1, node2, it)
3739 {
3740 ((PhyloNeighbor*) (*it)->node->findNeighbor(node1))->clearPartialLh();
3741 optimizeOneBranch(node1, (PhyloNode*) (*it)->node, false, NNI_MAX_NR_STEP);
3742 node1->findNeighbor((*it)->node)->getLength(nniMoves[cnt].newLen[i]);
3743 i++;
3744 }
3745 node21_it->clearPartialLh();
3746 }
3747
3748 optimizeOneBranch(node1, node2, false, NNI_MAX_NR_STEP);
3749 node1->findNeighbor(node2)->getLength(nniMoves[cnt].newLen[0]);
3750
3751 if (params->nni5) {
3752 FOR_NEIGHBOR(node2, node1, it)
3753 {
3754 ((PhyloNeighbor*) (*it)->node->findNeighbor(node2))->clearPartialLh();
3755 optimizeOneBranch(node2, (PhyloNode*) (*it)->node, false, NNI_MAX_NR_STEP);
3756 //node2_lastnei = (PhyloNeighbor*) (*it);
3757 node2->findNeighbor((*it)->node)->getLength(nniMoves[cnt].newLen[i]);
3758 i++;
3759 }
3760 node12_it->clearPartialLh();
3761 }
3762 }
3763 double score = computeLikelihoodFromBuffer();
3764 if (verbose_mode >= VB_DEBUG)
3765 cout << "NNI " << node1->id << " - " << node2->id << ": " << score << endl;
3766 nniMoves[cnt].newloglh = score;
3767 // compute the pattern likelihoods if wanted
3768 if (nniMoves[cnt].ptnlh)
3769 computePatternLikelihood(nniMoves[cnt].ptnlh, &score);
3770
3771 if (save_all_trees == 2) {
3772 saveCurrentTree(score); // BQM: for new bootstrap
3773 }
3774
3775 // reorient partial_lh before swap
3776 reorientPartialLh(node12_it, node1);
3777 reorientPartialLh(node21_it, node2);
3778
3779 // else, swap back, also recover the branch lengths
3780 node1->updateNeighbor(node1_it, node1_nei);
3781 node1_nei->node->updateNeighbor(node2, node1);
3782 node2->updateNeighbor(node2_it, node2_nei);
3783 node2_nei->node->updateNeighbor(node1, node2);
3784 // ONLY FOR CHECKING WITH OLGA's PLEN MODEL
3785 //node1_nei->length = node2_nei->length = saved_len;
3786 }
3787
3788 // restore the Neighbor*
3789 for (id = IT_NUM-1; id >= 0; id--) {
3790 // aligned_free(((PhyloNeighbor*) *saved_it[id])->scale_num);
3791 //delete[] ((PhyloNeighbor*) *saved_it[id])->partial_lh;
3792 // if (((PhyloNeighbor*)saved_nei[id])->partial_lh) {
3793 // if (saved_nei[id]->node == node1)
3794 // mem_slots.restore(node21_it, (PhyloNeighbor*)saved_nei[id]);
3795 // else
3796 // mem_slots.restore(node12_it, (PhyloNeighbor*)saved_nei[id]);
3797 // }
3798 if (*saved_it[id] == current_it) current_it = (PhyloNeighbor*) saved_nei[id];
3799 if (*saved_it[id] == current_it_back) current_it_back = (PhyloNeighbor*) saved_nei[id];
3800
3801 delete (*saved_it[id]);
3802 (*saved_it[id]) = saved_nei[id];
3803 }
3804
3805 mem_slots.eraseSpecialNei();
3806
3807 // aligned_free(new_partial_lh);
3808
3809 // restore the length of 4 branches around node1, node2
3810 FOR_NEIGHBOR(node1, node2, it)
3811 (*it)->setLength((*it)->node->findNeighbor(node1));
3812 FOR_NEIGHBOR(node2, node1, it)
3813 (*it)->setLength((*it)->node->findNeighbor(node2));
3814
3815 // restore curScore
3816 curScore = backupScore;
3817
3818 NNIMove res;
3819 if (nniMoves[0].newloglh > nniMoves[1].newloglh) {
3820 res = nniMoves[0];
3821 } else {
3822 res = nniMoves[1];
3823 }
3824 if (newNNIMoves) {
3825 delete [] nniMoves;
3826 }
3827 return res;
3828 }
3829
3830
3831 /****************************************************************************
3832 Subtree Pruning and Regrafting by maximum likelihood
3833 ****************************************************************************/
3834
optimizeSPR_old(double cur_score,PhyloNode * node,PhyloNode * dad)3835 double PhyloTree::optimizeSPR_old(double cur_score, PhyloNode *node, PhyloNode *dad) {
3836 if (!node)
3837 node = (PhyloNode*) root;
3838 PhyloNeighbor * dad1_nei = NULL;
3839 PhyloNeighbor * dad2_nei = NULL;
3840 PhyloNode * sibling1 = NULL;
3841 PhyloNode * sibling2 = NULL;
3842 double sibling1_len = 0.0, sibling2_len = 0.0;
3843
3844 if (dad && !dad->isLeaf()) {
3845
3846 ASSERT(dad->degree() == 3);
3847 // assign the sibling of node, with respect to dad
3848
3849 FOR_NEIGHBOR_DECLARE(dad, node, it) {
3850 if (!sibling1) {
3851 dad1_nei = (PhyloNeighbor*) (*it);
3852 sibling1 = (PhyloNode*) (*it)->node;
3853 sibling1_len = (*it)->length;
3854 } else {
3855
3856 dad2_nei = (PhyloNeighbor*) (*it);
3857 sibling2 = (PhyloNode*) (*it)->node;
3858 sibling2_len = (*it)->length;
3859 }
3860 }
3861 // remove the subtree leading to node
3862 double sum_len = sibling1_len + sibling2_len;
3863 sibling1->updateNeighbor(dad, sibling2, sum_len);
3864 sibling2->updateNeighbor(dad, sibling1, sum_len);
3865 PhyloNeighbor* sibling1_nei = (PhyloNeighbor*) sibling1->findNeighbor(sibling2);
3866 PhyloNeighbor* sibling2_nei = (PhyloNeighbor*) sibling2->findNeighbor(sibling1);
3867 sibling1_nei->clearPartialLh();
3868 sibling2_nei->clearPartialLh();
3869
3870 // now try to move the subtree to somewhere else
3871 vector<PhyloNeighbor*> spr_path;
3872
3873 FOR_NEIGHBOR(sibling1, sibling2, it)
3874 {
3875 spr_path.push_back(sibling1_nei);
3876 double score = swapSPR_old(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling1,
3877 spr_path);
3878 // if likelihood score improves, return
3879 if (score > cur_score)
3880
3881 return score;
3882 spr_path.pop_back();
3883 }
3884
3885 FOR_NEIGHBOR(sibling2, sibling1, it)
3886 {
3887 spr_path.push_back(sibling2_nei);
3888 double score = swapSPR_old(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling2,
3889 spr_path);
3890 // if likelihood score improves, return
3891 if (score > cur_score)
3892
3893 return score;
3894 spr_path.pop_back();
3895 }
3896 // if likelihood does not imporve, swap back
3897 sibling1->updateNeighbor(sibling2, dad, sibling1_len);
3898 sibling2->updateNeighbor(sibling1, dad, sibling2_len);
3899 dad1_nei->node = sibling1;
3900 dad1_nei->length = sibling1_len;
3901 dad2_nei->node = sibling2;
3902 dad2_nei->length = sibling2_len;
3903 clearAllPartialLH();
3904 }
3905
3906 FOR_NEIGHBOR_IT(node, dad, it){
3907 double score = optimizeSPR_old(cur_score, (PhyloNode*) (*it)->node, node);
3908
3909 if (score > cur_score) return score;
3910 }
3911 return cur_score;
3912 }
3913
3914 /**
3915 move the subtree (dad1-node1) to the branch (dad2-node2)
3916 */
swapSPR_old(double cur_score,int cur_depth,PhyloNode * node1,PhyloNode * dad1,PhyloNode * orig_node1,PhyloNode * orig_node2,PhyloNode * node2,PhyloNode * dad2,vector<PhyloNeighbor * > & spr_path)3917 double PhyloTree::swapSPR_old(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1, PhyloNode *orig_node1,
3918 PhyloNode *orig_node2, PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path) {
3919 PhyloNeighbor *node1_nei = (PhyloNeighbor*) node1->findNeighbor(dad1);
3920 PhyloNeighbor *dad1_nei = (PhyloNeighbor*) dad1->findNeighbor(node1);
3921 double node1_dad1_len = node1_nei->length;
3922 PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
3923
3924 if (dad2) {
3925 // now, connect (node1-dad1) to the branch (node2-dad2)
3926
3927 bool first = true;
3928 PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
3929 PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
3930 double len2 = node2_nei->length;
3931
3932 FOR_NEIGHBOR_IT(dad1, node1, it){
3933 if (first) {
3934 (*it)->node = dad2;
3935 (*it)->length = len2 / 2;
3936 dad2->updateNeighbor(node2, dad1, len2 / 2);
3937 first = false;
3938 } else {
3939 (*it)->node = node2;
3940 (*it)->length = len2 / 2;
3941 node2->updateNeighbor(dad2, dad1, len2 / 2);
3942 }
3943 ((PhyloNeighbor*) (*it))->clearPartialLh();
3944 }
3945 node2_nei->clearPartialLh();
3946 dad2_nei->clearPartialLh();
3947 node1_nei->clearPartialLh();
3948 vector<PhyloNeighbor*>::iterator it2;
3949 for (it2 = spr_path.begin(); it2 != spr_path.end(); it2++)
3950 (*it2)->clearPartialLh();
3951 clearAllPartialLH();
3952 // optimize relevant branches
3953 double score;
3954
3955 /* testing different branch optimization */
3956 optimizeOneBranch(node1, dad1);
3957 score = computeLikelihoodFromBuffer();
3958 //score = optimizeOneBranch(dad2, dad1);
3959 //score = optimizeOneBranch(node2, dad1);
3960 /*
3961 PhyloNode *cur_node = dad2;
3962 for (int i = spr_path.size()-1; i >= 0; i--) {
3963 score = optimizeOneBranch(cur_node, (PhyloNode*)spr_path[i]->node);
3964 cur_node = (PhyloNode*)spr_path[i]->node;
3965 }
3966 */
3967 //score = optimizeAllBranches(dad1);
3968 // if score improves, return
3969 if (score > cur_score)
3970 return score;
3971 // else, swap back
3972 node2->updateNeighbor(dad1, dad2, len2);
3973 dad2->updateNeighbor(dad1, node2, len2);
3974 node2_nei->clearPartialLh();
3975 dad2_nei->clearPartialLh();
3976 node1_nei->length = node1_dad1_len;
3977 dad1_nei->length = node1_dad1_len;
3978
3979 // add to candiate SPR moves
3980 spr_moves.add(node1, dad1, node2, dad2, score);
3981 }
3982 if (cur_depth >= spr_radius)
3983
3984 return cur_score;
3985 spr_path.push_back(node2_nei);
3986
3987 FOR_NEIGHBOR_IT(node2, dad2, it){
3988 double score = swapSPR(cur_score, cur_depth + 1, node1, dad1, orig_node1, orig_node2, (PhyloNode*) (*it)->node, node2, spr_path);
3989 if (score > cur_score) return score;
3990 }
3991 spr_path.pop_back();
3992
3993 return cur_score;
3994
3995 }
3996
optimizeSPR(double cur_score,PhyloNode * node,PhyloNode * dad)3997 double PhyloTree::optimizeSPR(double cur_score, PhyloNode *node, PhyloNode *dad) {
3998 if (!node)
3999 node = (PhyloNode*) root;
4000 PhyloNeighbor * dad1_nei = NULL;
4001 PhyloNeighbor * dad2_nei = NULL;
4002 PhyloNode * sibling1 = NULL;
4003 PhyloNode * sibling2 = NULL;
4004 double sibling1_len = 0.0, sibling2_len = 0.0;
4005
4006 if (dad && !dad->isLeaf()) {
4007
4008 ASSERT(dad->degree() == 3);
4009 // assign the sibling of node, with respect to dad
4010
4011 FOR_NEIGHBOR_DECLARE(dad, node, it) {
4012 if (!sibling1) {
4013 dad1_nei = (PhyloNeighbor*) (*it);
4014 sibling1 = (PhyloNode*) (*it)->node;
4015 sibling1_len = (*it)->length;
4016 } else {
4017
4018 dad2_nei = (PhyloNeighbor*) (*it);
4019 sibling2 = (PhyloNode*) (*it)->node;
4020 sibling2_len = (*it)->length;
4021 }
4022 }
4023 // remove the subtree leading to node
4024 double sum_len = sibling1_len + sibling2_len;
4025 sibling1->updateNeighbor(dad, sibling2, sum_len);
4026 sibling2->updateNeighbor(dad, sibling1, sum_len);
4027 PhyloNeighbor* sibling1_nei = (PhyloNeighbor*) sibling1->findNeighbor(sibling2);
4028 PhyloNeighbor* sibling2_nei = (PhyloNeighbor*) sibling2->findNeighbor(sibling1);
4029 // save partial likelihood
4030 double* sibling1_partial_lh = sibling1_nei->partial_lh;
4031 double* sibling2_partial_lh = sibling2_nei->partial_lh;
4032 sibling1_nei->partial_lh = newPartialLh();
4033 sibling2_nei->partial_lh = newPartialLh();
4034 sibling1_nei->clearPartialLh();
4035 sibling2_nei->clearPartialLh();
4036
4037 // now try to move the subtree to somewhere else
4038 vector<PhyloNeighbor*> spr_path;
4039
4040 FOR_NEIGHBOR(sibling1, sibling2, it)
4041 {
4042 spr_path.push_back(sibling1_nei);
4043 double score = swapSPR(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling1,
4044 spr_path);
4045 // if likelihood score improves, return
4046 if (score > cur_score) {
4047 cout << "cur_score = " << cur_score << endl;
4048 cout << "Found new BETTER SCORE by SPR: " << score << endl;
4049
4050 return score;
4051 }
4052 spr_path.pop_back();
4053 }
4054
4055 FOR_NEIGHBOR(sibling2, sibling1, it)
4056 {
4057 spr_path.push_back(sibling2_nei);
4058 double score = swapSPR(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling2,
4059 spr_path);
4060 // if likelihood score improves, return
4061 if (score > cur_score) {
4062 cout << "cur_score = " << cur_score << endl;
4063 cout << "Found new BETTER SCORE by SPR: " << score << endl;
4064
4065 return score;
4066 }
4067 spr_path.pop_back();
4068 }
4069 // if likelihood does not imporve, swap back
4070 sibling1->updateNeighbor(sibling2, dad, sibling1_len);
4071 sibling2->updateNeighbor(sibling1, dad, sibling2_len);
4072 dad1_nei->node = sibling1;
4073 dad1_nei->length = sibling1_len;
4074 dad2_nei->node = sibling2;
4075 dad2_nei->length = sibling2_len;
4076 delete[] sibling1_nei->partial_lh;
4077 delete[] sibling2_nei->partial_lh;
4078 sibling1_nei->partial_lh = sibling1_partial_lh;
4079 sibling2_nei->partial_lh = sibling2_partial_lh;
4080 //clearAllPartialLH();
4081
4082 }
4083
4084 FOR_NEIGHBOR_IT(node, dad, it){
4085 double score = optimizeSPR(cur_score, (PhyloNode*) (*it)->node, node);
4086
4087 if (score > cur_score) return score;
4088 }
4089 return cur_score;
4090 }
4091
4092 /**
4093 move the subtree (dad1-node1) to the branch (dad2-node2)
4094 */
swapSPR(double cur_score,int cur_depth,PhyloNode * node1,PhyloNode * dad1,PhyloNode * orig_node1,PhyloNode * orig_node2,PhyloNode * node2,PhyloNode * dad2,vector<PhyloNeighbor * > & spr_path)4095 double PhyloTree::swapSPR(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1, PhyloNode *orig_node1,
4096 PhyloNode *orig_node2, PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path) {
4097
4098 PhyloNeighbor *node1_nei = (PhyloNeighbor*) node1->findNeighbor(dad1);
4099 PhyloNeighbor *dad1_nei = (PhyloNeighbor*) dad1->findNeighbor(node1);
4100 double node1_dad1_len = node1_nei->length;
4101 PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
4102 PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
4103
4104 //double* node1dad1_lh_save = node1_nei->partial_lh;
4105 //double* dad1node1_lh_save = dad1_nei->partial_lh;
4106 //double node1dad1_scale = node1_nei->lh_scale_factor;
4107 //double dad1node1_scale = dad1_nei->lh_scale_factor;
4108
4109 double* node2dad2_lh_save = node2_nei->partial_lh;
4110 double* dad2node2_lh_save = dad2_nei->partial_lh;
4111 double node2dad2_scale = node2_nei->lh_scale_factor;
4112 double dad2node_scale = dad2_nei->lh_scale_factor;
4113
4114 double len2 = node2_nei->length;
4115 double newLen2 = sqrt(len2);
4116
4117 if (dad2 && cur_depth >= SPR_DEPTH) {
4118 // now, connect (node1-dad1) to the branch (node2-dad2)
4119
4120 bool first = true;
4121 //PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
4122 //PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
4123 //double len2 = node2_nei->length;
4124
4125 FOR_NEIGHBOR_IT(dad1, node1, it){
4126 // Finding new 2 neighbors for dad1 that are not node1
4127 if (first) {
4128 (*it)->node = dad2;
4129 //(*it)->length = len2 / 2;
4130 (*it)->length = newLen2;
4131 dad2->updateNeighbor(node2, dad1, newLen2);
4132 first = false;
4133 } else {
4134 (*it)->node = node2;
4135 (*it)->length = newLen2;
4136 node2->updateNeighbor(dad2, dad1, newLen2);
4137 }
4138 // clear all partial likelihood leading from
4139 // dad1 to the new neighbors
4140 ((PhyloNeighbor*) (*it))->clearPartialLh();
4141 }
4142
4143 // clear partial likelihood from node2 to dad1
4144 node2_nei->clearPartialLh();
4145 // clear partial likelihood from dad2 to dad1
4146 dad2_nei->clearPartialLh();
4147 // clear partial likelihood from dad1 to node1
4148 node1_nei->clearPartialLh();
4149
4150 // set new legnth as suggested by Alexis
4151 node1_nei->length = 0.9;
4152 dad1_nei->length = 0.9;
4153
4154 //Save the partial likelihood from the removal point to the insertion point
4155 vector<PhyloNeighbor*>::iterator it2;
4156 vector<double*> saved_partial_lhs(spr_path.size());
4157 for (it2 = spr_path.begin(); it2 != spr_path.end(); it2++) {
4158 saved_partial_lhs.push_back((*it2)->partial_lh);
4159 (*it2)->partial_lh = newPartialLh();
4160 (*it2)->clearPartialLh();
4161 }
4162
4163 // optimize relevant branches
4164 double score;
4165
4166 /* testing different branch optimization */
4167 optimizeOneBranch(node1, dad1);
4168 optimizeOneBranch(dad2, dad1);
4169 optimizeOneBranch(node2, dad1);
4170 optimizeOneBranch(orig_node1, orig_node2);
4171 score = computeLikelihoodFromBuffer();
4172
4173 /*
4174 PhyloNode *cur_node = dad2;
4175 for (int i = spr_path.size()-1; i >= 0; i--) {
4176 score = optimizeOneBranch(cur_node, (PhyloNode*)spr_path[i]->node);
4177 cur_node = (PhyloNode*)spr_path[i]->node;
4178 }
4179 */
4180 //score = optimizeAllBranches(dad1);
4181 // if score improves, return
4182 if (score > cur_score) {
4183 cout << score << endl;
4184 return score;
4185 }
4186
4187 // else, swap back
4188 node2->updateNeighbor(dad1, dad2, len2);
4189 dad2->updateNeighbor(dad1, node2, len2);
4190 //node2_nei->clearPartialLh();
4191 //dad2_nei->clearPartialLh();
4192 // restore partial likelihood vectors
4193 node2_nei->partial_lh = node2dad2_lh_save;
4194 node2_nei->lh_scale_factor = node2dad2_scale;
4195 dad2_nei->partial_lh = dad2node2_lh_save;
4196 dad2_nei->lh_scale_factor = dad2node_scale;
4197 node2_nei->length = len2;
4198 dad2_nei->length = len2;
4199 node1_nei->length = node1_dad1_len;
4200 dad1_nei->length = node1_dad1_len;
4201 int index = 0;
4202 for (it2 = spr_path.begin(); it2 != spr_path.end(); it2++) {
4203 delete[] (*it2)->partial_lh;
4204 (*it2)->partial_lh = saved_partial_lhs.at(index);
4205 (*it2)->unclearPartialLh();
4206 index++;
4207 }
4208
4209 // add to candiate SPR moves
4210 // Tung : why adding negative SPR move ?
4211 spr_moves.add(node1, dad1, node2, dad2, score);
4212 }
4213 if (cur_depth >= spr_radius)
4214
4215 return cur_score;
4216 spr_path.push_back(node2_nei);
4217
4218 FOR_NEIGHBOR_IT(node2, dad2, it){
4219 double score = swapSPR(cur_score, cur_depth + 1, node1, dad1, orig_node1, orig_node2, (PhyloNode*) (*it)->node, node2, spr_path);
4220 if (score > cur_score) return score;
4221 }
4222 spr_path.pop_back();
4223
4224 return cur_score;
4225 }
4226
assessSPRMove(double cur_score,const SPRMove & spr)4227 double PhyloTree::assessSPRMove(double cur_score, const SPRMove &spr) {
4228
4229 PhyloNode *dad = spr.prune_dad;
4230 PhyloNode *node = spr.prune_node;
4231 PhyloNode *dad2 = spr.regraft_dad;
4232 PhyloNode *node2 = spr.regraft_node;
4233
4234 PhyloNeighbor *dad_nei1 = NULL;
4235 PhyloNeighbor *dad_nei2 = NULL;
4236 PhyloNode *sibling1 = NULL;
4237 PhyloNode *sibling2 = NULL;
4238 double sibling1_len = 0.0, sibling2_len = 0.0;
4239
4240 PhyloNeighbor *node1_nei = (PhyloNeighbor*) node->findNeighbor(dad);
4241 PhyloNeighbor *dad1_nei = (PhyloNeighbor*) dad->findNeighbor(node);
4242 double node1_dad1_len = node1_nei->length;
4243
4244 // assign the sibling of node, with respect to dad
4245
4246 FOR_NEIGHBOR_DECLARE(dad, node, it) {
4247 if (!sibling1) {
4248 dad_nei1 = (PhyloNeighbor*) (*it);
4249 sibling1 = (PhyloNode*) (*it)->node;
4250 sibling1_len = (*it)->length;
4251 } else {
4252
4253 dad_nei2 = (PhyloNeighbor*) (*it);
4254 sibling2 = (PhyloNode*) (*it)->node;
4255 sibling2_len = (*it)->length;
4256 }
4257 }
4258 // remove the subtree leading to node
4259 double sum_len = sibling1_len + sibling2_len;
4260 sibling1->updateNeighbor(dad, sibling2, sum_len);
4261 sibling2->updateNeighbor(dad, sibling1, sum_len);
4262 // now try to move the subtree to somewhere else
4263
4264 bool first = true;
4265 PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
4266 //PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
4267 double len2 = node2_nei->length;
4268
4269 FOR_NEIGHBOR(dad, node, it)
4270 {
4271 if (first) {
4272 (*it)->node = dad2;
4273 (*it)->length = len2 / 2;
4274 dad2->updateNeighbor(node2, dad, len2 / 2);
4275 first = false;
4276 } else {
4277 (*it)->node = node2;
4278 (*it)->length = len2 / 2;
4279 node2->updateNeighbor(dad2, dad, len2 / 2);
4280 }
4281 ((PhyloNeighbor*) (*it))->clearPartialLh();
4282 }
4283
4284 clearAllPartialLH();
4285 // optimize branches
4286 double score;
4287 optimizeAllBranches(dad);
4288 score = computeLikelihoodBranch((PhyloNeighbor*)dad->neighbors.back(), dad);
4289
4290 // if score improves, return
4291 if (score > cur_score)
4292 return score;
4293 // else, swap back
4294 node2->updateNeighbor(dad, dad2, len2);
4295 dad2->updateNeighbor(dad, node2, len2);
4296
4297 node1_nei->length = node1_dad1_len;
4298 dad1_nei->length = node1_dad1_len;
4299
4300 sibling1->updateNeighbor(sibling2, dad, sibling1_len);
4301 sibling2->updateNeighbor(sibling1, dad, sibling2_len);
4302 dad_nei1->node = sibling1;
4303 dad_nei1->length = sibling1_len;
4304 dad_nei2->node = sibling2;
4305 dad_nei2->length = sibling2_len;
4306 clearAllPartialLH();
4307
4308 return cur_score;
4309
4310 }
4311
optimizeSPR()4312 double PhyloTree::optimizeSPR() {
4313 double cur_score = computeLikelihood();
4314 //spr_radius = leafNum / 5;
4315 spr_radius = 10;
4316 for (int i = 0; i < 100; i++) {
4317 cout << "i = " << i << endl;
4318 spr_moves.clear();
4319 double score = optimizeSPR_old(cur_score, (PhyloNode*) root->neighbors[0]->node);
4320 clearAllPartialLH();
4321 // why this?
4322 if (score <= cur_score) {
4323 for (SPRMoves::iterator it = spr_moves.begin(); it != spr_moves.end(); it++) {
4324 //cout << (*it).score << endl;
4325 score = assessSPRMove(cur_score, *it);
4326 // if likelihood score improves, apply to SPR
4327 if (score > cur_score)
4328 break;
4329 }
4330 if (score <= cur_score)
4331 break;
4332 } else {
4333
4334 cur_score = optimizeAllBranches();
4335 cout << "SPR " << i + 1 << " : " << cur_score << endl;
4336 cur_score = score;
4337 }
4338 }
4339 return cur_score;
4340 //return optimizeAllBranches();
4341 }
4342
optimizeSPRBranches()4343 double PhyloTree::optimizeSPRBranches() {
4344 cout << "Search with Subtree Pruning and Regrafting (SPR) using ML..." << endl;
4345 double cur_score = computeLikelihood();
4346 for (int i = 0; i < 100; i++) {
4347 double score = optimizeSPR();
4348 if (score <= cur_score + TOL_LIKELIHOOD)
4349
4350 break;
4351 cur_score = score;
4352 }
4353 return cur_score;
4354 }
4355
pruneSubtree(PhyloNode * node,PhyloNode * dad,PruningInfo & info)4356 void PhyloTree::pruneSubtree(PhyloNode *node, PhyloNode *dad, PruningInfo &info) {
4357
4358 bool first = true;
4359 info.node = node;
4360 info.dad = dad;
4361
4362 FOR_NEIGHBOR_IT(dad, node, it){
4363 if (first) {
4364 info.dad_it_left = it;
4365 info.dad_nei_left = (*it);
4366 info.dad_lh_left = ((PhyloNeighbor*) (*it))->partial_lh;
4367 info.left_node = (*it)->node;
4368 info.left_len = (*it)->length;
4369 first = false;
4370 } else {
4371
4372 info.dad_it_right = it;
4373 info.dad_nei_right = (*it);
4374 info.dad_lh_right = ((PhyloNeighbor*) (*it))->partial_lh;
4375 info.right_node = (*it)->node;
4376 info.right_len = (*it)->length;
4377 }
4378 }
4379 info.left_it = info.left_node->findNeighborIt(dad);
4380 info.right_it = info.right_node->findNeighborIt(dad);
4381 info.left_nei = (*info.left_it);
4382 info.right_nei = (*info.right_it);
4383
4384 info.left_node->updateNeighbor(info.left_it, info.dad_nei_right);
4385 info.right_node->updateNeighbor(info.right_it, info.dad_nei_left);
4386 ((PhyloNeighbor*) info.dad_nei_right)->partial_lh = newPartialLh();
4387 ((PhyloNeighbor*) info.dad_nei_left)->partial_lh = newPartialLh();
4388 }
4389
regraftSubtree(PruningInfo & info,PhyloNode * in_node,PhyloNode * in_dad)4390 void PhyloTree::regraftSubtree(PruningInfo &info, PhyloNode *in_node, PhyloNode *in_dad) {
4391
4392 NeighborVec::iterator in_node_it = in_node->findNeighborIt(in_dad);
4393 NeighborVec::iterator in_dad_it = in_dad->findNeighborIt(in_node);
4394 Neighbor *in_dad_nei = (*in_dad_it);
4395 Neighbor *in_node_nei = (*in_node_it);
4396 //double in_len = in_dad_nei->length;
4397 info.dad->updateNeighbor(info.dad_it_right, in_dad_nei);
4398 info.dad->updateNeighbor(info.dad_it_left, in_node_nei);
4399 // SOMETHING NEED TO BE DONE
4400 //in_dad->updateNeighbor(in_dad_it,
4401
4402 }
4403
4404 /****************************************************************************
4405 Approximate Likelihood Ratio Test with SH-like interpretation
4406 ****************************************************************************/
4407
4408 /*void PhyloTree::computeNNIPatternLh(double cur_lh, double &lh2, double *pattern_lh2, double &lh3, double *pattern_lh3,
4409 PhyloNode *node1, PhyloNode *node2) {
4410
4411 assert(node1->degree() == 3 && node2->degree() == 3);
4412
4413 // recompute pattern scaling factors if necessary
4414 PhyloNeighbor *node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
4415 PhyloNeighbor *node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
4416 NeighborVec::iterator it;
4417 const int IT_NUM = 6;
4418
4419 NeighborVec::iterator saved_it[IT_NUM];
4420 int id = 0;
4421
4422 FOR_NEIGHBOR(node1, node2, it)
4423 {
4424 saved_it[id++] = (*it)->node->findNeighborIt(node1);
4425 } else {
4426
4427 saved_it[id++] = it;
4428 }
4429
4430 FOR_NEIGHBOR(node2, node1, it)
4431 {
4432 saved_it[id++] = (*it)->node->findNeighborIt(node2);
4433 } else {
4434 saved_it[id++] = it;
4435 }
4436 assert(id == IT_NUM);
4437
4438 Neighbor * saved_nei[IT_NUM];
4439 // save Neighbor and allocate new Neighbor pointer
4440 for (id = 0; id < IT_NUM; id++) {
4441 saved_nei[id] = (*saved_it[id]);
4442 // NOTE BUG DOWN HERE!
4443 *saved_it[id] = new PhyloNeighbor(saved_nei[id]->node, saved_nei[id]->length); // BUG for PhyloSuperTree!
4444 ((PhyloNeighbor*) (*saved_it[id]))->partial_lh = newPartialLh();
4445 ((PhyloNeighbor*) (*saved_it[id]))->scale_num = newScaleNum();
4446 }
4447
4448 // get the Neighbor again since it is replaced for saving purpose
4449 node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
4450 node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
4451
4452 // PhyloNeighbor *node2_lastnei = NULL;
4453
4454 // save the first found neighbor of node 1 (excluding node2) in node1_it
4455 FOR_NEIGHBOR_DECLARE(node1, node2, node1_it)
4456
4457 break;
4458 Neighbor *node1_nei = *node1_it;
4459
4460 bool first = true;
4461
4462 FOR_NEIGHBOR_IT(node2, node1, node2_it) {
4463 // do the NNI swap
4464 Neighbor *node2_nei = *node2_it;
4465 node1->updateNeighbor(node1_it, node2_nei);
4466 node2_nei->node->updateNeighbor(node2, node1);
4467
4468 node2->updateNeighbor(node2_it, node1_nei);
4469 node1_nei->node->updateNeighbor(node1, node2);
4470
4471 // re-optimize five adjacent branches
4472 double old_score = -INFINITY, new_score = old_score;
4473
4474 // clear partial likelihood vector
4475 node12_it->clearPartialLh();
4476 node21_it->clearPartialLh();
4477 int i;
4478 for (i = 0; i < 2; i++) {
4479
4480 new_score = optimizeOneBranch(node1, node2, false);
4481
4482 FOR_NEIGHBOR(node1, node2, it) {
4483 //for (id = 0; id < IT_NUM; id++)
4484 //((PhyloNeighbor*)(*saved_it[id]))->clearPartialLh();
4485 ((PhyloNeighbor*) (*it)->node->findNeighbor(node1))->clearPartialLh();
4486 new_score = optimizeOneBranch(node1, (PhyloNode*) (*it)->node, false);
4487 }
4488
4489 node21_it->clearPartialLh();
4490
4491 FOR_NEIGHBOR(node2, node1, it) {
4492 //for (id = 0; id < IT_NUM; id++)
4493 //((PhyloNeighbor*)(*saved_it[id]))->clearPartialLh();
4494 ((PhyloNeighbor*) (*it)->node->findNeighbor(node2))->clearPartialLh();
4495 new_score = optimizeOneBranch(node2, (PhyloNode*) (*it)->node, false);
4496 //node2_lastnei = (PhyloNeighbor*) (*it);
4497 }
4498 node12_it->clearPartialLh();
4499 if (new_score < old_score + TOL_LIKELIHOOD) break;
4500 old_score = new_score;
4501 }
4502 saveCurrentTree(new_score); // BQM: for new bootstrap
4503
4504 //new_score = optimizeOneBranch(node1, node2, false);
4505 if (new_score > cur_lh + TOL_LIKELIHOOD)
4506 cout << "Alternative NNI shows better likelihood " << new_score << " > " << cur_lh << endl;
4507 double *result_lh;
4508 if (first) {
4509 result_lh = pattern_lh2;
4510 lh2 = new_score;
4511 } else {
4512 result_lh = pattern_lh3;
4513 lh3 = new_score;
4514 }
4515 old_score = new_score;
4516 computePatternLikelihood(result_lh);
4517 // swap back and recover the branch lengths
4518 node1->updateNeighbor(node1_it, node1_nei);
4519 node1_nei->node->updateNeighbor(node2, node1);
4520 node2->updateNeighbor(node2_it, node2_nei);
4521 node2_nei->node->updateNeighbor(node1, node2);
4522 first = false;
4523 }
4524
4525 // restore the Neighbor*
4526 for (id = 0; id < IT_NUM; id++) {
4527
4528 delete[] ((PhyloNeighbor*) *saved_it[id])->scale_num;
4529 delete[] ((PhyloNeighbor*) *saved_it[id])->partial_lh;
4530 delete (*saved_it[id]);
4531 (*saved_it[id]) = saved_nei[id];
4532 }
4533
4534 // restore the length of 4 branches around node1, node2
4535 FOR_NEIGHBOR(node1, node2, it)
4536 (*it)->length = (*it)->node->findNeighbor(node1)->length;
4537 FOR_NEIGHBOR(node2, node1, it)
4538 (*it)->length = (*it)->node->findNeighbor(node2)->length;
4539 }*/
4540
computeNNIPatternLh(double cur_lh,double & lh2,double * pattern_lh2,double & lh3,double * pattern_lh3,PhyloNode * node1,PhyloNode * node2)4541 void PhyloTree::computeNNIPatternLh(double cur_lh, double &lh2, double *pattern_lh2, double &lh3, double *pattern_lh3,
4542 PhyloNode *node1, PhyloNode *node2) {
4543 NNIMove nniMoves[2];
4544 nniMoves[0].ptnlh = pattern_lh2;
4545 nniMoves[1].ptnlh = pattern_lh3;
4546 bool nni5 = params->nni5;
4547 params->nni5 = true; // always optimize 5 branches for accurate SH-aLRT
4548 nniMoves[0].node1 = nniMoves[1].node1 = NULL;
4549 nniMoves[0].node2 = nniMoves[1].node2 = NULL;
4550 getBestNNIForBran(node1, node2, nniMoves);
4551 params->nni5 = nni5;
4552 lh2 = nniMoves[0].newloglh;
4553 lh3 = nniMoves[1].newloglh;
4554 if (max(lh2,lh3) > cur_lh + TOL_LIKELIHOOD)
4555 cout << "Alternative NNI shows better log-likelihood " << max(lh2,lh3) << " > " << cur_lh << endl;
4556 }
4557
resampleLh(double ** pat_lh,double * lh_new,int * rstream)4558 void PhyloTree::resampleLh(double **pat_lh, double *lh_new, int *rstream) {
4559 //int nsite = getAlnNSite();
4560 int nptn = getAlnNPattern();
4561 memset(lh_new, 0, sizeof(double) * 3);
4562 int i;
4563 int *boot_freq = aligned_alloc<int>(getAlnNPattern());
4564 aln->createBootstrapAlignment(boot_freq, params->bootstrap_spec, rstream);
4565 for (i = 0; i < nptn; i++) {
4566
4567 lh_new[0] += boot_freq[i] * pat_lh[0][i];
4568 lh_new[1] += boot_freq[i] * pat_lh[1][i];
4569 lh_new[2] += boot_freq[i] * pat_lh[2][i];
4570 }
4571 aligned_free(boot_freq);
4572 }
4573
4574 /*********************************************************/
4575 /** THIS FUNCTION IS TAKEN FROM PHYML source code alrt.c
4576 * Convert an aLRT statistic to a none parametric support
4577 * param in: the statistic
4578 */
4579
Statistics_To_Probabilities(double in)4580 double Statistics_To_Probabilities(double in)
4581 {
4582 double rough_value=0.0;
4583 double a=0.0;
4584 double b=0.0;
4585 double fa=0.0;
4586 double fb=0.0;
4587
4588 if(in>=0.000000393 && in<0.00000157)
4589 {
4590 a=0.000000393;
4591 b=0.00000157;
4592 fa=0.0005;
4593 fb=0.001;
4594 }
4595 else if(in>=0.00000157 && in<0.0000393)
4596 {
4597 a=0.00000157;
4598 b=0.0000393;
4599 fa=0.001;
4600 fb=0.005;
4601 }
4602 else if(in>=0.0000393 && in<0.000157)
4603 {
4604 a=0.0000393;
4605 b=0.000157;
4606 fa=0.005;
4607 fb=0.01;
4608 }
4609 else if(in>=0.000157 && in<0.000982)
4610 {
4611 a=0.000157;
4612 b=0.000982;
4613 fa=0.01;
4614 fb=0.025;
4615 }
4616 else if(in>0.000982 && in<0.00393)
4617 {
4618 a=0.000982;
4619 b=0.00393;
4620 fa=0.025;
4621 fb=0.05;
4622 }
4623 else if(in>=0.00393 && in<0.0158)
4624 {
4625 a=0.00393;
4626 b=0.0158;
4627 fa=0.05;
4628 fb=0.1;
4629 }
4630 else if(in>=0.0158 && in<0.0642)
4631 {
4632 a=0.0158;
4633 b=0.0642;
4634 fa=0.1;
4635 fb=0.2;
4636 }
4637 else if(in>=0.0642 && in<0.148)
4638 {
4639 a=0.0642;
4640 b=0.148;
4641 fa=0.2;
4642 fb=0.3;
4643 }
4644 else if(in>=0.148 && in<0.275)
4645 {
4646 a=0.148;
4647 b=0.275;
4648 fa=0.3;
4649 fb=0.4;
4650 }
4651 else if(in>=0.275 && in<0.455)
4652 {
4653 a=0.275;
4654 b=0.455;
4655 fa=0.4;
4656 fb=0.5;
4657 }
4658 else if(in>=0.455 && in<0.708)
4659 {
4660 a=0.455;
4661 b=0.708;
4662 fa=0.5;
4663 fb=0.6;
4664 }
4665 else if(in>=0.708 && in<1.074)
4666 {
4667 a=0.708;
4668 b=1.074;
4669 fa=0.6;
4670 fb=0.7;
4671 }
4672 else if(in>=1.074 && in<1.642)
4673 {
4674 a=1.074;
4675 b=1.642;
4676 fa=0.7;
4677 fb=0.8;
4678 }
4679 else if(in>=1.642 && in<2.706)
4680 {
4681 a=1.642;
4682 b=2.706;
4683 fa=0.8;
4684 fb=0.9;
4685 }
4686 else if(in>=2.706 && in<3.841)
4687 {
4688 a=2.706;
4689 b=3.841;
4690 fa=0.9;
4691 fb=0.95;
4692 }
4693 else if(in>=3.841 && in<5.024)
4694 {
4695 a=3.841;
4696 b=5.024;
4697 fa=0.95;
4698 fb=0.975;
4699 }
4700 else if(in>=5.024 && in<6.635)
4701 {
4702 a=5.024;
4703 b=6.635;
4704 fa=0.975;
4705 fb=0.99;
4706 }
4707 else if(in>=6.635 && in<7.879)
4708 {
4709 a=6.635;
4710 b=7.879;
4711 fa=0.99;
4712 fb=0.995;
4713 }
4714 else if(in>=7.879 && in<10.828)
4715 {
4716 a=7.879;
4717 b=10.828;
4718 fa=0.995;
4719 fb=0.999;
4720 }
4721 else if(in>=10.828 && in<12.116)
4722 {
4723 a=10.828;
4724 b=12.116;
4725 fa=0.999;
4726 fb=0.9995;
4727 }
4728 if (in>=12.116)
4729 {
4730 rough_value=0.9999;
4731 }
4732 else if(in<0.000000393)
4733 {
4734 rough_value=0.0001;
4735 }
4736 else
4737 {
4738 rough_value=(b-in)/(b-a)*fa + (in - a)/(b-a)*fb;
4739 }
4740 rough_value=rough_value+(1.0-rough_value)/2.0;
4741 rough_value=rough_value*rough_value*rough_value;
4742 return rough_value;
4743 }
4744
4745 // Implementation of testBranch follows Guindon et al. (2010)
4746
testOneBranch(double best_score,double * pattern_lh,int reps,int lbp_reps,PhyloNode * node1,PhyloNode * node2,double & lbp_support,double & aLRT_support,double & aBayes_support)4747 double PhyloTree::testOneBranch(double best_score, double *pattern_lh, int reps, int lbp_reps,
4748 PhyloNode *node1, PhyloNode *node2, double &lbp_support, double &aLRT_support, double &aBayes_support) {
4749 const int NUM_NNI = 3;
4750 double lh[NUM_NNI];
4751 double *pat_lh[NUM_NNI];
4752 lh[0] = best_score;
4753 pat_lh[0] = pattern_lh;
4754 int nptn = getAlnNPattern();
4755 pat_lh[1] = new double[nptn];
4756 pat_lh[2] = new double[nptn];
4757 int tmp = save_all_trees;
4758 save_all_trees = 0;
4759 computeNNIPatternLh(best_score, lh[1], pat_lh[1], lh[2], pat_lh[2], node1, node2);
4760 save_all_trees = tmp;
4761 double aLRT;
4762 if (lh[1] > lh[2])
4763 aLRT = (lh[0] - lh[1]);
4764 else
4765 aLRT = (lh[0] - lh[2]);
4766
4767 // compute parametric aLRT test support
4768 double aLRT_stat = 2*aLRT;
4769 aLRT_support = 0.0;
4770 if (aLRT_stat >= 0) {
4771 aLRT_support = Statistics_To_Probabilities(aLRT_stat);
4772 }
4773
4774 aBayes_support = 1.0 / (1.0 + exp(lh[1]-lh[0]) + exp(lh[2]-lh[0]));
4775
4776 int SH_aLRT_support = 0;
4777 int lbp_support_int = 0;
4778
4779 int times = max(reps, lbp_reps);
4780
4781 if (max(lh[1],lh[2]) == -DBL_MAX) {
4782 SH_aLRT_support = times;
4783 outWarning("Branch where both NNIs violate constraint tree will show 100% SH-aLRT support");
4784 } else
4785 #ifdef _OPENMP
4786 #pragma omp parallel
4787 {
4788 int *rstream;
4789 init_random(params->ran_seed + omp_get_thread_num(), false, &rstream);
4790 #pragma omp for reduction(+: lbp_support_int, SH_aLRT_support)
4791 #endif
4792 for (int i = 0; i < times; i++) {
4793 double lh_new[NUM_NNI];
4794 // resampling estimated log-likelihood (RELL)
4795 #ifdef _OPENMP
4796 resampleLh(pat_lh, lh_new, rstream);
4797 #else
4798 resampleLh(pat_lh, lh_new, randstream);
4799 #endif
4800 if (lh_new[0] > lh_new[1] && lh_new[0] > lh_new[2])
4801 lbp_support_int++;
4802 double cs[NUM_NNI], cs_best, cs_2nd_best;
4803 cs[0] = lh_new[0] - lh[0];
4804 cs[1] = lh_new[1] - lh[1];
4805 cs[2] = lh_new[2] - lh[2];
4806 if (cs[0] >= cs[1] && cs[0] >= cs[2]) {
4807 cs_best = cs[0];
4808 if (cs[1] > cs[2])
4809 cs_2nd_best = cs[1];
4810 else
4811 cs_2nd_best = cs[2];
4812 } else if (cs[1] >= cs[2]) {
4813 cs_best = cs[1];
4814 if (cs[0] > cs[2])
4815 cs_2nd_best = cs[0];
4816 else
4817 cs_2nd_best = cs[2];
4818 } else {
4819 cs_best = cs[2];
4820 if (cs[0] > cs[1])
4821 cs_2nd_best = cs[0];
4822 else
4823 cs_2nd_best = cs[1];
4824 }
4825 if (aLRT > (cs_best - cs_2nd_best) + 0.05)
4826 SH_aLRT_support++;
4827 }
4828 #ifdef _OPENMP
4829 finish_random(rstream);
4830 }
4831 #endif
4832 delete[] pat_lh[2];
4833 delete[] pat_lh[1];
4834
4835 lbp_support = 0.0;
4836 if (times > 0)
4837 lbp_support = ((double)lbp_support_int) / times;
4838
4839 if (times > 0)
4840 return ((double) SH_aLRT_support) / times;
4841 else
4842 return 0.0;
4843 }
4844
testAllBranches(int threshold,double best_score,double * pattern_lh,int reps,int lbp_reps,bool aLRT_test,bool aBayes_test,PhyloNode * node,PhyloNode * dad)4845 int PhyloTree::testAllBranches(int threshold, double best_score, double *pattern_lh, int reps, int lbp_reps, bool aLRT_test, bool aBayes_test,
4846 PhyloNode *node, PhyloNode *dad) {
4847 int num_low_support = 0;
4848 if (!node) {
4849 node = (PhyloNode*) root;
4850 root->neighbors[0]->node->name = "";
4851 if (isSuperTree()) {
4852 int tmp = save_all_trees;
4853 save_all_trees = 2;
4854 bool nni5 = params->nni5;
4855 params->nni5 = true; // always optimize 5 branches for accurate SH-aLRT
4856 initPartitionInfo();
4857 params->nni5 = nni5;
4858 save_all_trees = tmp;
4859 }
4860 }
4861 if (dad && !node->isLeaf() && !dad->isLeaf()) {
4862 double lbp_support, aLRT_support, aBayes_support;
4863 double SH_aLRT_support = (testOneBranch(best_score, pattern_lh, reps, lbp_reps,
4864 node, dad, lbp_support, aLRT_support, aBayes_support) * 100);
4865 ostringstream ss;
4866 ss.precision(3);
4867 ss << node->name;
4868 if (!node->name.empty())
4869 ss << "/";
4870 if (reps)
4871 ss << SH_aLRT_support;
4872 if (lbp_reps)
4873 ss << "/" << lbp_support * 100;
4874 if (aLRT_test)
4875 ss << "/" << aLRT_support;
4876 if (aBayes_test)
4877 ss << "/" << aBayes_support;
4878 node->name = ss.str();
4879 if (SH_aLRT_support < threshold)
4880 num_low_support = 1;
4881 if (((PhyloNeighbor*) node->findNeighbor(dad))->partial_pars) {
4882 ((PhyloNeighbor*) node->findNeighbor(dad))->partial_pars[0] = round(SH_aLRT_support);
4883 ((PhyloNeighbor*) dad->findNeighbor(node))->partial_pars[0] = round(SH_aLRT_support);
4884 }
4885 }
4886 FOR_NEIGHBOR_IT(node, dad, it)
4887 num_low_support += testAllBranches(threshold, best_score, pattern_lh, reps, lbp_reps, aLRT_test, aBayes_test, (PhyloNode*) (*it)->node, node);
4888
4889 return num_low_support;
4890 }
4891
4892 /****************************************************************************
4893 Collapse stable (highly supported) clades by one representative
4894 ****************************************************************************/
4895
deleteLeaf(Node * leaf)4896 void PhyloTree::deleteLeaf(Node *leaf) {
4897
4898 Node *near_node = leaf->neighbors[0]->node;
4899 ASSERT(leaf->isLeaf() && near_node->degree() == 3);
4900 Node *node1 = NULL;
4901 Node *node2 = NULL;
4902 double sum_len = 0.0;
4903
4904 FOR_NEIGHBOR_IT(near_node, leaf, it){
4905 sum_len += (*it)->length;
4906 if (!node1)
4907 node1 = (*it)->node;
4908
4909 else
4910 node2 = (*it)->node;
4911 }
4912 // make sure that the returned node1 and node2 are correct
4913 ASSERT(node1 && node2);
4914 // update the neighbor
4915 node1->updateNeighbor(near_node, node2, sum_len);
4916 node2->updateNeighbor(near_node, node1, sum_len);
4917 }
4918
reinsertLeaf(Node * leaf,Node * node,Node * dad)4919 void PhyloTree::reinsertLeaf(Node *leaf, Node *node, Node *dad) {
4920
4921 bool first = true;
4922 Node *adjacent_node = leaf->neighbors[0]->node;
4923 Neighbor *nei = node->findNeighbor(dad);
4924 //double len = nei->length;
4925 double len = max(nei->length, params->min_branch_length * 2);
4926 // to avoid too small branch length when reinserting leaf
4927
4928 FOR_NEIGHBOR_IT(adjacent_node, leaf, it){
4929 if (first) {
4930 (*it)->node = node;
4931 (*it)->length = len / 2;
4932 node->updateNeighbor(dad, adjacent_node, len / 2);
4933 } else {
4934 (*it)->node = dad;
4935 (*it)->length = len / 2;
4936 dad->updateNeighbor(node, adjacent_node, len / 2);
4937 }
4938 first = false;
4939 }
4940 }
4941
isSupportedNode(PhyloNode * node,int min_support)4942 bool PhyloTree::isSupportedNode(PhyloNode* node, int min_support) {
4943 FOR_NEIGHBOR_IT(node, NULL, it)if (!(*it)->node->isLeaf())
4944 if (((PhyloNeighbor*) * it)->partial_pars[0] < min_support) {
4945
4946 return false;
4947 }
4948 return true;
4949 }
4950
collapseStableClade(int min_support,NodeVector & pruned_taxa,StrVector & linked_name,double * & dist_mat)4951 int PhyloTree::collapseStableClade(int min_support, NodeVector &pruned_taxa, StrVector &linked_name,
4952 double* &dist_mat) {
4953 NodeVector taxa;
4954 NodeVector::iterator tax_it;
4955 StrVector::iterator linked_it;
4956 getTaxa(taxa);
4957 IntVector linked_taxid;
4958 linked_taxid.resize(leafNum, -1);
4959 int num_pruned_taxa; // global num of pruned taxa
4960 int ntaxa = leafNum;
4961 do {
4962 num_pruned_taxa = 0;
4963 for (tax_it = taxa.begin(); tax_it != taxa.end(); tax_it++)
4964 if (linked_taxid[(*tax_it)->id] < 0) {
4965 Node *taxon = (*tax_it);
4966 PhyloNode *near_node = (PhyloNode*) taxon->neighbors[0]->node;
4967 Node *adj_taxon = NULL;
4968 FOR_NEIGHBOR_DECLARE(near_node, taxon, it)
4969 if ((*it)->node->isLeaf()) {
4970 adj_taxon = (*it)->node;
4971 break;
4972 }
4973 // if it is not a cherry
4974 if (!adj_taxon)
4975 continue;
4976 ASSERT(linked_taxid[adj_taxon->id] < 0);
4977 PhyloNeighbor * near_nei = NULL;
4978 FOR_NEIGHBOR(near_node, taxon, it)
4979 if ((*it)->node != adj_taxon) {
4980 near_nei = (PhyloNeighbor*) (*it);
4981 break;
4982 }
4983 ASSERT(near_nei);
4984 // continue if the cherry is not stable, or distance between two taxa is near ZERO
4985 if (!isSupportedNode((PhyloNode*) near_nei->node, min_support)
4986 && dist_mat[taxon->id * ntaxa + adj_taxon->id] > 2e-6)
4987 continue;
4988 // now do the taxon pruning
4989 Node * pruned_taxon = taxon, *stayed_taxon = adj_taxon;
4990 // prune the taxon that is far away
4991 if (adj_taxon->neighbors[0]->length > taxon->neighbors[0]->length) {
4992 pruned_taxon = adj_taxon;
4993 stayed_taxon = taxon;
4994 }
4995 deleteLeaf(pruned_taxon);
4996 linked_taxid[pruned_taxon->id] = stayed_taxon->id;
4997 pruned_taxa.push_back(pruned_taxon);
4998 linked_name.push_back(stayed_taxon->name);
4999 num_pruned_taxa++;
5000 // do not prune more than n-4 taxa
5001 if (pruned_taxa.size() >= ntaxa - 4)
5002 break;
5003 }
5004 } while (num_pruned_taxa && pruned_taxa.size() < ntaxa - 4);
5005
5006 if (pruned_taxa.empty())
5007 return 0;
5008
5009 if (verbose_mode >= VB_MED)
5010 for (tax_it = pruned_taxa.begin(), linked_it = linked_name.begin(); tax_it != pruned_taxa.end();
5011 tax_it++, linked_it++)
5012 cout << "Delete " << (*tax_it)->name << " from " << (*linked_it) << endl;
5013
5014 // set root to the first taxon which was not deleted
5015 for (tax_it = taxa.begin(); tax_it != taxa.end(); tax_it++)
5016 if (linked_taxid[(*tax_it)->id] < 0) {
5017 root = (*tax_it);
5018 break;
5019 }
5020 // extract the sub alignment
5021 IntVector stayed_id;
5022 int i, j;
5023 for (i = 0; i < taxa.size(); i++)
5024 if (linked_taxid[i] < 0)
5025 stayed_id.push_back(i);
5026 ASSERT(stayed_id.size() + pruned_taxa.size() == leafNum);
5027 Alignment * pruned_aln = new Alignment();
5028 pruned_aln->extractSubAlignment(aln, stayed_id, 2); // at least 2 informative characters
5029 nodeNum = leafNum = stayed_id.size();
5030 initializeTree();
5031 setAlignment(pruned_aln);
5032
5033 double *pruned_dist = new double[leafNum * leafNum];
5034 for (i = 0; i < leafNum; i++)
5035 for (j = 0; j < leafNum; j++)
5036 pruned_dist[i * leafNum + j] = dist_mat[stayed_id[i] * ntaxa + stayed_id[j]];
5037 dist_mat = pruned_dist;
5038
5039 return pruned_taxa.size();
5040 }
5041
restoreStableClade(Alignment * original_aln,NodeVector & pruned_taxa,StrVector & linked_name)5042 int PhyloTree::restoreStableClade(Alignment *original_aln, NodeVector &pruned_taxa, StrVector &linked_name) {
5043 //int num_inserted_taxa;
5044 NodeVector::reverse_iterator tax_it;
5045 StrVector::reverse_iterator linked_it;
5046 tax_it = pruned_taxa.rbegin();
5047 linked_it = linked_name.rbegin();
5048 for (; tax_it != pruned_taxa.rend(); tax_it++, linked_it++) {
5049 //cout << "Reinsert " << (*tax_it)->name << " to " << (*linked_it) << endl;
5050 Node *linked_taxon = findNodeName((*linked_it));
5051 ASSERT(linked_taxon);
5052 ASSERT(linked_taxon->isLeaf());
5053 leafNum++;
5054 reinsertLeaf((*tax_it), linked_taxon, linked_taxon->neighbors[0]->node);
5055 }
5056 ASSERT(leafNum == original_aln->getNSeq());
5057 nodeNum = leafNum;
5058 initializeTree();
5059 setAlignment(original_aln);
5060 setRootNode(params->root);
5061 //if (verbose_mode >= VB_MED) drawTree(cout);
5062
5063 return 0;
5064 }
5065
checkEqualScalingFactor(double & sum_scaling,PhyloNode * node,PhyloNode * dad)5066 bool PhyloTree::checkEqualScalingFactor(double &sum_scaling, PhyloNode *node, PhyloNode *dad) {
5067 if (!node)
5068 node = (PhyloNode*) root;
5069 if (dad) {
5070 double scaling = ((PhyloNeighbor*) node->findNeighbor(dad))->lh_scale_factor
5071 + ((PhyloNeighbor*) dad->findNeighbor(node))->lh_scale_factor;
5072 if (sum_scaling > 0)
5073 sum_scaling = scaling;
5074 if (fabs(sum_scaling - scaling) > 1e-6) {
5075 cout << sum_scaling << " " << scaling << endl;
5076 return false;
5077 }
5078 }
5079 FOR_NEIGHBOR_IT(node, dad, it)if (!checkEqualScalingFactor(sum_scaling, (PhyloNode*) (*it)->node, node)) return false;
5080
5081 return true;
5082 }
5083
randomizeNeighbors(Node * node,Node * dad)5084 void PhyloTree::randomizeNeighbors(Node *node, Node *dad) {
5085
5086 if (!node)
5087 node = root;
5088 FOR_NEIGHBOR_IT(node, dad, it)randomizeNeighbors((*it)->node, node);
5089
5090 my_random_shuffle(node->neighbors.begin(), node->neighbors.end());
5091 }
5092
printTransMatrices(Node * node,Node * dad)5093 void PhyloTree::printTransMatrices(Node *node, Node *dad) {
5094 if (!node)
5095 node = root;
5096 int nstates = aln->num_states;
5097
5098 if (dad) {
5099 double *trans_cat = new double[nstates * nstates];
5100 model_factory->computeTransMatrix(dad->findNeighbor(node)->length * site_rate->getRate(0), trans_cat);
5101 cout << "Transition matrix " << dad->name << " to " << node->name << endl;
5102 for (int i = 0; i < nstates; i++) {
5103 for (int j = 0; j < nstates; j++) {
5104 cout << "\t" << trans_cat[i * nstates + j];
5105 }
5106 cout << endl;
5107 }
5108 delete[] trans_cat;
5109 }
5110 FOR_NEIGHBOR_IT(node, dad, it)printTransMatrices((*it)->node, node);
5111 }
5112
removeIdenticalSeqs(Params & params)5113 void PhyloTree::removeIdenticalSeqs(Params ¶ms) {
5114 // commented out because it also works for SuperAlignment now!
5115 Alignment *new_aln;
5116 // 2017-03-31: always keep two identical sequences no matter if -bb or not, to avoid conflict between 2 subsequent runs
5117 if (params.root)
5118 new_aln = aln->removeIdenticalSeq((string)params.root, true, removed_seqs, twin_seqs);
5119 else
5120 new_aln = aln->removeIdenticalSeq("", true, removed_seqs, twin_seqs);
5121 if (removed_seqs.size() > 0) {
5122 cout << "NOTE: " << removed_seqs.size() << " identical sequences (see below) will be ignored for subsequent analysis" << endl;
5123 for (int i = 0; i < removed_seqs.size(); i++) {
5124 cout << "NOTE: " << removed_seqs[i] << " (identical to " << twin_seqs[i] << ") is ignored but added at the end" << endl;
5125 }
5126 delete aln;
5127 aln = new_aln;
5128 }
5129 if (!constraintTree.empty()) {
5130 int count = constraintTree.removeTaxa(removed_seqs);
5131 if (count)
5132 cout << count << " taxa removed from constraint tree" << endl;
5133 }
5134 }
5135
reinsertIdenticalSeqs(Alignment * orig_aln)5136 void PhyloTree::reinsertIdenticalSeqs(Alignment *orig_aln) {
5137 if (removed_seqs.empty()) return;
5138
5139 insertTaxa(removed_seqs, twin_seqs);
5140 setAlignment(orig_aln);
5141 // delete all partial_lh, which will be automatically recreated later
5142 deleteAllPartialLh();
5143 clearAllPartialLH();
5144 }
5145
computeSeqIdentityAlongTree(Split & sp,Node * node,Node * dad)5146 void PhyloTree::computeSeqIdentityAlongTree(Split &sp, Node *node, Node *dad) {
5147 ASSERT(node && !node->isLeaf());
5148 // recursive
5149 FOR_NEIGHBOR_IT(node, dad, it) {
5150 if ((*it)->node->isLeaf()) {
5151 sp.addTaxon((*it)->node->id);
5152 } else {
5153 Split newsp(leafNum);
5154 computeSeqIdentityAlongTree(newsp, (*it)->node, node);
5155 sp += newsp;
5156 }
5157 }
5158 if (!dad) return;
5159 // now going along alignment to compute seq identity
5160 int ident = 0, nseqs = aln->getNSeq();
5161 for (Alignment::iterator it = aln->begin(); it != aln->end(); it++) {
5162 char state = aln->STATE_UNKNOWN;
5163 bool is_const = true;
5164 for (int i = 0; i != nseqs; i++)
5165 if ((*it)[i] < aln->num_states && sp.containTaxon(i)) {
5166 if (state < aln->num_states && state != (*it)[i]) {
5167 is_const = false;
5168 break;
5169 }
5170 state = (*it)[i];
5171 }
5172 if (is_const)
5173 ident += it->frequency;
5174 }
5175 ident = (ident*100)/aln->getNSite();
5176 if (node->name == "")
5177 node->name = convertIntToString(ident);
5178 else
5179 node->name += "/" + convertIntToString(ident);
5180 }
5181
computeSeqIdentityAlongTree()5182 void PhyloTree::computeSeqIdentityAlongTree() {
5183 Split sp(leafNum);
5184 if (root->isLeaf())
5185 computeSeqIdentityAlongTree(sp, root->neighbors[0]->node);
5186 else
5187 computeSeqIdentityAlongTree(sp, root);
5188 }
5189
generateRandomTree(TreeGenType tree_type)5190 void PhyloTree::generateRandomTree(TreeGenType tree_type) {
5191 if (!constraintTree.empty() && tree_type != YULE_HARDING)
5192 outError("Only Yule-Harding ramdom tree supported with constraint tree");
5193 ASSERT(aln);
5194 int orig_size = params->sub_size;
5195 params->sub_size = aln->getNSeq();
5196 MExtTree ext_tree;
5197 if (constraintTree.empty()) {
5198 switch (tree_type) {
5199 case YULE_HARDING:
5200 ext_tree.generateYuleHarding(*params);
5201 break;
5202 case UNIFORM:
5203 ext_tree.generateUniform(params->sub_size);
5204 break;
5205 case CATERPILLAR:
5206 ext_tree.generateCaterpillar(params->sub_size);
5207 break;
5208 case BALANCED:
5209 ext_tree.generateBalanced(params->sub_size);
5210 break;
5211 case STAR_TREE:
5212 ext_tree.generateStarTree(*params);
5213 break;
5214 default:
5215 break;
5216 }
5217 NodeVector taxa;
5218 ext_tree.getTaxa(taxa);
5219 ASSERT(taxa.size() == aln->getNSeq());
5220 for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++)
5221 (*it)->name = aln->getSeqName((*it)->id);
5222 } else {
5223 ext_tree.generateConstrainedYuleHarding(*params, &constraintTree, aln->getSeqNames());
5224 }
5225 params->sub_size = orig_size;
5226 stringstream str;
5227 ext_tree.printTree(str);
5228 PhyloTree::readTreeStringSeqName(str.str());
5229 }
5230
computeBranchDirection(PhyloNode * node,PhyloNode * dad)5231 void PhyloTree::computeBranchDirection(PhyloNode *node, PhyloNode *dad) {
5232 if (!node) {
5233 node = (PhyloNode*)root;
5234 }
5235 if (dad)
5236 ((PhyloNeighbor*)node->findNeighbor(dad))->direction = TOWARD_ROOT;
5237 FOR_NEIGHBOR_IT(node, dad, it) {
5238 // do not update if direction was already computed
5239 ASSERT(((PhyloNeighbor*)*it)->direction != TOWARD_ROOT);
5240 if (((PhyloNeighbor*)*it)->direction != UNDEFINED_DIRECTION)
5241 continue;
5242 // otherwise undefined.
5243 ((PhyloNeighbor*)*it)->direction = AWAYFROM_ROOT;
5244 computeBranchDirection((PhyloNode*)(*it)->node, node);
5245 }
5246 }
5247
clearBranchDirection(PhyloNode * node,PhyloNode * dad)5248 void PhyloTree::clearBranchDirection(PhyloNode *node, PhyloNode *dad) {
5249 if (!node)
5250 node = (PhyloNode*)root;
5251 if (dad)
5252 ((PhyloNeighbor*)node->findNeighbor(dad))->direction = UNDEFINED_DIRECTION;
5253 FOR_NEIGHBOR_IT(node, dad, it) {
5254 ((PhyloNeighbor*)*it)->direction = UNDEFINED_DIRECTION;
5255 clearBranchDirection((PhyloNode*)(*it)->node, node);
5256 }
5257
5258 }
5259
5260 /*
5261 void PhyloTree::sortNeighborBySubtreeSize(PhyloNode *node, PhyloNode *dad) {
5262
5263 // already sorted, return
5264 PhyloNeighbor *nei = (PhyloNeighbor*)dad->findNeighbor(node);
5265 if (nei->size >= 1)
5266 return;
5267
5268 if (dad && node->isLeaf()) {
5269 nei->size = 1;
5270 return;
5271 }
5272
5273 nei->size = 0;
5274 FOR_NEIGHBOR_DECLARE(node, dad, it) {
5275 sortNeighborBySubtreeSize((PhyloNode*)(*it)->node, node);
5276 nei->size += ((PhyloNeighbor*)*it)->size;
5277 }
5278
5279 // sort neighbors in descending order of sub-tree size
5280 FOR_NEIGHBOR(node, dad, it)
5281 for (NeighborVec::iterator it2 = it+1; it2 != node->neighbors.end(); it2++)
5282 if ((*it2)->node != dad && ((PhyloNeighbor*)*it)->size < ((PhyloNeighbor*)*it2)->size) {
5283 Neighbor *nei;
5284 nei = *it;
5285 *it = *it2;
5286 *it2 = nei;
5287 }
5288 }
5289 */
5290
convertToRooted()5291 void PhyloTree::convertToRooted() {
5292 ASSERT(leafNum == aln->getNSeq());
5293 Node *node, *dad;
5294 double node_len, dad_len;
5295 if (params->root) {
5296 string name = params->root;
5297 node = findNodeName(name);
5298 if (!node)
5299 outError("Cannot find leaf with name " + name);
5300 ASSERT(node->isLeaf());
5301 dad = node->neighbors[0]->node;
5302 node_len = dad_len = node->neighbors[0]->length*0.5;
5303 } else {
5304 //midpoint rooting
5305 Node *node1, *node2;
5306 double longest = root->longestPath2(node1, node2);
5307 longest *= 0.5;
5308 double curlen = 0.0;
5309 for (; node1 != node2 && curlen + node1->highestNei->length < longest; node1 = node1->highestNei->node) {
5310 curlen += node1->highestNei->length;
5311 }
5312 // place root on node1->heigestNei
5313 node = node1;
5314 dad = node1->highestNei->node;
5315 node_len = longest - curlen;
5316 dad_len = node1->highestNei->length - node_len;
5317 ASSERT(dad_len >= 0.0);
5318 /*
5319 // place root to the longest branch
5320 NodeVector nodes1, nodes2;
5321 getBranches(nodes1, nodes2);
5322 double max_length = -1.0;
5323 for (int i = 0; i < nodes1.size(); i++)
5324 if (max_length < nodes1[i]->findNeighbor(nodes2[i])->length) {
5325 max_length = nodes1[i]->findNeighbor(nodes2[i])->length;
5326 node = nodes1[i];
5327 dad = nodes2[i];
5328 }
5329 */
5330 }
5331 rooted = true;
5332 root = newNode(leafNum, ROOT_NAME);
5333 Node *root_int = newNode();
5334 root->addNeighbor(root_int, 0.0);
5335 root_int->addNeighbor(root, 0.0);
5336 leafNum++;
5337 //double newlen = node->findNeighbor(dad)->length/2.0;
5338 node->updateNeighbor(dad, root_int, node_len);
5339 root_int->addNeighbor(node, node_len);
5340 dad->updateNeighbor(node, root_int, dad_len);
5341 root_int->addNeighbor(dad, dad_len);
5342 initializeTree();
5343 computeBranchDirection();
5344 current_it = current_it_back = NULL;
5345 }
5346
convertToUnrooted()5347 void PhyloTree::convertToUnrooted() {
5348 ASSERT(rooted);
5349 if (aln)
5350 ASSERT(leafNum == aln->getNSeq()+1);
5351 ASSERT(root);
5352 ASSERT(root->isLeaf() && root->id == leafNum-1);
5353 Node *node = root->neighbors[0]->node;
5354 Node *taxon = findFirstTaxon();
5355
5356 rooted = false;
5357 leafNum--;
5358
5359 // delete root node
5360 if (node->degree() == 3) {
5361 // delete and join adjacent branches
5362 Node *node1 = NULL, *node2 = NULL;
5363 double len = 0.0;
5364 FOR_NEIGHBOR_IT(node, root, it) {
5365 if (!node1) node1 = (*it)->node; else node2 = (*it)->node;
5366 len += (*it)->length;
5367 }
5368 node1->updateNeighbor(node, node2, len);
5369 node2->updateNeighbor(node, node1, len);
5370 delete node;
5371 } else {
5372 // only delete root node
5373 auto it = node->findNeighborIt(root);
5374 delete *it;
5375 node->neighbors.erase(it);
5376
5377 }
5378
5379 delete root;
5380 // set a temporary taxon so that tree traversal works
5381 root = taxon;
5382
5383 if (params)
5384 setRootNode(params->root);
5385
5386 initializeTree();
5387 clearBranchDirection();
5388 // computeBranchDirection();
5389 }
5390
reorientPartialLh(PhyloNeighbor * dad_branch,Node * dad)5391 void PhyloTree::reorientPartialLh(PhyloNeighbor* dad_branch, Node *dad) {
5392 ASSERT(!isSuperTree());
5393 if (dad_branch->partial_lh)
5394 return;
5395 Node * node = dad_branch->node;
5396 FOR_NEIGHBOR_IT(node, dad, it) {
5397 PhyloNeighbor *backnei = (PhyloNeighbor*)(*it)->node->findNeighbor(node);
5398 if (backnei->partial_lh) {
5399 mem_slots.takeover(dad_branch, backnei);
5400 break;
5401 }
5402 }
5403 if (params->lh_mem_save == LM_PER_NODE)
5404 ASSERT(dad_branch->partial_lh && "partial_lh is not re-oriented");
5405 }
5406
5407 /****************************************************************************
5408 helper functions for computing tree traversal
5409 ****************************************************************************/
5410
computeTraversalInfo(PhyloNeighbor * dad_branch,PhyloNode * dad,double * & buffer)5411 bool PhyloTree::computeTraversalInfo(PhyloNeighbor *dad_branch, PhyloNode *dad, double* &buffer) {
5412
5413 size_t nstates = aln->num_states;
5414 PhyloNode *node = (PhyloNode*)dad_branch->node;
5415
5416 if ((dad_branch->partial_lh_computed & 1) || node->isLeaf()) {
5417 return mem_slots.lock(dad_branch);
5418 }
5419
5420
5421 size_t num_leaves = 0;
5422 bool locked[node->degree()];
5423 memset(locked, 0, node->degree());
5424
5425 // sort neighbor in desceding size order
5426 NeighborVec neivec = node->neighbors;
5427 NeighborVec::iterator it, i2;
5428 for (it = neivec.begin(); it != neivec.end(); it++)
5429 for (i2 = it+1; i2 != neivec.end(); i2++)
5430 if (((PhyloNeighbor*)*it)->size < ((PhyloNeighbor*)*i2)->size) {
5431 Neighbor *nei = *it;
5432 *it = *i2;
5433 *i2 = nei;
5434 }
5435
5436
5437 // recursive
5438 for (it = neivec.begin(); it != neivec.end(); it++)
5439 if ((*it)->node != dad) {
5440 locked[it - neivec.begin()] = computeTraversalInfo((PhyloNeighbor*)(*it), node, buffer);
5441 if ((*it)->node->isLeaf())
5442 num_leaves++;
5443 }
5444 dad_branch->partial_lh_computed |= 1;
5445
5446 // prepare information for this branch
5447 TraversalInfo info(dad_branch, dad);
5448 info.echildren = info.partial_lh_leaves = NULL;
5449
5450 // re-orient partial_lh
5451 reorientPartialLh(dad_branch, dad);
5452
5453 if (!dad_branch->partial_lh || mem_slots.locked(dad_branch)) {
5454 // still no free entry found, memory saving technique
5455 int slot_id = mem_slots.allocate(dad_branch);
5456 if (slot_id < 0) {
5457 cout << "traversal order:";
5458 for (auto it = traversal_info.begin(); it != traversal_info.end(); it++) {
5459 it->dad_branch->node->name = convertIntToString(it->dad_branch->size);
5460 cout << " ";
5461 if (it->dad->isLeaf())
5462 cout << it->dad->name;
5463 else
5464 cout << it->dad->id;
5465 cout << "->";
5466 if (it->dad_branch->node->isLeaf())
5467 cout << it->dad_branch->node->name;
5468 else
5469 cout << it->dad_branch->node->id;
5470 if (params->lh_mem_save == LM_MEM_SAVE) {
5471 if (it->dad_branch->partial_lh_computed)
5472 cout << " [";
5473 else
5474 cout << " (";
5475 cout << mem_slots.findNei(it->dad_branch) - mem_slots.begin();
5476 if (it->dad_branch->partial_lh_computed)
5477 cout << "]";
5478 else
5479 cout << ")";
5480 }
5481 }
5482 cout << endl;
5483 drawTree(cout);
5484 ASSERT(0 && "No free/unlocked mem slot found!");
5485 }
5486 } else
5487 mem_slots.update(dad_branch);
5488
5489 if (verbose_mode >= VB_MED && params->lh_mem_save == LM_MEM_SAVE) {
5490 int slot_id = mem_slots.findNei(dad_branch) - mem_slots.begin();
5491 node->name = convertIntToString(slot_id);
5492 cout << "Branch " << dad->id << "-" << node->id << " assigned slot " << slot_id << endl;
5493 }
5494
5495 if (params->lh_mem_save == LM_MEM_SAVE) {
5496 for (it = neivec.begin(); it != neivec.end(); it++)
5497 if ((*it)->node != dad) {
5498 if (!(*it)->node->isLeaf() && locked[it-neivec.begin()])
5499 mem_slots.unlock((PhyloNeighbor*)*it);
5500 }
5501 }
5502
5503 if (!model->isSiteSpecificModel() && !Params::getInstance().buffer_mem_save) {
5504 //------- normal model -----
5505 info.echildren = buffer;
5506 size_t block = nstates * ((model_factory->fused_mix_rate) ? site_rate->getNRate() : site_rate->getNRate()*model->getNMixtures());
5507 buffer += get_safe_upper_limit(block*nstates*(node->degree()-1));
5508 if (num_leaves) {
5509 info.partial_lh_leaves = buffer;
5510 buffer += get_safe_upper_limit((aln->STATE_UNKNOWN+1)*block*num_leaves);
5511 }
5512 }
5513
5514 traversal_info.push_back(info);
5515 return mem_slots.lock(dad_branch);
5516 }
5517
writeSiteLh(ostream & out,SiteLoglType wsl,int partid)5518 void PhyloTree::writeSiteLh(ostream &out, SiteLoglType wsl, int partid) {
5519 // error checking
5520 if (!getModel()->isMixture()) {
5521 if (wsl != WSL_RATECAT) {
5522 outWarning("Switch now to '-wslr' as it is the only option for non-mixture model");
5523 wsl = WSL_RATECAT;
5524 }
5525 } else {
5526 // mixture model
5527 if (wsl == WSL_MIXTURE_RATECAT && getModelFactory()->fused_mix_rate) {
5528 outWarning("-wslmr is not suitable for fused mixture model, switch now to -wslm");
5529 wsl = WSL_MIXTURE;
5530 }
5531 }
5532 size_t i, nsites = getAlnNSite();
5533 size_t ncat = getNumLhCat(wsl);
5534 double *pattern_lh, *pattern_lh_cat;
5535 pattern_lh = aligned_alloc<double>(getAlnNPattern());
5536 pattern_lh_cat = aligned_alloc<double>(getAlnNPattern()*ncat);
5537 computePatternLikelihood(pattern_lh, NULL, pattern_lh_cat, wsl);
5538 for (i = 0; i < nsites; i++) {
5539 if (partid >= 0)
5540 out << partid << "\t";
5541 size_t ptn = aln->getPatternID(i);
5542 out << i+1 << "\t" << pattern_lh[ptn];
5543 for (int j = 0; j < ncat; j++) {
5544 out << "\t" << pattern_lh_cat[ptn*ncat+j];
5545 }
5546 out << endl;
5547 }
5548 aligned_free(pattern_lh_cat);
5549 aligned_free(pattern_lh);
5550 }
5551
writeSiteRates(ostream & out,bool bayes,int partid)5552 void PhyloTree::writeSiteRates(ostream &out, bool bayes, int partid) {
5553 DoubleVector pattern_rates;
5554 IntVector pattern_cat;
5555 int ncategory = 1;
5556
5557 if (bayes)
5558 ncategory = site_rate->computePatternRates(pattern_rates, pattern_cat);
5559 else
5560 optimizePatternRates(pattern_rates);
5561
5562 if (pattern_rates.empty()) return;
5563 int nsite = aln->getNSite();
5564 int i;
5565
5566 out.setf(ios::fixed,ios::floatfield);
5567 out.precision(5);
5568 //cout << __func__ << endl;
5569 IntVector count;
5570 count.resize(ncategory, 0);
5571 for (i = 0; i < nsite; i++) {
5572 int ptn = aln->getPatternID(i);
5573 if (partid >= 0)
5574 out << partid << "\t";
5575 out << i+1 << "\t";
5576 if (pattern_rates[ptn] >= MAX_SITE_RATE) out << "100.0"; else out << pattern_rates[ptn];
5577 //cout << i << " "<< ptn << " " << pattern_cat[ptn] << endl;
5578 if (!pattern_cat.empty()) {
5579 int site_cat;
5580 double cat_rate;
5581 if (site_rate->getPInvar() == 0.0) {
5582 site_cat = pattern_cat[ptn]+1;
5583 cat_rate = site_rate->getRate(pattern_cat[ptn]);
5584 } else {
5585 site_cat = pattern_cat[ptn];
5586 if (site_cat == 0)
5587 cat_rate = 0.0;
5588 else
5589 cat_rate = site_rate->getRate(pattern_cat[ptn]-1);
5590 }
5591 out << "\t" << site_cat << "\t" << cat_rate;
5592 count[pattern_cat[ptn]]++;
5593 }
5594 out << endl;
5595 }
5596 if (bayes) {
5597 cout << "Empirical proportions for each category:";
5598 for (i = 0; i < count.size(); i++)
5599 cout << " " << ((double)count[i])/nsite;
5600 cout << endl;
5601 }
5602 }
5603
writeBranches(ostream & out)5604 void PhyloTree::writeBranches(ostream &out) {
5605 outError("Please only use this feature with partition model");
5606 }
5607
5608