1 /***************************************************************************
2 * Copyright (C) 2009-2015 by *
3 * BUI Quang Minh <minh.bui@univie.ac.at> *
4 * Lam-Tung Nguyen <nltung@gmail.com> *
5 * *
6 * *
7 * This program is free software; you can redistribute it and/or modify *
8 * it under the terms of the GNU General Public License as published by *
9 * the Free Software Foundation; either version 2 of the License, or *
10 * (at your option) any later version. *
11 * *
12 * This program is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU General Public License for more details. *
16 * *
17 * You should have received a copy of the GNU General Public License *
18 * along with this program; if not, write to the *
19 * Free Software Foundation, Inc., *
20 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
21 ***************************************************************************/
22
23 #ifndef PHYLOTREE_H
24 #define PHYLOTREE_H
25 //#define NDEBUG
26 // comented out this for Mac
27
28 // PLEASE DONT TOUCH THESE VARIABLES ANYMORE!
29 #define EIGEN_NO_AUTOMATIC_RESIZING
30 //#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 32 // PLEASE DONT TOUCH THESE VARIABLES ANYMORE!
31 //#define EIGEN_UNROLLING_LIMIT 1000 // PLEASE DONT TOUCH THESE VARIABLES ANYMORE!
32
33 //#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (512*256)
34 //#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (8*512*512)
35 //#include <Eigen/Core>
36 #include "mtree.h"
37 #include "alignment/alignment.h"
38 #include "model/modelsubst.h"
39 #include "model/modelfactory.h"
40 #include "phylonode.h"
41 #include "utils/optimization.h"
42 #include "model/rateheterogeneity.h"
43 #include "candidateset.h"
44 #include "pll/pll.h"
45 #include "utils/checkpoint.h"
46 #include "constrainttree.h"
47 #include "memslot.h"
48
49 #define BOOT_VAL_FLOAT
50 #define BootValType float
51 //#define BootValType double
52
53 enum CostMatrixType {CM_UNIFORM, CM_LINEAR};
54
55 //extern int instruction_set;
56
57 #define SAFE_LH true // safe likelihood scaling to avoid numerical underflow for ultra large trees
58 #define NORM_LH false // normal likelihood scaling
59
60 const double TOL_BRANCH_LEN = 0.000001; // NEVER TOUCH THIS CONSTANT AGAIN PLEASE!
61 const double TOL_LIKELIHOOD = 0.001; // NEVER TOUCH THIS CONSTANT AGAIN PLEASE!
62 const double TOL_LIKELIHOOD_PARAMOPT = 0.001; // BQM: newly introduced for ModelFactory::optimizeParameters
63 //const static double SCALING_THRESHOLD = sqrt(DBL_MIN);
64 //const static double SCALING_THRESHOLD = 1e-100;
65 //const static double SCALING_THRESHOLD_INVER = 1 / SCALING_THRESHOLD;
66 //const static double LOG_SCALING_THRESHOLD = log(SCALING_THRESHOLD);
67 //#include "pll/pll.h"
68 // 2^256
69 //#define SCALING_THRESHOLD_INVER 115792089237316195423570985008687907853269984665640564039457584007913129639936.0
70 #define SCALING_THRESHOLD_EXP 256
71 //#define SCALING_THRESHOLD (1.0/SCALING_THRESHOLD_INVER)
72 // 2^{-256}
73 #define SCALING_THRESHOLD 8.636168555094444625386e-78
74 //#define SCALING_THRESHOLD ldexp(1.0, -256)
75 //#define LOG_SCALING_THRESHOLD log(SCALING_THRESHOLD)
76 #define LOG_SCALING_THRESHOLD -177.4456782233459932741
77
78 const int SPR_DEPTH = 2;
79
80 //using namespace Eigen;
81
82 template< class T>
aligned_alloc(size_t size)83 inline T *aligned_alloc(size_t size) {
84 size_t MEM_ALIGNMENT = (Params::getInstance().SSE >= LK_AVX512) ? 64 : ((Params::getInstance().SSE >= LK_AVX) ? 32 : 16);
85 void *mem;
86
87 #if defined WIN32 || defined _WIN32 || defined __WIN32__
88 #if (defined(__MINGW32__) || defined(__clang__)) && defined(BINARY32)
89 mem = __mingw_aligned_malloc(size*sizeof(T), MEM_ALIGNMENT);
90 #else
91 mem = _aligned_malloc(size*sizeof(T), MEM_ALIGNMENT);
92 #endif
93 #else
94 int res = posix_memalign(&mem, MEM_ALIGNMENT, size*sizeof(T));
95 if (res == ENOMEM) {
96 #if (defined(__GNUC__) || defined(__clang__)) && !defined(WIN32) && !defined(__CYGWIN__)
97 print_stacktrace(cerr);
98 #endif
99 outError("Not enough memory, allocation of " + convertInt64ToString(size*sizeof(T)) + " bytes failed (bad_alloc)");
100 }
101 #endif
102 if (mem == NULL) {
103 #if (defined(__GNUC__) || defined(__clang__)) && !defined(WIN32) && !defined(__CYGWIN__)
104 print_stacktrace(cerr);
105 #endif
106 outError("Not enough memory, allocation of " + convertInt64ToString(size*sizeof(T)) + " bytes failed (bad_alloc)");
107 }
108 return (T*)mem;
109 }
110
aligned_free(void * mem)111 inline void aligned_free(void* mem) {
112 #if defined WIN32 || defined _WIN32 || defined __WIN32__
113 #if (defined(__MINGW32__) || defined(__clang__)) && defined(BINARY32)
114 __mingw_aligned_free(mem);
115 #else
116 _aligned_free(mem);
117 #endif
118 #else
119 free(mem);
120 #endif
121 }
122
123
124 /**
125 * Row Major Array For Eigen
126 */
127 //typedef Array<double, Dynamic, Dynamic, RowMajor> RowMajorArrayXXd;
128
129
130 typedef std::map< string, double > StringDoubleMap;
131 typedef std::map< int, PhyloNode* > IntPhyloNodeMap;
132
133 /*
134 #define MappedMat(NSTATES) Map<Matrix<double, NSTATES, NSTATES> >
135 #define MappedArr2D(NSTATES) Map<Array<double, NSTATES, NSTATES> >
136 #define MappedRowVec(NSTATES) Map<Matrix<double, 1, NSTATES> >
137 #define MappedVec(NSTATES) Map<Matrix<double, NSTATES, 1> >
138 #define Matrix(NSTATES) Matrix<double, NSTATES, NSTATES>
139 #define RowVector(NSTATES) Matrix<double, 1, NSTATES>
140 #define MappedRowArr2DDyn Map<Array<double, Dynamic, Dynamic, RowMajor> >
141 #define MappedArrDyn Map<Array<double, Dynamic, 1> >
142 #define MappedVecDyn(NSTATES) Map<Matrix<double, Dynamic, NSTATES> >
143 */
144
145 const int MAX_SPR_MOVES = 20;
146
147 struct NNIMove {
148
149 // Two nodes representing the central branch
150 PhyloNode *node1, *node2;
151
152 // Roots of the two subtree that are swapped
153 NeighborVec::iterator node1Nei_it, node2Nei_it;
154
155 // log-likelihood of the tree after applying the NNI
156 double newloglh;
157
158 int swap_id;
159
160 // new branch lengths of 5 branches corresponding to the NNI
161 DoubleVector newLen[5];
162
163 // pattern likelihoods
164 double *ptnlh;
165
166 bool operator<(const NNIMove & rhs) const {
167 return newloglh > rhs.newloglh;
168 }
169 };
170
171 /**
172 an SPR move.
173 */
174 struct SPRMove {
175 PhyloNode *prune_dad;
176 PhyloNode *prune_node;
177 PhyloNode *regraft_dad;
178 PhyloNode *regraft_node;
179 double score;
180 };
181
182 struct SPR_compare {
183
operatorSPR_compare184 bool operator()(SPRMove s1, SPRMove s2) const {
185 return s1.score > s2.score;
186 }
187 };
188
189 class SPRMoves : public set<SPRMove, SPR_compare> {
190 public:
191 void add(PhyloNode *prune_node, PhyloNode *prune_dad,
192 PhyloNode *regraft_node, PhyloNode *regraft_dad, double score);
193 };
194
195 /*
196 left_node-----------dad-----------right_node
197 |
198 |
199 |inline
200 node
201 */
202 struct PruningInfo {
203 NeighborVec::iterator dad_it_left, dad_it_right, left_it, right_it;
204 Neighbor *dad_nei_left, *dad_nei_right, *left_nei, *right_nei;
205 Node *node, *dad, *left_node, *right_node;
206 double left_len, right_len;
207 double *dad_lh_left, *dad_lh_right;
208
209 };
210
211 /**
212 * This Structure is used in PhyloSuperTreePlen.
213 */
214 struct SwapNNIParam {
215 double nni1_score;
216 double nni1_brlen;
217 double nni2_score;
218 double nni2_brlen;
219 Neighbor* node1_nei;
220 Neighbor* node2_nei;
221 double *nni1_ptnlh;
222 double *nni2_ptnlh;
223 };
224
225
226 struct LeafFreq {
227 int leaf_id;
228
229 int freq;
230
231 bool operator<(const LeafFreq & rhs) const {
232 return ( freq < rhs.freq);
233 }
234 };
235
236
237 // **********************************************
238 // BEGIN definitions for likelihood mapping (HAS)
239 // **********************************************
240
241 /* maximum exp difference, such that 1.0+exp(-TP_MAX_EXP_DIFF) == 1.0 */
242 const double TP_MAX_EXP_DIFF = 40.0;
243
244 /* Index definition for counter array needed in likelihood mapping analysis (HAS) */
245 #define LM_REG1 0
246 #define LM_REG2 1
247 #define LM_REG3 2
248 #define LM_REG4 3
249 #define LM_REG5 4
250 #define LM_REG6 5
251 #define LM_REG7 6
252 #define LM_AR1 7
253 #define LM_AR2 8
254 #define LM_AR3 9
255 #define LM_MAX 10
256
257 struct QuartetGroups{
258 int numGroups; // number of clusters:
259 // 0: not initialized, default -> 1
260 // 1: no clusters - any (a,b)|(c,d)
261 // 2: 2 clusters - (a,a')|(b,b')
262 // 3: 3 clusters - (a,a')|(b,c) [rare]
263 // 4: 4 clusters - (a,b)|(c,d)
264 int numSeqs; // number of seqs in alignment (should be #A+#B+#C+#D+#X)
265 int numQuartSeqs; // number of seqs in analysis (should be #A+#B+#C+#D)
266 int numGrpSeqs[5]; // number of seqs in cluster A, B, C, D, and X (exclude)
267 int64_t uniqueQuarts; // number of existing unique quartets for this grouping
268 string Name[5]; // seqIDs of cluster A
269 vector<int> GroupA; // seqIDs of cluster A
270 vector<int> GroupB; // seqIDs of cluster B
271 vector<int> GroupC; // seqIDs of cluster C
272 vector<int> GroupD; // seqIDs of cluster D
273 vector<int> GroupX; // seqIDs of cluster X
274 };
275
276 struct QuartetInfo {
277 int seqID[4];
278 double logl[3]; // log-lh for {0,1}|{2,3} {0,2}|{1,3} {0,3}|{1,4}
279 double qweight[3]; // weight for {0,1}|{2,3} {0,2}|{1,3} {0,3}|{1,4}
280 int corner; // for the 3 corners of the simplex triangle (0:top, 1:right, 2:left)
281 int area; // for the 7 areas of the simplex triangle
282 // corners (0:top, 1:right, 2:left), rectangles (3:right, 4:left, 5:bottom), 6:center
283 };
284
285 struct SeqQuartetInfo {
286 int64_t countarr[LM_MAX]; // the 7 areas of the simplex triangle [0-6; corners (0:top, 1:right, 2:left), rectangles (3:right, 4:left, 5:bottom), 6:center] and the 3 corners [7-9; 7:top, 8:right, 9:left]
287 };
288
289 // ********************************************
290 // END definitions for likelihood mapping (HAS)
291 // ********************************************
292
293
294 // ********************************************
295 // BEGIN traversal information
296 // ********************************************
297
298 class TraversalInfo {
299 public:
300 PhyloNeighbor *dad_branch;
301 PhyloNode *dad;
302 double *echildren;
303 double *partial_lh_leaves;
304
TraversalInfo(PhyloNeighbor * dad_branch,PhyloNode * dad)305 TraversalInfo(PhyloNeighbor *dad_branch, PhyloNode *dad) {
306 this->dad = dad;
307 this->dad_branch = dad_branch;
308 }
309 };
310
311 // ********************************************
312 // END traversal information
313 // ********************************************
314
315
316 /**
317 Phylogenetic Tree class
318
319 @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui@univie.ac.at>
320 */
321 class PhyloTree : public MTree, public Optimization, public CheckpointFactory {
322
323 friend class PhyloSuperTree;
324 friend class PhyloSuperTreePlen;
325 friend class RateGamma;
326 friend class RateGammaInvar;
327 friend class RateKategory;
328 friend class ModelMixture;
329 friend class RateFree;
330 friend class RateHeterotachy;
331 friend class PhyloTreeMixlen;
332 friend class ModelFactoryMixlen;
333 friend class MemSlotVector;
334 friend class ModelFactory;
335
336 public:
337 /**
338 default constructor ( everything is initialized to NULL)
339 */
340 PhyloTree();
341
342 // EIGEN_MAKE_ALIGNED_OPERATOR_NEW
343
344 /**
345 * Constructor with given alignment
346 * @param alignment
347 */
348 PhyloTree(Alignment *aln);
349
350 /**
351 * Create a phylotree from the tree string and assign alignment.
352 * Taxa IDs are numbered according to their orders in the alignment.
353 */
354 PhyloTree(string& treeString, Alignment *aln, bool isRooted);
355
356 void init();
357
358 /**
359 destructor
360 */
361 virtual ~PhyloTree();
362
363 /**
364 start structure for checkpointing
365 */
366 virtual void startCheckpoint();
367
368 /**
369 save object into the checkpoint
370 */
371 virtual void saveCheckpoint();
372
373 /**
374 restore object from the checkpoint
375 */
376 virtual void restoreCheckpoint();
377
378 /**
379 read the tree from the input file in newick format
380 @param infile the input file file.
381 @param is_rooted (IN/OUT) true if tree is rooted
382 */
383 virtual void readTree(const char *infile, bool &is_rooted);
384
385 /**
386 read the tree from the ifstream in newick format
387 @param in the input stream.
388 @param is_rooted (IN/OUT) true if tree is rooted
389 */
390 virtual void readTree(istream &in, bool &is_rooted);
391
392 /**
393 copy the phylogenetic tree structure into this tree, override to take sequence names
394 in the alignment into account
395 @param tree the tree to copy
396 */
397 virtual void copyTree(MTree *tree);
398 /**
399 copy the sub-tree structure into this tree
400 @param tree the tree to copy
401 @param taxa_set 0-1 string of length leafNum (1 to keep the leaf)
402 */
403 virtual void copyTree(MTree *tree, string &taxa_set);
404
405 /**
406 copy the constraint tree structure into this tree and reindex node IDs accordingly
407 @param tree the tree to copy
408 @param[out] order of taxa with first part being in constraint tree
409 */
410 void copyConstraintTree(MTree *tree, IntVector &taxon_order, int *rand_stream);
411
412 /**
413 copy the phylogenetic tree structure into this tree, designed specifically for PhyloTree.
414 So there is some distinction with copyTree.
415 @param tree the tree to copy
416 */
417 void copyPhyloTree(PhyloTree *tree);
418
419 /**
420 copy the phylogenetic tree structure into this tree, designed specifically for PhyloTree.
421 So there is some distinction with copyTree.
422 @param tree the tree to copy
423 @param mix mixture ID of branch lengths
424 */
425 virtual void copyPhyloTreeMixlen(PhyloTree *tree, int mix);
426
427
428 /**
429 Set the alignment, important to compute parsimony or likelihood score
430 Assing taxa ids according to their position in the alignment
431 @param alignment associated alignment
432 */
433 virtual void setAlignment(Alignment *alignment);
434
435 /** set the root by name
436 @param my_root root node name
437 @param multi_taxa TRUE if my_root is a comma-separated list of nodes
438 */
439 virtual void setRootNode(const char *my_root, bool multi_taxa = false);
440
441
442 /**
443 set the substitution model, important to compute the likelihood
444 @param amodel associated substitution model
445 */
446 void setModel(ModelSubst *amodel);
447
448 /**
449 set the model factory
450 @param model_fac model factory
451 */
452 virtual void setModelFactory(ModelFactory *model_fac);
453
454 /**
455 set rate heterogeneity, important to compute the likelihood
456 @param rate associated rate heterogeneity class
457 */
458 void setRate(RateHeterogeneity *rate);
459
460 /**
461 get rate heterogeneity
462 @return associated rate heterogeneity class
463 */
464 RateHeterogeneity *getRate();
465
466 void discardSaturatedSite(bool val);
467
468 /** get substitution matrix name */
469 string getSubstName();
470
471 /** get rate heterogeneity name */
472 string getRateName();
473
474 /**
475 get the name of the model
476 */
477 virtual string getModelName();
478
479 /**
480 * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}+I{pinvar}+G{alpha}
481 */
482 virtual string getModelNameParams();
483
getModel()484 ModelSubst *getModel() {
485 return model;
486 }
487
getModelFactory()488 ModelFactory *getModelFactory() {
489 return model_factory;
490 }
491
isSuperTree()492 virtual bool isSuperTree() {
493 return false;
494 }
495
isSuperTreeUnlinked()496 virtual bool isSuperTreeUnlinked() {
497 return false;
498 }
499
500 /**
501 @return true if this is a tree with mixture branch lengths, default: false
502 */
isMixlen()503 virtual bool isMixlen() { return false; }
504
505 /**
506 @return number of mixture branch lengths, default: 1
507 */
getMixlen()508 virtual int getMixlen() { return 1; }
509
510 /**
511 allocate a new node. Override this if you have an inherited Node class.
512 @param node_id node ID
513 @param node_name node name
514 @return a new node
515 */
516 virtual Node* newNode(int node_id = -1, const char* node_name = NULL);
517
518 /**
519 allocate a new node. Override this if you have an inherited Node class.
520 @param node_id node ID
521 @param node_name node name issued by an interger
522 @return a new node
523 */
524 virtual Node* newNode(int node_id, int node_name);
525
526 /**
527 * @return number of alignment patterns
528 */
getAlnNPattern()529 virtual int getAlnNPattern() {
530 return aln->getNPattern();
531 }
532
533 /**
534 * @return number of alignment sites
535 */
getAlnNSite()536 virtual int getAlnNSite() {
537 return aln->getNSite();
538 }
539
540 /**
541 * save branch lengths into a vector
542 */
543 virtual void saveBranchLengths(DoubleVector &lenvec, int startid = 0, PhyloNode *node = NULL, PhyloNode *dad = NULL);
544 /**
545 * restore branch lengths from a vector previously called with saveBranchLengths
546 */
547 virtual void restoreBranchLengths(DoubleVector &lenvec, int startid = 0, PhyloNode *node = NULL, PhyloNode *dad = NULL);
548
549 /****************************************************************************
550 Dot product
551 ****************************************************************************/
552 template <class Numeric, class VectorClass>
553 Numeric dotProductSIMD(Numeric *x, Numeric *y, int size);
554
555 typedef BootValType (PhyloTree::*DotProductType)(BootValType *x, BootValType *y, int size);
556 DotProductType dotProduct;
557
558 typedef double (PhyloTree::*DotProductDoubleType)(double *x, double *y, int size);
559 DotProductDoubleType dotProductDouble;
560
561 double dotProductDoubleCall(double *x, double *y, int size);
562
563 #if defined(BINARY32) || defined(__NOAVX__)
setDotProductAVX()564 void setDotProductAVX() {}
setDotProductFMA()565 void setDotProductFMA() {}
566 #else
567 void setDotProductAVX();
568 void setDotProductFMA();
569 void setDotProductAVX512();
570 #endif
571
572 void setDotProductSSE();
573
574 /**
575 this function return the parsimony or likelihood score of the tree. Default is
576 to compute the parsimony score. Override this function if you define a new
577 score function.
578 @return the tree score
579 */
580 //virtual double computeScore() { return -computeLikelihood(); }
581 //virtual double computeScore() { return (double)computeParsimonyScore(); }
582
583 /****************************************************************************
584 Parsimony function
585 ****************************************************************************/
586
587 /**
588 * Return the approximated branch length estimation using corrected parsimony branch length
589 * This is usually used as the starting point before using Newton-Raphson
590 */
591 // double computeCorrectedParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad);
592
593 /**
594 initialize partial_pars vector of all PhyloNeighbors, allocating central_partial_pars
595 */
596 virtual void initializeAllPartialPars();
597
598 /**
599 initialize partial_pars vector of all PhyloNeighbors, allocating central_partial_pars
600 @param node the current node
601 @param dad dad of the node, used to direct the search
602 @param index the index
603 */
604 virtual void initializeAllPartialPars(int &index, PhyloNode *node = NULL, PhyloNode *dad = NULL);
605
606 /**
607 compute the tree parsimony score
608 @return parsimony score of the tree
609 */
610 int computeParsimony();
611
612 typedef void (PhyloTree::*ComputePartialParsimonyType)(PhyloNeighbor *, PhyloNode *);
613 ComputePartialParsimonyType computePartialParsimonyPointer;
614
615 /**
616 Compute partial parsimony score of the subtree rooted at dad
617 @param dad_branch the branch leading to the subtree
618 @param dad its dad, used to direct the tranversal
619 */
620 virtual void computePartialParsimony(PhyloNeighbor *dad_branch, PhyloNode *dad);
621 // void computePartialParsimonyNaive(PhyloNeighbor *dad_branch, PhyloNode *dad);
622 void computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode *dad);
623 template<class VectorClass>
624 void computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
625
626 template<class VectorClass>
627 void computePartialParsimonySankoffSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
628
629 void computeReversePartialParsimony(PhyloNode *node, PhyloNode *dad);
630
631 typedef int (PhyloTree::*ComputeParsimonyBranchType)(PhyloNeighbor *, PhyloNode *, int *);
632 ComputeParsimonyBranchType computeParsimonyBranchPointer;
633
634 /**
635 compute tree parsimony score on a branch
636 @param dad_branch the branch leading to the subtree
637 @param dad its dad, used to direct the tranversal
638 @param branch_subst (OUT) if not NULL, the number of substitutions on this branch
639 @return parsimony score of the tree
640 */
641 virtual int computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
642 // int computeParsimonyBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
643 int computeParsimonyBranchFast(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
644 template<class VectorClass>
645 int computeParsimonyBranchFastSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
646
647 template<class VectorClass>
648 int computeParsimonyBranchSankoffSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
649
650 // void printParsimonyStates(PhyloNeighbor *dad_branch = NULL, PhyloNode *dad = NULL);
651
652 virtual void setParsimonyKernel(LikelihoodKernel lk);
653 #if defined(BINARY32) || defined(__NOAVX__)
setParsimonyKernelAVX()654 virtual void setParsimonyKernelAVX() {}
655 #else
656 virtual void setParsimonyKernelAVX();
657 #endif
658
659 virtual void setParsimonyKernelSSE();
660
661 /****************************************************************************
662 Sankoff Parsimony function
663 ****************************************************************************/
664
665 /**
666 * initialise cost_matrix as linear
667 * initialize for 'nstates' and 'columns'
668 */
669 void initCostMatrix(CostMatrixType cost_type);
670
671 /**
672 * read the cost matrix file
673 * initialize for 'nstates' and 'columns'
674 */
675 void loadCostMatrixFile(char* file_name = NULL);
676
677 /*
678 * For a leaf character corresponding to an ambiguous state
679 * set elements corresponding to possible states to 0, others to UINT_MAX
680 */
681 void initLeafSiteParsForAmbiguousState(char state, UINT * site_partial_pars);
682
683 /**
684 compute partial parsimony score of the subtree rooted at dad
685 @param dad_branch the branch leading to the subtree
686 @param dad its dad, used to direct the traversal
687 */
688 void computePartialParsimonySankoff(PhyloNeighbor *dad_branch, PhyloNode *dad);
689
690 /**
691 compute tree parsimony score based on a particular branch
692 @param dad_branch the branch leading to the subtree
693 @param dad its dad, used to direct the traversal
694 @param branch_subst (OUT) if not NULL, the number of substitutions on this branch
695 @return parsimony score of the tree
696 */
697 int computeParsimonyBranchSankoff(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
698
699 /****************************************************************************
700 likelihood function
701 ****************************************************************************/
702
703 size_t getBufferPartialLhSize();
704
705 /**
706 initialize partial_lh vector of all PhyloNeighbors, allocating central_partial_lh
707 */
708 virtual void initializeAllPartialLh();
709
710 /**
711 de-allocate central_partial_lh
712 */
713 virtual void deleteAllPartialLh();
714
715 /**
716 initialize partial_lh vector of all PhyloNeighbors, allocating central_partial_lh
717 @param node the current node
718 @param dad dad of the node, used to direct the search
719 @param index the index
720 */
721 virtual void initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node = NULL, PhyloNode *dad = NULL);
722
723
724 /**
725 clear all partial likelihood for a clean computation again
726 @param make_null true to make all partial_lh become NULL
727 */
728 virtual void clearAllPartialLH(bool make_null = false);
729
730 /**
731 * compute all partial likelihoods if not computed before
732 */
733 void computeAllPartialLh(PhyloNode *node = NULL, PhyloNode *dad = NULL);
734
735 /**
736 * compute all partial parsimony vector if not computed before
737 */
738 void computeAllPartialPars(PhyloNode *node = NULL, PhyloNode *dad = NULL);
739
740 /**
741 allocate memory for a partial likelihood vector
742 */
743 double *newPartialLh();
744
745 /** get the number of bytes occupied by partial_lh */
746 size_t getPartialLhBytes();
747 size_t getPartialLhSize();
748
749 /**
750 allocate memory for a scale num vector
751 */
752 UBYTE *newScaleNum();
753
754 /** get the number of bytes occupied by scale_num */
755 size_t getScaleNumBytes();
756 size_t getScaleNumSize();
757
758 /**
759 * this stores partial_lh for each state at the leaves of the tree because they are the same between leaves
760 * e.g. (1,0,0,0) for A, (0,0,0,1) for T
761 */
762 double *tip_partial_lh;
763 int tip_partial_lh_computed;
764 UINT *tip_partial_pars;
765
766 bool ptn_freq_computed;
767
768 /** vector size used by SIMD kernel */
769 size_t vector_size;
770
771 /** true if using safe numeric for likelihood kernel */
772 bool safe_numeric;
773
774 /** number of threads used for likelihood kernel */
775 int num_threads;
776
777
778 /****************************************************************************
779 helper functions for computing tree traversal
780 ****************************************************************************/
781
782
783 /**
784 compute traversal_info of a subtree
785 */
786 bool computeTraversalInfo(PhyloNeighbor *dad_branch, PhyloNode *dad, double* &buffer);
787
788
789 /**
790 compute traversal_info of both subtrees
791 */
792 template<class VectorClass, const int nstates>
793 void computeTraversalInfo(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh);
794 template<class VectorClass>
795 void computeTraversalInfo(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh);
796
797 /**
798 precompute info for models
799 */
800 template<class VectorClass, const int nstates>
801 void computePartialInfo(TraversalInfo &info, VectorClass* buffer);
802 template<class VectorClass>
803 void computePartialInfo(TraversalInfo &info, VectorClass* buffer);
804
805 /**
806 sort neighbor in descending order of subtree size (number of leaves within subree)
807 @param node the starting node, NULL to start from the root
808 @param dad dad of the node, used to direct the search
809 */
810 void sortNeighborBySubtreeSize(PhyloNode *node, PhyloNode *dad);
811
812 /****************************************************************************
813 computing partial (conditional) likelihood of subtrees
814 ****************************************************************************/
815
816 /** transform _pattern_lh_cat from "interleaved" to "sequential", due to vector_size > 1 */
817 void transformPatternLhCat();
818
819 // Compute the partial likelihoods LH (OUT) at the leaves for an observed PoMo
820 // STATE (IN). Use binomial sampling unless hyper is true, then use
821 // hypergeometric sampling.
822 void computeTipPartialLikelihoodPoMo(int state, double *lh, bool hyper=false);
823 void computeTipPartialLikelihood();
824 void computeTipPartialParsimony();
825 void computePtnInvar();
826 void computePtnFreq();
827
828
829 /**
830 compute the partial likelihood at a subtree
831 @param dad_branch the branch leading to the subtree
832 @param dad its dad, used to direct the tranversal
833 */
834 virtual void computePartialLikelihood(TraversalInfo &info, size_t ptn_lower, size_t ptn_upper, int thread_id);
835 typedef void (PhyloTree::*ComputePartialLikelihoodType)(TraversalInfo &info, size_t ptn_lower, size_t ptn_upper, int thread_id);
836 ComputePartialLikelihoodType computePartialLikelihoodPointer;
837
838
839 //template <const int nstates>
840 // void computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
841
842 // void computeSitemodelPartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
843
844 // template <class VectorClass, const int VCSIZE, const int nstates>
845 // void computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
846
847 void computeNonrevPartialLikelihood(TraversalInfo &info, size_t ptn_lower, size_t ptn_upper, int thread_id);
848 template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false>
849 void computeNonrevPartialLikelihoodGenericSIMD(TraversalInfo &info, size_t ptn_lower, size_t ptn_upper, int thread_id);
850 template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false>
851 void computeNonrevPartialLikelihoodSIMD(TraversalInfo &info, size_t ptn_lower, size_t ptn_upper, int thread_id);
852
853 template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
854 void computePartialLikelihoodSIMD(TraversalInfo &info, size_t ptn_lower, size_t ptn_upper, int thread_id);
855
856 template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
857 void computePartialLikelihoodGenericSIMD(TraversalInfo &info, size_t ptn_lower, size_t ptn_upper, int thread_id);
858
859 /*
860 template <class VectorClass, const int VCSIZE, const int nstates>
861 void computeMixratePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
862
863 template <class VectorClass, const int VCSIZE, const int nstates>
864 void computeMixturePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
865
866 template <class VectorClass, const int VCSIZE, const int nstates>
867 void computeSitemodelPartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
868 */
869
870 /****************************************************************************
871 computing likelihood on a branch
872 ****************************************************************************/
873
874 /**
875 compute tree likelihood on a branch. used to optimize branch length
876 @param dad_branch the branch leading to the subtree
877 @param dad its dad, used to direct the tranversal
878 @return tree likelihood
879 */
880 virtual double computeLikelihoodBranch(PhyloNeighbor *dad_branch, PhyloNode *dad);
881
882 typedef double (PhyloTree::*ComputeLikelihoodBranchType)(PhyloNeighbor*, PhyloNode*);
883 ComputeLikelihoodBranchType computeLikelihoodBranchPointer;
884
885 /**
886 * MINH: this implements the fast alternative strategy for reversible model (March 2013)
887 * where partial likelihoods at nodes store real partial likelihoods times eigenvectors
888 */
889 // template<int NSTATES>
890 // inline double computeLikelihoodBranchFast(PhyloNeighbor *dad_branch, PhyloNode *dad);
891
892 //template <const int nstates>
893 // double computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
894
895 // double computeSitemodelLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
896
897 // template <class VectorClass, const int VCSIZE, const int nstates>
898 // double computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
899
900 double computeNonrevLikelihoodBranch(PhyloNeighbor *dad_branch, PhyloNode *dad);
901 template<class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false>
902 double computeNonrevLikelihoodBranchGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
903 template<class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false>
904 double computeNonrevLikelihoodBranchSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
905
906 template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
907 double computeLikelihoodBranchSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
908
909 template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
910 double computeLikelihoodBranchGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
911
912 /*
913 template <class VectorClass, const int VCSIZE, const int nstates>
914 double computeMixrateLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
915
916 template <class VectorClass, const int VCSIZE, const int nstates>
917 double computeMixtureLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
918
919 template <class VectorClass, const int VCSIZE, const int nstates>
920 double computeSitemodelLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
921 */
922
923 /****************************************************************************
924 computing likelihood on a branch using buffer
925 ****************************************************************************/
926
927 /**
928 quickly compute tree likelihood on branch current_it <-> current_it_back given buffer (theta_all).
929 Used after optimizing branch length
930 @param pattern_lh (OUT) if not NULL, the function will assign pattern log-likelihoods to this vector
931 assuming pattern_lh has the size of the number of patterns
932 @return tree likelihood
933 */
934 virtual double computeLikelihoodFromBuffer();
935 typedef double (PhyloTree::*ComputeLikelihoodFromBufferType)();
936 ComputeLikelihoodFromBufferType computeLikelihoodFromBufferPointer;
937
938 // template <class VectorClass, const int VCSIZE, const int nstates>
939 // double computeLikelihoodFromBufferEigenSIMD();
940
941 template <class VectorClass, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
942 double computeLikelihoodFromBufferSIMD();
943
944 template <class VectorClass, const bool FMA = false, const bool SITE_MODEL = false>
945 double computeLikelihoodFromBufferGenericSIMD();
946
947 /*
948 template <class VectorClass, const int VCSIZE, const int nstates>
949 double computeMixrateLikelihoodFromBufferEigenSIMD();
950
951 template <class VectorClass, const int VCSIZE, const int nstates>
952 double computeMixtureLikelihoodFromBufferEigenSIMD();
953
954 template <class VectorClass, const int VCSIZE, const int nstates>
955 double computeSitemodelLikelihoodFromBufferEigenSIMD();
956
957 double computeSitemodelLikelihoodFromBufferEigen();
958 */
959
960 /**
961 compute tree likelihood when a branch length collapses to zero
962 @param dad_branch the branch leading to the subtree
963 @param dad its dad, used to direct the tranversal
964 @return tree likelihood
965 */
966 virtual double computeLikelihoodZeroBranch(PhyloNeighbor *dad_branch, PhyloNode *dad);
967
968 /**
969 compute likelihood of rooted tree with virtual root (FOR TINA)
970 @param dad_branch the branch leading to the subtree
971 @param dad its dad, used to direct the tranversal
972 @return tree likelihood
973 */
974 // virtual double computeLikelihoodRooted(PhyloNeighbor *dad_branch, PhyloNode *dad);
975
976 /**
977 compute the tree likelihood
978 @param pattern_lh (OUT) if not NULL, the function will assign pattern log-likelihoods to this vector
979 assuming pattern_lh has the size of the number of patterns
980 @return tree likelihood
981 */
982 virtual double computeLikelihood(double *pattern_lh = NULL);
983
984 /**
985 * @return number of elements per site lhl entry, used in conjunction with computePatternLhCat
986 */
987 virtual int getNumLhCat(SiteLoglType wsl);
988
989 /**
990 * compute _pattern_lh_cat for site-likelihood per category
991 * @return tree log-likelihood
992 */
993 virtual double computePatternLhCat(SiteLoglType wsl);
994
995 /**
996 compute state frequency for each pattern (for Huaichun)
997 @param[out] ptn_state_freq state frequency vector per pattern,
998 should be pre-allocated with size of num_patterns * num_states
999 */
1000 void computePatternStateFreq(double *ptn_state_freq);
1001
1002 /****************************************************************************
1003 ancestral sequence reconstruction
1004 ****************************************************************************/
1005
1006 /**
1007 initialize computing ancestral sequence probability for an internal node by marginal reconstruction
1008 */
1009 virtual void initMarginalAncestralState(ostream &out, bool &orig_kernel_nonrev, double* &ptn_ancestral_prob, int* &ptn_ancestral_seq);
1010
1011 /**
1012 compute ancestral sequence probability for an internal node by marginal reconstruction
1013 (Yang, Kumar and Nei 1995)
1014 @param dad_branch branch leading to an internal node where to obtain ancestral sequence
1015 @param dad dad of the target internal node
1016 @param[out] ptn_ancestral_prob pattern ancestral probability vector of dad_branch->node
1017 */
1018 virtual void computeMarginalAncestralState(PhyloNeighbor *dad_branch, PhyloNode *dad,
1019 double *ptn_ancestral_prob, int *ptn_ancestral_seq);
1020
1021 virtual void writeMarginalAncestralState(ostream &out, PhyloNode *node, double *ptn_ancestral_prob, int *ptn_ancestral_seq);
1022
1023 /**
1024 end computing ancestral sequence probability for an internal node by marginal reconstruction
1025 */
1026 virtual void endMarginalAncestralState(bool orig_kernel_nonrev, double* &ptn_ancestral_prob, int* &ptn_ancestral_seq);
1027
1028 /**
1029 compute the joint ancestral states at a pattern (Pupko et al. 2000)
1030 */
1031 void computeJointAncestralSequences(int *ancestral_seqs);
1032
1033 /**
1034 * compute max ancestral likelihood according to
1035 * step 1-3 of the dynamic programming algorithm of Pupko et al. 2000, MBE 17:890-896
1036 * @param dad_branch branch leading to an internal node where to obtain ancestral sequence
1037 * @param dad dad of the target internal node
1038 * @param[out] C array storing all information about max ancestral states
1039 */
1040 void computeAncestralLikelihood(PhyloNeighbor *dad_branch, PhyloNode *dad, int *C);
1041
1042 /**
1043 * compute max ancestral states according to
1044 * step 4-5 of the dynamic programming algorithm of Pupko et al. 2000, MBE 17:890-896
1045 * @param dad_branch branch leading to an internal node where to obtain ancestral sequence
1046 * @param dad dad of the target internal node
1047 * @param C array storing all information about max ancestral states
1048 * @param[out] ancestral_seqs array of size nptn*nnode for ancestral sequences at all internal nodes
1049 */
1050 void computeAncestralState(PhyloNeighbor *dad_branch, PhyloNode *dad, int *C, int *ancestral_seqs);
1051
1052 /**
1053 compute pattern likelihoods only if the accumulated scaling factor is non-zero.
1054 Otherwise, copy the pattern_lh attribute
1055 @param pattern_lh (OUT) pattern log-likelihoods,
1056 assuming pattern_lh has the size of the number of patterns
1057 @param cur_logl current log-likelihood (for sanity check)
1058 @param pattern_lh_cat (OUT) if not NULL, store all pattern-likelihood per category
1059 */
1060 virtual void computePatternLikelihood(double *pattern_lh, double *cur_logl = NULL,
1061 double *pattern_lh_cat = NULL, SiteLoglType wsl = WSL_RATECAT);
1062
1063 /**
1064 compute pattern posterior probabilities per rate/mixture category
1065 @param pattern_prob_cat (OUT) all pattern-probabilities per category
1066 @param wsl either WSL_RATECAT, WSL_MIXTURE or WSL_MIXTURE_RATECAT
1067 */
1068 virtual void computePatternProbabilityCategory(double *pattern_prob_cat, SiteLoglType wsl);
1069
1070 vector<uint64_t> ptn_cat_mask;
1071
1072 /**
1073 compute categories for each pattern, update ptn_cat_mask
1074 @return max number of categories necessary
1075 */
1076 virtual int computePatternCategories(IntVector *pattern_ncat = NULL);
1077
1078 /**
1079 Compute the variance in tree log-likelihood
1080 (Kishino & Hasegawa 1989, JME 29:170-179)
1081 @param pattern_lh pattern log-likelihoods, will be computed if NULL
1082 @param tree_lh tree log-likelihood, will be computed if ZERO
1083 */
1084 double computeLogLVariance(double *pattern_lh = NULL, double tree_lh = 0.0);
1085
1086 /**
1087 Compute the variance in log-likelihood difference
1088 between the current tree and another tree.
1089 (Kishino & Hasegawa 1989, JME 29:170-179)
1090 @param pattern_lh_other pattern log-likelihoods of the other tree
1091 @param pattern_lh pattern log-likelihoods of current tree, will be computed if NULL
1092 */
1093 double computeLogLDiffVariance(double *pattern_lh_other, double *pattern_lh = NULL);
1094
1095 /**
1096 * \brief Estimate the observed branch length between \a dad_branch and \a dad analytically.
1097 * The ancestral states of the 2 nodes are first computed (Yang, 2006).
1098 * Branch length are then computed using analytical formula.
1099 * @param[in] dad_branch must be an internal node
1100 * @param[in] dad must be an internal node
1101 * @return estimated branch length or -1.0 if one of the 2 nodes is leaf
1102 */
1103 double computeBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad);
1104
1105 /**
1106 * \brief Approximate the branch legnth between \a dad_branch and \a dad using Least Square instead
1107 * of Newton Raphson
1108 * @param[in] dad_branch
1109 * @param[in] dad
1110 * @return approximated branch length
1111 */
1112 double computeLeastSquareBranLen(PhyloNeighbor *dad_branch, PhyloNode *dad);
1113
1114 /**
1115 * Update all subtree distances that are affect by doing an NNI on branch (node1-node2)
1116 * @param nni NNI move that is carried out
1117 */
1118 void updateSubtreeDists(NNIMove &nni);
1119
1120 /**
1121 * Compute all pairwise distance of subtree rooted at \a source and other subtrees
1122 */
1123 void computeSubtreeDists();
1124
1125 void getUnmarkedNodes(PhyloNodeVector& unmarkedNodes, PhyloNode* node = NULL, PhyloNode* dad = NULL);
1126
1127 void computeAllSubtreeDistForOneNode(PhyloNode* source, PhyloNode* nei1, PhyloNode* nei2, PhyloNode* node, PhyloNode* dad);
1128
1129 double correctBranchLengthF81(double observedBran, double alpha);
1130
1131 double computeCorrectedBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad);
1132
1133 /**
1134 Compute the variance in log-likelihood difference
1135 between the current tree and another tree.
1136 (Kishino & Hasegawa 1989, JME 29:170-179)
1137 @param other_tree the other tree to compare
1138 @param pattern_lh pattern log-likelihoods of current tree, will be computed if NULL
1139 */
1140 double computeLogLDiffVariance(PhyloTree *other_tree, double *pattern_lh = NULL);
1141
1142 /**
1143 Roll back the tree saved with only Taxon IDs and branch lengths.
1144 For this function to work, one must printTree before with WT_TAXON_ID + WT_BR_LEN
1145 @param best_tree_string input stream to read from
1146 */
1147 void rollBack(istream &best_tree_string);
1148
1149 /**
1150 refactored 2015-12-22: Taxon IDs instead of Taxon names to save space!
1151 Read the tree saved with Taxon IDs and branch lengths.
1152 @param tree_string tree string to read from
1153 @param updatePLL if true, tree is read into PLL
1154 */
1155 virtual void readTreeString(const string &tree_string);
1156
1157 /**
1158 Read the tree saved with Taxon names and branch lengths.
1159 @param tree_string tree string to read from
1160 @param updatePLL if true, tree is read into PLL
1161 */
1162 virtual void readTreeStringSeqName(const string &tree_string);
1163
1164 /**
1165 Read the tree saved with Taxon Names and branch lengths.
1166 @param tree_string tree string to read from
1167 */
1168 void readTreeFile(const string &file_name);
1169
1170 /*
1171 refactored 2015-12-22: Taxon IDs instead of Taxon names to save space!
1172 * Return the tree string contining taxon IDs and branch lengths
1173 * @return
1174 * @param format (WT_TAXON_ID, WT_BR_LEN, ...)
1175 * @return the tree string with the specified format
1176 */
1177 virtual string getTreeString();
1178
1179 /**
1180 * Assign branch lengths for branch that has no or negative length
1181 * With single model branch lengths are assigned using parsimony. With partition model
1182 * branch lengths are assigned randomly
1183 * @param force_change if true then force fixing also positive branch lengths
1184 * @return number of branches fixed
1185 */
1186 virtual int wrapperFixNegativeBranch(bool force_change);
1187
1188 /**
1189 * Read the newick string into PLL kernel
1190 * @param newickTree
1191 */
1192 void pllReadNewick(string newickTree);
1193
1194 /**
1195 * Return the sorted topology without branch length, used to compare tree topology
1196 * @param
1197 * printBranchLength true/false
1198 */
1199 string getTopologyString(bool printBranchLength);
1200
1201
1202 bool checkEqualScalingFactor(double &sum_scaling, PhyloNode *node = NULL, PhyloNode *dad = NULL);
1203
1204 /****************************************************************************
1205 computing derivatives of likelihood function
1206 ****************************************************************************/
1207
1208 //template <const int nstates>
1209 // void computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
1210
1211 // void computeSitemodelLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
1212
1213 // template <class VectorClass, const int VCSIZE, const int nstates>
1214 // void computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
1215
1216 void computeNonrevLikelihoodDerv(PhyloNeighbor *dad_branch, PhyloNode *dad, double *df, double *ddf);
1217 template<class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false>
1218 void computeNonrevLikelihoodDervGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double *df, double *ddf);
1219 template<class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false>
1220 void computeNonrevLikelihoodDervSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double *df, double *ddf);
1221
1222 template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
1223 void computeLikelihoodBufferSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, size_t ptn_lower, size_t ptn_upper, int thread_id);
1224
1225 template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
1226 void computeLikelihoodBufferGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, size_t ptn_lower, size_t ptn_upper, int thread_id);
1227
1228
1229 template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
1230 void computeLikelihoodDervSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double *df, double *ddf);
1231
1232 template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
1233 void computeLikelihoodDervGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double *df, double *ddf);
1234
1235 /** For Mixlen stuffs */
getCurMixture()1236 virtual int getCurMixture() { return 0; }
1237
1238 template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
1239 void computeLikelihoodDervMixlenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
1240
1241 template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
1242 void computeLikelihoodDervMixlenGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
1243
1244
1245 /*
1246 template <class VectorClass, const int VCSIZE, const int nstates>
1247 void computeMixrateLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
1248
1249 template <class VectorClass, const int VCSIZE, const int nstates>
1250 void computeMixtureLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
1251
1252 template <class VectorClass, const int VCSIZE, const int nstates>
1253 void computeSitemodelLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
1254 */
1255
1256 /**
1257 compute tree likelihood and derivatives on a branch. used to optimize branch length
1258 @param dad_branch the branch leading to the subtree
1259 @param dad its dad, used to direct the tranversal
1260 @param df (OUT) first derivative
1261 @param ddf (OUT) second derivative
1262 @return tree likelihood
1263 */
1264 void computeLikelihoodDerv(PhyloNeighbor *dad_branch, PhyloNode *dad, double *df, double *ddf);
1265
1266 typedef void (PhyloTree::*ComputeLikelihoodDervType)(PhyloNeighbor *, PhyloNode *, double *, double *);
1267 ComputeLikelihoodDervType computeLikelihoodDervPointer;
1268
1269 typedef void (PhyloTree::*ComputeLikelihoodDervMixlenType)(PhyloNeighbor *, PhyloNode *, double &, double &);
1270 ComputeLikelihoodDervMixlenType computeLikelihoodDervMixlenPointer;
1271
1272 /****************************************************************************
1273 Stepwise addition (greedy) by maximum parsimony
1274 ****************************************************************************/
1275
1276 /** constraint tree used to guide tree search */
1277 ConstraintTree constraintTree;
1278
1279 /**
1280 insert a node to a target branch
1281 @param added_node node to add
1282 @param target_node left node of the target branch
1283 @param target_dad right node of the target branch
1284 */
1285 void insertNode2Branch(Node* added_node, Node* target_node, Node* target_dad);
1286
1287 /**
1288 FAST VERSION: used internally by computeParsimonyTree() to find the best target branch to add into the tree
1289 @param added_node node to add
1290 @param target_node (OUT) one end of the best branch found
1291 @param target_dad (OUT) the other end of the best branch found
1292 @param target_partial_pars (OUT) copy of the partial_pars corresponding to best branch
1293 @param node the current node
1294 @param dad dad of the node, used to direct the search
1295 @return the parsimony score of the tree
1296 */
1297 int addTaxonMPFast(Node *added_taxon, Node *added_node, Node *node, Node *dad);
1298
1299 /**
1300 create a 3-taxon tree and return random taxon order
1301 @param[out] taxon_order random taxon order
1302 */
1303 void create3TaxonTree(IntVector &taxon_order, int *rand_stream);
1304
1305 /**
1306 Extract a bifurcating subtree and return randomly removed Neighbor
1307 If the tree is bifurcating, nothing change
1308 @param[out] removed_nei vector of removed Neighbor
1309 @param[out] attached_node vector of node attached to removed Neighbor
1310 */
1311 void extractBifurcatingSubTree(NeighborVec &removed_nei, NodeVector &attached_node, int *rand_stream);
1312
1313
1314 /**
1315 * FAST VERSION: compute parsimony tree by step-wise addition
1316 * @param out_prefix prefix for .parstree file
1317 * @param alignment input alignment
1318 * @param rand_stream random stream
1319 * @return parsimony score
1320 */
1321 virtual int computeParsimonyTree(const char *out_prefix, Alignment *alignment, int *rand_stream);
1322
1323 /****************************************************************************
1324 Branch length optimization by maximum likelihood
1325 ****************************************************************************/
1326
1327
1328 /**
1329 @param brlen_type either BRLEN_OPTIMIZE, BRLEN_FIX or BRLEN_SCALE
1330 @return the number of free branch parameters
1331 */
1332 int getNBranchParameters(int brlen_type);
1333
1334 /**
1335 * IMPORTANT: semantic change: this function does not return score anymore, for efficiency purpose
1336 optimize one branch length by ML
1337 @param node1 1st end node of the branch
1338 @param node2 2nd end node of the branch
1339 @param clearLH true to clear the partial likelihood, otherwise false
1340 @param maxNRStep maximum number of Newton-Raphson steps
1341 @return likelihood score
1342 */
1343 virtual void optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, bool clearLH = true, int maxNRStep = 100);
1344
1345 /**
1346 optimize all branch lengths of the children of node
1347 @param node the current node
1348 @param dad dad of the node, used to direct the search
1349 @return the likelihood of the tree
1350 */
1351 double optimizeChildBranches(PhyloNode *node, PhyloNode *dad = NULL);
1352
1353 /**
1354 optimize all branch lengths at the subtree rooted at node step-by-step.
1355 @param node the current node
1356 @param dad dad of the node, used to direct the search
1357 @return the likelihood of the tree
1358 */
1359 virtual void optimizeAllBranches(PhyloNode *node, PhyloNode *dad = NULL, int maxNRStep = 100);
1360
1361 /**
1362 * optimize all branch lengths at the subtree rooted at node step-by-step.
1363 * Using Least Squares instead of Newton Raphson.
1364 * @param node the current node
1365 * @param dad dad of the node, used to direct the search
1366 */
1367 void optimizeAllBranchesLS(PhyloNode *node = NULL, PhyloNode *dad = NULL);
1368
1369 void computeBestTraversal(NodeVector &nodes, NodeVector &nodes2);
1370
1371 /**
1372 optimize all branch lengths of the tree
1373 @param iterations number of iterations to loop through all branches
1374 @return the likelihood of the tree
1375 */
1376 virtual double optimizeAllBranches(int my_iterations = 100, double tolerance = TOL_LIKELIHOOD, int maxNRStep = 100);
1377
1378 void moveRoot(Node *node1, Node *node2);
1379
1380 /**
1381 Optimize root position for rooted tree
1382 @param root_dist maximum distance to move root
1383 @param write_info true to write information to cout
1384 @param logl_epsilon epsilon of log-likelihood to consider as better
1385 */
1386 virtual double optimizeRootPosition(int root_dist, bool write_info, double logl_epsilon);
1387
1388 /**
1389 Test all root positions for rooted tree
1390 @param write_info true to write information to cout
1391 @param logl_epsilon epsilon of log-likelihood to consider as better
1392 */
1393 virtual double testRootPosition(bool write_info, double logl_epsilon);
1394
1395 /**
1396 inherited from Optimization class, to return to likelihood of the tree
1397 when the current branceh length is set to value
1398 @param value current branch length
1399 @return negative of likelihood (for minimization)
1400 */
1401 virtual double computeFunction(double value);
1402
1403 /**
1404 Inherited from Optimization class.
1405 This function calculate f(value), first derivative f'(value) and 2nd derivative f''(value).
1406 used by Newton raphson method to minimize the function.
1407 @param value current branch length
1408 @param df (OUT) first derivative
1409 @param ddf (OUT) second derivative
1410 @return negative of likelihood (for minimization)
1411 */
1412 virtual void computeFuncDerv(double value, double &df, double &ddf);
1413
1414 /**
1415 optimize the scaling factor for tree length, given all branch lengths fixed
1416 @param scaling (IN/OUT) start value of scaling factor, and as output the optimal value
1417 @param gradient_epsilon gradient epsilon
1418 @return optimal tree log-likelihood
1419 */
1420 double optimizeTreeLengthScaling(double min_scaling, double &scaling, double max_scaling, double gradient_epsilon);
1421
1422 /**
1423 print tree length scaling to a file (requested by Rob Lanfear)
1424 @param filename output file name written in YAML format
1425 */
1426 void printTreeLengthScaling(const char *filename);
1427
1428 /**
1429 optimize pattern-specific rates by maxmimum likelihood given the tree with fixed branch lengths
1430 This function will call optimizeTreeLengthScaling
1431 */
1432 void optimizePatternRates(DoubleVector &pattern_rates);
1433
1434 /****************************************************************************
1435 Branch length optimization by Least Squares
1436 ****************************************************************************/
1437
1438 /**
1439 * Estimate the current branch using least squares
1440 * @param node1 first node of the branch
1441 * @param node2 second node of the branch
1442 * @return
1443 */
1444 double optimizeOneBranchLS(PhyloNode *node1, PhyloNode *node2);
1445
1446 /****************************************************************************
1447 Auxilary functions and varialbes for speeding up branch length optimization (RAxML Trick)
1448 ****************************************************************************/
1449
1450 bool theta_computed;
1451
1452 /**
1453 * NSTATES x NUMCAT x (number of patterns) array
1454 * Used to store precomputed values when optimizing branch length
1455 * See Tung's report on 07.05.2012 for more information
1456 */
1457 double* theta_all;
1458
1459 /** total scaling buffer */
1460 double *buffer_scale_all;
1461
1462 /** buffer used when computing partial_lh, to avoid repeated mem allocation */
1463 double *buffer_partial_lh;
1464
1465 /**
1466 * frequencies of alignment patterns, used as buffer for likelihood computation
1467 */
1468 double *ptn_freq;
1469
1470 /**
1471 * frequencies of aln->ordered_pattern, used as buffer for parsimony computation
1472 */
1473 UINT *ptn_freq_pars;
1474
1475 /**
1476 * used as buffer for faster likelihood computation
1477 * for const pattern: it stores product of p_invar and state frequency
1478 * for other pattern: zero
1479 */
1480 double *ptn_invar;
1481
1482 vector<TraversalInfo> traversal_info;
1483
1484
1485 /****************************************************************************
1486 Nearest Neighbor Interchange by maximum likelihood
1487 ****************************************************************************/
1488
1489 /**
1490 Deprecated
1491 search by a nearest neigbor interchange, then optimize branch lengths. Do it
1492 until tree does not improve
1493 @return the likelihood of the tree
1494 */
1495 // double optimizeNNIBranches();
1496
1497 /**
1498 search by a nearest neigbor interchange
1499 @return the likelihood of the tree
1500 */
1501 //double optimizeNNI();
1502
1503 /**
1504 search by a nearest neigbor interchange
1505 @param cur_score current likelihood score
1506 @param node the current node
1507 @param dad dad of the node, used to direct the search
1508 @return the likelihood of the tree
1509 */
1510 // double optimizeNNI(double cur_score, PhyloNode *node = NULL, PhyloNode *dad = NULL
1511 // /*,ostream *out = NULL, int brtype = 0, ostream *out_lh = NULL, ostream *site_lh = NULL,
1512 // StringIntMap *treels = NULL, vector<double*> *treels_ptnlh = NULL, DoubleVector *treels_logl = NULL,
1513 // int *max_trees = NULL, double *logl_cutoff = NULL*/
1514 // );
1515
1516
1517 /**
1518 search for the best NNI move corresponding to this branch
1519 @return NNIMove the best NNI, this NNI could be worse than the current tree
1520 according to the evaluation scheme in use
1521 @param node1 1 of the 2 nodes on the branch
1522 @param node2 1 of the 2 nodes on the branch
1523 @param nniMoves (IN/OUT) detailed information of the 2 NNIs, set .ptnlh to compute pattern likelihoods
1524 */
1525 virtual NNIMove getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove *nniMoves = NULL);
1526
1527 /**
1528 Do an NNI
1529 @param move reference to an NNI move object containing information about the move
1530 @param clearLH decides whether or not the partial likelihood should be cleared
1531 */
1532 virtual void doNNI(NNIMove &move, bool clearLH = true);
1533
1534 /**
1535 * [DEPRECATED]
1536 * Randomly choose perform an NNI, out of the two defined by branch node1-node2.
1537 * This function also clear the corresponding partial likelihood vectors
1538 *
1539 * @param branch on which a random NNI is done
1540 */
1541 // void doOneRandomNNI(Branch branch);
1542
1543 /**
1544 * Get a random NNI from an internal branch, checking for consistency with constraintTree
1545 * @param branch the internal branch
1546 * @return an NNIMove, node1 and node2 are set to NULL if not consistent with constraintTree
1547 */
1548 NNIMove getRandomNNI(Branch& branch);
1549
1550
1551 /**
1552 * Apply 5 new branch lengths stored in the NNI move
1553 * @param nnimove the NNI move currently in consideration
1554 */
1555 virtual void changeNNIBrans(NNIMove &nnimove);
1556
1557 /****************************************************************************
1558 Stepwise addition (greedy) by maximum likelihood
1559 ****************************************************************************/
1560
1561 /**
1562 grow the tree by step-wise addition
1563 @param alignment input alignment
1564 */
1565 void growTreeML(Alignment *alignment);
1566
1567 /**
1568 used internally by growTreeML() to find the best target branch to add into the tree
1569 @param added_node node to add
1570 @param target_node (OUT) one end of the best branch found
1571 @param target_dad (OUT) the other end of the best branch found
1572 @param node the current node
1573 @param dad dad of the node, used to direct the search
1574 @return the likelihood of the tree
1575 */
1576 double addTaxonML(Node *added_node, Node* &target_node, Node* &target_dad, Node *node, Node *dad);
1577
1578 /****************************************************************************
1579 Distance function
1580 ****************************************************************************/
1581
1582 /**
1583 compute the distance between 2 sequences.
1584 @param seq1 index of sequence 1
1585 @param seq2 index of sequence 2
1586 @param initial_dist initial distance
1587 @param (OUT) variance of distance between seq1 and seq2
1588 @return distance between seq1 and seq2
1589 */
1590
1591 virtual double computeDist(int seq1, int seq2, double initial_dist, double &var);
1592
1593 virtual double computeDist(int seq1, int seq2, double initial_dist);
1594
1595 /**
1596 compute distance and variance matrix, assume dist_mat and var_mat are allocated by memory of size num_seqs * num_seqs.
1597 @param dist_mat (OUT) distance matrix between all pairs of sequences in the alignment
1598 @param var_mat (OUT) variance matrix for distance matrix
1599 @return the longest distance
1600 */
1601 double computeDist(double *dist_mat, double *var_mat);
1602
1603 /**
1604 compute observed distance matrix, assume dist_mat is allocated by memory of size num_seqs * num_seqs.
1605 @param dist_mat (OUT) distance matrix between all pairs of sequences in the alignment
1606 @return the longest distance
1607 */
1608 double computeObsDist(double *dist_mat);
1609
1610 /**
1611 compute distance matrix, allocating memory if necessary
1612 @param params program parameters
1613 @param alignment input alignment
1614 @param dist_mat (OUT) distance matrix between all pairs of sequences in the alignment
1615 @param dist_file (OUT) name of the distance file
1616 @return the longest distance
1617 */
1618 double computeDist(Params ¶ms, Alignment *alignment, double* &dist_mat, double* &var_mat, string &dist_file);
1619
1620 /**
1621 compute observed distance matrix, allocating memory if necessary
1622 @param params program parameters
1623 @param alignment input alignment
1624 @param dist_mat (OUT) distance matrix between all pairs of sequences in the alignment
1625 @param dist_file (OUT) name of the distance file
1626 @return the longest distance
1627 */
1628 double computeObsDist(Params ¶ms, Alignment *alignment, double* &dist_mat, string &dist_file);
1629
1630 /**
1631 correct the distances to follow metric property of triangle inequalities.
1632 Using the Floyd alogrithm.
1633 @param dist_mat (IN/OUT) the shortest path between all pairs of taxa
1634 @return the longest distance
1635 */
1636 double correctDist(double *dist_mat);
1637
1638 /****************************************************************************
1639 compute BioNJ tree, a more accurate extension of Neighbor-Joining
1640 ****************************************************************************/
1641
1642 /**
1643 compute BioNJ tree
1644 @param params program parameters
1645 @param alignment input alignment
1646 @param dist_file distance matrix file
1647 */
1648 void computeBioNJ(Params ¶ms, Alignment *alignment, string &dist_file);
1649
1650 /**
1651 called by fixNegativeBranch to fix one branch
1652 @param branch_length new branch length
1653 @param dad_branch dad branch
1654 @param dad dad node
1655 */
1656 virtual void fixOneNegativeBranch(double branch_length, Neighbor *dad_branch, Node *dad);
1657
1658 /**
1659 Neighbor-joining/parsimony tree might contain negative branch length. This
1660 function will fix this.
1661 @param fixed_length fixed branch length to set to negative branch lengths
1662 @param node the current node
1663 @param dad dad of the node, used to direct the search
1664 @return The number of branches that have no/negative length
1665 */
1666 virtual int fixNegativeBranch(bool force = false, Node *node = NULL, Node *dad = NULL);
1667
1668 /**
1669 Jukes-Cantor correction with alpha shape of Gamma distribution
1670 @param dist observed distance
1671 @param alpha shape of Gamma distribution
1672 @return corrected JC distance
1673 */
1674 double JukesCantorCorrection(double dist, double alpha);
1675
1676 /**
1677 set all branch lengths using parsimony
1678 */
1679 int setParsimonyBranchLengths();
1680
1681 /**
1682 set negative branch to a new len
1683 */
1684 int setNegativeBranch(bool force, double newlen, Node *node = NULL, Node *dad = NULL);
1685
1686 // OBSOLETE: assignRandomBranchLengths no longer needed, use fixNegativeBranch instead!
1687 // int assignRandomBranchLengths(bool force = false, Node *node = NULL, Node *dad = NULL);
1688
1689 /* compute Bayesian branch lengths based on ancestral sequence reconstruction */
1690 void computeAllBayesianBranchLengths(Node *node = NULL, Node *dad = NULL);
1691
1692 /**
1693 generate random tree
1694 */
1695 void generateRandomTree(TreeGenType tree_type);
1696
1697
1698 /**
1699 test the best number of threads
1700 */
1701 virtual int testNumThreads();
1702
1703 /**
1704 print warning about too many threads for short alignments
1705 */
1706 void warnNumThreads();
1707
1708 /****************************************************************************
1709 Subtree Pruning and Regrafting by maximum likelihood
1710 NOTE: NOT DONE YET
1711 ****************************************************************************/
1712
1713 /**
1714 search by Subtree pruning and regrafting
1715 @return the likelihood of the tree
1716 */
1717 double optimizeSPR();
1718
1719 /**
1720 search by Subtree pruning and regrafting, then optimize branch lengths. Iterative until
1721 no tree improvement found.
1722 @return the likelihood of the tree
1723 */
1724 double optimizeSPRBranches();
1725
1726 /**
1727 search by Subtree pruning and regrafting at a current subtree
1728 @param cur_score current likelihood score
1729 @param node the current node
1730 @param dad dad of the node, used to direct the search
1731 @return the likelihood of the tree
1732 */
1733 double optimizeSPR(double cur_score, PhyloNode *node = NULL, PhyloNode *dad = NULL);
1734
1735 /**
1736 * original implementation by Minh
1737 */
1738 double optimizeSPR_old(double cur_score, PhyloNode *node = NULL, PhyloNode *dad = NULL);
1739
1740 /**
1741 * original implementation by Minh
1742 */
1743 double swapSPR_old(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1,
1744 PhyloNode *orig_node1, PhyloNode *orig_node2,
1745 PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path);
1746
1747 /**
1748 move the subtree (dad1-node1) to the branch (dad2-node2)
1749 */
1750 double swapSPR(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1,
1751 PhyloNode *orig_node1, PhyloNode *orig_node2,
1752 PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path);
1753
1754 double assessSPRMove(double cur_score, const SPRMove &spr);
1755
1756 void pruneSubtree(PhyloNode *node, PhyloNode *dad, PruningInfo &info);
1757
1758 void regraftSubtree(PruningInfo &info,
1759 PhyloNode *in_node, PhyloNode *in_dad);
1760
1761 /****************************************************************************
1762 Approximate Likelihood Ratio Test with SH-like interpretation
1763 ****************************************************************************/
1764
1765 void computeNNIPatternLh(double cur_lh,
1766 double &lh2, double *pattern_lh2,
1767 double &lh3, double *pattern_lh3,
1768 PhyloNode *node1, PhyloNode *node2);
1769
1770 /**
1771 Resampling estimated log-likelihood (RELL)
1772 */
1773 void resampleLh(double **pat_lh, double *lh_new, int *rstream);
1774
1775 /**
1776 Test one branch of the tree with aLRT SH-like interpretation
1777 */
1778 double testOneBranch(double best_score, double *pattern_lh,
1779 int reps, int lbp_reps,
1780 PhyloNode *node1, PhyloNode *node2,
1781 double &lbp_support, double &aLRT_support, double &aBayes_support);
1782
1783 /**
1784 Test all branches of the tree with aLRT SH-like interpretation
1785 */
1786 virtual int testAllBranches(int threshold, double best_score, double *pattern_lh,
1787 int reps, int lbp_reps, bool aLRT_test, bool aBayes_test,
1788 PhyloNode *node = NULL, PhyloNode *dad = NULL);
1789
1790 /****************************************************************************
1791 Quartet functions
1792 ****************************************************************************/
1793
1794 QuartetGroups LMGroups;
1795 /**
1796 * for doLikelihoodMapping reportLikelihoodMapping: likelihood mapping information by region
1797 */
1798 vector<QuartetInfo> lmap_quartet_info;
1799 int areacount[8];
1800 int cornercount[4];
1801 // int areacount[8] = {0, 0, 0, 0, 0, 0, 0, 0};
1802 // int cornercount[4] = {0, 0, 0, 0};
1803
1804 /**
1805 * for doLikelihoodMapping, reportLikelihoodMapping: likelihood mapping information by sequence
1806 */
1807 vector<SeqQuartetInfo> lmap_seq_quartet_info;
1808
1809 /** generate a bunch of quartets and compute likelihood for 3 quartet trees for each replicate
1810 @param lmap_num_quartets number of quartets
1811 @param lmap_quartet_info (OUT) vector of quartet information
1812 */
1813 void computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info, QuartetGroups &LMGroups);
1814
1815 /** main function that performs likelihood mapping analysis (Strimmer & von Haeseler 1997) */
1816 void doLikelihoodMapping();
1817
1818 /** output results of likelihood mapping analysis */
1819 void reportLikelihoodMapping(ofstream &out);
1820
1821 /** read clusters for likelihood mapping analysis */
1822 void readLikelihoodMappingGroups(char *filename, QuartetGroups &LMGroups);
1823
1824 /**
1825 compute site concordance factor and assign node names
1826 */
1827 void computeSiteConcordance(map<string,string> &meanings);
1828
1829 /**
1830 compute site concordance factor
1831 @param branch target branch
1832 @param nquartets number of quartets
1833 @param[out] info concordance information
1834 @param rstream random stream
1835 */
1836 virtual void computeSiteConcordance(Branch &branch, int nquartets, int *rstream);
1837
1838 /**
1839 Compute gene concordance factor
1840 for each branch, assign how many times this branch appears in the input set of trees.
1841 Work fine also when the trees do not have the same taxon set.
1842 */
1843 void computeGeneConcordance(MTreeSet &trees, map<string,string> &meanings);
1844
1845 /**
1846 Compute quartet concordance factor and internode certainty, similar to Zhou et al. biorxiv
1847 Work fine also when the trees do not have the same taxon set.
1848 */
1849 void computeQuartetConcordance(MTreeSet &trees);
1850
1851 /**
1852 compute quartet concordance factor and assign node names
1853 @param branch target branch
1854 @param trees input tree set
1855 @return quartet concordance factor
1856 */
1857 double computeQuartetConcordance(Branch &branch, MTreeSet &trees);
1858
1859 /****************************************************************************
1860 Collapse stable (highly supported) clades by one representative
1861 ****************************************************************************/
1862
1863 /**
1864 delete a leaf from the tree, assume tree is birfucating
1865 @param leaf the leaf node to remove
1866 */
1867 void deleteLeaf(Node *leaf);
1868
1869 /**
1870 reinsert one leaf back into the tree
1871 @param leaf the leaf to reinsert
1872 @param adjacent_node the node adjacent to the leaf, returned by deleteLeaves() function
1873 @param node one end node of the reinsertion branch in the existing tree
1874 @param dad the other node of the reinsertion branch in the existing tree
1875 */
1876 void reinsertLeaf(Node *leaf, Node *node, Node *dad);
1877
1878 bool isSupportedNode(PhyloNode* node, int min_support);
1879
1880 /**
1881 Collapse stable (highly supported) clades by one representative
1882 @return the number of taxa prunned
1883 */
1884 int collapseStableClade(int min_support, NodeVector &pruned_taxa, StrVector &linked_name, double* &dist_mat);
1885
1886 int restoreStableClade(Alignment *original_aln, NodeVector &pruned_taxa, StrVector &linked_name);
1887
1888 /**
1889 randomize the neighbor orders of all nodes
1890 */
1891 void randomizeNeighbors(Node *node = NULL, Node *dad = NULL);
1892
1893 virtual void changeLikelihoodKernel(LikelihoodKernel lk);
1894
1895 virtual void setLikelihoodKernel(LikelihoodKernel lk);
1896
1897 virtual void setNumThreads(int num_threads);
1898
1899 #if defined(BINARY32) || defined(__NOAVX__)
setLikelihoodKernelAVX()1900 void setLikelihoodKernelAVX() {}
setLikelihoodKernelFMA()1901 void setLikelihoodKernelFMA() {}
1902 #else
1903 void setLikelihoodKernelAVX();
1904 void setLikelihoodKernelFMA();
1905 void setLikelihoodKernelAVX512();
1906 #endif
1907 virtual void setLikelihoodKernelSSE();
1908
1909 /****************************************************************************
1910 Public variables
1911 ****************************************************************************/
1912
1913 /**
1914 associated alignment
1915 */
1916 Alignment *aln;
1917
1918 /**
1919 * Distance matrix
1920 */
1921 double *dist_matrix;
1922
1923 /**
1924 * Variance matrix
1925 */
1926 double *var_matrix;
1927
1928 /** distance matrix file */
1929 string dist_file;
1930
1931 /**
1932 TRUE if you want to optimize branch lengths by Newton-Raphson method
1933 */
1934 bool optimize_by_newton;
1935
1936 /**
1937 * TRUE if the loglikelihood is computed using SSE
1938 */
1939 LikelihoodKernel sse;
1940
1941 /**
1942 * for UpperBounds: Initial tree log-likelihood
1943 */
1944 double mlInitial;
1945
1946 /**
1947 * for UpperBounds: Log-likelihood after optimization of model parameters in the beginning of tree search
1948 */
1949 double mlFirstOpt;
1950
1951 /**
1952 * for Upper Bounds: how many NNIs have UB < L curScore, that is NNIs for which we don't need to compute likelihood
1953 */
1954 int skippedNNIub;
1955
1956 /**
1957 * for Upper Bounds: how many NNIs were considered in total
1958 */
1959 int totalNNIub;
1960
1961 /**
1962 * for Upper Bounds: min, mean and max UB encountered during the tree search, such that UB < L curScore
1963 */
1964
1965 //double minUB, meanUB, maxUB;
1966
1967 /*
1968 * for UpperBounds: mlCheck = 1, if previous two values were already saved.
1969 * Needed, because parameter optimization is done twice before and after tree search
1970 */
1971
1972 int mlCheck;
1973
1974 /*
1975 * for Upper Bounds: min base frequency
1976 */
1977
1978 double minStateFreq;
1979
1980 /** sequence names that were removed */
1981 StrVector removed_seqs;
1982
1983 /** sequence that are identical to one of the removed sequences */
1984 StrVector twin_seqs;
1985
1986 size_t num_partial_lh_computations;
1987
1988 /** remove identical sequences from the tree */
1989 virtual void removeIdenticalSeqs(Params ¶ms);
1990
1991 /** reinsert identical sequences into the tree and reset original alignment */
1992 virtual void reinsertIdenticalSeqs(Alignment *orig_aln);
1993
1994
1995 /**
1996 assign the leaf names with the alignment sequence names, using the leaf ID for assignment.
1997 @param node the starting node, NULL to start from the root
1998 @param dad dad of the node, used to direct the search
1999 */
2000 void assignLeafNames(Node *node = NULL, Node *dad = NULL);
2001
2002 /**
2003 * initialize partition information for super tree
2004 */
initPartitionInfo()2005 virtual void initPartitionInfo() {
2006 }
2007
2008 /**
2009 * print transition matrix for all branches
2010 *
2011 */
2012 void printTransMatrices(Node *node = NULL, Node *dad = NULL);
2013
2014 /**
2015 * compute the memory size required for storing partial likelihood vectors
2016 * @return memory size required in bytes
2017 */
2018 virtual uint64_t getMemoryRequired(size_t ncategory = 1, bool full_mem = false);
2019
2020 /**
2021 * compute the memory size for top partitions required for storing partial likelihood vectors
2022 * @return memory size required in bytes
2023 */
2024 virtual uint64_t getMemoryRequiredThreaded(size_t ncategory = 1, bool full_mem = false);
2025
2026 void getMemoryRequired(uint64_t &partial_lh_entries, uint64_t &scale_num_entries, uint64_t &partial_pars_entries);
2027
2028 /****** following variables are for ultra-fast bootstrap *******/
2029 /** 2 to save all trees, 1 to save intermediate trees */
2030 int save_all_trees;
2031
2032 set<int> computeNodeBranchDists(Node *node = NULL, Node *dad = NULL);
2033
2034 /*
2035 * Manuel's approach for analytic approximation of branch length given initial guess
2036 b0: initial guess for the maximum
2037 @return approximted branch length
2038 */
2039 double approxOneBranch(PhyloNode *node, PhyloNode *dad, double b0);
2040
2041 void approxAllBranches(PhyloNode *node = NULL, PhyloNode *dad = NULL);
2042
getCurScore()2043 double getCurScore() {
2044 return curScore;
2045 }
2046
setCurScore(double curScore)2047 void setCurScore(double curScore) {
2048 this->curScore = curScore;
2049 }
2050
2051 /**
2052 * This will invalidate curScore variable, used whenever reading a tree!
2053 */
2054 void resetCurScore(double score = 0.0) {
2055 if (score != 0.0)
2056 curScore = score;
2057 else
2058 curScore = -DBL_MAX;
2059 if (model)
2060 initializeAllPartialLh();
2061 }
2062
2063 void computeSeqIdentityAlongTree(Split &resp, Node *node = NULL, Node *dad = NULL);
2064 void computeSeqIdentityAlongTree();
2065
getPatternLhCatPointer()2066 double *getPatternLhCatPointer() { return _pattern_lh_cat; }
2067
2068 /**
2069 * for rooted tree update direction for all branches
2070 */
2071 void computeBranchDirection(PhyloNode *node = NULL, PhyloNode *dad = NULL);
2072
2073 /**
2074 * clear branch direction for all branches
2075 */
2076 void clearBranchDirection(PhyloNode *node = NULL, PhyloNode *dad = NULL);
2077
2078 /**
2079 convert from unrooted to rooted tree
2080 */
2081 void convertToRooted();
2082
2083 /**
2084 convert from rooted to unrooted tree
2085 */
2086 void convertToUnrooted();
2087
2088
2089 /**
2090 write site-rates to a file in the following format:
2091 1 rate_1
2092 2 rate_2
2093 ....
2094 This function will call computePatternRates()
2095 @param out output stream to write rates
2096 @param bayes TRUE to use empirical Bayesian, false for ML method
2097 */
2098 virtual void writeSiteRates(ostream &out, bool bayes, int partid = -1);
2099
2100 /**
2101 write site log likelihood to a output stream
2102 @param out output stream
2103 @param wsl write site-loglikelihood type
2104 @param partid partition ID as first column of the line. -1 to omit it
2105 */
2106 virtual void writeSiteLh(ostream &out, SiteLoglType wsl, int partid = -1);
2107
2108 /**
2109 write branches into a csv file
2110 Feature requested by Rob Lanfear
2111 @param out output stream
2112 */
2113 virtual void writeBranches(ostream &out);
2114
2115 protected:
2116
2117 /**
2118 * Instance of the phylogenetic likelihood library. This is basically the tree data strucutre in RAxML
2119 */
2120 pllInstance *pllInst;
2121
2122 /**
2123 * PLL data structure for alignment
2124 */
2125 pllAlignmentData *pllAlignment;
2126
2127 /**
2128 * PLL data structure for storing phylognetic analysis options
2129 */
2130 pllInstanceAttr pllAttr;
2131
2132 /**
2133 * PLL partition list
2134 */
2135 partitionList * pllPartitions;
2136
2137 /**
2138 * is the subtree distance matrix need to be computed or updated
2139 */
2140 bool subTreeDistComputed;
2141
2142 /**
2143 * Map data structure to store distance Candidate trees between subtree.
2144 * The key is a string which is constructed by concatenating IDs of
2145 * the 2 nodes, e.g. 15-16
2146 */
2147 StringDoubleMap subTreeDists;
2148
2149 StringDoubleMap subTreeWeights;
2150
2151 /** distance (# of branches) between 2 nodes */
2152 int *nodeBranchDists;
2153
2154 /**
2155 * A list containing all the marked list. This is used in the dynamic programming
2156 * algorithm for compute inter subtree distances
2157 */
2158 IntPhyloNodeMap markedNodeList;
2159
2160 /** converted root state, for Tina's zoombie domain */
2161 char root_state;
2162
2163 /**
2164 internal pattern log-likelihoods, always stored after calling computeLikelihood()
2165 or related functions. Note that scaling factors are not incorporated here.
2166 If you want to get real pattern log-likelihoods, please use computePatternLikelihood()
2167 */
2168 double *_pattern_lh;
2169
2170 /**
2171 internal pattern likelihoods per category,
2172 */
2173 double *_pattern_lh_cat;
2174
2175 /**
2176 internal pattern likelihoods per category per state
2177 will be computed if not NULL and using non-reversible kernel
2178 */
2179 double *_pattern_lh_cat_state;
2180
2181 /**
2182 associated substitution model
2183 */
2184 ModelSubst *model;
2185
2186 /**
2187 Model factory includes SubstModel and RateHeterogeneity
2188 stores transition matrices computed before for efficiency purpose, eps. AA or CODON model.
2189 */
2190 ModelFactory *model_factory;
2191
2192 /**
2193 among-site rates
2194 */
2195 RateHeterogeneity *site_rate;
2196
2197 /**
2198 current branch iterator, used by computeFunction() to optimize branch lengths
2199 and by computePatternLikelihood() to compute all pattern likelihoods
2200 */
2201 PhyloNeighbor *current_it;
2202 /**
2203 current branch iterator of the other end, used by computeFunction() to optimize branch lengths
2204 and by computePatternLikelihood() to compute all pattern likelihoods
2205 */
2206 PhyloNeighbor *current_it_back;
2207
2208 bool is_opt_scaling;
2209
2210 /** current scaling factor for optimizeTreeLengthScaling() */
2211 double current_scaling;
2212
2213 /**
2214 spr moves
2215 */
2216 SPRMoves spr_moves;
2217
2218 /**
2219 SPR radius
2220 */
2221 int spr_radius;
2222
2223
2224 /**
2225 the main memory storing all partial likelihoods for all neighbors of the tree.
2226 The variable partial_lh in PhyloNeighbor will be assigned to a region inside this variable.
2227 */
2228 double *central_partial_lh;
2229 double *nni_partial_lh; // used for NNI functions
2230
2231 /**
2232 the main memory storing all scaling event numbers for all neighbors of the tree.
2233 The variable scale_num in PhyloNeighbor will be assigned to a region inside this variable.
2234 */
2235 UBYTE *central_scale_num;
2236 UBYTE *nni_scale_num; // used for NNI functions
2237
2238 /**
2239 the main memory storing all partial parsimony states for all neighbors of the tree.
2240 The variable partial_pars in PhyloNeighbor will be assigned to a region inside this variable.
2241 */
2242 UINT *central_partial_pars;
2243
2244 virtual void reorientPartialLh(PhyloNeighbor* dad_branch, Node *dad);
2245
2246 //----------- memory saving technique ------//
2247
2248 /** maximum number of partial_lh_slots */
2249 int64_t max_lh_slots;
2250
2251 /** mapping from */
2252 MemSlotVector mem_slots;
2253
2254 /**
2255 TRUE to discard saturated for Meyer & von Haeseler (2003) model
2256 */
2257 bool discard_saturated_site;
2258
2259 /**
2260 * Temporary partial likelihood array: used when swapping branch and recalculate the
2261 * likelihood --> avoid calling malloc everytime
2262 */
2263 // double *tmp_partial_lh1;
2264 // double *tmp_partial_lh2;
2265
2266 /**
2267 * Temporary array containing anscentral states.
2268 * Used to avoid calling malloc
2269 */
2270
2271 // double *tmp_anscentral_state_prob1;
2272 // double *tmp_anscentral_state_prob2;
2273 /** pattern-specific rates */
2274 //double *tmp_ptn_rates;
2275
2276 /**
2277 * Temporary scale num array: used when swapping branch and recalculate the
2278 * likelihood --> avoid calling malloc
2279 */
2280 // UBYTE *tmp_scale_num1;
2281 // UBYTE *tmp_scale_num2;
2282
2283 /****************************************************************************
2284 Vector of bit blocks, used for parsimony function
2285 ****************************************************************************/
2286
2287 /**
2288 @return size of the bits block vector for one node
2289 */
2290 size_t getBitsBlockSize();
2291
2292 /**
2293 allocate new memory for a bit block vector
2294 @return the allocated memory
2295 */
2296 UINT *newBitsBlock();
2297
saveCurrentTree(double logl)2298 virtual void saveCurrentTree(double logl) {
2299 } // save current tree
2300
2301
2302 /**
2303 * Current score of the tree;
2304 */
2305 double curScore;
2306
2307 /** current best parsimony score */
2308 UINT best_pars_score;
2309
2310 /** cost_matrix for non-uniform parsimony */
2311 unsigned int * cost_matrix; // Sep 2016: store cost matrix in 1D array
2312
2313 };
2314
2315 #endif
2316