1 /** 2 * PLL (version 1.0.0) a software library for phylogenetic inference 3 * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis 4 * 5 * Derived from 6 * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic 7 * trees by Alexandros Stamatakis 8 * 9 * This program is free software: you can redistribute it and/or modify it 10 * under the terms of the GNU General Public License as published by the Free 11 * Software Foundation, either version 3 of the License, or (at your option) 12 * any later version. 13 * 14 * This program is distributed in the hope that it will be useful, but WITHOUT 15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 16 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 17 * more details. 18 * 19 * You should have received a copy of the GNU General Public License along with 20 * this program. If not, see <http://www.gnu.org/licenses/>. 21 * 22 * For any other enquiries send an Email to Tomas Flouri 23 * Tomas.Flouri@h-its.org 24 * 25 * When publishing work that uses PLL please cite PLL 26 * 27 * ABSTRACT 28 * 29 * PLL is a highly optimized, parallelized software library to ease the 30 * development of new software tools dealing with phylogenetic inference. Among 31 * the functions included in PLL are 32 * 33 * DOCUMENTATION 34 * 35 * Extensive documentation for using PLL is available online at 36 * 37 * http://www.libpll.org 38 * 39 * 40 * USAGE 41 * 42 * To use PLL, 43 * 44 * @file pll.h 45 * @brief Data structures for tree and model 46 * 47 * @author Tomas Flouri 48 * @author Fernando Izquierdo-Carrasco 49 * @author Andre Aberer 50 * @author Alexandros Stamatakis 51 */ 52 53 #ifndef __pll__ 54 #define __pll__ 55 56 #include <stdint.h> 57 #include <stdio.h> 58 #include <errno.h> 59 60 #ifdef __cplusplus 61 extern "C" { 62 #endif 63 64 #ifdef __MIC_NATIVE 65 #define PLL_BYTE_ALIGNMENT 64 66 #define PLL_VECTOR_WIDTH 8 67 #elif defined (__AVX) 68 69 #include <xmmintrin.h> 70 #include <immintrin.h> 71 #include <pmmintrin.h> 72 73 #define PLL_BYTE_ALIGNMENT 32 74 #define PLL_VECTOR_WIDTH 4 75 76 #elif defined (__SSE3) 77 78 #include <xmmintrin.h> 79 #include <pmmintrin.h> 80 81 #define PLL_BYTE_ALIGNMENT 16 82 #define PLL_VECTOR_WIDTH 2 83 84 #else 85 #define PLL_BYTE_ALIGNMENT 8 86 #define PLL_VECTOR_WIDTH 1 87 #endif 88 89 #ifdef _MSC_VER 90 #define PLL_ALIGN_BEGIN __declspec(align(PLL_BYTE_ALIGNMENT)) 91 #define PLL_ALIGN_END 92 #else 93 #define PLL_ALIGN_BEGIN 94 #define PLL_ALIGN_END __attribute__((aligned(PLL_BYTE_ALIGNMENT))) 95 #endif 96 97 98 #include "stack.h" 99 #include "newick.h" 100 #include "queue.h" 101 102 #define PLL_MAX_TIP_EV 0.999999999 /* max tip vector value, sum of EVs needs to be smaller than 1.0, otherwise the numerics break down */ 103 #define PLL_MAX_LOCAL_SMOOTHING_ITERATIONS 32 /** @brief maximum iterations of smoothings per insert in the */ 104 #define PLL_ITERATIONS 10 /* maximum iterations of iterations per insert */ 105 #define PLL_NEWZPERCYCLE 10 /* iterations of makenewz per tree traversal */ 106 #define PLL_NMLNGTH 256 /* number of characters in species name */ 107 #define PLL_DELTAZ 0.00001 /* test of net branch length change in update */ 108 #define PLL_DEFAULTZ 0.9 /* value of z assigned as starting point */ 109 #define PLL_UNLIKELY -1.0E300 /* low likelihood for initialization */ 110 #define PLL_SUMMARIZE_LENGTH -3 111 #define PLL_SUMMARIZE_LH -2 112 #define PLL_NO_BRANCHES -1 113 #define PLL_MASK_LENGTH 32 114 #define PLL_ZMIN 1.0E-15 /* max branch prop. to -log(PLL_ZMIN) (= 34) */ 115 #define PLL_ZMAX (1.0 - 1.0E-6) /* min branch prop. to 1.0-zmax (= 1.0E-6) */ 116 #define PLL_TWOTOTHE256 115792089237316195423570985008687907853269984665640564039457584007913129639936.0 /* 2**256 (exactly) */ 117 #define PLL_MINLIKELIHOOD (1.0/PLL_TWOTOTHE256) 118 #define PLL_MINUSMINLIKELIHOOD -PLL_MINLIKELIHOOD 119 120 121 #define PLL_FORMAT_PHYLIP 1 122 #define PLL_FORMAT_FASTA 2 123 #define PLL_FORMAT_NEWICK 3 124 125 #define PLL_NNI_P_NEXT 1 /**< Use p->next for the NNI move */ 126 #define PLL_NNI_P_NEXTNEXT 2 /**< Use p->next->next for the NNI move */ 127 128 #define PLL_BADREAR -1 129 130 #define PLL_NUM_BRANCHES 16 131 132 #define PLL_TRUE 1 133 #define PLL_FALSE 0 134 135 #define PLL_REARRANGE_SPR 0 136 #define PLL_REARRANGE_TBR 1 137 #define PLL_REARRANGE_NNI 2 138 139 #define PLL_AA_SCALE 10.0 140 #define PLL_AA_SCALE_PLUS_EPSILON 10.001 141 142 /* ALPHA_MIN is critical -> numerical instability, eg for 4 discrete rate cats */ 143 /* and alpha = 0.01 the lowest rate r_0 is */ 144 /* 0.00000000000000000000000000000000000000000000000000000000000034878079110511010487 */ 145 /* which leads to numerical problems Table for alpha settings below: */ 146 /* */ 147 /* 0.010000 0.00000000000000000000000000000000000000000000000000000000000034878079110511010487 */ 148 /* 0.010000 yielded nasty numerical bugs in at least one case ! */ 149 /* 0.020000 0.00000000000000000000000000000044136090435925743185910935350715027016962154188875 */ 150 /* 0.030000 0.00000000000000000000476844846859006690412039180149775802624789852441798419292220 */ 151 /* 0.040000 0.00000000000000049522423236954066431210260930029681736928018820007024736185030633 */ 152 /* 0.050000 0.00000000000050625351310359203371872643495343928538368616365517027588794007897377 */ 153 /* 0.060000 0.00000000005134625283884191118711474021861409372524676086868566926568746566772461 */ 154 /* 0.070000 0.00000000139080650074206434685544624965062437960128249869740102440118789672851562 */ 155 /* 0.080000 0.00000001650681201563587066858709818343436959153791576682124286890029907226562500 */ 156 /* 0.090000 0.00000011301977332931251259273962858978301859735893231118097901344299316406250000 */ 157 /* 0.100000 0.00000052651925834844387815526344648331402709118265192955732345581054687500000000 */ 158 159 #define PLL_ALPHA_MIN 0.02 160 #define PLL_ALPHA_MAX 1000.0 161 162 #define PLL_RATE_MIN 0.0000001 163 #define PLL_RATE_MAX 1000000.0 164 165 #define PLL_LG4X_RATE_MIN 0.0000001 166 #define PLL_LG4X_RATE_MAX 1000.0 167 168 #define PLL_FREQ_MIN 0.001 169 170 #define PLL_NUM_AA_STATES 20 171 #define PLL_NUM_DNA_STATES 4 172 173 /* 174 previous values between 0.001 and 0.000001 175 176 TO AVOID NUMERICAL PROBLEMS WHEN FREQ == 0 IN PARTITIONED MODELS, ESPECIALLY WITH AA 177 previous value of FREQ_MIN was: 0.000001, but this seemed to cause problems with some 178 of the 7-state secondary structure models with some rather exotic small toy test datasets, 179 on the other hand 0.001 caused problems with some of the 16-state secondary structure models 180 181 For some reason the frequency settings seem to be repeatedly causing numerical problems 182 */ 183 184 #define PLL_ITMAX 100 /* max number of iterations in brent's algorithm */ 185 186 #define PLL_SHFT(a,b,c,d) (a)=(b);(b)=(c);(c)=(d); 187 #define PLL_SIGN(a,b) ((b) > 0.0 ? fabs(a) : -fabs(a)) 188 #define PLL_ABS(x) (((x)<0) ? (-(x)) : (x)) 189 #define PLL_MIN(x,y) (((x)<(y)) ? (x) : (y)) 190 #define PLL_MAX(x,y) (((x)>(y)) ? (x) : (y)) 191 #define PLL_SWAP(x,y) do{ __typeof__ (x) _t = x; x = y; y = _t; } while(0) 192 #define PLL_SWAP_PTR(x,y) do{ char* _t = x; x = y; y = _t; } while(0) 193 #define PLL_SWAP_INT(x,y) do{ int _t = x; x = y; y = _t; } while(0) 194 195 #define PLL_POINT_GAMMA(prob,alpha,beta) PointChi2(prob,2.0*(alpha))/(2.0*(beta)) 196 197 #define PLL_LIB_NAME "PLL" 198 #define PLL_LIB_VERSION "1.0.1" 199 #define PLL_LIB_DATE "November 3 2014" 200 201 /* aminoacid substitution models */ 202 #define PLL_DAYHOFF 0 203 #define PLL_DCMUT 1 204 #define PLL_JTT 2 205 #define PLL_MTREV 3 206 #define PLL_WAG 4 207 #define PLL_RTREV 5 208 #define PLL_CPREV 6 209 #define PLL_VT 7 210 #define PLL_BLOSUM62 8 211 #define PLL_MTMAM 9 212 #define PLL_LG 10 213 #define PLL_MTART 11 214 #define PLL_MTZOA 12 215 #define PLL_PMB 13 216 #define PLL_HIVB 14 217 #define PLL_HIVW 15 218 #define PLL_JTTDCMUT 16 219 #define PLL_FLU 17 220 #define PLL_AUTO 18 221 #define PLL_LG4M 19 222 #define PLL_LG4X 20 223 #define PLL_GTR 21 /* GTR always needs to be the last one */ 224 #define PLL_NUM_PROT_MODELS 22 225 226 /* information criteria for auto protein model selection */ 227 #define PLL_AUTO_ML 0 228 #define PLL_AUTO_BIC 1 229 #define PLL_AUTO_AIC 2 230 #define PLL_AUTO_AICC 3 231 232 /* bipartition stuff */ 233 #define PLL_BIPARTITIONS_RF 4 234 235 /* scenarios for likelihood computation */ 236 #define PLL_TIP_TIP 0 237 #define PLL_TIP_INNER 1 238 #define PLL_INNER_INNER 2 239 240 241 /* available data types in PLL */ 242 #define PLL_MIN_MODEL -1 243 #define PLL_BINARY_DATA 0 244 #define PLL_DNA_DATA 1 245 #define PLL_AA_DATA 2 246 #define PLL_SECONDARY_DATA 3 247 #define PLL_SECONDARY_DATA_6 4 248 #define PLL_SECONDARY_DATA_7 5 249 #define PLL_GENERIC_32 6 250 #define PLL_GENERIC_64 7 251 #define PLL_MAX_MODEL 8 252 253 #define PLL_SEC_6_A 0 254 #define PLL_SEC_6_B 1 255 #define PLL_SEC_6_C 2 256 #define PLL_SEC_6_D 3 257 #define PLL_SEC_6_E 4 258 259 #define PLL_SEC_7_A 5 260 #define PLL_SEC_7_B 6 261 #define PLL_SEC_7_C 7 262 #define PLL_SEC_7_D 8 263 #define PLL_SEC_7_E 9 264 #define PLL_SEC_7_F 10 265 266 #define PLL_SEC_16 11 267 #define PLL_SEC_16_A 12 268 #define PLL_SEC_16_B 13 269 #define PLL_SEC_16_C 14 270 #define PLL_SEC_16_D 15 271 #define PLL_SEC_16_E 16 272 #define PLL_SEC_16_F 17 273 #define PLL_SEC_16_I 18 274 #define PLL_SEC_16_J 19 275 #define PLL_SEC_16_K 20 276 277 #define PLL_ORDERED_MULTI_STATE 0 278 #define PLL_MK_MULTI_STATE 1 279 #define PLL_GTR_MULTI_STATE 2 280 281 282 /* available models of rate heterogeneity in PLL */ 283 #define PLL_CAT 0 284 #define PLL_GAMMA 1 285 286 /* recomp */ 287 #define PLL_SLOT_UNUSED -2 /* value to mark an available vector */ 288 #define PLL_NODE_UNPINNED -3 /* marks an inner node as not available in RAM */ 289 #define PLL_INNER_NODE_INIT_STLEN -1 /* initialization */ 290 291 #define PLL_MIN_RECOM_FRACTION 0.1 /* at least this % of inner nodes will be allocated in RAM */ 292 #define PLL_MAX_RECOM_FRACTION 1.0 /* always 1, just there for boundary checks */ 293 294 295 typedef int pllBoolean; 296 297 /* @brief PLL instance attribute structure */ 298 typedef struct 299 { 300 int rateHetModel; 301 int fastScaling; 302 int saveMemory; 303 int useRecom; 304 long randomNumberSeed; 305 int numberOfThreads; 306 } pllInstanceAttr; 307 308 /** @brief Stores the recomputation-state of likelihood vectors */ 309 typedef struct 310 { 311 int numVectors; /**< Number of inner vectors allocated in RAM*/ 312 int *iVector; /**< size: numVectors, stores node id || PLL_SLOT_UNUSED */ 313 int *iNode; /**< size: inner nodes, stores slot id || PLL_NODE_UNPINNED */ 314 int *stlen; /**< Number of tips behind the current orientation of the indexed inner node (subtree size/cost) */ 315 int *unpinnable; /**< size:numVectors , TRUE if we dont need the vector */ 316 int maxVectorsUsed; 317 pllBoolean allSlotsBusy; /**< on if all slots contain an ancesctral node (the usual case after first full traversal) */ 318 } recompVectors; 319 /* E recomp */ 320 321 /** @brief ??? 322 * @todo add explanation, is this ever used? */ 323 324 typedef unsigned int hashNumberType; 325 326 327 328 /*typedef uint_fast32_t parsimonyNumber;*/ 329 330 #define PLL_PCF 32 331 332 /** @brief ???Hash tables 333 * @todo add explanation of all hash tables */ 334 typedef struct pllBipartitionEntry 335 { 336 unsigned int *bitVector; 337 unsigned int *treeVector; 338 unsigned int amountTips; 339 int *supportVector; 340 unsigned int bipNumber; 341 unsigned int bipNumber2; 342 unsigned int supportFromTreeset[2]; 343 struct pllBipartitionEntry *next; 344 } pllBipartitionEntry; 345 346 //typedef struct 347 //{ 348 // hashNumberType tableSize; 349 // entry **table; 350 // hashNumberType entryCount; 351 //} 352 // hashtable; 353 //struct stringEnt 354 //{ 355 // int nodeNumber; 356 // char *word; 357 // struct stringEnt *next; 358 //}; 359 // 360 //typedef struct stringEnt stringEntry; 361 //typedef struct 362 //{ 363 // hashNumberType tableSize; 364 // stringEntry **table; 365 //} 366 // stringHashtable; 367 368 typedef struct pllHashItem 369 { 370 void * data; 371 char * str; 372 struct pllHashItem * next; 373 } pllHashItem; 374 375 typedef struct pllHashTable 376 { 377 unsigned int size; 378 struct pllHashItem ** Items; 379 unsigned int entries; 380 } pllHashTable; 381 382 383 384 385 /** @brief Per-site Rate category entry: likelihood per-site and CAT rate applied ??? 386 * 387 */ 388 typedef struct ratec 389 { 390 double accumulatedSiteLikelihood; 391 double rate; 392 }rateCategorize; 393 394 /** @brief Traversal descriptor entry. 395 * 396 * Contains the information required to execute an operation in a step of the tree traversal. 397 * q r 398 * \ / 399 * p 400 * 401 * The entry defines 2 input/parent nodes (q and r) and one output/child node (p) 402 * qz represents the branch length(s) of the branch connecting q and p 403 * rz represents the branch length(s) of the branch connecting r and p 404 * PLL_TIP_TIP Both p and r are tips 405 * PLL_INNER_INNER Both p and r are inner nodes 406 * @note PLL_TIP_INNER q is a tip and r is an inner node (by convention, flip q and r if required) 407 */ 408 typedef struct 409 { 410 int tipCase; /**< Type of entry, must be PLL_TIP_TIP PLL_TIP_INNER or PLL_INNER_INNER */ 411 int pNumber; /**< should exist in some nodeptr p->number */ 412 int qNumber; /**< should exist in some nodeptr q->number */ 413 int rNumber; /**< should exist in some nodeptr r->number */ 414 double qz[PLL_NUM_BRANCHES]; 415 double rz[PLL_NUM_BRANCHES]; 416 /* recom */ 417 int slot_p; /**< In recomputation mode, the RAM slot index for likelihood vector of node p, otherwise unused */ 418 int slot_q; /**< In recomputation mode, the RAM slot index for likelihood vector of node q, otherwise unused */ 419 int slot_r; /**< In recomputation mode, the RAM slot index for likelihood vector of node r, otherwise unused */ 420 /* E recom */ 421 } traversalInfo; 422 423 /** @brief Traversal descriptor. 424 * 425 * Describes the state of a traversal descriptor 426 */ 427 typedef struct 428 { 429 traversalInfo *ti; /**< list of traversal steps */ 430 int count; /**< number of traversal steps */ 431 int functionType; 432 pllBoolean traversalHasChanged; 433 pllBoolean *executeModel; 434 double *parameterValues; 435 } traversalData; 436 437 /** @brief Node record structure 438 * 439 * Each inner node is a trifurcation in the tree represented as a circular list containing 3 node records. One node record uniquely identifies a subtree, and the orientation of the likelihood vector within a node 440 * 441 * p1 -------> p2 ----> to the next node 442 * ^ | 443 * |-----p3<---| 444 * 445 */ 446 struct noderec; 447 448 /** @brief Branch length information. 449 * 450 * @todo add relevant info on where this is used ??? 451 */ 452 typedef struct 453 { 454 unsigned int *vector; 455 int support; 456 struct noderec *oP; 457 struct noderec *oQ; 458 } branchInfo; 459 460 461 462 463 464 /** @brief Linkage of partitions. 465 * 466 * @todo add relevant info on where this is used ??? 467 */ 468 typedef struct 469 { 470 pllBoolean valid; 471 int partitions; 472 int *partitionList; 473 } 474 linkageData; 475 typedef struct 476 { 477 int entries; 478 linkageData* ld; 479 } 480 linkageList; 481 482 483 484 /** 485 * 486 * the data structure below is fundamental for representing trees 487 in the library! 488 489 Inner nodes are represented by three instances of the nodeptr data structure that is linked 490 via a cyclic list using the next pointer. 491 492 So for building an inner node of the tree we need to allocate three nodeptr 493 data structures and link them together, e.g.: 494 495 assuming that we have allocated space for an inner node 496 for nodeptr pointers p1, p2, p3, 497 498 we would then link them like this: 499 500 p1->next = p2; 501 p2->next = p3; 502 p3->next = p1; 503 504 also note that the node number that identifies the inner node 505 needs to be set to the same value. 506 507 for n taxa, tip nodes are enumarated/indexed from 1....n, 508 and inner node inbdices start at n+1. Assuming that we have 10 taxa 509 and this is our first inner node, we'd initialize the number as follows: 510 511 p1->number = 11; 512 p2->number = 11; 513 p3->number = 11; 514 515 Note that the node number is important for indexing tip sequence data as well as inner likelihood vectors 516 and that it is this number (the index) that actually gets stored in the traversal descriptor. 517 518 Tip nodes are non-cyclic nodes that simply consist of one instance/allocation of nodeptr. 519 520 if we have allocated a tip data structure nodeptr t1, 521 we would initialize it as follows: 522 523 t1->number = 1; 524 525 t1->next = NULL; 526 527 now let's assume that we want to build a four taxon tree with tips t1, t2, t3, t4 528 and inner nodes (p1,p2,p3) and (q1,q2,q3). 529 530 we first build the tips: 531 532 t1->number = 1; 533 t1->next = NULL; 534 535 t2->number = 2; 536 t2->next = NULL; 537 538 t3->number = 3; 539 t3->next = NULL; 540 541 t4->number = 4; 542 t4->next = NULL; 543 544 now the first inner node 545 546 p1->next = p2; 547 p2->next = p3; 548 p3->next = p1; 549 550 p1->number = 5; 551 p2->number = 5; 552 p3->number = 5; 553 554 and the second inner node. 555 556 q1->next = q2; 557 q2->next = q3; 558 q3->next = q1; 559 560 q1->number = 6; 561 q2->number = 6; 562 q3->number = 6; 563 564 now we need to link the nodes together such that they form a tree, let's assume we want ((t1,t2), (t3, t4)); 565 566 we will have to link the nodes via the so-called back pointer, 567 i.e.: 568 569 let's connect node p with t1 and t2 570 571 t1->back = p1; 572 t2->back = p2; 573 574 and vice versa: 575 576 p1->back = t1; 577 p2->back = t2; 578 579 let's connect node p with node q: 580 581 p3->back = q3; 582 583 and vice versa: 584 585 q3->back = p3; 586 587 and now let's connect node q with tips t3 and t4: 588 589 q1->back = t3; 590 q2->back = t4; 591 592 and vice versa: 593 594 t3->back = q1; 595 t4->back = q2; 596 597 What remains to be done is to set up the branch lengths. 598 Using the data structure below, we always have to store the 599 branch length twice for each "topological branch" unfortunately. 600 601 Assuming that we are only estimating a single branch across all partitions 602 we'd just set the first index of the branch length array z[PLL_NUM_BRANCHES]. 603 604 e.g., 605 606 t3->z[0] = q1->z[0] = 0.9; 607 608 the above operation for connecting nodes is implemented in functions hookup() which will set 609 the back pointers of two nodes that are to be connected as well as the branch lengths. 610 611 The branchInfo data field is a pointer to a data-structure that stores meta-data and requires 612 the tree not to change while it is being used. 613 614 Also, this pointer needs to be set by doing a full tree traversal on the tree. 615 616 Note that q1->bInf == t3->bInf in the above example. 617 618 The hash number is used for mapping bipartitions to a hash table as described in the following paper: 619 620 A. Aberer, N. Pattengale, A. Stamatakis: "Parallelized phylogenetic post-analysis on multi-core architectures". Journal of Computational Science 1, 107-114, 2010. 621 622 The support data field stores the support value for the branch associated with each nodeptr structure. 623 Note that support always refers to branches. 624 625 Thus for consistency, q3->support must be equal to p3->support; 626 627 Finally, the three char fields x, xPars and xBips are very very important! 628 629 They are used to denote the presence/absence or if you want, direction of the 630 parsimony, bipartition, or likelihood vector at a node with respect to the virtual root. 631 632 Essentially, they are just used as single presence/absence bits and ONLY for inner nodes! 633 634 When setting up new inner nodes, one of the three pointers in the cyclic list must 635 have x = 1 and the other two x = 0; 636 637 in the above example we could set: 638 639 p1->x = 0; 640 p2->x = 0; 641 p3->x = 1; 642 643 q1->x = 0; 644 q2->x = 0; 645 q3->x = 1; 646 647 This would mean that the virtual root is located at the inner branch of the four taxon tree ((t1,t2),(t3,t4)); 648 649 When we re-root the tree at some other branch we need to update the location of the x pointer that is set to 1. 650 651 This means if we root the tree at the branch leading to t1 we would set 652 653 p1->x = 1; 654 p2->x = 0; 655 p3->x = 0; 656 657 the values for q remaon unchanged since q3 is still pointing toward the root. 658 659 When we re-locate the root to branch p1 <-> t1 the fact that we have to "rotate" the x value that is set to 1 660 to another node of the cyclic list representing the abstract topological node p, also tells us that we 661 need to re-compute the conditional likelihood array for p. 662 663 Note that, only one likelihood or parsimony array is stored per inner node and the location of x essentially tells us which subtree 664 it summarizes, if p1->x == 1, it summarizes subtree (t2, (t3, t4)), if p3->x = 1 the likelihood vector associated with 665 node p summarizes subtree (t1, t2). 666 667 @todo I think we should rename the back pointer. It's not back, it can be forward depending on the orientation. We should renmae it to outer. Back is too confusing, I would assume it's the opposite of next, i.e. previous. 668 669 @struct noderec 670 671 @brief Tree node record 672 673 A node in a tree is a structure which contains a cyclic list of pointers to 3 nodes which we call a \e roundabout. The first node is the structure itself, and the other two nodes are accessed via \a noderec->next and \a noderec->next->next. To access the outer node with which each of the 3 nodes forms an edge one has to use the \a back pointer 674 675 @var noderec::next 676 @brief Next node in the roundabout 677 678 @var noderec::back 679 @brief Outer node 680 681 @var noderec::number 682 @brief Node identifier 683 684 In general, tips (i.e. leaves) are numbered from 1 to \e n where \e n is the number of taxa. Identifiers for internal nodes start from \e n + 1. Note 685 that for a given inner node, the identifier must be the same for all 3 nodes that compose it. 686 687 @var info::z 688 @brief The branch lengths per partition for the main node in the roundabout 689 690 @todo Append an image 691 */ 692 typedef struct noderec 693 { 694 695 branchInfo *bInf; 696 double z[PLL_NUM_BRANCHES]; 697 struct noderec *next; 698 struct noderec *back; 699 hashNumberType hash; 700 int support; 701 int number; 702 char x; 703 char xPars; 704 char xBips; 705 } 706 node, *nodeptr; 707 708 typedef unsigned int parsimonyNumber; 709 710 /* @brief Alignment, transition model, model of rate heterogenety and likelihood vectors for one partition. 711 * 712 * @todo De-couple into smaller data structures 713 * 714 * ALIGNMENT DATA 715 * This depends only on the type of data in this partition of the alignment 716 * 717 * MODEL OF RATE HETEROGENETY, We use either GAMMA or PSR 718 * Rate heterogenety: Per Site Categories (PSR) model aka CAT, 719 * Rate of site i is given by perSiteRates[rateCategory[i]] 720 * 721 * TRANSITION MODEL: We always assume General Time Reversibility 722 * Transistion probability matrix: P(t) = exp(Qt) 723 * Branch length t is the expected number of substitutions per site 724 * Pij(t) is the probability of going from state i to state j in a branch of length t 725 * Relative substitution rates (Entries in the Q matrix) 726 * In GTR we can write Q = S * D, where S is a symmetrical matrix and D a diagonal with the state frequencies 727 728 @var protModels 729 @brief Protein models 730 731 @detail Detailed protein models descriptiopn 732 733 @var autoProtModels 734 @brief Auto prot models 735 @detail Detailed auto prot models 736 */ 737 738 739 740 /** @struct pInfo 741 742 @brief Partition information structure 743 744 This data structure encapsulates all properties and auxiliary variables that together 745 consist a partition. 746 747 @var pInfo::dataType 748 @brief Type of data this partition contains 749 750 Can be DNA (\b PLL_DNA_DATA) or AminoAcid (\b PLL_AA_DATA) data 751 752 @var pInfo::states 753 @brief Number of states 754 755 Number of states this type of data can consist of 756 757 @var pInfo::maxTipStates 758 @brief Number of undetermined states (possible states at the tips) 759 760 This is the total number of possible states that can appear in the alignment. This includes degenerate (undetermined) bases 761 762 @var pInfo::partitionName 763 @brief Name of partition 764 765 A null-terminated string describing the name of partition 766 767 @var pInfo::lower 768 @brief Position of the first site in the alignment that is part of this partition [1, tr->originalCrunchedLength] 769 770 @var pInfo::upper 771 @brief Position of the last site that is part of this partition plus one (i.e. position of the first site that is not part of this partition) 772 773 @var pInfo::width 774 @brief Number of sites in the partition (i.e. \a upper - \a lower) 775 776 @var pInfo::wgt 777 @brief Weight of site 778 779 Number of times this particular site appeared in the partition before the duplicates were removed and replaced by this weight 780 781 @var pInfo::empiricalFrequencies 782 @brief Empirical frequency of each state in the current partition 783 784 @var pInfo::perSiteRates 785 @brief Per Site Categories (PSR) or aka CAT values for each rate 786 787 @var pInfo::rateCategory 788 @brief CAT category index for each site 789 790 @var pInfo::numberOfCategories 791 @brief CAT size of the set of possible categories 792 793 @var pInfo::alpha 794 @brief Gamma parameter to be optimized 795 796 @var pInfo::gammaRates 797 @brief Values of the 4 gamma categories (rates) computed given an alpha 798 799 @var pInfo::substRates 800 @brief Entries of substitution matrix, e.g. 6 free parameters in DNA 801 802 In GTR we can write \f$ Q = S * D \f$, where \f$ S \f$ is a symmetrical matrix and \f$ D \f$ a diagonal with the state frequencies, 803 which is represented by the array \a frequencies. The symmetrical matrix is the array \a substRates 804 805 @var pInfo::frequencies 806 @brief State frequencies, entries in D are initialized as empiricalFrequencies 807 808 In GTR we can write \f$ Q = S * D \f$, where \f$ S \f$ is a symmetrical matrix and \f$ D \f$ a diagonal with the state frequencies, 809 which is represented by the array \a frequencies. The symmetrical matrix is the array \a substRates 810 811 @var pInfo::freqExponents 812 813 @var pInfo::EIGN 814 @brief Eigenvalues of Q matrix 815 816 @var pInfo::EV 817 @brief Eigenvectors of Q matrix 818 819 @var pInfo::EI 820 @brief Inverse eigenvectors of Q matrix 821 822 @var pInfo::left 823 @brief P matrix for the left term of the conditional likelihood equation 824 825 @var pInfo::right 826 @brief P matrix for the right term of the conditional likelihood equation 827 828 @var pInfo::tipVector 829 @brief Precomputed (based on current P matrix) conditional likelihood vectors for every possible base 830 831 @var pInfo::EIGN_LG4 832 @brief Eigenvalues of Q matrix for the LG4 model 833 834 @var pInfo::EV_LG4 835 @brief Eigenvectors of Q matrix for the LG4 model 836 837 @var pInfo::EI_LG4 838 @brief Inverse eigenvectors of Q matrix for the LG4 model 839 840 @var pInfo::frequencies_LG4 841 @brief State frequencies for the LG4 model 842 843 @var pInfo::tipVector_LG4 844 @brief Precomputed (based on current P matrix) conditional likelihood vectors for every possible base for the LG4 model 845 846 @var pInfo::substRates_LG4 847 @brief Entries of substitution matrix for the LG4 model 848 849 @var pInfo::protModels 850 @brief Protein model for current partition 851 852 In case \a pInfo::dataType is set to \a PLL_AA_DATA then \a protModels indicates the index in the global array \a protModels 853 of the protein model that the current partition uses. 854 855 @var pInfo::autoProtModels 856 @brief Best fitted protein model for the \b PLL_AUTO partitions 857 858 If \a protModels is set to \b PLL_AUTO then \a autoProtModels holds the currently detected best fitting protein model for the partition 859 860 @var pInfo::protUseEmpiricalFreqs 861 862 @var pInfo::nonGTR 863 864 @var pInfo::optimizeBaseFrequencies 865 866 @var pInfo::optimizeAlphaParameter 867 868 @var pInfo::optimizeSubstitutionRates 869 870 @var pInfo::symmetryVector 871 872 @var pInfo::frequencyGrouping 873 874 875 @todo 876 Document freqExponents 877 878 */ 879 880 881 882 typedef struct { 883 int dataType; 884 int states; 885 int maxTipStates; 886 char *partitionName; 887 int lower; 888 int upper; 889 int width; 890 int *wgt; 891 double *empiricalFrequencies; 892 893 894 /* MODEL OF RATE HETEROGENETY, We use either GAMMA or PSR */ 895 /* Rate heterogenety: Per Site Categories (PSR) model aka CAT, see updatePerSiteRates() */ 896 /* Rate of site i is given by perSiteRates[rateCategory[i]] */ 897 double *perSiteRates; 898 int *rateCategory; 899 int numberOfCategories; 900 /* Rate heterogenety: GAMMA model of rate heterogenety */ 901 double alpha; 902 double *gammaRates; 903 904 905 /* TRANSITION MODEL: We always assume General Time Reversibility */ 906 /* Transistion probability matrix: P(t) = exp(Qt)*/ 907 /* Branch length t is the expected number of substitutions per site */ 908 /* Pij(t) is the probability of going from state i to state j in a branch of length t */ 909 /* Relative substitution rates (Entries in the Q matrix) */ 910 /* In GTR we can write Q = S * D, where S is a symmetrical matrix and D a diagonal with the state frequencies */ 911 double *substRates; /**< TRANSITION MODEL Entries in S, e.g. 6 free parameters in DNA */ 912 double *frequencies; /**< State frequencies, entries in D, are initialized as empiricalFrequencies */ 913 double *freqExponents; 914 /* Matrix decomposition: @todo map this syntax to Explanation of the mathematical background */ 915 double *EIGN; 916 double *EV; 917 double *EI; 918 double *left; 919 double *right; 920 double *tipVector; 921 922 923 /* asc bias */ 924 pllBoolean ascBias; 925 int ascOffset; 926 int * ascExpVector; 927 double * ascSumBuffer; 928 double * ascVector; 929 double ascScaler[64]; 930 931 /* LG4 */ 932 933 double *EIGN_LG4[4]; 934 double *EV_LG4[4]; 935 double *EI_LG4[4]; 936 937 double *frequencies_LG4[4]; 938 double *tipVector_LG4[4]; 939 double *substRates_LG4[4]; 940 941 /* LG4X */ 942 943 double lg4x_weights[4]; 944 double lg4x_weightExponents[4]; 945 double lg4x_weightsBuffer[4]; 946 double lg4x_weightExponentsBuffer[4]; 947 double lg4x_weightLikelihood; 948 949 /* Protein specific */ 950 int protModels; /**< Empirical model matrix */ 951 int autoProtModels; /**< Model selected with "auto" protein model */ 952 int protUseEmpiricalFreqs; /**< Whether to use empirical frequencies for protein model */ 953 954 pllBoolean nonGTR; 955 pllBoolean optimizeBaseFrequencies; /**< Whether to optimize base frequencies */ 956 pllBoolean optimizeAlphaParameter; /**< Whether to optimize alpha parameters and gamma rates */ 957 pllBoolean optimizeSubstitutionRates; /**< Whether to optimize substitution rates */ 958 int *symmetryVector; /**< Specify linkage between substitution rate parameters */ 959 int *frequencyGrouping; 960 961 /* LIKELIHOOD VECTORS */ 962 963 /* partial LH Inner vectors ancestral vectors, we have 2*tips - 3 inner nodes */ 964 double **xVector; /**< Conditional likelihood vectors for inner nodes */ 965 unsigned char **yVector; /**< Tip entries (sequence) for tip nodes */ 966 unsigned int *globalScaler; /**< Counters for scaling operations done at node i */ 967 968 /* data structures for conducting per-site likelihood scaling. 969 this allows to compute the per-site log likelihood scores 970 needed for RELL-based bootstrapping and all sorts of statistical 971 tests for comparing trees ! */ 972 int **expVector; /**< @brief An entry per inner node. Each element is an array of size the number of sites in the current partition and represents how many times the respective site has been scaled in the subtree rooted at the current node */ 973 size_t *expSpaceVector; /**< @brief Each entry represents an inner node and states the size of the corresponding element in \a expVector, which is the number of sites for the current partition */ 974 975 /* These are for the saveMemory option (tracking gaps to skip computations and memory) */ 976 size_t *xSpaceVector; /* Size of conditional likelihood vectors per inner node */ 977 int gapVectorLength; /** Length of \a gapVector bitvector in unsigned integers assuming that \a unsigned \a int is 32bits. It is set to partition size / 32 */ 978 unsigned int *gapVector; /** A bit vector of size \a gapVectorLength * 32 bits. A bit is set to 1 if the corresponding */ 979 double *gapColumn; 980 981 /* Parsimony vectors at each node */ 982 size_t parsimonyLength; 983 parsimonyNumber *parsVect; 984 parsimonyNumber *perSiteParsScores; 985 986 /* This buffer of size width is used to store intermediate values for the branch length optimization under 987 newton-raphson. The data in here can be re-used for all iterations irrespective of the branch length. 988 */ 989 double *sumBuffer; 990 991 /* Buffer to store the per-site log likelihoods */ 992 double *perSiteLikelihoods; 993 994 /* This buffer of size width is used to store the ancestral state at a node of the tree. */ 995 double *ancestralBuffer; 996 997 /* From tree */ 998 pllBoolean executeModel; 999 double fracchange; 1000 double rawFracchange; 1001 double partitionContribution; 1002 double partitionWeight; 1003 double partitionLH; 1004 1005 // #if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI)) 1006 int partitionAssignment; 1007 // #endif 1008 1009 } pInfo; 1010 1011 typedef struct 1012 { 1013 pInfo **partitionData; 1014 int numberOfPartitions; 1015 pllBoolean perGeneBranchLengths; 1016 pllBoolean dirty; 1017 linkageList *alphaList; 1018 linkageList *rateList; 1019 linkageList *freqList; 1020 } partitionList; 1021 1022 1023 1024 #define PLL_REARR_SETTING 1 1025 #define PLL_FAST_SPRS 2 1026 #define PLL_SLOW_SPRS 3 1027 1028 1029 /** @brief Checkpointing states. 1030 * 1031 * @todo Raxml specific 1032 */ 1033 typedef struct { 1034 1035 int state; 1036 1037 /*unsigned int vLength;*/ 1038 double accumulatedTime; 1039 int rearrangementsMax; 1040 int rearrangementsMin; 1041 int thoroughIterations; 1042 int fastIterations; 1043 int mintrav; 1044 int maxtrav; 1045 int bestTrav; 1046 double startLH; 1047 double lh; 1048 double previousLh; 1049 double difference; 1050 double epsilon; 1051 pllBoolean impr; 1052 pllBoolean cutoff; 1053 1054 double tr_startLH; 1055 double tr_endLH; 1056 double tr_likelihood; 1057 double tr_bestOfNode; 1058 double tr_lhCutoff; 1059 double tr_lhAVG; 1060 double tr_lhDEC; 1061 int tr_NumberOfCategories; 1062 int tr_itCount; 1063 int tr_doCutoff; 1064 int tr_thoroughInsertion; 1065 int tr_optimizeRateCategoryInvocations; 1066 1067 /* prevent users from doing stupid things */ 1068 1069 int searchConvergenceCriterion; 1070 int rateHetModel; 1071 int maxCategories; 1072 int NumberOfModels; 1073 int numBranches; 1074 int originalCrunchedLength; 1075 int mxtips; 1076 char seq_file[1024]; 1077 } checkPointState; 1078 1079 1080 1081 /* recomp */ 1082 #ifdef _DEBUG_RECOMPUTATION 1083 typedef struct { 1084 unsigned long int numTraversals; 1085 unsigned long int tt; 1086 unsigned long int ti; 1087 unsigned long int ii; 1088 unsigned int *travlenFreq; 1089 } traversalCounter; 1090 #endif 1091 /* E recomp */ 1092 1093 1094 /** @brief Tree topology. 1095 * 1096 * @todo Apart from the topology this structure contains several fields that act like global variables in raxml 1097 */ 1098 typedef struct { 1099 1100 int *ti; 1101 1102 /* recomp */ 1103 recompVectors *rvec; /**< this data structure tracks which vectors store which nodes */ 1104 float maxMegabytesMemory; /**< User says how many MB in main memory should be used */ 1105 float vectorRecomFraction; /**< vectorRecomFraction ~= 0.8 * maxMegabytesMemory */ 1106 pllBoolean useRecom; /**< ON if we apply recomputation of ancestral vectors*/ 1107 #ifdef _DEBUG_RECOMPUTATION 1108 traversalCounter *travCounter; 1109 double stlenTime; 1110 #endif 1111 /* E recomp */ 1112 1113 pllBoolean fastScaling; 1114 pllBoolean saveMemory; 1115 int startingTree; 1116 long randomNumberSeed; 1117 1118 double *lhs; /**< Array to store per-site log likelihoods of \a originalCrunchedLength (compressed) sites */ 1119 double *patrat; /**< rates per pattern */ 1120 double *patratStored; 1121 int *rateCategory; 1122 int *aliaswgt; /**< weight by pattern */ 1123 pllBoolean manyPartitions; 1124 1125 pllBoolean grouped; /**< No idea what this is, but is always set to PLL_FALSE */ 1126 pllBoolean constrained; /**< No idea what this is, but is always set to PLL_FALSE */ 1127 int threadID; 1128 volatile int numberOfThreads; 1129 1130 //#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI)) 1131 1132 unsigned char *y_ptr; 1133 1134 double lower_spacing; 1135 double upper_spacing; 1136 1137 double *ancestralVector; 1138 1139 //#endif 1140 1141 pllHashTable *nameHash; 1142 char ** tipNames; 1143 1144 char *secondaryStructureInput; 1145 1146 traversalData td[1]; 1147 1148 int maxCategories; 1149 int categories; 1150 1151 double coreLZ[PLL_NUM_BRANCHES]; 1152 1153 1154 branchInfo *bInf; 1155 1156 int multiStateModel; 1157 1158 1159 pllBoolean curvatOK[PLL_NUM_BRANCHES]; 1160 1161 /* the stuff below is shared among DNA and AA, span does 1162 not change depending on datatype */ 1163 1164 /* model stuff end */ 1165 unsigned char **yVector; /**< list of raw sequences (parsed from the alignment)*/ 1166 1167 int secondaryStructureModel; 1168 int originalCrunchedLength; /**< Length of alignment after removing duplicate sites in each partition */ 1169 1170 int *secondaryStructurePairs; 1171 1172 double fracchange; /**< Average substitution rate */ 1173 double rawFracchange; 1174 double lhCutoff; 1175 double lhAVG; 1176 unsigned long lhDEC; 1177 unsigned long itCount; 1178 int numberOfInvariableColumns; 1179 int weightOfInvariableColumns; 1180 int rateHetModel; 1181 1182 double startLH; 1183 double endLH; 1184 double likelihood; /**< last likelihood value evaluated for the current topology */ 1185 1186 node **nodep; /**< pointer to the list of nodes, which describe the current topology */ 1187 nodeptr nodeBaseAddress; 1188 node *start; /**< starting node by default for full traversals (must be a tip contained in the tree we are operating on) */ 1189 int mxtips; /**< Number of tips in the topology */ 1190 1191 int *constraintVector; /**< @todo What is this? */ 1192 int numberOfSecondaryColumns; 1193 pllBoolean searchConvergenceCriterion; 1194 int ntips; 1195 int nextnode; 1196 1197 pllBoolean bigCutoff; 1198 pllBoolean partitionSmoothed[PLL_NUM_BRANCHES]; 1199 pllBoolean partitionConverged[PLL_NUM_BRANCHES]; 1200 pllBoolean rooted; 1201 pllBoolean doCutoff; 1202 1203 double gapyness; 1204 1205 char **nameList; /**< list of tips names (read from the phylip file) */ 1206 char *tree_string; /**< the newick representaion of the topology */ 1207 char *tree0; 1208 char *tree1; 1209 int treeStringLength; 1210 1211 unsigned int bestParsimony; 1212 unsigned int *parsimonyScore; 1213 1214 double bestOfNode; 1215 nodeptr removeNode; /**< the node that has been removed. Together with \a insertNode represents an SPR move */ 1216 nodeptr insertNode; /**< the node where insertion should take place . Together with \a removeNode represents an SPR move*/ 1217 1218 double zqr[PLL_NUM_BRANCHES]; 1219 double currentZQR[PLL_NUM_BRANCHES]; 1220 1221 double currentLZR[PLL_NUM_BRANCHES]; 1222 double currentLZQ[PLL_NUM_BRANCHES]; 1223 double currentLZS[PLL_NUM_BRANCHES]; 1224 double currentLZI[PLL_NUM_BRANCHES]; 1225 double lzs[PLL_NUM_BRANCHES]; 1226 double lzq[PLL_NUM_BRANCHES]; 1227 double lzr[PLL_NUM_BRANCHES]; 1228 double lzi[PLL_NUM_BRANCHES]; 1229 1230 1231 unsigned int **bitVectors; 1232 1233 unsigned int vLength; 1234 1235 pllHashTable *h; /**< hashtable for ML convergence criterion */ 1236 //hashtable *h; 1237 1238 int optimizeRateCategoryInvocations; 1239 1240 checkPointState ckp; 1241 pllBoolean thoroughInsertion; /**< true if the neighbor branches should be optimized when a subtree is inserted (slower)*/ 1242 pllBoolean useMedian; 1243 1244 int autoProteinSelectionType; 1245 1246 pllStack * rearrangeHistory; 1247 1248 1249 /* analdef defines */ 1250 /* TODO: Do some initialization */ 1251 int bestTrav; /**< best rearrangement radius */ 1252 int max_rearrange; /**< max. rearrangemenent radius */ 1253 int stepwidth; /**< step in rearrangement radius */ 1254 int initial; /**< user defined rearrangement radius which also sets bestTrav if initialSet is set */ 1255 pllBoolean initialSet; /**< set bestTrav according to initial */ 1256 int mode; /**< candidate for removal */ 1257 pllBoolean perGeneBranchLengths; 1258 pllBoolean permuteTreeoptimize; /**< randomly select subtrees for SPR moves */ 1259 pllBoolean compressPatterns; 1260 double likelihoodEpsilon; 1261 pllBoolean useCheckpoint; 1262 1263 } pllInstance; 1264 1265 /** @brief Stores data related to a NNI move */ 1266 typedef struct { 1267 pllInstance * tr; 1268 nodeptr p; 1269 int nniType; 1270 double z[PLL_NUM_BRANCHES]; // optimize branch lengths 1271 double z0[PLL_NUM_BRANCHES]; // unoptimized branch lengths 1272 double likelihood; 1273 double deltaLH; 1274 } nniMove; 1275 1276 /***************************************************************/ 1277 1278 typedef struct { 1279 int partitionNumber; 1280 int partitionLength; 1281 } partitionType; 1282 1283 typedef struct 1284 { 1285 double z[PLL_NUM_BRANCHES]; 1286 nodeptr p, q; 1287 int cp, cq; 1288 } 1289 connectRELL, *connptrRELL; 1290 1291 typedef struct 1292 { 1293 connectRELL *connect; 1294 int start; 1295 double likelihood; 1296 } 1297 topolRELL; 1298 1299 1300 typedef struct 1301 { 1302 int max; 1303 topolRELL **t; 1304 } 1305 topolRELL_LIST; 1306 1307 /**************************************************************/ 1308 1309 /** @brief Connection within a topology. 1310 * */ 1311 typedef struct conntyp { 1312 double z[PLL_NUM_BRANCHES]; /**< branch length */ 1313 node *p, *q; /**< parent and child sectors */ 1314 void *valptr; /**< pointer to value of subtree */ 1315 int descend; /**< pointer to first connect of child */ 1316 int sibling; /**< next connect from same parent */ 1317 } pllConnect, *connptr; 1318 1319 /** @brief Single Topology 1320 * */ 1321 typedef struct { 1322 double likelihood; 1323 int initialTreeNumber; 1324 pllConnect *links; /**< pointer to first connect (start) */ 1325 node *start; 1326 int nextlink; /**< index of next available connect */ 1327 /**< tr->start = tpl->links->p */ 1328 int ntips; 1329 int nextnode; /**< next available inner node for tree parsing */ 1330 int scrNum; /**< position in sorted list of scores */ 1331 int tplNum; /**< position in sorted list of trees */ 1332 } topol; 1333 1334 /** @brief small helper data structure for printing out/downstream use of marginal ancestral probability vectors. 1335 * 1336 * it is allocated as an array that has the same length as the input alignment and can be used to 1337 * index the ancestral states for each position/site/pattern 1338 * */ 1339 typedef struct { 1340 double *probs; /**< marginal ancestral states */ 1341 char c; /**< most likely stated, i.e. max(probs[i]) above */ 1342 int states; /**< number of states for this position */ 1343 } ancestralState; 1344 1345 /** @brief List of topologies 1346 * 1347 * */ 1348 typedef struct { 1349 double best; /**< highest score saved */ 1350 double worst; /**< lowest score saved */ 1351 topol *start; /**< starting tree for optimization */ 1352 topol **byScore; 1353 topol **byTopol; 1354 int nkeep; /**< maximum topologies to save */ 1355 int nvalid; /**< number of topologies saved */ 1356 int ninit; /**< number of topologies initialized */ 1357 int numtrees; /**< number of alternatives tested */ 1358 pllBoolean improved; 1359 } bestlist; 1360 1361 /** @brief This is used to look up some hard-coded data for each data type 1362 * */ 1363 typedef struct 1364 { 1365 int leftLength; /**< s^2 */ 1366 int rightLength;/**< s^2 */ 1367 int eignLength;/**< s */ 1368 int evLength; 1369 int eiLength; 1370 int substRatesLength; /**< (s^2 - s)/2 free model parameters for matrix Q i.e. substitution rates */ 1371 int frequenciesLength; /**< s frequency of each state */ 1372 int tipVectorLength; /* ??? */ 1373 int symmetryVectorLength; 1374 int frequencyGroupingLength; 1375 1376 pllBoolean nonGTR; 1377 pllBoolean optimizeBaseFrequencies; 1378 1379 int undetermined; 1380 1381 const char *inverseMeaning; 1382 1383 int states; /* s */ 1384 1385 pllBoolean smoothFrequencies; 1386 1387 const unsigned int *bitVector; 1388 1389 } partitionLengths; 1390 1391 typedef struct 1392 { 1393 int rearrangeType; 1394 double likelihood; 1395 1396 union { 1397 struct { 1398 double * zp; 1399 double * zpn; 1400 double * zpnn; 1401 double * zqr; 1402 nodeptr pn; 1403 nodeptr pnn; 1404 nodeptr r; 1405 nodeptr p; 1406 nodeptr q; 1407 } SPR; 1408 struct { 1409 nodeptr origin; 1410 int swapType; 1411 double z[PLL_NUM_BRANCHES]; 1412 } NNI; 1413 }; 1414 } pllRollbackInfo; 1415 1416 1417 /** @struct pllRearrangeAttr 1418 1419 @brief Structure holding attributes for searching possible tree rearrangements 1420 1421 Holds the attributes for performing tree rearrangements. 1422 1423 @var pllRearrangeAttr 1424 The origin node where the search should start 1425 1426 @var pllRearrangeAttr:mintrav 1427 The minimum radius around the origin node \a p for which nodes should be tested 1428 1429 @var pllRearrangeAttr:maxtrav 1430 The maximum radius around the origin node \a p for which nodes should be tested 1431 1432 @var pllRearrangeAttr:max 1433 Maximum number of results to be returned 1434 */ 1435 typedef struct 1436 { 1437 nodeptr p; 1438 int mintrav; 1439 int maxtrav; 1440 } pllRearrangeAttr; 1441 1442 /** @typedef pllRearrangeInfo 1443 1444 @brief Tree rearrangement information structure 1445 1446 Holds information for conducting tree arrangements. This structure 1447 is the result of a tree arrangement search under given search 1448 attributes. 1449 1450 @var pllRearrangeInfo::rearrangeType 1451 Type of rearrangement. Can be \b PLL_REARRANGE_SPR, \b PLL_REARRANGE_NNI or 1452 \b PLL_REARRANGE_TBR 1453 1454 @var pllRearrangeInfo::likelihood 1455 Holds the computed likelihood for the addressed rearrangement 1456 1457 @var pllRearrangeInfo::SPR::removeNode 1458 Node where to perform subtree pruning 1459 1460 @var pllRearrangeInfo::SPR::insertNode 1461 Node where to place the pruned subtree 1462 1463 @var pllRearrangeInfo::zqr 1464 Holds the computed branch lengths after the SPR 1465 */ 1466 typedef struct 1467 { 1468 int rearrangeType; 1469 double likelihood; 1470 union { 1471 struct { 1472 nodeptr removeNode; 1473 nodeptr insertNode; 1474 double zqr[PLL_NUM_BRANCHES]; 1475 } SPR; 1476 struct { 1477 nodeptr originNode; 1478 int swapType; 1479 } NNI; 1480 }; 1481 } pllRearrangeInfo; 1482 1483 1484 typedef struct 1485 { 1486 int max_entries; 1487 int entries; 1488 pllRearrangeInfo * rearr; 1489 } pllRearrangeList; 1490 1491 /** @brief Generic structure for storing a multiple sequence alignment */ 1492 typedef struct 1493 { 1494 int sequenceCount; /**< @brief Number of sequences */ 1495 int sequenceLength; /**< @brief Length of sequences */ 1496 int originalSeqLength; /**< @brief Original length of sequences (not modified after removing duplicates) */ 1497 char ** sequenceLabels; /**< @brief An array of where the \a i-th element is the name of the \a i-th sequence */ 1498 unsigned char ** sequenceData; /**< @brief The actual sequence data */ 1499 int * siteWeights; /**< @brief An array where the \a i-th element indicates how many times site \a i appeared (prior to duplicates removal) in the alignment */ 1500 } pllAlignmentData; 1501 1502 1503 /******************** START OF API FUNCTION DESCRIPTIONS ********************/ 1504 1505 #if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI)) 1506 pllBoolean isThisMyPartition(partitionList *pr, int tid, int model); 1507 void printParallelTimePerRegion(void); 1508 #endif 1509 1510 #ifdef _FINE_GRAIN_MPI 1511 extern void pllFinalizeMPI (void); 1512 #endif 1513 1514 1515 1516 /** 1517 * @brief Create the main instance of PLL 1518 * 1519 * Create an instance of the phylogenetic likelihood library 1520 * 1521 * @param rateHetModel Rate heterogeneity model 1522 * @param fastScaling TODO: explain fastScaling here 1523 * @param saveMemory TODO: explain saveMemory here 1524 * @param useRecom If set to \b PLL_TRUE, enables ancestral state recomputation 1525 * 1526 * @todo Document fastScaling, rate heterogeneity and saveMemory and useRecom 1527 * 1528 * @note Do not set \a saveMemory to when using \a useRecom as memory saving 1529 * techniques are not yet implemented for ancestral state recomputation. 1530 * 1531 * @return On success returns an instance to PLL, otherwise \b NULL 1532 */ 1533 extern pllInstance * pllCreateInstance (pllInstanceAttr * pInst); 1534 1535 /** 1536 * @ingroup instanceLinkingGroup 1537 * @brief Load alignment to the PLL instance 1538 * 1539 * Loads (copies) the parsed alignment \a alignmentData to the PLL instance 1540 * as a deep copy. 1541 * 1542 * @param tr The library instance 1543 * @param alignmentData The multiple sequence alignment 1544 * @param partitions List of partitions 1545 * 1546 * @return Returns 1 in case of success, 0 otherwise. 1547 */ 1548 extern int pllLoadAlignment (pllInstance * tr, 1549 pllAlignmentData * alignmentData, 1550 partitionList * pList); 1551 1552 /** 1553 * @brief Compute the empirical base frequencies for all partitions 1554 * 1555 * Compute the empirical base frequencies for all partitions in the list \a pl. 1556 * 1557 * @param pl Partition list 1558 * @param alignmentData Multiple sequence alignment 1559 * 1560 * @return A list of \a pl->numberOfPartitions arrays each of size 1561 \a pl->partitionData[i]->states, where \a i is the \a i-th partition 1562 */ 1563 extern double ** pllBaseFrequenciesAlignment (pllAlignmentData * alignmentData, partitionList * pl); 1564 extern double ** pllBaseFrequenciesInstance (pllInstance * tr, partitionList * pl); 1565 1566 /* pthreads and MPI */ 1567 extern void pllStartPthreads (pllInstance *tr, partitionList *pr); 1568 extern void pllStopPthreads (pllInstance * tr); 1569 extern void pllLockMPI (pllInstance * tr); 1570 extern void pllInitMPI(int * argc, char **argv[]); 1571 1572 1573 /* handling branch lengths*/ 1574 extern double pllGetBranchLength (pllInstance *tr, nodeptr p, int partition_id); 1575 extern void pllSetBranchLength (pllInstance *tr, nodeptr p, int partition_id, double bl); 1576 extern int pllNniSearch(pllInstance * tr, partitionList *pr, int estimateModel); 1577 extern void pllOptimizeBranchLengths ( pllInstance *tr, partitionList *pr, int maxSmoothIterations ); 1578 1579 1580 extern void pllEvaluateLikelihood (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean fullTraversal, pllBoolean getPerSiteLikelihoods); 1581 extern void pllUpdatePartials (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean masked); 1582 extern void pllUpdatePartialsAncestral(pllInstance *tr, partitionList *pr, nodeptr p); 1583 extern void pllNewviewIterative(pllInstance *tr, partitionList *pr, int startIndex); 1584 extern void pllEvaluateIterative(pllInstance *tr, partitionList *pr, pllBoolean getPerSiteLikelihoods); 1585 1586 /* newick parser declarations */ 1587 extern pllNewickTree * pllNewickParseString (const char * newick); 1588 extern pllNewickTree * pllNewickParseFile (const char * filename); 1589 extern int pllValidateNewick (pllNewickTree *); 1590 extern void pllNewickParseDestroy (pllNewickTree **); 1591 extern int pllNewickUnroot (pllNewickTree * t); 1592 extern char * pllTreeToNewick ( char *treestr, pllInstance *tr, partitionList *pr, nodeptr p, 1593 pllBoolean printBranchLengths, pllBoolean printNames, pllBoolean printLikelihood, 1594 pllBoolean rellTree, pllBoolean finalPrint, int perGene, 1595 pllBoolean branchLabelSupport, pllBoolean printSHSupport); 1596 1597 /* partition parser declarations */ 1598 extern void pllQueuePartitionsDestroy (pllQueue ** partitions); 1599 extern pllQueue * pllPartitionParse (const char * filename); 1600 extern pllQueue * pllPartitionParseString (const char * p); 1601 extern void pllPartitionDump (pllQueue * partitions); 1602 void pllBaseSubstitute (pllInstance * tr, partitionList * partitions); 1603 //void pllBaseSubstitute (pllAlignmentData * tr, partitionList * partitions); 1604 partitionList * pllPartitionsCommit (pllQueue * parts, pllAlignmentData * alignmentData); 1605 int pllPartitionsValidate (pllQueue * parts, pllAlignmentData * alignmentData); 1606 extern void pllAlignmentRemoveDups (pllAlignmentData * alignmentData, partitionList * pl); 1607 void pllPartitionsDestroy (pllInstance *, partitionList **); 1608 1609 /* alignment data declarations */ 1610 extern void pllAlignmentDataDestroy (pllAlignmentData *); 1611 extern int pllAlignmentDataDumpFile (pllAlignmentData *, int, const char *); 1612 extern void pllAlignmentDataDumpConsole (pllAlignmentData * alignmentData); 1613 extern pllAlignmentData * pllInitAlignmentData (int, int); 1614 extern pllAlignmentData * pllParseAlignmentFile (int fileType, const char *); 1615 extern pllAlignmentData *pllParsePHYLIPString (const char *rawdata, long filesize); 1616 1617 1618 /* model management */ 1619 int pllInitModel (pllInstance *, partitionList *); 1620 void pllInitReversibleGTR(pllInstance * tr, partitionList * pr, int model); 1621 void pllMakeGammaCats(double alpha, double *gammaRates, int K, pllBoolean useMedian); 1622 int pllLinkAlphaParameters(char *string, partitionList *pr); 1623 int pllLinkFrequencies(char *string, partitionList *pr); 1624 int pllLinkRates(char *string, partitionList *pr); 1625 int pllSetSubstitutionRateMatrixSymmetries(char *string, partitionList * pr, int model); 1626 void pllSetFixedAlpha(double alpha, int model, partitionList * pr, pllInstance *tr); 1627 void pllSetFixedBaseFrequencies(double *f, int length, int model, partitionList * pr, pllInstance *tr); 1628 int pllSetOptimizeBaseFrequencies(int model, partitionList * pr, pllInstance *tr); 1629 void pllSetSubstitutionMatrix(double *q, int length, int model, partitionList * pr, pllInstance *tr); 1630 void pllSetFixedSubstitutionMatrix(double *q, int length, int model, partitionList * pr, pllInstance *tr); 1631 int pllGetInstRateMatrix (partitionList * pr, int model, double * outBuffer); 1632 int pllOptimizeModelParameters(pllInstance *tr, partitionList *pr, double likelihoodEpsilon); 1633 double pllGetAlpha (partitionList * pr, int pid); 1634 void pllGetGammaRates (partitionList * pr, int pid, double * outBuffer); 1635 extern void pllGetBaseFrequencies(partitionList * pr, int model, double * outBuffer); 1636 extern void pllGetSubstitutionMatrix (partitionList * pr, int model, double * outBuffer); 1637 void pllEmpiricalFrequenciesDestroy (double *** empiricalFrequencies, int models); 1638 extern void pllOptRatesGeneric(pllInstance *tr, partitionList *pr, double modelEpsilon, linkageList *ll); 1639 extern void pllOptBaseFreqs(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll); 1640 extern void pllOptAlphasGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll); 1641 extern void pllOptLG4X(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels); 1642 1643 /* tree topology */ 1644 void pllTreeInitTopologyNewick (pllInstance *, pllNewickTree *, int); 1645 void pllTreeInitTopologyRandom (pllInstance * tr, int tips, char ** nameList); 1646 void pllTreeInitTopologyForAlignment (pllInstance * tr, pllAlignmentData * alignmentData); 1647 extern void pllMakeRandomTree ( pllInstance *tr); 1648 void pllMakeParsimonyTree(pllInstance *tr); 1649 extern void pllMakeParsimonyTreeFast(pllInstance *tr, partitionList *pr, int sprDist); 1650 void pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInstance * tr, partitionList * partitions, int sprDist); 1651 nodeptr pllGetRandomSubtree(pllInstance *); 1652 extern void pllFreeParsimonyDataStructures(pllInstance *tr, partitionList *pr); 1653 void pllDestroyInstance (pllInstance *); 1654 extern void pllGetAncestralState(pllInstance *tr, partitionList *pr, nodeptr p, double * outProbs, char * outSequence); 1655 unsigned int pllEvaluateParsimony(pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean full, pllBoolean perSiteScores); 1656 void pllInitParsimonyStructures(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores); 1657 1658 /* rearrange functions (NNI and SPR) */ 1659 pllRearrangeList * pllCreateRearrangeList (int max); 1660 void pllDestroyRearrangeList (pllRearrangeList ** bestList); 1661 void pllRearrangeSearch (pllInstance * tr, partitionList * pr, int rearrangeType, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList); 1662 void pllRearrangeCommit (pllInstance * tr, partitionList * pr, pllRearrangeInfo * rearr, int saveRollbackInfo); 1663 int pllRearrangeRollback (pllInstance * tr, partitionList * pr); 1664 void pllClearRearrangeHistory (pllInstance * tr); 1665 int pllRaxmlSearchAlgorithm (pllInstance * tr, partitionList * pr, pllBoolean estimateModel); 1666 int pllGetTransitionMatrix (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer); 1667 void pllGetTransitionMatrix2 (pllInstance * tr, partitionList * pr, int model, nodeptr p, double * outBuffer); 1668 int pllGetCLV (pllInstance * tr, partitionList * pr, nodeptr p, int partition, double * outProbs); 1669 extern int pllTopologyPerformNNI(pllInstance * tr, nodeptr p, int swap); 1670 1671 /* hash functions */ 1672 unsigned int pllHashString (const char * s, unsigned int size); 1673 int pllHashAdd (pllHashTable * hTable, unsigned int hash, const char * s, void * item); 1674 pllHashTable * pllHashInit (unsigned int n); 1675 int pllHashSearch (struct pllHashTable * hTable, char * s, void ** item); 1676 void pllHashDestroy (struct pllHashTable ** hTable, void (*cbDealloc)(void *)); 1677 1678 /* node specific functions */ 1679 nodeptr pllGetOrientedNodePointer (pllInstance * pInst, nodeptr p); 1680 1681 /* other functions */ 1682 extern char * pllReadFile (const char *, long *); 1683 extern int * pllssort1main (char ** x, int n); 1684 extern node ** pllGetInnerBranchEndPoints (pllInstance * tr); 1685 1686 /* ---------------- */ 1687 1688 #ifdef __cplusplus 1689 } /* extern "C" */ 1690 #endif 1691 1692 #endif 1693