1 /**
2  * PLL (version 1.0.0) a software library for phylogenetic inference
3  * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
4  *
5  * Derived from
6  * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
7  * trees by Alexandros Stamatakis
8  *
9  * This program is free software: you can redistribute it and/or modify it
10  * under the terms of the GNU General Public License as published by the Free
11  * Software Foundation, either version 3 of the License, or (at your option)
12  * any later version.
13  *
14  * This program is distributed in the hope that it will be useful, but WITHOUT
15  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
17  * more details.
18  *
19  * You should have received a copy of the GNU General Public License along with
20  * this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  * For any other enquiries send an Email to Tomas Flouri
23  * Tomas.Flouri@h-its.org
24  *
25  * When publishing work that uses PLL please cite PLL
26  *
27  * ABSTRACT
28  *
29  * PLL is a highly optimized, parallelized software library to ease the
30  * development of new software tools dealing with phylogenetic inference. Among
31  * the functions included in PLL are
32  *
33  * DOCUMENTATION
34  *
35  * Extensive documentation for using PLL is available online at
36  *
37  *                 http://www.libpll.org
38  *
39  *
40  * USAGE
41  *
42  * To use PLL,
43  *
44  * @file pll.h
45  * @brief Data structures for tree and model
46  *
47  * @author Tomas Flouri
48  * @author Fernando Izquierdo-Carrasco
49  * @author Andre Aberer
50  * @author Alexandros Stamatakis
51  */
52 
53 #ifndef __pll__
54 #define __pll__
55 
56 #include <stdint.h>
57 #include <stdio.h>
58 #include <errno.h>
59 
60 #ifdef __cplusplus
61 extern "C" {
62 #endif
63 
64 #ifdef __MIC_NATIVE
65 #define PLL_BYTE_ALIGNMENT 64
66 #define PLL_VECTOR_WIDTH 8
67 #elif defined (__AVX)
68 
69 #include <xmmintrin.h>
70 #include <immintrin.h>
71 #include <pmmintrin.h>
72 
73 #define PLL_BYTE_ALIGNMENT 32
74 #define PLL_VECTOR_WIDTH 4
75 
76 #elif defined (__SSE3)
77 
78 #include <xmmintrin.h>
79 #include <pmmintrin.h>
80 
81 #define PLL_BYTE_ALIGNMENT 16
82 #define PLL_VECTOR_WIDTH 2
83 
84 #else
85 #define PLL_BYTE_ALIGNMENT 8
86 #define PLL_VECTOR_WIDTH 1
87 #endif
88 
89 #ifdef _MSC_VER
90 	#define PLL_ALIGN_BEGIN __declspec(align(PLL_BYTE_ALIGNMENT))
91 	#define PLL_ALIGN_END
92 #else
93 	#define PLL_ALIGN_BEGIN
94 	#define PLL_ALIGN_END __attribute__((aligned(PLL_BYTE_ALIGNMENT)))
95 #endif
96 
97 
98 #include "stack.h"
99 #include "newick.h"
100 #include "queue.h"
101 
102 #define PLL_MAX_TIP_EV                          0.999999999 /* max tip vector value, sum of EVs needs to be smaller than 1.0, otherwise the numerics break down */
103 #define PLL_MAX_LOCAL_SMOOTHING_ITERATIONS      32          /** @brief maximum iterations of smoothings per insert in the */
104 #define PLL_ITERATIONS                          10          /* maximum iterations of iterations per insert */
105 #define PLL_NEWZPERCYCLE                        10           /* iterations of makenewz per tree traversal */
106 #define PLL_NMLNGTH                             256         /* number of characters in species name */
107 #define PLL_DELTAZ                              0.00001     /* test of net branch length change in update */
108 #define PLL_DEFAULTZ                            0.9         /* value of z assigned as starting point */
109 #define PLL_UNLIKELY                            -1.0E300    /* low likelihood for initialization */
110 #define PLL_SUMMARIZE_LENGTH                    -3
111 #define PLL_SUMMARIZE_LH                        -2
112 #define PLL_NO_BRANCHES                         -1
113 #define PLL_MASK_LENGTH                         32
114 #define PLL_ZMIN                                1.0E-15  /* max branch prop. to -log(PLL_ZMIN) (= 34) */
115 #define PLL_ZMAX                                (1.0 - 1.0E-6) /* min branch prop. to 1.0-zmax (= 1.0E-6) */
116 #define PLL_TWOTOTHE256                         115792089237316195423570985008687907853269984665640564039457584007913129639936.0  /*  2**256 (exactly)  */
117 #define PLL_MINLIKELIHOOD                       (1.0/PLL_TWOTOTHE256)
118 #define PLL_MINUSMINLIKELIHOOD                  -PLL_MINLIKELIHOOD
119 
120 
121 #define PLL_FORMAT_PHYLIP                       1
122 #define PLL_FORMAT_FASTA                        2
123 #define PLL_FORMAT_NEWICK                       3
124 
125 #define PLL_NNI_P_NEXT                          1       /**< Use p->next for the NNI move */
126 #define PLL_NNI_P_NEXTNEXT                      2       /**< Use p->next->next for the NNI move */
127 
128 #define PLL_BADREAR                             -1
129 
130 #define PLL_NUM_BRANCHES                        16
131 
132 #define PLL_TRUE                                1
133 #define PLL_FALSE                               0
134 
135 #define PLL_REARRANGE_SPR                       0
136 #define PLL_REARRANGE_TBR                       1
137 #define PLL_REARRANGE_NNI                       2
138 
139 #define PLL_AA_SCALE                            10.0
140 #define PLL_AA_SCALE_PLUS_EPSILON               10.001
141 
142 /* ALPHA_MIN is critical -> numerical instability, eg for 4 discrete rate cats                    */
143 /* and alpha = 0.01 the lowest rate r_0 is                                                        */
144 /* 0.00000000000000000000000000000000000000000000000000000000000034878079110511010487             */
145 /* which leads to numerical problems Table for alpha settings below:                              */
146 /*                                                                                                */
147 /* 0.010000 0.00000000000000000000000000000000000000000000000000000000000034878079110511010487    */
148 /* 0.010000 yielded nasty numerical bugs in at least one case !                                   */
149 /* 0.020000 0.00000000000000000000000000000044136090435925743185910935350715027016962154188875    */
150 /* 0.030000 0.00000000000000000000476844846859006690412039180149775802624789852441798419292220    */
151 /* 0.040000 0.00000000000000049522423236954066431210260930029681736928018820007024736185030633    */
152 /* 0.050000 0.00000000000050625351310359203371872643495343928538368616365517027588794007897377    */
153 /* 0.060000 0.00000000005134625283884191118711474021861409372524676086868566926568746566772461    */
154 /* 0.070000 0.00000000139080650074206434685544624965062437960128249869740102440118789672851562    */
155 /* 0.080000 0.00000001650681201563587066858709818343436959153791576682124286890029907226562500    */
156 /* 0.090000 0.00000011301977332931251259273962858978301859735893231118097901344299316406250000    */
157 /* 0.100000 0.00000052651925834844387815526344648331402709118265192955732345581054687500000000    */
158 
159 #define PLL_ALPHA_MIN                           0.02
160 #define PLL_ALPHA_MAX                           1000.0
161 
162 #define PLL_RATE_MIN                            0.0000001
163 #define PLL_RATE_MAX                            1000000.0
164 
165 #define PLL_LG4X_RATE_MIN                       0.0000001
166 #define PLL_LG4X_RATE_MAX                       1000.0
167 
168 #define PLL_FREQ_MIN                            0.001
169 
170 #define PLL_NUM_AA_STATES                       20
171 #define PLL_NUM_DNA_STATES                      4
172 
173 /*
174    previous values between 0.001 and 0.000001
175 
176    TO AVOID NUMERICAL PROBLEMS WHEN FREQ == 0 IN PARTITIONED MODELS, ESPECIALLY WITH AA
177    previous value of FREQ_MIN was: 0.000001, but this seemed to cause problems with some
178    of the 7-state secondary structure models with some rather exotic small toy test datasets,
179    on the other hand 0.001 caused problems with some of the 16-state secondary structure models
180 
181    For some reason the frequency settings seem to be repeatedly causing numerical problems
182 */
183 
184 #define PLL_ITMAX                               100    /* max number of iterations in brent's algorithm */
185 
186 #define PLL_SHFT(a,b,c,d)                       (a)=(b);(b)=(c);(c)=(d);
187 #define PLL_SIGN(a,b)                           ((b) > 0.0 ? fabs(a) : -fabs(a))
188 #define PLL_ABS(x)                              (((x)<0)   ?  (-(x)) : (x))
189 #define PLL_MIN(x,y)                            (((x)<(y)) ?    (x)  : (y))
190 #define PLL_MAX(x,y)                            (((x)>(y)) ?    (x)  : (y))
191 #define PLL_SWAP(x,y)                           do{ __typeof__ (x) _t = x; x = y; y = _t; } while(0)
192 #define PLL_SWAP_PTR(x,y) do{ char* _t = x; x = y; y = _t; } while(0)
193 #define PLL_SWAP_INT(x,y) do{ int _t = x; x = y; y = _t; } while(0)
194 
195 #define PLL_POINT_GAMMA(prob,alpha,beta)        PointChi2(prob,2.0*(alpha))/(2.0*(beta))
196 
197 #define PLL_LIB_NAME                            "PLL"
198 #define PLL_LIB_VERSION                         "1.0.1"
199 #define PLL_LIB_DATE                            "November 3 2014"
200 
201 /* aminoacid substitution models */
202 #define PLL_DAYHOFF                             0
203 #define PLL_DCMUT                               1
204 #define PLL_JTT                                 2
205 #define PLL_MTREV                               3
206 #define PLL_WAG                                 4
207 #define PLL_RTREV                               5
208 #define PLL_CPREV                               6
209 #define PLL_VT                                  7
210 #define PLL_BLOSUM62                            8
211 #define PLL_MTMAM                               9
212 #define PLL_LG                                  10
213 #define PLL_MTART                               11
214 #define PLL_MTZOA                               12
215 #define PLL_PMB                                 13
216 #define PLL_HIVB                                14
217 #define PLL_HIVW                                15
218 #define PLL_JTTDCMUT                            16
219 #define PLL_FLU                                 17
220 #define PLL_AUTO                                18
221 #define PLL_LG4M                                 19
222 #define PLL_LG4X                                20
223 #define PLL_GTR                                 21  /* GTR always needs to be the last one */
224 #define PLL_NUM_PROT_MODELS                     22
225 
226 /* information criteria for auto protein model selection */
227 #define PLL_AUTO_ML   0
228 #define PLL_AUTO_BIC  1
229 #define PLL_AUTO_AIC  2
230 #define PLL_AUTO_AICC 3
231 
232 /* bipartition stuff */
233 #define PLL_BIPARTITIONS_RF                     4
234 
235 /* scenarios for likelihood computation */
236 #define PLL_TIP_TIP                             0
237 #define PLL_TIP_INNER                           1
238 #define PLL_INNER_INNER                         2
239 
240 
241 /* available data types in PLL */
242 #define PLL_MIN_MODEL                          -1
243 #define PLL_BINARY_DATA                         0
244 #define PLL_DNA_DATA                            1
245 #define PLL_AA_DATA                             2
246 #define PLL_SECONDARY_DATA                      3
247 #define PLL_SECONDARY_DATA_6                    4
248 #define PLL_SECONDARY_DATA_7                    5
249 #define PLL_GENERIC_32                          6
250 #define PLL_GENERIC_64                          7
251 #define PLL_MAX_MODEL                           8
252 
253 #define PLL_SEC_6_A                             0
254 #define PLL_SEC_6_B                             1
255 #define PLL_SEC_6_C                             2
256 #define PLL_SEC_6_D                             3
257 #define PLL_SEC_6_E                             4
258 
259 #define PLL_SEC_7_A                             5
260 #define PLL_SEC_7_B                             6
261 #define PLL_SEC_7_C                             7
262 #define PLL_SEC_7_D                             8
263 #define PLL_SEC_7_E                             9
264 #define PLL_SEC_7_F                             10
265 
266 #define PLL_SEC_16                              11
267 #define PLL_SEC_16_A                            12
268 #define PLL_SEC_16_B                            13
269 #define PLL_SEC_16_C                            14
270 #define PLL_SEC_16_D                            15
271 #define PLL_SEC_16_E                            16
272 #define PLL_SEC_16_F                            17
273 #define PLL_SEC_16_I                            18
274 #define PLL_SEC_16_J                            19
275 #define PLL_SEC_16_K                            20
276 
277 #define PLL_ORDERED_MULTI_STATE                 0
278 #define PLL_MK_MULTI_STATE                      1
279 #define PLL_GTR_MULTI_STATE                     2
280 
281 
282 /* available models of rate heterogeneity in PLL */
283 #define PLL_CAT                                 0
284 #define PLL_GAMMA                               1
285 
286 /* recomp */
287 #define PLL_SLOT_UNUSED                        -2  /* value to mark an available vector */
288 #define PLL_NODE_UNPINNED                      -3  /* marks an inner node as not available in RAM */
289 #define PLL_INNER_NODE_INIT_STLEN              -1  /* initialization */
290 
291 #define PLL_MIN_RECOM_FRACTION     0.1 /* at least this % of inner nodes will be allocated in RAM */
292 #define PLL_MAX_RECOM_FRACTION     1.0 /* always 1, just there for boundary checks */
293 
294 
295 typedef  int pllBoolean;
296 
297 /* @brief PLL instance attribute structure */
298 typedef struct
299 {
300   int rateHetModel;
301   int fastScaling;
302   int saveMemory;
303   int useRecom;
304   long randomNumberSeed;
305   int numberOfThreads;
306 } pllInstanceAttr;
307 
308 /** @brief Stores the recomputation-state of likelihood vectors  */
309 typedef struct
310 {
311   int numVectors;      /**< Number of inner vectors allocated in RAM*/
312   int *iVector;        /**< size: numVectors, stores node id || PLL_SLOT_UNUSED  */
313   int *iNode;          /**< size: inner nodes, stores slot id || PLL_NODE_UNPINNED */
314   int *stlen;          /**< Number of tips behind the current orientation of the indexed inner node (subtree size/cost) */
315   int *unpinnable;     /**< size:numVectors , TRUE if we dont need the vector */
316   int maxVectorsUsed;
317   pllBoolean allSlotsBusy; /**< on if all slots contain an ancesctral node (the usual case after first full traversal) */
318 } recompVectors;
319 /* E recomp */
320 
321 /** @brief ???
322  * @todo add explanation, is this ever used?  */
323 
324 typedef unsigned int hashNumberType;
325 
326 
327 
328 /*typedef uint_fast32_t parsimonyNumber;*/
329 
330 #define PLL_PCF 32
331 
332 /** @brief ???Hash tables
333  * @todo add explanation of all hash tables  */
334 typedef struct pllBipartitionEntry
335 {
336   unsigned int *bitVector;
337   unsigned int *treeVector;
338   unsigned int amountTips;
339   int *supportVector;
340   unsigned int bipNumber;
341   unsigned int bipNumber2;
342   unsigned int supportFromTreeset[2];
343   struct pllBipartitionEntry *next;
344 } pllBipartitionEntry;
345 
346 //typedef struct
347 //{
348 //  hashNumberType tableSize;
349 //  entry **table;
350 //  hashNumberType entryCount;
351 //}
352 //  hashtable;
353 //struct stringEnt
354 //{
355 //  int nodeNumber;
356 //  char *word;
357 //  struct stringEnt *next;
358 //};
359 //
360 //typedef struct stringEnt stringEntry;
361 //typedef struct
362 //{
363 //  hashNumberType tableSize;
364 //  stringEntry **table;
365 //}
366 //  stringHashtable;
367 
368 typedef struct pllHashItem
369 {
370   void * data;
371   char * str;
372   struct pllHashItem * next;
373 } pllHashItem;
374 
375 typedef struct pllHashTable
376 {
377   unsigned int size;
378   struct pllHashItem ** Items;
379   unsigned int entries;
380 } pllHashTable;
381 
382 
383 
384 
385 /** @brief Per-site Rate category entry: likelihood per-site and CAT rate applied ???
386   *
387   */
388 typedef struct ratec
389 {
390   double accumulatedSiteLikelihood;
391   double rate;
392 }rateCategorize;
393 
394 /** @brief Traversal descriptor entry.
395   *
396   * Contains the information required to execute an operation in a step of the tree traversal.
397   * q   r
398   *  \ /
399   *   p
400   *
401   * The entry defines 2 input/parent nodes (q and r) and one output/child node (p)
402   * qz represents the branch length(s) of the branch connecting q and p
403   * rz represents the branch length(s) of the branch connecting r and p
404   * PLL_TIP_TIP     Both p and r are tips
405   * PLL_INNER_INNER Both p and r are inner nodes
406   * @note PLL_TIP_INNER   q is a tip and r is an inner node (by convention, flip q and r if required)
407   */
408 typedef struct
409 {
410   int tipCase;                  /**< Type of entry, must be PLL_TIP_TIP PLL_TIP_INNER or PLL_INNER_INNER */
411   int pNumber;                  /**< should exist in some nodeptr p->number */
412   int qNumber;                  /**< should exist in some nodeptr q->number */
413   int rNumber;                  /**< should exist in some nodeptr r->number */
414   double qz[PLL_NUM_BRANCHES];
415   double rz[PLL_NUM_BRANCHES];
416   /* recom */
417   int slot_p;                   /**< In recomputation mode, the RAM slot index for likelihood vector of node p, otherwise unused */
418   int slot_q;                   /**< In recomputation mode, the RAM slot index for likelihood vector of node q, otherwise unused */
419   int slot_r;                   /**< In recomputation mode, the RAM slot index for likelihood vector of node r, otherwise unused */
420   /* E recom */
421 } traversalInfo;
422 
423 /** @brief Traversal descriptor.
424   *
425   * Describes the state of a traversal descriptor
426   */
427 typedef struct
428 {
429   traversalInfo *ti;              /**< list of traversal steps */
430   int count;                      /**< number of traversal steps */
431   int functionType;
432   pllBoolean traversalHasChanged;
433   pllBoolean *executeModel;
434   double  *parameterValues;
435 } traversalData;
436 
437 /** @brief Node record structure
438   *
439   * Each inner node is a trifurcation in the tree represented as a circular list containing 3 node records. One node record uniquely identifies a subtree, and the orientation of the likelihood vector within a node
440   *
441   * p1 -------> p2 ----> to the next node
442   * ^           |
443   * |-----p3<---|
444   *
445   */
446 struct noderec;
447 
448 /** @brief Branch length information.
449   *
450   * @todo add relevant info on where this is used ???
451   */
452 typedef struct
453 {
454   unsigned int *vector;
455   int support;
456   struct noderec *oP;
457   struct noderec *oQ;
458 } branchInfo;
459 
460 
461 
462 
463 
464 /** @brief Linkage of partitions.
465   *
466   * @todo add relevant info on where this is used ???
467   */
468 typedef struct
469 {
470   pllBoolean valid;
471   int partitions;
472   int *partitionList;
473 }
474   linkageData;
475 typedef struct
476 {
477   int entries;
478   linkageData* ld;
479 }
480   linkageList;
481 
482 
483 
484   /**
485    *
486    * the data structure below is fundamental for representing trees
487      in the library!
488 
489      Inner nodes are represented by three instances of the nodeptr data structure that is linked
490      via a cyclic list using the next pointer.
491 
492      So for building an inner node of the tree we need to allocate three nodeptr
493      data structures and link them together, e.g.:
494 
495      assuming that we have allocated space for an inner node
496      for nodeptr pointers p1, p2, p3,
497 
498      we would then link them like this:
499 
500      p1->next = p2;
501      p2->next = p3;
502      p3->next = p1;
503 
504      also note that the node number that identifies the inner node
505      needs to be set to the same value.
506 
507      for n taxa, tip nodes are enumarated/indexed from 1....n,
508      and inner node inbdices start at n+1. Assuming that we have 10 taxa
509      and this is our first inner node, we'd initialize the number as follows:
510 
511      p1->number = 11;
512      p2->number = 11;
513      p3->number = 11;
514 
515      Note that the node number is important for indexing tip sequence data as well as inner likelihood vectors
516      and that it is this number (the index) that actually gets stored in the traversal descriptor.
517 
518      Tip nodes are non-cyclic nodes that simply consist of one instance/allocation of nodeptr.
519 
520      if we have allocated a tip data structure nodeptr t1,
521      we would initialize it as follows:
522 
523      t1->number = 1;
524 
525      t1->next = NULL;
526 
527      now let's assume that we want to build a four taxon tree with tips t1, t2, t3, t4
528      and inner nodes (p1,p2,p3) and (q1,q2,q3).
529 
530      we first build the tips:
531 
532      t1->number = 1;
533      t1->next = NULL;
534 
535      t2->number = 2;
536      t2->next = NULL;
537 
538      t3->number = 3;
539      t3->next = NULL;
540 
541      t4->number = 4;
542      t4->next = NULL;
543 
544      now the first inner node
545 
546      p1->next = p2;
547      p2->next = p3;
548      p3->next = p1;
549 
550      p1->number = 5;
551      p2->number = 5;
552      p3->number = 5;
553 
554      and the second inner node.
555 
556      q1->next = q2;
557      q2->next = q3;
558      q3->next = q1;
559 
560      q1->number = 6;
561      q2->number = 6;
562      q3->number = 6;
563 
564      now we need to link the nodes together such that they form a tree, let's assume we want ((t1,t2), (t3, t4));
565 
566      we will have to link the nodes via the so-called back pointer,
567      i.e.:
568 
569      let's connect node p with t1 and t2
570 
571      t1->back = p1;
572      t2->back = p2;
573 
574      and vice versa:
575 
576      p1->back = t1;
577      p2->back = t2;
578 
579      let's connect node p with node q:
580 
581      p3->back = q3;
582 
583      and vice versa:
584 
585      q3->back = p3;
586 
587      and now let's connect node q with tips t3 and t4:
588 
589      q1->back = t3;
590      q2->back = t4;
591 
592      and vice versa:
593 
594      t3->back = q1;
595      t4->back = q2;
596 
597      What remains to be done is to set up the branch lengths.
598      Using the data structure below, we always have to store the
599      branch length twice for each "topological branch" unfortunately.
600 
601      Assuming that we are only estimating a single branch across all partitions
602      we'd just set the first index of the branch length array z[PLL_NUM_BRANCHES].
603 
604      e.g.,
605 
606      t3->z[0] = q1->z[0] = 0.9;
607 
608      the above operation for connecting nodes is implemented in functions hookup() which will set
609      the back pointers of two nodes that are to be connected as well as the branch lengths.
610 
611      The branchInfo data field is a pointer to a data-structure that stores meta-data and requires
612      the tree not to change while it is being used.
613 
614      Also, this pointer needs to be set by doing a full tree traversal on the tree.
615 
616      Note that q1->bInf == t3->bInf in the above example.
617 
618      The hash number is used for mapping bipartitions to a hash table as described in the following paper:
619 
620      A. Aberer, N. Pattengale, A. Stamatakis: "Parallelized phylogenetic post-analysis on multi-core architectures". Journal of Computational Science 1, 107-114, 2010.
621 
622      The support data field stores the support value for the branch associated with each nodeptr structure.
623      Note that support always refers to branches.
624 
625      Thus for consistency, q3->support must be equal to p3->support;
626 
627      Finally, the three char fields x, xPars and xBips are very very important!
628 
629      They are used to denote the presence/absence or if you want, direction of the
630      parsimony, bipartition, or likelihood vector at a node with respect to the virtual root.
631 
632      Essentially, they are just used as single presence/absence bits and ONLY for inner nodes!
633 
634      When setting up new inner nodes, one of the three pointers in the cyclic list must
635      have x = 1 and the other two x = 0;
636 
637      in the above example we could set:
638 
639      p1->x = 0;
640      p2->x = 0;
641      p3->x = 1;
642 
643      q1->x = 0;
644      q2->x = 0;
645      q3->x = 1;
646 
647      This would mean that the virtual root is located at the inner branch of the four taxon tree ((t1,t2),(t3,t4));
648 
649      When we re-root the tree at some other branch we need to update the location of the x pointer that is set to 1.
650 
651      This means if we root the tree at the branch leading to t1 we would set
652 
653      p1->x = 1;
654      p2->x = 0;
655      p3->x = 0;
656 
657      the values for q remaon unchanged since q3 is still pointing toward the root.
658 
659      When we re-locate the root to branch p1 <-> t1 the fact that we have to "rotate" the x value that is set to 1
660      to another node of the cyclic list representing the abstract topological node p, also tells us that we
661      need to re-compute the conditional likelihood array for p.
662 
663      Note that, only one likelihood or parsimony array is stored per inner node and the location of x essentially tells us which subtree
664      it summarizes, if p1->x == 1, it summarizes subtree (t2, (t3, t4)), if p3->x = 1 the likelihood vector associated with
665      node p summarizes subtree (t1, t2).
666 
667      @todo I think we should rename the back pointer. It's not back, it can be forward depending on the orientation. We should renmae it to outer. Back is too confusing, I would assume it's the opposite of next, i.e. previous.
668 
669      @struct noderec
670 
671      @brief Tree node record
672 
673      A node in a tree is a structure which contains a cyclic list of pointers to 3 nodes which we call a \e roundabout. The first node is the structure itself, and the other two nodes are accessed via \a noderec->next and \a noderec->next->next. To access the outer node with which each of the 3 nodes forms an edge one has to use the \a back pointer
674 
675      @var noderec::next
676      @brief Next node in the roundabout
677 
678      @var noderec::back
679      @brief Outer node
680 
681      @var noderec::number
682      @brief Node identifier
683 
684      In general, tips (i.e. leaves) are numbered from 1 to \e n where \e n is the number of taxa. Identifiers for internal nodes start from \e n + 1. Note
685      that for a given inner node, the identifier must be the same for all 3 nodes that compose it.
686 
687      @var info::z
688      @brief The branch lengths per partition for the main node in the roundabout
689 
690      @todo Append an image
691   */
692 typedef  struct noderec
693 {
694 
695   branchInfo      *bInf;
696   double           z[PLL_NUM_BRANCHES];
697   struct noderec  *next;
698   struct noderec  *back;
699   hashNumberType   hash;
700   int              support;
701   int              number;
702   char             x;
703   char             xPars;
704   char             xBips;
705 }
706   node, *nodeptr;
707 
708 typedef unsigned int parsimonyNumber;
709 
710 /* @brief Alignment, transition model, model of rate heterogenety and likelihood vectors for one partition.
711   *
712   * @todo De-couple into smaller data structures
713   *
714   * ALIGNMENT DATA
715   * This depends only on the type of data in this partition of the alignment
716   *
717   * MODEL OF RATE HETEROGENETY, We use either GAMMA or PSR
718   * Rate heterogenety: Per Site Categories (PSR) model aka CAT,
719   * Rate of site i is given by perSiteRates[rateCategory[i]]
720   *
721   * TRANSITION MODEL: We always assume General Time Reversibility
722   * Transistion probability matrix: P(t) = exp(Qt)
723   * Branch length t is the expected number of substitutions per site
724   * Pij(t) is the probability of going from state i to state j in a branch of length t
725   * Relative substitution rates (Entries in the Q matrix)
726   * In GTR we can write Q = S * D, where S is a symmetrical matrix and D a diagonal with the state frequencies
727 
728     @var protModels
729     @brief Protein models
730 
731     @detail Detailed protein models descriptiopn
732 
733     @var autoProtModels
734     @brief Auto prot models
735     @detail Detailed auto prot models
736   */
737 
738 
739 
740 /** @struct pInfo
741 
742     @brief Partition information structure
743 
744     This data structure encapsulates all properties and auxiliary variables that together
745     consist a partition.
746 
747     @var pInfo::dataType
748     @brief Type of data this partition contains
749 
750     Can be DNA (\b PLL_DNA_DATA) or AminoAcid (\b PLL_AA_DATA) data
751 
752     @var pInfo::states
753     @brief Number of states
754 
755     Number of states this type of data can consist of
756 
757     @var pInfo::maxTipStates
758     @brief Number of undetermined states (possible states at the tips)
759 
760     This is the total number of possible states that can appear in the alignment. This includes degenerate (undetermined) bases
761 
762     @var pInfo::partitionName
763     @brief Name of partition
764 
765     A null-terminated string describing the name of partition
766 
767     @var pInfo::lower
768     @brief Position of the first site in the alignment that is part of this partition [1, tr->originalCrunchedLength]
769 
770     @var pInfo::upper
771     @brief Position of the last site that is part of this partition plus one (i.e. position of the first site that is not part of this partition)
772 
773     @var pInfo::width
774     @brief Number of sites in the partition (i.e. \a upper - \a lower)
775 
776     @var pInfo::wgt
777     @brief Weight of site
778 
779     Number of times this particular site appeared in the partition before the duplicates were removed and replaced by this weight
780 
781     @var pInfo::empiricalFrequencies
782     @brief Empirical frequency of each state in the current partition
783 
784     @var pInfo::perSiteRates
785     @brief Per Site Categories (PSR) or aka CAT values for each rate
786 
787     @var pInfo::rateCategory
788     @brief CAT category index for each site
789 
790     @var pInfo::numberOfCategories
791     @brief CAT size of the set of possible categories
792 
793     @var pInfo::alpha
794     @brief Gamma parameter to be optimized
795 
796     @var pInfo::gammaRates
797     @brief Values of the 4 gamma categories (rates) computed given an alpha
798 
799     @var pInfo::substRates
800     @brief Entries of substitution matrix, e.g. 6 free parameters in DNA
801 
802     In GTR we can write \f$ Q = S * D \f$, where \f$ S \f$ is a symmetrical matrix and \f$ D \f$ a diagonal with the state frequencies,
803     which is represented by the array \a frequencies. The symmetrical matrix is the array \a substRates
804 
805     @var pInfo::frequencies
806     @brief State frequencies, entries in D are initialized as empiricalFrequencies
807 
808     In GTR we can write \f$ Q = S * D \f$, where \f$ S \f$ is a symmetrical matrix and \f$ D \f$ a diagonal with the state frequencies,
809     which is represented by the array \a frequencies. The symmetrical matrix is the array \a substRates
810 
811     @var pInfo::freqExponents
812 
813     @var pInfo::EIGN
814     @brief Eigenvalues of Q matrix
815 
816     @var pInfo::EV
817     @brief Eigenvectors of Q matrix
818 
819     @var pInfo::EI
820     @brief Inverse eigenvectors of Q matrix
821 
822     @var pInfo::left
823     @brief P matrix for the left term of the conditional likelihood equation
824 
825     @var pInfo::right
826     @brief P matrix for the right term of the conditional likelihood equation
827 
828     @var pInfo::tipVector
829     @brief Precomputed (based on current P matrix) conditional likelihood vectors for every possible base
830 
831     @var pInfo::EIGN_LG4
832     @brief Eigenvalues of Q matrix for the LG4 model
833 
834     @var pInfo::EV_LG4
835     @brief Eigenvectors of Q matrix for the LG4 model
836 
837     @var pInfo::EI_LG4
838     @brief Inverse eigenvectors of Q matrix for the LG4 model
839 
840     @var pInfo::frequencies_LG4
841     @brief State frequencies for the LG4 model
842 
843     @var pInfo::tipVector_LG4
844     @brief Precomputed (based on current P matrix) conditional likelihood vectors for every possible base for the LG4 model
845 
846     @var pInfo::substRates_LG4
847     @brief Entries of substitution matrix for the LG4 model
848 
849     @var pInfo::protModels
850     @brief Protein model for current partition
851 
852     In case \a pInfo::dataType is set to \a PLL_AA_DATA then \a protModels indicates the index in the global array \a protModels
853     of the protein model that the current partition uses.
854 
855     @var pInfo::autoProtModels
856     @brief Best fitted protein model for the \b PLL_AUTO partitions
857 
858     If \a protModels is set to \b PLL_AUTO then \a autoProtModels holds the currently detected best fitting protein model for the partition
859 
860     @var pInfo::protUseEmpiricalFreqs
861 
862     @var pInfo::nonGTR
863 
864     @var pInfo::optimizeBaseFrequencies
865 
866     @var pInfo::optimizeAlphaParameter
867 
868     @var pInfo::optimizeSubstitutionRates
869 
870     @var pInfo::symmetryVector
871 
872     @var pInfo::frequencyGrouping
873 
874 
875     @todo
876       Document freqExponents
877 
878 */
879 
880 
881 
882 typedef struct {
883   int     dataType;
884   int     states;
885   int     maxTipStates;
886   char   *partitionName;
887   int     lower;
888   int     upper;
889   int     width;
890   int    *wgt;
891   double *empiricalFrequencies;
892 
893 
894   /* MODEL OF RATE HETEROGENETY, We use either GAMMA or PSR */
895   /* Rate heterogenety: Per Site Categories (PSR) model aka CAT, see updatePerSiteRates() */
896   /* Rate of site i is given by perSiteRates[rateCategory[i]] */
897   double *perSiteRates;
898   int    *rateCategory;
899   int     numberOfCategories;
900   /* Rate heterogenety: GAMMA model of rate heterogenety */
901   double alpha;
902   double *gammaRates;
903 
904 
905   /* TRANSITION MODEL: We always assume General Time Reversibility */
906   /* Transistion probability matrix: P(t) = exp(Qt)*/
907   /* Branch length t is the expected number of substitutions per site */
908   /* Pij(t) is the probability of going from state i to state j in a branch of length t */
909   /* Relative substitution rates (Entries in the Q matrix) */
910   /* In GTR we can write Q = S * D, where S is a symmetrical matrix and D a diagonal with the state frequencies */
911   double *substRates;       /**< TRANSITION MODEL Entries in S, e.g. 6 free parameters in DNA */
912   double *frequencies;      /**< State frequencies, entries in D, are initialized as empiricalFrequencies */
913   double *freqExponents;
914   /* Matrix decomposition: @todo map this syntax to Explanation of the mathematical background */
915   double *EIGN;
916   double *EV;
917   double *EI;
918   double *left;
919   double *right;
920   double *tipVector;
921 
922 
923   /* asc bias */
924   pllBoolean       ascBias;
925   int           ascOffset;
926   int         * ascExpVector;
927   double      * ascSumBuffer;
928   double      * ascVector;
929   double        ascScaler[64];
930 
931   /* LG4 */
932 
933   double *EIGN_LG4[4];
934   double *EV_LG4[4];
935   double *EI_LG4[4];
936 
937   double *frequencies_LG4[4];
938   double *tipVector_LG4[4];
939   double *substRates_LG4[4];
940 
941   /* LG4X */
942 
943   double lg4x_weights[4];
944   double lg4x_weightExponents[4];
945   double lg4x_weightsBuffer[4];
946   double lg4x_weightExponentsBuffer[4];
947   double lg4x_weightLikelihood;
948 
949   /* Protein specific */
950   int     protModels;			/**< Empirical model matrix */
951   int     autoProtModels;		/**< Model selected with "auto" protein model */
952   int     protUseEmpiricalFreqs;	/**< Whether to use empirical frequencies for protein model */
953 
954   pllBoolean nonGTR;
955   pllBoolean optimizeBaseFrequencies;	/**< Whether to optimize base frequencies */
956   pllBoolean optimizeAlphaParameter;	/**< Whether to optimize alpha parameters and gamma rates */
957   pllBoolean optimizeSubstitutionRates;	/**< Whether to optimize substitution rates */
958   int    *symmetryVector;		/**< Specify linkage between substitution rate parameters */
959   int    *frequencyGrouping;
960 
961   /* LIKELIHOOD VECTORS */
962 
963   /* partial LH Inner vectors  ancestral vectors, we have 2*tips - 3 inner nodes */
964   double          **xVector;          /**< Conditional likelihood vectors for inner nodes */
965   unsigned char   **yVector;          /**< Tip entries (sequence) for tip nodes */
966   unsigned int     *globalScaler;     /**< Counters for scaling operations done at node i */
967 
968   /* data structures for conducting per-site likelihood scaling.
969      this allows to compute the per-site log likelihood scores
970      needed for RELL-based bootstrapping and all sorts of statistical
971      tests for comparing trees ! */
972   int              **expVector;     /**< @brief An entry per inner node. Each element is an array of size the number of sites in the current partition and represents how many times the respective site has been scaled in the subtree rooted at the current node */
973   size_t           *expSpaceVector; /**< @brief Each entry represents an inner node and states the size of the corresponding element in \a expVector, which is the number of sites for the current partition */
974 
975   /* These are for the saveMemory option (tracking gaps to skip computations and memory) */
976   size_t           *xSpaceVector;       /* Size of conditional likelihood vectors per inner node */
977   int               gapVectorLength;    /** Length of \a gapVector bitvector in unsigned integers assuming that \a unsigned \a int is 32bits. It is set to partition size / 32 */
978   unsigned int     *gapVector;          /** A bit vector of size \a gapVectorLength * 32 bits. A bit is set to 1 if the corresponding */
979   double           *gapColumn;
980 
981   /* Parsimony vectors at each node */
982   size_t parsimonyLength;
983   parsimonyNumber *parsVect;
984   parsimonyNumber *perSiteParsScores;
985 
986   /* This buffer of size width is used to store intermediate values for the branch length optimization under
987      newton-raphson. The data in here can be re-used for all iterations irrespective of the branch length.
988    */
989   double *sumBuffer;
990 
991   /* Buffer to store the per-site log likelihoods */
992   double *perSiteLikelihoods;
993 
994   /* This buffer of size width is used to store the ancestral state at a node of the tree. */
995   double *ancestralBuffer;
996 
997   /* From tree */
998   pllBoolean executeModel;
999   double fracchange;
1000   double rawFracchange;
1001   double partitionContribution;
1002   double partitionWeight;
1003   double partitionLH;
1004 
1005 // #if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
1006   int partitionAssignment;
1007 // #endif
1008 
1009 } pInfo;
1010 
1011 typedef struct
1012  {
1013    pInfo **partitionData;
1014    int numberOfPartitions;
1015    pllBoolean perGeneBranchLengths;
1016    pllBoolean dirty;
1017    linkageList *alphaList;
1018    linkageList *rateList;
1019    linkageList *freqList;
1020  }  partitionList;
1021 
1022 
1023 
1024 #define PLL_REARR_SETTING 1
1025 #define PLL_FAST_SPRS     2
1026 #define PLL_SLOW_SPRS     3
1027 
1028 
1029 /** @brief Checkpointing states.
1030  *
1031  * @todo Raxml specific
1032   */
1033 typedef struct {
1034 
1035   int state;
1036 
1037   /*unsigned int vLength;*/
1038   double accumulatedTime;
1039   int rearrangementsMax;
1040   int rearrangementsMin;
1041   int thoroughIterations;
1042   int fastIterations;
1043   int mintrav;
1044   int maxtrav;
1045   int bestTrav;
1046   double startLH;
1047   double lh;
1048   double previousLh;
1049   double difference;
1050   double epsilon;
1051   pllBoolean impr;
1052   pllBoolean cutoff;
1053 
1054   double tr_startLH;
1055   double tr_endLH;
1056   double tr_likelihood;
1057   double tr_bestOfNode;
1058   double tr_lhCutoff;
1059   double tr_lhAVG;
1060   double tr_lhDEC;
1061   int    tr_NumberOfCategories;
1062   int    tr_itCount;
1063   int    tr_doCutoff;
1064   int    tr_thoroughInsertion;
1065   int    tr_optimizeRateCategoryInvocations;
1066 
1067   /* prevent users from doing stupid things */
1068 
1069   int searchConvergenceCriterion;
1070   int rateHetModel;
1071   int maxCategories;
1072   int NumberOfModels;
1073   int numBranches;
1074   int originalCrunchedLength;
1075   int mxtips;
1076   char seq_file[1024];
1077 } checkPointState;
1078 
1079 
1080 
1081 /* recomp */
1082 #ifdef _DEBUG_RECOMPUTATION
1083 typedef struct {
1084   unsigned long int numTraversals;
1085   unsigned long int tt;
1086   unsigned long int ti;
1087   unsigned long int ii;
1088   unsigned int *travlenFreq;
1089 } traversalCounter;
1090 #endif
1091 /* E recomp */
1092 
1093 
1094 /** @brief Tree topology.
1095  *
1096  * @todo Apart from the topology this structure contains several fields that act like global variables in raxml
1097   */
1098 typedef  struct  {
1099 
1100   int *ti;
1101 
1102   /* recomp */
1103   recompVectors *rvec;            /**< this data structure tracks which vectors store which nodes */
1104   float maxMegabytesMemory;       /**< User says how many MB in main memory should be used */
1105   float vectorRecomFraction;      /**< vectorRecomFraction ~= 0.8 * maxMegabytesMemory  */
1106   pllBoolean useRecom;               /**< ON if we apply recomputation of ancestral vectors*/
1107 #ifdef _DEBUG_RECOMPUTATION
1108   traversalCounter *travCounter;
1109   double stlenTime;
1110 #endif
1111   /* E recomp */
1112 
1113   pllBoolean fastScaling;
1114   pllBoolean saveMemory;
1115   int              startingTree;
1116   long             randomNumberSeed;
1117 
1118   double          *lhs;         /**< Array to store per-site log likelihoods of \a originalCrunchedLength (compressed) sites */
1119   double          *patrat;      /**< rates per pattern */
1120   double          *patratStored;
1121   int             *rateCategory;
1122   int             *aliaswgt;    /**< weight by pattern */
1123   pllBoolean    manyPartitions;
1124 
1125   pllBoolean grouped;              /**< No idea what this is, but is always set to PLL_FALSE */
1126   pllBoolean constrained;          /**< No idea what this is, but is always set to PLL_FALSE */
1127   int threadID;
1128   volatile int numberOfThreads;
1129 
1130 //#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
1131 
1132   unsigned char *y_ptr;
1133 
1134   double lower_spacing;
1135   double upper_spacing;
1136 
1137   double *ancestralVector;
1138 
1139 //#endif
1140 
1141   pllHashTable     *nameHash;
1142   char           ** tipNames;
1143 
1144   char             *secondaryStructureInput;
1145 
1146   traversalData    td[1];
1147 
1148   int              maxCategories;
1149   int              categories;
1150 
1151   double           coreLZ[PLL_NUM_BRANCHES];
1152 
1153 
1154   branchInfo       *bInf;
1155 
1156   int              multiStateModel;
1157 
1158 
1159   pllBoolean curvatOK[PLL_NUM_BRANCHES];
1160 
1161   /* the stuff below is shared among DNA and AA, span does
1162      not change depending on datatype */
1163 
1164   /* model stuff end */
1165   unsigned char    **yVector;        /**< list of raw sequences (parsed from the alignment)*/
1166 
1167   int              secondaryStructureModel;
1168   int              originalCrunchedLength; /**< Length of alignment after removing duplicate sites in each partition */
1169 
1170   int              *secondaryStructurePairs;
1171 
1172   double            fracchange;      /**< Average substitution rate */
1173   double            rawFracchange;
1174   double            lhCutoff;
1175   double            lhAVG;
1176   unsigned long     lhDEC;
1177   unsigned long     itCount;
1178   int               numberOfInvariableColumns;
1179   int               weightOfInvariableColumns;
1180   int               rateHetModel;
1181 
1182   double           startLH;
1183   double           endLH;
1184   double           likelihood;           /**< last likelihood value evaluated for the current topology */
1185 
1186   node           **nodep;                /**< pointer to the list of nodes, which describe the current topology */
1187   nodeptr          nodeBaseAddress;
1188   node            *start;                /**< starting node by default for full traversals (must be a tip contained in the tree we are operating on) */
1189   int              mxtips;  /**< Number of tips in the topology */
1190 
1191   int              *constraintVector;   /**< @todo What is this? */
1192   int              numberOfSecondaryColumns;
1193   pllBoolean          searchConvergenceCriterion;
1194   int              ntips;
1195   int              nextnode;
1196 
1197   pllBoolean          bigCutoff;
1198   pllBoolean          partitionSmoothed[PLL_NUM_BRANCHES];
1199   pllBoolean          partitionConverged[PLL_NUM_BRANCHES];
1200   pllBoolean          rooted;
1201   pllBoolean          doCutoff;
1202 
1203   double         gapyness;
1204 
1205   char **nameList;     /**< list of tips names (read from the phylip file) */
1206   char *tree_string;   /**< the newick representaion of the topology */
1207   char *tree0;
1208   char *tree1;
1209   int treeStringLength;
1210 
1211   unsigned int bestParsimony;
1212   unsigned int *parsimonyScore;
1213 
1214   double bestOfNode;
1215   nodeptr removeNode;   /**< the node that has been removed. Together with \a insertNode represents an SPR move */
1216   nodeptr insertNode;   /**< the node where insertion should take place . Together with \a removeNode represents an SPR move*/
1217 
1218   double zqr[PLL_NUM_BRANCHES];
1219   double currentZQR[PLL_NUM_BRANCHES];
1220 
1221   double currentLZR[PLL_NUM_BRANCHES];
1222   double currentLZQ[PLL_NUM_BRANCHES];
1223   double currentLZS[PLL_NUM_BRANCHES];
1224   double currentLZI[PLL_NUM_BRANCHES];
1225   double lzs[PLL_NUM_BRANCHES];
1226   double lzq[PLL_NUM_BRANCHES];
1227   double lzr[PLL_NUM_BRANCHES];
1228   double lzi[PLL_NUM_BRANCHES];
1229 
1230 
1231   unsigned int **bitVectors;
1232 
1233   unsigned int vLength;
1234 
1235   pllHashTable *h;                 /**< hashtable for ML convergence criterion */
1236   //hashtable *h;
1237 
1238   int optimizeRateCategoryInvocations;
1239 
1240   checkPointState ckp;
1241   pllBoolean thoroughInsertion; /**< true if the neighbor branches should be optimized when a subtree is inserted (slower)*/
1242   pllBoolean useMedian;
1243 
1244   int autoProteinSelectionType;
1245 
1246   pllStack * rearrangeHistory;
1247 
1248 
1249   /* analdef defines */
1250   /* TODO: Do some initialization */
1251   int              bestTrav;            /**< best rearrangement radius */
1252   int              max_rearrange;       /**< max. rearrangemenent radius */
1253   int              stepwidth;           /**< step in rearrangement radius */
1254   int              initial;             /**< user defined rearrangement radius which also sets bestTrav if initialSet is set */
1255   pllBoolean          initialSet;          /**< set bestTrav according to initial */
1256   int              mode;                /**< candidate for removal */
1257   pllBoolean        perGeneBranchLengths;
1258   pllBoolean        permuteTreeoptimize;   /**< randomly select subtrees for SPR moves */
1259   pllBoolean        compressPatterns;
1260   double         likelihoodEpsilon;
1261   pllBoolean        useCheckpoint;
1262 
1263 } pllInstance;
1264 
1265 /** @brief Stores data related to a NNI move  */
1266 typedef struct {
1267         pllInstance * tr;
1268         nodeptr p;
1269         int nniType;
1270         double z[PLL_NUM_BRANCHES]; // optimize branch lengths
1271         double z0[PLL_NUM_BRANCHES]; // unoptimized branch lengths
1272         double likelihood;
1273         double deltaLH;
1274 } nniMove;
1275 
1276 /***************************************************************/
1277 
1278 typedef struct {
1279   int partitionNumber;
1280   int partitionLength;
1281 } partitionType;
1282 
1283 typedef struct
1284 {
1285   double z[PLL_NUM_BRANCHES];
1286   nodeptr p, q;
1287   int cp, cq;
1288 }
1289   connectRELL, *connptrRELL;
1290 
1291 typedef  struct
1292 {
1293   connectRELL     *connect;
1294   int             start;
1295   double          likelihood;
1296 }
1297   topolRELL;
1298 
1299 
1300 typedef  struct
1301 {
1302   int max;
1303   topolRELL **t;
1304 }
1305   topolRELL_LIST;
1306 
1307 /**************************************************************/
1308 
1309 /** @brief Connection within a topology.
1310 *   */
1311 typedef struct conntyp {
1312     double           z[PLL_NUM_BRANCHES];           /**< branch length */
1313     node            *p, *q;       /**< parent and child sectors */
1314     void            *valptr;      /**< pointer to value of subtree */
1315     int              descend;     /**< pointer to first connect of child */
1316     int              sibling;     /**< next connect from same parent */
1317     } pllConnect, *connptr;
1318 
1319 /** @brief Single Topology
1320 *   */
1321 typedef  struct {
1322     double           likelihood;
1323     int              initialTreeNumber;
1324     pllConnect         *links;       /**< pointer to first connect (start) */
1325     node            *start;
1326     int              nextlink;    /**< index of next available connect */
1327                                   /**< tr->start = tpl->links->p */
1328     int              ntips;
1329     int              nextnode;    /**< next available inner node for tree parsing */
1330     int              scrNum;      /**< position in sorted list of scores */
1331     int              tplNum;      /**< position in sorted list of trees */
1332     } topol;
1333 
1334 /** @brief small helper data structure for printing out/downstream use of marginal ancestral probability vectors.
1335 *
1336 * it is allocated as an array that has the same length as the input alignment and can be used to
1337 *   index the ancestral states for each position/site/pattern
1338 *   */
1339 typedef struct {
1340   double *probs; /**< marginal ancestral states */
1341   char c; /**< most likely stated, i.e. max(probs[i]) above */
1342   int states; /**< number of states for this position */
1343 } ancestralState;
1344 
1345 /** @brief List of topologies
1346 *
1347 *   */
1348 typedef struct {
1349     double           best;        /**< highest score saved */
1350     double           worst;       /**< lowest score saved */
1351     topol           *start;       /**< starting tree for optimization */
1352     topol          **byScore;
1353     topol          **byTopol;
1354     int              nkeep;       /**< maximum topologies to save */
1355     int              nvalid;      /**< number of topologies saved */
1356     int              ninit;       /**< number of topologies initialized */
1357     int              numtrees;    /**< number of alternatives tested */
1358     pllBoolean          improved;
1359     } bestlist;
1360 
1361 /** @brief  This is used to look up some hard-coded data for each data type
1362 *   */
1363 typedef struct
1364 {
1365   int leftLength;         /**< s^2 */
1366   int rightLength;/**< s^2 */
1367   int eignLength;/**<  s */
1368   int evLength;
1369   int eiLength;
1370   int substRatesLength;   /**< (s^2 - s)/2 free model parameters for matrix Q i.e. substitution rates */
1371   int frequenciesLength;  /**< s frequency of each state */
1372   int tipVectorLength;    /* ??? */
1373   int symmetryVectorLength;
1374   int frequencyGroupingLength;
1375 
1376   pllBoolean nonGTR;
1377   pllBoolean optimizeBaseFrequencies;
1378 
1379   int undetermined;
1380 
1381   const char *inverseMeaning;
1382 
1383   int states;   /* s */
1384 
1385   pllBoolean smoothFrequencies;
1386 
1387   const unsigned  int *bitVector;
1388 
1389 } partitionLengths;
1390 
1391 typedef struct
1392 {
1393   int rearrangeType;
1394   double  likelihood;
1395 
1396   union {
1397     struct {
1398       double * zp;
1399       double * zpn;
1400       double * zpnn;
1401       double * zqr;
1402       nodeptr pn;
1403       nodeptr pnn;
1404       nodeptr r;
1405       nodeptr p;
1406       nodeptr q;
1407     } SPR;
1408     struct {
1409       nodeptr origin;
1410       int swapType;
1411       double z[PLL_NUM_BRANCHES];
1412     } NNI;
1413   };
1414 } pllRollbackInfo;
1415 
1416 
1417 /** @struct pllRearrangeAttr
1418 
1419     @brief Structure holding attributes for searching possible tree rearrangements
1420 
1421     Holds the attributes for performing tree rearrangements.
1422 
1423     @var pllRearrangeAttr
1424       The origin node where the search should start
1425 
1426     @var pllRearrangeAttr:mintrav
1427       The minimum radius around the origin node \a p for which nodes should be tested
1428 
1429     @var pllRearrangeAttr:maxtrav
1430       The maximum radius around the origin node \a p for which nodes should be tested
1431 
1432     @var pllRearrangeAttr:max
1433       Maximum number of results to be returned
1434 */
1435 typedef struct
1436  {
1437    nodeptr p;
1438    int mintrav;
1439    int maxtrav;
1440  } pllRearrangeAttr;
1441 
1442 /** @typedef pllRearrangeInfo
1443 
1444     @brief Tree rearrangement information structure
1445 
1446     Holds information for conducting tree arrangements. This structure
1447     is the result of a tree arrangement search under given search
1448     attributes.
1449 
1450     @var pllRearrangeInfo::rearrangeType
1451       Type of rearrangement. Can be \b PLL_REARRANGE_SPR, \b PLL_REARRANGE_NNI or
1452       \b PLL_REARRANGE_TBR
1453 
1454     @var pllRearrangeInfo::likelihood
1455       Holds the computed likelihood for the addressed rearrangement
1456 
1457     @var pllRearrangeInfo::SPR::removeNode
1458       Node where to perform subtree pruning
1459 
1460     @var pllRearrangeInfo::SPR::insertNode
1461       Node where to place the pruned subtree
1462 
1463     @var pllRearrangeInfo::zqr
1464       Holds the computed branch lengths after the SPR
1465 */
1466 typedef struct
1467  {
1468    int rearrangeType;
1469    double  likelihood;
1470    union {
1471      struct {
1472        nodeptr removeNode;
1473        nodeptr insertNode;
1474        double  zqr[PLL_NUM_BRANCHES];
1475      } SPR;
1476      struct {
1477        nodeptr originNode;
1478        int     swapType;
1479      } NNI;
1480    };
1481  } pllRearrangeInfo;
1482 
1483 
1484 typedef struct
1485  {
1486    int max_entries;
1487    int entries;
1488    pllRearrangeInfo * rearr;
1489  } pllRearrangeList;
1490 
1491 /** @brief Generic structure for storing a multiple sequence alignment */
1492 typedef struct
1493  {
1494    int              sequenceCount;      /**< @brief Number of sequences */
1495    int              sequenceLength;     /**< @brief Length of sequences */
1496    int              originalSeqLength;  /**< @brief Original length of sequences (not modified after removing duplicates) */
1497    char          ** sequenceLabels;     /**< @brief An array of where the \a i-th element is the name of the \a i-th sequence */
1498    unsigned char ** sequenceData;       /**< @brief The actual sequence data */
1499    int            * siteWeights;        /**< @brief An array where the \a i-th element indicates how many times site \a i appeared (prior to duplicates removal) in the alignment */
1500  } pllAlignmentData;
1501 
1502 
1503 /******************** START OF API FUNCTION DESCRIPTIONS ********************/
1504 
1505 #if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
1506 pllBoolean isThisMyPartition(partitionList *pr, int tid, int model);
1507 void printParallelTimePerRegion(void);
1508 #endif
1509 
1510 #ifdef _FINE_GRAIN_MPI
1511 extern void pllFinalizeMPI (void);
1512 #endif
1513 
1514 
1515 
1516 /**
1517  * @brief Create the main instance of PLL
1518  *
1519  * Create an instance of the phylogenetic likelihood library
1520  *
1521  * @param rateHetModel   Rate heterogeneity model
1522  * @param fastScaling    TODO: explain fastScaling here
1523  * @param saveMemory     TODO: explain saveMemory here
1524  * @param useRecom       If set to \b PLL_TRUE, enables ancestral state recomputation
1525  *
1526  * @todo                 Document fastScaling, rate heterogeneity and saveMemory and useRecom
1527  *
1528  * @note                 Do not set \a saveMemory to when using \a useRecom as memory saving
1529  *                       techniques are not yet implemented for ancestral state recomputation.
1530  *
1531  * @return               On success returns an instance to PLL, otherwise \b NULL
1532  */
1533 extern pllInstance * pllCreateInstance (pllInstanceAttr * pInst);
1534 
1535 /**
1536  *  @ingroup instanceLinkingGroup
1537  *  @brief Load alignment to the PLL instance
1538  *
1539  *   Loads (copies) the parsed alignment \a alignmentData to the PLL instance
1540  *   as a deep copy.
1541  *
1542  *    @param tr              The library instance
1543  *    @param alignmentData   The multiple sequence alignment
1544  *    @param partitions      List of partitions
1545  *
1546  *    @return Returns 1 in case of success, 0 otherwise.
1547  */
1548 extern int pllLoadAlignment (pllInstance * tr,
1549                              pllAlignmentData * alignmentData,
1550                              partitionList * pList);
1551 
1552 /**
1553  * @brief Compute the empirical base frequencies for all partitions
1554  *
1555  * Compute the empirical base frequencies for all partitions in the list \a pl.
1556  *
1557  * @param pl                Partition list
1558  * @param alignmentData     Multiple sequence alignment
1559  *
1560  * @return   A list of \a pl->numberOfPartitions arrays each of size
1561              \a pl->partitionData[i]->states, where \a i is the \a i-th partition
1562 */
1563 extern double ** pllBaseFrequenciesAlignment (pllAlignmentData * alignmentData, partitionList * pl);
1564 extern double ** pllBaseFrequenciesInstance (pllInstance * tr, partitionList * pl);
1565 
1566 /* pthreads and MPI */
1567 extern void pllStartPthreads (pllInstance *tr, partitionList *pr);
1568 extern void pllStopPthreads (pllInstance * tr);
1569 extern void pllLockMPI (pllInstance * tr);
1570 extern void pllInitMPI(int * argc, char **argv[]);
1571 
1572 
1573 /* handling branch lengths*/
1574 extern double pllGetBranchLength (pllInstance *tr, nodeptr p, int partition_id);
1575 extern void pllSetBranchLength (pllInstance *tr, nodeptr p, int partition_id, double bl);
1576 extern int pllNniSearch(pllInstance * tr, partitionList *pr, int estimateModel);
1577 extern void pllOptimizeBranchLengths ( pllInstance *tr, partitionList *pr, int maxSmoothIterations );
1578 
1579 
1580 extern void pllEvaluateLikelihood (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean fullTraversal, pllBoolean getPerSiteLikelihoods);
1581 extern void pllUpdatePartials (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean masked);
1582 extern void pllUpdatePartialsAncestral(pllInstance *tr, partitionList *pr, nodeptr p);
1583 extern void pllNewviewIterative(pllInstance *tr, partitionList *pr, int startIndex);
1584 extern void pllEvaluateIterative(pllInstance *tr, partitionList *pr, pllBoolean getPerSiteLikelihoods);
1585 
1586 /* newick parser declarations */
1587 extern pllNewickTree * pllNewickParseString (const char * newick);
1588 extern pllNewickTree * pllNewickParseFile (const char * filename);
1589 extern int pllValidateNewick (pllNewickTree *);
1590 extern void pllNewickParseDestroy (pllNewickTree **);
1591 extern int pllNewickUnroot (pllNewickTree * t);
1592 extern char * pllTreeToNewick ( char *treestr, pllInstance *tr, partitionList *pr, nodeptr p,
1593       pllBoolean printBranchLengths, pllBoolean printNames, pllBoolean printLikelihood,
1594       pllBoolean rellTree, pllBoolean finalPrint, int perGene,
1595       pllBoolean branchLabelSupport, pllBoolean printSHSupport);
1596 
1597 /* partition parser declarations */
1598 extern void  pllQueuePartitionsDestroy (pllQueue ** partitions);
1599 extern pllQueue * pllPartitionParse (const char * filename);
1600 extern pllQueue * pllPartitionParseString (const char * p);
1601 extern void pllPartitionDump (pllQueue * partitions);
1602 void pllBaseSubstitute (pllInstance * tr, partitionList * partitions);
1603 //void pllBaseSubstitute (pllAlignmentData * tr, partitionList * partitions);
1604 partitionList * pllPartitionsCommit (pllQueue * parts, pllAlignmentData * alignmentData);
1605 int pllPartitionsValidate (pllQueue * parts, pllAlignmentData * alignmentData);
1606 extern void pllAlignmentRemoveDups (pllAlignmentData * alignmentData, partitionList * pl);
1607 void pllPartitionsDestroy (pllInstance *, partitionList **);
1608 
1609 /* alignment data declarations */
1610 extern void pllAlignmentDataDestroy (pllAlignmentData *);
1611 extern int pllAlignmentDataDumpFile (pllAlignmentData *, int, const char *);
1612 extern void pllAlignmentDataDumpConsole (pllAlignmentData * alignmentData);
1613 extern pllAlignmentData * pllInitAlignmentData (int, int);
1614 extern pllAlignmentData * pllParseAlignmentFile (int fileType, const char *);
1615 extern pllAlignmentData *pllParsePHYLIPString (const char *rawdata, long filesize);
1616 
1617 
1618 /* model management */
1619 int pllInitModel (pllInstance *, partitionList *);
1620 void pllInitReversibleGTR(pllInstance * tr, partitionList * pr, int model);
1621 void pllMakeGammaCats(double alpha, double *gammaRates, int K, pllBoolean useMedian);
1622 int pllLinkAlphaParameters(char *string, partitionList *pr);
1623 int pllLinkFrequencies(char *string, partitionList *pr);
1624 int pllLinkRates(char *string, partitionList *pr);
1625 int pllSetSubstitutionRateMatrixSymmetries(char *string, partitionList * pr, int model);
1626 void pllSetFixedAlpha(double alpha, int model, partitionList * pr, pllInstance *tr);
1627 void pllSetFixedBaseFrequencies(double *f, int length, int model, partitionList * pr, pllInstance *tr);
1628 int  pllSetOptimizeBaseFrequencies(int model, partitionList * pr, pllInstance *tr);
1629 void pllSetSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr);
1630 void pllSetFixedSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr);
1631 int pllGetInstRateMatrix (partitionList * pr, int model, double * outBuffer);
1632 int pllOptimizeModelParameters(pllInstance *tr, partitionList *pr, double likelihoodEpsilon);
1633 double pllGetAlpha (partitionList * pr, int pid);
1634 void pllGetGammaRates (partitionList * pr, int pid, double * outBuffer);
1635 extern void pllGetBaseFrequencies(partitionList * pr, int model, double * outBuffer);
1636 extern void pllGetSubstitutionMatrix (partitionList * pr, int model, double * outBuffer);
1637 void pllEmpiricalFrequenciesDestroy (double *** empiricalFrequencies, int models);
1638 extern void pllOptRatesGeneric(pllInstance *tr, partitionList *pr, double modelEpsilon, linkageList *ll);
1639 extern void pllOptBaseFreqs(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll);
1640 extern void pllOptAlphasGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll);
1641 extern void pllOptLG4X(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels);
1642 
1643 /* tree topology */
1644 void pllTreeInitTopologyNewick (pllInstance *, pllNewickTree *, int);
1645 void pllTreeInitTopologyRandom (pllInstance * tr, int tips, char ** nameList);
1646 void pllTreeInitTopologyForAlignment (pllInstance * tr, pllAlignmentData * alignmentData);
1647 extern void pllMakeRandomTree ( pllInstance *tr);
1648 void pllMakeParsimonyTree(pllInstance *tr);
1649 extern void pllMakeParsimonyTreeFast(pllInstance *tr, partitionList *pr, int sprDist);
1650 void pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInstance * tr, partitionList * partitions, int sprDist);
1651 nodeptr pllGetRandomSubtree(pllInstance *);
1652 extern void pllFreeParsimonyDataStructures(pllInstance *tr, partitionList *pr);
1653 void pllDestroyInstance (pllInstance *);
1654 extern void pllGetAncestralState(pllInstance *tr, partitionList *pr, nodeptr p, double * outProbs, char * outSequence);
1655 unsigned int pllEvaluateParsimony(pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean full, pllBoolean perSiteScores);
1656 void pllInitParsimonyStructures(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores);
1657 
1658 /* rearrange functions (NNI and SPR) */
1659 pllRearrangeList * pllCreateRearrangeList (int max);
1660 void pllDestroyRearrangeList (pllRearrangeList ** bestList);
1661 void pllRearrangeSearch (pllInstance * tr, partitionList * pr, int rearrangeType, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList);
1662 void pllRearrangeCommit (pllInstance * tr, partitionList * pr, pllRearrangeInfo * rearr, int saveRollbackInfo);
1663 int pllRearrangeRollback (pllInstance * tr, partitionList * pr);
1664 void pllClearRearrangeHistory (pllInstance * tr);
1665 int pllRaxmlSearchAlgorithm (pllInstance * tr, partitionList * pr, pllBoolean estimateModel);
1666 int pllGetTransitionMatrix (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer);
1667 void pllGetTransitionMatrix2 (pllInstance * tr, partitionList * pr, int model, nodeptr p, double * outBuffer);
1668 int pllGetCLV (pllInstance * tr, partitionList * pr, nodeptr p, int partition, double * outProbs);
1669 extern int pllTopologyPerformNNI(pllInstance * tr, nodeptr p, int swap);
1670 
1671 /* hash functions */
1672 unsigned int pllHashString (const char * s, unsigned int size);
1673 int pllHashAdd  (pllHashTable * hTable, unsigned int hash, const char * s, void * item);
1674 pllHashTable * pllHashInit (unsigned int n);
1675 int pllHashSearch (struct pllHashTable * hTable, char * s, void ** item);
1676 void pllHashDestroy (struct pllHashTable ** hTable, void (*cbDealloc)(void *));
1677 
1678 /* node specific functions */
1679 nodeptr pllGetOrientedNodePointer (pllInstance * pInst, nodeptr p);
1680 
1681 /* other functions */
1682 extern char * pllReadFile (const char *, long *);
1683 extern int * pllssort1main (char ** x, int n);
1684 extern node ** pllGetInnerBranchEndPoints (pllInstance * tr);
1685 
1686 /* ---------------- */
1687 
1688 #ifdef __cplusplus
1689 } /* extern "C" */
1690 #endif
1691 
1692 #endif
1693