1 /**
2  * PLL (version 1.0.0) a software library for phylogenetic inference
3  * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
4  *
5  * Derived from
6  * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
7  * trees by Alexandros Stamatakis
8  *
9  * This program is free software: you can redistribute it and/or modify it
10  * under the terms of the GNU General Public License as published by the Free
11  * Software Foundation, either version 3 of the License, or (at your option)
12  * any later version.
13  *
14  * This program is distributed in the hope that it will be useful, but WITHOUT
15  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
17  * more details.
18  *
19  * You should have received a copy of the GNU General Public License along with
20  * this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  * For any other enquiries send an Email to Tomas Flouri
23  * Tomas.Flouri@h-its.org
24  *
25  * When publishing work that uses PLL please cite PLL
26  *
27  * @file evaluateGenericSpecial.c
28  *
29  * @brief Functions for computing the log likelihood at a given branch of the tree (i.e. a virtual root that is placed at this branch)
30  */
31 #include "mem_alloc.h"
32 
33 #ifndef WIN32
34 #include <unistd.h>
35 #endif
36 
37 #include <math.h>
38 #include <time.h>
39 #include <stdlib.h>
40 #include <stdio.h>
41 #include <ctype.h>
42 #include <string.h>
43 #include <assert.h>
44 
45 #include "pll.h"
46 #include "pllInternal.h"
47 
48 #ifdef __MIC_NATIVE
49 #include "mic_native.h"
50 #endif
51 
52 /* the set of functions in here computes the log likelihood at a given branch (the virtual root of a tree) */
53 
54 /* includes for using SSE3 intrinsics */
55 
56 #ifdef __SSE3
57 #include <xmmintrin.h>
58 #include <pmmintrin.h>
59 /*#include <tmmintrin.h>*/
60 #endif
61 
62 
63 /** @defgroup evaluateLikelihoodGroup Likelihood evaluation
64 
65     This set of functions deals with the evaluation of likelihood for the current topology
66 */
67 
68 
69 
70 
71 
72 
73 
74 /* below are the function headers for unreadeble highly optimized versions of the above functions
75    for DNA and protein data that also use SSE3 intrinsics and implement some memory saving tricks.
76    The actual functions can be found at the end of this source file.
77    All other likelihood function implementation files:
78 
79    newviewGenericSpacial.c
80    makenewzSpecial.c
81    evaluatePartialGenericSpecial.c
82 
83    are also structured like this
84 
85    To decide which set of function implementations to use you will have to undefine or define _OPTIMIZED_FUNCTIONS
86    in the Makefile
87    */
88 #if (defined(__SSE3) || defined(__AVX))
89 
90 static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr,
91                                        double *x1, double *x2,
92                                        double *tipVector[4],
93                                        unsigned char *tipX1, int n, double *diagptable, const pllBoolean fastScaling,
94                                        double * lg4_weights);
95 
96 /* GAMMA for proteins with memory saving */
97 
98 static double evaluateGTRGAMMAPROT_GAPPED_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
99                                                 double *x1, double *x2,
100                                                 double *tipVector,
101                                                 unsigned char *tipX1, int n, double *diagptable,
102                                                 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
103 
104 
105 /* GAMMA for proteins */
106 
107 static double evaluateGTRGAMMAPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
108                                     double *x1, double *x2,
109                                     double *tipVector,
110                                     unsigned char *tipX1, int n, double *diagptable);
111 
112 /* CAT for proteins */
113 
114 static double evaluateGTRCATPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
115                                   double *x1, double *x2, double *tipVector,
116                                   unsigned char *tipX1, int n, double *diagptable_start);
117 
118 
119 /* CAT for proteins with memory saving */
120 
121 static double evaluateGTRCATPROT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
122                                        double *x1, double *x2, double *tipVector,
123                                        unsigned char *tipX1, int n, double *diagptable_start,
124                                        double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
125 
126 /* analogous DNA fuctions */
127 
128 static double evaluateGTRCAT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
129                                    double *x1_start, double *x2_start, double *tipVector,
130                                    unsigned char *tipX1, int n, double *diagptable_start,
131                                    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
132 
133 static double evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
134                                            double *x1_start, double *x2_start,
135                                            double *tipVector,
136                                            unsigned char *tipX1, const int n, double *diagptable,
137                                            double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
138 
139 static double evaluateGTRGAMMA(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
140                                double *x1_start, double *x2_start,
141                                double *tipVector,
142                                unsigned char *tipX1, const int n, double *diagptable);
143 
144 
145 static double evaluateGTRCAT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
146                               double *x1_start, double *x2_start, double *tipVector,
147                               unsigned char *tipX1, int n, double *diagptable_start);
148 
149 
150 #endif
151 
152 #if (defined(__AVX) || defined(__SSE3))
153 static double evaluateGTRGAMMA_BINARY(int *ex1, int *ex2, int *wptr,
154                                       double *x1_start, double *x2_start,
155                                       double *tipVector,
156                                       unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling);
157 
158 static double evaluateGTRCAT_BINARY (int *ex1, int *ex2, int *cptr, int *wptr,
159                                      double *x1_start, double *x2_start, double *tipVector,
160                                      unsigned char *tipX1, int n, double *diagptable_start, const pllBoolean fastScaling);
161 #endif
162 
163 
164 /*
165    global variables of pthreads version, reductionBuffer is the global array
166    that is used for implementing deterministic reduction operations, that is,
167    the total log likelihood over the partial log lieklihoods for the sites that each thread has computed
168 
169    NumberOfThreads is just the number of threads.
170 
171    Note the volatile modifier here, that guarantees that the compiler will not do weird optimizations
172    rearraengements of the code accessing those variables, because it does not know that several concurrent threads
173    will access those variables simulatenously
174 
175    UPDATE: reductionBuffer is now merged with globalResult
176    */
177 
178 
179 /* a pre-computed 32-bit integer mask */
180 
181 extern const unsigned int mask32[32];
182 
183 /* the function below computes the P matrix from the decomposition of the Q matrix and the respective rate categories for a single partition */
184 
185 /** @brief Compute the diagonal of P matrix for a specific edge
186 
187     This function computes the diagonal of P matrix for a branch of length \a z
188     from the decomposition of the Q matrix specified in \a EIGN and the respective
189     rate categories \a rptr for a single partition. The diagonal is then stored in
190     \a diagptable.
191 
192     @param z                  Length of edge
193     @param states             Number of states
194     @param numberOfCategories Number of categories in the rate heterogeneity rate arrays
195     @param rptr               Rate heterogeneity rate arrays
196     @param EIGN               Eigenvalues
197     @param diagptable         Where to store the resulting P matrix
198 */
calcDiagptable(const double z,const int states,const int numberOfCategories,const double * rptr,const double * EIGN,double * diagptable)199 static void calcDiagptable(const double z, const int states, const int numberOfCategories, const double *rptr, const double *EIGN, double *diagptable)
200 {
201   int
202     i,
203     l;
204 
205   double
206     lz,
207     *lza = (double *)rax_malloc(sizeof(double) * states);
208 
209   /* transform the root branch length to the log and check if it is not too small */
210 
211   if (z < PLL_ZMIN)
212     lz = log(PLL_ZMIN);
213   else
214     lz = log(z);
215 
216   /* do some pre-computations to avoid redundant computations further below */
217 
218   for(i = 1; i < states; i++)
219     lza[i] = EIGN[i] * lz;
220 
221   /* loop over the number of per-site or discrete gamma rate categories */
222 
223   for(i = 0; i < numberOfCategories; i++)
224   {
225     /*
226        diagptable is a pre-allocated array of doubles that stores the P-Matrix
227        the first entry is always 1.0
228        */
229     diagptable[i * states] = 1.0;
230 
231     /* compute the P matrix for all remaining states of the model */
232 
233     for(l = 1; l < states; l++)
234       diagptable[i * states + l] = exp(rptr[i] * lza[l]);
235   }
236 
237   rax_free(lza);
238 }
239 
240 /** @brief Compute the diagonal of P matrix for a specific edge for the LG4 model
241 
242     This function computes the diagonal of P matrix for a branch of length \a z
243     from the decomposition of the 4 LG4 Q matrices specified in \a EIGN and the respective
244     rate categories \a rptr for a single partition. The diagonal is then stored in
245     \a diagptable.
246 
247     @param z
248       Length of edge
249 
250     @param states
251       Number of states
252 
253     @param numberOfCategories
254       Number of categories in the rate heterogeneity rate arrays
255 
256     @param rptr
257       Rate heterogeneity rate arrays
258 
259     @param EIGN
260       Eigenvalues of the 4 Q matrices
261 
262     @param diagptable
263       Where to store the resulting P matrix
264 
265     @param numStates
266       Number of states
267 */
calcDiagptableFlex_LG4(double z,int numberOfCategories,double * rptr,double * EIGN[4],double * diagptable,const int numStates)268 static void calcDiagptableFlex_LG4(double z, int numberOfCategories, double *rptr, double *EIGN[4], double *diagptable, const int numStates)
269 {
270   int
271     i,
272     l;
273 
274   double
275     lz;
276 
277   assert(numStates <= 64);
278 
279   if (z < PLL_ZMIN)
280     lz = log(PLL_ZMIN);
281   else
282     lz = log(z);
283 
284   for(i = 0; i <  numberOfCategories; i++)
285     {
286       diagptable[i * numStates + 0] = 1.0;
287 
288       for(l = 1; l < numStates; l++)
289         diagptable[i * numStates + l] = exp(rptr[i] * EIGN[i][l] * lz);
290     }
291 }
292 
ascertainmentBiasSequence(unsigned char tip[32],int numStates)293 static void ascertainmentBiasSequence(unsigned char tip[32], int numStates)
294 {
295   assert(numStates <= 32 && numStates > 1);
296 
297   switch(numStates)
298     {
299     case 2:
300       tip[0] = 1;
301       tip[1] = 2;
302       break;
303     case 4:
304       tip[0] = 1;
305       tip[1] = 2;
306       tip[2] = 4;
307       tip[3] = 8;
308       break;
309     default:
310       {
311 	int
312 	  i;
313 	for(i = 0; i < numStates; i++)
314 	  {
315 	    tip[i] = i;
316 	    //printf("%c ", inverseMeaningPROT[i]);
317 	  }
318 	//printf("\n");
319       }
320       break;
321     }
322 }
323 
evaluateCatAsc(int * ex1,int * ex2,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable,const int numStates)324 static double evaluateCatAsc(int *ex1, int *ex2,
325 			     double *x1, double *x2,
326 			     double *tipVector,
327 			     unsigned char *tipX1, int n, double *diagptable, const int numStates)
328 {
329   double
330     exponent,
331     sum = 0.0,
332     unobserved,
333     term,
334     *left,
335     *right;
336 
337   int
338     i,
339     l;
340 
341   unsigned char
342     tip[32];
343 
344   ascertainmentBiasSequence(tip, numStates);
345 
346   if(tipX1)
347     {
348       for (i = 0; i < n; i++)
349 	{
350 	  left = &(tipVector[numStates * tip[i]]);
351 	  right = &(x2[i * numStates]);
352 
353 	  term = 0.0;
354 
355 	  for(l = 0; l < numStates; l++)
356 	    term += left[l] * right[l] * diagptable[l];
357 
358 	  /* assumes that pow behaves as expected/specified for underflows
359 	     from the man page:
360 	       If result underflows, and is not representable,
361 	       a range error occurs and 0.0 is returned.
362 	 */
363 
364 	  exponent = pow(PLL_MINLIKELIHOOD, (double)ex2[i]);
365 
366 	  unobserved = fabs(term) * exponent;
367 
368 #ifdef _DEBUG_ASC
369 	  if(ex2[i] > 0)
370 	    {
371 	      printf("s %d\n", ex2[i]);
372 	      assert(0);
373 	    }
374 #endif
375 
376 	  sum += unobserved;
377 	}
378     }
379   else
380     {
381       for (i = 0; i < n; i++)
382 	{
383 	  term = 0.0;
384 
385 	  left  = &(x1[i * numStates]);
386 	  right = &(x2[i * numStates]);
387 
388 	  for(l = 0; l < numStates; l++)
389 	    term += left[l] * right[l] * diagptable[l];
390 
391 	  /* assumes that pow behaves as expected/specified for underflows
392 	     from the man page:
393 	       If result underflows, and is not representable,
394 	       a range error occurs and 0.0 is returned.
395 	  */
396 
397 	  exponent = pow(PLL_MINLIKELIHOOD, (double)(ex1[i] + ex2[i]));
398 
399 	  unobserved = fabs(term) * exponent;
400 
401 #ifdef _DEBUG_ASC
402 	  if(ex2[i] > 0 || ex1[i] > 0)
403 	    {
404 	      printf("s %d %d\n", ex1[i], ex2[i]);
405 	      assert(0);
406 	    }
407 #endif
408 
409 	  sum += unobserved;
410 	}
411     }
412 
413   return  sum;
414 }
415 
416 
evaluateGammaAsc(int * ex1,int * ex2,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable,const int numStates)417 static double evaluateGammaAsc(int *ex1, int *ex2,
418 				double *x1, double *x2,
419 				double *tipVector,
420 				unsigned char *tipX1, int n, double *diagptable, const int numStates)
421 {
422   double
423     exponent,
424     sum = 0.0,
425     unobserved,
426     term,
427     *left,
428     *right;
429 
430   int
431     i,
432     j,
433     l;
434 
435   const int
436     gammaStates = numStates * 4;
437 
438   unsigned char
439     tip[32];
440 
441   ascertainmentBiasSequence(tip, numStates);
442 
443   if(tipX1)
444     {
445       for (i = 0; i < n; i++)
446 	{
447 	  left = &(tipVector[numStates * tip[i]]);
448 
449 	  for(j = 0, term = 0.0; j < 4; j++)
450 	    {
451 	      right = &(x2[gammaStates * i + numStates * j]);
452 
453 	      for(l = 0; l < numStates; l++)
454 		term += left[l] * right[l] * diagptable[j * numStates + l];
455 	    }
456 
457       /* assumes that pow behaves as expected/specified for underflows
458          from the man page:
459            If result underflows, and is not representable,
460            a range error occurs and 0.0 is returned.
461       */
462 
463       exponent = pow(PLL_MINLIKELIHOOD, (double)ex2[i]);
464 
465       unobserved = fabs(term) * exponent;
466 
467 #ifdef _DEBUG_ASC
468 	  if(ex2[i] > 0)
469 	    {
470 	      printf("s %d\n", ex2[i]);
471 	      assert(0);
472 	    }
473 #endif
474 
475 	  sum += unobserved;
476 	}
477     }
478   else
479     {
480       for (i = 0; i < n; i++)
481 	{
482 
483 	  for(j = 0, term = 0.0; j < 4; j++)
484 	    {
485 	      left  = &(x1[gammaStates * i + numStates * j]);
486 	      right = &(x2[gammaStates * i + numStates * j]);
487 
488 	      for(l = 0; l < numStates; l++)
489 		term += left[l] * right[l] * diagptable[j * numStates + l];
490 	    }
491 
492 	  /* assumes that pow behaves as expected/specified for underflows
493 	     from the man page:
494 	       If result underflows, and is not representable,
495 	       a range error occurs and 0.0 is returned.
496 	  */
497 
498 	  exponent = pow(PLL_MINLIKELIHOOD, (double)(ex1[i] + ex2[i]));
499 
500 	  unobserved = fabs(term) * exponent;
501 
502 #ifdef _DEBUG_ASC
503 	  if(ex2[i] > 0 || ex1[i] > 0)
504 	    {
505 	      printf("s %d %d\n", ex1[i], ex2[i]);
506 	      assert(0);
507 	    }
508 #endif
509 
510 	  sum += unobserved;
511 	}
512     }
513 
514   return  sum;
515 }
516 
517 
518 /** @ingroup evaluateLikelihoodGroup
519     @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the GAMMA model of rate heterogeneity
520 
521     Computes the log likelihood of the topology for a specific partition, assuming
522     that the GAMMA model of rate heterogeneity is used. The likelihood is computed at
523     a virtual root placed at an edge whose two end-points (nodes) have the conditional
524     likelihood vectors \a x1 and \a x2.
525     Furthermore, if \a getPerSiteLikelihoods is set to \b PLL_TRUE, then the log
526     likelihood for each site is also computed and stored at the corresponding position
527     in the array \a perSiteLikelihoods.
528 
529     @param fastScaling
530       If set to \b PLL_FALSE, then the likelihood of each site is also multiplied by \a log(PLL_MINLIKELIHOOD) times the number
531       of times it has been scaled down
532 
533     @param ex1
534       An array that holds how many times a site has been scaled and points at the entries for node \a p. This
535       parameter is used if \a fastScaling is set to \b PLL_FALSE.
536 
537     @param ex2
538       An array that holds how many times a site has been scaled and points at the entries for node \a q. This
539       parameter is used if \a fastScaling is set to \b PLL_TRUE.
540 
541     @param wptr
542       Array holding the weight for each site in the compressed partition alignment
543 
544     @param x1_start
545       Conditional likelihood vectors for one of the two end-points of the specific edge for which we are evaluating the likelihood
546 
547     @param x2_start
548       Conditional likelihood vectors for the other end-point of the specific edge for which we are evaluating the likelihood
549 
550     @param tipVector
551       Precomputed table where the number of rows is equal to the number of possible basepair characters for the current data
552       type, i.e.16 for DNA and 23 for AA, and each rows contains \a states elements each of which contains transition
553       probabilities computed from the eigenvectors of the decomposed Q matrix.
554 
555     @param tipX1
556       If one of the two end-points (nodes) of the specific edge (for which we are evaluating the likelihood) is a tip, then
557       this holds a pointer to the sequence data (basepairs) already converted in the internal integer representation, and \a x2
558       holds the conditional likelihood vectors for the internal node.
559 
560     @param n
561       Number of sites for which we are doing the evaluation. For the single-thread version this is the
562       number of sites in the current partition, for multi-threads this is the number of sites assigned
563       to the running thread from the current partition.
564 
565     @param diagptable
566       Start of the array that contains the P-Matrix diagonal of the specific edge for which we are
567       evaluating the likehood, and for each category of the GAMMA model
568 
569     @param states
570       Number of states (4 for DNA, 20 for AA)
571 
572     @param perSiteLikelihoods
573       Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE
574 
575     @param getPerSiteLikelihoods
576       If set to \b PLL_TRUE then per-site log likelihoods are also computed and stored in \a perSiteLikelihoods
577 
578     @return
579       The evaluated log likelihood of the tree topology
580 */
evaluateGAMMA_FLEX(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable,const int states,double * perSiteLikelihoods,pllBoolean getPerSiteLikelihoods)581 static double evaluateGAMMA_FLEX(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
582                                  double *x1_start, double *x2_start,
583                                  double *tipVector,
584                                  unsigned char *tipX1, const int n, double *diagptable, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods)
585 {
586   double
587     sum = 0.0,
588     term,
589     *x1,
590     *x2;
591 
592   int
593     i,
594     j,
595     k;
596 
597   /* span is the offset within the likelihood array at an inner node that gets us from the values
598      of site i to the values of site i + 1 */
599 
600   const int
601     span = states * 4;
602 
603 
604   /* we distingusih between two cases here: one node of the two nodes defining the branch at which we put the virtual root is
605      a tip. Both nodes can not be tips because we do not allow for two-taxon trees ;-)
606      Nota that, if a node is a tip, this will always be tipX1. This is done for code simplicity and the flipping of the nodes
607      is done before when we compute the traversal descriptor.
608      */
609 
610   /* the left node is a tip */
611   if(tipX1)
612   {
613     /* loop over the sites of this partition */
614     for (i = 0; i < n; i++)
615     {
616       /* access pre-computed tip vector values via a lookup table */
617       x1 = &(tipVector[states * tipX1[i]]);
618       /* access the other(inner) node at the other end of the branch */
619       x2 = &(x2_start[span * i]);
620 
621       /* loop over GAMMA rate categories, hard-coded as 4 in RAxML */
622       for(j = 0, term = 0.0; j < 4; j++)
623         /* loop over states and multiply them with the P matrix */
624         for(k = 0; k < states; k++)
625           term += x1[k] * x2[j * states + k] * diagptable[j * states + k];
626 
627       /* take the log of the likelihood and multiply the per-gamma rate likelihood by 1/4.
628          Under the GAMMA model the 4 discrete GAMMA rates all have the same probability
629          of 0.25 */
630 
631       if(!fastScaling)
632         term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
633       else
634         term = log(0.25 * fabs(term));
635 
636       /* if required get the per-site log likelihoods.
637          note that these are the plain per site log-likes, not
638          multiplied with the pattern weight value */
639 
640       if(getPerSiteLikelihoods)
641         perSiteLikelihoods[i] = term;
642 
643       sum += wptr[i] * term;
644     }
645   }
646   else
647   {
648     for (i = 0; i < n; i++)
649     {
650       /* same as before, only that now we access two inner likelihood vectors x1 and x2 */
651 
652       x1 = &(x1_start[span * i]);
653       x2 = &(x2_start[span * i]);
654 
655       for(j = 0, term = 0.0; j < 4; j++)
656         for(k = 0; k < states; k++)
657           term += x1[j * states + k] * x2[j * states + k] * diagptable[j * states + k];
658 
659       if(!fastScaling)
660         term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
661       else
662         term = log(0.25 * fabs(term));
663 
664       if(getPerSiteLikelihoods)
665         perSiteLikelihoods[i] = term;
666 
667       sum += wptr[i] * term;
668     }
669   }
670 
671   return sum;
672 }
673 
674 #if (defined(__SSE3) || defined(__AVX))
675 /** @ingroup evaluateLikelihoodGroup
676     @brief Memory saving version of the generic (and slow) implementation of log likelihood evaluation of a tree using the GAMMA model of rate heterogeneity
677 
678     Computes the log likelihood of the topology for a specific partition, assuming
679     that the GAMMA model of rate heterogeneity is used and memory saving technique
680     is enabled. The likelihood is computed at a virtual root placed at an edge whose
681     two end-points (nodes) have the conditional likelihood vectors \a x1 and \a x2.
682     Furthermore, if \a getPerSiteLikelihoods is set to \b PLL_TRUE, then the log
683     likelihood for each site is also computed and stored at the corresponding position
684     in the array \a perSiteLikelihoods.
685 
686     @param fastScaling
687       If set to \b PLL_FALSE, then the likelihood of each site is also multiplied by \a log(PLL_MINLIKELIHOOD) times the number
688       of times it has been scaled down
689 
690     @param ex1
691       An array that holds how many times a site has been scaled and points at the entries for node \a p. This
692       parameter is used if \a fastScaling is set to \b PLL_FALSE.
693 
694     @param ex2
695       An array that holds how many times a site has been scaled and points at the entries for node \a q. This
696       parameter is used if \a fastScaling is set to \b PLL_TRUE.
697 
698     @param wptr
699       Array holding the weight for each site in the compressed partition alignment
700 
701     @param x1_start
702       Conditional likelihood vectors for one of the two end-points of the specific edge for which we are evaluating the likelihood
703 
704     @param x2_start
705       Conditional likelihood vectors for the other end-point of the specific edge for which we are evaluating the likelihood
706 
707     @param tipVector
708       Precomputed table where the number of rows is equal to the number of possible basepair characters for the current data
709       type, i.e.16 for DNA and 23 for AA, and each rows contains \a states elements each of which contains transition
710       probabilities computed from the eigenvectors of the decomposed Q matrix.
711 
712     @param tipX1
713       If one of the two end-points (nodes) of the specific edge (for which we are evaluating the likelihood) is a tip, then
714       this holds a pointer to the sequence data (basepairs) already converted in the internal integer representation, and \a x2
715       holds the conditional likelihood vectors for the internal node.
716 
717     @param n
718       Number of sites for which we are doing the evaluation. For the single-thread version this is the
719       number of sites in the current partition, for multi-threads this is the number of sites assigned
720       to the running thread from the current partition.
721 
722     @param diagptable
723       Start of the array that contains the P-Matrix diagonal of the specific edge for which we are
724       evaluating the likehood, and for each category of the GAMMA model
725 
726     @param states
727       Number of states (4 for DNA, 20 for AA)
728 
729     @param perSiteLikelihoods
730       Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE
731 
732     @param getPerSiteLikelihoods
733       If set to \b PLL_TRUE then per-site log likelihoods are also computed and stored in \a perSiteLikelihoods
734 
735     @param x1_gapColumn
736 
737     @param x2_gapColumn
738 
739     @param x1_gap
740       Gap bitvector for the left child node
741 
742     @param x2_gap
743       Gap bitvector for the right child node
744 
745     @return
746       The evaluated log likelihood of the tree topology
747 
748     @todo
749       Document x1_gapColumn, x2_gapColumn, x1_gap, x2_gap and add a brief description of how this technique works
750 */
evaluateGAMMA_FLEX_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable,const int states,double * perSiteLikelihoods,pllBoolean getPerSiteLikelihoods,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)751 static double evaluateGAMMA_FLEX_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
752                                       double *x1_start, double *x2_start,
753                                       double *tipVector,
754                                       unsigned char *tipX1, const int n, double *diagptable, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods,
755                                       double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
756 {
757   double
758     sum = 0.0,
759     term,
760     *x1,
761     *x2,
762     *x1_ptr = x1_start,
763     *x2_ptr = x2_start;
764 
765   int
766     i,
767     j,
768     k;
769 
770   /* span is the offset within the likelihood array at an inner node that gets us from the values
771      of site i to the values of site i + 1 */
772 
773   const int
774     span = states * 4;
775 
776 
777   /* we distingusih between two cases here: one node of the two nodes defining the branch at which we put the virtual root is
778      a tip. Both nodes can not be tips because we do not allow for two-taxon trees ;-)
779      Nota that, if a node is a tip, this will always be tipX1. This is done for code simplicity and the flipping of the nodes
780      is done before when we compute the traversal descriptor.
781      */
782 
783   /* the left node is a tip */
784   if(tipX1)
785   {
786     /* loop over the sites of this partition */
787     for (i = 0; i < n; i++)
788     {
789       /* access pre-computed tip vector values via a lookup table */
790       x1 = &(tipVector[states * tipX1[i]]);
791       /* access the other(inner) node at the other end of the branch */
792 
793       if(x2_gap[i / 32] & mask32[i % 32])
794         x2 = x2_gapColumn;
795       else
796         {
797           x2 = x2_ptr;
798           x2_ptr += span;
799         }
800 
801       /* loop over GAMMA rate categories, hard-coded as 4 in RAxML */
802       for(j = 0, term = 0.0; j < 4; j++)
803         /* loop over states and multiply them with the P matrix */
804         for(k = 0; k < states; k++)
805           term += x1[k] * x2[j * states + k] * diagptable[j * states + k];
806 
807       /* take the log of the likelihood and multiply the per-gamma rate likelihood by 1/4.
808          Under the GAMMA model the 4 discrete GAMMA rates all have the same probability
809          of 0.25 */
810 
811       if(!fastScaling)
812         term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
813       else
814         term = log(0.25 * fabs(term));
815 
816       /* if required get the per-site log likelihoods.
817          note that these are the plain per site log-likes, not
818          multiplied with the pattern weight value */
819 
820       if(getPerSiteLikelihoods)
821         perSiteLikelihoods[i] = term;
822 
823       sum += wptr[i] * term;
824     }
825   }
826   else
827   {
828     for (i = 0; i < n; i++)
829     {
830       /* same as before, only that now we access two inner likelihood vectors x1 and x2 */
831 
832       if(x1_gap[i / 32] & mask32[i % 32])
833         x1 = x1_gapColumn;
834       else
835         {
836           x1 = x1_ptr;
837           x1_ptr += span;
838         }
839 
840       if(x2_gap[i / 32] & mask32[i % 32])
841         x2 = x2_gapColumn;
842       else
843         {
844           x2 = x2_ptr;
845           x2_ptr += span;
846         }
847 
848       for(j = 0, term = 0.0; j < 4; j++)
849         for(k = 0; k < states; k++)
850           term += x1[j * states + k] * x2[j * states + k] * diagptable[j * states + k];
851 
852       if(!fastScaling)
853         term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
854       else
855         term = log(0.25 * fabs(term));
856 
857       if(getPerSiteLikelihoods)
858         perSiteLikelihoods[i] = term;
859 
860       sum += wptr[i] * term;
861     }
862   }
863 
864   return sum;
865 }
866 #endif
867 
868 /** @ingroup evaluateLikelihoodGroup
869     @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the CAT model of rate heterogeneity
870 
871     Computes the log likelihood of the topology for a specific partition, assuming
872     that the CAT model of rate heterogeneity is used. The likelihood is computed at
873     a virtual root placed at an edge whose two end-points (nodes) have the conditional
874     likelihood vectors \a x1 and \a x2.
875     Furthermore, if \a getPerSiteLikelihoods is set to \b PLL_TRUE, then the log
876     likelihood for each site is also computed and stored at the corresponding position
877     in the array \a perSiteLikelihoods.
878 
879     @param fastScaling
880       If set to \b PLL_FALSE, then the likelihood of each site is also multiplied by \a log(PLL_MINLIKELIHOOD) times the number
881       of times it has been scaled down
882 
883     @param ex1
884       An array that holds how many times a site has been scaled and points at the entries for node \a p. This
885       parameter is used if \a fastScaling is set to \b PLL_FALSE.
886 
887     @param ex2
888       An array that holds how many times a site has been scaled and points at the entries for node \a q. This
889       parameter is used if \a fastScaling is set to \b PLL_TRUE.
890 
891     @param cptr
892       Array holding the rate for each site in the compressed partition alignment
893 
894     @param wptr
895       Array holding the weight for each site in the compressed partition alignment
896 
897     @param x1
898       Conditional likelihood vectors for one of the two end-points of the specific edge for which we are evaluating the likelihood
899 
900     @param x2
901       Conditional likelihood vectors for the other end-point of the specific edge for which we are evaluating the likelihood
902 
903     @param tipVector
904       Precomputed table where the number of rows is equal to the number of possible basepair characters for the current data type,
905       i.e.16 for DNA and 23 for AA, and each rows contains \a states elements each of which contains transition probabilities
906       computed from the eigenvectors of the decomposed Q matrix.
907 
908     @param tipX1
909       If one of the two end-points (nodes) of the specific edge (for which we are evaluating the likelihood) is a tip, then
910       this holds a pointer to the sequence data (basepairs) already converted in the internal integer representation, and \a x2
911       holds the conditional likelihood vectors for the internal node.
912 
913     @param n
914       Number of sites for which we are doing the evaluation. For the single-thread version this is the number of sites in the
915       current partition, for multi-threads this is the number of sites assigned to the running thread from the current partition.
916 
917     @param diagptable_start
918       Start of the array that contains the P-Matrix diagonal of the specific edge for which we are evaluating the likehood,
919       and for each category of the CAT model
920 
921     @param states
922       Number of states (4 for DNA, 20 for AA)
923 
924     @param perSiteLikelihoods
925       Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE
926 
927     @param getPerSiteLikelihoods
928       If set to \b PLL_TRUE then per-site log likelihoods are also computed and stored in \a perSiteLikelihoods
929 
930     @return
931       The evaluated log likelihood of the tree topology
932 */
evaluateCAT_FLEX(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,const int states,double * perSiteLikelihoods,pllBoolean getPerSiteLikelihoods)933 static double evaluateCAT_FLEX (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
934                                 double *x1, double *x2, double *tipVector,
935                                 unsigned char *tipX1, int n, double *diagptable_start, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods)
936 {
937   double
938     sum = 0.0,
939     term,
940     *diagptable,
941     *left,
942     *right;
943 
944   int
945     i,
946     l;
947 
948   /* chosing between tip vectors and non tip vectors is identical in all flavors of this function ,regardless
949      of whether we are using CAT, GAMMA, DNA or protein data etc */
950 
951   if(tipX1)
952   {
953     for (i = 0; i < n; i++)
954     {
955       /* same as in the GAMMA implementation */
956       left = &(tipVector[states * tipX1[i]]);
957       right = &(x2[states * i]);
958 
959       /* important difference here, we do not have, as for GAMMA
960          4 P matrices assigned to each site, but just one. However those
961          P-Matrices can be different for the sites.
962          Hence we index into the precalculated P-matrices for individual sites
963          via the category pointer cptr[i]
964          */
965       diagptable = &diagptable_start[states * cptr[i]];
966 
967       /* similar to gamma, with the only difference that we do not integrate (sum)
968          over the discrete gamma rates, but simply compute the likelihood of the
969          site and the given P-matrix */
970 
971       for(l = 0, term = 0.0; l < states; l++)
972         term += left[l] * right[l] * diagptable[l];
973 
974       /* take the log */
975        if(!fastScaling)
976          term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
977        else
978          term = log(fabs(term));
979 
980        /* if required get the per-site log likelihoods.
981           note that these are the plain per site log-likes, not
982           multiplied with the pattern weight value */
983 
984        if(getPerSiteLikelihoods)
985          perSiteLikelihoods[i] = term;
986 
987       /*
988          multiply the log with the pattern weight of this site.
989          The site pattern for which we just computed the likelihood may
990          represent several alignment columns sites that have been compressed
991          into one site pattern if they are exactly identical AND evolve under the same model,
992          i.e., form part of the same partition.
993          */
994 
995       sum += wptr[i] * term;
996     }
997   }
998   else
999   {
1000     for (i = 0; i < n; i++)
1001     {
1002       /* as before we now access the likelihood arrayes of two inner nodes */
1003       left  = &x1[states * i];
1004       right = &x2[states * i];
1005 
1006       diagptable = &diagptable_start[states * cptr[i]];
1007 
1008       for(l = 0, term = 0.0; l < states; l++)
1009         term += left[l] * right[l] * diagptable[l];
1010 
1011       if(!fastScaling)
1012         term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
1013       else
1014         term = log(fabs(term));
1015 
1016       if(getPerSiteLikelihoods)
1017         perSiteLikelihoods[i] = term;
1018 
1019       sum += wptr[i] * term;
1020     }
1021   }
1022 
1023   return  sum;
1024 }
1025 
1026 #if (defined(__SSE3) || defined(__AVX))
1027 /** @ingroup evaluateLikelihoodGroup
1028     @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the CAT model of rate heterogeneity with memory saving
1029 
1030     This is the same as ::evaluateCAT_FLEX but with the memory saving technique enabled.
1031     Please check ::evaluateCAT_FLEX for more information and a description of the common
1032     input parameters
1033 
1034     @param x1_gapColumn
1035 
1036     @param x2_gapColumn
1037 
1038     @param x1_gap
1039       Gap bitvector for the left child node
1040 
1041     @param x2_gap
1042       Gap bitvector for the right child node
1043 
1044     @todo
1045       Comment on x1_gapColumn and x2_gapColumn
1046 */
evaluateCAT_FLEX_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,const int states,double * perSiteLikelihoods,pllBoolean getPerSiteLikelihoods,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)1047 static double evaluateCAT_FLEX_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
1048                                      double *x1, double *x2, double *tipVector,
1049                                      unsigned char *tipX1, int n, double *diagptable_start, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods,
1050                                      double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
1051 {
1052   double
1053     sum = 0.0,
1054     term,
1055     *diagptable,
1056     *left,
1057     *right,
1058     *left_ptr = x1,
1059     *right_ptr = x2;
1060 
1061   int
1062     i,
1063     l;
1064 
1065   /* chosing between tip vectors and non tip vectors is identical in all flavors of this function ,regardless
1066      of whether we are using CAT, GAMMA, DNA or protein data etc */
1067 
1068   if(tipX1)
1069   {
1070     for (i = 0; i < n; i++)
1071     {
1072       /* same as in the GAMMA implementation */
1073       left = &(tipVector[states * tipX1[i]]);
1074 
1075       if(isGap(x2_gap, i))
1076         right = x2_gapColumn;
1077       else
1078         {
1079           right = right_ptr;
1080           right_ptr += states;
1081         }
1082       /* important difference here, we do not have, as for GAMMA
1083          4 P matrices assigned to each site, but just one. However those
1084          P-Matrices can be different for the sites.
1085          Hence we index into the precalculated P-matrices for individual sites
1086          via the category pointer cptr[i]
1087          */
1088       diagptable = &diagptable_start[states * cptr[i]];
1089 
1090       /* similar to gamma, with the only difference that we do not integrate (sum)
1091          over the discrete gamma rates, but simply compute the likelihood of the
1092          site and the given P-matrix */
1093 
1094       for(l = 0, term = 0.0; l < states; l++)
1095         term += left[l] * right[l] * diagptable[l];
1096 
1097       /* take the log */
1098        if(!fastScaling)
1099          term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
1100        else
1101          term = log(fabs(term));
1102 
1103        /* if required get the per-site log likelihoods.
1104           note that these are the plain per site log-likes, not
1105           multiplied with the pattern weight value */
1106 
1107        if(getPerSiteLikelihoods)
1108          perSiteLikelihoods[i] = term;
1109 
1110       /*
1111          multiply the log with the pattern weight of this site.
1112          The site pattern for which we just computed the likelihood may
1113          represent several alignment columns sites that have been compressed
1114          into one site pattern if they are exactly identical AND evolve under the same model,
1115          i.e., form part of the same partition.
1116          */
1117 
1118       sum += wptr[i] * term;
1119     }
1120   }
1121   else
1122   {
1123     for (i = 0; i < n; i++)
1124     {
1125       /* as before we now access the likelihood arrayes of two inner nodes */
1126 
1127       if(isGap(x1_gap, i))
1128         left = x1_gapColumn;
1129       else
1130         {
1131           left = left_ptr;
1132           left_ptr += states;
1133         }
1134 
1135       if(isGap(x2_gap, i))
1136         right = x2_gapColumn;
1137       else
1138         {
1139           right = right_ptr;
1140           right_ptr += states;
1141         }
1142 
1143       diagptable = &diagptable_start[states * cptr[i]];
1144 
1145       for(l = 0, term = 0.0; l < states; l++)
1146         term += left[l] * right[l] * diagptable[l];
1147 
1148       if(!fastScaling)
1149         term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
1150       else
1151         term = log(fabs(term));
1152 
1153       if(getPerSiteLikelihoods)
1154         perSiteLikelihoods[i] = term;
1155 
1156       sum += wptr[i] * term;
1157     }
1158   }
1159 
1160   return  sum;
1161 }
1162 #endif
1163 
1164 
1165 /* This is the core function for computing the log likelihood at a branch */
1166 /** @ingroup evaluateLikelihoodGroup
1167     @brief Evaluate the log likelihood of a specific branch of the topology
1168 
1169     Evaluates the likelihood of the tree topology assuming a virtual root is
1170     placed at the edge whose end-points are node with number \a pNumber and \a
1171     qNumber in the first slot of the traversal descriptor. The function first
1172     computes the conditional likelihoods for all necessary nodes (the ones in
1173     the traversal descriptor list) by calling the function \a pllNewviewIterative
1174     and then evaluates the likelihood at the root. In addition, if \a
1175     getPerSiteLikelihoods is set to \b PLL_TRUE, the per-site likelihoods are
1176     stored in \a tr->lhs.
1177 
1178     @param tr
1179       PLL instance
1180 
1181     @param pr
1182       List of partitions
1183 
1184     @param getPerSiteLikelihoods
1185       If set to \b PLL_TRUE, compute the log likelihood for each site.
1186 
1187     @note
1188       This is an internal function and should not be called by the user. It assumes
1189       that a valid traversal descriptor has already been computed. It also assumes
1190       that the edge we are referring to is an edge that leads to a tip, i.e. either
1191       p or q of the first entry of traversal descriptor are tips.
1192 */
pllEvaluateIterative(pllInstance * tr,partitionList * pr,pllBoolean getPerSiteLikelihoods)1193 void pllEvaluateIterative(pllInstance *tr, partitionList *pr, pllBoolean getPerSiteLikelihoods)
1194 {
1195   /* the branch lengths and node indices of the virtual root branch are always the first one that
1196      are stored in the very important traversal array data structure that describes a partial or full tree traversal */
1197 
1198   /* get the branch length at the root */
1199   double
1200     *pz = tr->td[0].ti[0].qz;
1201 
1202   /* get the node number of the node to the left and right of the branch that defines the virtual rooting */
1203 
1204   int
1205     pNumber = tr->td[0].ti[0].pNumber,
1206     qNumber = tr->td[0].ti[0].qNumber,
1207     p_slot,
1208     q_slot,
1209     model;
1210 
1211   pllBoolean
1212     fastScaling = tr->fastScaling;
1213 
1214   /* the slots are the entries in xVector where the LH vector is available */
1215   if(tr->useRecom)
1216     {
1217       p_slot = tr->td[0].ti[0].slot_p;
1218       q_slot = tr->td[0].ti[0].slot_q;
1219     }
1220   else
1221     {
1222       p_slot = pNumber - tr->mxtips - 1;
1223       q_slot = qNumber - tr->mxtips - 1;
1224     }
1225 
1226   /* before we can compute the likelihood at the virtual root, we need to do a partial or full tree traversal to compute
1227      the conditional likelihoods of the vectors as specified in the traversal descriptor. Maintaining this tarversal descriptor consistent
1228      will unfortunately be the responsibility of users. This is tricky, if as planned for here, we use a rooted view (described somewhere in Felsenstein's book)
1229      for the conditional vectors with respect to the tree
1230      */
1231 
1232   /* iterate over all valid entries in the traversal descriptor */
1233 
1234   pllNewviewIterative(tr, pr, 1);
1235 
1236   /* after the above call we are sure that we have properly and consistently computed the
1237      conditionals to the right and left of the virtual root and we can now invoke the
1238      the log likelihood computation */
1239 
1240   /* we need to loop over all partitions. Note that we may have a mix of DNA, protein binary data etc partitions */
1241 
1242   for(model = 0; model < pr->numberOfPartitions; model++)
1243     {
1244       /* whats' the number of sites of this partition (at the current thread) */
1245       int
1246         width = pr->partitionData[model]->width;
1247 
1248       /*
1249          Important part of the tarversal descriptor:
1250          figure out if we need to recalculate the likelihood of this
1251          partition:
1252 
1253          The reasons why this is important in terms of performance are given in this paper
1254          here which you should actually read:
1255 
1256          A. Stamatakis, M. Ott: "Load Balance in the Phylogenetic Likelihood Kernel". Proceedings of ICPP 2009, accepted for publication, Vienna, Austria, September 2009
1257 
1258          The width > 0 check is for checking if under the cyclic data distribution of per-partition sites to threads this thread does indeed have a site
1259          of the current partition.
1260 
1261       */
1262 
1263       if(tr->td[0].executeModel[model] && width > 0)
1264         {
1265           int
1266 #if (defined(__SSE3) || defined(__AVX))
1267             rateHet = (int)discreteRateCategories(tr->rateHetModel),
1268 #endif
1269             categories,
1270             ascWidth = pr->partitionData[model]->states,
1271 
1272             /* get the number of states in the partition, e.g.: 4 = DNA, 20 = Protein */
1273 
1274             states = pr->partitionData[model]->states,
1275             *ex1 = NULL,
1276             *ex2 = NULL,
1277             *ex1_asc = NULL,
1278             *ex2_asc = NULL;
1279 
1280           double
1281             *rateCategories = (double*)NULL,
1282             z,
1283             partitionLikelihood = 0.0,
1284             *x1_start           = NULL,
1285             *x2_start           = NULL,
1286             *diagptable         = NULL,
1287             *x1_start_asc       = NULL,
1288             *x2_start_asc       = NULL;
1289 
1290 #if (defined(__SSE3) || defined(__AVX))
1291           double
1292             *x1_gapColumn = (double*)NULL,
1293             *x2_gapColumn = (double*)NULL;
1294 #endif
1295 
1296 #if (defined(__SSE3) || defined(__AVX))
1297           unsigned int
1298             *x1_gap = (unsigned int*)NULL,
1299             *x2_gap = (unsigned int*)NULL;
1300 #endif
1301 
1302           unsigned char
1303             *tip = (unsigned char*)NULL;
1304 
1305           /*
1306              figure out if we are using the CAT or GAMMA model of rate heterogeneity
1307              and set pointers to the rate heterogeneity rate arrays and also set the
1308              number of distinct rate categories appropriately.
1309 
1310              Under GAMMA this is constant and hard-coded as 4, weheras under CAT
1311              the number of site-wise rate categories can vary in the course of computations
1312              up to a user defined maximum value of site categories (default: 25)
1313           */
1314 
1315           if(tr->rateHetModel == PLL_CAT)
1316             {
1317               rateCategories = pr->partitionData[model]->perSiteRates;
1318               categories = pr->partitionData[model]->numberOfCategories;
1319             }
1320           else  /* GAMMA */
1321             {
1322               rateCategories = pr->partitionData[model]->gammaRates;
1323               categories = 4;
1324             }
1325 
1326           /* set this pointer to the memory area where space has been reserved a priori for storing the
1327              P matrix at the root */
1328 
1329           diagptable = pr->partitionData[model]->left;
1330 
1331           /* figure out if we need to address tip vectors (a char array that indexes into a precomputed tip likelihood
1332              value array) or if we need to address inner vectors */
1333 
1334           /* either node p or node q is a tip */
1335 
1336           if(isTip(pNumber, tr->mxtips) || isTip(qNumber, tr->mxtips))
1337             {
1338               /* q is a tip */
1339 
1340               if(isTip(qNumber, tr->mxtips))
1341                 {
1342                   /* get the start address of the inner likelihood vector x2 for partition model,
1343                      note that inner nodes are enumerated/indexed starting at 0 to save allocating some
1344                      space for additional pointers */
1345 
1346                   x2_start = pr->partitionData[model]->xVector[p_slot];
1347 
1348                   /* get the corresponding tip vector */
1349 
1350                   tip      = pr->partitionData[model]->yVector[qNumber];
1351 
1352 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1353                   if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
1354 #else
1355                   if (pr->partitionData[model]->ascBias)
1356 #endif
1357                    {
1358                      x2_start_asc  = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
1359                      ex2_asc       = &pr->partitionData[model]->ascExpVector[(pNumber - tr->mxtips - 1) * ascWidth];
1360                    }
1361 
1362 
1363                   /* memory saving stuff, let's deal with this later or ask Fernando ;-) */
1364 
1365 #if (defined(__SSE3) || defined(__AVX))
1366                   if(tr->saveMemory)
1367                     {
1368                       x2_gap         = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
1369                       x2_gapColumn   = &(pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet]);
1370                     }
1371 #endif
1372                   /* per site likelihood scaling */
1373 
1374                   if(!fastScaling)
1375                     ex2 = pr->partitionData[model]->expVector[p_slot];
1376                 }
1377               else
1378                 {
1379                   /* p is a tip, same as above */
1380 
1381                   x2_start = pr->partitionData[model]->xVector[q_slot];
1382                   tip = pr->partitionData[model]->yVector[pNumber];
1383 
1384 
1385 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1386                   if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
1387 #else
1388                   if (pr->partitionData[model]->ascBias)
1389 #endif
1390                    {
1391                      x2_start_asc  = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
1392                      ex2_asc       = &pr->partitionData[model]->ascExpVector[(qNumber - tr->mxtips - 1) * ascWidth];
1393                    }
1394 
1395 #if (defined(__SSE3) || defined(__AVX))
1396                   if(tr->saveMemory)
1397                     {
1398                       x2_gap         = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
1399                       x2_gapColumn   = &(pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet]);
1400                     }
1401 #endif
1402 
1403                   /* per site likelihood scaling */
1404 
1405                   if(!fastScaling)
1406                     ex2 = pr->partitionData[model]->expVector[q_slot];
1407                 }
1408             }
1409           else
1410             {
1411 
1412               assert(p_slot != q_slot);
1413               /* neither p nor q are tips, hence we need to get the addresses of two inner vectors */
1414 
1415               x1_start = pr->partitionData[model]->xVector[p_slot];
1416               x2_start = pr->partitionData[model]->xVector[q_slot];
1417 
1418               /* memory saving option */
1419 
1420 #if (defined(__SSE3) || defined(__AVX))
1421               if(tr->saveMemory)
1422                 {
1423                   x1_gap = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
1424                   x2_gap = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
1425                   x1_gapColumn   = &pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet];
1426                   x2_gapColumn   = &pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet];
1427                 }
1428 #endif
1429 
1430               /* per site likelihood scaling */
1431 
1432               if(!fastScaling)
1433                 {
1434                   ex1      = pr->partitionData[model]->expVector[p_slot];
1435                   ex2      = pr->partitionData[model]->expVector[q_slot];
1436                 }
1437 
1438 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1439               if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
1440 #else
1441               if (pr->partitionData[model]->ascBias)
1442 #endif
1443                {
1444                  x1_start_asc  = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
1445                  x2_start_asc  = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
1446 
1447                  ex1_asc       = &pr->partitionData[model]->ascExpVector[(pNumber - tr->mxtips - 1) * ascWidth];
1448                  ex2_asc       = &pr->partitionData[model]->ascExpVector[(qNumber - tr->mxtips - 1) * ascWidth];
1449                }
1450 
1451 
1452 
1453             }
1454 
1455 
1456           /* if we are using a per-partition branch length estimate, the branch has an index, otherwise, for a joint branch length
1457              estimate over all partitions we just use the branch length value with index 0 */
1458 
1459           if(pr->perGeneBranchLengths)
1460             z = pz[model];
1461           else
1462             z = pz[0];
1463 
1464           /* calc P-Matrix at root for branch z connecting nodes p and q */
1465 
1466           if(pr->partitionData[model]->dataType == PLL_AA_DATA && (pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X))
1467             calcDiagptableFlex_LG4(z, 4, pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN_LG4, diagptable, 20);
1468           else
1469             calcDiagptable(z, states, categories, rateCategories, pr->partitionData[model]->EIGN, diagptable);
1470 
1471 #if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
1472 
1473           /* generic slow functions, memory saving option is not implemented for these */
1474 
1475           assert(!tr->saveMemory);
1476 
1477           /* decide wheter CAT or GAMMA is used and compute log like */
1478           if(tr->rateHetModel == PLL_CAT)
1479             partitionLikelihood = evaluateCAT_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1480                                                 x1_start, x2_start, pr->partitionData[model]->tipVector,
1481                                                 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, getPerSiteLikelihoods);
1482           else
1483             partitionLikelihood = evaluateGAMMA_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1484                                                 x1_start, x2_start, pr->partitionData[model]->tipVector,
1485                                                 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, getPerSiteLikelihoods);
1486 #else
1487 
1488           /* if we want to compute the per-site likelihoods, we use the generic evaluate function implementations
1489              for this, because the slowdown is not that dramatic */
1490 
1491           if(getPerSiteLikelihoods)
1492             {
1493 #ifdef __MIC_NATIVE
1494                           // not supported on MIC!
1495                           assert(0 && "Per-site LH calculations is not implemented on Intel MIC");
1496 #else
1497                if(tr->rateHetModel == PLL_CAT)
1498                 {
1499                    if(tr->saveMemory)
1500                      partitionLikelihood = evaluateCAT_FLEX_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1501                                                                  x1_start, x2_start, pr->partitionData[model]->tipVector,
1502                                                                  tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE,
1503                                                                  x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1504                    else
1505                      partitionLikelihood = evaluateCAT_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1506                                                             x1_start, x2_start, pr->partitionData[model]->tipVector,
1507                                                             tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE);
1508                 }
1509               else
1510                 {
1511                   if(tr->saveMemory)
1512                     partitionLikelihood = evaluateGAMMA_FLEX_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1513                                                                   x1_start, x2_start, pr->partitionData[model]->tipVector,
1514                                                                   tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE,
1515                                                                   x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1516                   else
1517                     partitionLikelihood = evaluateGAMMA_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1518                                                              x1_start, x2_start, pr->partitionData[model]->tipVector,
1519                                                              tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE);
1520                 }
1521 #endif
1522             }
1523           else
1524             {
1525               /* for the optimized functions we have a dedicated, optimized function implementation
1526                  for each rate heterogeneity and data type combination, we switch over the number of states
1527                  and the rate heterogeneity model */
1528 
1529               switch(states)
1530                 {
1531                 case 2: /* binary */
1532                   assert (!tr->saveMemory);
1533                   if (tr->rateHetModel == PLL_CAT)
1534                    {
1535                      partitionLikelihood =  evaluateGTRCAT_BINARY(ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1536                                                                   x1_start, x2_start, pr->partitionData[model]->tipVector,
1537                                                                   tip, width, diagptable, fastScaling);
1538                    }
1539                   else
1540                    {
1541                      partitionLikelihood = evaluateGTRGAMMA_BINARY(ex1, ex2, pr->partitionData[model]->wgt,
1542                                                                    x1_start, x2_start, pr->partitionData[model]->tipVector,
1543                                                                    tip, width, diagptable, fastScaling);
1544                    }
1545                   break;
1546                 case 4: /* DNA */
1547                   {
1548 
1549 #ifdef __MIC_NATIVE
1550 
1551                   /* CAT & memory saving are not supported on MIC */
1552 
1553                   assert(!tr->saveMemory);
1554                   assert(tr->rateHetModel == PLL_GAMMA);
1555 
1556                   partitionLikelihood =  evaluateGTRGAMMA_MIC(ex1, ex2, pr->partitionData[model]->wgt,
1557                                               x1_start, x2_start, pr->partitionData[model]->tipVector,
1558                                               tip, width, diagptable, fastScaling);
1559 #else
1560                     if(tr->rateHetModel == PLL_CAT)
1561                       {
1562                         if(tr->saveMemory)
1563                           partitionLikelihood =  evaluateGTRCAT_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1564                                                                      x1_start, x2_start, pr->partitionData[model]->tipVector,
1565                                                                      tip, width, diagptable, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1566                         else
1567                           partitionLikelihood =  evaluateGTRCAT(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1568                                                                 x1_start, x2_start, pr->partitionData[model]->tipVector,
1569                                                                 tip, width, diagptable);
1570                       }
1571                     else
1572                       {
1573                         if(tr->saveMemory)
1574                           partitionLikelihood =  evaluateGTRGAMMA_GAPPED_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1575                                                                               x1_start, x2_start, pr->partitionData[model]->tipVector,
1576                                                                               tip, width, diagptable,
1577                                                                               x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1578                         else
1579                           partitionLikelihood =  evaluateGTRGAMMA(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1580                                                                   x1_start, x2_start, pr->partitionData[model]->tipVector,
1581                                                                   tip, width, diagptable);
1582                       }
1583 #endif
1584                   }
1585                   break;
1586                 case 20: /* proteins */
1587                   {
1588 
1589 #ifdef __MIC_NATIVE
1590 
1591                   /* CAT & memory saving are not supported on MIC */
1592 
1593                   assert(!tr->saveMemory);
1594                   assert(tr->rateHetModel == PLL_GAMMA);
1595 
1596                   if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
1597                     partitionLikelihood =  evaluateGTRGAMMAPROT_LG4_MIC(pr->partitionData[model]->wgt,
1598                                                                     x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
1599                                                                     tip, width, diagptable, pr->partitionData[model]->lg4x_weights);
1600                   else
1601                         partitionLikelihood =  evaluateGTRGAMMAPROT_MIC(ex1, ex2, pr->partitionData[model]->wgt,
1602                                               x1_start, x2_start, pr->partitionData[model]->tipVector,
1603                                               tip, width, diagptable, fastScaling);
1604 
1605 //                  printf("tip: %p, width: %d,  lh: %f\n", tip, width, partitionLikelihood);
1606 //                  int g;
1607 //                  if (x1_start)
1608 //                                        for (g = 0; g < 20; ++g)
1609 //                                                printf("%f \t", x1_start[g]);
1610 //                  printf("\n");
1611 //                  if (x2_start)
1612 //                                        for (g = 0; g < 20; ++g)
1613 //                                                printf("%f \t", x2_start[g]);
1614 #else
1615 
1616                       if(tr->rateHetModel == PLL_CAT)
1617                       {
1618                         if(tr->saveMemory)
1619                           partitionLikelihood = evaluateGTRCATPROT_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1620                                                                         x1_start, x2_start, pr->partitionData[model]->tipVector,
1621                                                                         tip, width, diagptable,  x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1622                         else
1623                           partitionLikelihood = evaluateGTRCATPROT(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1624                                                                    x1_start, x2_start, pr->partitionData[model]->tipVector,
1625                                                                    tip, width, diagptable);
1626                       }
1627                     else
1628                       {
1629                         if(tr->saveMemory)
1630                           partitionLikelihood = evaluateGTRGAMMAPROT_GAPPED_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1631                                                                                  x1_start, x2_start, pr->partitionData[model]->tipVector,
1632                                                                                  tip, width, diagptable,
1633                                                                                  x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1634                         else
1635                       {
1636                         if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
1637                           partitionLikelihood =  evaluateGTRGAMMAPROT_LG4((int *)NULL, (int *)NULL, pr->partitionData[model]->wgt,
1638                                                                           x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
1639                                                                           tip, width, diagptable, PLL_TRUE, pr->partitionData[model]->lg4x_weights);
1640                         else
1641                           partitionLikelihood = evaluateGTRGAMMAPROT(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1642                                                                      x1_start, x2_start, pr->partitionData[model]->tipVector,
1643                                                                      tip, width, diagptable);
1644                       }
1645                       }
1646 #endif
1647                   }
1648                   break;
1649                 default:
1650                   assert(0);
1651                 }
1652             }
1653 #endif
1654 
1655           /* check that there was no major numerical screw-up, the log likelihood should be < 0.0 always */
1656 
1657           assert(partitionLikelihood < 0.0);
1658 
1659           /* now here is a nasty part, for each partition and each node we maintain an integer counter to count how often
1660              how many entries per node were scaled by a constant factor. Here we use this information generated during Felsenstein's
1661              pruning algorithm by the newview() functions to undo the preceding scaling multiplications at the root, for mathematical details
1662              you should actually read:
1663 
1664              A. Stamatakis: "Orchestrating the Phylogenetic Likelihood Function on Emerging Parallel Architectures".
1665              In B. Schmidt, editor, Bioinformatics: High Performance Parallel Computer Architectures, 85-115, CRC Press, Taylor & Francis, 2010.
1666 
1667              There's a copy of this book in my office
1668           */
1669 
1670           if(fastScaling)
1671             partitionLikelihood += (pr->partitionData[model]->globalScaler[pNumber] + pr->partitionData[model]->globalScaler[qNumber]) * log(PLL_MINLIKELIHOOD);
1672 
1673           /* now we have the correct log likelihood for the current partition after undoing scaling multiplications */
1674 
1675           /* finally, we also store the per partition log likelihood which is important for optimizing the alpha parameter
1676              of this partition for example */
1677 
1678           /* asc bias stuff */
1679 
1680 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1681           if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
1682 #else
1683           if (pr->partitionData[model]->ascBias)
1684 #endif
1685            {
1686              size_t
1687                i;
1688 
1689              int
1690                w = 0;
1691 
1692              double
1693                correction;
1694 
1695              switch(tr->rateHetModel)
1696                {
1697                case PLL_CAT:
1698                  {
1699                    double
1700                      rates = 1.0;
1701 
1702                    //need to re-calculate P-matrix for the correction here assuming a rate of 1.0
1703                    calcDiagptable(z, states, 1, &rates, pr->partitionData[model]->EIGN, diagptable);
1704 
1705 
1706                    correction = evaluateCatAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector,
1707                                                tip, ascWidth, diagptable, ascWidth);
1708                  }
1709                  break;
1710                case PLL_GAMMA:
1711                  correction = evaluateGammaAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector,
1712                                                tip, ascWidth, diagptable, ascWidth);
1713                  break;
1714                default:
1715                  assert(0);
1716                }
1717 
1718 
1719 
1720              for(i = (size_t)pr->partitionData[model]->lower; i < (size_t)pr->partitionData[model]->upper; i++)
1721                w += tr->aliaswgt[i];
1722 
1723              partitionLikelihood = partitionLikelihood - (double)w * log(1.0 - correction);
1724 
1725            }
1726 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1727           if(!(pr->partitionData[model]->ascBias && tr->threadID == 0))
1728            {
1729 #endif
1730              if(partitionLikelihood >= 0.0)
1731                {
1732                  printf("positive log like: %f for partition %d\n", partitionLikelihood, model);
1733                  assert(0);
1734                }
1735 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1736            }
1737 #endif
1738 
1739 
1740           pr->partitionData[model]->partitionLH = partitionLikelihood;
1741         }
1742       else
1743         {
1744           /* if the current thread does not have a single site of this partition
1745              it is important to set the per partition log like to 0.0 because
1746              of the reduction operation that will take place later-on.
1747              That is, the values of tr->perPartitionLH across all threads
1748              need to be in a consistent state, always !
1749           */
1750 
1751           if(width == 0)
1752             pr->partitionData[model]->partitionLH = 0.0;
1753         }
1754     }
1755 
1756 
1757 #ifdef DEBUG_PERSITE_LNL
1758   /* per persite-stuff */
1759   {
1760     int model = 0;
1761     for(model = 0; model < pr->numberOfPartitions ; ++model)
1762       {
1763         int j= 0;
1764         pInfo *partition  =  pr->partitionData[model];
1765         for(j = 0;  j < partition->width; ++j)
1766           printf("[%d] lnl[%d]=%f\n", tr->threadID, j, partition->perSiteLikelihoods[j]);
1767 
1768       }
1769   }
1770 
1771 #endif
1772 }
1773 
1774 
1775 
1776 /** @ingroup evaluateLikelihoodGroup
1777     @brief Evaluate the log likelihood of the tree topology
1778 
1779     Evaluate the log likelihood of the tree topology of instance \a tr by
1780     assuming a virtual root between nodes \a p and \a p->back. If
1781     \a fullTraversal is set to \b PLL_TRUE then the log likelihood vectors for
1782     each node are recomputed from scratch.
1783 
1784     @param tr
1785       PLL instance
1786 
1787     @param pr
1788       List of partitions
1789 
1790     @param p
1791       Specifies the virtual root, which is assumed to be a (virtual node) connecting \a p and \a p->back
1792 
1793     @param fullTraversal
1794       If set to \b PLL_TRUE, then the likelihood vectors at all nodes are recomputed, otherwise only the
1795       necessary vectors (those that are not oriented in the right direction) are recomputed.
1796 
1797     @param getPerSiteLikelihoods
1798       Also compute and store (in \a tr->lhs) the log likelihood of each site of the (compressed) alignment
1799 
1800     @note
1801       If \a getPerSiteLikelihoods is set to \b PLL_TRUE, then make sure that \a tr->fastScaling is set to
1802       \b PLL_FALSE, otherwise an assertion will fail.
1803 */
pllEvaluateLikelihood(pllInstance * tr,partitionList * pr,nodeptr p,pllBoolean fullTraversal,pllBoolean getPerSiteLikelihoods)1804 void pllEvaluateLikelihood (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean fullTraversal, pllBoolean getPerSiteLikelihoods)
1805 {
1806   /* now this may be the entry point of the library to compute
1807      the log like at a branch defined by p and p->back == q */
1808 
1809   volatile double
1810     result = 0.0;
1811 
1812   nodeptr
1813     q = p->back;
1814 
1815 
1816   pllBoolean
1817         p_recom = PLL_FALSE, /* if one of was missing, we will need to force recomputation */
1818         q_recom = PLL_FALSE;
1819 
1820   int
1821     i,
1822     model,
1823     numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions : 1;
1824 
1825   /* if evaluate shall return the per-site log likelihoods
1826      fastScaling needs to be disabled, otherwise this will
1827      not work */
1828 
1829   if(getPerSiteLikelihoods)
1830     assert(!(tr->fastScaling));
1831 
1832   /* set the first entry of the traversal descriptor to contain the indices
1833      of nodes p and q */
1834 
1835   tr->td[0].ti[0].pNumber = p->number;
1836   tr->td[0].ti[0].qNumber = q->number;
1837 
1838   /* copy the branch lengths of the tree into the first entry of the traversal descriptor.
1839      if -M is not used tr->numBranches must be 1 */
1840 
1841   for(i = 0; i < numBranches; i++)
1842     tr->td[0].ti[0].qz[i] =  q->z[i];
1843 
1844   /* recom part */
1845   if(tr->useRecom)
1846   {
1847     int slot = -1;
1848     if(!isTip(q->number, tr->mxtips))
1849     {
1850       q_recom = getxVector(tr->rvec, q->number, &slot, tr->mxtips);
1851       tr->td[0].ti[0].slot_q = slot;
1852     }
1853     if(!isTip(p->number, tr->mxtips))
1854     {
1855       p_recom = getxVector(tr->rvec, p->number, &slot, tr->mxtips);
1856       tr->td[0].ti[0].slot_p = slot;
1857     }
1858     if(!isTip(p->number, tr->mxtips) &&  !isTip(q->number, tr->mxtips))
1859       assert(tr->td[0].ti[0].slot_q != tr->td[0].ti[0].slot_p);
1860   }
1861 
1862 
1863   /* now compute how many conditionals must be re-computed/re-oriented by newview
1864      to be able to calculate the likelihood at the root defined by p and q.
1865      */
1866 
1867   /* one entry in the traversal descriptor is already used, hence set the tarversal length counter to 1 */
1868   tr->td[0].count = 1;
1869 
1870   if(fullTraversal)
1871   {
1872     assert(isTip(q->back->number, tr->mxtips));
1873     computeTraversal(tr, q, PLL_FALSE, numBranches);
1874   }
1875   else
1876   {
1877     if(p_recom || needsRecomp(tr->useRecom, tr->rvec, p, tr->mxtips))
1878       computeTraversal(tr, p, PLL_TRUE, numBranches);
1879 
1880     if(q_recom || needsRecomp(tr->useRecom, tr->rvec, q, tr->mxtips))
1881       computeTraversal(tr, q, PLL_TRUE, numBranches);
1882   }
1883 
1884 
1885   /* now we copy this partition execute mask into the traversal descriptor which must come from the
1886      calling program, the logic of this should not form part of the library */
1887 
1888   storeExecuteMaskInTraversalDescriptor(tr, pr);
1889 
1890   /* also store in the traversal descriptor that something has changed i.e., in the parallel case that the
1891      traversal descriptor list of nodes needs to be broadcast once again */
1892 
1893   tr->td[0].traversalHasChanged = PLL_TRUE;
1894 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1895 
1896   /* now here we enter the fork-join region for Pthreads */
1897 
1898 
1899   /* start the parallel region and tell all threads to compute the log likelihood for
1900      their fraction of the data. This call is implemented in the case switch of execFunction in axml.c
1901      */
1902   if(getPerSiteLikelihoods)
1903     {
1904       memset(tr->lhs, 0, sizeof(double) * tr->originalCrunchedLength);
1905       pllMasterBarrier(tr, pr, PLL_THREAD_EVALUATE_PER_SITE_LIKES);
1906     }
1907   else
1908     pllMasterBarrier (tr, pr, PLL_THREAD_EVALUATE);
1909 
1910   /* and now here we explicitly do the reduction operation , that is add over the
1911      per-thread and per-partition log likelihoods to obtain the overall log like
1912      over all sites and partitions */
1913 
1914 
1915   /*
1916      for unpartitioned data that's easy, we just sum over the log likes computed
1917      by each thread, thread 0 stores his results in reductionBuffer[0] thread 1 in
1918      reductionBuffer[1] and so on
1919      */
1920 
1921   /* This reduction for the partitioned case is more complicated because each thread
1922      needs to store the partial log like of each partition and we then need to collect
1923      and add everything */
1924 
1925 #else
1926   /* and here is just the sequential case, we directly call pllEvaluateIterative() above
1927      without having to tell the threads/processes that they need to compute this function now */
1928 
1929   pllEvaluateIterative(tr, pr, getPerSiteLikelihoods); //PLL_TRUE
1930 
1931   /*
1932     if we want to obtain per-site rates they have initially been stored
1933      in arrays that are associated to the partition, now we
1934      copy them into the vector tr->lhs[].
1935      We may also chose that the user needs to rpovide an array, but this can be decided later-on.
1936   */
1937 
1938   if(getPerSiteLikelihoods) //PLL_TRUE
1939     {
1940       for(model = 0; model < pr->numberOfPartitions; model++)
1941         memcpy(&(tr->lhs[pr->partitionData[model]->lower]), pr->partitionData[model]->perSiteLikelihoods, pr->partitionData[model]->width  * sizeof(double));
1942     }
1943 
1944 #endif
1945 
1946   for(model = 0; model < pr->numberOfPartitions; model++)
1947     result += pr->partitionData[model]->partitionLH;
1948 
1949   /* set the tree data structure likelihood value to the total likelihood */
1950 
1951   tr->likelihood = result;
1952 
1953   /* the code below is mainly for testing if the per-site log
1954      likelihoods we have stored in tr->lhs yield the same
1955      likelihood as the likelihood we computed.
1956      For numerical reasons we need to make a dirt PLL_ABS(difference) < epsilon
1957      comparison */
1958 
1959   if(getPerSiteLikelihoods) //PLL_TRUE
1960     {
1961       double
1962         likelihood = 0;
1963       int i;
1964 
1965       /* note that in tr->lhs, we just store the likelihood of
1966          one representative of a potentially compressed pattern,
1967          hence, we need to multiply the elemnts with the pattern
1968          weight vector */
1969 
1970 
1971       for(i = 0; i < tr->originalCrunchedLength; i++)
1972         {
1973 //          printf("lhs[%d]=%f * %d\n", i, tr->lhs[i], tr->aliaswgt[i]);
1974           likelihood += (tr->lhs[i]   * tr->aliaswgt[i] );
1975         }
1976 
1977       if( PLL_ABS(tr->likelihood - likelihood) > 0.00001)
1978         {
1979   //        printf("likelihood was %f\t summed/weighted per-site-lnl was %f\n", tr->likelihood, likelihood);
1980         }
1981 
1982         assert(PLL_ABS(tr->likelihood - likelihood) < 0.00001);
1983     }
1984 
1985 
1986   if(tr->useRecom)
1987   {
1988     unpinNode(tr->rvec, p->number, tr->mxtips);
1989     unpinNode(tr->rvec, q->number, tr->mxtips);
1990   }
1991 
1992   /* do some bookkeeping to have traversalHasChanged in a consistent state */
1993 
1994   tr->td[0].traversalHasChanged = PLL_FALSE;
1995 }
1996 
1997 
perSiteLogLikelihoods(pllInstance * tr,partitionList * pr,double * logLikelihoods)1998 void perSiteLogLikelihoods(pllInstance *tr, partitionList *pr, double *logLikelihoods)
1999 {
2000 #if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
2001   double
2002     //likelihood,
2003     accumulatedPerSiteLikelihood = 0.0;
2004 
2005   size_t
2006     localCount,
2007     i,
2008     //globalCounter,
2009     lower,
2010     upper;
2011   int model;
2012 #endif
2013   /* compute the likelihood of the tree with the standard function to:
2014      1. obtain the current score for error checking
2015      2. store a full tree traversal in the traversal descriptor that
2016      will then be used for calculating per-site log likelihoods
2017      for each site individually and independently */
2018 
2019   pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
2020 
2021   //likelihood = tr->likelihood;
2022 
2023   /* now compute per-site log likelihoods using the respective functions */
2024 
2025 #if (defined( _USE_PTHREADS ) || defined(_FINE_GRAIN_MPI))
2026   /* here we need a barrier to invoke a parallel region that calls
2027      function
2028      perSiteLogLikelihoodsPthreads(tree *tr, partitionList *pr, double *lhs, int n, int tid)
2029      defined above and subsequently collects the per-site log likelihoods
2030      computed by the threads and stored in local per-thread memory
2031      and stores them in buffer tr->lhs.
2032      This corresponds to a gather operation in MPI.
2033      */
2034 
2035   pllMasterBarrier (tr, pr, PLL_THREAD_PER_SITE_LIKELIHOODS);
2036 
2037   /*
2038      when the parallel region has terminated, the per-site log likelihoods
2039      are stored in array tr->lhs of the master thread which we copy to the result buffer
2040   */
2041 
2042   memcpy(logLikelihoods, tr->lhs, sizeof(double) * tr->originalCrunchedLength);
2043 
2044 
2045 #else
2046 
2047   /* sequential case: just loop over all partitions and compute per site log likelihoods */
2048 
2049   for(model = 0; model < pr->numberOfPartitions; model++)
2050   {
2051     lower = pr->partitionData[model]->lower;
2052     upper = pr->partitionData[model]->upper;
2053 
2054     for(i = lower, localCount = 0; i < upper; i++, localCount++)
2055     {
2056       double
2057         l;
2058 
2059       /*
2060          we need to switch of rate heterogeneity implementations here.
2061          when we have PSR we actually need to provide the per-site rate
2062          to the function evaluatePartialGeneric() that computes the
2063          per-site log likelihood.
2064          Under GAMMA, the rate will just be ignored, here we just set it to 1.0
2065          */
2066 
2067       switch(tr->rateHetModel)
2068       {
2069         case PLL_CAT:
2070           l = evaluatePartialGeneric (tr, pr, i, pr->partitionData[model]->perSiteRates[pr->partitionData[model]->rateCategory[localCount]], model);
2071           break;
2072         case PLL_GAMMA:
2073           l = evaluatePartialGeneric (tr, pr, i, 1.0, model);
2074           break;
2075         default:
2076           assert(0);
2077       }
2078 
2079       /* store value in result array and add the likelihood of this site to the overall likelihood */
2080 
2081       logLikelihoods[i] = l;
2082       accumulatedPerSiteLikelihood += l;
2083     }
2084   }
2085 
2086 
2087   /* error checking. We need a dirt PLL_ABS() < epsilon here, because the implementations
2088      (standard versus per-site) are pretty different and hence slight numerical
2089      deviations are expected */
2090 
2091   assert(PLL_ABS(tr->likelihood - accumulatedPerSiteLikelihood) < 0.00001);
2092 
2093 #endif
2094 
2095 
2096 
2097 }
2098 
2099 #if (defined(__SSE3) || defined(__AVX))
evaluateGTRCAT_BINARY(int * ex1,int * ex2,int * cptr,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,const pllBoolean fastScaling)2100 static double evaluateGTRCAT_BINARY (int *ex1, int *ex2, int *cptr, int *wptr,
2101                                      double *x1_start, double *x2_start, double *tipVector,
2102                                      unsigned char *tipX1, int n, double *diagptable_start, const pllBoolean fastScaling)
2103 {
2104   double  sum = 0.0, term;
2105   int     i;
2106 #if (!defined(__SSE3) && !defined(__AVX))
2107   int j;
2108 #endif
2109   double  *diagptable, *x1, *x2;
2110 
2111   if(tipX1)
2112     {
2113       for (i = 0; i < n; i++)
2114         {
2115 #if (defined(__SSE3) || defined(__AVX))
2116           PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2117 #endif
2118           x1 = &(tipVector[2 * tipX1[i]]);
2119           x2 = &(x2_start[2 * i]);
2120 
2121           diagptable = &(diagptable_start[2 * cptr[i]]);
2122 
2123 #if (defined(__SSE3) || defined(__AVX))
2124           _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
2125 
2126           if(fastScaling)
2127             term = log(fabs(t[0] + t[1]));
2128           else
2129             term = log(fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2130 #else
2131           for(j = 0, term = 0.0; j < 2; j++)
2132             term += x1[j] * x2[j] * diagptable[j];
2133 
2134           if(fastScaling)
2135             term = log(fabs(term));
2136           else
2137             term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2138 #endif
2139 
2140           sum += wptr[i] * term;
2141         }
2142     }
2143   else
2144     {
2145       for (i = 0; i < n; i++)
2146         {
2147 #if (defined(__SSE3) || defined(__AVX))
2148 		  PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2149 #endif
2150           x1 = &x1_start[2 * i];
2151           x2 = &x2_start[2 * i];
2152 
2153           diagptable = &diagptable_start[2 * cptr[i]];
2154 #if (defined(__SSE3) || defined(__AVX))
2155           _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
2156 
2157           if(fastScaling)
2158             term = log(fabs(t[0] + t[1]));
2159           else
2160             term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2161 #else
2162           for(j = 0, term = 0.0; j < 2; j++)
2163             term += x1[j] * x2[j] * diagptable[j];
2164 
2165           if(fastScaling)
2166             term = log(fabs(term));
2167           else
2168             term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2169 #endif
2170 
2171           sum += wptr[i] * term;
2172         }
2173     }
2174 
2175   return  sum;
2176 }
2177 
2178 
evaluateGTRGAMMA_BINARY(int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable,const pllBoolean fastScaling)2179 static double evaluateGTRGAMMA_BINARY(int *ex1, int *ex2, int *wptr,
2180                                       double *x1_start, double *x2_start,
2181                                       double *tipVector,
2182                                       unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
2183 {
2184   double   sum = 0.0, term;
2185   int     i, j;
2186 #if (!defined(__SSE3) && !defined(__AVX))
2187   int k;
2188 #endif
2189   double  *x1, *x2;
2190 
2191   if(tipX1)
2192     {
2193       for (i = 0; i < n; i++)
2194         {
2195 #if (defined(__SSE3) || defined(__AVX))
2196 		  PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2197           __m128d termv, x1v, x2v, dv;
2198 #endif
2199           x1 = &(tipVector[2 * tipX1[i]]);
2200           x2 = &x2_start[8 * i];
2201 #if (defined(__SSE3) || defined(__AVX))
2202           termv = _mm_set1_pd(0.0);
2203 
2204           for(j = 0; j < 4; j++)
2205             {
2206               x1v = _mm_load_pd(&x1[0]);
2207               x2v = _mm_load_pd(&x2[j * 2]);
2208               dv   = _mm_load_pd(&diagptable[j * 2]);
2209 
2210               x1v = _mm_mul_pd(x1v, x2v);
2211               x1v = _mm_mul_pd(x1v, dv);
2212 
2213               termv = _mm_add_pd(termv, x1v);
2214             }
2215 
2216           _mm_store_pd(t, termv);
2217 
2218           if(fastScaling)
2219             term = log(0.25 * (fabs(t[0] + t[1])));
2220           else
2221             term = log(0.25 * (fabs(t[0] + t[1]))) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2222 #else
2223           for(j = 0, term = 0.0; j < 4; j++)
2224             for(k = 0; k < 2; k++)
2225               term += x1[k] * x2[j * 2 + k] * diagptable[j * 2 + k];
2226 
2227           if(fastScaling)
2228             term = log(0.25 * fabs(term));
2229           else
2230             term = log(0.25 * fabs(term)) + ex2[i] * log(PLL_MINLIKELIHOOD);
2231 #endif
2232 
2233           sum += wptr[i] * term;
2234         }
2235     }
2236   else
2237     {
2238       for (i = 0; i < n; i++)
2239         {
2240 #if (defined(__SSE3) || defined(__AVX))
2241 		  PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2242           __m128d termv, x1v, x2v, dv;
2243 #endif
2244           x1 = &x1_start[8 * i];
2245           x2 = &x2_start[8 * i];
2246 
2247 #if (defined(__SSE3) || defined(__AVX))
2248           termv = _mm_set1_pd(0.0);
2249 
2250           for(j = 0; j < 4; j++)
2251             {
2252               x1v = _mm_load_pd(&x1[j * 2]);
2253               x2v = _mm_load_pd(&x2[j * 2]);
2254               dv   = _mm_load_pd(&diagptable[j * 2]);
2255 
2256               x1v = _mm_mul_pd(x1v, x2v);
2257               x1v = _mm_mul_pd(x1v, dv);
2258 
2259               termv = _mm_add_pd(termv, x1v);
2260             }
2261 
2262           _mm_store_pd(t, termv);
2263 
2264 
2265           if(fastScaling)
2266             term = log(0.25 * (fabs(t[0] + t[1])));
2267           else
2268             term = log(0.25 * (fabs(t[0] + t[1]))) + ((ex1[i] +ex2[i]) * log(PLL_MINLIKELIHOOD));
2269 #else
2270           for(j = 0, term = 0.0; j < 4; j++)
2271             for(k = 0; k < 2; k++)
2272               term += x1[j * 2 + k] * x2[j * 2 + k] * diagptable[j * 2 + k];
2273 
2274           if(fastScaling)
2275             term = log(0.25 * fabs(term));
2276           else
2277             term = log(0.25 * fabs(term)) + (ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD);
2278 #endif
2279 
2280           sum += wptr[i] * term;
2281         }
2282     }
2283 
2284   return sum;
2285 }
2286 #endif
2287 
2288 
2289 
2290 /* below are the optimized function versions with geeky intrinsics */
2291 
2292 /** @ingroup evaluateLikelihoodGroup
2293     @brief Evaluation of log likelihood of a tree under the GAMMA model of rate heterogeneity and LG4 model of evolution
2294 
2295     This is the same as ::evaluateGAMMA_FLEX but for the LG4 model. It contains two implementations,
2296     one which is the generic, and one that is optimized with SSE3 instructions. The two implementations
2297     are separated by preprocessor macros.
2298     The difference from ::evaluateGAMMA_FLEX is that we have 4 different tipVectors computed from the 4 different
2299     Q matrix decompositions.
2300     Please check ::evaluateGAMMA_FLEX for more information and a description of the common
2301     input parameters.
2302 */
evaluateGTRGAMMAPROT_LG4(int * ex1,int * ex2,int * wptr,double * x1,double * x2,double * tipVector[4],unsigned char * tipX1,int n,double * diagptable,const pllBoolean fastScaling,double * lg4_weights)2303 static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr,
2304                                        double *x1, double *x2,
2305                                        double *tipVector[4],
2306                                        unsigned char *tipX1, int n, double *diagptable, const pllBoolean fastScaling,
2307                                        double * lg4_weights)
2308 {
2309   double   sum = 0.0, term;
2310   int     i, j, l;
2311   double  *left, *right;
2312 
2313   if(tipX1)
2314     {
2315       for (i = 0; i < n; i++)
2316         {
2317 #if (defined(__SSE3) || defined(__AVX))
2318           __m128d tv = _mm_setzero_pd();
2319 
2320           for(j = 0, term = 0.0; j < 4; j++)
2321             {
2322               double *d = &diagptable[j * 20];
2323 
2324               __m128d
2325               	  t = _mm_setzero_pd(),
2326               	  w = _mm_set1_pd(lg4_weights[j]);
2327 
2328               left = &(tipVector[j][20 * tipX1[i]]);
2329               right = &(x2[80 * i + 20 * j]);
2330               for(l = 0; l < 20; l+=2)
2331                 {
2332                   __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2333                   t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2334                 }
2335               tv = _mm_add_pd(tv, _mm_mul_pd(t, w));
2336             }
2337 
2338           tv = _mm_hadd_pd(tv, tv);
2339           _mm_storel_pd(&term, tv);
2340 
2341 
2342 #else
2343           for(j = 0, term = 0.0; j < 4; j++)
2344             {
2345         	  double t = 0.0;
2346 
2347               left = &(tipVector[j][20 * tipX1[i]]);
2348               right = &(x2[80 * i + 20 * j]);
2349 
2350               for(l = 0; l < 20; l++)
2351                 t += left[l] * right[l] * diagptable[j * 20 + l];
2352 
2353               term += lg4_weights[j] * t;
2354             }
2355 #endif
2356 
2357           if(fastScaling)
2358             term = log(fabs(term));
2359           else
2360             term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2361 
2362           sum += wptr[i] * term;
2363 
2364         }
2365     }
2366   else
2367     {
2368       for (i = 0; i < n; i++)
2369         {
2370 #if (defined(__SSE3) || defined(__AVX))
2371           __m128d tv = _mm_setzero_pd();
2372 
2373           for(j = 0, term = 0.0; j < 4; j++)
2374             {
2375               double *d = &diagptable[j * 20];
2376 
2377               __m128d
2378               t = _mm_setzero_pd(),
2379               w = _mm_set1_pd(lg4_weights[j]);
2380 
2381               left  = &(x1[80 * i + 20 * j]);
2382               right = &(x2[80 * i + 20 * j]);
2383 
2384               for(l = 0; l < 20; l+=2)
2385                 {
2386                   __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2387                   t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2388                 }
2389               tv = _mm_add_pd(tv, _mm_mul_pd(t, w));
2390             }
2391           tv = _mm_hadd_pd(tv, tv);
2392           _mm_storel_pd(&term, tv);
2393 #else
2394           for(j = 0, term = 0.0; j < 4; j++)
2395             {
2396         	  double t = 0.0;
2397 
2398               left  = &(x1[80 * i + 20 * j]);
2399               right = &(x2[80 * i + 20 * j]);
2400 
2401               for(l = 0; l < 20; l++)
2402                 t += left[l] * right[l] * diagptable[j * 20 + l];
2403 
2404               term += lg4_weights[j] * t;
2405             }
2406 #endif
2407 
2408           if(fastScaling)
2409             term = log(fabs(term));
2410           else
2411             term = log(fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
2412 
2413           sum += wptr[i] * term;
2414         }
2415     }
2416 
2417   return  sum;
2418 }
2419 
2420 #if (defined(__SSE3) || defined(__AVX))
2421 /** @ingroup evaluateLikelihoodGroup
2422     @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity
2423     and the memory saving technique (Optimized SSE3 version for AA data)
2424 
2425     This is the SSE3 optimized version of ::evaluateGAMMA_FLEX_SAVE for evaluating the log
2426     likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2427     vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX_SAVE for more information and
2428     a description of the input parameters
2429 */
evaluateGTRGAMMAPROT_GAPPED_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)2430 static double evaluateGTRGAMMAPROT_GAPPED_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
2431                                                 double *x1, double *x2,
2432                                                 double *tipVector,
2433                                                 unsigned char *tipX1, int n, double *diagptable,
2434                                                 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
2435 {
2436   double   sum = 0.0, term;
2437   int     i, j, l;
2438   double
2439     *left,
2440     *right,
2441     *x1_ptr = x1,
2442     *x2_ptr = x2,
2443     *x1v,
2444     *x2v;
2445   __m128d tv;
2446 
2447   if(tipX1)
2448   {
2449     for (i = 0; i < n; i++)
2450     {
2451       if(x2_gap[i / 32] & mask32[i % 32])
2452         x2v = x2_gapColumn;
2453       else
2454       {
2455         x2v = x2_ptr;
2456         x2_ptr += 80;
2457       }
2458 
2459 	  //TUNG: Standard C does not allow declaration after executable statement
2460 	  tv = _mm_setzero_pd();
2461       //__m128d tv = _mm_setzero_pd();
2462       left = &(tipVector[20 * tipX1[i]]);
2463 
2464       for(j = 0, term = 0.0; j < 4; j++)
2465       {
2466         double *d = &diagptable[j * 20];
2467         right = &(x2v[20 * j]);
2468         for(l = 0; l < 20; l+=2)
2469         {
2470           __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2471           tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2472         }
2473       }
2474 
2475       tv = _mm_hadd_pd(tv, tv);
2476       _mm_storel_pd(&term, tv);
2477 
2478 
2479       if(!fastScaling)
2480         term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2481       else
2482         term = log(0.25 * fabs(term));
2483 
2484       sum += wptr[i] * term;
2485     }
2486   }
2487   else
2488   {
2489     for (i = 0; i < n; i++)
2490     {
2491       if(x1_gap[i / 32] & mask32[i % 32])
2492         x1v = x1_gapColumn;
2493       else
2494       {
2495         x1v = x1_ptr;
2496         x1_ptr += 80;
2497       }
2498 
2499       if(x2_gap[i / 32] & mask32[i % 32])
2500         x2v = x2_gapColumn;
2501       else
2502       {
2503         x2v = x2_ptr;
2504         x2_ptr += 80;
2505       }
2506 
2507       //__m128d tv = _mm_setzero_pd();
2508 	  tv = _mm_setzero_pd();
2509 
2510       for(j = 0, term = 0.0; j < 4; j++)
2511       {
2512         double *d = &diagptable[j * 20];
2513         left  = &(x1v[20 * j]);
2514         right = &(x2v[20 * j]);
2515 
2516         for(l = 0; l < 20; l+=2)
2517         {
2518           __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2519           tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2520         }
2521       }
2522       tv = _mm_hadd_pd(tv, tv);
2523       _mm_storel_pd(&term, tv);
2524 
2525 
2526        if(!fastScaling)
2527         term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2528       else
2529         term = log(0.25 * fabs(term));
2530 
2531 
2532       sum += wptr[i] * term;
2533     }
2534   }
2535 
2536   return  sum;
2537 }
2538 
2539 
2540 
2541 /** @ingroup evaluateLikelihoodGroup
2542     @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity
2543     (Optimized SSE3 version for AA data)
2544 
2545     This is the SSE3 optimized version of ::evaluateGAMMA_FLEX for evaluating the log
2546     likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2547     vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX for more information and
2548     a description of the common input parameters
2549 */
evaluateGTRGAMMAPROT(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable)2550 static double evaluateGTRGAMMAPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
2551                                     double *x1, double *x2,
2552                                     double *tipVector,
2553                                     unsigned char *tipX1, int n, double *diagptable)
2554 {
2555   double   sum = 0.0, term;
2556   int     i, j, l;
2557   double  *left, *right;
2558 
2559   if(tipX1)
2560   {
2561     for (i = 0; i < n; i++)
2562     {
2563 
2564       __m128d tv = _mm_setzero_pd();
2565       left = &(tipVector[20 * tipX1[i]]);
2566 
2567       for(j = 0, term = 0.0; j < 4; j++)
2568       {
2569         double *d = &diagptable[j * 20];
2570         right = &(x2[80 * i + 20 * j]);
2571         for(l = 0; l < 20; l+=2)
2572         {
2573           __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2574           tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2575         }
2576       }
2577       tv = _mm_hadd_pd(tv, tv);
2578       _mm_storel_pd(&term, tv);
2579 
2580 
2581       if(!fastScaling)
2582         term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2583       else
2584         term = log(0.25 * fabs(term));
2585 
2586 
2587       sum += wptr[i] * term;
2588     }
2589   }
2590   else
2591   {
2592     for (i = 0; i < n; i++)
2593     {
2594       __m128d tv = _mm_setzero_pd();
2595 
2596       for(j = 0, term = 0.0; j < 4; j++)
2597       {
2598         double *d = &diagptable[j * 20];
2599         left  = &(x1[80 * i + 20 * j]);
2600         right = &(x2[80 * i + 20 * j]);
2601 
2602         for(l = 0; l < 20; l+=2)
2603         {
2604           __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2605           tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2606         }
2607       }
2608       tv = _mm_hadd_pd(tv, tv);
2609       _mm_storel_pd(&term, tv);
2610 
2611 
2612        if(!fastScaling)
2613         term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2614       else
2615         term = log(0.25 * fabs(term));
2616 
2617 
2618       sum += wptr[i] * term;
2619     }
2620   }
2621 
2622   return  sum;
2623 }
2624 
2625 
2626 /** @ingroup evaluateLikelihoodGroup
2627     @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity
2628     (Optimized SSE3 version for AA data)
2629 
2630     This is the SSE3 optimized version of ::evaluateCAT_FLEX for evaluating the log
2631     likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2632     vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX for more information and
2633     a description of the common input parameters
2634 */
evaluateGTRCATPROT(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start)2635 static double evaluateGTRCATPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
2636                                   double *x1, double *x2, double *tipVector,
2637                                   unsigned char *tipX1, int n, double *diagptable_start)
2638 {
2639   double   sum = 0.0, term;
2640   double  *diagptable,  *left, *right;
2641   int     i, l;
2642   __m128d tv;
2643 
2644   if(tipX1)
2645   {
2646     for (i = 0; i < n; i++)
2647     {
2648       left = &(tipVector[20 * tipX1[i]]);
2649       right = &(x2[20 * i]);
2650 
2651       diagptable = &diagptable_start[20 * cptr[i]];
2652 
2653 	  //TUNG: Standard C does not allow declaration after executable statement
2654 	  tv = _mm_setzero_pd();
2655       //__m128d tv = _mm_setzero_pd();
2656 
2657       for(l = 0; l < 20; l+=2)
2658       {
2659         __m128d lv = _mm_load_pd(&left[l]);
2660         __m128d rv = _mm_load_pd(&right[l]);
2661         __m128d mul = _mm_mul_pd(lv, rv);
2662         __m128d dv = _mm_load_pd(&diagptable[l]);
2663 
2664         tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));
2665       }
2666 
2667       tv = _mm_hadd_pd(tv, tv);
2668       _mm_storel_pd(&term, tv);
2669 
2670       if(!fastScaling)
2671         term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2672       else
2673         term = log(fabs(term));
2674 
2675       sum += wptr[i] * term;
2676     }
2677   }
2678   else
2679   {
2680 
2681     for (i = 0; i < n; i++)
2682     {
2683       left  = &x1[20 * i];
2684       right = &x2[20 * i];
2685 
2686       diagptable = &diagptable_start[20 * cptr[i]];
2687 
2688       __m128d tv = _mm_setzero_pd();
2689 
2690       for(l = 0; l < 20; l+=2)
2691       {
2692         __m128d lv = _mm_load_pd(&left[l]);
2693         __m128d rv = _mm_load_pd(&right[l]);
2694         __m128d mul = _mm_mul_pd(lv, rv);
2695         __m128d dv = _mm_load_pd(&diagptable[l]);
2696 
2697         tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));
2698       }
2699 
2700       tv = _mm_hadd_pd(tv, tv);
2701       _mm_storel_pd(&term, tv);
2702 
2703       if(!fastScaling)
2704         term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2705       else
2706         term = log(fabs(term));
2707 
2708       sum += wptr[i] * term;
2709     }
2710   }
2711 
2712   return  sum;
2713 }
2714 
2715 
2716 /** @ingroup evaluateLikelihoodGroup
2717     @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity with memory saving
2718     (Optimized SSE3 version for AA data)
2719 
2720     This is the SSE3 optimized version of ::evaluateCAT_FLEX_SAVE for evaluating the log
2721     likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2722     vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX_SAVE for more information and
2723     a description of the common input parameters
2724 */
evaluateGTRCATPROT_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)2725 static double evaluateGTRCATPROT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
2726                                        double *x1, double *x2, double *tipVector,
2727                                        unsigned char *tipX1, int n, double *diagptable_start,
2728                                        double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
2729 {
2730   double
2731     sum = 0.0,
2732         term,
2733         *diagptable,
2734         *left,
2735         *right,
2736         *left_ptr = x1,
2737         *right_ptr = x2;
2738 
2739   int
2740     i,
2741     l;
2742 
2743   if(tipX1)
2744   {
2745     for (i = 0; i < n; i++)
2746     {
2747       left = &(tipVector[20 * tipX1[i]]);
2748 
2749       if(isGap(x2_gap, i))
2750         right = x2_gapColumn;
2751       else
2752       {
2753         right = right_ptr;
2754         right_ptr += 20;
2755       }
2756 
2757       diagptable = &diagptable_start[20 * cptr[i]];
2758 
2759       __m128d tv = _mm_setzero_pd();
2760 
2761       for(l = 0; l < 20; l+=2)
2762       {
2763         __m128d lv = _mm_load_pd(&left[l]);
2764         __m128d rv = _mm_load_pd(&right[l]);
2765         __m128d mul = _mm_mul_pd(lv, rv);
2766         __m128d dv = _mm_load_pd(&diagptable[l]);
2767 
2768         tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));
2769       }
2770 
2771       tv = _mm_hadd_pd(tv, tv);
2772       _mm_storel_pd(&term, tv);
2773 
2774       if(!fastScaling)
2775         term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2776       else
2777         term = log(fabs(term));
2778 
2779       sum += wptr[i] * term;
2780     }
2781   }
2782   else
2783   {
2784 
2785     for (i = 0; i < n; i++)
2786     {
2787       if(isGap(x1_gap, i))
2788         left = x1_gapColumn;
2789       else
2790       {
2791         left = left_ptr;
2792         left_ptr += 20;
2793       }
2794 
2795       if(isGap(x2_gap, i))
2796         right = x2_gapColumn;
2797       else
2798       {
2799         right = right_ptr;
2800         right_ptr += 20;
2801       }
2802 
2803       diagptable = &diagptable_start[20 * cptr[i]];
2804 
2805       __m128d tv = _mm_setzero_pd();
2806 
2807       for(l = 0; l < 20; l+=2)
2808       {
2809         __m128d lv = _mm_load_pd(&left[l]);
2810         __m128d rv = _mm_load_pd(&right[l]);
2811         __m128d mul = _mm_mul_pd(lv, rv);
2812         __m128d dv = _mm_load_pd(&diagptable[l]);
2813 
2814         tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));
2815       }
2816 
2817       tv = _mm_hadd_pd(tv, tv);
2818       _mm_storel_pd(&term, tv);
2819 
2820       if(!fastScaling)
2821         term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2822       else
2823         term = log(fabs(term));
2824 
2825       sum += wptr[i] * term;
2826     }
2827   }
2828 
2829   return  sum;
2830 }
2831 
2832 
2833 /** @ingroup evaluateLikelihoodGroup
2834     @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity with memory saving
2835     (Optimized SSE3 version for DNA data)
2836 
2837     This is the SSE3 optimized version of ::evaluateCAT_FLEX_SAVE for evaluating the log
2838     likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2839     vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX_SAVE for more information and
2840     a description of the common input parameters
2841 */
evaluateGTRCAT_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)2842 static double evaluateGTRCAT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
2843                                    double *x1_start, double *x2_start, double *tipVector,
2844                                    unsigned char *tipX1, int n, double *diagptable_start,
2845                                    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
2846 {
2847   double  sum = 0.0, term;
2848   int     i;
2849 
2850   double  *diagptable,
2851           *x1,
2852           *x2,
2853           *x1_ptr = x1_start,
2854           *x2_ptr = x2_start;
2855 
2856   if(tipX1)
2857   {
2858     for (i = 0; i < n; i++)
2859     {
2860     	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2861       __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
2862 
2863       x1 = &(tipVector[4 * tipX1[i]]);
2864 
2865       if(isGap(x2_gap, i))
2866         x2 = x2_gapColumn;
2867       else
2868       {
2869         x2 = x2_ptr;
2870         x2_ptr += 4;
2871       }
2872 
2873       diagptable = &diagptable_start[4 * cptr[i]];
2874 
2875       x1v1 =  _mm_load_pd(&x1[0]);
2876       x1v2 =  _mm_load_pd(&x1[2]);
2877       x2v1 =  _mm_load_pd(&x2[0]);
2878       x2v2 =  _mm_load_pd(&x2[2]);
2879       dv1  =  _mm_load_pd(&diagptable[0]);
2880       dv2  =  _mm_load_pd(&diagptable[2]);
2881 
2882       x1v1 = _mm_mul_pd(x1v1, x2v1);
2883       x1v1 = _mm_mul_pd(x1v1, dv1);
2884 
2885       x1v2 = _mm_mul_pd(x1v2, x2v2);
2886       x1v2 = _mm_mul_pd(x1v2, dv2);
2887 
2888       x1v1 = _mm_add_pd(x1v1, x1v2);
2889 
2890       _mm_store_pd(t, x1v1);
2891 
2892       if(!fastScaling)
2893         term = log(fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2894       else
2895         term = log(fabs(t[0] + t[1]));
2896 
2897 
2898 
2899       sum += wptr[i] * term;
2900     }
2901   }
2902   else
2903   {
2904     for (i = 0; i < n; i++)
2905     {
2906     	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2907       __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
2908 
2909       if(isGap(x1_gap, i))
2910         x1 = x1_gapColumn;
2911       else
2912       {
2913         x1 = x1_ptr;
2914         x1_ptr += 4;
2915       }
2916 
2917       if(isGap(x2_gap, i))
2918         x2 = x2_gapColumn;
2919       else
2920       {
2921         x2 = x2_ptr;
2922         x2_ptr += 4;
2923       }
2924 
2925       diagptable = &diagptable_start[4 * cptr[i]];
2926 
2927       x1v1 =  _mm_load_pd(&x1[0]);
2928       x1v2 =  _mm_load_pd(&x1[2]);
2929       x2v1 =  _mm_load_pd(&x2[0]);
2930       x2v2 =  _mm_load_pd(&x2[2]);
2931       dv1  =  _mm_load_pd(&diagptable[0]);
2932       dv2  =  _mm_load_pd(&diagptable[2]);
2933 
2934       x1v1 = _mm_mul_pd(x1v1, x2v1);
2935       x1v1 = _mm_mul_pd(x1v1, dv1);
2936 
2937       x1v2 = _mm_mul_pd(x1v2, x2v2);
2938       x1v2 = _mm_mul_pd(x1v2, dv2);
2939 
2940       x1v1 = _mm_add_pd(x1v1, x1v2);
2941 
2942       _mm_store_pd(t, x1v1);
2943 
2944 
2945        if(!fastScaling)
2946         term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2947       else
2948         term = log(fabs(t[0] + t[1]));
2949 
2950       sum += wptr[i] * term;
2951     }
2952   }
2953 
2954   return  sum;
2955 }
2956 
2957 
2958 /** @ingroup evaluateLikelihoodGroup
2959     @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity with memory saving
2960     (Optimized SSE3 version for DNA data)
2961 
2962     This is the SSE3 optimized version of ::evaluateGAMMA_FLEX_SAVE for evaluating the log
2963     likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2964     vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX_SAVE for more information and
2965     a description of the common input parameters
2966 */
evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)2967 static double evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
2968                                            double *x1_start, double *x2_start,
2969                                            double *tipVector,
2970                                            unsigned char *tipX1, const int n, double *diagptable,
2971                                            double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
2972 {
2973   double   sum = 0.0, term;
2974   int     i, j;
2975   double
2976     *x1,
2977     *x2,
2978     *x1_ptr = x1_start,
2979     *x2_ptr = x2_start;
2980 
2981 
2982 
2983   if(tipX1)
2984   {
2985 
2986 
2987     for (i = 0; i < n; i++)
2988     {
2989     	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2990       __m128d termv, x1v, x2v, dv;
2991 
2992       x1 = &(tipVector[4 * tipX1[i]]);
2993       if(x2_gap[i / 32] & mask32[i % 32])
2994         x2 = x2_gapColumn;
2995       else
2996       {
2997         x2 = x2_ptr;
2998         x2_ptr += 16;
2999       }
3000 
3001 
3002       termv = _mm_set1_pd(0.0);
3003 
3004       for(j = 0; j < 4; j++)
3005       {
3006         x1v = _mm_load_pd(&x1[0]);
3007         x2v = _mm_load_pd(&x2[j * 4]);
3008         dv   = _mm_load_pd(&diagptable[j * 4]);
3009 
3010         x1v = _mm_mul_pd(x1v, x2v);
3011         x1v = _mm_mul_pd(x1v, dv);
3012 
3013         termv = _mm_add_pd(termv, x1v);
3014 
3015         x1v = _mm_load_pd(&x1[2]);
3016         x2v = _mm_load_pd(&x2[j * 4 + 2]);
3017         dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
3018 
3019         x1v = _mm_mul_pd(x1v, x2v);
3020         x1v = _mm_mul_pd(x1v, dv);
3021 
3022         termv = _mm_add_pd(termv, x1v);
3023       }
3024 
3025       _mm_store_pd(t, termv);
3026 
3027        if(!fastScaling)
3028         term = log(0.25 * fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
3029       else
3030         term = log(0.25 * fabs(t[0] + t[1]));
3031 
3032 
3033       sum += wptr[i] * term;
3034     }
3035   }
3036   else
3037   {
3038 
3039     for (i = 0; i < n; i++)
3040     {
3041     	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
3042       __m128d termv, x1v, x2v, dv;
3043 
3044       if(x1_gap[i / 32] & mask32[i % 32])
3045         x1 = x1_gapColumn;
3046       else
3047       {
3048         x1 = x1_ptr;
3049         x1_ptr += 16;
3050       }
3051 
3052       if(x2_gap[i / 32] & mask32[i % 32])
3053         x2 = x2_gapColumn;
3054       else
3055       {
3056         x2 = x2_ptr;
3057         x2_ptr += 16;
3058       }
3059 
3060       termv = _mm_set1_pd(0.0);
3061 
3062       for(j = 0; j < 4; j++)
3063       {
3064         x1v = _mm_load_pd(&x1[j * 4]);
3065         x2v = _mm_load_pd(&x2[j * 4]);
3066         dv   = _mm_load_pd(&diagptable[j * 4]);
3067 
3068         x1v = _mm_mul_pd(x1v, x2v);
3069         x1v = _mm_mul_pd(x1v, dv);
3070 
3071         termv = _mm_add_pd(termv, x1v);
3072 
3073         x1v = _mm_load_pd(&x1[j * 4 + 2]);
3074         x2v = _mm_load_pd(&x2[j * 4 + 2]);
3075         dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
3076 
3077         x1v = _mm_mul_pd(x1v, x2v);
3078         x1v = _mm_mul_pd(x1v, dv);
3079 
3080         termv = _mm_add_pd(termv, x1v);
3081       }
3082 
3083       _mm_store_pd(t, termv);
3084 
3085       if(!fastScaling)
3086         term = log(0.25 * fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
3087       else
3088         term = log(0.25 * fabs(t[0] + t[1]));
3089 
3090 
3091       sum += wptr[i] * term;
3092     }
3093   }
3094 
3095   return sum;
3096 }
3097 
3098 
3099 /** @ingroup evaluateLikelihoodGroup
3100     @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity (Optimized SSE3 version for DNA data)
3101 
3102     This is the SSE3 optimized version of ::evaluateGAMMA_FLEX for evaluating the log
3103     likelihood at some edge whose two end-points (nodes) have the conditional likelihood
3104     vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX for more information and
3105     a description of the common input parameters
3106 */
evaluateGTRGAMMA(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable)3107 static double evaluateGTRGAMMA(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
3108                                double *x1_start, double *x2_start,
3109                                double *tipVector,
3110                                unsigned char *tipX1, const int n, double *diagptable)
3111 {
3112   double   sum = 0.0, term;
3113   int     i, j;
3114 
3115   double  *x1, *x2;
3116 
3117 
3118 
3119   if(tipX1)
3120   {
3121     for (i = 0; i < n; i++)
3122     {
3123     	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
3124       __m128d termv, x1v, x2v, dv;
3125 
3126       x1 = &(tipVector[4 * tipX1[i]]);
3127       x2 = &x2_start[16 * i];
3128 
3129 
3130       termv = _mm_set1_pd(0.0);
3131 
3132       for(j = 0; j < 4; j++)
3133       {
3134         x1v = _mm_load_pd(&x1[0]);
3135         x2v = _mm_load_pd(&x2[j * 4]);
3136         dv   = _mm_load_pd(&diagptable[j * 4]);
3137 
3138         x1v = _mm_mul_pd(x1v, x2v);
3139         x1v = _mm_mul_pd(x1v, dv);
3140 
3141         termv = _mm_add_pd(termv, x1v);
3142 
3143         x1v = _mm_load_pd(&x1[2]);
3144         x2v = _mm_load_pd(&x2[j * 4 + 2]);
3145         dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
3146 
3147         x1v = _mm_mul_pd(x1v, x2v);
3148         x1v = _mm_mul_pd(x1v, dv);
3149 
3150         termv = _mm_add_pd(termv, x1v);
3151       }
3152 
3153       _mm_store_pd(t, termv);
3154 
3155 
3156        if(!fastScaling)
3157         term = log(0.25 * fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
3158       else
3159         term = log(0.25 * fabs(t[0] + t[1]));
3160 
3161 
3162 
3163       sum += wptr[i] * term;
3164     }
3165   }
3166   else
3167   {
3168     for (i = 0; i < n; i++)
3169     {
3170     	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
3171       __m128d termv, x1v, x2v, dv;
3172 
3173 
3174       x1 = &x1_start[16 * i];
3175       x2 = &x2_start[16 * i];
3176 
3177 
3178       termv = _mm_set1_pd(0.0);
3179 
3180       for(j = 0; j < 4; j++)
3181       {
3182         x1v = _mm_load_pd(&x1[j * 4]);
3183         x2v = _mm_load_pd(&x2[j * 4]);
3184         dv   = _mm_load_pd(&diagptable[j * 4]);
3185 
3186         x1v = _mm_mul_pd(x1v, x2v);
3187         x1v = _mm_mul_pd(x1v, dv);
3188 
3189         termv = _mm_add_pd(termv, x1v);
3190 
3191         x1v = _mm_load_pd(&x1[j * 4 + 2]);
3192         x2v = _mm_load_pd(&x2[j * 4 + 2]);
3193         dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
3194 
3195         x1v = _mm_mul_pd(x1v, x2v);
3196         x1v = _mm_mul_pd(x1v, dv);
3197 
3198         termv = _mm_add_pd(termv, x1v);
3199       }
3200 
3201       _mm_store_pd(t, termv);
3202 
3203       if(!fastScaling)
3204         term = log(0.25 * fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
3205       else
3206         term = log(0.25 * fabs(t[0] + t[1]));
3207 
3208 
3209 
3210       sum += wptr[i] * term;
3211     }
3212   }
3213 
3214   return sum;
3215 }
3216 
3217 
3218 /** @ingroup evaluateLikelihoodGroup
3219     @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity (Optimized SSE3 version for DNA data)
3220 
3221     This is the SSE3 optimized version of ::evaluateCAT_FLEX for evaluating the log
3222     likelihood at some edge whose two end-points (nodes) have the conditional likelihood
3223     vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX for more information and
3224     a description of the common input parameters
3225 */
evaluateGTRCAT(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start)3226 static double evaluateGTRCAT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
3227                               double *x1_start, double *x2_start, double *tipVector,
3228                               unsigned char *tipX1, int n, double *diagptable_start)
3229 {
3230   double  sum = 0.0, term;
3231   int     i;
3232 
3233   double  *diagptable, *x1, *x2;
3234 
3235   if(tipX1)
3236   {
3237     for (i = 0; i < n; i++)
3238     {
3239     	PLL_ALIGN_BEGIN	double t[2] PLL_ALIGN_END;
3240       __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
3241 
3242       x1 = &(tipVector[4 * tipX1[i]]);
3243       x2 = &x2_start[4 * i];
3244 
3245       diagptable = &diagptable_start[4 * cptr[i]];
3246 
3247 
3248       x1v1 =  _mm_load_pd(&x1[0]);
3249       x1v2 =  _mm_load_pd(&x1[2]);
3250       x2v1 =  _mm_load_pd(&x2[0]);
3251       x2v2 =  _mm_load_pd(&x2[2]);
3252       dv1  =  _mm_load_pd(&diagptable[0]);
3253       dv2  =  _mm_load_pd(&diagptable[2]);
3254 
3255       x1v1 = _mm_mul_pd(x1v1, x2v1);
3256       x1v1 = _mm_mul_pd(x1v1, dv1);
3257 
3258       x1v2 = _mm_mul_pd(x1v2, x2v2);
3259       x1v2 = _mm_mul_pd(x1v2, dv2);
3260 
3261       x1v1 = _mm_add_pd(x1v1, x1v2);
3262 
3263       _mm_store_pd(t, x1v1);
3264 
3265        if(!fastScaling)
3266         term = log(fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
3267       else
3268         term = log(fabs(t[0] + t[1]));
3269 
3270 
3271       sum += wptr[i] * term;
3272     }
3273   }
3274   else
3275   {
3276     for (i = 0; i < n; i++)
3277     {
3278     	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
3279       __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
3280 
3281       x1 = &x1_start[4 * i];
3282       x2 = &x2_start[4 * i];
3283 
3284       diagptable = &diagptable_start[4 * cptr[i]];
3285 
3286 
3287       x1v1 =  _mm_load_pd(&x1[0]);
3288       x1v2 =  _mm_load_pd(&x1[2]);
3289       x2v1 =  _mm_load_pd(&x2[0]);
3290       x2v2 =  _mm_load_pd(&x2[2]);
3291       dv1  =  _mm_load_pd(&diagptable[0]);
3292       dv2  =  _mm_load_pd(&diagptable[2]);
3293 
3294       x1v1 = _mm_mul_pd(x1v1, x2v1);
3295       x1v1 = _mm_mul_pd(x1v1, dv1);
3296 
3297       x1v2 = _mm_mul_pd(x1v2, x2v2);
3298       x1v2 = _mm_mul_pd(x1v2, dv2);
3299 
3300       x1v1 = _mm_add_pd(x1v1, x1v2);
3301 
3302       _mm_store_pd(t, x1v1);
3303 
3304       if(!fastScaling)
3305         term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
3306       else
3307         term = log(fabs(t[0] + t[1]));
3308 
3309 
3310       sum += wptr[i] * term;
3311     }
3312   }
3313 
3314   return  sum;
3315 }
3316 
3317 
3318 
3319 
3320 
3321 #endif
3322