1 /**
2 * PLL (version 1.0.0) a software library for phylogenetic inference
3 * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
4 *
5 * Derived from
6 * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
7 * trees by Alexandros Stamatakis
8 *
9 * This program is free software: you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License as published by the Free
11 * Software Foundation, either version 3 of the License, or (at your option)
12 * any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17 * more details.
18 *
19 * You should have received a copy of the GNU General Public License along with
20 * this program. If not, see <http://www.gnu.org/licenses/>.
21 *
22 * For any other enquiries send an Email to Tomas Flouri
23 * Tomas.Flouri@h-its.org
24 *
25 * When publishing work that uses PLL please cite PLL
26 *
27 * @file evaluateGenericSpecial.c
28 *
29 * @brief Functions for computing the log likelihood at a given branch of the tree (i.e. a virtual root that is placed at this branch)
30 */
31 #include "mem_alloc.h"
32
33 #ifndef WIN32
34 #include <unistd.h>
35 #endif
36
37 #include <math.h>
38 #include <time.h>
39 #include <stdlib.h>
40 #include <stdio.h>
41 #include <ctype.h>
42 #include <string.h>
43 #include <assert.h>
44
45 #include "pll.h"
46 #include "pllInternal.h"
47
48 #ifdef __MIC_NATIVE
49 #include "mic_native.h"
50 #endif
51
52 /* the set of functions in here computes the log likelihood at a given branch (the virtual root of a tree) */
53
54 /* includes for using SSE3 intrinsics */
55
56 #ifdef __SSE3
57 #include <xmmintrin.h>
58 #include <pmmintrin.h>
59 /*#include <tmmintrin.h>*/
60 #endif
61
62
63 /** @defgroup evaluateLikelihoodGroup Likelihood evaluation
64
65 This set of functions deals with the evaluation of likelihood for the current topology
66 */
67
68
69
70
71
72
73
74 /* below are the function headers for unreadeble highly optimized versions of the above functions
75 for DNA and protein data that also use SSE3 intrinsics and implement some memory saving tricks.
76 The actual functions can be found at the end of this source file.
77 All other likelihood function implementation files:
78
79 newviewGenericSpacial.c
80 makenewzSpecial.c
81 evaluatePartialGenericSpecial.c
82
83 are also structured like this
84
85 To decide which set of function implementations to use you will have to undefine or define _OPTIMIZED_FUNCTIONS
86 in the Makefile
87 */
88 #if (defined(__SSE3) || defined(__AVX))
89
90 static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr,
91 double *x1, double *x2,
92 double *tipVector[4],
93 unsigned char *tipX1, int n, double *diagptable, const pllBoolean fastScaling,
94 double * lg4_weights);
95
96 /* GAMMA for proteins with memory saving */
97
98 static double evaluateGTRGAMMAPROT_GAPPED_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
99 double *x1, double *x2,
100 double *tipVector,
101 unsigned char *tipX1, int n, double *diagptable,
102 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
103
104
105 /* GAMMA for proteins */
106
107 static double evaluateGTRGAMMAPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
108 double *x1, double *x2,
109 double *tipVector,
110 unsigned char *tipX1, int n, double *diagptable);
111
112 /* CAT for proteins */
113
114 static double evaluateGTRCATPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
115 double *x1, double *x2, double *tipVector,
116 unsigned char *tipX1, int n, double *diagptable_start);
117
118
119 /* CAT for proteins with memory saving */
120
121 static double evaluateGTRCATPROT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
122 double *x1, double *x2, double *tipVector,
123 unsigned char *tipX1, int n, double *diagptable_start,
124 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
125
126 /* analogous DNA fuctions */
127
128 static double evaluateGTRCAT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
129 double *x1_start, double *x2_start, double *tipVector,
130 unsigned char *tipX1, int n, double *diagptable_start,
131 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
132
133 static double evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
134 double *x1_start, double *x2_start,
135 double *tipVector,
136 unsigned char *tipX1, const int n, double *diagptable,
137 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
138
139 static double evaluateGTRGAMMA(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
140 double *x1_start, double *x2_start,
141 double *tipVector,
142 unsigned char *tipX1, const int n, double *diagptable);
143
144
145 static double evaluateGTRCAT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
146 double *x1_start, double *x2_start, double *tipVector,
147 unsigned char *tipX1, int n, double *diagptable_start);
148
149
150 #endif
151
152 #if (defined(__AVX) || defined(__SSE3))
153 static double evaluateGTRGAMMA_BINARY(int *ex1, int *ex2, int *wptr,
154 double *x1_start, double *x2_start,
155 double *tipVector,
156 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling);
157
158 static double evaluateGTRCAT_BINARY (int *ex1, int *ex2, int *cptr, int *wptr,
159 double *x1_start, double *x2_start, double *tipVector,
160 unsigned char *tipX1, int n, double *diagptable_start, const pllBoolean fastScaling);
161 #endif
162
163
164 /*
165 global variables of pthreads version, reductionBuffer is the global array
166 that is used for implementing deterministic reduction operations, that is,
167 the total log likelihood over the partial log lieklihoods for the sites that each thread has computed
168
169 NumberOfThreads is just the number of threads.
170
171 Note the volatile modifier here, that guarantees that the compiler will not do weird optimizations
172 rearraengements of the code accessing those variables, because it does not know that several concurrent threads
173 will access those variables simulatenously
174
175 UPDATE: reductionBuffer is now merged with globalResult
176 */
177
178
179 /* a pre-computed 32-bit integer mask */
180
181 extern const unsigned int mask32[32];
182
183 /* the function below computes the P matrix from the decomposition of the Q matrix and the respective rate categories for a single partition */
184
185 /** @brief Compute the diagonal of P matrix for a specific edge
186
187 This function computes the diagonal of P matrix for a branch of length \a z
188 from the decomposition of the Q matrix specified in \a EIGN and the respective
189 rate categories \a rptr for a single partition. The diagonal is then stored in
190 \a diagptable.
191
192 @param z Length of edge
193 @param states Number of states
194 @param numberOfCategories Number of categories in the rate heterogeneity rate arrays
195 @param rptr Rate heterogeneity rate arrays
196 @param EIGN Eigenvalues
197 @param diagptable Where to store the resulting P matrix
198 */
calcDiagptable(const double z,const int states,const int numberOfCategories,const double * rptr,const double * EIGN,double * diagptable)199 static void calcDiagptable(const double z, const int states, const int numberOfCategories, const double *rptr, const double *EIGN, double *diagptable)
200 {
201 int
202 i,
203 l;
204
205 double
206 lz,
207 *lza = (double *)rax_malloc(sizeof(double) * states);
208
209 /* transform the root branch length to the log and check if it is not too small */
210
211 if (z < PLL_ZMIN)
212 lz = log(PLL_ZMIN);
213 else
214 lz = log(z);
215
216 /* do some pre-computations to avoid redundant computations further below */
217
218 for(i = 1; i < states; i++)
219 lza[i] = EIGN[i] * lz;
220
221 /* loop over the number of per-site or discrete gamma rate categories */
222
223 for(i = 0; i < numberOfCategories; i++)
224 {
225 /*
226 diagptable is a pre-allocated array of doubles that stores the P-Matrix
227 the first entry is always 1.0
228 */
229 diagptable[i * states] = 1.0;
230
231 /* compute the P matrix for all remaining states of the model */
232
233 for(l = 1; l < states; l++)
234 diagptable[i * states + l] = exp(rptr[i] * lza[l]);
235 }
236
237 rax_free(lza);
238 }
239
240 /** @brief Compute the diagonal of P matrix for a specific edge for the LG4 model
241
242 This function computes the diagonal of P matrix for a branch of length \a z
243 from the decomposition of the 4 LG4 Q matrices specified in \a EIGN and the respective
244 rate categories \a rptr for a single partition. The diagonal is then stored in
245 \a diagptable.
246
247 @param z
248 Length of edge
249
250 @param states
251 Number of states
252
253 @param numberOfCategories
254 Number of categories in the rate heterogeneity rate arrays
255
256 @param rptr
257 Rate heterogeneity rate arrays
258
259 @param EIGN
260 Eigenvalues of the 4 Q matrices
261
262 @param diagptable
263 Where to store the resulting P matrix
264
265 @param numStates
266 Number of states
267 */
calcDiagptableFlex_LG4(double z,int numberOfCategories,double * rptr,double * EIGN[4],double * diagptable,const int numStates)268 static void calcDiagptableFlex_LG4(double z, int numberOfCategories, double *rptr, double *EIGN[4], double *diagptable, const int numStates)
269 {
270 int
271 i,
272 l;
273
274 double
275 lz;
276
277 assert(numStates <= 64);
278
279 if (z < PLL_ZMIN)
280 lz = log(PLL_ZMIN);
281 else
282 lz = log(z);
283
284 for(i = 0; i < numberOfCategories; i++)
285 {
286 diagptable[i * numStates + 0] = 1.0;
287
288 for(l = 1; l < numStates; l++)
289 diagptable[i * numStates + l] = exp(rptr[i] * EIGN[i][l] * lz);
290 }
291 }
292
ascertainmentBiasSequence(unsigned char tip[32],int numStates)293 static void ascertainmentBiasSequence(unsigned char tip[32], int numStates)
294 {
295 assert(numStates <= 32 && numStates > 1);
296
297 switch(numStates)
298 {
299 case 2:
300 tip[0] = 1;
301 tip[1] = 2;
302 break;
303 case 4:
304 tip[0] = 1;
305 tip[1] = 2;
306 tip[2] = 4;
307 tip[3] = 8;
308 break;
309 default:
310 {
311 int
312 i;
313 for(i = 0; i < numStates; i++)
314 {
315 tip[i] = i;
316 //printf("%c ", inverseMeaningPROT[i]);
317 }
318 //printf("\n");
319 }
320 break;
321 }
322 }
323
evaluateCatAsc(int * ex1,int * ex2,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable,const int numStates)324 static double evaluateCatAsc(int *ex1, int *ex2,
325 double *x1, double *x2,
326 double *tipVector,
327 unsigned char *tipX1, int n, double *diagptable, const int numStates)
328 {
329 double
330 exponent,
331 sum = 0.0,
332 unobserved,
333 term,
334 *left,
335 *right;
336
337 int
338 i,
339 l;
340
341 unsigned char
342 tip[32];
343
344 ascertainmentBiasSequence(tip, numStates);
345
346 if(tipX1)
347 {
348 for (i = 0; i < n; i++)
349 {
350 left = &(tipVector[numStates * tip[i]]);
351 right = &(x2[i * numStates]);
352
353 term = 0.0;
354
355 for(l = 0; l < numStates; l++)
356 term += left[l] * right[l] * diagptable[l];
357
358 /* assumes that pow behaves as expected/specified for underflows
359 from the man page:
360 If result underflows, and is not representable,
361 a range error occurs and 0.0 is returned.
362 */
363
364 exponent = pow(PLL_MINLIKELIHOOD, (double)ex2[i]);
365
366 unobserved = fabs(term) * exponent;
367
368 #ifdef _DEBUG_ASC
369 if(ex2[i] > 0)
370 {
371 printf("s %d\n", ex2[i]);
372 assert(0);
373 }
374 #endif
375
376 sum += unobserved;
377 }
378 }
379 else
380 {
381 for (i = 0; i < n; i++)
382 {
383 term = 0.0;
384
385 left = &(x1[i * numStates]);
386 right = &(x2[i * numStates]);
387
388 for(l = 0; l < numStates; l++)
389 term += left[l] * right[l] * diagptable[l];
390
391 /* assumes that pow behaves as expected/specified for underflows
392 from the man page:
393 If result underflows, and is not representable,
394 a range error occurs and 0.0 is returned.
395 */
396
397 exponent = pow(PLL_MINLIKELIHOOD, (double)(ex1[i] + ex2[i]));
398
399 unobserved = fabs(term) * exponent;
400
401 #ifdef _DEBUG_ASC
402 if(ex2[i] > 0 || ex1[i] > 0)
403 {
404 printf("s %d %d\n", ex1[i], ex2[i]);
405 assert(0);
406 }
407 #endif
408
409 sum += unobserved;
410 }
411 }
412
413 return sum;
414 }
415
416
evaluateGammaAsc(int * ex1,int * ex2,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable,const int numStates)417 static double evaluateGammaAsc(int *ex1, int *ex2,
418 double *x1, double *x2,
419 double *tipVector,
420 unsigned char *tipX1, int n, double *diagptable, const int numStates)
421 {
422 double
423 exponent,
424 sum = 0.0,
425 unobserved,
426 term,
427 *left,
428 *right;
429
430 int
431 i,
432 j,
433 l;
434
435 const int
436 gammaStates = numStates * 4;
437
438 unsigned char
439 tip[32];
440
441 ascertainmentBiasSequence(tip, numStates);
442
443 if(tipX1)
444 {
445 for (i = 0; i < n; i++)
446 {
447 left = &(tipVector[numStates * tip[i]]);
448
449 for(j = 0, term = 0.0; j < 4; j++)
450 {
451 right = &(x2[gammaStates * i + numStates * j]);
452
453 for(l = 0; l < numStates; l++)
454 term += left[l] * right[l] * diagptable[j * numStates + l];
455 }
456
457 /* assumes that pow behaves as expected/specified for underflows
458 from the man page:
459 If result underflows, and is not representable,
460 a range error occurs and 0.0 is returned.
461 */
462
463 exponent = pow(PLL_MINLIKELIHOOD, (double)ex2[i]);
464
465 unobserved = fabs(term) * exponent;
466
467 #ifdef _DEBUG_ASC
468 if(ex2[i] > 0)
469 {
470 printf("s %d\n", ex2[i]);
471 assert(0);
472 }
473 #endif
474
475 sum += unobserved;
476 }
477 }
478 else
479 {
480 for (i = 0; i < n; i++)
481 {
482
483 for(j = 0, term = 0.0; j < 4; j++)
484 {
485 left = &(x1[gammaStates * i + numStates * j]);
486 right = &(x2[gammaStates * i + numStates * j]);
487
488 for(l = 0; l < numStates; l++)
489 term += left[l] * right[l] * diagptable[j * numStates + l];
490 }
491
492 /* assumes that pow behaves as expected/specified for underflows
493 from the man page:
494 If result underflows, and is not representable,
495 a range error occurs and 0.0 is returned.
496 */
497
498 exponent = pow(PLL_MINLIKELIHOOD, (double)(ex1[i] + ex2[i]));
499
500 unobserved = fabs(term) * exponent;
501
502 #ifdef _DEBUG_ASC
503 if(ex2[i] > 0 || ex1[i] > 0)
504 {
505 printf("s %d %d\n", ex1[i], ex2[i]);
506 assert(0);
507 }
508 #endif
509
510 sum += unobserved;
511 }
512 }
513
514 return sum;
515 }
516
517
518 /** @ingroup evaluateLikelihoodGroup
519 @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the GAMMA model of rate heterogeneity
520
521 Computes the log likelihood of the topology for a specific partition, assuming
522 that the GAMMA model of rate heterogeneity is used. The likelihood is computed at
523 a virtual root placed at an edge whose two end-points (nodes) have the conditional
524 likelihood vectors \a x1 and \a x2.
525 Furthermore, if \a getPerSiteLikelihoods is set to \b PLL_TRUE, then the log
526 likelihood for each site is also computed and stored at the corresponding position
527 in the array \a perSiteLikelihoods.
528
529 @param fastScaling
530 If set to \b PLL_FALSE, then the likelihood of each site is also multiplied by \a log(PLL_MINLIKELIHOOD) times the number
531 of times it has been scaled down
532
533 @param ex1
534 An array that holds how many times a site has been scaled and points at the entries for node \a p. This
535 parameter is used if \a fastScaling is set to \b PLL_FALSE.
536
537 @param ex2
538 An array that holds how many times a site has been scaled and points at the entries for node \a q. This
539 parameter is used if \a fastScaling is set to \b PLL_TRUE.
540
541 @param wptr
542 Array holding the weight for each site in the compressed partition alignment
543
544 @param x1_start
545 Conditional likelihood vectors for one of the two end-points of the specific edge for which we are evaluating the likelihood
546
547 @param x2_start
548 Conditional likelihood vectors for the other end-point of the specific edge for which we are evaluating the likelihood
549
550 @param tipVector
551 Precomputed table where the number of rows is equal to the number of possible basepair characters for the current data
552 type, i.e.16 for DNA and 23 for AA, and each rows contains \a states elements each of which contains transition
553 probabilities computed from the eigenvectors of the decomposed Q matrix.
554
555 @param tipX1
556 If one of the two end-points (nodes) of the specific edge (for which we are evaluating the likelihood) is a tip, then
557 this holds a pointer to the sequence data (basepairs) already converted in the internal integer representation, and \a x2
558 holds the conditional likelihood vectors for the internal node.
559
560 @param n
561 Number of sites for which we are doing the evaluation. For the single-thread version this is the
562 number of sites in the current partition, for multi-threads this is the number of sites assigned
563 to the running thread from the current partition.
564
565 @param diagptable
566 Start of the array that contains the P-Matrix diagonal of the specific edge for which we are
567 evaluating the likehood, and for each category of the GAMMA model
568
569 @param states
570 Number of states (4 for DNA, 20 for AA)
571
572 @param perSiteLikelihoods
573 Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE
574
575 @param getPerSiteLikelihoods
576 If set to \b PLL_TRUE then per-site log likelihoods are also computed and stored in \a perSiteLikelihoods
577
578 @return
579 The evaluated log likelihood of the tree topology
580 */
evaluateGAMMA_FLEX(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable,const int states,double * perSiteLikelihoods,pllBoolean getPerSiteLikelihoods)581 static double evaluateGAMMA_FLEX(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
582 double *x1_start, double *x2_start,
583 double *tipVector,
584 unsigned char *tipX1, const int n, double *diagptable, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods)
585 {
586 double
587 sum = 0.0,
588 term,
589 *x1,
590 *x2;
591
592 int
593 i,
594 j,
595 k;
596
597 /* span is the offset within the likelihood array at an inner node that gets us from the values
598 of site i to the values of site i + 1 */
599
600 const int
601 span = states * 4;
602
603
604 /* we distingusih between two cases here: one node of the two nodes defining the branch at which we put the virtual root is
605 a tip. Both nodes can not be tips because we do not allow for two-taxon trees ;-)
606 Nota that, if a node is a tip, this will always be tipX1. This is done for code simplicity and the flipping of the nodes
607 is done before when we compute the traversal descriptor.
608 */
609
610 /* the left node is a tip */
611 if(tipX1)
612 {
613 /* loop over the sites of this partition */
614 for (i = 0; i < n; i++)
615 {
616 /* access pre-computed tip vector values via a lookup table */
617 x1 = &(tipVector[states * tipX1[i]]);
618 /* access the other(inner) node at the other end of the branch */
619 x2 = &(x2_start[span * i]);
620
621 /* loop over GAMMA rate categories, hard-coded as 4 in RAxML */
622 for(j = 0, term = 0.0; j < 4; j++)
623 /* loop over states and multiply them with the P matrix */
624 for(k = 0; k < states; k++)
625 term += x1[k] * x2[j * states + k] * diagptable[j * states + k];
626
627 /* take the log of the likelihood and multiply the per-gamma rate likelihood by 1/4.
628 Under the GAMMA model the 4 discrete GAMMA rates all have the same probability
629 of 0.25 */
630
631 if(!fastScaling)
632 term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
633 else
634 term = log(0.25 * fabs(term));
635
636 /* if required get the per-site log likelihoods.
637 note that these are the plain per site log-likes, not
638 multiplied with the pattern weight value */
639
640 if(getPerSiteLikelihoods)
641 perSiteLikelihoods[i] = term;
642
643 sum += wptr[i] * term;
644 }
645 }
646 else
647 {
648 for (i = 0; i < n; i++)
649 {
650 /* same as before, only that now we access two inner likelihood vectors x1 and x2 */
651
652 x1 = &(x1_start[span * i]);
653 x2 = &(x2_start[span * i]);
654
655 for(j = 0, term = 0.0; j < 4; j++)
656 for(k = 0; k < states; k++)
657 term += x1[j * states + k] * x2[j * states + k] * diagptable[j * states + k];
658
659 if(!fastScaling)
660 term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
661 else
662 term = log(0.25 * fabs(term));
663
664 if(getPerSiteLikelihoods)
665 perSiteLikelihoods[i] = term;
666
667 sum += wptr[i] * term;
668 }
669 }
670
671 return sum;
672 }
673
674 #if (defined(__SSE3) || defined(__AVX))
675 /** @ingroup evaluateLikelihoodGroup
676 @brief Memory saving version of the generic (and slow) implementation of log likelihood evaluation of a tree using the GAMMA model of rate heterogeneity
677
678 Computes the log likelihood of the topology for a specific partition, assuming
679 that the GAMMA model of rate heterogeneity is used and memory saving technique
680 is enabled. The likelihood is computed at a virtual root placed at an edge whose
681 two end-points (nodes) have the conditional likelihood vectors \a x1 and \a x2.
682 Furthermore, if \a getPerSiteLikelihoods is set to \b PLL_TRUE, then the log
683 likelihood for each site is also computed and stored at the corresponding position
684 in the array \a perSiteLikelihoods.
685
686 @param fastScaling
687 If set to \b PLL_FALSE, then the likelihood of each site is also multiplied by \a log(PLL_MINLIKELIHOOD) times the number
688 of times it has been scaled down
689
690 @param ex1
691 An array that holds how many times a site has been scaled and points at the entries for node \a p. This
692 parameter is used if \a fastScaling is set to \b PLL_FALSE.
693
694 @param ex2
695 An array that holds how many times a site has been scaled and points at the entries for node \a q. This
696 parameter is used if \a fastScaling is set to \b PLL_TRUE.
697
698 @param wptr
699 Array holding the weight for each site in the compressed partition alignment
700
701 @param x1_start
702 Conditional likelihood vectors for one of the two end-points of the specific edge for which we are evaluating the likelihood
703
704 @param x2_start
705 Conditional likelihood vectors for the other end-point of the specific edge for which we are evaluating the likelihood
706
707 @param tipVector
708 Precomputed table where the number of rows is equal to the number of possible basepair characters for the current data
709 type, i.e.16 for DNA and 23 for AA, and each rows contains \a states elements each of which contains transition
710 probabilities computed from the eigenvectors of the decomposed Q matrix.
711
712 @param tipX1
713 If one of the two end-points (nodes) of the specific edge (for which we are evaluating the likelihood) is a tip, then
714 this holds a pointer to the sequence data (basepairs) already converted in the internal integer representation, and \a x2
715 holds the conditional likelihood vectors for the internal node.
716
717 @param n
718 Number of sites for which we are doing the evaluation. For the single-thread version this is the
719 number of sites in the current partition, for multi-threads this is the number of sites assigned
720 to the running thread from the current partition.
721
722 @param diagptable
723 Start of the array that contains the P-Matrix diagonal of the specific edge for which we are
724 evaluating the likehood, and for each category of the GAMMA model
725
726 @param states
727 Number of states (4 for DNA, 20 for AA)
728
729 @param perSiteLikelihoods
730 Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE
731
732 @param getPerSiteLikelihoods
733 If set to \b PLL_TRUE then per-site log likelihoods are also computed and stored in \a perSiteLikelihoods
734
735 @param x1_gapColumn
736
737 @param x2_gapColumn
738
739 @param x1_gap
740 Gap bitvector for the left child node
741
742 @param x2_gap
743 Gap bitvector for the right child node
744
745 @return
746 The evaluated log likelihood of the tree topology
747
748 @todo
749 Document x1_gapColumn, x2_gapColumn, x1_gap, x2_gap and add a brief description of how this technique works
750 */
evaluateGAMMA_FLEX_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable,const int states,double * perSiteLikelihoods,pllBoolean getPerSiteLikelihoods,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)751 static double evaluateGAMMA_FLEX_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
752 double *x1_start, double *x2_start,
753 double *tipVector,
754 unsigned char *tipX1, const int n, double *diagptable, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods,
755 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
756 {
757 double
758 sum = 0.0,
759 term,
760 *x1,
761 *x2,
762 *x1_ptr = x1_start,
763 *x2_ptr = x2_start;
764
765 int
766 i,
767 j,
768 k;
769
770 /* span is the offset within the likelihood array at an inner node that gets us from the values
771 of site i to the values of site i + 1 */
772
773 const int
774 span = states * 4;
775
776
777 /* we distingusih between two cases here: one node of the two nodes defining the branch at which we put the virtual root is
778 a tip. Both nodes can not be tips because we do not allow for two-taxon trees ;-)
779 Nota that, if a node is a tip, this will always be tipX1. This is done for code simplicity and the flipping of the nodes
780 is done before when we compute the traversal descriptor.
781 */
782
783 /* the left node is a tip */
784 if(tipX1)
785 {
786 /* loop over the sites of this partition */
787 for (i = 0; i < n; i++)
788 {
789 /* access pre-computed tip vector values via a lookup table */
790 x1 = &(tipVector[states * tipX1[i]]);
791 /* access the other(inner) node at the other end of the branch */
792
793 if(x2_gap[i / 32] & mask32[i % 32])
794 x2 = x2_gapColumn;
795 else
796 {
797 x2 = x2_ptr;
798 x2_ptr += span;
799 }
800
801 /* loop over GAMMA rate categories, hard-coded as 4 in RAxML */
802 for(j = 0, term = 0.0; j < 4; j++)
803 /* loop over states and multiply them with the P matrix */
804 for(k = 0; k < states; k++)
805 term += x1[k] * x2[j * states + k] * diagptable[j * states + k];
806
807 /* take the log of the likelihood and multiply the per-gamma rate likelihood by 1/4.
808 Under the GAMMA model the 4 discrete GAMMA rates all have the same probability
809 of 0.25 */
810
811 if(!fastScaling)
812 term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
813 else
814 term = log(0.25 * fabs(term));
815
816 /* if required get the per-site log likelihoods.
817 note that these are the plain per site log-likes, not
818 multiplied with the pattern weight value */
819
820 if(getPerSiteLikelihoods)
821 perSiteLikelihoods[i] = term;
822
823 sum += wptr[i] * term;
824 }
825 }
826 else
827 {
828 for (i = 0; i < n; i++)
829 {
830 /* same as before, only that now we access two inner likelihood vectors x1 and x2 */
831
832 if(x1_gap[i / 32] & mask32[i % 32])
833 x1 = x1_gapColumn;
834 else
835 {
836 x1 = x1_ptr;
837 x1_ptr += span;
838 }
839
840 if(x2_gap[i / 32] & mask32[i % 32])
841 x2 = x2_gapColumn;
842 else
843 {
844 x2 = x2_ptr;
845 x2_ptr += span;
846 }
847
848 for(j = 0, term = 0.0; j < 4; j++)
849 for(k = 0; k < states; k++)
850 term += x1[j * states + k] * x2[j * states + k] * diagptable[j * states + k];
851
852 if(!fastScaling)
853 term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
854 else
855 term = log(0.25 * fabs(term));
856
857 if(getPerSiteLikelihoods)
858 perSiteLikelihoods[i] = term;
859
860 sum += wptr[i] * term;
861 }
862 }
863
864 return sum;
865 }
866 #endif
867
868 /** @ingroup evaluateLikelihoodGroup
869 @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the CAT model of rate heterogeneity
870
871 Computes the log likelihood of the topology for a specific partition, assuming
872 that the CAT model of rate heterogeneity is used. The likelihood is computed at
873 a virtual root placed at an edge whose two end-points (nodes) have the conditional
874 likelihood vectors \a x1 and \a x2.
875 Furthermore, if \a getPerSiteLikelihoods is set to \b PLL_TRUE, then the log
876 likelihood for each site is also computed and stored at the corresponding position
877 in the array \a perSiteLikelihoods.
878
879 @param fastScaling
880 If set to \b PLL_FALSE, then the likelihood of each site is also multiplied by \a log(PLL_MINLIKELIHOOD) times the number
881 of times it has been scaled down
882
883 @param ex1
884 An array that holds how many times a site has been scaled and points at the entries for node \a p. This
885 parameter is used if \a fastScaling is set to \b PLL_FALSE.
886
887 @param ex2
888 An array that holds how many times a site has been scaled and points at the entries for node \a q. This
889 parameter is used if \a fastScaling is set to \b PLL_TRUE.
890
891 @param cptr
892 Array holding the rate for each site in the compressed partition alignment
893
894 @param wptr
895 Array holding the weight for each site in the compressed partition alignment
896
897 @param x1
898 Conditional likelihood vectors for one of the two end-points of the specific edge for which we are evaluating the likelihood
899
900 @param x2
901 Conditional likelihood vectors for the other end-point of the specific edge for which we are evaluating the likelihood
902
903 @param tipVector
904 Precomputed table where the number of rows is equal to the number of possible basepair characters for the current data type,
905 i.e.16 for DNA and 23 for AA, and each rows contains \a states elements each of which contains transition probabilities
906 computed from the eigenvectors of the decomposed Q matrix.
907
908 @param tipX1
909 If one of the two end-points (nodes) of the specific edge (for which we are evaluating the likelihood) is a tip, then
910 this holds a pointer to the sequence data (basepairs) already converted in the internal integer representation, and \a x2
911 holds the conditional likelihood vectors for the internal node.
912
913 @param n
914 Number of sites for which we are doing the evaluation. For the single-thread version this is the number of sites in the
915 current partition, for multi-threads this is the number of sites assigned to the running thread from the current partition.
916
917 @param diagptable_start
918 Start of the array that contains the P-Matrix diagonal of the specific edge for which we are evaluating the likehood,
919 and for each category of the CAT model
920
921 @param states
922 Number of states (4 for DNA, 20 for AA)
923
924 @param perSiteLikelihoods
925 Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE
926
927 @param getPerSiteLikelihoods
928 If set to \b PLL_TRUE then per-site log likelihoods are also computed and stored in \a perSiteLikelihoods
929
930 @return
931 The evaluated log likelihood of the tree topology
932 */
evaluateCAT_FLEX(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,const int states,double * perSiteLikelihoods,pllBoolean getPerSiteLikelihoods)933 static double evaluateCAT_FLEX (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
934 double *x1, double *x2, double *tipVector,
935 unsigned char *tipX1, int n, double *diagptable_start, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods)
936 {
937 double
938 sum = 0.0,
939 term,
940 *diagptable,
941 *left,
942 *right;
943
944 int
945 i,
946 l;
947
948 /* chosing between tip vectors and non tip vectors is identical in all flavors of this function ,regardless
949 of whether we are using CAT, GAMMA, DNA or protein data etc */
950
951 if(tipX1)
952 {
953 for (i = 0; i < n; i++)
954 {
955 /* same as in the GAMMA implementation */
956 left = &(tipVector[states * tipX1[i]]);
957 right = &(x2[states * i]);
958
959 /* important difference here, we do not have, as for GAMMA
960 4 P matrices assigned to each site, but just one. However those
961 P-Matrices can be different for the sites.
962 Hence we index into the precalculated P-matrices for individual sites
963 via the category pointer cptr[i]
964 */
965 diagptable = &diagptable_start[states * cptr[i]];
966
967 /* similar to gamma, with the only difference that we do not integrate (sum)
968 over the discrete gamma rates, but simply compute the likelihood of the
969 site and the given P-matrix */
970
971 for(l = 0, term = 0.0; l < states; l++)
972 term += left[l] * right[l] * diagptable[l];
973
974 /* take the log */
975 if(!fastScaling)
976 term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
977 else
978 term = log(fabs(term));
979
980 /* if required get the per-site log likelihoods.
981 note that these are the plain per site log-likes, not
982 multiplied with the pattern weight value */
983
984 if(getPerSiteLikelihoods)
985 perSiteLikelihoods[i] = term;
986
987 /*
988 multiply the log with the pattern weight of this site.
989 The site pattern for which we just computed the likelihood may
990 represent several alignment columns sites that have been compressed
991 into one site pattern if they are exactly identical AND evolve under the same model,
992 i.e., form part of the same partition.
993 */
994
995 sum += wptr[i] * term;
996 }
997 }
998 else
999 {
1000 for (i = 0; i < n; i++)
1001 {
1002 /* as before we now access the likelihood arrayes of two inner nodes */
1003 left = &x1[states * i];
1004 right = &x2[states * i];
1005
1006 diagptable = &diagptable_start[states * cptr[i]];
1007
1008 for(l = 0, term = 0.0; l < states; l++)
1009 term += left[l] * right[l] * diagptable[l];
1010
1011 if(!fastScaling)
1012 term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
1013 else
1014 term = log(fabs(term));
1015
1016 if(getPerSiteLikelihoods)
1017 perSiteLikelihoods[i] = term;
1018
1019 sum += wptr[i] * term;
1020 }
1021 }
1022
1023 return sum;
1024 }
1025
1026 #if (defined(__SSE3) || defined(__AVX))
1027 /** @ingroup evaluateLikelihoodGroup
1028 @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the CAT model of rate heterogeneity with memory saving
1029
1030 This is the same as ::evaluateCAT_FLEX but with the memory saving technique enabled.
1031 Please check ::evaluateCAT_FLEX for more information and a description of the common
1032 input parameters
1033
1034 @param x1_gapColumn
1035
1036 @param x2_gapColumn
1037
1038 @param x1_gap
1039 Gap bitvector for the left child node
1040
1041 @param x2_gap
1042 Gap bitvector for the right child node
1043
1044 @todo
1045 Comment on x1_gapColumn and x2_gapColumn
1046 */
evaluateCAT_FLEX_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,const int states,double * perSiteLikelihoods,pllBoolean getPerSiteLikelihoods,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)1047 static double evaluateCAT_FLEX_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
1048 double *x1, double *x2, double *tipVector,
1049 unsigned char *tipX1, int n, double *diagptable_start, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods,
1050 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
1051 {
1052 double
1053 sum = 0.0,
1054 term,
1055 *diagptable,
1056 *left,
1057 *right,
1058 *left_ptr = x1,
1059 *right_ptr = x2;
1060
1061 int
1062 i,
1063 l;
1064
1065 /* chosing between tip vectors and non tip vectors is identical in all flavors of this function ,regardless
1066 of whether we are using CAT, GAMMA, DNA or protein data etc */
1067
1068 if(tipX1)
1069 {
1070 for (i = 0; i < n; i++)
1071 {
1072 /* same as in the GAMMA implementation */
1073 left = &(tipVector[states * tipX1[i]]);
1074
1075 if(isGap(x2_gap, i))
1076 right = x2_gapColumn;
1077 else
1078 {
1079 right = right_ptr;
1080 right_ptr += states;
1081 }
1082 /* important difference here, we do not have, as for GAMMA
1083 4 P matrices assigned to each site, but just one. However those
1084 P-Matrices can be different for the sites.
1085 Hence we index into the precalculated P-matrices for individual sites
1086 via the category pointer cptr[i]
1087 */
1088 diagptable = &diagptable_start[states * cptr[i]];
1089
1090 /* similar to gamma, with the only difference that we do not integrate (sum)
1091 over the discrete gamma rates, but simply compute the likelihood of the
1092 site and the given P-matrix */
1093
1094 for(l = 0, term = 0.0; l < states; l++)
1095 term += left[l] * right[l] * diagptable[l];
1096
1097 /* take the log */
1098 if(!fastScaling)
1099 term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
1100 else
1101 term = log(fabs(term));
1102
1103 /* if required get the per-site log likelihoods.
1104 note that these are the plain per site log-likes, not
1105 multiplied with the pattern weight value */
1106
1107 if(getPerSiteLikelihoods)
1108 perSiteLikelihoods[i] = term;
1109
1110 /*
1111 multiply the log with the pattern weight of this site.
1112 The site pattern for which we just computed the likelihood may
1113 represent several alignment columns sites that have been compressed
1114 into one site pattern if they are exactly identical AND evolve under the same model,
1115 i.e., form part of the same partition.
1116 */
1117
1118 sum += wptr[i] * term;
1119 }
1120 }
1121 else
1122 {
1123 for (i = 0; i < n; i++)
1124 {
1125 /* as before we now access the likelihood arrayes of two inner nodes */
1126
1127 if(isGap(x1_gap, i))
1128 left = x1_gapColumn;
1129 else
1130 {
1131 left = left_ptr;
1132 left_ptr += states;
1133 }
1134
1135 if(isGap(x2_gap, i))
1136 right = x2_gapColumn;
1137 else
1138 {
1139 right = right_ptr;
1140 right_ptr += states;
1141 }
1142
1143 diagptable = &diagptable_start[states * cptr[i]];
1144
1145 for(l = 0, term = 0.0; l < states; l++)
1146 term += left[l] * right[l] * diagptable[l];
1147
1148 if(!fastScaling)
1149 term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
1150 else
1151 term = log(fabs(term));
1152
1153 if(getPerSiteLikelihoods)
1154 perSiteLikelihoods[i] = term;
1155
1156 sum += wptr[i] * term;
1157 }
1158 }
1159
1160 return sum;
1161 }
1162 #endif
1163
1164
1165 /* This is the core function for computing the log likelihood at a branch */
1166 /** @ingroup evaluateLikelihoodGroup
1167 @brief Evaluate the log likelihood of a specific branch of the topology
1168
1169 Evaluates the likelihood of the tree topology assuming a virtual root is
1170 placed at the edge whose end-points are node with number \a pNumber and \a
1171 qNumber in the first slot of the traversal descriptor. The function first
1172 computes the conditional likelihoods for all necessary nodes (the ones in
1173 the traversal descriptor list) by calling the function \a pllNewviewIterative
1174 and then evaluates the likelihood at the root. In addition, if \a
1175 getPerSiteLikelihoods is set to \b PLL_TRUE, the per-site likelihoods are
1176 stored in \a tr->lhs.
1177
1178 @param tr
1179 PLL instance
1180
1181 @param pr
1182 List of partitions
1183
1184 @param getPerSiteLikelihoods
1185 If set to \b PLL_TRUE, compute the log likelihood for each site.
1186
1187 @note
1188 This is an internal function and should not be called by the user. It assumes
1189 that a valid traversal descriptor has already been computed. It also assumes
1190 that the edge we are referring to is an edge that leads to a tip, i.e. either
1191 p or q of the first entry of traversal descriptor are tips.
1192 */
pllEvaluateIterative(pllInstance * tr,partitionList * pr,pllBoolean getPerSiteLikelihoods)1193 void pllEvaluateIterative(pllInstance *tr, partitionList *pr, pllBoolean getPerSiteLikelihoods)
1194 {
1195 /* the branch lengths and node indices of the virtual root branch are always the first one that
1196 are stored in the very important traversal array data structure that describes a partial or full tree traversal */
1197
1198 /* get the branch length at the root */
1199 double
1200 *pz = tr->td[0].ti[0].qz;
1201
1202 /* get the node number of the node to the left and right of the branch that defines the virtual rooting */
1203
1204 int
1205 pNumber = tr->td[0].ti[0].pNumber,
1206 qNumber = tr->td[0].ti[0].qNumber,
1207 p_slot,
1208 q_slot,
1209 model;
1210
1211 pllBoolean
1212 fastScaling = tr->fastScaling;
1213
1214 /* the slots are the entries in xVector where the LH vector is available */
1215 if(tr->useRecom)
1216 {
1217 p_slot = tr->td[0].ti[0].slot_p;
1218 q_slot = tr->td[0].ti[0].slot_q;
1219 }
1220 else
1221 {
1222 p_slot = pNumber - tr->mxtips - 1;
1223 q_slot = qNumber - tr->mxtips - 1;
1224 }
1225
1226 /* before we can compute the likelihood at the virtual root, we need to do a partial or full tree traversal to compute
1227 the conditional likelihoods of the vectors as specified in the traversal descriptor. Maintaining this tarversal descriptor consistent
1228 will unfortunately be the responsibility of users. This is tricky, if as planned for here, we use a rooted view (described somewhere in Felsenstein's book)
1229 for the conditional vectors with respect to the tree
1230 */
1231
1232 /* iterate over all valid entries in the traversal descriptor */
1233
1234 pllNewviewIterative(tr, pr, 1);
1235
1236 /* after the above call we are sure that we have properly and consistently computed the
1237 conditionals to the right and left of the virtual root and we can now invoke the
1238 the log likelihood computation */
1239
1240 /* we need to loop over all partitions. Note that we may have a mix of DNA, protein binary data etc partitions */
1241
1242 for(model = 0; model < pr->numberOfPartitions; model++)
1243 {
1244 /* whats' the number of sites of this partition (at the current thread) */
1245 int
1246 width = pr->partitionData[model]->width;
1247
1248 /*
1249 Important part of the tarversal descriptor:
1250 figure out if we need to recalculate the likelihood of this
1251 partition:
1252
1253 The reasons why this is important in terms of performance are given in this paper
1254 here which you should actually read:
1255
1256 A. Stamatakis, M. Ott: "Load Balance in the Phylogenetic Likelihood Kernel". Proceedings of ICPP 2009, accepted for publication, Vienna, Austria, September 2009
1257
1258 The width > 0 check is for checking if under the cyclic data distribution of per-partition sites to threads this thread does indeed have a site
1259 of the current partition.
1260
1261 */
1262
1263 if(tr->td[0].executeModel[model] && width > 0)
1264 {
1265 int
1266 #if (defined(__SSE3) || defined(__AVX))
1267 rateHet = (int)discreteRateCategories(tr->rateHetModel),
1268 #endif
1269 categories,
1270 ascWidth = pr->partitionData[model]->states,
1271
1272 /* get the number of states in the partition, e.g.: 4 = DNA, 20 = Protein */
1273
1274 states = pr->partitionData[model]->states,
1275 *ex1 = NULL,
1276 *ex2 = NULL,
1277 *ex1_asc = NULL,
1278 *ex2_asc = NULL;
1279
1280 double
1281 *rateCategories = (double*)NULL,
1282 z,
1283 partitionLikelihood = 0.0,
1284 *x1_start = NULL,
1285 *x2_start = NULL,
1286 *diagptable = NULL,
1287 *x1_start_asc = NULL,
1288 *x2_start_asc = NULL;
1289
1290 #if (defined(__SSE3) || defined(__AVX))
1291 double
1292 *x1_gapColumn = (double*)NULL,
1293 *x2_gapColumn = (double*)NULL;
1294 #endif
1295
1296 #if (defined(__SSE3) || defined(__AVX))
1297 unsigned int
1298 *x1_gap = (unsigned int*)NULL,
1299 *x2_gap = (unsigned int*)NULL;
1300 #endif
1301
1302 unsigned char
1303 *tip = (unsigned char*)NULL;
1304
1305 /*
1306 figure out if we are using the CAT or GAMMA model of rate heterogeneity
1307 and set pointers to the rate heterogeneity rate arrays and also set the
1308 number of distinct rate categories appropriately.
1309
1310 Under GAMMA this is constant and hard-coded as 4, weheras under CAT
1311 the number of site-wise rate categories can vary in the course of computations
1312 up to a user defined maximum value of site categories (default: 25)
1313 */
1314
1315 if(tr->rateHetModel == PLL_CAT)
1316 {
1317 rateCategories = pr->partitionData[model]->perSiteRates;
1318 categories = pr->partitionData[model]->numberOfCategories;
1319 }
1320 else /* GAMMA */
1321 {
1322 rateCategories = pr->partitionData[model]->gammaRates;
1323 categories = 4;
1324 }
1325
1326 /* set this pointer to the memory area where space has been reserved a priori for storing the
1327 P matrix at the root */
1328
1329 diagptable = pr->partitionData[model]->left;
1330
1331 /* figure out if we need to address tip vectors (a char array that indexes into a precomputed tip likelihood
1332 value array) or if we need to address inner vectors */
1333
1334 /* either node p or node q is a tip */
1335
1336 if(isTip(pNumber, tr->mxtips) || isTip(qNumber, tr->mxtips))
1337 {
1338 /* q is a tip */
1339
1340 if(isTip(qNumber, tr->mxtips))
1341 {
1342 /* get the start address of the inner likelihood vector x2 for partition model,
1343 note that inner nodes are enumerated/indexed starting at 0 to save allocating some
1344 space for additional pointers */
1345
1346 x2_start = pr->partitionData[model]->xVector[p_slot];
1347
1348 /* get the corresponding tip vector */
1349
1350 tip = pr->partitionData[model]->yVector[qNumber];
1351
1352 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1353 if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
1354 #else
1355 if (pr->partitionData[model]->ascBias)
1356 #endif
1357 {
1358 x2_start_asc = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
1359 ex2_asc = &pr->partitionData[model]->ascExpVector[(pNumber - tr->mxtips - 1) * ascWidth];
1360 }
1361
1362
1363 /* memory saving stuff, let's deal with this later or ask Fernando ;-) */
1364
1365 #if (defined(__SSE3) || defined(__AVX))
1366 if(tr->saveMemory)
1367 {
1368 x2_gap = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
1369 x2_gapColumn = &(pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet]);
1370 }
1371 #endif
1372 /* per site likelihood scaling */
1373
1374 if(!fastScaling)
1375 ex2 = pr->partitionData[model]->expVector[p_slot];
1376 }
1377 else
1378 {
1379 /* p is a tip, same as above */
1380
1381 x2_start = pr->partitionData[model]->xVector[q_slot];
1382 tip = pr->partitionData[model]->yVector[pNumber];
1383
1384
1385 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1386 if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
1387 #else
1388 if (pr->partitionData[model]->ascBias)
1389 #endif
1390 {
1391 x2_start_asc = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
1392 ex2_asc = &pr->partitionData[model]->ascExpVector[(qNumber - tr->mxtips - 1) * ascWidth];
1393 }
1394
1395 #if (defined(__SSE3) || defined(__AVX))
1396 if(tr->saveMemory)
1397 {
1398 x2_gap = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
1399 x2_gapColumn = &(pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet]);
1400 }
1401 #endif
1402
1403 /* per site likelihood scaling */
1404
1405 if(!fastScaling)
1406 ex2 = pr->partitionData[model]->expVector[q_slot];
1407 }
1408 }
1409 else
1410 {
1411
1412 assert(p_slot != q_slot);
1413 /* neither p nor q are tips, hence we need to get the addresses of two inner vectors */
1414
1415 x1_start = pr->partitionData[model]->xVector[p_slot];
1416 x2_start = pr->partitionData[model]->xVector[q_slot];
1417
1418 /* memory saving option */
1419
1420 #if (defined(__SSE3) || defined(__AVX))
1421 if(tr->saveMemory)
1422 {
1423 x1_gap = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
1424 x2_gap = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
1425 x1_gapColumn = &pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet];
1426 x2_gapColumn = &pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet];
1427 }
1428 #endif
1429
1430 /* per site likelihood scaling */
1431
1432 if(!fastScaling)
1433 {
1434 ex1 = pr->partitionData[model]->expVector[p_slot];
1435 ex2 = pr->partitionData[model]->expVector[q_slot];
1436 }
1437
1438 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1439 if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
1440 #else
1441 if (pr->partitionData[model]->ascBias)
1442 #endif
1443 {
1444 x1_start_asc = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
1445 x2_start_asc = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
1446
1447 ex1_asc = &pr->partitionData[model]->ascExpVector[(pNumber - tr->mxtips - 1) * ascWidth];
1448 ex2_asc = &pr->partitionData[model]->ascExpVector[(qNumber - tr->mxtips - 1) * ascWidth];
1449 }
1450
1451
1452
1453 }
1454
1455
1456 /* if we are using a per-partition branch length estimate, the branch has an index, otherwise, for a joint branch length
1457 estimate over all partitions we just use the branch length value with index 0 */
1458
1459 if(pr->perGeneBranchLengths)
1460 z = pz[model];
1461 else
1462 z = pz[0];
1463
1464 /* calc P-Matrix at root for branch z connecting nodes p and q */
1465
1466 if(pr->partitionData[model]->dataType == PLL_AA_DATA && (pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X))
1467 calcDiagptableFlex_LG4(z, 4, pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN_LG4, diagptable, 20);
1468 else
1469 calcDiagptable(z, states, categories, rateCategories, pr->partitionData[model]->EIGN, diagptable);
1470
1471 #if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
1472
1473 /* generic slow functions, memory saving option is not implemented for these */
1474
1475 assert(!tr->saveMemory);
1476
1477 /* decide wheter CAT or GAMMA is used and compute log like */
1478 if(tr->rateHetModel == PLL_CAT)
1479 partitionLikelihood = evaluateCAT_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1480 x1_start, x2_start, pr->partitionData[model]->tipVector,
1481 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, getPerSiteLikelihoods);
1482 else
1483 partitionLikelihood = evaluateGAMMA_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1484 x1_start, x2_start, pr->partitionData[model]->tipVector,
1485 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, getPerSiteLikelihoods);
1486 #else
1487
1488 /* if we want to compute the per-site likelihoods, we use the generic evaluate function implementations
1489 for this, because the slowdown is not that dramatic */
1490
1491 if(getPerSiteLikelihoods)
1492 {
1493 #ifdef __MIC_NATIVE
1494 // not supported on MIC!
1495 assert(0 && "Per-site LH calculations is not implemented on Intel MIC");
1496 #else
1497 if(tr->rateHetModel == PLL_CAT)
1498 {
1499 if(tr->saveMemory)
1500 partitionLikelihood = evaluateCAT_FLEX_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1501 x1_start, x2_start, pr->partitionData[model]->tipVector,
1502 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE,
1503 x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1504 else
1505 partitionLikelihood = evaluateCAT_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1506 x1_start, x2_start, pr->partitionData[model]->tipVector,
1507 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE);
1508 }
1509 else
1510 {
1511 if(tr->saveMemory)
1512 partitionLikelihood = evaluateGAMMA_FLEX_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1513 x1_start, x2_start, pr->partitionData[model]->tipVector,
1514 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE,
1515 x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1516 else
1517 partitionLikelihood = evaluateGAMMA_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1518 x1_start, x2_start, pr->partitionData[model]->tipVector,
1519 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE);
1520 }
1521 #endif
1522 }
1523 else
1524 {
1525 /* for the optimized functions we have a dedicated, optimized function implementation
1526 for each rate heterogeneity and data type combination, we switch over the number of states
1527 and the rate heterogeneity model */
1528
1529 switch(states)
1530 {
1531 case 2: /* binary */
1532 assert (!tr->saveMemory);
1533 if (tr->rateHetModel == PLL_CAT)
1534 {
1535 partitionLikelihood = evaluateGTRCAT_BINARY(ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1536 x1_start, x2_start, pr->partitionData[model]->tipVector,
1537 tip, width, diagptable, fastScaling);
1538 }
1539 else
1540 {
1541 partitionLikelihood = evaluateGTRGAMMA_BINARY(ex1, ex2, pr->partitionData[model]->wgt,
1542 x1_start, x2_start, pr->partitionData[model]->tipVector,
1543 tip, width, diagptable, fastScaling);
1544 }
1545 break;
1546 case 4: /* DNA */
1547 {
1548
1549 #ifdef __MIC_NATIVE
1550
1551 /* CAT & memory saving are not supported on MIC */
1552
1553 assert(!tr->saveMemory);
1554 assert(tr->rateHetModel == PLL_GAMMA);
1555
1556 partitionLikelihood = evaluateGTRGAMMA_MIC(ex1, ex2, pr->partitionData[model]->wgt,
1557 x1_start, x2_start, pr->partitionData[model]->tipVector,
1558 tip, width, diagptable, fastScaling);
1559 #else
1560 if(tr->rateHetModel == PLL_CAT)
1561 {
1562 if(tr->saveMemory)
1563 partitionLikelihood = evaluateGTRCAT_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1564 x1_start, x2_start, pr->partitionData[model]->tipVector,
1565 tip, width, diagptable, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1566 else
1567 partitionLikelihood = evaluateGTRCAT(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1568 x1_start, x2_start, pr->partitionData[model]->tipVector,
1569 tip, width, diagptable);
1570 }
1571 else
1572 {
1573 if(tr->saveMemory)
1574 partitionLikelihood = evaluateGTRGAMMA_GAPPED_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1575 x1_start, x2_start, pr->partitionData[model]->tipVector,
1576 tip, width, diagptable,
1577 x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1578 else
1579 partitionLikelihood = evaluateGTRGAMMA(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1580 x1_start, x2_start, pr->partitionData[model]->tipVector,
1581 tip, width, diagptable);
1582 }
1583 #endif
1584 }
1585 break;
1586 case 20: /* proteins */
1587 {
1588
1589 #ifdef __MIC_NATIVE
1590
1591 /* CAT & memory saving are not supported on MIC */
1592
1593 assert(!tr->saveMemory);
1594 assert(tr->rateHetModel == PLL_GAMMA);
1595
1596 if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
1597 partitionLikelihood = evaluateGTRGAMMAPROT_LG4_MIC(pr->partitionData[model]->wgt,
1598 x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
1599 tip, width, diagptable, pr->partitionData[model]->lg4x_weights);
1600 else
1601 partitionLikelihood = evaluateGTRGAMMAPROT_MIC(ex1, ex2, pr->partitionData[model]->wgt,
1602 x1_start, x2_start, pr->partitionData[model]->tipVector,
1603 tip, width, diagptable, fastScaling);
1604
1605 // printf("tip: %p, width: %d, lh: %f\n", tip, width, partitionLikelihood);
1606 // int g;
1607 // if (x1_start)
1608 // for (g = 0; g < 20; ++g)
1609 // printf("%f \t", x1_start[g]);
1610 // printf("\n");
1611 // if (x2_start)
1612 // for (g = 0; g < 20; ++g)
1613 // printf("%f \t", x2_start[g]);
1614 #else
1615
1616 if(tr->rateHetModel == PLL_CAT)
1617 {
1618 if(tr->saveMemory)
1619 partitionLikelihood = evaluateGTRCATPROT_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1620 x1_start, x2_start, pr->partitionData[model]->tipVector,
1621 tip, width, diagptable, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1622 else
1623 partitionLikelihood = evaluateGTRCATPROT(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
1624 x1_start, x2_start, pr->partitionData[model]->tipVector,
1625 tip, width, diagptable);
1626 }
1627 else
1628 {
1629 if(tr->saveMemory)
1630 partitionLikelihood = evaluateGTRGAMMAPROT_GAPPED_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1631 x1_start, x2_start, pr->partitionData[model]->tipVector,
1632 tip, width, diagptable,
1633 x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
1634 else
1635 {
1636 if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
1637 partitionLikelihood = evaluateGTRGAMMAPROT_LG4((int *)NULL, (int *)NULL, pr->partitionData[model]->wgt,
1638 x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
1639 tip, width, diagptable, PLL_TRUE, pr->partitionData[model]->lg4x_weights);
1640 else
1641 partitionLikelihood = evaluateGTRGAMMAPROT(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
1642 x1_start, x2_start, pr->partitionData[model]->tipVector,
1643 tip, width, diagptable);
1644 }
1645 }
1646 #endif
1647 }
1648 break;
1649 default:
1650 assert(0);
1651 }
1652 }
1653 #endif
1654
1655 /* check that there was no major numerical screw-up, the log likelihood should be < 0.0 always */
1656
1657 assert(partitionLikelihood < 0.0);
1658
1659 /* now here is a nasty part, for each partition and each node we maintain an integer counter to count how often
1660 how many entries per node were scaled by a constant factor. Here we use this information generated during Felsenstein's
1661 pruning algorithm by the newview() functions to undo the preceding scaling multiplications at the root, for mathematical details
1662 you should actually read:
1663
1664 A. Stamatakis: "Orchestrating the Phylogenetic Likelihood Function on Emerging Parallel Architectures".
1665 In B. Schmidt, editor, Bioinformatics: High Performance Parallel Computer Architectures, 85-115, CRC Press, Taylor & Francis, 2010.
1666
1667 There's a copy of this book in my office
1668 */
1669
1670 if(fastScaling)
1671 partitionLikelihood += (pr->partitionData[model]->globalScaler[pNumber] + pr->partitionData[model]->globalScaler[qNumber]) * log(PLL_MINLIKELIHOOD);
1672
1673 /* now we have the correct log likelihood for the current partition after undoing scaling multiplications */
1674
1675 /* finally, we also store the per partition log likelihood which is important for optimizing the alpha parameter
1676 of this partition for example */
1677
1678 /* asc bias stuff */
1679
1680 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1681 if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
1682 #else
1683 if (pr->partitionData[model]->ascBias)
1684 #endif
1685 {
1686 size_t
1687 i;
1688
1689 int
1690 w = 0;
1691
1692 double
1693 correction;
1694
1695 switch(tr->rateHetModel)
1696 {
1697 case PLL_CAT:
1698 {
1699 double
1700 rates = 1.0;
1701
1702 //need to re-calculate P-matrix for the correction here assuming a rate of 1.0
1703 calcDiagptable(z, states, 1, &rates, pr->partitionData[model]->EIGN, diagptable);
1704
1705
1706 correction = evaluateCatAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector,
1707 tip, ascWidth, diagptable, ascWidth);
1708 }
1709 break;
1710 case PLL_GAMMA:
1711 correction = evaluateGammaAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector,
1712 tip, ascWidth, diagptable, ascWidth);
1713 break;
1714 default:
1715 assert(0);
1716 }
1717
1718
1719
1720 for(i = (size_t)pr->partitionData[model]->lower; i < (size_t)pr->partitionData[model]->upper; i++)
1721 w += tr->aliaswgt[i];
1722
1723 partitionLikelihood = partitionLikelihood - (double)w * log(1.0 - correction);
1724
1725 }
1726 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1727 if(!(pr->partitionData[model]->ascBias && tr->threadID == 0))
1728 {
1729 #endif
1730 if(partitionLikelihood >= 0.0)
1731 {
1732 printf("positive log like: %f for partition %d\n", partitionLikelihood, model);
1733 assert(0);
1734 }
1735 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1736 }
1737 #endif
1738
1739
1740 pr->partitionData[model]->partitionLH = partitionLikelihood;
1741 }
1742 else
1743 {
1744 /* if the current thread does not have a single site of this partition
1745 it is important to set the per partition log like to 0.0 because
1746 of the reduction operation that will take place later-on.
1747 That is, the values of tr->perPartitionLH across all threads
1748 need to be in a consistent state, always !
1749 */
1750
1751 if(width == 0)
1752 pr->partitionData[model]->partitionLH = 0.0;
1753 }
1754 }
1755
1756
1757 #ifdef DEBUG_PERSITE_LNL
1758 /* per persite-stuff */
1759 {
1760 int model = 0;
1761 for(model = 0; model < pr->numberOfPartitions ; ++model)
1762 {
1763 int j= 0;
1764 pInfo *partition = pr->partitionData[model];
1765 for(j = 0; j < partition->width; ++j)
1766 printf("[%d] lnl[%d]=%f\n", tr->threadID, j, partition->perSiteLikelihoods[j]);
1767
1768 }
1769 }
1770
1771 #endif
1772 }
1773
1774
1775
1776 /** @ingroup evaluateLikelihoodGroup
1777 @brief Evaluate the log likelihood of the tree topology
1778
1779 Evaluate the log likelihood of the tree topology of instance \a tr by
1780 assuming a virtual root between nodes \a p and \a p->back. If
1781 \a fullTraversal is set to \b PLL_TRUE then the log likelihood vectors for
1782 each node are recomputed from scratch.
1783
1784 @param tr
1785 PLL instance
1786
1787 @param pr
1788 List of partitions
1789
1790 @param p
1791 Specifies the virtual root, which is assumed to be a (virtual node) connecting \a p and \a p->back
1792
1793 @param fullTraversal
1794 If set to \b PLL_TRUE, then the likelihood vectors at all nodes are recomputed, otherwise only the
1795 necessary vectors (those that are not oriented in the right direction) are recomputed.
1796
1797 @param getPerSiteLikelihoods
1798 Also compute and store (in \a tr->lhs) the log likelihood of each site of the (compressed) alignment
1799
1800 @note
1801 If \a getPerSiteLikelihoods is set to \b PLL_TRUE, then make sure that \a tr->fastScaling is set to
1802 \b PLL_FALSE, otherwise an assertion will fail.
1803 */
pllEvaluateLikelihood(pllInstance * tr,partitionList * pr,nodeptr p,pllBoolean fullTraversal,pllBoolean getPerSiteLikelihoods)1804 void pllEvaluateLikelihood (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean fullTraversal, pllBoolean getPerSiteLikelihoods)
1805 {
1806 /* now this may be the entry point of the library to compute
1807 the log like at a branch defined by p and p->back == q */
1808
1809 volatile double
1810 result = 0.0;
1811
1812 nodeptr
1813 q = p->back;
1814
1815
1816 pllBoolean
1817 p_recom = PLL_FALSE, /* if one of was missing, we will need to force recomputation */
1818 q_recom = PLL_FALSE;
1819
1820 int
1821 i,
1822 model,
1823 numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions : 1;
1824
1825 /* if evaluate shall return the per-site log likelihoods
1826 fastScaling needs to be disabled, otherwise this will
1827 not work */
1828
1829 if(getPerSiteLikelihoods)
1830 assert(!(tr->fastScaling));
1831
1832 /* set the first entry of the traversal descriptor to contain the indices
1833 of nodes p and q */
1834
1835 tr->td[0].ti[0].pNumber = p->number;
1836 tr->td[0].ti[0].qNumber = q->number;
1837
1838 /* copy the branch lengths of the tree into the first entry of the traversal descriptor.
1839 if -M is not used tr->numBranches must be 1 */
1840
1841 for(i = 0; i < numBranches; i++)
1842 tr->td[0].ti[0].qz[i] = q->z[i];
1843
1844 /* recom part */
1845 if(tr->useRecom)
1846 {
1847 int slot = -1;
1848 if(!isTip(q->number, tr->mxtips))
1849 {
1850 q_recom = getxVector(tr->rvec, q->number, &slot, tr->mxtips);
1851 tr->td[0].ti[0].slot_q = slot;
1852 }
1853 if(!isTip(p->number, tr->mxtips))
1854 {
1855 p_recom = getxVector(tr->rvec, p->number, &slot, tr->mxtips);
1856 tr->td[0].ti[0].slot_p = slot;
1857 }
1858 if(!isTip(p->number, tr->mxtips) && !isTip(q->number, tr->mxtips))
1859 assert(tr->td[0].ti[0].slot_q != tr->td[0].ti[0].slot_p);
1860 }
1861
1862
1863 /* now compute how many conditionals must be re-computed/re-oriented by newview
1864 to be able to calculate the likelihood at the root defined by p and q.
1865 */
1866
1867 /* one entry in the traversal descriptor is already used, hence set the tarversal length counter to 1 */
1868 tr->td[0].count = 1;
1869
1870 if(fullTraversal)
1871 {
1872 assert(isTip(q->back->number, tr->mxtips));
1873 computeTraversal(tr, q, PLL_FALSE, numBranches);
1874 }
1875 else
1876 {
1877 if(p_recom || needsRecomp(tr->useRecom, tr->rvec, p, tr->mxtips))
1878 computeTraversal(tr, p, PLL_TRUE, numBranches);
1879
1880 if(q_recom || needsRecomp(tr->useRecom, tr->rvec, q, tr->mxtips))
1881 computeTraversal(tr, q, PLL_TRUE, numBranches);
1882 }
1883
1884
1885 /* now we copy this partition execute mask into the traversal descriptor which must come from the
1886 calling program, the logic of this should not form part of the library */
1887
1888 storeExecuteMaskInTraversalDescriptor(tr, pr);
1889
1890 /* also store in the traversal descriptor that something has changed i.e., in the parallel case that the
1891 traversal descriptor list of nodes needs to be broadcast once again */
1892
1893 tr->td[0].traversalHasChanged = PLL_TRUE;
1894 #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
1895
1896 /* now here we enter the fork-join region for Pthreads */
1897
1898
1899 /* start the parallel region and tell all threads to compute the log likelihood for
1900 their fraction of the data. This call is implemented in the case switch of execFunction in axml.c
1901 */
1902 if(getPerSiteLikelihoods)
1903 {
1904 memset(tr->lhs, 0, sizeof(double) * tr->originalCrunchedLength);
1905 pllMasterBarrier(tr, pr, PLL_THREAD_EVALUATE_PER_SITE_LIKES);
1906 }
1907 else
1908 pllMasterBarrier (tr, pr, PLL_THREAD_EVALUATE);
1909
1910 /* and now here we explicitly do the reduction operation , that is add over the
1911 per-thread and per-partition log likelihoods to obtain the overall log like
1912 over all sites and partitions */
1913
1914
1915 /*
1916 for unpartitioned data that's easy, we just sum over the log likes computed
1917 by each thread, thread 0 stores his results in reductionBuffer[0] thread 1 in
1918 reductionBuffer[1] and so on
1919 */
1920
1921 /* This reduction for the partitioned case is more complicated because each thread
1922 needs to store the partial log like of each partition and we then need to collect
1923 and add everything */
1924
1925 #else
1926 /* and here is just the sequential case, we directly call pllEvaluateIterative() above
1927 without having to tell the threads/processes that they need to compute this function now */
1928
1929 pllEvaluateIterative(tr, pr, getPerSiteLikelihoods); //PLL_TRUE
1930
1931 /*
1932 if we want to obtain per-site rates they have initially been stored
1933 in arrays that are associated to the partition, now we
1934 copy them into the vector tr->lhs[].
1935 We may also chose that the user needs to rpovide an array, but this can be decided later-on.
1936 */
1937
1938 if(getPerSiteLikelihoods) //PLL_TRUE
1939 {
1940 for(model = 0; model < pr->numberOfPartitions; model++)
1941 memcpy(&(tr->lhs[pr->partitionData[model]->lower]), pr->partitionData[model]->perSiteLikelihoods, pr->partitionData[model]->width * sizeof(double));
1942 }
1943
1944 #endif
1945
1946 for(model = 0; model < pr->numberOfPartitions; model++)
1947 result += pr->partitionData[model]->partitionLH;
1948
1949 /* set the tree data structure likelihood value to the total likelihood */
1950
1951 tr->likelihood = result;
1952
1953 /* the code below is mainly for testing if the per-site log
1954 likelihoods we have stored in tr->lhs yield the same
1955 likelihood as the likelihood we computed.
1956 For numerical reasons we need to make a dirt PLL_ABS(difference) < epsilon
1957 comparison */
1958
1959 if(getPerSiteLikelihoods) //PLL_TRUE
1960 {
1961 double
1962 likelihood = 0;
1963 int i;
1964
1965 /* note that in tr->lhs, we just store the likelihood of
1966 one representative of a potentially compressed pattern,
1967 hence, we need to multiply the elemnts with the pattern
1968 weight vector */
1969
1970
1971 for(i = 0; i < tr->originalCrunchedLength; i++)
1972 {
1973 // printf("lhs[%d]=%f * %d\n", i, tr->lhs[i], tr->aliaswgt[i]);
1974 likelihood += (tr->lhs[i] * tr->aliaswgt[i] );
1975 }
1976
1977 if( PLL_ABS(tr->likelihood - likelihood) > 0.00001)
1978 {
1979 // printf("likelihood was %f\t summed/weighted per-site-lnl was %f\n", tr->likelihood, likelihood);
1980 }
1981
1982 assert(PLL_ABS(tr->likelihood - likelihood) < 0.00001);
1983 }
1984
1985
1986 if(tr->useRecom)
1987 {
1988 unpinNode(tr->rvec, p->number, tr->mxtips);
1989 unpinNode(tr->rvec, q->number, tr->mxtips);
1990 }
1991
1992 /* do some bookkeeping to have traversalHasChanged in a consistent state */
1993
1994 tr->td[0].traversalHasChanged = PLL_FALSE;
1995 }
1996
1997
perSiteLogLikelihoods(pllInstance * tr,partitionList * pr,double * logLikelihoods)1998 void perSiteLogLikelihoods(pllInstance *tr, partitionList *pr, double *logLikelihoods)
1999 {
2000 #if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
2001 double
2002 //likelihood,
2003 accumulatedPerSiteLikelihood = 0.0;
2004
2005 size_t
2006 localCount,
2007 i,
2008 //globalCounter,
2009 lower,
2010 upper;
2011 int model;
2012 #endif
2013 /* compute the likelihood of the tree with the standard function to:
2014 1. obtain the current score for error checking
2015 2. store a full tree traversal in the traversal descriptor that
2016 will then be used for calculating per-site log likelihoods
2017 for each site individually and independently */
2018
2019 pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
2020
2021 //likelihood = tr->likelihood;
2022
2023 /* now compute per-site log likelihoods using the respective functions */
2024
2025 #if (defined( _USE_PTHREADS ) || defined(_FINE_GRAIN_MPI))
2026 /* here we need a barrier to invoke a parallel region that calls
2027 function
2028 perSiteLogLikelihoodsPthreads(tree *tr, partitionList *pr, double *lhs, int n, int tid)
2029 defined above and subsequently collects the per-site log likelihoods
2030 computed by the threads and stored in local per-thread memory
2031 and stores them in buffer tr->lhs.
2032 This corresponds to a gather operation in MPI.
2033 */
2034
2035 pllMasterBarrier (tr, pr, PLL_THREAD_PER_SITE_LIKELIHOODS);
2036
2037 /*
2038 when the parallel region has terminated, the per-site log likelihoods
2039 are stored in array tr->lhs of the master thread which we copy to the result buffer
2040 */
2041
2042 memcpy(logLikelihoods, tr->lhs, sizeof(double) * tr->originalCrunchedLength);
2043
2044
2045 #else
2046
2047 /* sequential case: just loop over all partitions and compute per site log likelihoods */
2048
2049 for(model = 0; model < pr->numberOfPartitions; model++)
2050 {
2051 lower = pr->partitionData[model]->lower;
2052 upper = pr->partitionData[model]->upper;
2053
2054 for(i = lower, localCount = 0; i < upper; i++, localCount++)
2055 {
2056 double
2057 l;
2058
2059 /*
2060 we need to switch of rate heterogeneity implementations here.
2061 when we have PSR we actually need to provide the per-site rate
2062 to the function evaluatePartialGeneric() that computes the
2063 per-site log likelihood.
2064 Under GAMMA, the rate will just be ignored, here we just set it to 1.0
2065 */
2066
2067 switch(tr->rateHetModel)
2068 {
2069 case PLL_CAT:
2070 l = evaluatePartialGeneric (tr, pr, i, pr->partitionData[model]->perSiteRates[pr->partitionData[model]->rateCategory[localCount]], model);
2071 break;
2072 case PLL_GAMMA:
2073 l = evaluatePartialGeneric (tr, pr, i, 1.0, model);
2074 break;
2075 default:
2076 assert(0);
2077 }
2078
2079 /* store value in result array and add the likelihood of this site to the overall likelihood */
2080
2081 logLikelihoods[i] = l;
2082 accumulatedPerSiteLikelihood += l;
2083 }
2084 }
2085
2086
2087 /* error checking. We need a dirt PLL_ABS() < epsilon here, because the implementations
2088 (standard versus per-site) are pretty different and hence slight numerical
2089 deviations are expected */
2090
2091 assert(PLL_ABS(tr->likelihood - accumulatedPerSiteLikelihood) < 0.00001);
2092
2093 #endif
2094
2095
2096
2097 }
2098
2099 #if (defined(__SSE3) || defined(__AVX))
evaluateGTRCAT_BINARY(int * ex1,int * ex2,int * cptr,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,const pllBoolean fastScaling)2100 static double evaluateGTRCAT_BINARY (int *ex1, int *ex2, int *cptr, int *wptr,
2101 double *x1_start, double *x2_start, double *tipVector,
2102 unsigned char *tipX1, int n, double *diagptable_start, const pllBoolean fastScaling)
2103 {
2104 double sum = 0.0, term;
2105 int i;
2106 #if (!defined(__SSE3) && !defined(__AVX))
2107 int j;
2108 #endif
2109 double *diagptable, *x1, *x2;
2110
2111 if(tipX1)
2112 {
2113 for (i = 0; i < n; i++)
2114 {
2115 #if (defined(__SSE3) || defined(__AVX))
2116 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2117 #endif
2118 x1 = &(tipVector[2 * tipX1[i]]);
2119 x2 = &(x2_start[2 * i]);
2120
2121 diagptable = &(diagptable_start[2 * cptr[i]]);
2122
2123 #if (defined(__SSE3) || defined(__AVX))
2124 _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
2125
2126 if(fastScaling)
2127 term = log(fabs(t[0] + t[1]));
2128 else
2129 term = log(fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2130 #else
2131 for(j = 0, term = 0.0; j < 2; j++)
2132 term += x1[j] * x2[j] * diagptable[j];
2133
2134 if(fastScaling)
2135 term = log(fabs(term));
2136 else
2137 term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2138 #endif
2139
2140 sum += wptr[i] * term;
2141 }
2142 }
2143 else
2144 {
2145 for (i = 0; i < n; i++)
2146 {
2147 #if (defined(__SSE3) || defined(__AVX))
2148 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2149 #endif
2150 x1 = &x1_start[2 * i];
2151 x2 = &x2_start[2 * i];
2152
2153 diagptable = &diagptable_start[2 * cptr[i]];
2154 #if (defined(__SSE3) || defined(__AVX))
2155 _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
2156
2157 if(fastScaling)
2158 term = log(fabs(t[0] + t[1]));
2159 else
2160 term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2161 #else
2162 for(j = 0, term = 0.0; j < 2; j++)
2163 term += x1[j] * x2[j] * diagptable[j];
2164
2165 if(fastScaling)
2166 term = log(fabs(term));
2167 else
2168 term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2169 #endif
2170
2171 sum += wptr[i] * term;
2172 }
2173 }
2174
2175 return sum;
2176 }
2177
2178
evaluateGTRGAMMA_BINARY(int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable,const pllBoolean fastScaling)2179 static double evaluateGTRGAMMA_BINARY(int *ex1, int *ex2, int *wptr,
2180 double *x1_start, double *x2_start,
2181 double *tipVector,
2182 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
2183 {
2184 double sum = 0.0, term;
2185 int i, j;
2186 #if (!defined(__SSE3) && !defined(__AVX))
2187 int k;
2188 #endif
2189 double *x1, *x2;
2190
2191 if(tipX1)
2192 {
2193 for (i = 0; i < n; i++)
2194 {
2195 #if (defined(__SSE3) || defined(__AVX))
2196 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2197 __m128d termv, x1v, x2v, dv;
2198 #endif
2199 x1 = &(tipVector[2 * tipX1[i]]);
2200 x2 = &x2_start[8 * i];
2201 #if (defined(__SSE3) || defined(__AVX))
2202 termv = _mm_set1_pd(0.0);
2203
2204 for(j = 0; j < 4; j++)
2205 {
2206 x1v = _mm_load_pd(&x1[0]);
2207 x2v = _mm_load_pd(&x2[j * 2]);
2208 dv = _mm_load_pd(&diagptable[j * 2]);
2209
2210 x1v = _mm_mul_pd(x1v, x2v);
2211 x1v = _mm_mul_pd(x1v, dv);
2212
2213 termv = _mm_add_pd(termv, x1v);
2214 }
2215
2216 _mm_store_pd(t, termv);
2217
2218 if(fastScaling)
2219 term = log(0.25 * (fabs(t[0] + t[1])));
2220 else
2221 term = log(0.25 * (fabs(t[0] + t[1]))) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2222 #else
2223 for(j = 0, term = 0.0; j < 4; j++)
2224 for(k = 0; k < 2; k++)
2225 term += x1[k] * x2[j * 2 + k] * diagptable[j * 2 + k];
2226
2227 if(fastScaling)
2228 term = log(0.25 * fabs(term));
2229 else
2230 term = log(0.25 * fabs(term)) + ex2[i] * log(PLL_MINLIKELIHOOD);
2231 #endif
2232
2233 sum += wptr[i] * term;
2234 }
2235 }
2236 else
2237 {
2238 for (i = 0; i < n; i++)
2239 {
2240 #if (defined(__SSE3) || defined(__AVX))
2241 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2242 __m128d termv, x1v, x2v, dv;
2243 #endif
2244 x1 = &x1_start[8 * i];
2245 x2 = &x2_start[8 * i];
2246
2247 #if (defined(__SSE3) || defined(__AVX))
2248 termv = _mm_set1_pd(0.0);
2249
2250 for(j = 0; j < 4; j++)
2251 {
2252 x1v = _mm_load_pd(&x1[j * 2]);
2253 x2v = _mm_load_pd(&x2[j * 2]);
2254 dv = _mm_load_pd(&diagptable[j * 2]);
2255
2256 x1v = _mm_mul_pd(x1v, x2v);
2257 x1v = _mm_mul_pd(x1v, dv);
2258
2259 termv = _mm_add_pd(termv, x1v);
2260 }
2261
2262 _mm_store_pd(t, termv);
2263
2264
2265 if(fastScaling)
2266 term = log(0.25 * (fabs(t[0] + t[1])));
2267 else
2268 term = log(0.25 * (fabs(t[0] + t[1]))) + ((ex1[i] +ex2[i]) * log(PLL_MINLIKELIHOOD));
2269 #else
2270 for(j = 0, term = 0.0; j < 4; j++)
2271 for(k = 0; k < 2; k++)
2272 term += x1[j * 2 + k] * x2[j * 2 + k] * diagptable[j * 2 + k];
2273
2274 if(fastScaling)
2275 term = log(0.25 * fabs(term));
2276 else
2277 term = log(0.25 * fabs(term)) + (ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD);
2278 #endif
2279
2280 sum += wptr[i] * term;
2281 }
2282 }
2283
2284 return sum;
2285 }
2286 #endif
2287
2288
2289
2290 /* below are the optimized function versions with geeky intrinsics */
2291
2292 /** @ingroup evaluateLikelihoodGroup
2293 @brief Evaluation of log likelihood of a tree under the GAMMA model of rate heterogeneity and LG4 model of evolution
2294
2295 This is the same as ::evaluateGAMMA_FLEX but for the LG4 model. It contains two implementations,
2296 one which is the generic, and one that is optimized with SSE3 instructions. The two implementations
2297 are separated by preprocessor macros.
2298 The difference from ::evaluateGAMMA_FLEX is that we have 4 different tipVectors computed from the 4 different
2299 Q matrix decompositions.
2300 Please check ::evaluateGAMMA_FLEX for more information and a description of the common
2301 input parameters.
2302 */
evaluateGTRGAMMAPROT_LG4(int * ex1,int * ex2,int * wptr,double * x1,double * x2,double * tipVector[4],unsigned char * tipX1,int n,double * diagptable,const pllBoolean fastScaling,double * lg4_weights)2303 static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr,
2304 double *x1, double *x2,
2305 double *tipVector[4],
2306 unsigned char *tipX1, int n, double *diagptable, const pllBoolean fastScaling,
2307 double * lg4_weights)
2308 {
2309 double sum = 0.0, term;
2310 int i, j, l;
2311 double *left, *right;
2312
2313 if(tipX1)
2314 {
2315 for (i = 0; i < n; i++)
2316 {
2317 #if (defined(__SSE3) || defined(__AVX))
2318 __m128d tv = _mm_setzero_pd();
2319
2320 for(j = 0, term = 0.0; j < 4; j++)
2321 {
2322 double *d = &diagptable[j * 20];
2323
2324 __m128d
2325 t = _mm_setzero_pd(),
2326 w = _mm_set1_pd(lg4_weights[j]);
2327
2328 left = &(tipVector[j][20 * tipX1[i]]);
2329 right = &(x2[80 * i + 20 * j]);
2330 for(l = 0; l < 20; l+=2)
2331 {
2332 __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2333 t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2334 }
2335 tv = _mm_add_pd(tv, _mm_mul_pd(t, w));
2336 }
2337
2338 tv = _mm_hadd_pd(tv, tv);
2339 _mm_storel_pd(&term, tv);
2340
2341
2342 #else
2343 for(j = 0, term = 0.0; j < 4; j++)
2344 {
2345 double t = 0.0;
2346
2347 left = &(tipVector[j][20 * tipX1[i]]);
2348 right = &(x2[80 * i + 20 * j]);
2349
2350 for(l = 0; l < 20; l++)
2351 t += left[l] * right[l] * diagptable[j * 20 + l];
2352
2353 term += lg4_weights[j] * t;
2354 }
2355 #endif
2356
2357 if(fastScaling)
2358 term = log(fabs(term));
2359 else
2360 term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2361
2362 sum += wptr[i] * term;
2363
2364 }
2365 }
2366 else
2367 {
2368 for (i = 0; i < n; i++)
2369 {
2370 #if (defined(__SSE3) || defined(__AVX))
2371 __m128d tv = _mm_setzero_pd();
2372
2373 for(j = 0, term = 0.0; j < 4; j++)
2374 {
2375 double *d = &diagptable[j * 20];
2376
2377 __m128d
2378 t = _mm_setzero_pd(),
2379 w = _mm_set1_pd(lg4_weights[j]);
2380
2381 left = &(x1[80 * i + 20 * j]);
2382 right = &(x2[80 * i + 20 * j]);
2383
2384 for(l = 0; l < 20; l+=2)
2385 {
2386 __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2387 t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2388 }
2389 tv = _mm_add_pd(tv, _mm_mul_pd(t, w));
2390 }
2391 tv = _mm_hadd_pd(tv, tv);
2392 _mm_storel_pd(&term, tv);
2393 #else
2394 for(j = 0, term = 0.0; j < 4; j++)
2395 {
2396 double t = 0.0;
2397
2398 left = &(x1[80 * i + 20 * j]);
2399 right = &(x2[80 * i + 20 * j]);
2400
2401 for(l = 0; l < 20; l++)
2402 t += left[l] * right[l] * diagptable[j * 20 + l];
2403
2404 term += lg4_weights[j] * t;
2405 }
2406 #endif
2407
2408 if(fastScaling)
2409 term = log(fabs(term));
2410 else
2411 term = log(fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
2412
2413 sum += wptr[i] * term;
2414 }
2415 }
2416
2417 return sum;
2418 }
2419
2420 #if (defined(__SSE3) || defined(__AVX))
2421 /** @ingroup evaluateLikelihoodGroup
2422 @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity
2423 and the memory saving technique (Optimized SSE3 version for AA data)
2424
2425 This is the SSE3 optimized version of ::evaluateGAMMA_FLEX_SAVE for evaluating the log
2426 likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2427 vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX_SAVE for more information and
2428 a description of the input parameters
2429 */
evaluateGTRGAMMAPROT_GAPPED_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)2430 static double evaluateGTRGAMMAPROT_GAPPED_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
2431 double *x1, double *x2,
2432 double *tipVector,
2433 unsigned char *tipX1, int n, double *diagptable,
2434 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
2435 {
2436 double sum = 0.0, term;
2437 int i, j, l;
2438 double
2439 *left,
2440 *right,
2441 *x1_ptr = x1,
2442 *x2_ptr = x2,
2443 *x1v,
2444 *x2v;
2445 __m128d tv;
2446
2447 if(tipX1)
2448 {
2449 for (i = 0; i < n; i++)
2450 {
2451 if(x2_gap[i / 32] & mask32[i % 32])
2452 x2v = x2_gapColumn;
2453 else
2454 {
2455 x2v = x2_ptr;
2456 x2_ptr += 80;
2457 }
2458
2459 //TUNG: Standard C does not allow declaration after executable statement
2460 tv = _mm_setzero_pd();
2461 //__m128d tv = _mm_setzero_pd();
2462 left = &(tipVector[20 * tipX1[i]]);
2463
2464 for(j = 0, term = 0.0; j < 4; j++)
2465 {
2466 double *d = &diagptable[j * 20];
2467 right = &(x2v[20 * j]);
2468 for(l = 0; l < 20; l+=2)
2469 {
2470 __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2471 tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2472 }
2473 }
2474
2475 tv = _mm_hadd_pd(tv, tv);
2476 _mm_storel_pd(&term, tv);
2477
2478
2479 if(!fastScaling)
2480 term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2481 else
2482 term = log(0.25 * fabs(term));
2483
2484 sum += wptr[i] * term;
2485 }
2486 }
2487 else
2488 {
2489 for (i = 0; i < n; i++)
2490 {
2491 if(x1_gap[i / 32] & mask32[i % 32])
2492 x1v = x1_gapColumn;
2493 else
2494 {
2495 x1v = x1_ptr;
2496 x1_ptr += 80;
2497 }
2498
2499 if(x2_gap[i / 32] & mask32[i % 32])
2500 x2v = x2_gapColumn;
2501 else
2502 {
2503 x2v = x2_ptr;
2504 x2_ptr += 80;
2505 }
2506
2507 //__m128d tv = _mm_setzero_pd();
2508 tv = _mm_setzero_pd();
2509
2510 for(j = 0, term = 0.0; j < 4; j++)
2511 {
2512 double *d = &diagptable[j * 20];
2513 left = &(x1v[20 * j]);
2514 right = &(x2v[20 * j]);
2515
2516 for(l = 0; l < 20; l+=2)
2517 {
2518 __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2519 tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2520 }
2521 }
2522 tv = _mm_hadd_pd(tv, tv);
2523 _mm_storel_pd(&term, tv);
2524
2525
2526 if(!fastScaling)
2527 term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2528 else
2529 term = log(0.25 * fabs(term));
2530
2531
2532 sum += wptr[i] * term;
2533 }
2534 }
2535
2536 return sum;
2537 }
2538
2539
2540
2541 /** @ingroup evaluateLikelihoodGroup
2542 @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity
2543 (Optimized SSE3 version for AA data)
2544
2545 This is the SSE3 optimized version of ::evaluateGAMMA_FLEX for evaluating the log
2546 likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2547 vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX for more information and
2548 a description of the common input parameters
2549 */
evaluateGTRGAMMAPROT(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable)2550 static double evaluateGTRGAMMAPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
2551 double *x1, double *x2,
2552 double *tipVector,
2553 unsigned char *tipX1, int n, double *diagptable)
2554 {
2555 double sum = 0.0, term;
2556 int i, j, l;
2557 double *left, *right;
2558
2559 if(tipX1)
2560 {
2561 for (i = 0; i < n; i++)
2562 {
2563
2564 __m128d tv = _mm_setzero_pd();
2565 left = &(tipVector[20 * tipX1[i]]);
2566
2567 for(j = 0, term = 0.0; j < 4; j++)
2568 {
2569 double *d = &diagptable[j * 20];
2570 right = &(x2[80 * i + 20 * j]);
2571 for(l = 0; l < 20; l+=2)
2572 {
2573 __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2574 tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2575 }
2576 }
2577 tv = _mm_hadd_pd(tv, tv);
2578 _mm_storel_pd(&term, tv);
2579
2580
2581 if(!fastScaling)
2582 term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2583 else
2584 term = log(0.25 * fabs(term));
2585
2586
2587 sum += wptr[i] * term;
2588 }
2589 }
2590 else
2591 {
2592 for (i = 0; i < n; i++)
2593 {
2594 __m128d tv = _mm_setzero_pd();
2595
2596 for(j = 0, term = 0.0; j < 4; j++)
2597 {
2598 double *d = &diagptable[j * 20];
2599 left = &(x1[80 * i + 20 * j]);
2600 right = &(x2[80 * i + 20 * j]);
2601
2602 for(l = 0; l < 20; l+=2)
2603 {
2604 __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
2605 tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
2606 }
2607 }
2608 tv = _mm_hadd_pd(tv, tv);
2609 _mm_storel_pd(&term, tv);
2610
2611
2612 if(!fastScaling)
2613 term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2614 else
2615 term = log(0.25 * fabs(term));
2616
2617
2618 sum += wptr[i] * term;
2619 }
2620 }
2621
2622 return sum;
2623 }
2624
2625
2626 /** @ingroup evaluateLikelihoodGroup
2627 @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity
2628 (Optimized SSE3 version for AA data)
2629
2630 This is the SSE3 optimized version of ::evaluateCAT_FLEX for evaluating the log
2631 likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2632 vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX for more information and
2633 a description of the common input parameters
2634 */
evaluateGTRCATPROT(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start)2635 static double evaluateGTRCATPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
2636 double *x1, double *x2, double *tipVector,
2637 unsigned char *tipX1, int n, double *diagptable_start)
2638 {
2639 double sum = 0.0, term;
2640 double *diagptable, *left, *right;
2641 int i, l;
2642 __m128d tv;
2643
2644 if(tipX1)
2645 {
2646 for (i = 0; i < n; i++)
2647 {
2648 left = &(tipVector[20 * tipX1[i]]);
2649 right = &(x2[20 * i]);
2650
2651 diagptable = &diagptable_start[20 * cptr[i]];
2652
2653 //TUNG: Standard C does not allow declaration after executable statement
2654 tv = _mm_setzero_pd();
2655 //__m128d tv = _mm_setzero_pd();
2656
2657 for(l = 0; l < 20; l+=2)
2658 {
2659 __m128d lv = _mm_load_pd(&left[l]);
2660 __m128d rv = _mm_load_pd(&right[l]);
2661 __m128d mul = _mm_mul_pd(lv, rv);
2662 __m128d dv = _mm_load_pd(&diagptable[l]);
2663
2664 tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));
2665 }
2666
2667 tv = _mm_hadd_pd(tv, tv);
2668 _mm_storel_pd(&term, tv);
2669
2670 if(!fastScaling)
2671 term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2672 else
2673 term = log(fabs(term));
2674
2675 sum += wptr[i] * term;
2676 }
2677 }
2678 else
2679 {
2680
2681 for (i = 0; i < n; i++)
2682 {
2683 left = &x1[20 * i];
2684 right = &x2[20 * i];
2685
2686 diagptable = &diagptable_start[20 * cptr[i]];
2687
2688 __m128d tv = _mm_setzero_pd();
2689
2690 for(l = 0; l < 20; l+=2)
2691 {
2692 __m128d lv = _mm_load_pd(&left[l]);
2693 __m128d rv = _mm_load_pd(&right[l]);
2694 __m128d mul = _mm_mul_pd(lv, rv);
2695 __m128d dv = _mm_load_pd(&diagptable[l]);
2696
2697 tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));
2698 }
2699
2700 tv = _mm_hadd_pd(tv, tv);
2701 _mm_storel_pd(&term, tv);
2702
2703 if(!fastScaling)
2704 term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2705 else
2706 term = log(fabs(term));
2707
2708 sum += wptr[i] * term;
2709 }
2710 }
2711
2712 return sum;
2713 }
2714
2715
2716 /** @ingroup evaluateLikelihoodGroup
2717 @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity with memory saving
2718 (Optimized SSE3 version for AA data)
2719
2720 This is the SSE3 optimized version of ::evaluateCAT_FLEX_SAVE for evaluating the log
2721 likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2722 vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX_SAVE for more information and
2723 a description of the common input parameters
2724 */
evaluateGTRCATPROT_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1,double * x2,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)2725 static double evaluateGTRCATPROT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
2726 double *x1, double *x2, double *tipVector,
2727 unsigned char *tipX1, int n, double *diagptable_start,
2728 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
2729 {
2730 double
2731 sum = 0.0,
2732 term,
2733 *diagptable,
2734 *left,
2735 *right,
2736 *left_ptr = x1,
2737 *right_ptr = x2;
2738
2739 int
2740 i,
2741 l;
2742
2743 if(tipX1)
2744 {
2745 for (i = 0; i < n; i++)
2746 {
2747 left = &(tipVector[20 * tipX1[i]]);
2748
2749 if(isGap(x2_gap, i))
2750 right = x2_gapColumn;
2751 else
2752 {
2753 right = right_ptr;
2754 right_ptr += 20;
2755 }
2756
2757 diagptable = &diagptable_start[20 * cptr[i]];
2758
2759 __m128d tv = _mm_setzero_pd();
2760
2761 for(l = 0; l < 20; l+=2)
2762 {
2763 __m128d lv = _mm_load_pd(&left[l]);
2764 __m128d rv = _mm_load_pd(&right[l]);
2765 __m128d mul = _mm_mul_pd(lv, rv);
2766 __m128d dv = _mm_load_pd(&diagptable[l]);
2767
2768 tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));
2769 }
2770
2771 tv = _mm_hadd_pd(tv, tv);
2772 _mm_storel_pd(&term, tv);
2773
2774 if(!fastScaling)
2775 term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2776 else
2777 term = log(fabs(term));
2778
2779 sum += wptr[i] * term;
2780 }
2781 }
2782 else
2783 {
2784
2785 for (i = 0; i < n; i++)
2786 {
2787 if(isGap(x1_gap, i))
2788 left = x1_gapColumn;
2789 else
2790 {
2791 left = left_ptr;
2792 left_ptr += 20;
2793 }
2794
2795 if(isGap(x2_gap, i))
2796 right = x2_gapColumn;
2797 else
2798 {
2799 right = right_ptr;
2800 right_ptr += 20;
2801 }
2802
2803 diagptable = &diagptable_start[20 * cptr[i]];
2804
2805 __m128d tv = _mm_setzero_pd();
2806
2807 for(l = 0; l < 20; l+=2)
2808 {
2809 __m128d lv = _mm_load_pd(&left[l]);
2810 __m128d rv = _mm_load_pd(&right[l]);
2811 __m128d mul = _mm_mul_pd(lv, rv);
2812 __m128d dv = _mm_load_pd(&diagptable[l]);
2813
2814 tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));
2815 }
2816
2817 tv = _mm_hadd_pd(tv, tv);
2818 _mm_storel_pd(&term, tv);
2819
2820 if(!fastScaling)
2821 term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2822 else
2823 term = log(fabs(term));
2824
2825 sum += wptr[i] * term;
2826 }
2827 }
2828
2829 return sum;
2830 }
2831
2832
2833 /** @ingroup evaluateLikelihoodGroup
2834 @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity with memory saving
2835 (Optimized SSE3 version for DNA data)
2836
2837 This is the SSE3 optimized version of ::evaluateCAT_FLEX_SAVE for evaluating the log
2838 likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2839 vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX_SAVE for more information and
2840 a description of the common input parameters
2841 */
evaluateGTRCAT_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)2842 static double evaluateGTRCAT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
2843 double *x1_start, double *x2_start, double *tipVector,
2844 unsigned char *tipX1, int n, double *diagptable_start,
2845 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
2846 {
2847 double sum = 0.0, term;
2848 int i;
2849
2850 double *diagptable,
2851 *x1,
2852 *x2,
2853 *x1_ptr = x1_start,
2854 *x2_ptr = x2_start;
2855
2856 if(tipX1)
2857 {
2858 for (i = 0; i < n; i++)
2859 {
2860 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2861 __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
2862
2863 x1 = &(tipVector[4 * tipX1[i]]);
2864
2865 if(isGap(x2_gap, i))
2866 x2 = x2_gapColumn;
2867 else
2868 {
2869 x2 = x2_ptr;
2870 x2_ptr += 4;
2871 }
2872
2873 diagptable = &diagptable_start[4 * cptr[i]];
2874
2875 x1v1 = _mm_load_pd(&x1[0]);
2876 x1v2 = _mm_load_pd(&x1[2]);
2877 x2v1 = _mm_load_pd(&x2[0]);
2878 x2v2 = _mm_load_pd(&x2[2]);
2879 dv1 = _mm_load_pd(&diagptable[0]);
2880 dv2 = _mm_load_pd(&diagptable[2]);
2881
2882 x1v1 = _mm_mul_pd(x1v1, x2v1);
2883 x1v1 = _mm_mul_pd(x1v1, dv1);
2884
2885 x1v2 = _mm_mul_pd(x1v2, x2v2);
2886 x1v2 = _mm_mul_pd(x1v2, dv2);
2887
2888 x1v1 = _mm_add_pd(x1v1, x1v2);
2889
2890 _mm_store_pd(t, x1v1);
2891
2892 if(!fastScaling)
2893 term = log(fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
2894 else
2895 term = log(fabs(t[0] + t[1]));
2896
2897
2898
2899 sum += wptr[i] * term;
2900 }
2901 }
2902 else
2903 {
2904 for (i = 0; i < n; i++)
2905 {
2906 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2907 __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
2908
2909 if(isGap(x1_gap, i))
2910 x1 = x1_gapColumn;
2911 else
2912 {
2913 x1 = x1_ptr;
2914 x1_ptr += 4;
2915 }
2916
2917 if(isGap(x2_gap, i))
2918 x2 = x2_gapColumn;
2919 else
2920 {
2921 x2 = x2_ptr;
2922 x2_ptr += 4;
2923 }
2924
2925 diagptable = &diagptable_start[4 * cptr[i]];
2926
2927 x1v1 = _mm_load_pd(&x1[0]);
2928 x1v2 = _mm_load_pd(&x1[2]);
2929 x2v1 = _mm_load_pd(&x2[0]);
2930 x2v2 = _mm_load_pd(&x2[2]);
2931 dv1 = _mm_load_pd(&diagptable[0]);
2932 dv2 = _mm_load_pd(&diagptable[2]);
2933
2934 x1v1 = _mm_mul_pd(x1v1, x2v1);
2935 x1v1 = _mm_mul_pd(x1v1, dv1);
2936
2937 x1v2 = _mm_mul_pd(x1v2, x2v2);
2938 x1v2 = _mm_mul_pd(x1v2, dv2);
2939
2940 x1v1 = _mm_add_pd(x1v1, x1v2);
2941
2942 _mm_store_pd(t, x1v1);
2943
2944
2945 if(!fastScaling)
2946 term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
2947 else
2948 term = log(fabs(t[0] + t[1]));
2949
2950 sum += wptr[i] * term;
2951 }
2952 }
2953
2954 return sum;
2955 }
2956
2957
2958 /** @ingroup evaluateLikelihoodGroup
2959 @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity with memory saving
2960 (Optimized SSE3 version for DNA data)
2961
2962 This is the SSE3 optimized version of ::evaluateGAMMA_FLEX_SAVE for evaluating the log
2963 likelihood at some edge whose two end-points (nodes) have the conditional likelihood
2964 vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX_SAVE for more information and
2965 a description of the common input parameters
2966 */
evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable,double * x1_gapColumn,double * x2_gapColumn,unsigned int * x1_gap,unsigned int * x2_gap)2967 static double evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
2968 double *x1_start, double *x2_start,
2969 double *tipVector,
2970 unsigned char *tipX1, const int n, double *diagptable,
2971 double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
2972 {
2973 double sum = 0.0, term;
2974 int i, j;
2975 double
2976 *x1,
2977 *x2,
2978 *x1_ptr = x1_start,
2979 *x2_ptr = x2_start;
2980
2981
2982
2983 if(tipX1)
2984 {
2985
2986
2987 for (i = 0; i < n; i++)
2988 {
2989 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
2990 __m128d termv, x1v, x2v, dv;
2991
2992 x1 = &(tipVector[4 * tipX1[i]]);
2993 if(x2_gap[i / 32] & mask32[i % 32])
2994 x2 = x2_gapColumn;
2995 else
2996 {
2997 x2 = x2_ptr;
2998 x2_ptr += 16;
2999 }
3000
3001
3002 termv = _mm_set1_pd(0.0);
3003
3004 for(j = 0; j < 4; j++)
3005 {
3006 x1v = _mm_load_pd(&x1[0]);
3007 x2v = _mm_load_pd(&x2[j * 4]);
3008 dv = _mm_load_pd(&diagptable[j * 4]);
3009
3010 x1v = _mm_mul_pd(x1v, x2v);
3011 x1v = _mm_mul_pd(x1v, dv);
3012
3013 termv = _mm_add_pd(termv, x1v);
3014
3015 x1v = _mm_load_pd(&x1[2]);
3016 x2v = _mm_load_pd(&x2[j * 4 + 2]);
3017 dv = _mm_load_pd(&diagptable[j * 4 + 2]);
3018
3019 x1v = _mm_mul_pd(x1v, x2v);
3020 x1v = _mm_mul_pd(x1v, dv);
3021
3022 termv = _mm_add_pd(termv, x1v);
3023 }
3024
3025 _mm_store_pd(t, termv);
3026
3027 if(!fastScaling)
3028 term = log(0.25 * fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
3029 else
3030 term = log(0.25 * fabs(t[0] + t[1]));
3031
3032
3033 sum += wptr[i] * term;
3034 }
3035 }
3036 else
3037 {
3038
3039 for (i = 0; i < n; i++)
3040 {
3041 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
3042 __m128d termv, x1v, x2v, dv;
3043
3044 if(x1_gap[i / 32] & mask32[i % 32])
3045 x1 = x1_gapColumn;
3046 else
3047 {
3048 x1 = x1_ptr;
3049 x1_ptr += 16;
3050 }
3051
3052 if(x2_gap[i / 32] & mask32[i % 32])
3053 x2 = x2_gapColumn;
3054 else
3055 {
3056 x2 = x2_ptr;
3057 x2_ptr += 16;
3058 }
3059
3060 termv = _mm_set1_pd(0.0);
3061
3062 for(j = 0; j < 4; j++)
3063 {
3064 x1v = _mm_load_pd(&x1[j * 4]);
3065 x2v = _mm_load_pd(&x2[j * 4]);
3066 dv = _mm_load_pd(&diagptable[j * 4]);
3067
3068 x1v = _mm_mul_pd(x1v, x2v);
3069 x1v = _mm_mul_pd(x1v, dv);
3070
3071 termv = _mm_add_pd(termv, x1v);
3072
3073 x1v = _mm_load_pd(&x1[j * 4 + 2]);
3074 x2v = _mm_load_pd(&x2[j * 4 + 2]);
3075 dv = _mm_load_pd(&diagptable[j * 4 + 2]);
3076
3077 x1v = _mm_mul_pd(x1v, x2v);
3078 x1v = _mm_mul_pd(x1v, dv);
3079
3080 termv = _mm_add_pd(termv, x1v);
3081 }
3082
3083 _mm_store_pd(t, termv);
3084
3085 if(!fastScaling)
3086 term = log(0.25 * fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
3087 else
3088 term = log(0.25 * fabs(t[0] + t[1]));
3089
3090
3091 sum += wptr[i] * term;
3092 }
3093 }
3094
3095 return sum;
3096 }
3097
3098
3099 /** @ingroup evaluateLikelihoodGroup
3100 @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity (Optimized SSE3 version for DNA data)
3101
3102 This is the SSE3 optimized version of ::evaluateGAMMA_FLEX for evaluating the log
3103 likelihood at some edge whose two end-points (nodes) have the conditional likelihood
3104 vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX for more information and
3105 a description of the common input parameters
3106 */
evaluateGTRGAMMA(const pllBoolean fastScaling,int * ex1,int * ex2,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,const int n,double * diagptable)3107 static double evaluateGTRGAMMA(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
3108 double *x1_start, double *x2_start,
3109 double *tipVector,
3110 unsigned char *tipX1, const int n, double *diagptable)
3111 {
3112 double sum = 0.0, term;
3113 int i, j;
3114
3115 double *x1, *x2;
3116
3117
3118
3119 if(tipX1)
3120 {
3121 for (i = 0; i < n; i++)
3122 {
3123 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
3124 __m128d termv, x1v, x2v, dv;
3125
3126 x1 = &(tipVector[4 * tipX1[i]]);
3127 x2 = &x2_start[16 * i];
3128
3129
3130 termv = _mm_set1_pd(0.0);
3131
3132 for(j = 0; j < 4; j++)
3133 {
3134 x1v = _mm_load_pd(&x1[0]);
3135 x2v = _mm_load_pd(&x2[j * 4]);
3136 dv = _mm_load_pd(&diagptable[j * 4]);
3137
3138 x1v = _mm_mul_pd(x1v, x2v);
3139 x1v = _mm_mul_pd(x1v, dv);
3140
3141 termv = _mm_add_pd(termv, x1v);
3142
3143 x1v = _mm_load_pd(&x1[2]);
3144 x2v = _mm_load_pd(&x2[j * 4 + 2]);
3145 dv = _mm_load_pd(&diagptable[j * 4 + 2]);
3146
3147 x1v = _mm_mul_pd(x1v, x2v);
3148 x1v = _mm_mul_pd(x1v, dv);
3149
3150 termv = _mm_add_pd(termv, x1v);
3151 }
3152
3153 _mm_store_pd(t, termv);
3154
3155
3156 if(!fastScaling)
3157 term = log(0.25 * fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
3158 else
3159 term = log(0.25 * fabs(t[0] + t[1]));
3160
3161
3162
3163 sum += wptr[i] * term;
3164 }
3165 }
3166 else
3167 {
3168 for (i = 0; i < n; i++)
3169 {
3170 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
3171 __m128d termv, x1v, x2v, dv;
3172
3173
3174 x1 = &x1_start[16 * i];
3175 x2 = &x2_start[16 * i];
3176
3177
3178 termv = _mm_set1_pd(0.0);
3179
3180 for(j = 0; j < 4; j++)
3181 {
3182 x1v = _mm_load_pd(&x1[j * 4]);
3183 x2v = _mm_load_pd(&x2[j * 4]);
3184 dv = _mm_load_pd(&diagptable[j * 4]);
3185
3186 x1v = _mm_mul_pd(x1v, x2v);
3187 x1v = _mm_mul_pd(x1v, dv);
3188
3189 termv = _mm_add_pd(termv, x1v);
3190
3191 x1v = _mm_load_pd(&x1[j * 4 + 2]);
3192 x2v = _mm_load_pd(&x2[j * 4 + 2]);
3193 dv = _mm_load_pd(&diagptable[j * 4 + 2]);
3194
3195 x1v = _mm_mul_pd(x1v, x2v);
3196 x1v = _mm_mul_pd(x1v, dv);
3197
3198 termv = _mm_add_pd(termv, x1v);
3199 }
3200
3201 _mm_store_pd(t, termv);
3202
3203 if(!fastScaling)
3204 term = log(0.25 * fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
3205 else
3206 term = log(0.25 * fabs(t[0] + t[1]));
3207
3208
3209
3210 sum += wptr[i] * term;
3211 }
3212 }
3213
3214 return sum;
3215 }
3216
3217
3218 /** @ingroup evaluateLikelihoodGroup
3219 @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity (Optimized SSE3 version for DNA data)
3220
3221 This is the SSE3 optimized version of ::evaluateCAT_FLEX for evaluating the log
3222 likelihood at some edge whose two end-points (nodes) have the conditional likelihood
3223 vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX for more information and
3224 a description of the common input parameters
3225 */
evaluateGTRCAT(const pllBoolean fastScaling,int * ex1,int * ex2,int * cptr,int * wptr,double * x1_start,double * x2_start,double * tipVector,unsigned char * tipX1,int n,double * diagptable_start)3226 static double evaluateGTRCAT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
3227 double *x1_start, double *x2_start, double *tipVector,
3228 unsigned char *tipX1, int n, double *diagptable_start)
3229 {
3230 double sum = 0.0, term;
3231 int i;
3232
3233 double *diagptable, *x1, *x2;
3234
3235 if(tipX1)
3236 {
3237 for (i = 0; i < n; i++)
3238 {
3239 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
3240 __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
3241
3242 x1 = &(tipVector[4 * tipX1[i]]);
3243 x2 = &x2_start[4 * i];
3244
3245 diagptable = &diagptable_start[4 * cptr[i]];
3246
3247
3248 x1v1 = _mm_load_pd(&x1[0]);
3249 x1v2 = _mm_load_pd(&x1[2]);
3250 x2v1 = _mm_load_pd(&x2[0]);
3251 x2v2 = _mm_load_pd(&x2[2]);
3252 dv1 = _mm_load_pd(&diagptable[0]);
3253 dv2 = _mm_load_pd(&diagptable[2]);
3254
3255 x1v1 = _mm_mul_pd(x1v1, x2v1);
3256 x1v1 = _mm_mul_pd(x1v1, dv1);
3257
3258 x1v2 = _mm_mul_pd(x1v2, x2v2);
3259 x1v2 = _mm_mul_pd(x1v2, dv2);
3260
3261 x1v1 = _mm_add_pd(x1v1, x1v2);
3262
3263 _mm_store_pd(t, x1v1);
3264
3265 if(!fastScaling)
3266 term = log(fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
3267 else
3268 term = log(fabs(t[0] + t[1]));
3269
3270
3271 sum += wptr[i] * term;
3272 }
3273 }
3274 else
3275 {
3276 for (i = 0; i < n; i++)
3277 {
3278 PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
3279 __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
3280
3281 x1 = &x1_start[4 * i];
3282 x2 = &x2_start[4 * i];
3283
3284 diagptable = &diagptable_start[4 * cptr[i]];
3285
3286
3287 x1v1 = _mm_load_pd(&x1[0]);
3288 x1v2 = _mm_load_pd(&x1[2]);
3289 x2v1 = _mm_load_pd(&x2[0]);
3290 x2v2 = _mm_load_pd(&x2[2]);
3291 dv1 = _mm_load_pd(&diagptable[0]);
3292 dv2 = _mm_load_pd(&diagptable[2]);
3293
3294 x1v1 = _mm_mul_pd(x1v1, x2v1);
3295 x1v1 = _mm_mul_pd(x1v1, dv1);
3296
3297 x1v2 = _mm_mul_pd(x1v2, x2v2);
3298 x1v2 = _mm_mul_pd(x1v2, dv2);
3299
3300 x1v1 = _mm_add_pd(x1v1, x1v2);
3301
3302 _mm_store_pd(t, x1v1);
3303
3304 if(!fastScaling)
3305 term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
3306 else
3307 term = log(fabs(t[0] + t[1]));
3308
3309
3310 sum += wptr[i] * term;
3311 }
3312 }
3313
3314 return sum;
3315 }
3316
3317
3318
3319
3320
3321 #endif
3322