1 /*
2  *  MrBayes 3
3  *
4  *  (c) 2002-2013
5  *
6  *  John P. Huelsenbeck
7  *  Dept. Integrative Biology
8  *  University of California, Berkeley
9  *  Berkeley, CA 94720-3140
10  *  johnh@berkeley.edu
11  *
12  *  Fredrik Ronquist
13  *  Swedish Museum of Natural History
14  *  Box 50007
15  *  SE-10405 Stockholm, SWEDEN
16  *  fredrik.ronquist@nrm.se
17  *
18  *  With important contributions by
19  *
20  *  Paul van der Mark (paulvdm@sc.fsu.edu)
21  *  Maxim Teslenko (maxkth@gmail.com)
22  *  Chi Zhang (zhangchicool@gmail.com)
23  *
24  *  and by many users (run 'acknowledgments' to see more info)
25  *
26  * This program is free software; you can redistribute it and/or
27  * modify it under the terms of the GNU General Public License
28  * as published by the Free Software Foundation; either version 2
29  * of the License, or (at your option) any later version.
30  *
31  * This program is distributed in the hope that it will be useful,
32  * but WITHOUT ANY WARRANTY; without even the implied warranty of
33  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34  * GNU General Public License for more details (www.gnu.org).
35  *
36  */
37 
38 #include "bayes.h"
39 #include "likelihood.h"
40 #include "mbbeagle.h"
41 #include "model.h"
42 #include "utils.h"
43 
44 #define LIKE_EPSILON                1.0e-300
45 
46 /* global variables declared here */
/* Scratch tables used by the tip shortcuts of the CondLikeDown_* routines in
   this file. When a descendant is a terminal that is not partially ambiguous,
   the routine fills preLikeL (left) or preLikeR (right) from that branch's
   transition probabilities. In the general n-state routines each rate
   category occupies numStates*numModelStates entries (one column of the
   transition matrix per observed state, summed over the column blocks when
   numModelStates exceeds numStates) followed by numModelStates entries of
   1.0 that serve ambiguous states. Allocation and sizing of these buffers
   happen outside the code visible here. */
47 CLFlt     *preLikeL;                  /* precalculated cond likes for left descendant */
48 CLFlt     *preLikeR;                  /* precalculated cond likes for right descendant*/
49 CLFlt     *preLikeA;                  /* precalculated cond likes for ancestor        */
50 
51 /* global variables used here but declared elsewhere */
52 extern int      *chainId;
53 extern int      numLocalChains;
54 extern int      rateProbRowSize;            /* size of rate probs for one chain one state   */
55 extern MrBFlt   **rateProbs;                /* pointers to rate probs used by adgamma model */
56 
57 /* local prototypes */
58 void      CopySiteScalers (ModelInfo *m, int chain);
59 void      FlipCondLikeSpace (ModelInfo *m, int chain, int nodeIndex);
60 void      FlipCijkSpace (ModelInfo *m, int chain);
61 void      FlipNodeScalerSpace (ModelInfo *m, int chain, int nodeIndex);
62 void      FlipSiteScalerSpace (ModelInfo *m, int chain);
63 void      FlipTiProbsSpace (ModelInfo *m, int chain, int nodeIndex);
64 MrBFlt    GetRate (int division, int chain);
65 int       RemoveNodeScalers(TreeNode *p, int division, int chain);
66 #if defined (SSE_ENABLED)
67 int       RemoveNodeScalers_SSE(TreeNode *p, int division, int chain);
68 #endif
69 #if defined (AVX_ENABLED)
70 int       RemoveNodeScalers_AVX(TreeNode *p, int division, int chain);
71 #endif
72 void      ResetSiteScalers (ModelInfo *m, int chain);
73 int       SetBinaryQMatrix (MrBFlt **a, int whichChain, int division);
74 int       SetNucQMatrix (MrBFlt **a, int n, int whichChain, int division, MrBFlt rateMult, MrBFlt *rA, MrBFlt *rS);
75 int       SetStdQMatrix (MrBFlt **a, int nStates, MrBFlt *bs, int cType);
76 int       SetProteinQMatrix (MrBFlt **a, int n, int whichChain, int division, MrBFlt rateMult);
77 int       UpDateCijk (int whichPart, int whichChain);
78 
79 
80 #if !defined (SSE_ENABLED) || 1
81 /*----------------------------------------------------------------
82 |
83 |   CondLikeDown_Bin: binary model with or without rate
84 |       variation
85 |
86 -----------------------------------------------------------------*/
CondLikeDown_Bin(TreeNode * p,int division,int chain)87 int CondLikeDown_Bin (TreeNode *p, int division, int chain)
88 {
89     int             c, k;
90     CLFlt           *clL, *clR, *clP, *pL, *pR, *tiPL, *tiPR;
91     ModelInfo       *m;
92 
93     /* find model settings for this division */
94     m = &modelSettings[division];
95 
96     /* Flip conditional likelihood space */
97     FlipCondLikeSpace (m, chain, p->index);
98 
99     /* find conditional likelihood pointers */
100     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
101     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
102     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
103 
104     /* find transition probabilities */
105     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
106     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
107 
108     tiPL = pL;
109     tiPR = pR;
110     for (k=0; k<m->numRateCats; k++)
111         {
112         for (c=0; c<m->numChars; c++)
113             {
114             *(clP++) = (tiPL[0]*clL[0] + tiPL[1]*clL[1])
115                       *(tiPR[0]*clR[0] + tiPR[1]*clR[1]);
116             *(clP++) = (tiPL[2]*clL[0] + tiPL[3]*clL[1])
117                       *(tiPR[2]*clR[0] + tiPR[3]*clR[1]);
118 
119             clL += 2;
120             clR += 2;
121             }
122         tiPL += 4;
123         tiPR += 4;
124         }
125 
126     return NO_ERROR;
127 
128 }
129 #endif
130 
131 
132 #if defined (SSE_ENABLED)
133 /*----------------------------------------------------------------
134 |
135 |   CondLikeDown_Bin_SSE: binary model with or without rate
136 |       variation
137 |
138 -----------------------------------------------------------------*/
CondLikeDown_Bin_SSE(TreeNode * p,int division,int chain)139 int CondLikeDown_Bin_SSE (TreeNode *p, int division, int chain)
140 {
141     int             c, k;
142     CLFlt           *pL, *pR, *tiPL, *tiPR;
143     __m128          *clL, *clR, *clP;
144     __m128          m1, m2, m3, m4, m5, m6;
145     ModelInfo       *m;
146 
147     m = &modelSettings[division];
148 
149     /* flip state of node so that we are not overwriting old cond likes */
150     FlipCondLikeSpace (m, chain, p->index);
151 
152     /* find conditional likelihood pointers */
153     clL = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
154     clR = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
155     clP = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
156 
157     /* find transition probabilities */
158     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
159     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
160 
161     tiPL = pL;
162     tiPR = pR;
163     for (k=0; k<m->numRateCats; k++)
164         {
165         for (c=0; c<m->numVecChars; c++)
166             {
167             m1 = _mm_load1_ps (&tiPL[0]);
168             m2 = _mm_load1_ps (&tiPR[0]);
169             m5 = _mm_mul_ps (m1, clL[0]);
170             m6 = _mm_mul_ps (m2, clR[0]);
171 
172             m1 = _mm_load1_ps (&tiPL[1]);
173             m2 = _mm_load1_ps (&tiPR[1]);
174             m3 = _mm_mul_ps (m1, clL[1]);
175             m4 = _mm_mul_ps (m2, clR[1]);
176 
177             m5 = _mm_add_ps (m3, m5);
178             m6 = _mm_add_ps (m4, m6);
179 
180             *clP++ = _mm_mul_ps (m5, m6);
181 
182             m1 = _mm_load1_ps (&tiPL[2]);
183             m2 = _mm_load1_ps (&tiPR[2]);
184             m5 = _mm_mul_ps (m1, clL[0]);
185             m6 = _mm_mul_ps (m2, clR[0]);
186 
187             m1 = _mm_load1_ps (&tiPL[3]);
188             m2 = _mm_load1_ps (&tiPR[3]);
189             m3 = _mm_mul_ps (m1, clL[1]);
190             m4 = _mm_mul_ps (m2, clR[1]);
191 
192             m5 = _mm_add_ps (m3, m5);
193             m6 = _mm_add_ps (m4, m6);
194 
195             *clP++ = _mm_mul_ps (m5, m6);
196             clL += 2;
197             clR += 2;
198             }
199         tiPL += 4;
200         tiPR += 4;
201         }
202 
203     return NO_ERROR;
204 }
205 #endif
206 
207 
208 /*----------------------------------------------------------------
209 |
210 |   CondLikeDown_Gen: general n-state model with or without rate
211 |       variation
212 |
213 -----------------------------------------------------------------*/
CondLikeDown_Gen(TreeNode * p,int division,int chain)214 int CondLikeDown_Gen (TreeNode *p, int division, int chain)
215 {
216     int             a, b, c, h, i, k, j, shortCut, *lState=NULL, *rState=NULL,
217                     nObsStates, nStates, nStatesSquared, preLikeJump;
218     CLFlt           likeL, likeR, *pL, *pR, *tiPL, *tiPR, *clL, *clR, *clP;
219     ModelInfo       *m;
220 #   if !defined (DEBUG_NOSHORTCUTS)
221     int catStart;
222 #   endif
223 
224     /* find model settings for this division and nStates, nStatesSquared */
225     m = &modelSettings[division];
226     nObsStates = m->numStates;
227     nStates = m->numModelStates;
228     nStatesSquared = nStates * nStates;
229     preLikeJump = nObsStates * nStates;
230 
231     /* flip conditional likelihood space */
232     FlipCondLikeSpace (m, chain, p->index);
233 
234     /* find conditional likelihood pointers */
235     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
236     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
237     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
238 
239     /* find transition probabilities */
240     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
241     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
242 
243     /* find likelihoods of site patterns for left branch if terminal */
244     shortCut = 0;
245 #   if !defined (DEBUG_NOSHORTCUTS)
246     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
247         {
248         shortCut |= 1;
249         lState = m->termState[p->left->index];
250         tiPL = pL;
251         for (k=a=0; k<m->numRateCats; k++)
252             {
253             catStart = a;
254             for (i=0; i<nObsStates; i++)
255                 for (j=i; j<nStatesSquared; j+=nStates)
256                     preLikeL[a++] = tiPL[j];
257             for (b=1; b<nStates/nObsStates; b++)
258                 {
259                 a = catStart;
260                 for (i=0; i<nObsStates; i++)
261                     {
262                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
263                         preLikeL[a++] += tiPL[j];
264                     }
265                 }
266             /* for ambiguous */
267             for (i=0; i<nStates; i++)
268                 preLikeL[a++] = 1.0;
269             tiPL += nStatesSquared;
270             }
271         }
272 
273     /* find likelihoods of site patterns for right branch if terminal */
274     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
275         {
276         shortCut |= 2;
277         rState = m->termState[p->right->index];
278         tiPR = pR;
279         for (k=a=0; k<m->numRateCats; k++)
280             {
281             catStart = a;
282             for (i=0; i<nObsStates; i++)
283                 for (j=i; j<nStatesSquared; j+=nStates)
284                     preLikeR[a++] = tiPR[j];
285             for (b=1; b<nStates/nObsStates; b++)
286                 {
287                 a = catStart;
288                 for (i=0; i<nObsStates; i++)
289                     {
290                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
291                         preLikeR[a++] += tiPR[j];
292                     }
293                 }
294             /* for ambiguous */
295             for (i=0; i<nStates; i++)
296                 preLikeR[a++] = 1.0;
297             tiPR += nStatesSquared;
298             }
299         }
300 #   endif
301     switch (shortCut)
302         {
303         case 0:
304             tiPL = pL;
305             tiPR = pR;
306             for (k=0; k<m->numRateCats; k++)
307                 {
308                 for (c=0; c<m->numChars; c++)
309                     {
310                     for (i=h=0; i<nStates; i++)
311                         {
312                         likeL = likeR = 0.0;
313                         for (j=0; j<nStates; j++)
314                             {
315                             likeL += tiPL[h]*clL[j];
316                             likeR += tiPR[h++]*clR[j];
317                             }
318                         *(clP++) = likeL * likeR;
319                         }
320                     clL += nStates;
321                     clR += nStates;
322                     }
323                 tiPL += nStatesSquared;
324                 tiPR += nStatesSquared;
325                 }
326             break;
327         case 1:
328             tiPR = pR;
329             for (k=0; k<m->numRateCats; k++)
330                 {
331                 for (c=0; c<m->numChars; c++)
332                     {
333                     a = lState[c] + k*(preLikeJump+nStates);
334                     for (i=h=0; i<nStates; i++)
335                         {
336                         likeR = 0.0;
337                         for (j=0; j<nStates; j++)
338                             {
339                             likeR += tiPR[h++]*clR[j];
340                             }
341                         *(clP++) = preLikeL[a++] * likeR;
342                         }
343                     clR += nStates;
344                     }
345                 tiPR += nStatesSquared;
346                 }
347             break;
348         case 2:
349             tiPL = pL;
350             for (k=0; k<m->numRateCats; k++)
351                 {
352                 for (c=0; c<m->numChars; c++)
353                     {
354                     a = rState[c] + k*(preLikeJump+nStates);
355                     for (i=h=0; i<nStates; i++)
356                         {
357                         likeL = 0.0;
358                         for (j=0; j<nStates; j++)
359                             {
360                             likeL += tiPL[h++]*clL[j];
361                             }
362                         *(clP++) = preLikeR[a++] * likeL;
363                         }
364                     clL += nStates;
365                     }
366                 tiPL += nStatesSquared;
367                 }
368             break;
369         case 3:
370             for (k=0; k<m->numRateCats; k++)
371                 {
372                 for (c=0; c<m->numChars; c++)
373                     {
374                     a = rState[c] + k*(preLikeJump+nStates);
375                     b = lState[c] + k*(preLikeJump+nStates);
376                     for (i=0; i<nStates; i++)
377                         {
378                         *(clP++) = preLikeR[a++] * preLikeL[b++];
379                         }
380                     }
381                 }
382             break;
383         }
384 
385     return NO_ERROR;
386 }
387 
388 
389 #if defined (SSE_ENABLED)
390 /*----------------------------------------------------------------
391 |
392 |   CondLikeDown_Gen_SSE: general n-state model with or without rate
393 |       variation
394 |
395 -----------------------------------------------------------------*/
CondLikeDown_Gen_SSE(TreeNode * p,int division,int chain)396 int CondLikeDown_Gen_SSE (TreeNode *p, int division, int chain)
397 {
398     int             c, c1, h, i, j, k, t, shortCut, *lState=NULL, *rState=NULL, nStates, nStatesSquared, nObsStates, preLikeJump;
399     CLFlt           *pL, *pR, *tiPL, *tiPR;
400     __m128          *clL, *clR, *clP;
401     __m128          mTiPL, mTiPR, mL, mR, mAcumL, mAcumR;
402     ModelInfo       *m;
403     CLFlt           *preLikeRV[4] = {0};
404     CLFlt           *preLikeLV[4] = {0};
405 
406 #   if !defined (DEBUG_NOSHORTCUTS)
407     int             a, b, catStart;
408 #   endif
409 
410     /* find model settings for this division and nStates, nStatesSquared */
411     m = &modelSettings[division];
412     nObsStates = m->numStates;
413     nStates = m->numModelStates;
414     nStatesSquared = nStates * nStates;
415     preLikeJump = nObsStates * nStates;
416 
417     /* Flip conditional likelihood space */
418     FlipCondLikeSpace (m, chain, p->index);
419 
420     /* find conditional likelihood pointers */
421     clL = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->left->index ]];
422     clR = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->right->index]];
423     clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index       ]];
424 
425     /* find transition probabilities */
426     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
427     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
428 
429     /* find likelihoods of site patterns for left branch if terminal */
430     shortCut = 0;
431 #   if !defined (DEBUG_NOSHORTCUTS)
432     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
433         {
434         shortCut |= 1;
435         lState = m->termState[p->left->index];
436         tiPL = pL;
437         for (k=a=0; k<m->numRateCats; k++)
438             {
439             catStart = a;
440             for (i=0; i<nObsStates; i++)
441                 for (j=i; j<nStatesSquared; j+=nStates)
442                     preLikeL[a++] = tiPL[j];
443             for (b=1; b<nStates/nObsStates; b++)
444                 {
445                 a = catStart;
446                 for (i=0; i<nObsStates; i++)
447                     {
448                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
449                         preLikeL[a++] += tiPL[j];
450                     }
451                 }
452             /* for ambiguous */
453             for (i=0; i<nStates; i++)
454                 preLikeL[a++] = 1.0;
455             tiPL += nStatesSquared;
456             }
457         }
458 
459     /* find likelihoods of site patterns for right branch if terminal */
460     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
461         {
462         shortCut |= 2;
463         rState = m->termState[p->right->index];
464         tiPR = pR;
465         for (k=a=0; k<m->numRateCats; k++)
466             {
467             catStart = a;
468             for (i=0; i<nObsStates; i++)
469                 for (j=i; j<nStatesSquared; j+=nStates)
470                     preLikeR[a++] = tiPR[j];
471             for (b=1; b<nStates/nObsStates; b++)
472                 {
473                 a = catStart;
474                 for (i=0; i<nObsStates; i++)
475                     {
476                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
477                         preLikeR[a++] += tiPR[j];
478                     }
479                 }
480             /* for ambiguous */
481             for (i=0; i<nStates; i++)
482                 preLikeR[a++] = 1.0;
483             tiPR += nStatesSquared;
484             }
485         }
486 #   endif
487 
488     switch (shortCut)
489         {
490         case 0:
491             tiPL = pL;
492             tiPR = pR;
493             for (k=0; k<m->numRateCats; k++)
494                 {
495                 for (c=0; c<m->numVecChars; c++)
496                     {
497                     for (i=h=0; i<nStates; i++)
498                         {
499                         mAcumL = _mm_setzero_ps();
500                         mAcumR = _mm_setzero_ps();
501                         for (j=0; j<nStates; j++)
502                             {
503                             mTiPL  = _mm_load1_ps (&tiPL[h]);
504                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
505                             mL     = _mm_mul_ps (mTiPL, clL[j]);
506                             mR     = _mm_mul_ps (mTiPR, clR[j]);
507                             mAcumL = _mm_add_ps (mL, mAcumL);
508                             mAcumR = _mm_add_ps (mR, mAcumR);
509                             }
510                         *(clP++) = _mm_mul_ps (mAcumL, mAcumR);
511                         }
512                     clL += nStates;
513                     clR += nStates;
514                     }
515                 tiPL += nStatesSquared;
516                 tiPR += nStatesSquared;
517                 }
518             break;
519         case 1:
520             tiPR = pR;
521             for (k=0; k<m->numRateCats; k++)
522                 {
523                 for (c=t=0; c<m->numVecChars; c++)
524                     {
525                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
526                         {
527                         preLikeLV[c1] = &preLikeL[lState[t] + k*(preLikeJump+nStates)];
528                         }
529                     for (i=h=0; i<nStates; i++)
530                         {
531                         mAcumL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
532                         mAcumR = _mm_setzero_ps();
533                         for (j=0; j<nStates; j++)
534                             {
535                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
536                             mR     = _mm_mul_ps (mTiPR, clR[j]);
537                             mAcumR = _mm_add_ps (mR, mAcumR);
538                             }
539                         *(clP++) = _mm_mul_ps (mAcumL,mAcumR);
540                         }
541                     clR += nStates;
542                     }
543                 tiPR += nStatesSquared;
544                 }
545             break;
546         case 2:
547             tiPL = pL;
548             for (k=0; k<m->numRateCats; k++)
549                 {
550                 for (c=t=0; c<m->numVecChars; c++)
551                     {
552                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
553                         {
554                         preLikeRV[c1] = &preLikeR[rState[t] + k*(preLikeJump+nStates)];
555                         }
556                     for (i=h=0; i<nStates; i++)
557                         {
558                         mAcumR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
559                         mAcumL = _mm_setzero_ps();
560                         for (j=0; j<nStates; j++)
561                             {
562                             mTiPL  = _mm_load1_ps (&tiPL[h++]);
563                             mL     = _mm_mul_ps (mTiPL, clL[j]);
564                             mAcumL = _mm_add_ps (mL, mAcumL);
565                             }
566                         *(clP++) = _mm_mul_ps (mAcumL,mAcumR);
567                         }
568                     clL += nStates;
569                     }
570                 tiPL += nStatesSquared;
571                 }
572             break;
573         case 3:
574             for (k=0; k<m->numRateCats; k++)
575                 {
576                 for (c=t=0; c<m->numVecChars; c++)
577                     {
578                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
579                         {
580                         preLikeRV[c1] = &preLikeR[rState[t] + k*(preLikeJump+nStates)];
581                         preLikeLV[c1] = &preLikeL[lState[t] + k*(preLikeJump+nStates)];
582                         }
583                     for (i=0; i<nStates; i++)
584                         {
585                         assert (m->numFloatsPerVec == 4); /* In the following 2 statments we assume that SSE register can hold exactly 4 ClFlts. */
586                         mL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
587                         mR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
588                         *(clP++) = _mm_mul_ps (mL,mR);
589                         }
590                     }
591                 }
592             break;
593         }
594     return NO_ERROR;
595 }
596 #endif
597 
598 
599 /*----------------------------------------------------------------
600 |
601 |   CondLikeDown_Gen_GibbsGamma: general n-state model with rate
602 |       variation modeled using discrete gamma with Gibbs resampling
603 |
604 -----------------------------------------------------------------*/
CondLikeDown_Gen_GibbsGamma(TreeNode * p,int division,int chain)605 int CondLikeDown_Gen_GibbsGamma (TreeNode *p, int division, int chain)
606 {
607     int             a, b, c, i, j, r, *rateCat, shortCut, *lState=NULL, *rState=NULL,
608                     nObsStates, nStates, nStatesSquared, nGammaCats;
609     CLFlt           likeL, likeR, *pL, *pR, *tiPL, *tiPR, *clL, *clR, *clP;
610     ModelInfo       *m;
611 #   if !defined (DEBUG_NOSHORTCUTS)
612     int k, catStart;
613 #   endif
614 
615     /* find model settings for this division and nStates, nStatesSquared */
616     m = &modelSettings[division];
617     nObsStates = m->numStates;
618     nStates = m->numModelStates;
619     nStatesSquared = nStates * nStates;
620 
621     /* flip conditional likelihood space */
622     FlipCondLikeSpace (m, chain, p->index);
623 
624     /* find conditional likelihood pointers */
625     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
626     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
627     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
628 
629     /* find transition probabilities */
630     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
631     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
632 
633     /* find rate category index and number of gamma categories */
634     rateCat = m->tiIndex + chain * m->numChars;
635     nGammaCats = m->numRateCats;
636 
637     /* find likelihoods of site patterns for left branch if terminal */
638     shortCut = 0;
639 #   if !defined (DEBUG_NOSHORTCUTS)
640     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
641         {
642         shortCut |= 1;
643         lState = m->termState[p->left->index];
644         tiPL = pL;
645         for (k=a=0; k<nGammaCats; k++)
646             {
647             catStart = a;
648             for (i=0; i<nObsStates; i++)
649                 for (j=i; j<nStatesSquared; j+=nStates)
650                     preLikeL[a++] = tiPL[j];
651             for (b=1; b<nStates/nObsStates; b++)
652                 {
653                 a = catStart;
654                 for (i=0; i<nObsStates; i++)
655                     {
656                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
657                         preLikeL[a++] += tiPL[j];
658                     }
659                 }
660             /* for ambiguous */
661             for (i=0; i<nStates; i++)
662                 preLikeL[a++] = 1.0;
663             tiPL += nStatesSquared;
664             }
665         }
666 
667     /* find likelihoods of site patterns for right branch if terminal */
668     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
669         {
670         shortCut |= 2;
671         rState = m->termState[p->right->index];
672         tiPR = pR;
673         for (k=a=0; k<nGammaCats; k++)
674             {
675             catStart = a;
676             for (i=0; i<nObsStates; i++)
677                 for (j=i; j<nStatesSquared; j+=nStates)
678                     preLikeR[a++] = tiPR[j];
679             for (b=1; b<nStates/nObsStates; b++)
680                 {
681                 a = catStart;
682                 for (i=0; i<nObsStates; i++)
683                     {
684                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
685                         preLikeR[a++] += tiPR[j];
686                     }
687                 }
688             /* for ambiguous */
689             for (i=0; i<nStates; i++)
690                 preLikeR[a++] = 1.0;
691             tiPR += nStatesSquared;
692             }
693         }
694 #   endif
695 
696     switch (shortCut)
697         {
698         case 0:
699             for (c=0; c<m->numChars; c++)
700                 {
701                 r = (*rateCat++);
702                 if (r < nGammaCats)
703                     {
704                     tiPL = pL + r*nStatesSquared;
705                     tiPR = pR + r*nStatesSquared;
706                     for (i=0; i<nStates; i++)
707                         {
708                         likeL = likeR = 0.0;
709                         for (j=0; j<nStates; j++)
710                             {
711                             likeL += (*tiPL++) * clL[j];
712                             likeR += (*tiPR++) * clR[j];
713                             }
714                         *(clP++) = likeL * likeR;
715                         }
716                     }
717                 else
718                     clP += nStates;
719                 clL += nStates;
720                 clR += nStates;
721                 }
722             break;
723         case 1:
724             for (c=0; c<m->numChars; c++)
725                 {
726                 r = (*rateCat++);
727                 if (r < nGammaCats)
728                     {
729                     tiPR = pR + r*nStatesSquared;
730                     a = lState[c] + r*(nStatesSquared+nStates);
731                     for (i=0; i<nStates; i++)
732                         {
733                         likeR = 0.0;
734                         for (j=0; j<nStates; j++)
735                             {
736                             likeR += (*tiPR++)*clR[j];
737                             }
738                         *(clP++) = preLikeL[a++] * likeR;
739                         }
740                     }
741                 else
742                     clP += nStates;
743                 clR += nStates;
744                 }
745             break;
746         case 2:
747             for (c=0; c<m->numChars; c++)
748                 {
749                 r = (*rateCat++);
750                 if (r < nGammaCats)
751                     {
752                     tiPL = pL + r*nStatesSquared;
753                     a = rState[c] + r*(nStatesSquared+nStates);
754                     for (i=0; i<nStates; i++)
755                         {
756                         likeL = 0.0;
757                         for (j=0; j<nStates; j++)
758                             {
759                             likeL += (*tiPL++)*clL[j];
760                             }
761                         *(clP++) = preLikeR[a++] * likeL;
762                         }
763                     }
764                 else
765                     clP += nStates;
766                 clL += nStates;
767                 }
768             break;
769         case 3:
770             for (c=0; c<m->numChars; c++)
771                 {
772                 r = (*rateCat++);
773                 if (r < nGammaCats)
774                     {
775                     a = lState[c] + r*(nStatesSquared+nStates);
776                     b = rState[c] + r*(nStatesSquared+nStates);
777                     for (i=0; i<nStates; i++)
778                         *(clP++) = preLikeL[a++]*preLikeR[b++];
779                     }
780                 else
781                     clP += nStates;
782                 }
783             break;
784         }
785 
786     return NO_ERROR;
787 }
788 
789 
790 /*----------------------------------------------------------------
791 |
792 |   CondLikeDown_NUC4: 4by4 nucleotide model with or without rate
793 |       variation
794 |
795 -----------------------------------------------------------------*/
CondLikeDown_NUC4(TreeNode * p,int division,int chain)796 int CondLikeDown_NUC4 (TreeNode *p, int division, int chain)
797 {
798     int             c, h, i, j, k, shortCut, *lState=NULL, *rState=NULL;
799     CLFlt           *clL, *clR, *clP, *pL, *pR, *tiPL, *tiPR;
800     ModelInfo       *m;
801 
802     m = &modelSettings[division];
803 
804     /* flip space so that we do not overwrite old cond likes */
805     FlipCondLikeSpace (m, chain, p->index);
806 
807     /* find conditional likelihood pointers */
808     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
809     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
810     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
811 
812     /* find transition probabilities */
813     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
814     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
815 
816     /* find likelihoods of site patterns for left branch if terminal */
817     shortCut = 0;
818 #   if !defined (DEBUG_NOSHORTCUTS)
819     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
820         {
821         shortCut |= 1;
822         lState = m->termState[p->left->index];
823         tiPL = pL;
824         for (k=j=0; k<m->numRateCats; k++)
825             {
826             for (i=0; i<4; i++)
827                 {
828                 preLikeL[j++] = tiPL[0];
829                 preLikeL[j++] = tiPL[4];
830                 preLikeL[j++] = tiPL[8];
831                 preLikeL[j++] = tiPL[12];
832                 tiPL++;
833                 }
834             /* for ambiguous */
835             for (i=0; i<4; i++)
836                 preLikeL[j++] = 1.0;
837             tiPL += 12;
838             }
839         }
840 
841     /* find likelihoods of site patterns for right branch if terminal */
842     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
843         {
844         shortCut |= 2;
845         rState = m->termState[p->right->index];
846         tiPR = pR;
847         for (k=j=0; k<m->numRateCats; k++)
848             {
849             for (i=0; i<4; i++)
850                 {
851                 preLikeR[j++] = tiPR[0];
852                 preLikeR[j++] = tiPR[4];
853                 preLikeR[j++] = tiPR[8];
854                 preLikeR[j++] = tiPR[12];
855                 tiPR++;
856                 }
857             /* for ambiguous */
858             for (i=0; i<4; i++)
859                 preLikeR[j++] = 1.0;
860             tiPR += 12;
861             }
862         }
863 #   endif
864 
865     switch (shortCut)
866         {
867         case 0:
868             tiPL = pL;
869             tiPR = pR;
870             for (k=h=0; k<m->numRateCats; k++)
871                 {
872                 for (c=0; c<m->numChars; c++)
873                     {
874                     clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
875                                 *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T]);
876                     clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
877                                 *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T]);
878                     clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
879                                 *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T]);
880                     clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
881                                 *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T]);
882                     clL += 4;
883                     clR += 4;
884                     }
885                 tiPL += 16;
886                 tiPR += 16;
887                 }
888             break;
889         case 1:
890             tiPR = pR;
891             for (k=h=0; k<m->numRateCats; k++)
892                 {
893                 for (c=0; c<m->numChars; c++)
894                     {
895                     i = lState[c] + k*20;
896                     clP[h++] =   preLikeL[i++]
897                                 *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T]);
898                     clP[h++] =   preLikeL[i++]
899                                 *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T]);
900                     clP[h++] =   preLikeL[i++]
901                                 *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T]);
902                     clP[h++] =   preLikeL[i++]
903                                 *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T]);
904                     clR += 4;
905                     }
906                 tiPR += 16;
907                 }
908             break;
909         case 2:
910             tiPL = pL;
911             for (k=h=0; k<m->numRateCats; k++)
912                 {
913                 for (c=0; c<m->numChars; c++)
914                     {
915                     i = rState[c] + k*20;
916                     clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
917                                 *preLikeR[i++];
918                     clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
919                                 *preLikeR[i++];
920                     clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
921                                 *preLikeR[i++];
922                     clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
923                                 *preLikeR[i++];
924                     clL += 4;
925                     }
926                 tiPL += 16;
927                 }
928             break;
929         case 3:
930             for (k=h=0; k<m->numRateCats; k++)
931                 {
932                 for (c=0; c<m->numChars; c++)
933                     {
934                     i = j = k*20;
935                     i += lState[c];
936                     j += rState[c];
937                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
938                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
939                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
940                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
941                     }
942                 }
943         }
944 
945     return NO_ERROR;
946 }
947 
948 
949 /*----------------------------------------------------------------
950 |
951 |   CondLikeDown_NUC4_GibbsGamma: 4by4 nucleotide model with rate
952 |       variation approximated using Gibbs sampling of gamma
953 |
954 -----------------------------------------------------------------*/
CondLikeDown_NUC4_GibbsGamma(TreeNode * p,int division,int chain)955 int CondLikeDown_NUC4_GibbsGamma (TreeNode *p, int division, int chain)
956 {
957     int             c, h, i, j, r, *rateCat, shortCut, *lState=NULL, *rState=NULL,
958                     nGammaCats;
959     CLFlt           *clL, *clR, *clP, *pL, *pR, *tiPL, *tiPR;
960     ModelInfo       *m;
961 #   if !defined (DEBUG_NOSHORTCUTS)
962     int k;
963 #   endif
964 
965     m = &modelSettings[division];
966 
967     /* flip conditional likelihood space */
968     FlipCondLikeSpace (m, chain, p->index);
969 
970     /* find conditional likelihood pointers */
971     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
972     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
973     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
974 
975     /* find transition probabilities */
976     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
977     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
978 
979     /* find rate category index  and number of gamma categories */
980     rateCat = m->tiIndex + chain * m->numChars;
981     nGammaCats = m->numRateCats;
982 
983     /* find likelihoods of site patterns for left branch if terminal */
984     shortCut = 0;
985 #   if !defined (DEBUG_NOSHORTCUTS)
986     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
987         {
988         shortCut |= 1;
989         lState = m->termState[p->left->index];
990         tiPL = pL;
991         for (k=j=0; k<nGammaCats; k++)
992             {
993             for (i=0; i<4; i++)
994                 {
995                 preLikeL[j++] = tiPL[0];
996                 preLikeL[j++] = tiPL[4];
997                 preLikeL[j++] = tiPL[8];
998                 preLikeL[j++] = tiPL[12];
999                 tiPL++;
1000                 }
1001             /* for ambiguous */
1002             for (i=0; i<4; i++)
1003                 preLikeL[j++] = 1.0;
1004             tiPL += 12;
1005             }
1006         }
1007 
1008     /* find likelihoods of site patterns for right branch if terminal */
1009     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
1010         {
1011         shortCut |= 2;
1012         rState =  m->termState[p->right->index];
1013         tiPR = pR;
1014         for (k=j=0; k<nGammaCats; k++)
1015             {
1016             for (i=0; i<4; i++)
1017                 {
1018                 preLikeR[j++] = tiPR[0];
1019                 preLikeR[j++] = tiPR[4];
1020                 preLikeR[j++] = tiPR[8];
1021                 preLikeR[j++] = tiPR[12];
1022                 tiPR++;
1023                 }
1024             /* for ambiguous */
1025             for (i=0; i<4; i++)
1026                 preLikeR[j++] = 1.0;
1027             tiPR += 12;
1028             }
1029         }
1030 #   endif
1031 
1032     switch (shortCut)
1033         {
1034         case 0:
1035             for (c=h=0; c<m->numChars; c++)
1036                 {
1037                 r = rateCat[c];
1038                 if (r < nGammaCats)
1039                     {
1040                     tiPL = pL + r * 16;
1041                     tiPR = pR + r * 16;
1042                     clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
1043                                 *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T]);
1044                     clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
1045                                 *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T]);
1046                     clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
1047                                 *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T]);
1048                     clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
1049                                 *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T]);
1050                     }
1051                 else
1052                     h += 4;
1053                 clL += 4;
1054                 clR += 4;
1055                 }
1056             break;
1057         case 1:
1058             for (c=h=0; c<m->numChars; c++)
1059                 {
1060                 r = rateCat[c];
1061                 if (r < nGammaCats)
1062                     {
1063                     tiPR = pR + r * 16;
1064                     i = lState[c] + r * 20;
1065                     clP[h++] =   preLikeL[i++]
1066                                 *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T]);
1067                     clP[h++] =   preLikeL[i++]
1068                                 *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T]);
1069                     clP[h++] =   preLikeL[i++]
1070                                 *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T]);
1071                     clP[h++] =   preLikeL[i++]
1072                                 *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T]);
1073                     }
1074                 else
1075                     h += 4;
1076                 clR += 4;
1077                 }
1078             break;
1079         case 2:
1080             for (c=h=0; c<m->numChars; c++)
1081                 {
1082                 r = rateCat[c];
1083                 if (r < nGammaCats)
1084                     {
1085                     tiPL = pL + r * 16;
1086                     i = rState[c] + r * 20;
1087                     clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
1088                                 *preLikeR[i++];
1089                     clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
1090                                 *preLikeR[i++];
1091                     clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
1092                                 *preLikeR[i++];
1093                     clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
1094                                 *preLikeR[i++];
1095                     }
1096                 else
1097                     h += 4;
1098                 clL += 4;
1099                 }
1100             break;
1101         case 3:
1102             for (c=h=0; c<m->numChars; c++)
1103                 {
1104                 r = rateCat[c];
1105                 if (r < nGammaCats)
1106                     {
1107                     i = lState[c] + r * 20;
1108                     j = rState[c] + r * 20;
1109                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
1110                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
1111                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
1112                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
1113                     }
1114                 else
1115                     h += 4;
1116                 }
1117             break;
1118         }
1119 
1120     return NO_ERROR;
1121 }
1122 
1123 
1124 #if defined (FMA_ENABLED)
1125 /*----------------------------------------------------------------
1126  |
1127  |   CondLikeDown_NUC4_FMA: 4by4 nucleotide model with or without rate
1128  |       variation, using AVX + FMA instructions
1129  |
1130  -----------------------------------------------------------------*/
CondLikeDown_NUC4_FMA(TreeNode * p,int division,int chain)1131 int CondLikeDown_NUC4_FMA (TreeNode *p, int division, int chain)
1132 {
1133     int             c, k;
1134     CLFlt           *pL, *pR, *tiPL, *tiPR;
1135     __m256          *clL, *clR, *clP;
1136     __m256          m1, m2, m3, m4;
1137     ModelInfo       *m;
1138 
1139     m = &modelSettings[division];
1140 
1141     /* flip state of node so that we are not overwriting old cond likes */
1142     FlipCondLikeSpace (m, chain, p->index);
1143 
1144     /* find conditional likelihood pointers */
1145     clL = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1146     clR = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
1147     clP = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
1148 
1149     /* find transition probabilities */
1150     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1151     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1152 
1153     tiPL = pL;
1154     tiPR = pR;
1155     for (k=0; k<m->numRateCats; k++)
1156     {
1157         for (c=0; c<m->numVecChars; c++)
1158         {
1159             m1 = _mm256_broadcast_ss (&tiPL[AA]);
1160             m2 = _mm256_broadcast_ss (&tiPR[AA]);
1161             m3 = _mm256_mul_ps (m1, clL[A]);
1162             m4 = _mm256_mul_ps (m2, clR[A]);
1163 
1164             m1 = _mm256_broadcast_ss (&tiPL[AC]);
1165             m2 = _mm256_broadcast_ss (&tiPR[AC]);
1166             m3 = _mm256_fmadd_ps (m1, clL[C], m3);
1167             m4 = _mm256_fmadd_ps (m2, clR[C], m4);
1168 
1169             m1 = _mm256_broadcast_ss (&tiPL[AG]);
1170             m2 = _mm256_broadcast_ss (&tiPR[AG]);
1171             m3 = _mm256_fmadd_ps (m1, clL[G], m3);
1172             m4 = _mm256_fmadd_ps (m2, clR[G], m4);
1173 
1174             m1 = _mm256_broadcast_ss (&tiPL[AT]);
1175             m2 = _mm256_broadcast_ss (&tiPR[AT]);
1176             m3 = _mm256_fmadd_ps (m1, clL[T], m3);
1177             m4 = _mm256_fmadd_ps (m2, clR[T], m4);
1178 
1179             *clP++ = _mm256_mul_ps (m3, m4);
1180 
1181             m1 = _mm256_broadcast_ss (&tiPL[CA]);
1182             m2 = _mm256_broadcast_ss (&tiPR[CA]);
1183             m3 = _mm256_mul_ps (m1, clL[A]);
1184             m4 = _mm256_mul_ps (m2, clR[A]);
1185 
1186             m1 = _mm256_broadcast_ss (&tiPL[CC]);
1187             m2 = _mm256_broadcast_ss (&tiPR[CC]);
1188             m3 = _mm256_fmadd_ps (m1, clL[C], m3);
1189             m4 = _mm256_fmadd_ps (m2, clR[C], m4);
1190 
1191             m1 = _mm256_broadcast_ss (&tiPL[CG]);
1192             m2 = _mm256_broadcast_ss (&tiPR[CG]);
1193             m3 = _mm256_fmadd_ps (m1, clL[G], m3);
1194             m4 = _mm256_fmadd_ps (m2, clR[G], m4);
1195 
1196             m1 = _mm256_broadcast_ss (&tiPL[CT]);
1197             m2 = _mm256_broadcast_ss (&tiPR[CT]);
1198             m3 = _mm256_fmadd_ps (m1, clL[T], m3);
1199             m4 = _mm256_fmadd_ps (m2, clR[T], m4);
1200 
1201             *clP++ = _mm256_mul_ps (m3, m4);
1202 
1203             m1 = _mm256_broadcast_ss (&tiPL[GA]);
1204             m2 = _mm256_broadcast_ss (&tiPR[GA]);
1205             m3 = _mm256_mul_ps (m1, clL[A]);
1206             m4 = _mm256_mul_ps (m2, clR[A]);
1207 
1208             m1 = _mm256_broadcast_ss (&tiPL[GC]);
1209             m2 = _mm256_broadcast_ss (&tiPR[GC]);
1210             m3 = _mm256_fmadd_ps (m1, clL[C], m3);
1211             m4 = _mm256_fmadd_ps (m2, clR[C], m4);
1212 
1213             m1 = _mm256_broadcast_ss (&tiPL[GG]);
1214             m2 = _mm256_broadcast_ss (&tiPR[GG]);
1215             m3 = _mm256_fmadd_ps (m1, clL[G], m3);
1216             m4 = _mm256_fmadd_ps (m2, clR[G], m4);
1217 
1218             m1 = _mm256_broadcast_ss (&tiPL[GT]);
1219             m2 = _mm256_broadcast_ss (&tiPR[GT]);
1220             m3 = _mm256_fmadd_ps (m1, clL[T], m3);
1221             m4 = _mm256_fmadd_ps (m2, clR[T], m4);
1222 
1223             *clP++ = _mm256_mul_ps (m3, m4);
1224 
1225             m1 = _mm256_broadcast_ss (&tiPL[TA]);
1226             m2 = _mm256_broadcast_ss (&tiPR[TA]);
1227             m3 = _mm256_mul_ps (m1, clL[A]);
1228             m4 = _mm256_mul_ps (m2, clR[A]);
1229 
1230             m1 = _mm256_broadcast_ss (&tiPL[TC]);
1231             m2 = _mm256_broadcast_ss (&tiPR[TC]);
1232             m3 = _mm256_fmadd_ps (m1, clL[C], m3);
1233             m4 = _mm256_fmadd_ps (m2, clR[C], m4);
1234 
1235             m1 = _mm256_broadcast_ss (&tiPL[TG]);
1236             m2 = _mm256_broadcast_ss (&tiPR[TG]);
1237             m3 = _mm256_fmadd_ps (m1, clL[G], m3);
1238             m4 = _mm256_fmadd_ps (m2, clR[G], m4);
1239 
1240             m1 = _mm256_broadcast_ss (&tiPL[TT]);
1241             m2 = _mm256_broadcast_ss (&tiPR[TT]);
1242             m3 = _mm256_fmadd_ps (m1, clL[T], m3);
1243             m4 = _mm256_fmadd_ps (m2, clR[T], m4);
1244 
1245             *clP++ = _mm256_mul_ps (m3, m4);
1246 
1247             clL += 4;
1248             clR += 4;
1249         }
1250         tiPL += 16;
1251         tiPR += 16;
1252     }
1253 
1254     return NO_ERROR;
1255 
1256 }
1257 #endif
1258 
1259 
1260 #if defined (AVX_ENABLED)
1261 /*----------------------------------------------------------------
1262  |
1263  |   CondLikeDown_NUC4_AVX: 4by4 nucleotide model with or without rate
1264  |       variation, using AVX instructions
1265  |
1266  -----------------------------------------------------------------*/
CondLikeDown_NUC4_AVX(TreeNode * p,int division,int chain)1267 int CondLikeDown_NUC4_AVX (TreeNode *p, int division, int chain)
1268 {
1269     int             c, k;
1270     CLFlt           *pL, *pR, *tiPL, *tiPR;
1271     __m256          *clL, *clR, *clP;
1272     __m256          m1, m2, m3, m4, m5, m6;
1273     ModelInfo       *m;
1274 
1275     m = &modelSettings[division];
1276 
1277     /* flip state of node so that we are not overwriting old cond likes */
1278     FlipCondLikeSpace (m, chain, p->index);
1279 
1280     /* find conditional likelihood pointers */
1281     clL = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1282     clR = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
1283     clP = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
1284 
1285     /* find transition probabilities */
1286     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1287     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1288 
1289     tiPL = pL;
1290     tiPR = pR;
1291     for (k=0; k<m->numRateCats; k++)
1292     {
1293         for (c=0; c<m->numVecChars; c++)
1294         {
1295             m1 = _mm256_broadcast_ss (&tiPL[AA]);
1296             m2 = _mm256_broadcast_ss (&tiPR[AA]);
1297             m5 = _mm256_mul_ps (m1, clL[A]);
1298             m6 = _mm256_mul_ps (m2, clR[A]);
1299 
1300             m1 = _mm256_broadcast_ss (&tiPL[AC]);
1301             m2 = _mm256_broadcast_ss (&tiPR[AC]);
1302             m3 = _mm256_mul_ps (m1, clL[C]);
1303             m4 = _mm256_mul_ps (m2, clR[C]);
1304             m5 = _mm256_add_ps (m3, m5);
1305             m6 = _mm256_add_ps (m4, m6);
1306 
1307             m1 = _mm256_broadcast_ss (&tiPL[AG]);
1308             m2 = _mm256_broadcast_ss (&tiPR[AG]);
1309             m3 = _mm256_mul_ps (m1, clL[G]);
1310             m4 = _mm256_mul_ps (m2, clR[G]);
1311             m5 = _mm256_add_ps (m3, m5);
1312             m6 = _mm256_add_ps (m4, m6);
1313 
1314             m1 = _mm256_broadcast_ss (&tiPL[AT]);
1315             m2 = _mm256_broadcast_ss (&tiPR[AT]);
1316             m3 = _mm256_mul_ps (m1, clL[T]);
1317             m4 = _mm256_mul_ps (m2, clR[T]);
1318             m5 = _mm256_add_ps (m3, m5);
1319             m6 = _mm256_add_ps (m4, m6);
1320 
1321             *clP++ = _mm256_mul_ps (m5, m6);
1322 
1323             m1 = _mm256_broadcast_ss (&tiPL[CA]);
1324             m2 = _mm256_broadcast_ss (&tiPR[CA]);
1325             m5 = _mm256_mul_ps (m1, clL[A]);
1326             m6 = _mm256_mul_ps (m2, clR[A]);
1327 
1328             m1 = _mm256_broadcast_ss (&tiPL[CC]);
1329             m2 = _mm256_broadcast_ss (&tiPR[CC]);
1330             m3 = _mm256_mul_ps (m1, clL[C]);
1331             m4 = _mm256_mul_ps (m2, clR[C]);
1332             m5 = _mm256_add_ps (m3, m5);
1333             m6 = _mm256_add_ps (m4, m6);
1334 
1335             m1 = _mm256_broadcast_ss (&tiPL[CG]);
1336             m2 = _mm256_broadcast_ss (&tiPR[CG]);
1337             m3 = _mm256_mul_ps (m1, clL[G]);
1338             m4 = _mm256_mul_ps (m2, clR[G]);
1339             m5 = _mm256_add_ps (m3, m5);
1340             m6 = _mm256_add_ps (m4, m6);
1341 
1342             m1 = _mm256_broadcast_ss (&tiPL[CT]);
1343             m2 = _mm256_broadcast_ss (&tiPR[CT]);
1344             m3 = _mm256_mul_ps (m1, clL[T]);
1345             m4 = _mm256_mul_ps (m2, clR[T]);
1346             m5 = _mm256_add_ps (m3, m5);
1347             m6 = _mm256_add_ps (m4, m6);
1348 
1349             *clP++ = _mm256_mul_ps (m5, m6);
1350 
1351             m1 = _mm256_broadcast_ss (&tiPL[GA]);
1352             m2 = _mm256_broadcast_ss (&tiPR[GA]);
1353             m5 = _mm256_mul_ps (m1, clL[A]);
1354             m6 = _mm256_mul_ps (m2, clR[A]);
1355 
1356             m1 = _mm256_broadcast_ss (&tiPL[GC]);
1357             m2 = _mm256_broadcast_ss (&tiPR[GC]);
1358             m3 = _mm256_mul_ps (m1, clL[C]);
1359             m4 = _mm256_mul_ps (m2, clR[C]);
1360             m5 = _mm256_add_ps (m3, m5);
1361             m6 = _mm256_add_ps (m4, m6);
1362 
1363             m1 = _mm256_broadcast_ss (&tiPL[GG]);
1364             m2 = _mm256_broadcast_ss (&tiPR[GG]);
1365             m3 = _mm256_mul_ps (m1, clL[G]);
1366             m4 = _mm256_mul_ps (m2, clR[G]);
1367             m5 = _mm256_add_ps (m3, m5);
1368             m6 = _mm256_add_ps (m4, m6);
1369 
1370             m1 = _mm256_broadcast_ss (&tiPL[GT]);
1371             m2 = _mm256_broadcast_ss (&tiPR[GT]);
1372             m3 = _mm256_mul_ps (m1, clL[T]);
1373             m4 = _mm256_mul_ps (m2, clR[T]);
1374             m5 = _mm256_add_ps (m3, m5);
1375             m6 = _mm256_add_ps (m4, m6);
1376 
1377             *clP++ = _mm256_mul_ps (m5, m6);
1378 
1379             m1 = _mm256_broadcast_ss (&tiPL[TA]);
1380             m2 = _mm256_broadcast_ss (&tiPR[TA]);
1381             m5 = _mm256_mul_ps (m1, clL[A]);
1382             m6 = _mm256_mul_ps (m2, clR[A]);
1383 
1384             m1 = _mm256_broadcast_ss (&tiPL[TC]);
1385             m2 = _mm256_broadcast_ss (&tiPR[TC]);
1386             m3 = _mm256_mul_ps (m1, clL[C]);
1387             m4 = _mm256_mul_ps (m2, clR[C]);
1388             m5 = _mm256_add_ps (m3, m5);
1389             m6 = _mm256_add_ps (m4, m6);
1390 
1391             m1 = _mm256_broadcast_ss (&tiPL[TG]);
1392             m2 = _mm256_broadcast_ss (&tiPR[TG]);
1393             m3 = _mm256_mul_ps (m1, clL[G]);
1394             m4 = _mm256_mul_ps (m2, clR[G]);
1395             m5 = _mm256_add_ps (m3, m5);
1396             m6 = _mm256_add_ps (m4, m6);
1397 
1398             m1 = _mm256_broadcast_ss (&tiPL[TT]);
1399             m2 = _mm256_broadcast_ss (&tiPR[TT]);
1400             m3 = _mm256_mul_ps (m1, clL[T]);
1401             m4 = _mm256_mul_ps (m2, clR[T]);
1402             m5 = _mm256_add_ps (m3, m5);
1403             m6 = _mm256_add_ps (m4, m6);
1404 
1405             *clP++ = _mm256_mul_ps (m5, m6);
1406 
1407             clL += 4;
1408             clR += 4;
1409         }
1410         tiPL += 16;
1411         tiPR += 16;
1412     }
1413 
1414     return NO_ERROR;
1415 
1416 }
1417 #endif
1418 
1419 
1420 #if defined (SSE_ENABLED)
1421 /*----------------------------------------------------------------
1422 |
1423 |   CondLikeDown_NUC4_SSE: 4by4 nucleotide model with or without rate
1424 |       variation, using SSE instructions
1425 |
1426 -----------------------------------------------------------------*/
CondLikeDown_NUC4_SSE(TreeNode * p,int division,int chain)1427 int CondLikeDown_NUC4_SSE (TreeNode *p, int division, int chain)
1428 {
1429     int             c, k;
1430     CLFlt           *pL, *pR, *tiPL, *tiPR;
1431     __m128          *clL, *clR, *clP;
1432     __m128          m1, m2, m3, m4, m5, m6;
1433     ModelInfo       *m;
1434 
1435     m = &modelSettings[division];
1436 
1437     /* flip state of node so that we are not overwriting old cond likes */
1438     FlipCondLikeSpace (m, chain, p->index);
1439 
1440     /* find conditional likelihood pointers */
1441     clL = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1442     clR = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
1443     clP = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
1444 
1445     /* find transition probabilities */
1446     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1447     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1448 
1449     tiPL = pL;
1450     tiPR = pR;
1451     for (k=0; k<m->numRateCats; k++)
1452         {
1453         for (c=0; c<m->numVecChars; c++)
1454             {
1455             m1 = _mm_load1_ps (&tiPL[AA]);
1456             m2 = _mm_load1_ps (&tiPR[AA]);
1457             m5 = _mm_mul_ps (m1, clL[A]);
1458             m6 = _mm_mul_ps (m2, clR[A]);
1459 
1460             m1 = _mm_load1_ps (&tiPL[AC]);
1461             m2 = _mm_load1_ps (&tiPR[AC]);
1462             m3 = _mm_mul_ps (m1, clL[C]);
1463             m4 = _mm_mul_ps (m2, clR[C]);
1464             m5 = _mm_add_ps (m3, m5);
1465             m6 = _mm_add_ps (m4, m6);
1466 
1467             m1 = _mm_load1_ps (&tiPL[AG]);
1468             m2 = _mm_load1_ps (&tiPR[AG]);
1469             m3 = _mm_mul_ps (m1, clL[G]);
1470             m4 = _mm_mul_ps (m2, clR[G]);
1471             m5 = _mm_add_ps (m3, m5);
1472             m6 = _mm_add_ps (m4, m6);
1473 
1474             m1 = _mm_load1_ps (&tiPL[AT]);
1475             m2 = _mm_load1_ps (&tiPR[AT]);
1476             m3 = _mm_mul_ps (m1, clL[T]);
1477             m4 = _mm_mul_ps (m2, clR[T]);
1478             m5 = _mm_add_ps (m3, m5);
1479             m6 = _mm_add_ps (m4, m6);
1480 
1481             *clP++ = _mm_mul_ps (m5, m6);
1482 
1483             m1 = _mm_load1_ps (&tiPL[CA]);
1484             m2 = _mm_load1_ps (&tiPR[CA]);
1485             m5 = _mm_mul_ps (m1, clL[A]);
1486             m6 = _mm_mul_ps (m2, clR[A]);
1487 
1488             m1 = _mm_load1_ps (&tiPL[CC]);
1489             m2 = _mm_load1_ps (&tiPR[CC]);
1490             m3 = _mm_mul_ps (m1, clL[C]);
1491             m4 = _mm_mul_ps (m2, clR[C]);
1492             m5 = _mm_add_ps (m3, m5);
1493             m6 = _mm_add_ps (m4, m6);
1494 
1495             m1 = _mm_load1_ps (&tiPL[CG]);
1496             m2 = _mm_load1_ps (&tiPR[CG]);
1497             m3 = _mm_mul_ps (m1, clL[G]);
1498             m4 = _mm_mul_ps (m2, clR[G]);
1499             m5 = _mm_add_ps (m3, m5);
1500             m6 = _mm_add_ps (m4, m6);
1501 
1502             m1 = _mm_load1_ps (&tiPL[CT]);
1503             m2 = _mm_load1_ps (&tiPR[CT]);
1504             m3 = _mm_mul_ps (m1, clL[T]);
1505             m4 = _mm_mul_ps (m2, clR[T]);
1506             m5 = _mm_add_ps (m3, m5);
1507             m6 = _mm_add_ps (m4, m6);
1508 
1509             *clP++ = _mm_mul_ps (m5, m6);
1510 
1511             m1 = _mm_load1_ps (&tiPL[GA]);
1512             m2 = _mm_load1_ps (&tiPR[GA]);
1513             m5 = _mm_mul_ps (m1, clL[A]);
1514             m6 = _mm_mul_ps (m2, clR[A]);
1515 
1516             m1 = _mm_load1_ps (&tiPL[GC]);
1517             m2 = _mm_load1_ps (&tiPR[GC]);
1518             m3 = _mm_mul_ps (m1, clL[C]);
1519             m4 = _mm_mul_ps (m2, clR[C]);
1520             m5 = _mm_add_ps (m3, m5);
1521             m6 = _mm_add_ps (m4, m6);
1522 
1523             m1 = _mm_load1_ps (&tiPL[GG]);
1524             m2 = _mm_load1_ps (&tiPR[GG]);
1525             m3 = _mm_mul_ps (m1, clL[G]);
1526             m4 = _mm_mul_ps (m2, clR[G]);
1527             m5 = _mm_add_ps (m3, m5);
1528             m6 = _mm_add_ps (m4, m6);
1529 
1530             m1 = _mm_load1_ps (&tiPL[GT]);
1531             m2 = _mm_load1_ps (&tiPR[GT]);
1532             m3 = _mm_mul_ps (m1, clL[T]);
1533             m4 = _mm_mul_ps (m2, clR[T]);
1534             m5 = _mm_add_ps (m3, m5);
1535             m6 = _mm_add_ps (m4, m6);
1536 
1537             *clP++ = _mm_mul_ps (m5, m6);
1538 
1539             m1 = _mm_load1_ps (&tiPL[TA]);
1540             m2 = _mm_load1_ps (&tiPR[TA]);
1541             m5 = _mm_mul_ps (m1, clL[A]);
1542             m6 = _mm_mul_ps (m2, clR[A]);
1543 
1544             m1 = _mm_load1_ps (&tiPL[TC]);
1545             m2 = _mm_load1_ps (&tiPR[TC]);
1546             m3 = _mm_mul_ps (m1, clL[C]);
1547             m4 = _mm_mul_ps (m2, clR[C]);
1548             m5 = _mm_add_ps (m3, m5);
1549             m6 = _mm_add_ps (m4, m6);
1550 
1551             m1 = _mm_load1_ps (&tiPL[TG]);
1552             m2 = _mm_load1_ps (&tiPR[TG]);
1553             m3 = _mm_mul_ps (m1, clL[G]);
1554             m4 = _mm_mul_ps (m2, clR[G]);
1555             m5 = _mm_add_ps (m3, m5);
1556             m6 = _mm_add_ps (m4, m6);
1557 
1558             m1 = _mm_load1_ps (&tiPL[TT]);
1559             m2 = _mm_load1_ps (&tiPR[TT]);
1560             m3 = _mm_mul_ps (m1, clL[T]);
1561             m4 = _mm_mul_ps (m2, clR[T]);
1562             m5 = _mm_add_ps (m3, m5);
1563             m6 = _mm_add_ps (m4, m6);
1564 
1565             *clP++ = _mm_mul_ps (m5, m6);
1566 
1567             clL += 4;
1568             clR += 4;
1569             }
1570         tiPL += 16;
1571         tiPR += 16;
1572         }
1573 
1574     return NO_ERROR;
1575 
1576 }
1577 #endif
1578 
1579 
1580 #if !defined (SSE_ENABLED) || 1
1581 /*----------------------------------------------------------------
1582 |
1583 |   CondLikeDown_NY98: codon model with omega variation
1584 |
1585 -----------------------------------------------------------------*/
CondLikeDown_NY98(TreeNode * p,int division,int chain)1586 int CondLikeDown_NY98 (TreeNode *p, int division, int chain)
1587 {
1588     int             a, b, c, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, nStates, nStatesSquared;
1589     CLFlt           likeL, likeR, *pL, *pR, *tiPL, *tiPR, *clL, *clR, *clP;
1590     ModelInfo       *m;
1591 
1592     /* find model settings for this division and nStates, nStatesSquared */
1593     m = &modelSettings[division];
1594     nStates = m->numModelStates;
1595     nStatesSquared = nStates * nStates;
1596 
1597     /* Flip conditional likelihood space */
1598     FlipCondLikeSpace (m, chain, p->index);
1599 
1600     /* find conditional likelihood pointers */
1601     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1602     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
1603     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
1604 
1605     /* find transition probabilities */
1606     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1607     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1608 
1609     /* find likelihoods of site patterns for left branch if terminal */
1610     shortCut = 0;
1611 #   if !defined (DEBUG_NOSHORTCUTS)
1612     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
1613         {
1614         shortCut |= 1;
1615         lState = m->termState[p->left->index];
1616         tiPL = pL;
1617         for (k=a=0; k<m->numOmegaCats; k++)
1618             {
1619             for (i=0; i<nStates; i++)
1620                 for (j=i; j<nStatesSquared; j+=nStates)
1621                     preLikeL[a++] = tiPL[j];
1622             /* for ambiguous */
1623             for (i=0; i<nStates; i++)
1624                 preLikeL[a++] = 1.0;
1625             tiPL += nStatesSquared;
1626             }
1627         }
1628 
1629     /* find likelihoods of site patterns for right branch if terminal */
1630     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
1631         {
1632         shortCut |= 2;
1633         rState = m->termState[p->right->index];
1634         tiPR = pR;
1635         for (k=a=0; k<m->numOmegaCats; k++)
1636             {
1637             for (i=0; i<nStates; i++)
1638                 for (j=i; j<nStatesSquared; j+=nStates)
1639                     preLikeR[a++] = tiPR[j];
1640             /* for ambiguous */
1641             for (i=0; i<nStates; i++)
1642                 preLikeR[a++] = 1.0;
1643             tiPR += nStatesSquared;
1644             }
1645         }
1646 #   endif
1647 
1648     switch (shortCut)
1649         {
1650         case 0:
1651             tiPL = pL;
1652             tiPR = pR;
1653             for (k=0; k<m->numOmegaCats; k++)
1654                 {
1655                 for (c=0; c<m->numChars; c++)
1656                     {
1657                     for (i=h=0; i<nStates; i++)
1658                         {
1659                         likeL = likeR = 0.0;
1660                         for (j=0; j<nStates; j++)
1661                             {
1662                             likeL += tiPL[h]*clL[j];
1663                             likeR += tiPR[h++]*clR[j];
1664                             }
1665                         *(clP++) = likeL * likeR;
1666                         }
1667                     clL += nStates;
1668                     clR += nStates;
1669                     }
1670                 tiPL += nStatesSquared;
1671                 tiPR += nStatesSquared;
1672                 }
1673             break;
1674         case 1:
1675             tiPR = pR;
1676             for (k=0; k<m->numOmegaCats; k++)
1677                 {
1678                 for (c=0; c<m->numChars; c++)
1679                     {
1680                     a = lState[c] + k*(nStatesSquared+nStates);
1681                     for (i=h=0; i<nStates; i++)
1682                         {
1683                         likeR = 0.0;
1684                         for (j=0; j<nStates; j++)
1685                             {
1686                             likeR += tiPR[h++]*clR[j];
1687                             }
1688                         *(clP++) = preLikeL[a++] * likeR;
1689                         }
1690                     clR += nStates;
1691                     }
1692                 tiPR += nStatesSquared;
1693                 }
1694             break;
1695         case 2:
1696             tiPL = pL;
1697             for (k=0; k<m->numOmegaCats; k++)
1698                 {
1699                 for (c=0; c<m->numChars; c++)
1700                     {
1701                     a = rState[c] + k*(nStatesSquared+nStates);
1702                     for (i=h=0; i<nStates; i++)
1703                         {
1704                         likeL = 0.0;
1705                         for (j=0; j<nStates; j++)
1706                             {
1707                             likeL += tiPL[h++]*clL[j];
1708                             }
1709                         *(clP++) = preLikeR[a++] * likeL;
1710                         }
1711                     clL += nStates;
1712                     }
1713                 tiPL += nStatesSquared;
1714                 }
1715             break;
1716         case 3:
1717             for (k=0; k<m->numOmegaCats; k++)
1718                 {
1719                 for (c=0; c<m->numChars; c++)
1720                     {
1721                     a = rState[c] + k*(nStatesSquared+nStates);
1722                     b = lState[c] + k*(nStatesSquared+nStates);
1723                     for (i=0; i<nStates; i++)
1724                         {
1725                         *(clP++) = preLikeR[a++] * preLikeL[b++];
1726                         }
1727                     }
1728                 }
1729             break;
1730         }
1731 
1732     return NO_ERROR;
1733 }
1734 #endif
1735 
1736 
1737 #if defined (SSE_ENABLED)
1738 /*----------------------------------------------------------------
1739 |
1740 |   CondLikeDown_NY98_SSE: codon model with omega variation
1741 |
1742 -----------------------------------------------------------------*/
CondLikeDown_NY98_SSE(TreeNode * p,int division,int chain)1743 int CondLikeDown_NY98_SSE (TreeNode *p, int division, int chain)
1744 {
1745     int             c, c1, h, i, j, k, t, shortCut, *lState=NULL, *rState=NULL, nStates, nStatesSquared;
1746     CLFlt           *pL, *pR, *tiPL, *tiPR;
1747     __m128          *clL, *clR, *clP;
1748     __m128          mTiPL, mTiPR, mL, mR, mAcumL, mAcumR;
1749     ModelInfo       *m;
1750     CLFlt           *preLikeRV[4] = {0};
1751     CLFlt           *preLikeLV[4] = {0};
1752 #   if !defined (DEBUG_NOSHORTCUTS)
1753     int             a;
1754 #   endif
1755 
1756     /* find model settings for this division and nStates, nStatesSquared */
1757     m = &modelSettings[division];
1758     nStates = m->numModelStates;
1759     nStatesSquared = nStates * nStates;
1760 
1761     /* Flip conditional likelihood space */
1762     FlipCondLikeSpace (m, chain, p->index);
1763 
1764     /* find conditional likelihood pointers */
1765     clL = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1766     clR = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->right->index]];
1767     clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index       ]];
1768 
1769     /* find transition probabilities */
1770     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1771     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1772 
1773     /* find likelihoods of site patterns for left branch if terminal */
1774     shortCut = 0;
1775 #   if !defined (DEBUG_NOSHORTCUTS)
1776     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
1777         {
1778         shortCut |= 1;
1779         lState = m->termState[p->left->index];
1780         tiPL = pL;
1781         for (k=a=0; k<m->numOmegaCats; k++)
1782             {
1783             for (i=0; i<nStates; i++)
1784                 for (j=i; j<nStatesSquared; j+=nStates)
1785                     preLikeL[a++] = tiPL[j];
1786             /* for ambiguous */
1787             for (i=0; i<nStates; i++)
1788                 preLikeL[a++] = 1.0;
1789             tiPL += nStatesSquared;
1790             }
1791         }
1792 
1793     /* find likelihoods of site patterns for right branch if terminal */
1794     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
1795         {
1796         shortCut |= 2;
1797         rState = m->termState[p->right->index];
1798         tiPR = pR;
1799         for (k=a=0; k<m->numOmegaCats; k++)
1800             {
1801             for (i=0; i<nStates; i++)
1802                 for (j=i; j<nStatesSquared; j+=nStates)
1803                     preLikeR[a++] = tiPR[j];
1804             /* for ambiguous */
1805             for (i=0; i<nStates; i++)
1806                 preLikeR[a++] = 1.0;
1807             tiPR += nStatesSquared;
1808             }
1809         }
1810 #   endif
1811 
1812     switch (shortCut)
1813         {
1814         case 0:
1815             tiPL = pL;
1816             tiPR = pR;
1817             for (k=0; k<m->numOmegaCats; k++)
1818                 {
1819                 for (c=0; c<m->numVecChars; c++)
1820                     {
1821                     for (i=h=0; i<nStates; i++)
1822                         {
1823                         mAcumL = _mm_setzero_ps();
1824                         mAcumR = _mm_setzero_ps();
1825                         for (j=0; j<nStates; j++)
1826                             {
1827                             mTiPL  = _mm_load1_ps (&tiPL[h]);
1828                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
1829                             mL     = _mm_mul_ps (mTiPL, clL[j]);
1830                             mR     = _mm_mul_ps (mTiPR, clR[j]);
1831                             mAcumL = _mm_add_ps (mL, mAcumL);
1832                             mAcumR = _mm_add_ps (mR, mAcumR);
1833                             }
1834                         *(clP++) = _mm_mul_ps (mAcumL, mAcumR);
1835                         }
1836                     clL += nStates;
1837                     clR += nStates;
1838                     }
1839                 tiPL += nStatesSquared;
1840                 tiPR += nStatesSquared;
1841                 }
1842             break;
1843         case 1:
1844             tiPR = pR;
1845             for (k=0; k<m->numOmegaCats; k++)
1846                 {
1847                 for (c=t=0; c<m->numVecChars; c++)
1848                     {
1849                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
1850                         {
1851                         preLikeLV[c1] = &preLikeL[lState[t] + k*(nStatesSquared+nStates)];
1852                         }
1853                     for (i=h=0; i<nStates; i++)
1854                         {
1855                         assert (m->numFloatsPerVec == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
1856                         mAcumL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
1857                         mAcumR = _mm_setzero_ps();
1858                         for (j=0; j<nStates; j++)
1859                             {
1860                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
1861                             mR     = _mm_mul_ps (mTiPR, clR[j]);
1862                             mAcumR = _mm_add_ps (mR, mAcumR);
1863                             }
1864                         *(clP++) = _mm_mul_ps (mAcumL,mAcumR);
1865                         }
1866                     clR += nStates;
1867                     }
1868                 tiPR += nStatesSquared;
1869                 }
1870             break;
1871         case 2:
1872             tiPL = pL;
1873             for (k=0; k<m->numOmegaCats; k++)
1874                 {
1875                 for (c=t=0; c<m->numVecChars; c++)
1876                     {
1877                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
1878                         {
1879                         preLikeRV[c1] = &preLikeR[rState[t] + k*(nStatesSquared+nStates)];
1880                         }
1881                     for (i=h=0; i<nStates; i++)
1882                         {
1883                         assert (m->numFloatsPerVec == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
1884                         mAcumR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
1885                         mAcumL = _mm_setzero_ps();
1886                         for (j=0; j<nStates; j++)
1887                             {
1888                             mTiPL  = _mm_load1_ps (&tiPL[h++]);
1889                             mL     = _mm_mul_ps (mTiPL, clL[j]);
1890                             mAcumL = _mm_add_ps (mL, mAcumL);
1891                             }
1892                         *(clP++) = _mm_mul_ps (mAcumL,mAcumR);
1893                         }
1894                     clL += nStates;
1895                     }
1896                 tiPL += nStatesSquared;
1897                 }
1898             break;
1899         case 3:
1900             for (k=0; k<m->numOmegaCats; k++)
1901                 {
1902                 for (c=t=0; c<m->numVecChars; c++)
1903                     {
1904                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
1905                         {
1906                         preLikeRV[c1] = &preLikeR[rState[t] + k*(nStatesSquared+nStates)];
1907                         preLikeLV[c1] = &preLikeL[lState[t] + k*(nStatesSquared+nStates)];
1908                         }
1909                     for (i=0; i<nStates; i++)
1910                         {
1911                         assert (m->numFloatsPerVec == 4); /* In the following 2 statments we assume that SSE register can hold exactly 4 ClFlts. */
1912                         mL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
1913                         mR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
1914                         *(clP++) = _mm_mul_ps (mL,mR);
1915                         }
1916                     }
1917                 }
1918             break;
1919         }
1920 
1921     return NO_ERROR;
1922 }
1923 #endif
1924 
1925 
1926 /*----------------------------------------------------------------
1927 |
1928 |   CondLikeDown_Std: variable number of states model
1929 |       with or without rate variation
1930 |
1931 -----------------------------------------------------------------*/
CondLikeDown_Std(TreeNode * p,int division,int chain)1932 int CondLikeDown_Std (TreeNode *p, int division, int chain)
1933 {
1934     int             a, c, h, i, j, k, nStates, nCats, tmp;
1935     CLFlt           *clL, *clR, *clP, *pL, *pR, *tiPL, *tiPR, likeL, likeR;
1936     ModelInfo       *m;
1937 
1938     m = &modelSettings[division];
1939 
1940     /* Flip conditional likelihood space */
1941     FlipCondLikeSpace (m, chain, p->index);
1942 
1943     /* find conditional likelihood pointers */
1944     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1945     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
1946     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
1947 
1948     /* find transition probabilities */
1949     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1950     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1951 
1952     /* Conditional likelihood space is assumed to be arranged in numGammaCats blocks of data. Each block contains all data for one gamma category.
1953     Each gamma cat block consist of numChars sequences of data, each of this sequences corresponds to a character of data matrix.
1954     A sequence consists of nStates for all non-binary data, otherwise length of sequence is nStates*numBetaCats (i.e. 2*numBetaCats) */
1955 
1956     /* calculate ancestral probabilities */
1957     for (k=h=0; k<m->numRateCats; k++)
1958         {
1959         /* calculate ancestral probabilities */
1960         for (c=0; c<m->numChars; c++)
1961             {
1962             nStates = m->nStates[c];
1963 
1964             /* the following lines ensure that nCats is 1 unless */
1965             /* the character is binary and beta categories are used  */
1966             if (nStates == 2)
1967                 nCats = m->numBetaCats;
1968             else
1969                 nCats = 1;
1970 
1971             tmp = k*nStates*nStates; /* tmp contains offset to skip rate cats that already processed*/
1972             tiPL = pL + m->tiIndex[c] + tmp;
1973             tiPR = pR + m->tiIndex[c] + tmp;
1974             tmp = (m->numRateCats-1)*2*2; /* tmp contains size of block of tpi matrices across all rate cats (minus one) for single beta category. Further used only if character is binary to jump to next beta category */
1975 
1976             for (j=0; j<nCats;j++)
1977                 {
1978                 for (a=0; a<nStates; a++)
1979                     {
1980                     likeL = likeR = 0.0;
1981                     for (i=0; i<nStates; i++)
1982                         {
1983                         likeL += *(tiPL++) * clL[i];
1984                         likeR += *(tiPR++) * clR[i];
1985                         }
1986                     clP[h++] = likeL * likeR;
1987                     }
1988                 clL += nStates;
1989                 clR += nStates;
1990 
1991                 tiPL += tmp;
1992                 tiPR += tmp;
1993                 }
1994             }
1995         }
1996 
1997     return NO_ERROR;
1998 }
1999 
2000 
2001 #if !defined (SSE_ENABLED) || 1
2002 /*----------------------------------------------------------------
2003 |
2004 |   CondLikeRoot_Bin: binary model with or without rate
2005 |       variation
2006 |
2007 -----------------------------------------------------------------*/
CondLikeRoot_Bin(TreeNode * p,int division,int chain)2008 int CondLikeRoot_Bin (TreeNode *p, int division, int chain)
2009 {
2010     int             c, k;
2011     CLFlt           *clL, *clR, *clP, *clA, *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
2012     ModelInfo       *m;
2013 
2014     /* find model settings for this division */
2015     m = &modelSettings[division];
2016 
2017     /* flip state of node so that we are not overwriting old cond likes */
2018     FlipCondLikeSpace (m, chain, p->index);
2019 
2020     /* find conditional likelihood pointers */
2021     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2022     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
2023     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
2024     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2025 
2026     /* find transition probabilities (or calculate instead) */
2027     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2028     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2029     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2030 
2031     tiPL = pL;
2032     tiPR = pR;
2033     tiPA = pA;
2034     for (k=0; k<m->numRateCats; k++)
2035         {
2036         for (c=0; c<m->numChars; c++)
2037             {
2038             *(clP++) = (tiPL[0]*clL[0] + tiPL[1]*clL[1])
2039                       *(tiPR[0]*clR[0] + tiPR[1]*clR[1])
2040                       *(tiPA[0]*clA[0] + tiPA[1]*clA[1]);
2041             *(clP++) = (tiPL[2]*clL[0] + tiPL[3]*clL[1])
2042                       *(tiPR[2]*clR[0] + tiPR[3]*clR[1])
2043                       *(tiPA[2]*clA[0] + tiPA[3]*clA[1]);
2044 
2045             clA += 2;
2046             clL += 2;
2047             clR += 2;
2048             }
2049         tiPA += 4;
2050         tiPL += 4;
2051         tiPR += 4;
2052         }
2053 
2054     return NO_ERROR;
2055 }
2056 #endif
2057 
2058 
2059 #if defined (SSE_ENABLED)
2060 /*----------------------------------------------------------------
2061 |
2062 |   CondLikeRoot_Bin_SSE:binary model with or without rate
2063 |       variation
2064 |
2065 -----------------------------------------------------------------*/
CondLikeRoot_Bin_SSE(TreeNode * p,int division,int chain)2066 int CondLikeRoot_Bin_SSE (TreeNode *p, int division, int chain)
2067 {
2068     int             c, k;
2069     CLFlt           *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
2070     __m128          *clL, *clR, *clP, *clA;
2071     __m128          m1, m2, m3, m4, m5, m6, m7;
2072     ModelInfo       *m;
2073 
2074     m = &modelSettings[division];
2075 
2076     /* flip state of node so that we are not overwriting old cond likes */
2077     FlipCondLikeSpace (m, chain, p->index);
2078 
2079     /* find conditional likelihood pointers */
2080     clL = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2081     clR = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
2082     clP = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
2083     clA = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2084 
2085     /* find transition probabilities */
2086     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2087     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2088     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2089 
2090     tiPL = pL;
2091     tiPR = pR;
2092     tiPA = pA;
2093     for (k=0; k<m->numRateCats; k++)
2094         {
2095         for (c=0; c<m->numVecChars; c++)
2096             {
2097             m1 = _mm_load1_ps (&tiPL[0]);
2098             m5 = *clL++;
2099             m2 = _mm_mul_ps (m1, m5);
2100             m1 = _mm_load1_ps (&tiPL[2]);
2101             m6 = _mm_mul_ps (m1, m5);
2102 
2103             m1 = _mm_load1_ps (&tiPL[1]);
2104             m5 = *clL++;
2105             m3 = _mm_mul_ps (m1, m5);
2106             m1 = _mm_load1_ps (&tiPL[3]);
2107             m5 = _mm_mul_ps (m1, m5);
2108 
2109             m4 = _mm_add_ps (m2, m3); /* in m4 we get (tiPL[0]*clL[0] + tiPL[1]*clL[1]) */
2110             m6 = _mm_add_ps (m5, m6); /* in m6 we get (tiPL[2]*clL[0] + tiPL[3]*clL[1]) */
2111 
2112             m1 = _mm_load1_ps (&tiPR[0]);
2113             m5 = *clR++;
2114             m2 = _mm_mul_ps (m1, m5);
2115             m1 = _mm_load1_ps (&tiPR[2]);
2116             m7 = _mm_mul_ps (m1, m5);
2117 
2118             m1 = _mm_load1_ps (&tiPR[1]);
2119             m5 = *clR++;
2120             m3 = _mm_mul_ps (m1, m5);
2121             m1 = _mm_load1_ps (&tiPR[3]);
2122             m5 = _mm_mul_ps (m1, m5);
2123 
2124             m1 = _mm_add_ps (m2, m3); /* in m1 we get (tiPR[0]*clR[0] + tiPR[1]*clR[1]) */
2125             m7 = _mm_add_ps (m5, m7); /* in m7 we get (tiPR[2]*clR[0] + tiPR[3]*clR[1]) */
2126 
2127             m4 = _mm_mul_ps (m1, m4); /* in m4 we get (tiPL[0]*clL[0] + tiPL[1]*clL[1])*(tiPR[0]*clR[0] + tiPR[1]*clR[1]) */
2128             m7 = _mm_mul_ps (m6, m7); /* in m7 we get (tiPL[2]*clL[0] + tiPL[3]*clL[1])*(tiPR[2]*clR[0] + tiPR[3]*clR[1]) */
2129 
2130             m1 = _mm_load1_ps (&tiPA[0]);
2131             m5 = *clA++;
2132             m2 = _mm_mul_ps (m1, m5);
2133             m1 = _mm_load1_ps (&tiPA[2]);
2134             m6 = _mm_mul_ps (m1, m5);
2135 
2136             m1 = _mm_load1_ps (&tiPA[1]);
2137             m5 = *clA++;
2138             m3 = _mm_mul_ps (m1, m5);
2139             m1 = _mm_load1_ps (&tiPA[3]);
2140             m1 = _mm_mul_ps (m1, m5);
2141 
2142             m2 = _mm_add_ps (m2, m3); /* in m1 we get (tiPA[0]*clA[0] + tiPA[1]*clA[1]) */
2143             m1 = _mm_add_ps (m1, m6); /* in m1 we get (tiPA[2]*clA[0] + tiPA[3]*clA[1]) */
2144 
2145             *clP++ = _mm_mul_ps (m2, m4);
2146             *clP++ = _mm_mul_ps (m1, m7);
2147 
2148             }
2149         tiPL += 4;
2150         tiPR += 4;
2151         tiPA += 4;
2152         }
2153 
2154     return NO_ERROR;
2155 
2156 }
2157 #endif
2158 
2159 
2160 /*----------------------------------------------------------------
2161 |
2162 |   CondLikeRoot_Gen: general n-state model with or without rate
2163 |       variation
2164 |
2165 -----------------------------------------------------------------*/
CondLikeRoot_Gen(TreeNode * p,int division,int chain)2166 int CondLikeRoot_Gen (TreeNode *p, int division, int chain)
2167 {
2168     int             a, b, c, d, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL,
2169                     nObsStates, nStates, nStatesSquared, preLikeJump;
2170     CLFlt           likeL, likeR, likeA, *clL, *clR, *clP, *clA, *pL, *pR, *pA,
2171                     *tiPL, *tiPR, *tiPA;
2172     ModelInfo       *m;
2173 #   if !defined (DEBUG_NOSHORTCUTS)
2174     int catStart;
2175 #   endif
2176 
2177     /* find model settings for this division and nStates, nStatesSquared */
2178     m = &modelSettings[division];
2179     nObsStates = m->numStates;
2180     nStates = m->numModelStates;
2181     nStatesSquared = nStates * nStates;
2182     preLikeJump = nObsStates * nStates;
2183 
2184     /* flip state of node so that we are not overwriting old cond likes */
2185     FlipCondLikeSpace (m, chain, p->index);
2186 
2187     /* find conditional likelihood pointers */
2188     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2189     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
2190     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
2191     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2192 
2193     /* find transition probabilities (or calculate instead) */
2194     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2195     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2196     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2197 
2198     /* find likelihoods of site patterns for left branch if terminal */
2199     shortCut = 0;
2200 #   if !defined (DEBUG_NOSHORTCUTS)
2201     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
2202         {
2203         shortCut |= 1;
2204         lState = m->termState[p->left->index];
2205         tiPL = pL;
2206         for (k=a=0; k<m->numRateCats; k++)
2207             {
2208             catStart = a;
2209             for (i=0; i<nObsStates; i++)
2210                 for (j=i; j<nStatesSquared; j+=nStates)
2211                     preLikeL[a++] = tiPL[j];
2212             for (b=1; b<nStates/nObsStates; b++)
2213                 {
2214                 a = catStart;
2215                 for (i=0; i<nObsStates; i++)
2216                     {
2217                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2218                         preLikeL[a++] += tiPL[j];
2219                     }
2220                 }
2221             /* for ambiguous */
2222             for (i=0; i<nStates; i++)
2223                 preLikeL[a++] = 1.0;
2224             tiPL += nStatesSquared;
2225             }
2226         }
2227 
2228     /* find likelihoods of site patterns for right branch if terminal */
2229     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
2230         {
2231         shortCut |= 2;
2232         rState = m->termState[p->right->index];
2233         tiPR = pR;
2234         for (k=a=0; k<m->numRateCats; k++)
2235             {
2236             catStart = a;
2237             for (i=0; i<nObsStates; i++)
2238                 for (j=i; j<nStatesSquared; j+=nStates)
2239                     preLikeR[a++] = tiPR[j];
2240             for (b=1; b<nStates/nObsStates; b++)
2241                 {
2242                 a = catStart;
2243                 for (i=0; i<nObsStates; i++)
2244                     {
2245                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2246                         preLikeR[a++] += tiPR[j];
2247                     }
2248                 }
2249             /* for ambiguous */
2250             for (i=0; i<nStates; i++)
2251                 preLikeR[a++] = 1.0;
2252             tiPR += nStatesSquared;
2253             }
2254         }
2255 
2256     /* find likelihoods of site patterns for anc branch, always terminal */
2257     if (m->isPartAmbig[p->anc->index] == YES)
2258         {
2259         shortCut = 4;
2260         }
2261     else
2262         {
2263         aState = m->termState[p->anc->index];
2264         tiPA = pA;
2265         for (k=a=0; k<m->numRateCats; k++)
2266             {
2267             catStart = a;
2268             for (i=0; i<nObsStates; i++)
2269                 for (j=i; j<nStatesSquared; j+=nStates)
2270                     preLikeA[a++] = tiPA[j];
2271             for (b=1; b<nStates/nObsStates; b++)
2272                 {
2273                 a = catStart;
2274                 for (i=0; i<nObsStates; i++)
2275                     {
2276                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2277                         preLikeA[a++] += tiPA[j];
2278                     }
2279                 }
2280             /* for ambiguous */
2281             for (i=0; i<nStates; i++)
2282                 preLikeA[a++] = 1.0;
2283             tiPA += nStatesSquared;
2284             }
2285         }
2286 #   else
2287     shortCut = 4;
2288 #   endif
2289 
2290     //shortCut = 4;
2291     switch (shortCut)
2292         {
2293         case 4:
2294             tiPL = pL;
2295             tiPR = pR;
2296             tiPA = pA;
2297             for (k=0; k<m->numRateCats; k++)
2298                 {
2299                 for (c=0; c<m->numChars; c++)
2300                     {
2301                     for (i=h=0; i<nStates; i++)
2302                         {
2303                         likeL = likeR = likeA = 0.0;
2304                         for (j=0; j<nStates; j++)
2305                             {
2306                             likeL += tiPL[h]*clL[j];
2307                             likeR += tiPR[h]*clR[j];
2308                             likeA += tiPA[h++]*clA[j];
2309                             }
2310                         *(clP++) = likeL * likeR * likeA;
2311                         }
2312                     clL += nStates;
2313                     clR += nStates;
2314                     clA += nStates;
2315                     }
2316                 tiPL += nStatesSquared;
2317                 tiPR += nStatesSquared;
2318                 tiPA += nStatesSquared;
2319                 }
2320             break;
2321         case 0:
2322             tiPR = pR;
2323             tiPL = pL;
2324             for (k=0; k<m->numRateCats; k++)
2325                 {
2326                 for (c=0; c<m->numChars; c++)
2327                     {
2328                     a = aState[c] + k*(preLikeJump+nStates);
2329                     for (i=h=0; i<nStates; i++)
2330                         {
2331                         likeR = likeL = 0.0;
2332                         for (j=0; j<nStates; j++)
2333                             {
2334                             likeR += tiPR[h]*clR[j];
2335                             likeL += tiPL[h++]*clL[j];
2336                             }
2337                         *(clP++) = preLikeA[a++] * likeR * likeL;
2338                         }
2339                     clR += nStates;
2340                     clL += nStates;
2341                     }
2342                 tiPR += nStatesSquared;
2343                 tiPL += nStatesSquared;
2344                 }
2345             break;
2346         case 1:
2347             tiPR = pR;
2348             for (k=0; k<m->numRateCats; k++)
2349                 {
2350                 for (c=0; c<m->numChars; c++)
2351                     {
2352                     a = lState[c] + k*(preLikeJump+nStates);
2353                     b = aState[c] + k*(preLikeJump+nStates);
2354                     for (i=h=0; i<nStates; i++)
2355                         {
2356                         likeR = 0.0;
2357                         for (j=0; j<nStates; j++)
2358                             {
2359                             likeR += tiPR[h++]*clR[j];
2360                             }
2361                         *(clP++) = preLikeL[a++] * preLikeA[b++] * likeR;
2362                         }
2363                     clR += nStates;
2364                     }
2365                 tiPR += nStatesSquared;
2366                 }
2367             break;
2368         case 2:
2369             tiPL = pL;
2370             for (k=0; k<m->numRateCats; k++)
2371                 {
2372                 for (c=0; c<m->numChars; c++)
2373                     {
2374                     a = rState[c] + k*(preLikeJump+nStates);
2375                     b = aState[c] + k*(preLikeJump+nStates);
2376                     for (i=h=0; i<nStates; i++)
2377                         {
2378                         likeL = 0.0;
2379                         for (j=0; j<nStates; j++)
2380                             {
2381                             likeL += tiPL[h++]*clL[j];
2382                             }
2383                         *(clP++) = preLikeR[a++] * preLikeA[b++] * likeL;
2384                         }
2385                     clL += nStates;
2386                     }
2387                 tiPL += nStatesSquared;
2388                 }
2389             break;
2390         case 3:
2391             for (k=0; k<m->numRateCats; k++)
2392                 {
2393                 for (c=0; c<m->numChars; c++)
2394                     {
2395                     a = rState[c] + k*(preLikeJump+nStates);
2396                     b = lState[c] + k*(preLikeJump+nStates);
2397                     d = aState[c] + k*(preLikeJump+nStates);
2398                     for (i=0; i<nStates; i++)
2399                         {
2400                         *(clP++) = preLikeR[a++] * preLikeL[b++] * preLikeA[d++];
2401                         }
2402                     }
2403                 }
2404             break;
2405         }
2406 
2407     return NO_ERROR;
2408 }
2409 
2410 
2411 #if defined (SSE_ENABLED)
2412 /*----------------------------------------------------------------
2413 |
2414 |   CondLikeRoot_Gen_SSE:general n-state model with or without rate
2415 |       variation
2416 |
2417 -----------------------------------------------------------------*/
CondLikeRoot_Gen_SSE(TreeNode * p,int division,int chain)2418 int CondLikeRoot_Gen_SSE (TreeNode *p, int division, int chain)
2419 {
2420     int             c, c1, t, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL, nObsStates, preLikeJump,
2421                     nStates, nStatesSquared;
2422     CLFlt           *pL, *pR, *pA,
2423                     *tiPL, *tiPR, *tiPA;
2424     __m128          *clL, *clR, *clP, *clA;
2425     __m128          mTiPL, mTiPR, mTiPA, mL, mR, mA, mAcumL, mAcumR, mAcumA;
2426     ModelInfo       *m;
2427     CLFlt           *preLikeRV[4] = {0};
2428     CLFlt           *preLikeLV[4] = {0};
2429     CLFlt           *preLikeAV[4] = {0};
2430 
2431 #   if !defined (DEBUG_NOSHORTCUTS)
2432     int a, b, catStart;
2433 #   endif
2434 
2435     /* find model settings for this division and nStates, nStatesSquared */
2436     m = &modelSettings[division];
2437     nObsStates = m->numStates;
2438     nStates = m->numModelStates;
2439     nStatesSquared = nStates * nStates;
2440     preLikeJump = nObsStates * nStates;
2441 
2442     /* flip state of node so that we are not overwriting old cond likes */
2443     FlipCondLikeSpace (m, chain, p->index);
2444 
2445     /* find conditional likelihood pointers */
2446     clL = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2447     clR = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->right->index]];
2448     clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index       ]];
2449     clA = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2450 
2451     /* find transition probabilities (or calculate instead) */
2452     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2453     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2454     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2455 
2456     /* find likelihoods of site patterns for left branch if terminal */
2457     shortCut = 0;
2458 #   if !defined (DEBUG_NOSHORTCUTS)
2459     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
2460         {
2461         shortCut |= 1;
2462         lState = m->termState[p->left->index];
2463         tiPL = pL;
2464         for (k=a=0; k<m->numRateCats; k++)
2465             {
2466             catStart = a;
2467             for (i=0; i<nObsStates; i++)
2468                 for (j=i; j<nStatesSquared; j+=nStates)
2469                     preLikeL[a++] = tiPL[j];
2470             for (b=1; b<nStates/nObsStates; b++)
2471                 {
2472                 a = catStart;
2473                 for (i=0; i<nObsStates; i++)
2474                     {
2475                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2476                         preLikeL[a++] += tiPL[j];
2477                     }
2478                 }
2479             /* for ambiguous */
2480             for (i=0; i<nStates; i++)
2481                 preLikeL[a++] = 1.0;
2482             tiPL += nStatesSquared;
2483             }
2484         }
2485 
2486     /* find likelihoods of site patterns for right branch if terminal */
2487     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
2488         {
2489         shortCut |= 2;
2490         rState = m->termState[p->right->index];
2491         tiPR = pR;
2492         for (k=a=0; k<m->numRateCats; k++)
2493             {
2494             catStart = a;
2495             for (i=0; i<nObsStates; i++)
2496                 for (j=i; j<nStatesSquared; j+=nStates)
2497                     preLikeR[a++] = tiPR[j];
2498             for (b=1; b<nStates/nObsStates; b++)
2499                 {
2500                 a = catStart;
2501                 for (i=0; i<nObsStates; i++)
2502                     {
2503                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2504                         preLikeR[a++] += tiPR[j];
2505                     }
2506                 }
2507             /* for ambiguous */
2508             for (i=0; i<nStates; i++)
2509                 preLikeR[a++] = 1.0;
2510             tiPR += nStatesSquared;
2511             }
2512         }
2513 
2514     /* find likelihoods of site patterns for anc branch, always terminal */
2515     if (m->isPartAmbig[p->anc->index] == YES)
2516         {
2517         shortCut = 4;
2518         }
2519     else
2520         {
2521         aState = m->termState[p->anc->index];
2522         tiPA = pA;
2523         for (k=a=0; k<m->numRateCats; k++)
2524             {
2525             catStart = a;
2526             for (i=0; i<nObsStates; i++)
2527                 for (j=i; j<nStatesSquared; j+=nStates)
2528                     preLikeA[a++] = tiPA[j];
2529             for (b=1; b<nStates/nObsStates; b++)
2530                 {
2531                 a = catStart;
2532                 for (i=0; i<nObsStates; i++)
2533                     {
2534                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2535                         preLikeA[a++] += tiPA[j];
2536                     }
2537                 }
2538             /* for ambiguous */
2539             for (i=0; i<nStates; i++)
2540                 preLikeA[a++] = 1.0;
2541             tiPA += nStatesSquared;
2542             }
2543         }
2544 #   else
2545     shortCut = 4;
2546 #   endif
2547 
2548         switch (shortCut)
2549         {
2550         case 4:
2551             tiPL = pL;
2552             tiPR = pR;
2553             tiPA = pA;
2554             for (k=0; k<m->numRateCats; k++)
2555                 {
2556                 for (c=0; c<m->numVecChars; c++)
2557                     {
2558                     for (i=h=0; i<nStates; i++)
2559                         {
2560                         mAcumL = _mm_setzero_ps();
2561                         mAcumR = _mm_setzero_ps();
2562                         mAcumA = _mm_setzero_ps();
2563                         for (j=0; j<nStates; j++)
2564                             {
2565                             mTiPL  = _mm_load1_ps (&tiPL[h]);
2566                             mTiPR  = _mm_load1_ps (&tiPR[h]);
2567                             mTiPA  = _mm_load1_ps (&tiPA[h++]);
2568                             mL     = _mm_mul_ps (mTiPL, clL[j]);
2569                             mR     = _mm_mul_ps (mTiPR, clR[j]);
2570                             mA     = _mm_mul_ps (mTiPA, clA[j]);
2571                             mAcumL = _mm_add_ps (mL, mAcumL);
2572                             mAcumR = _mm_add_ps (mR, mAcumR);
2573                             mAcumA = _mm_add_ps (mA, mAcumA);
2574                             }
2575                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
2576                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
2577                         }
2578                     clL += nStates;
2579                     clR += nStates;
2580                     clA += nStates;
2581                     }
2582                 tiPL += nStatesSquared;
2583                 tiPR += nStatesSquared;
2584                 tiPA += nStatesSquared;
2585                 }
2586             break;
2587         case 0:
2588             tiPL =pL;
2589             tiPR =pR;
2590             for (k=0; k<m->numRateCats; k++)
2591                 {
2592                 for (c=t=0; c<m->numVecChars; c++)
2593                     {
2594                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
2595                         {
2596                         preLikeAV[c1] = &preLikeA[aState[t] + k*(preLikeJump+nStates)];
2597                         }
2598                     for (i=h=0; i<nStates; i++)
2599                         {
2600                         assert (m->numFloatsPerVec == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
2601                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
2602                         mAcumL = _mm_setzero_ps();
2603                         mAcumR = _mm_setzero_ps();
2604                         for (j=0; j<nStates; j++)
2605                             {
2606                             mTiPL  = _mm_load1_ps (&tiPL[h]);
2607                             mL     = _mm_mul_ps (mTiPL, clL[j]);
2608                             mAcumL = _mm_add_ps (mL, mAcumL);
2609                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
2610                             mR     = _mm_mul_ps (mTiPR, clR[j]);
2611                             mAcumR = _mm_add_ps (mR, mAcumR);
2612                             }
2613                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
2614                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
2615                         }
2616                     clR += nStates;
2617                     clL += nStates;
2618                     }
2619                 tiPL += nStatesSquared;
2620                 tiPR += nStatesSquared;
2621                 }
2622             break;
2623         case 1:
2624             tiPR = pR;
2625             for (k=0; k<m->numRateCats; k++)
2626                 {
2627                 for (c=t=0; c<m->numVecChars; c++)
2628                     {
2629                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
2630                         {
2631                         preLikeLV[c1] = &preLikeL[lState[t] + k*(preLikeJump+nStates)];
2632                         preLikeAV[c1] = &preLikeA[aState[t] + k*(preLikeJump+nStates)];
2633                         }
2634                     for (i=h=0; i<nStates; i++)
2635                         {
2636                         assert (m->numFloatsPerVec == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
2637                         mAcumL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
2638                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
2639                         mAcumR = _mm_setzero_ps();
2640                         for (j=0; j<nStates; j++)
2641                             {
2642                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
2643                             mR     = _mm_mul_ps (mTiPR, clR[j]);
2644                             mAcumR = _mm_add_ps (mR, mAcumR);
2645                             }
2646                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
2647                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
2648                         }
2649                     clR += nStates;
2650                     }
2651                 tiPR += nStatesSquared;
2652                 }
2653             break;
2654         case 2:
2655             tiPL = pL;
2656             for (k=0; k<m->numRateCats; k++)
2657                 {
2658                 for (c=t=0; c<m->numVecChars; c++)
2659                     {
2660                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
2661                         {
2662                         preLikeRV[c1] = &preLikeR[rState[t] + k*(preLikeJump+nStates)];
2663                         preLikeAV[c1] = &preLikeA[aState[t] + k*(preLikeJump+nStates)];
2664                         }
2665                     for (i=h=0; i<nStates; i++)
2666                         {
2667                         assert (m->numFloatsPerVec == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
2668                         mAcumR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
2669                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
2670                         mAcumL = _mm_setzero_ps();
2671                         for (j=0; j<nStates; j++)
2672                             {
2673                             mTiPL  = _mm_load1_ps (&tiPL[h++]);
2674                             mL     = _mm_mul_ps (mTiPL, clL[j]);
2675                             mAcumL = _mm_add_ps (mL, mAcumL);
2676                             }
2677                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
2678                         *(clP++) = _mm_mul_ps (mAcumL,mAcumA);
2679                         }
2680                     clL += nStates;
2681                     }
2682                 tiPL += nStatesSquared;
2683                 }
2684             break;
2685         case 3:
2686             for (k=0; k<m->numRateCats; k++)
2687                 {
2688                 for (c=t=0; c<m->numVecChars; c++)
2689                     {
2690                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
2691                         {
2692                         preLikeRV[c1] = &preLikeR[rState[t] + k*(preLikeJump+nStates)];
2693                         preLikeLV[c1] = &preLikeL[lState[t] + k*(preLikeJump+nStates)];
2694                         preLikeAV[c1] = &preLikeA[aState[t] + k*(preLikeJump+nStates)];
2695                         }
2696                     for (i=0; i<nStates; i++)
2697                         {
2698                         assert (m->numFloatsPerVec == 4); /* In the following 2 statments we assume that SSE register can hold exactly 4 ClFlts. */
2699                         mL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
2700                         mR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
2701                         mA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
2702                         mL = _mm_mul_ps (mL,mR);
2703                         *(clP++) = _mm_mul_ps (mL,mA);
2704                         }
2705                     }
2706                 }
2707             break;
2708         }
2709 
2710     return NO_ERROR;
2711 }
2712 #endif
2713 
2714 
2715 /*----------------------------------------------------------------
2716 |
2717 |   CondLikeRoot_Gen_GibbsGamma: general n-state model with rate
2718 |       variation modeled using a discrete gamma distribution with
2719 |       Gibbs resampling of rate categories
2720 |
2721 -----------------------------------------------------------------*/
CondLikeRoot_Gen_GibbsGamma(TreeNode * p,int division,int chain)2722 int CondLikeRoot_Gen_GibbsGamma (TreeNode *p, int division, int chain)
2723 {
2724     int             a, b, c, i, j, r, *rateCat, shortCut, *lState=NULL,
2725                     *rState=NULL, *aState=NULL, nObsStates, nStates,
2726                     nStatesSquared, nRateCats;
2727     CLFlt           likeL, likeR, likeA, *clL, *clR, *clP, *clA, *pL, *pR, *pA,
2728                     *tiPL, *tiPR, *tiPA;
2729     ModelInfo       *m;
2730 #   if !defined (DEBUG_NOSHORTCUTS)
2731     int k, catStart;
2732 #endif
2733 
2734     /* find model settings for this division and nStates, nStatesSquared */
2735     m = &modelSettings[division];
2736     nObsStates = m->numStates;
2737     nStates = m->numModelStates;
2738     nStatesSquared = nStates * nStates;
2739 
2740     /* flip conditional likelihood space */
2741     FlipCondLikeSpace (m, chain, p->index);
2742 
2743     /* find conditional likelihood pointers */
2744     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2745     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
2746     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
2747     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2748 
2749     /* find transition probabilities (or calculate instead) */
2750     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2751     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2752     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2753 
2754     /* find rate category index and number of rate categories */
2755     rateCat = m->tiIndex + chain * m->numChars;
2756     nRateCats = m->numRateCats;
2757 
2758     /* find likelihoods of site patterns for left branch if terminal */
2759     shortCut = 0;
2760 #   if !defined (DEBUG_NOSHORTCUTS)
2761     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
2762         {
2763         shortCut |= 1;
2764         lState = m->termState[p->left->index];
2765         tiPL = pL;
2766         for (k=a=0; k<nRateCats; k++)
2767             {
2768             catStart = a;
2769             for (i=0; i<nObsStates; i++)
2770                 for (j=i; j<nStatesSquared; j+=nStates)
2771                     preLikeL[a++] = tiPL[j];
2772             for (b=1; b<nStates/nObsStates; b++)
2773                 {
2774                 a = catStart;
2775                 for (i=0; i<nObsStates; i++)
2776                     {
2777                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2778                         preLikeL[a++] += tiPL[j];
2779                     }
2780                 }
2781             /* for ambiguous */
2782             for (i=0; i<nStates; i++)
2783                 preLikeL[a++] = 1.0;
2784             tiPL += nStatesSquared;
2785             }
2786         }
2787 
2788     /* find likelihoods of site patterns for right branch if terminal */
2789     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
2790         {
2791         shortCut |= 2;
2792         rState = m->termState[p->right->index];
2793         tiPR = pR;
2794         for (k=a=0; k<nRateCats; k++)
2795             {
2796             catStart = a;
2797             for (i=0; i<nObsStates; i++)
2798                 for (j=i; j<nStatesSquared; j+=nStates)
2799                     preLikeR[a++] = tiPR[j];
2800             for (b=1; b<nStates/nObsStates; b++)
2801                 {
2802                 a = catStart;
2803                 for (i=0; i<nObsStates; i++)
2804                     {
2805                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2806                         preLikeR[a++] += tiPR[j];
2807                     }
2808                 }
2809             /* for ambiguous */
2810             for (i=0; i<nStates; i++)
2811                 preLikeR[a++] = 1.0;
2812             tiPR += nStatesSquared;
2813             }
2814         }
2815 
2816     /* find likelihoods of site patterns for anc branch, always terminal */
2817     if (m->isPartAmbig[p->anc->index] == YES)
2818         {
2819         shortCut = 4;
2820         }
2821     else
2822         {
2823         aState = m->termState[p->anc->index];
2824         tiPA = pA;
2825         for (k=a=0; k<nRateCats; k++)
2826             {
2827             catStart = a;
2828             for (i=0; i<nObsStates; i++)
2829                 for (j=i; j<nStatesSquared; j+=nStates)
2830                     preLikeA[a++] = tiPA[j];
2831             for (b=1; b<nStates/nObsStates; b++)
2832                 {
2833                 a = catStart;
2834                 for (i=0; i<nObsStates; i++)
2835                     {
2836                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2837                         preLikeA[a++] += tiPA[j];
2838                     }
2839                 }
2840             /* for ambiguous */
2841             for (i=0; i<nStates; i++)
2842                 preLikeA[a++] = 1.0;
2843             tiPA += nStatesSquared;
2844             }
2845         }
2846 #   else
2847     shortCut = 4;
2848 #   endif
2849 
2850     switch (shortCut)
2851         {
2852     case 4:
2853         for (c=0; c<m->numChars; c++)
2854             {
2855             r = (*rateCat++);
2856             if (r < nRateCats)
2857                 {
2858                 tiPL = pL + r*nStatesSquared;
2859                 tiPR = pR + r*nStatesSquared;
2860                 tiPA = pA + r*nStatesSquared;
2861                 for (i=0; i<nStates; i++)
2862                     {
2863                     likeL = likeR = likeA = 0.0;
2864                     for (j=0; j<nStates; j++)
2865                         {
2866                         likeL += (*tiPL++) * clL[j];
2867                         likeR += (*tiPR++) * clR[j];
2868                         likeA += (*tiPA++) * clA[j];
2869                         }
2870                     *(clP++) = likeL * likeR * likeA;
2871                     }
2872                 }
2873             else
2874                 clP += nStates;
2875             clL += nStates;
2876             clR += nStates;
2877             clA += nStates;
2878             }
2879         break;
2880     case 0:
2881     case 3:
2882         for (c=0; c<m->numChars; c++)
2883             {
2884             r = (*rateCat++);
2885             if (r < nRateCats)
2886                 {
2887                 tiPL = pL + r*nStatesSquared;
2888                 tiPR = pR + r*nStatesSquared;
2889                 a = aState[c] + r*(nStatesSquared+nStates);
2890                 for (i=0; i<nStates; i++)
2891                     {
2892                     likeL = likeR = 0.0;
2893                     for (j=0; j<nStates; j++)
2894                         {
2895                         likeL += (*tiPL++) * clL[j];
2896                         likeR += (*tiPR++) * clR[j];
2897                         }
2898                     *(clP++) = likeL * likeR * preLikeA[a++];
2899                     }
2900                 }
2901             else
2902                 clP += nStates;
2903             clL += nStates;
2904             clR += nStates;
2905             }
2906         break;
2907     case 1:
2908         for (c=0; c<m->numChars; c++)
2909             {
2910             r = (*rateCat++);
2911             if (r < nRateCats)
2912                 {
2913                 tiPR = pR + r*nStatesSquared;
2914                 a = lState[c] + r*(nStatesSquared+nStates);
2915                 b = aState[c] + r*(nStatesSquared+nStates);
2916                 for (i=0; i<nStates; i++)
2917                     {
2918                     likeR = 0.0;
2919                     for (j=0; j<nStates; j++)
2920                         {
2921                         likeR += (*tiPR++) * clR[j];
2922                         }
2923                     *(clP++) = preLikeL[a++] * likeR * preLikeA[b++];
2924                     }
2925                 }
2926             else
2927                 clP += nStates;
2928             clR += nStates;
2929             }
2930         break;
2931     case 2:
2932         for (c=0; c<m->numChars; c++)
2933             {
2934             r = (*rateCat++);
2935             if (r < nRateCats)
2936                 {
2937                 tiPL = pL + r*nStatesSquared;
2938                 a = rState[c] + r*(nStatesSquared+nStates);
2939                 b = aState[c] + r*(nStatesSquared+nStates);
2940                 for (i=0; i<nStates; i++)
2941                     {
2942                     likeL = 0.0;
2943                     for (j=0; j<nStates; j++)
2944                         {
2945                         likeL += (*tiPL++) * clL[j];
2946                         }
2947                     *(clP++) = likeL * preLikeR[a++] * preLikeA[b++];
2948                     }
2949                 }
2950             else
2951                 clP += nStates;
2952             clL += nStates;
2953             }
2954         break;
2955         }
2956 
2957     return NO_ERROR;
2958 }
2959 
2960 
2961 /*----------------------------------------------------------------
2962 |
2963 |   CondLikeRoot_NUC4: 4by4 nucleotide model with or without rate
2964 |       variation
2965 |
2966 -----------------------------------------------------------------*/
CondLikeRoot_NUC4(TreeNode * p,int division,int chain)2967 int CondLikeRoot_NUC4 (TreeNode *p, int division, int chain)
2968 {
2969     int             a, c, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL;
2970     CLFlt           *clL, *clR, *clP, *clA, *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
2971     ModelInfo       *m;
2972 
2973     m = &modelSettings[division];
2974 
2975     /* flip state of node so that we are not overwriting old cond likes */
2976     FlipCondLikeSpace (m, chain, p->index);
2977 
2978     /* find conditional likelihood pointers */
2979     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2980     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
2981     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
2982     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2983 
2984     /* find transition probabilities (or calculate instead) */
2985     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2986     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2987     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2988 
2989     /* find likelihoods of site patterns for left branch if terminal */
2990     shortCut = 0;
2991 #   if !defined (DEBUG_NOSHORTCUTS)
2992     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
2993         {
2994         shortCut |= 1;
2995         lState = m->termState[p->left->index];
2996         tiPL = pL;
2997         for (k=j=0; k<m->numRateCats; k++)
2998             {
2999             for (i=0; i<4; i++)
3000                 {
3001                 preLikeL[j++] = tiPL[0];
3002                 preLikeL[j++] = tiPL[4];
3003                 preLikeL[j++] = tiPL[8];
3004                 preLikeL[j++] = tiPL[12];
3005                 tiPL++;
3006                 }
3007             /* for ambiguous */
3008             for (i=0; i<4; i++)
3009                 preLikeL[j++] = 1.0;
3010             tiPL += 12;
3011             }
3012         }
3013 
3014     /* find likelihoods of site patterns for right branch if terminal */
3015     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
3016         {
3017         shortCut |= 2;
3018         rState = m->termState[p->right->index];
3019         tiPR = pR;
3020         for (k=j=0; k<m->numRateCats; k++)
3021             {
3022             for (i=0; i<4; i++)
3023                 {
3024                 preLikeR[j++] = tiPR[0];
3025                 preLikeR[j++] = tiPR[4];
3026                 preLikeR[j++] = tiPR[8];
3027                 preLikeR[j++] = tiPR[12];
3028                 tiPR++;
3029                 }
3030             /* for ambiguous */
3031             for (i=0; i<4; i++)
3032                 preLikeR[j++] = 1.0;
3033             tiPR += 12;
3034             }
3035         }
3036 
3037     /* find likelihoods of site patterns for anc branch, always terminal */
3038     if (m->isPartAmbig[p->anc->index] == YES)
3039         {
3040         shortCut = 4;
3041         }
3042     else
3043         {
3044         aState = m->termState[p->anc->index];
3045         tiPA = pA;
3046         for (k=j=0; k<m->numRateCats; k++)
3047             {
3048             for (i=0; i<4; i++)
3049                 {
3050                 preLikeA[j++] = tiPA[0];
3051                 preLikeA[j++] = tiPA[4];
3052                 preLikeA[j++] = tiPA[8];
3053                 preLikeA[j++] = tiPA[12];
3054                 tiPA++;
3055                 }
3056             /* for ambiguous */
3057             for (i=0; i<4; i++)
3058                 preLikeA[j++] = 1.0;
3059             tiPA += 12;
3060             }
3061         }
3062 #   else
3063     shortCut = 4;
3064 #   endif
3065 
3066     switch (shortCut)
3067         {
3068     case 4:
3069         tiPL = pL;
3070         tiPR = pR;
3071         tiPA = pA;
3072         for (k=h=0; k<m->numRateCats; k++)
3073             {
3074             for (c=0; c<m->numChars; c++)
3075                 {
3076                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
3077                             *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
3078                             *(tiPA[AA]*clA[A] + tiPA[AC]*clA[C] + tiPA[AG]*clA[G] + tiPA[AT]*clA[T]);
3079                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
3080                             *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
3081                             *(tiPA[CA]*clA[A] + tiPA[CC]*clA[C] + tiPA[CG]*clA[G] + tiPA[CT]*clA[T]);
3082                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
3083                             *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
3084                             *(tiPA[GA]*clA[A] + tiPA[GC]*clA[C] + tiPA[GG]*clA[G] + tiPA[GT]*clA[T]);
3085                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
3086                             *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
3087                             *(tiPA[TA]*clA[A] + tiPA[TC]*clA[C] + tiPA[TG]*clA[G] + tiPA[TT]*clA[T]);
3088                 clL += 4;
3089                 clR += 4;
3090                 clA += 4;
3091                 }
3092             tiPL += 16;
3093             tiPR += 16;
3094             tiPA += 16;
3095             }
3096         break;
3097 
3098     case 0:
3099         tiPL = pL;
3100         tiPR = pR;
3101         for (k=h=0; k<m->numRateCats; k++)
3102             {
3103             for (c=0; c<m->numChars; c++)
3104                 {
3105                 i = aState[c] + k*20;
3106                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
3107                             *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
3108                             *preLikeA[i++];
3109                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
3110                             *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
3111                             *preLikeA[i++];
3112                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
3113                             *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
3114                             *preLikeA[i++];
3115                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
3116                             *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
3117                             *preLikeA[i++];
3118                 clL += 4;
3119                 clR += 4;
3120                 }
3121             tiPL += 16;
3122             tiPR += 16;
3123             }
3124         break;
3125 
3126     case 1:
3127         tiPR = pR;
3128         for (k=h=0; k<m->numRateCats; k++)
3129             {
3130             for (c=0; c<m->numChars; c++)
3131                 {
3132                 i = lState[c] + k*20;
3133                 j = aState[c] + k*20;
3134                 clP[h++] =   (tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
3135                             *preLikeL[i++]*preLikeA[j++];
3136                 clP[h++] =   (tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
3137                             *preLikeL[i++]*preLikeA[j++];
3138                 clP[h++] =   (tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
3139                             *preLikeL[i++]*preLikeA[j++];
3140                 clP[h++] =   (tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
3141                             *preLikeL[i++]*preLikeA[j++];
3142                 clR += 4;
3143                 }
3144             tiPR += 16;
3145             }
3146         break;
3147 
3148     case 2:
3149         tiPL = pL;
3150         for (k=h=0; k<m->numRateCats; k++)
3151             {
3152             for (c=0; c<m->numChars; c++)
3153                 {
3154                 i = rState[c] + k*20;
3155                 j = aState[c] + k*20;
3156                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
3157                             *preLikeR[i++]*preLikeA[j++];
3158                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
3159                             *preLikeR[i++]*preLikeA[j++];
3160                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
3161                             *preLikeR[i++]*preLikeA[j++];
3162                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
3163                             *preLikeR[i++]*preLikeA[j++];
3164                 clL += 4;
3165                 }
3166             tiPL += 16;
3167             }
3168         break;
3169 
3170     case 3:
3171         for (k=h=0; k<m->numRateCats; k++)
3172             {
3173             for (c=0; c<m->numChars; c++)
3174                 {
3175                 a = lState[c] + k*20;
3176                 i = rState[c] + k*20;
3177                 j = aState[c] + k*20;
3178                 clP[h++] =   preLikeL[a++]*preLikeR[i++]*preLikeA[j++];
3179                 clP[h++] =   preLikeL[a++]*preLikeR[i++]*preLikeA[j++];
3180                 clP[h++] =   preLikeL[a++]*preLikeR[i++]*preLikeA[j++];
3181                 clP[h++] =   preLikeL[a++]*preLikeR[i++]*preLikeA[j++];
3182                 }
3183             }
3184         break;
3185         }
3186 
3187     return NO_ERROR;
3188 }
3189 
3190 
3191 /*----------------------------------------------------------------
3192 |
3193 |   CondLikeRoot_NUC4_GibbsGamma: 4by4 nucleotide model with rate
3194 |       variation approximated by Gibbs sampling from gamma
3195 |
3196 -----------------------------------------------------------------*/
CondLikeRoot_NUC4_GibbsGamma(TreeNode * p,int division,int chain)3197 int CondLikeRoot_NUC4_GibbsGamma (TreeNode *p, int division, int chain)
3198 {
3199     int             c, h, i, j, r, *rateCat, shortCut, *lState=NULL, *rState=NULL, *aState=NULL,
3200                     nRateCats;
3201     CLFlt           *clL, *clR, *clP, *clA, *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
3202     ModelInfo       *m;
3203 #   if !defined (DEBUG_NOSHORTCUTS)
3204     int k;
3205 #   endif
3206 
3207     m = &modelSettings[division];
3208 
3209     /* flip conditional likelihood space */
3210     FlipCondLikeSpace (m, chain, p->index);
3211 
3212         /* find conditional likelihood pointers */
3213     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
3214     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
3215     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
3216     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
3217 
3218     /* find transition probabilities (or calculate instead) */
3219     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
3220     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
3221     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
3222 
3223     /* find rate category index and number of gamma categories */
3224     rateCat = m->tiIndex + chain * m->numChars;
3225     nRateCats = m->numRateCats;
3226 
3227     /* find likelihoods of site patterns for left branch if terminal */
3228     shortCut = 0;
3229 #   if !defined (DEBUG_NOSHORTCUTS)
3230     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
3231         {
3232         shortCut |= 1;
3233         lState = m->termState[p->left->index];
3234         tiPL = pL;
3235         for (k=j=0; k<nRateCats; k++)
3236             {
3237             for (i=0; i<4; i++)
3238                 {
3239                 preLikeL[j++] = tiPL[0];
3240                 preLikeL[j++] = tiPL[4];
3241                 preLikeL[j++] = tiPL[8];
3242                 preLikeL[j++] = tiPL[12];
3243                 tiPL++;
3244                 }
3245             /* for ambiguous */
3246             for (i=0; i<4; i++)
3247                 preLikeL[j++] = 1.0;
3248             tiPL += 12;
3249             }
3250         }
3251 
3252     /* find likelihoods of site patterns for right branch if terminal */
3253     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
3254         {
3255         shortCut |= 2;
3256         rState = m->termState[p->right->index];
3257         tiPR = pR;
3258         for (k=j=0; k<nRateCats; k++)
3259             {
3260             for (i=0; i<4; i++)
3261                 {
3262                 preLikeR[j++] = tiPR[0];
3263                 preLikeR[j++] = tiPR[4];
3264                 preLikeR[j++] = tiPR[8];
3265                 preLikeR[j++] = tiPR[12];
3266                 tiPR++;
3267                 }
3268             /* for ambiguous */
3269             for (i=0; i<4; i++)
3270                 preLikeR[j++] = 1.0;
3271             tiPR += 12;
3272             }
3273         }
3274 
3275     /* find likelihoods of site patterns for anc branch, always terminal */
3276     if (m->isPartAmbig[p->anc->index] == YES)
3277         {
3278         shortCut = 4;
3279         }
3280     else
3281         {
3282         aState = m->termState[p->anc->index];
3283         tiPA = pA;
3284         for (k=j=0; k<nRateCats; k++)
3285             {
3286             for (i=0; i<4; i++)
3287                 {
3288                 preLikeA[j++] = tiPA[0];
3289                 preLikeA[j++] = tiPA[4];
3290                 preLikeA[j++] = tiPA[8];
3291                 preLikeA[j++] = tiPA[12];
3292                 tiPA++;
3293                 }
3294             /* for ambiguous */
3295             for (i=0; i<4; i++)
3296                 preLikeA[j++] = 1.0;
3297             tiPA += 12;
3298             }
3299         }
3300 #   else
3301     shortCut = 4;
3302 #   endif
3303 
3304     switch (shortCut)
3305         {
3306     case 4:
3307         for (c=h=0; c<m->numChars; c++)
3308             {
3309             r = rateCat[c];
3310             if (r < nRateCats)
3311                 {
3312                 tiPL = pL + r * 16;
3313                 tiPR = pR + r * 16;
3314                 tiPA = pA + r * 16;
3315                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
3316                             *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
3317                             *(tiPA[AA]*clA[A] + tiPA[AC]*clA[C] + tiPA[AG]*clA[G] + tiPA[AT]*clA[T]);
3318                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
3319                             *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
3320                             *(tiPA[CA]*clA[A] + tiPA[CC]*clA[C] + tiPA[CG]*clA[G] + tiPA[CT]*clA[T]);
3321                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
3322                             *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
3323                             *(tiPA[GA]*clA[A] + tiPA[GC]*clA[C] + tiPA[GG]*clA[G] + tiPA[GT]*clA[T]);
3324                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
3325                             *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
3326                             *(tiPA[TA]*clA[A] + tiPA[TC]*clA[C] + tiPA[TG]*clA[G] + tiPA[TT]*clA[T]);
3327                 }
3328             else
3329                 h += 4;
3330             clL += 4;
3331             clR += 4;
3332             clA += 4;
3333             }
3334         break;
3335 
3336     case 0:
3337     case 3:
3338         for (c=h=0; c<m->numChars; c++)
3339             {
3340             r = rateCat[c];
3341             if (r < nRateCats)
3342                 {
3343                 tiPL = pL + r * 16;
3344                 tiPR = pR + r * 16;
3345                 i = aState[c] + r * 20;
3346                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
3347                             *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
3348                             *preLikeA[i++];
3349                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
3350                             *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
3351                             *preLikeA[i++];
3352                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
3353                             *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
3354                             *preLikeA[i++];
3355                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
3356                             *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
3357                             *preLikeA[i++];
3358                 }
3359             else
3360                 h += 4;
3361             clL += 4;
3362             clR += 4;
3363             }
3364         break;
3365 
3366     case 1:
3367         for (c=h=0; c<m->numChars; c++)
3368             {
3369             r = rateCat[c];
3370             if (r < nRateCats)
3371                 {
3372                 tiPR = pR + r * 16;
3373                 i = lState[c] + r * 20;
3374                 j = aState[c] + r * 20;
3375                 clP[h++] =   (tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
3376                             *preLikeL[i++]*preLikeA[j++];
3377                 clP[h++] =   (tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
3378                             *preLikeL[i++]*preLikeA[j++];
3379                 clP[h++] =   (tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
3380                             *preLikeL[i++]*preLikeA[j++];
3381                 clP[h++] =   (tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
3382                             *preLikeL[i++]*preLikeA[j++];
3383                 }
3384             else
3385                 h += 4;
3386             clR += 4;
3387             }
3388         break;
3389 
3390     case 2:
3391         for (c=h=0; c<m->numChars; c++)
3392             {
3393             r = rateCat[c];
3394             if (r < nRateCats)
3395                 {
3396                 tiPL = pL + r * 16;
3397                 i = rState[c] + r * 20;
3398                 j = aState[c] + r * 20;
3399                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
3400                             *preLikeR[i++]*preLikeA[j++];
3401                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
3402                             *preLikeR[i++]*preLikeA[j++];
3403                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
3404                             *preLikeR[i++]*preLikeA[j++];
3405                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
3406                             *preLikeR[i++]*preLikeA[j++];
3407                 }
3408             else
3409                 h += 4;
3410             clL += 4;
3411             }
3412         break;
3413         }
3414 
3415     return NO_ERROR;
3416 }
3417 
3418 
3419 #if defined (FMA_ENABLED)
3420 /*----------------------------------------------------------------
3421  |
3422  |   CondLikeRoot_NUC4_FMA: 4by4 nucleotide model with or without rate
3423  |       variation using AVX + FMA instructions
3424  |
3425  -----------------------------------------------------------------*/
/* acc = ti[XA]*cl[A] + ti[XC]*cl[C] + ti[XG]*cl[G] + ti[XT]*cl[T], where each
   ti entry is broadcast to all lanes and the three additions are fused
   multiply-adds, accumulated in the order A, C, G, T. Arguments are
   evaluated more than once, so only plain identifiers should be passed. */
#define ROOT_DOT4_FMA(acc, ti, cl, XA, XC, XG, XT) \
    do  { \
        (acc) = _mm256_mul_ps   (_mm256_broadcast_ss (&(ti)[XA]), (cl)[A]); \
        (acc) = _mm256_fmadd_ps (_mm256_broadcast_ss (&(ti)[XC]), (cl)[C], (acc)); \
        (acc) = _mm256_fmadd_ps (_mm256_broadcast_ss (&(ti)[XG]), (cl)[G], (acc)); \
        (acc) = _mm256_fmadd_ps (_mm256_broadcast_ss (&(ti)[XT]), (cl)[T], (acc)); \
        } while (0)

/* Conditional likelihoods at node p from its left descendant, right
   descendant and ancestor, 4by4 nucleotide model, computed on __m256 vectors.

   p        : node whose conditional likelihoods are written
   division : index into modelSettings
   chain    : chain index used for all per-chain lookups

   For every rate category (m->numRateCats) and every vector group
   (m->numVecChars) four vectors are written, one per row of the transition
   matrices; the input arrays advance by four vectors per group and the
   matrices by 16 entries per rate category.

   Always returns NO_ERROR. */
int CondLikeRoot_NUC4_FMA (TreeNode *p, int division, int chain)
{
    int             cat, grp;
    CLFlt           *tiL, *tiR, *tiA;
    __m256          *clL, *clR, *clA, *clP;
    __m256          dotL, dotR, dotA;
    ModelInfo       *m = &modelSettings[division];

    /* flip state of node so that we are not overwriting old cond likes */
    FlipCondLikeSpace (m, chain, p->index);

    /* conditional likelihoods, viewed as arrays of vectors */
    clL = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
    clR = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
    clP = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
    clA = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];

    /* transition probabilities; the ancestor side uses the entry of p itself */
    tiL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
    tiR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
    tiA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];

    for (cat=0; cat<m->numRateCats; cat++, tiL+=16, tiR+=16, tiA+=16)
        {
        for (grp=0; grp<m->numVecChars; grp++, clL+=4, clR+=4, clA+=4, clP+=4)
            {
            /* first matrix row (AA..AT) */
            ROOT_DOT4_FMA (dotL, tiL, clL, AA, AC, AG, AT);
            ROOT_DOT4_FMA (dotR, tiR, clR, AA, AC, AG, AT);
            ROOT_DOT4_FMA (dotA, tiA, clA, AA, AC, AG, AT);
            clP[0] = _mm256_mul_ps (_mm256_mul_ps (dotL, dotR), dotA);

            /* second matrix row (CA..CT) */
            ROOT_DOT4_FMA (dotL, tiL, clL, CA, CC, CG, CT);
            ROOT_DOT4_FMA (dotR, tiR, clR, CA, CC, CG, CT);
            ROOT_DOT4_FMA (dotA, tiA, clA, CA, CC, CG, CT);
            clP[1] = _mm256_mul_ps (_mm256_mul_ps (dotL, dotR), dotA);

            /* third matrix row (GA..GT) */
            ROOT_DOT4_FMA (dotL, tiL, clL, GA, GC, GG, GT);
            ROOT_DOT4_FMA (dotR, tiR, clR, GA, GC, GG, GT);
            ROOT_DOT4_FMA (dotA, tiA, clA, GA, GC, GG, GT);
            clP[2] = _mm256_mul_ps (_mm256_mul_ps (dotL, dotR), dotA);

            /* fourth matrix row (TA..TT) */
            ROOT_DOT4_FMA (dotL, tiL, clL, TA, TC, TG, TT);
            ROOT_DOT4_FMA (dotR, tiR, clR, TA, TC, TG, TT);
            ROOT_DOT4_FMA (dotA, tiA, clA, TA, TC, TG, TT);
            clP[3] = _mm256_mul_ps (_mm256_mul_ps (dotL, dotR), dotA);
            }
        }

    return NO_ERROR;
}

#undef ROOT_DOT4_FMA
3592 #endif
3593 
3594 
3595 #if defined (AVX_ENABLED)
3596 /*----------------------------------------------------------------
3597  |
3598  |   CondLikeRoot_NUC4_AVX: 4by4 nucleotide model with or without rate
3599  |       variation using AVX instructions
3600  |
3601  -----------------------------------------------------------------*/
/* acc = ti[XA]*cl[A] + ti[XC]*cl[C] + ti[XG]*cl[G] + ti[XT]*cl[T], where each
   ti entry is broadcast to all lanes; products are formed separately and
   added to the running sum in the order A, C, G, T. Arguments are evaluated
   more than once, so only plain identifiers should be passed. */
#define ROOT_DOT4_AVX(acc, ti, cl, XA, XC, XG, XT) \
    do  { \
        (acc) = _mm256_mul_ps (_mm256_broadcast_ss (&(ti)[XA]), (cl)[A]); \
        (acc) = _mm256_add_ps (_mm256_mul_ps (_mm256_broadcast_ss (&(ti)[XC]), (cl)[C]), (acc)); \
        (acc) = _mm256_add_ps (_mm256_mul_ps (_mm256_broadcast_ss (&(ti)[XG]), (cl)[G]), (acc)); \
        (acc) = _mm256_add_ps (_mm256_mul_ps (_mm256_broadcast_ss (&(ti)[XT]), (cl)[T]), (acc)); \
        } while (0)

/* Conditional likelihoods at node p from its left descendant, right
   descendant and ancestor, 4by4 nucleotide model, computed on __m256 vectors
   with separate multiply and add instructions.

   p        : node whose conditional likelihoods are written
   division : index into modelSettings
   chain    : chain index used for all per-chain lookups

   For every rate category (m->numRateCats) and every vector group
   (m->numVecChars) four vectors are written, one per row of the transition
   matrices; the input arrays advance by four vectors per group and the
   matrices by 16 entries per rate category.

   Always returns NO_ERROR. */
int CondLikeRoot_NUC4_AVX (TreeNode *p, int division, int chain)
{
    int             cat, grp;
    CLFlt           *tiL, *tiR, *tiA;
    __m256          *clL, *clR, *clA, *clP;
    __m256          dotL, dotR, dotA;
    ModelInfo       *m = &modelSettings[division];

    /* flip state of node so that we are not overwriting old cond likes */
    FlipCondLikeSpace (m, chain, p->index);

    /* conditional likelihoods, viewed as arrays of vectors */
    clL = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
    clR = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
    clP = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
    clA = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];

    /* transition probabilities; the ancestor side uses the entry of p itself */
    tiL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
    tiR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
    tiA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];

    for (cat=0; cat<m->numRateCats; cat++, tiL+=16, tiR+=16, tiA+=16)
        {
        for (grp=0; grp<m->numVecChars; grp++, clL+=4, clR+=4, clA+=4, clP+=4)
            {
            /* first matrix row (AA..AT) */
            ROOT_DOT4_AVX (dotL, tiL, clL, AA, AC, AG, AT);
            ROOT_DOT4_AVX (dotR, tiR, clR, AA, AC, AG, AT);
            ROOT_DOT4_AVX (dotA, tiA, clA, AA, AC, AG, AT);
            clP[0] = _mm256_mul_ps (_mm256_mul_ps (dotL, dotR), dotA);

            /* second matrix row (CA..CT) */
            ROOT_DOT4_AVX (dotL, tiL, clL, CA, CC, CG, CT);
            ROOT_DOT4_AVX (dotR, tiR, clR, CA, CC, CG, CT);
            ROOT_DOT4_AVX (dotA, tiA, clA, CA, CC, CG, CT);
            clP[1] = _mm256_mul_ps (_mm256_mul_ps (dotL, dotR), dotA);

            /* third matrix row (GA..GT) */
            ROOT_DOT4_AVX (dotL, tiL, clL, GA, GC, GG, GT);
            ROOT_DOT4_AVX (dotR, tiR, clR, GA, GC, GG, GT);
            ROOT_DOT4_AVX (dotA, tiA, clA, GA, GC, GG, GT);
            clP[2] = _mm256_mul_ps (_mm256_mul_ps (dotL, dotR), dotA);

            /* fourth matrix row (TA..TT) */
            ROOT_DOT4_AVX (dotL, tiL, clL, TA, TC, TG, TT);
            ROOT_DOT4_AVX (dotR, tiR, clR, TA, TC, TG, TT);
            ROOT_DOT4_AVX (dotA, tiA, clA, TA, TC, TG, TT);
            clP[3] = _mm256_mul_ps (_mm256_mul_ps (dotL, dotR), dotA);
            }
        }

    return NO_ERROR;
}

#undef ROOT_DOT4_AVX
3804 #endif
3805 
3806 
3807 #if defined (SSE_ENABLED)
3808 /*----------------------------------------------------------------
3809 |
3810 |   CondLikeRoot_NUC4_SSE: 4by4 nucleotide model with or without rate
3811 |       variation using SSE instructions
3812 |
3813 -----------------------------------------------------------------*/
CondLikeRoot_NUC4_SSE(TreeNode * p,int division,int chain)3814 int CondLikeRoot_NUC4_SSE (TreeNode *p, int division, int chain)
3815 {
3816     int             c, k;
3817     CLFlt           *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
3818     __m128          *clL, *clR, *clP, *clA;
3819     __m128          m1, m2, m3, m4, m5, m6, m7, m8, m9;
3820     ModelInfo       *m;
3821 
3822     m = &modelSettings[division];
3823 
3824     /* flip state of node so that we are not overwriting old cond likes */
3825     FlipCondLikeSpace (m, chain, p->index);
3826 
3827     /* find conditional likelihood pointers */
3828     clL = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
3829     clR = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
3830     clP = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
3831     clA = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
3832 
3833     /* find transition probabilities */
3834     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
3835     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
3836     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
3837 
3838     tiPL = pL;
3839     tiPR = pR;
3840     tiPA = pA;
3841     for (k=0; k<m->numRateCats; k++)
3842         {
3843         for (c=0; c<m->numVecChars; c++)
3844             {
3845             m1 = _mm_load1_ps (&tiPL[AA]);
3846             m2 = _mm_load1_ps (&tiPR[AA]);
3847             m3 = _mm_load1_ps (&tiPA[AA]);
3848             m7 = _mm_mul_ps (m1, clL[A]);
3849             m8 = _mm_mul_ps (m2, clR[A]);
3850             m9 = _mm_mul_ps (m3, clA[A]);
3851 
3852             m1 = _mm_load1_ps (&tiPL[AC]);
3853             m2 = _mm_load1_ps (&tiPR[AC]);
3854             m3 = _mm_load1_ps (&tiPA[AC]);
3855             m4 = _mm_mul_ps (m1, clL[C]);
3856             m5 = _mm_mul_ps (m2, clR[C]);
3857             m6 = _mm_mul_ps (m3, clA[C]);
3858             m7 = _mm_add_ps (m4, m7);
3859             m8 = _mm_add_ps (m5, m8);
3860             m9 = _mm_add_ps (m6, m9);
3861 
3862             m1 = _mm_load1_ps (&tiPL[AG]);
3863             m2 = _mm_load1_ps (&tiPR[AG]);
3864             m3 = _mm_load1_ps (&tiPA[AG]);
3865             m4 = _mm_mul_ps (m1, clL[G]);
3866             m5 = _mm_mul_ps (m2, clR[G]);
3867             m6 = _mm_mul_ps (m3, clA[G]);
3868             m7 = _mm_add_ps (m4, m7);
3869             m8 = _mm_add_ps (m5, m8);
3870             m9 = _mm_add_ps (m6, m9);
3871 
3872             m1 = _mm_load1_ps (&tiPL[AT]);
3873             m2 = _mm_load1_ps (&tiPR[AT]);
3874             m3 = _mm_load1_ps (&tiPA[AT]);
3875             m4 = _mm_mul_ps (m1, clL[T]);
3876             m5 = _mm_mul_ps (m2, clR[T]);
3877             m6 = _mm_mul_ps (m3, clA[T]);
3878             m7 = _mm_add_ps (m4, m7);
3879             m8 = _mm_add_ps (m5, m8);
3880             m9 = _mm_add_ps (m6, m9);
3881 
3882             m7 = _mm_mul_ps (m7, m8);
3883             *clP++ = _mm_mul_ps (m7, m9);
3884 
3885             m1 = _mm_load1_ps (&tiPL[CA]);
3886             m2 = _mm_load1_ps (&tiPR[CA]);
3887             m3 = _mm_load1_ps (&tiPA[CA]);
3888             m7 = _mm_mul_ps (m1, clL[A]);
3889             m8 = _mm_mul_ps (m2, clR[A]);
3890             m9 = _mm_mul_ps (m3, clA[A]);
3891 
3892             m1 = _mm_load1_ps (&tiPL[CC]);
3893             m2 = _mm_load1_ps (&tiPR[CC]);
3894             m3 = _mm_load1_ps (&tiPA[CC]);
3895             m4 = _mm_mul_ps (m1, clL[C]);
3896             m5 = _mm_mul_ps (m2, clR[C]);
3897             m6 = _mm_mul_ps (m3, clA[C]);
3898             m7 = _mm_add_ps (m4, m7);
3899             m8 = _mm_add_ps (m5, m8);
3900             m9 = _mm_add_ps (m6, m9);
3901 
3902             m1 = _mm_load1_ps (&tiPL[CG]);
3903             m2 = _mm_load1_ps (&tiPR[CG]);
3904             m3 = _mm_load1_ps (&tiPA[CG]);
3905             m4 = _mm_mul_ps (m1, clL[G]);
3906             m5 = _mm_mul_ps (m2, clR[G]);
3907             m6 = _mm_mul_ps (m3, clA[G]);
3908             m7 = _mm_add_ps (m4, m7);
3909             m8 = _mm_add_ps (m5, m8);
3910             m9 = _mm_add_ps (m6, m9);
3911 
3912             m1 = _mm_load1_ps (&tiPL[CT]);
3913             m2 = _mm_load1_ps (&tiPR[CT]);
3914             m3 = _mm_load1_ps (&tiPA[CT]);
3915             m4 = _mm_mul_ps (m1, clL[T]);
3916             m5 = _mm_mul_ps (m2, clR[T]);
3917             m6 = _mm_mul_ps (m3, clA[T]);
3918             m7 = _mm_add_ps (m4, m7);
3919             m8 = _mm_add_ps (m5, m8);
3920             m9 = _mm_add_ps (m6, m9);
3921 
3922             m7 = _mm_mul_ps (m7, m8);
3923             *clP++ = _mm_mul_ps (m7, m9);
3924 
3925             m1 = _mm_load1_ps (&tiPL[GA]);
3926             m2 = _mm_load1_ps (&tiPR[GA]);
3927             m3 = _mm_load1_ps (&tiPA[GA]);
3928             m7 = _mm_mul_ps (m1, clL[A]);
3929             m8 = _mm_mul_ps (m2, clR[A]);
3930             m9 = _mm_mul_ps (m3, clA[A]);
3931 
3932             m1 = _mm_load1_ps (&tiPL[GC]);
3933             m2 = _mm_load1_ps (&tiPR[GC]);
3934             m3 = _mm_load1_ps (&tiPA[GC]);
3935             m4 = _mm_mul_ps (m1, clL[C]);
3936             m5 = _mm_mul_ps (m2, clR[C]);
3937             m6 = _mm_mul_ps (m3, clA[C]);
3938             m7 = _mm_add_ps (m4, m7);
3939             m8 = _mm_add_ps (m5, m8);
3940             m9 = _mm_add_ps (m6, m9);
3941 
3942             m1 = _mm_load1_ps (&tiPL[GG]);
3943             m2 = _mm_load1_ps (&tiPR[GG]);
3944             m3 = _mm_load1_ps (&tiPA[GG]);
3945             m4 = _mm_mul_ps (m1, clL[G]);
3946             m5 = _mm_mul_ps (m2, clR[G]);
3947             m6 = _mm_mul_ps (m3, clA[G]);
3948             m7 = _mm_add_ps (m4, m7);
3949             m8 = _mm_add_ps (m5, m8);
3950             m9 = _mm_add_ps (m6, m9);
3951 
3952             m1 = _mm_load1_ps (&tiPL[GT]);
3953             m2 = _mm_load1_ps (&tiPR[GT]);
3954             m3 = _mm_load1_ps (&tiPA[GT]);
3955             m4 = _mm_mul_ps (m1, clL[T]);
3956             m5 = _mm_mul_ps (m2, clR[T]);
3957             m6 = _mm_mul_ps (m3, clA[T]);
3958             m7 = _mm_add_ps (m4, m7);
3959             m8 = _mm_add_ps (m5, m8);
3960             m9 = _mm_add_ps (m6, m9);
3961 
3962             m7 = _mm_mul_ps (m7, m8);
3963             *clP++ = _mm_mul_ps (m7, m9);
3964 
3965             m1 = _mm_load1_ps (&tiPL[TA]);
3966             m2 = _mm_load1_ps (&tiPR[TA]);
3967             m3 = _mm_load1_ps (&tiPA[TA]);
3968             m7 = _mm_mul_ps (m1, clL[A]);
3969             m8 = _mm_mul_ps (m2, clR[A]);
3970             m9 = _mm_mul_ps (m3, clA[A]);
3971 
3972             m1 = _mm_load1_ps (&tiPL[TC]);
3973             m2 = _mm_load1_ps (&tiPR[TC]);
3974             m3 = _mm_load1_ps (&tiPA[TC]);
3975             m4 = _mm_mul_ps (m1, clL[C]);
3976             m5 = _mm_mul_ps (m2, clR[C]);
3977             m6 = _mm_mul_ps (m3, clA[C]);
3978             m7 = _mm_add_ps (m4, m7);
3979             m8 = _mm_add_ps (m5, m8);
3980             m9 = _mm_add_ps (m6, m9);
3981 
3982             m1 = _mm_load1_ps (&tiPL[TG]);
3983             m2 = _mm_load1_ps (&tiPR[TG]);
3984             m3 = _mm_load1_ps (&tiPA[TG]);
3985             m4 = _mm_mul_ps (m1, clL[G]);
3986             m5 = _mm_mul_ps (m2, clR[G]);
3987             m6 = _mm_mul_ps (m3, clA[G]);
3988             m7 = _mm_add_ps (m4, m7);
3989             m8 = _mm_add_ps (m5, m8);
3990             m9 = _mm_add_ps (m6, m9);
3991 
3992             m1 = _mm_load1_ps (&tiPL[TT]);
3993             m2 = _mm_load1_ps (&tiPR[TT]);
3994             m3 = _mm_load1_ps (&tiPA[TT]);
3995             m4 = _mm_mul_ps (m1, clL[T]);
3996             m5 = _mm_mul_ps (m2, clR[T]);
3997             m6 = _mm_mul_ps (m3, clA[T]);
3998             m7 = _mm_add_ps (m4, m7);
3999             m8 = _mm_add_ps (m5, m8);
4000             m9 = _mm_add_ps (m6, m9);
4001 
4002             m7 = _mm_mul_ps (m7, m8);
4003             *clP++ = _mm_mul_ps (m7, m9);
4004 
4005             clL += 4;
4006             clR += 4;
4007             clA += 4;
4008             }
4009         tiPL += 16;
4010         tiPR += 16;
4011         tiPA += 16;
4012         }
4013 
4014     return NO_ERROR;
4015 }
4016 #endif
4017 
4018 
4019 #if !defined (SSE_ENABLED) || 1
4020 /*----------------------------------------------------------------
4021 |
4022 |   CondLikeRoot_NY98: codon model with omega variation
4023 |
4024 -----------------------------------------------------------------*/
CondLikeRoot_NY98(TreeNode * p,int division,int chain)4025 int CondLikeRoot_NY98 (TreeNode *p, int division, int chain)
4026 {
4027     int             a, b, c, d, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL,
4028                     nStates, nStatesSquared;
4029     CLFlt           likeL, likeR, likeA, *clL, *clR, *clP, *clA, *pL, *pR, *pA,
4030                     *tiPL, *tiPR, *tiPA;
4031     ModelInfo       *m;
4032 
4033     /* find model settings for this division and nStates, nStatesSquared */
4034     m = &modelSettings[division];
4035     nStates = m->numModelStates;
4036     nStatesSquared = nStates * nStates;
4037 
4038     /* flip state of node so that we are not overwriting old cond likes */
4039     FlipCondLikeSpace (m, chain, p->index);
4040 
4041     /* find conditional likelihood pointers */
4042     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
4043     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
4044     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
4045     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
4046 
4047     /* find transition probabilities (or calculate instead) */
4048     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
4049     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
4050     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
4051 
4052     /* find likelihoods of site patterns for left branch if terminal */
4053     shortCut = 0;
4054 #   if !defined (DEBUG_NOSHORTCUTS)
4055     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
4056         {
4057         shortCut |= 1;
4058         lState = m->termState[p->left->index];
4059         tiPL = pL;
4060         for (k=a=0; k<m->numOmegaCats; k++)
4061             {
4062             for (i=0; i<nStates; i++)
4063                 for (j=i; j<nStatesSquared; j+=nStates)
4064                     preLikeL[a++] = tiPL[j];
4065             /* for ambiguous */
4066             for (i=0; i<nStates; i++)
4067                 preLikeL[a++] = 1.0;
4068             tiPL += nStatesSquared;
4069             }
4070         }
4071 
4072     /* find likelihoods of site patterns for right branch if terminal */
4073     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
4074         {
4075         shortCut |= 2;
4076         rState = m->termState[p->right->index];
4077         tiPR = pR;
4078         for (k=a=0; k<m->numOmegaCats; k++)
4079             {
4080             for (i=0; i<nStates; i++)
4081                 for (j=i; j<nStatesSquared; j+=nStates)
4082                     preLikeR[a++] = tiPR[j];
4083             /* for ambiguous */
4084             for (i=0; i<nStates; i++)
4085                 preLikeR[a++] = 1.0;
4086             tiPR += nStatesSquared;
4087             }
4088         }
4089 
4090     /* find likelihoods of site patterns for anc branch, always terminal */
4091     if (m->isPartAmbig[p->anc->index] == YES)
4092         {
4093         shortCut = 4;
4094         }
4095     else
4096         {
4097         aState = m->termState[p->anc->index];
4098         tiPA = pA;
4099         for (k=a=0; k<m->numOmegaCats; k++)
4100             {
4101             for (i=0; i<nStates; i++)
4102                 for (j=i; j<nStatesSquared; j+=nStates)
4103                     preLikeA[a++] = tiPA[j];
4104             /* for ambiguous */
4105             for (i=0; i<nStates; i++)
4106                 preLikeA[a++] = 1.0;
4107             tiPA += nStatesSquared;
4108             }
4109         }
4110 #   else
4111     shortCut = 4;
4112 #   endif
4113 
4114         switch (shortCut)
4115         {
4116         case 4:
4117             tiPL = pL;
4118             tiPR = pR;
4119             tiPA = pA;
4120             for (k=0; k<m->numOmegaCats; k++)
4121                 {
4122                 for (c=0; c<m->numChars; c++)
4123                     {
4124                     for (i=h=0; i<nStates; i++)
4125                         {
4126                         likeL = likeR = likeA = 0.0;
4127                         for (j=0; j<nStates; j++)
4128                             {
4129                             likeA += tiPA[h]*clA[j];
4130                             likeL += tiPL[h]*clL[j];
4131                             likeR += tiPR[h++]*clR[j];
4132                             }
4133                         *(clP++) = likeL * likeR * likeA;
4134                         }
4135                     clL += nStates;
4136                     clR += nStates;
4137                     clA += nStates;
4138                     }
4139                 tiPL += nStatesSquared;
4140                 tiPR += nStatesSquared;
4141                 tiPA += nStatesSquared;
4142                 }
4143             break;
4144         case 0:
4145             tiPR = pR;
4146             tiPL = pL;
4147             for (k=0; k<m->numOmegaCats; k++)
4148                 {
4149                 for (c=0; c<m->numChars; c++)
4150                     {
4151                     b = aState[c] + k*(nStatesSquared+nStates);
4152                     for (i=h=0; i<nStates; i++)
4153                         {
4154                         likeR = likeL = 0.0;
4155                         for (j=0; j<nStates; j++)
4156                             {
4157                             likeR += tiPR[h]*clR[j];
4158                             likeL += tiPL[h++]*clL[j];
4159                             }
4160                         *(clP++) =  preLikeA[b++] * likeL * likeR;
4161                         }
4162                     clR += nStates;
4163                     clL += nStates;
4164                     }
4165                 tiPR += nStatesSquared;
4166                 tiPL += nStatesSquared;
4167                 }
4168             break;
4169         case 1:
4170             tiPR = pR;
4171             for (k=0; k<m->numOmegaCats; k++)
4172                 {
4173                 for (c=0; c<m->numChars; c++)
4174                     {
4175                     a = lState[c] + k*(nStatesSquared+nStates);
4176                     b = aState[c] + k*(nStatesSquared+nStates);
4177                     for (i=h=0; i<nStates; i++)
4178                         {
4179                         likeR = 0.0;
4180                         for (j=0; j<nStates; j++)
4181                             {
4182                             likeR += tiPR[h++]*clR[j];
4183                             }
4184                         *(clP++) = preLikeL[a++] * preLikeA[b++] * likeR;
4185                         }
4186                     clR += nStates;
4187                     }
4188                 tiPR += nStatesSquared;
4189                 }
4190             break;
4191         case 2:
4192             tiPL = pL;
4193             for (k=0; k<m->numOmegaCats; k++)
4194                 {
4195                 for (c=0; c<m->numChars; c++)
4196                     {
4197                     a = rState[c] + k*(nStatesSquared+nStates);
4198                     b = aState[c] + k*(nStatesSquared+nStates);
4199                     for (i=h=0; i<nStates; i++)
4200                         {
4201                         likeL = 0.0;
4202                         for (j=0; j<nStates; j++)
4203                             {
4204                             likeL += tiPL[h++]*clL[j];
4205                             }
4206                         *(clP++) = preLikeR[a++] * preLikeA[b++] * likeL;
4207                         }
4208                     clL += nStates;
4209                     }
4210                 tiPL += nStatesSquared;
4211                 }
4212             break;
4213         case 3:
4214             for (k=0; k<m->numOmegaCats; k++)
4215                 {
4216                 for (c=0; c<m->numChars; c++)
4217                     {
4218                     a = rState[c] + k*(nStatesSquared+nStates);
4219                     b = lState[c] + k*(nStatesSquared+nStates);
4220                     d = aState[c] + k*(nStatesSquared+nStates);
4221                     for (i=0; i<nStates; i++)
4222                         {
4223                         *(clP++) = preLikeR[a++] * preLikeL[b++] * preLikeA[d++];
4224                         }
4225                     }
4226                 }
4227             break;
4228         }
4229 
4230     return NO_ERROR;
4231 }
4232 #endif
4233 
4234 
4235 #if defined (SSE_ENABLED)
4236 /*----------------------------------------------------------------
4237 |
4238 |   CondLikeRoot_NY98_SSE: codon model with omega variation
4239 |
4240 -----------------------------------------------------------------*/
CondLikeRoot_NY98_SSE(TreeNode * p,int division,int chain)4241 int CondLikeRoot_NY98_SSE (TreeNode *p, int division, int chain)
4242 {
4243     int             c, c1, t, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL,
4244                     nStates, nStatesSquared;
4245     CLFlt           *pL, *pR, *pA,
4246                     *tiPL, *tiPR, *tiPA;
4247     __m128          *clL, *clR, *clP, *clA;
4248     __m128          mTiPL, mTiPR, mTiPA, mL, mR, mA, mAcumL, mAcumR, mAcumA;
4249     ModelInfo       *m;
4250     CLFlt           *preLikeRV[4] = {0};
4251     CLFlt           *preLikeLV[4] = {0};
4252     CLFlt           *preLikeAV[4] = {0};
4253 
4254 #   if !defined (DEBUG_NOSHORTCUTS)
4255     int             a;
4256 
4257 #   endif
4258 
4259     /* find model settings for this division and nStates, nStatesSquared */
4260     m = &modelSettings[division];
4261     nStates = m->numModelStates;
4262     nStatesSquared = nStates * nStates;
4263 
4264     /* flip state of node so that we are not overwriting old cond likes */
4265     FlipCondLikeSpace (m, chain, p->index);
4266 
4267     /* find conditional likelihood pointers */
4268     clL = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->left->index ]];
4269     clR = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->right->index]];
4270     clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index       ]];
4271     clA = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
4272 
4273     /* find transition probabilities (or calculate instead) */
4274     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
4275     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
4276     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
4277 
4278     /* find likelihoods of site patterns for left branch if terminal */
4279     shortCut = 0;
4280 #   if !defined (DEBUG_NOSHORTCUTS)
4281     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
4282         {
4283         shortCut |= 1;
4284         lState = m->termState[p->left->index];
4285         tiPL = pL;
4286         for (k=a=0; k<m->numOmegaCats; k++)
4287             {
4288             for (i=0; i<nStates; i++)
4289                 for (j=i; j<nStatesSquared; j+=nStates)
4290                     preLikeL[a++] = tiPL[j];
4291             /* for ambiguous */
4292             for (i=0; i<nStates; i++)
4293                 preLikeL[a++] = 1.0;
4294             tiPL += nStatesSquared;
4295             }
4296         }
4297 
4298     /* find likelihoods of site patterns for right branch if terminal */
4299     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
4300         {
4301         shortCut |= 2;
4302         rState = m->termState[p->right->index];
4303         tiPR = pR;
4304         for (k=a=0; k<m->numOmegaCats; k++)
4305             {
4306             for (i=0; i<nStates; i++)
4307                 for (j=i; j<nStatesSquared; j+=nStates)
4308                     preLikeR[a++] = tiPR[j];
4309             /* for ambiguous */
4310             for (i=0; i<nStates; i++)
4311                 preLikeR[a++] = 1.0;
4312             tiPR += nStatesSquared;
4313             }
4314         }
4315 
4316     /* find likelihoods of site patterns for anc branch, always terminal */
4317     if (m->isPartAmbig[p->anc->index] == YES)
4318         {
4319         shortCut = 4;
4320         }
4321     else
4322         {
4323         aState = m->termState[p->anc->index];
4324         tiPA = pA;
4325         for (k=a=0; k<m->numOmegaCats; k++)
4326             {
4327             for (i=0; i<nStates; i++)
4328                 for (j=i; j<nStatesSquared; j+=nStates)
4329                     preLikeA[a++] = tiPA[j];
4330             /* for ambiguous */
4331             for (i=0; i<nStates; i++)
4332                 preLikeA[a++] = 1.0;
4333             tiPA += nStatesSquared;
4334             }
4335         }
4336 #   else
4337     shortCut = 4;
4338 #   endif
4339         switch (shortCut)
4340         {
4341         case 4:
4342             tiPL = pL;
4343             tiPR = pR;
4344             tiPA = pA;
4345             for (k=0; k<m->numOmegaCats; k++)
4346                 {
4347                 for (c=0; c<m->numVecChars; c++)
4348                     {
4349                     for (i=h=0; i<nStates; i++)
4350                         {
4351                         mAcumL = _mm_setzero_ps();
4352                         mAcumR = _mm_setzero_ps();
4353                         mAcumA = _mm_setzero_ps();
4354                         for (j=0; j<nStates; j++)
4355                             {
4356                             mTiPL  = _mm_load1_ps (&tiPL[h]);
4357                             mTiPR  = _mm_load1_ps (&tiPR[h]);
4358                             mTiPA  = _mm_load1_ps (&tiPA[h++]);
4359                             mL     = _mm_mul_ps (mTiPL, clL[j]);
4360                             mR     = _mm_mul_ps (mTiPR, clR[j]);
4361                             mA     = _mm_mul_ps (mTiPA, clA[j]);
4362                             mAcumL = _mm_add_ps (mL, mAcumL);
4363                             mAcumR = _mm_add_ps (mR, mAcumR);
4364                             mAcumA = _mm_add_ps (mA, mAcumA);
4365                             }
4366                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
4367                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
4368                         }
4369                     clL += nStates;
4370                     clR += nStates;
4371                     clA += nStates;
4372                     }
4373                 tiPL += nStatesSquared;
4374                 tiPR += nStatesSquared;
4375                 tiPA += nStatesSquared;
4376                 }
4377             break;
4378         case 0:
4379             tiPL =pL;
4380             tiPR =pR;
4381             for (k=0; k<m->numOmegaCats; k++)
4382                 {
4383                 for (c=t=0; c<m->numVecChars; c++)
4384                     {
4385                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
4386                         {
4387                         preLikeAV[c1] = &preLikeA[aState[t] + k*(nStatesSquared+nStates)];
4388                         }
4389                     for (i=h=0; i<nStates; i++)
4390                         {
4391                         assert (m->numFloatsPerVec == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
4392                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
4393                         mAcumL = _mm_setzero_ps();
4394                         mAcumR = _mm_setzero_ps();
4395                         for (j=0; j<nStates; j++)
4396                             {
4397                             mTiPL  = _mm_load1_ps (&tiPL[h]);
4398                             mL     = _mm_mul_ps (mTiPL, clL[j]);
4399                             mAcumL = _mm_add_ps (mL, mAcumL);
4400                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
4401                             mR     = _mm_mul_ps (mTiPR, clR[j]);
4402                             mAcumR = _mm_add_ps (mR, mAcumR);
4403                             }
4404                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
4405                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
4406                         }
4407                     clR += nStates;
4408                     clL += nStates;
4409                     }
4410                 tiPL += nStatesSquared;
4411                 tiPR += nStatesSquared;
4412                 }
4413             break;
4414         case 1:
4415             tiPR = pR;
4416             for (k=0; k<m->numOmegaCats; k++)
4417                 {
4418                 for (c=t=0; c<m->numVecChars; c++)
4419                     {
4420                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
4421                         {
4422                         preLikeLV[c1] = &preLikeL[lState[t] + k*(nStatesSquared+nStates)];
4423                         preLikeAV[c1] = &preLikeA[aState[t] + k*(nStatesSquared+nStates)];
4424                         }
4425                     for (i=h=0; i<nStates; i++)
4426                         {
4427                         assert (m->numFloatsPerVec == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
4428                         mAcumL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
4429                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
4430                         mAcumR = _mm_setzero_ps();
4431                         for (j=0; j<nStates; j++)
4432                             {
4433                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
4434                             mR     = _mm_mul_ps (mTiPR, clR[j]);
4435                             mAcumR = _mm_add_ps (mR, mAcumR);
4436                             }
4437                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
4438                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
4439                         }
4440                     clR += nStates;
4441                     }
4442                 tiPR += nStatesSquared;
4443                 }
4444             break;
4445         case 2:
4446             tiPL = pL;
4447             for (k=0; k<m->numOmegaCats; k++)
4448                 {
4449                 for (c=t=0; c<m->numVecChars; c++)
4450                     {
4451                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
4452                         {
4453                         preLikeRV[c1] = &preLikeR[rState[t] + k*(nStatesSquared+nStates)];
4454                         preLikeAV[c1] = &preLikeA[aState[t] + k*(nStatesSquared+nStates)];
4455                         }
4456                     for (i=h=0; i<nStates; i++)
4457                         {
4458                         assert (m->numFloatsPerVec == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
4459                         mAcumR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
4460                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
4461                         mAcumL = _mm_setzero_ps();
4462                         for (j=0; j<nStates; j++)
4463                             {
4464                             mTiPL  = _mm_load1_ps (&tiPL[h++]);
4465                             mL     = _mm_mul_ps (mTiPL, clL[j]);
4466                             mAcumL = _mm_add_ps (mL, mAcumL);
4467                             }
4468                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
4469                         *(clP++) = _mm_mul_ps (mAcumL,mAcumA);
4470                         }
4471                     clL += nStates;
4472                     }
4473                 tiPL += nStatesSquared;
4474                 }
4475             break;
4476         case 3:
4477             for (k=0; k<m->numOmegaCats; k++)
4478                 {
4479                 for (c=t=0; c<m->numVecChars; c++)
4480                     {
4481                     for (c1=0; c1<m->numFloatsPerVec; c1++,t++)
4482                         {
4483                         preLikeRV[c1] = &preLikeR[rState[t] + k*(nStatesSquared+nStates)];
4484                         preLikeLV[c1] = &preLikeL[lState[t] + k*(nStatesSquared+nStates)];
4485                         preLikeAV[c1] = &preLikeA[aState[t] + k*(nStatesSquared+nStates)];
4486                         }
4487                     for (i=0; i<nStates; i++)
4488                         {
4489                         assert (m->numFloatsPerVec == 4); /* In the following 2 statments we assume that SSE register can hold exactly 4 ClFlts. */
4490                         mL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
4491                         mR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
4492                         mA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
4493                         mL = _mm_mul_ps (mL,mR);
4494                         *(clP++) = _mm_mul_ps (mL,mA);
4495                         }
4496                     }
4497                 }
4498             break;
4499         }
4500 
4501     return NO_ERROR;
4502 }
4503 #endif
4504 
4505 
4506 /*----------------------------------------------------------------
4507 |
4508 |   CondLikeRoot_Std: variable number of states model
4509 |       with or without rate variation
4510 |
4511 -----------------------------------------------------------------*/
CondLikeRoot_Std(TreeNode * p,int division,int chain)4512 int CondLikeRoot_Std (TreeNode *p, int division, int chain)
4513 {
4514     int             a, c, h, i, j, k, nStates=0, nCats=0, tmp;
4515     CLFlt           *clL, *clR, *clP, *clA, *pL, *pR, *pA, *tiPL, *tiPR, *tiPA,
4516                     likeL, likeR, likeA;
4517     ModelInfo       *m;
4518 
4519     m = &modelSettings[division];
4520 
4521     /* flip state of node so that we are not overwriting old cond likes */
4522     FlipCondLikeSpace (m, chain, p->index);
4523 
4524     /* find conditional likelihood pointers */
4525     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
4526     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
4527     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
4528     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
4529 
4530     /* find transition probabilities (or calculate instead) */
4531     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
4532     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
4533     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
4534 
4535     /* calculate ancestral probabilities */
4536     for (k=h=0; k<m->numRateCats; k++)
4537         {
4538         /* calculate ancestral probabilities */
4539         for (c=0; c<m->numChars; c++)
4540             {
4541             nStates = m->nStates[c];
4542 
4543             /* the following lines ensure that nCats is 1 unless */
4544             /* the character is binary and beta categories are used  */
4545             if (nStates == 2)
4546                 nCats = m->numBetaCats;
4547             else
4548                 nCats = 1;
4549 
4550             tmp = k*nStates*nStates; /* tmp contains offset to skip gamma cats that already processed*/
4551             tiPL = pL + m->tiIndex[c] + tmp;
4552             tiPR = pR + m->tiIndex[c] + tmp;
4553             tiPA = pA + m->tiIndex[c] + tmp;
4554             tmp = (m->numRateCats-1)*2*2; /* tmp contains size of block of tpi matrices across all rate cats (minus one) for single beta category. Further used only if character is binary to jump to next beta category */
4555 
4556             for (j=0; j<nCats;j++)
4557                 {
4558                 for (a=0; a<nStates; a++)
4559                     {
4560                     likeL = likeR = likeA = 0.0;
4561                     for (i=0; i<nStates; i++)
4562                         {
4563                         likeL += *(tiPL++) * clL[i];
4564                         likeR += *(tiPR++) * clR[i];
4565                         likeA += *(tiPA++) * clA[i];
4566                         }
4567                     clP[h++] = likeL * likeR * likeA;
4568                     }
4569                 clL += nStates;
4570                 clR += nStates;
4571                 clA += nStates;
4572 
4573                 tiPL += tmp;
4574                 tiPR += tmp;
4575                 tiPA += tmp;
4576                 }
4577             }
4578         }
4579 
4580     return NO_ERROR;
4581 }
4582 
4583 
4584 /*----------------------------------------------------------------
4585 |
4586 |   CondLikeUp_Bin: pull likelihoods up and calculate scaled
4587 |       finals, binary model with or without rate variation
4588 |
4589 -----------------------------------------------------------------*/
CondLikeUp_Bin(TreeNode * p,int division,int chain)4590 int CondLikeUp_Bin (TreeNode *p, int division, int chain)
4591 {
4592     int             c, k;
4593     CLFlt           *clFA, *clFP, *clDP, *tiP, condLikeUp[2], sum[2];
4594     ModelInfo       *m;
4595 
4596     /* find model settings for this division */
4597     m = &modelSettings[division];
4598 
4599     if (p->anc->anc == NULL)
4600         {
4601         /* this is the root node */
4602         /* find conditional likelihood pointers = down cond likes */
4603         /* use conditional likelihood scratch space for final cond likes */
4604         clDP = m->condLikes[m->condLikeIndex[chain][p->index]];
4605         clFP = m->condLikes[m->condLikeScratchIndex[p->index]];
4606 
4607         for (k=0; k<m->numRateCats; k++)
4608             {
4609             for (c=0; c<m->numChars; c++)
4610                 {
4611                 *(clFP++) = *(clDP++);
4612                 *(clFP++) = *(clDP++);
4613                 }
4614             }
4615         }
4616     else
4617         {
4618         /* find conditional likelihood pointers */
4619         /* use conditional likelihood scratch space for final cond likes */
4620         clFA = m->condLikes[m->condLikeScratchIndex[p->anc->index]];
4621         clFP = m->condLikes[m->condLikeScratchIndex[p->index     ]];
4622         clDP = m->condLikes[m->condLikeIndex[chain][p->index     ]];
4623 
4624         /* find transition probabilities */
4625         tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
4626 
4627         for (k=0; k<m->numRateCats; k++)
4628             {
4629             for (c=0; c<m->numChars; c++)
4630                 {
4631                 condLikeUp[0] = condLikeUp[1] = 0.0;
4632 
4633                 sum[0] = tiP[0]*clDP[0] + tiP[1]*clDP[1];
4634                 sum[1] = tiP[2]*clDP[0] + tiP[3]*clDP[1];
4635 
4636                 if (sum[0] != 0.0) condLikeUp[0] = clFA[0] / sum[0];
4637                 if (sum[1] != 0.0) condLikeUp[1] = clFA[1] / sum[1];
4638 
4639                 *(clFP++) = (condLikeUp[0]*tiP[0] + condLikeUp[1]*tiP[1])*clDP[0];
4640                 *(clFP++) = (condLikeUp[0]*tiP[2] + condLikeUp[1]*tiP[3])*clDP[1];
4641 
4642                 clFA += 2;
4643                 clDP += 2;
4644                 }
4645             tiP += 4;
4646             }
4647         }
4648 
4649     return NO_ERROR;
4650 }
4651 
4652 
4653 /*----------------------------------------------------------------
4654 |
4655 |   CondLikeUp_Gen: pull likelihoods up and calculate scaled
4656 |       finals for an interior node
4657 |
4658 -----------------------------------------------------------------*/
CondLikeUp_Gen(TreeNode * p,int division,int chain)4659 int CondLikeUp_Gen (TreeNode *p, int division, int chain)
4660 {
4661     int             a, c, i, j, k, nStates, nStatesSquared, nRateCats;
4662     CLFlt           *clFA, *clFP, *clDP, *tiP, *condLikeUp, sum;
4663     ModelInfo       *m;
4664 
4665     /* find model settings for this division */
4666     m = &modelSettings[division];
4667 
4668     /* find number of states in the model */
4669     nStates = m->numModelStates;
4670     nStatesSquared = nStates * nStates;
4671 
4672     /* find number of gamma cats */
4673     nRateCats = m->numRateCats;
4674     if (m->gibbsGamma == YES)
4675         nRateCats = 1;
4676 
4677     /* use preallocated scratch space */
4678     condLikeUp = m->ancStateCondLikes;
4679 
4680     /* calculate final states */
4681     if (p->anc->anc == NULL)
4682         {
4683         /* this is the root node */
4684         /* find conditional likelihood pointers = down cond likes */
4685         /* use conditional likelihood scratch space for final cond likes */
4686         clDP = m->condLikes[m->condLikeIndex[chain][p->index]];
4687         clFP = m->condLikes[m->condLikeScratchIndex[p->index]];
4688 
4689         /* final cond likes = downpass cond likes */
4690         for (k=0; k<nRateCats; k++)
4691             {
4692             /* copy cond likes */
4693             for (c=0; c<m->numChars*nStates; c++)
4694                 *(clFP++) = *(clDP++);
4695             }
4696         }
4697     else
4698         {
4699         /* find conditional likelihood pointers */
4700         /* use conditional likelihood scratch space for final cond likes */
4701         clFA = m->condLikes[m->condLikeScratchIndex[p->anc->index]];
4702         clFP = m->condLikes[m->condLikeScratchIndex[p->index     ]];
4703         clDP = m->condLikes[m->condLikeIndex[chain][p->index     ]];
4704 
4705         /* find transition probabilities */
4706         tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
4707 
4708         for (k=0; k<nRateCats; k++)
4709             {
4710             for (c=0; c<m->numChars; c++)
4711                 {
4712                 for (a=j=0; a<nStates; a++)
4713                     {
4714                     sum = 0.0;
4715                     for (i=0; i<nStates; i++)
4716                         sum += tiP[j++]*clDP[i];
4717                     if (sum != 0.0) condLikeUp[a] = clFA[a] / sum;
4718                     }
4719 
4720                 for (a=j=0; a<nStates; a++)
4721                     {
4722                     sum = 0.0;
4723                     for (i=0; i<nStates; i++)
4724                         {
4725                         sum += condLikeUp[i] * tiP[j++];
4726                         }
4727                     *(clFP++) = sum * clDP[a];
4728                     }
4729 
4730                 clFA += nStates;
4731                 clDP += nStates;
4732                 }
4733             tiP += nStatesSquared;
4734             }
4735         }
4736 
4737     return NO_ERROR;
4738 }
4739 
4740 
4741 /*----------------------------------------------------------------
4742 |
4743 |   CondLikeUp_NUC4: pull likelihoods up and calculate scaled
4744 |       finals for an interior node
4745 |
4746 -----------------------------------------------------------------*/
CondLikeUp_NUC4(TreeNode * p,int division,int chain)4747 int     CondLikeUp_NUC4 (TreeNode *p, int division, int chain)
4748 {
4749     int             c, k, nRateCats;
4750     CLFlt           *clFA, *clFP, *clDP, *tiP, condLikeUp[4], sum[4];
4751     ModelInfo       *m;
4752 
4753     /* find model settings for this division */
4754     m = &modelSettings[division];
4755 
4756     /* find number of rate cats */
4757     nRateCats = m->numRateCats;
4758     if (m->gibbsGamma == YES)
4759         nRateCats = 1;
4760 
4761     /* calculate final states */
4762     if (p->anc->anc == NULL)
4763         {
4764         /* this is the root node */
4765         /* find conditional likelihood pointers = down cond likes */
4766         /* use conditional likelihood scratch space for final cond likes */
4767         clDP = m->condLikes[m->condLikeIndex[chain][p->index]];
4768         clFP = m->condLikes[m->condLikeScratchIndex[p->index]];
4769 
4770         /* final cond likes = downpass cond likes */
4771         for (k=0; k<nRateCats; k++)
4772             {
4773             /* copy cond likes */
4774             for (c=0; c<m->numChars; c++)
4775                 {
4776                 *(clFP++) = *(clDP++);
4777                 *(clFP++) = *(clDP++);
4778                 *(clFP++) = *(clDP++);
4779                 *(clFP++) = *(clDP++);
4780                 }
4781             }
4782         }
4783     else
4784         {
4785         /* find conditional likelihood pointers */
4786         /* use conditional likelihood scratch space for final cond likes */
4787         clFA = m->condLikes[m->condLikeScratchIndex[p->anc->index]];
4788         clFP = m->condLikes[m->condLikeScratchIndex[p->index     ]];
4789         clDP = m->condLikes[m->condLikeIndex[chain][p->index     ]];
4790 
4791         /* find transition probabilities */
4792         tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
4793 
4794         for (k=0; k<nRateCats; k++)
4795             {
4796             for (c=0; c<m->numChars; c++)
4797                 {
4798                 condLikeUp[A] = condLikeUp[C] = condLikeUp[G] = condLikeUp[T] = 0.0;
4799 
4800                 sum[A] = (tiP[AA]*clDP[A] + tiP[AC]*clDP[C] + tiP[AG]*clDP[G] + tiP[AT]*clDP[T]);
4801                 sum[C] = (tiP[CA]*clDP[A] + tiP[CC]*clDP[C] + tiP[CG]*clDP[G] + tiP[CT]*clDP[T]);
4802                 sum[G] = (tiP[GA]*clDP[A] + tiP[GC]*clDP[C] + tiP[GG]*clDP[G] + tiP[GT]*clDP[T]);
4803                 sum[T] = (tiP[TA]*clDP[A] + tiP[TC]*clDP[C] + tiP[TG]*clDP[G] + tiP[TT]*clDP[T]);
4804 
4805                 if (sum[A] != 0.0) condLikeUp[A] = clFA[A] / sum[A];
4806                 if (sum[C] != 0.0) condLikeUp[C] = clFA[C] / sum[C];
4807                 if (sum[G] != 0.0) condLikeUp[G] = clFA[G] / sum[G];
4808                 if (sum[T] != 0.0) condLikeUp[T] = clFA[T] / sum[T];
4809 
4810 /*
4811                 clFP[A] = (condLikeUp[A]*tiP[AA] + condLikeUp[C]*tiP[CA] + condLikeUp[G]*tiP[GA] + condLikeUp[T]*tiP[TA])*clDP[A];
4812                 clFP[C] = (condLikeUp[A]*tiP[AC] + condLikeUp[C]*tiP[CC] + condLikeUp[G]*tiP[GC] + condLikeUp[T]*tiP[TC])*clDP[C];
4813                 clFP[G] = (condLikeUp[A]*tiP[AG] + condLikeUp[C]*tiP[CG] + condLikeUp[G]*tiP[GG] + condLikeUp[T]*tiP[TG])*clDP[G];
4814                 clFP[T] = (condLikeUp[A]*tiP[AT] + condLikeUp[C]*tiP[CT] + condLikeUp[G]*tiP[GT] + condLikeUp[T]*tiP[TT])*clDP[T];
4815 */
4816 
4817                 clFP[A] = (condLikeUp[A]*tiP[AA] + condLikeUp[C]*tiP[AC] + condLikeUp[G]*tiP[AG] + condLikeUp[T]*tiP[AT])*clDP[A];
4818                 clFP[C] = (condLikeUp[A]*tiP[CA] + condLikeUp[C]*tiP[CC] + condLikeUp[G]*tiP[CG] + condLikeUp[T]*tiP[CT])*clDP[C];
4819                 clFP[G] = (condLikeUp[A]*tiP[GA] + condLikeUp[C]*tiP[GC] + condLikeUp[G]*tiP[GG] + condLikeUp[T]*tiP[GT])*clDP[G];
4820                 clFP[T] = (condLikeUp[A]*tiP[TA] + condLikeUp[C]*tiP[TC] + condLikeUp[G]*tiP[TG] + condLikeUp[T]*tiP[TT])*clDP[T];
4821 
4822                 clFA += 4;
4823                 clFP += 4;
4824                 clDP += 4;
4825                 }
4826             tiP += 16;
4827             }
4828         }
4829 
4830     return NO_ERROR;
4831 }
4832 
4833 
4834 /*----------------------------------------------------------------
4835 |
4836 |   CondLikeUp_Std: pull likelihoods up and calculate scaled
4837 |       finals for an interior node
4838 |
4839 -----------------------------------------------------------------*/
CondLikeUp_Std(TreeNode * p,int division,int chain)4840 int     CondLikeUp_Std (TreeNode *p, int division, int chain)
4841 {
4842     int             a, c, i, j, k, t, nStates, nCats, coppySize,tmp;
4843     CLFlt           *clFA, *clFP, *clDP, *pA, *tiP, condLikeUp[10], sum;
4844     ModelInfo       *m;
4845 
4846     /* find model settings for this division */
4847     m = &modelSettings[division];
4848 
4849     /* calculate final states */
4850     if (p->anc->anc == NULL)
4851         {
4852         /* this is the root node */
4853         /* find conditional likelihood pointers = down cond likes */
4854         /* use conditional likelihood scratch space for final cond likes */
4855         clDP = m->condLikes[m->condLikeIndex[chain][p->index]];
4856         clFP = m->condLikes[m->condLikeScratchIndex[p->index]];
4857 
4858         coppySize=0;
4859         /* final cond likes = downpass cond likes */
4860         for (c=0; c<m->numChars; c++)
4861             {
4862             /* calculate nStates and nCats */
4863             nStates = m->nStates[c];
4864 
4865             /* the following lines ensure that nCats is 1 unless */
4866             /* the character is binary and beta categories are used  */
4867             if (nStates == 2)
4868                 nCats = m->numBetaCats;
4869             else
4870                 nCats = 1;
4871 
4872             coppySize+=nCats*nStates;
4873             }
4874 
4875         /* finally multiply with the rate cats */
4876         coppySize *= m->numRateCats;
4877 
4878         /* copy cond likes */
4879         for (k=0; k<coppySize; k++)
4880             *(clFP++) = *(clDP++);
4881         }
4882     else
4883         {
4884         /* find conditional likelihood pointers */
4885         /* use conditional likelihood scratch space for final cond likes */
4886         clFA = m->condLikes[m->condLikeScratchIndex[p->anc->index]];
4887         clFP = m->condLikes[m->condLikeScratchIndex[p->index     ]];
4888         clDP = m->condLikes[m->condLikeIndex[chain][p->index     ]];
4889 
4890         /* find transition probabilities */
4891         pA = m->tiProbs[m->tiProbsIndex[chain][p->index]];
4892 
4893         for (k=0; k<m->numRateCats; k++)
4894             {
4895             for (c=0; c<m->numChars; c++)
4896                 {
4897 
4898                 /* calculate nStates and nCats */
4899                 nStates = m->nStates[c];
4900 
4901                 /* the following lines ensure that nCats is 1 unless */
4902                 /* the character is binary and beta categories are used  */
4903                 if (nStates == 2)
4904                     nCats = m->numBetaCats;
4905                 else
4906                     nCats = 1;
4907 
4908                 tmp = k*nStates*nStates; /* tmp contains offset to skip rate cats that already processed*/
4909                 tiP = pA + m->tiIndex[c] + tmp;
4910                 tmp = (m->numRateCats-1)*2*2; /* tmp contains size of block of tpi matrices across all rate cats (minus one) for single beta category. Further used only if character is binary to jump to next beta category */
4911 
4912                 /* now calculate the final cond likes */
4913                 for (t=0; t<nCats; t++)
4914                     {
4915                     for (a=j=0; a<nStates; a++)
4916                         {
4917                         sum = 0.0;
4918                         for (i=0; i<nStates; i++)
4919                             sum += tiP[j++]*clDP[i];
4920                         if (sum == 0.0)
4921                             condLikeUp[a] = 0.0;    /* we lost the conditional likelihood in the downpass (can occur in gamma model) */
4922                         else
4923                             condLikeUp[a] = clFA[a] / sum;
4924                         }
4925 
4926                     for (a=j=0; a<nStates; a++)
4927                         {
4928                         sum = 0.0;
4929                         for (i=0; i<nStates; i++)
4930                             {
4931                             sum += condLikeUp[i] * tiP[j++];
4932                             }
4933                         clFP[a] = sum * clDP[a];
4934                         }
4935 
4936                     clFP += nStates;
4937                     clFA += nStates;
4938                     clDP += nStates;
4939                     tiP += tmp;
4940                     }
4941                 }
4942             }
4943         }
4944 
4945     return NO_ERROR;
4946 }
4947 
4948 
4949 /*----------------------------------------------------------------
4950 |
4951 |   CondLikeScaler_Gen: general n-state model with or without rate
4952 |       variation
4953 |
4954 -----------------------------------------------------------------*/
CondLikeScaler_Gen(TreeNode * p,int division,int chain)4955 int CondLikeScaler_Gen (TreeNode *p, int division, int chain)
4956 {
4957     int             c, k, n, nStates;
4958     CLFlt           scaler, **clP, *clPtr, *scP, *lnScaler;
4959     ModelInfo       *m;
4960 
4961     m = &modelSettings[division];
4962     nStates = m->numModelStates;
4963 
4964     /* find conditional likelihood pointers */
4965     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
4966     clP   = m->clP;
4967     for (k=0; k<m->numRateCats; k++)
4968         {
4969         clP[k] = clPtr;
4970         clPtr += m->numChars * m->numModelStates;
4971         }
4972 
4973     /* find node scalers */
4974     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4975 
4976     /* find site scalers */
4977     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4978 
4979     /* rescale */
4980     for (c=0; c<m->numChars; c++)
4981         {
4982         scaler = 0.0;
4983         for (k=0; k<m->numRateCats; k++)
4984             {
4985             for (n=0; n<nStates; n++)
4986                 {
4987                 if (clP[k][n] > scaler)
4988                     scaler = clP[k][n];
4989                 }
4990             }
4991 
4992         for (k=0; k<m->numRateCats; k++)
4993             {
4994             for (n=0; n<nStates; n++)
4995                 clP[k][n] /= scaler;
4996             clP[k] += n;
4997             }
4998 
4999         scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
5000         lnScaler[c] += scP[c];  /* add into tree scaler  */
5001         }
5002 
5003     m->unscaledNodes[chain][p->index] = 0;
5004 
5005     return (NO_ERROR);
5006 }
5007 
5008 
5009 #if defined (SSE_ENABLED)
5010 /*----------------------------------------------------------------
5011 |
5012 |   CondLikeScaler_Gen_SSE: general n-state model with or without rate
5013 |       variation
5014 |
5015 -----------------------------------------------------------------*/
CondLikeScaler_Gen_SSE(TreeNode * p,int division,int chain)5016 int CondLikeScaler_Gen_SSE (TreeNode *p, int division, int chain)
5017 {
5018     int             c, k, n, nStates;
5019     CLFlt           *scP, *lnScaler;
5020     __m128          *clPtr, **clP, m1;
5021     ModelInfo       *m;
5022 
5023     m = &modelSettings[division];
5024     nStates = m->numModelStates;
5025 
5026     /* find conditional likelihood pointers */
5027     clPtr = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index]];
5028     clP   = m->clP_SSE;
5029     for (k=0; k<m->numRateCats; k++)
5030         {
5031         clP[k] = clPtr;
5032         clPtr += m->numVecChars * m->numModelStates;
5033         }
5034 
5035     /* find node scalers */
5036     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5037     //scP_SSE = (__m128 *) scP;
5038 
5039     /* find site scalers */
5040     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5041 
5042     /* rescale */
5043     for (c=0; c<m->numVecChars; c++)
5044         {
5045         //scaler = 0.0;
5046         m1 = _mm_setzero_ps ();
5047         for (k=0; k<m->numRateCats; k++)
5048             {
5049             for (n=0; n<nStates; n++)
5050                 {
5051                 m1 = _mm_max_ps (m1, clP[k][n]);
5052                 }
5053             }
5054         _mm_store_ps (scP,  m1);
5055         scP += m->numFloatsPerVec;
5056 
5057         for (k=0; k<m->numRateCats; k++)
5058             {
5059             for (n=0; n<nStates; n++)
5060                 {
5061                 *clP[k] = _mm_div_ps (*clP[k], m1);
5062                 clP[k]++;
5063                 }
5064             }
5065         }
5066 
5067     /* Reset scP to original position*/
5068     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5069     for (c=0; c<m->numChars; c++)
5070         {
5071         scP[c]       = (CLFlt) log (scP[c]);    /* store node scaler */
5072         lnScaler[c] += scP[c];                  /* add into tree scaler  */
5073         }
5074 
5075     m->unscaledNodes[chain][p->index] = 0;
5076 
5077     return (NO_ERROR);
5078 }
5079 #endif
5080 
5081 
5082 /*----------------------------------------------------------------
5083 |
5084 |   CondLikeScaler_Gen_GibbsGamma: general n-state model with Gibbs
5085 |       sampling of rate categories in discrete gamma
5086 |
5087 -----------------------------------------------------------------*/
CondLikeScaler_Gen_GibbsGamma(TreeNode * p,int division,int chain)5088 int CondLikeScaler_Gen_GibbsGamma (TreeNode *p, int division, int chain)
5089 {
5090     int             c, i, j, n, nStates, *rateCat, nRateCats;
5091     CLFlt           scaler, *clP, *scP, *lnScaler;
5092     ModelInfo       *m;
5093 
5094     m = &modelSettings[division];
5095     nStates = m->numModelStates;
5096 
5097     /* find conditional likelihood pointer */
5098     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
5099 
5100     /* flip node scalers */
5101     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5102 
5103     /* find site scalers */
5104     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5105 
5106     /* find rate category index and number of rate categories */
5107     rateCat = m->tiIndex + chain * m->numChars;
5108     nRateCats = m->numRateCats;
5109 
5110     /* scale */
5111     i = j = 0;
5112     for (c=0; c<m->numChars; c++)
5113         {
5114         if (rateCat[c] < nRateCats)
5115             {
5116             scaler = 0.0;
5117             for (n=0; n<nStates; n++)
5118                 {
5119                 if (clP[i] > scaler)
5120                     scaler = clP[i];
5121                 i++;
5122                 }
5123 
5124 
5125             for (n=0; n<nStates; n++)
5126                 clP[j++] /= scaler;
5127 
5128             scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
5129             lnScaler[c] += scP[c];                  /* add into tree scaler  */
5130 
5131             }
5132         else
5133             {
5134             scP[c] = 0.0;
5135             /* no need to add it to the lnScaler */
5136             i += nStates;
5137             j += nStates;
5138             }
5139         }
5140 
5141     m->unscaledNodes[chain][p->index] = 0;
5142 
5143     return (NO_ERROR);
5144 }
5145 
5146 
5147 /*----------------------------------------------------------------
5148 |
5149 |   CondLikeScaler_NUC4: 4by4 nucleotide model with or without rate
5150 |       variation
5151 |
5152 -----------------------------------------------------------------*/
CondLikeScaler_NUC4(TreeNode * p,int division,int chain)5153 int CondLikeScaler_NUC4 (TreeNode *p, int division, int chain)
5154 {
5155     int             c, k;
5156     CLFlt           scaler, *scP, *lnScaler, *clPtr, **clP;
5157     ModelInfo       *m;
5158 
5159     m = &modelSettings[division];
5160 
5161     /* find conditional likelihood pointers */
5162     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
5163     clP   = m->clP;
5164     for (k=0; k<m->numRateCats; k++)
5165         {
5166         clP[k] = clPtr;
5167         clPtr += m->numChars * m->numModelStates;
5168         }
5169 
5170     /* find node scalers */
5171     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5172 
5173     /* find site scalers */
5174     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5175 
5176     /* rescale values */
5177     for (c=0; c<m->numChars; c++)
5178         {
5179         scaler = 0.0;
5180         for (k=0; k<m->numRateCats; k++)
5181             {
5182             if (clP[k][A] > scaler)
5183                 scaler = clP[k][A];
5184             if (clP[k][C] > scaler)
5185                 scaler = clP[k][C];
5186             if (clP[k][G] > scaler)
5187                 scaler = clP[k][G];
5188             if (clP[k][T] > scaler)
5189                 scaler = clP[k][T];
5190             }
5191 
5192         for (k=0; k<m->numRateCats; k++)
5193             {
5194             clP[k][A] /= scaler;
5195             clP[k][C] /= scaler;
5196             clP[k][G] /= scaler;
5197             clP[k][T] /= scaler;
5198             clP[k] += 4;
5199             }
5200 
5201         scP[c]       = (CLFlt) log(scaler); /* store node scaler */
5202         lnScaler[c] += scP[c];  /* add into tree scaler  */
5203         }
5204 
5205     m->unscaledNodes[chain][p->index] = 0;   /* set unscaled nodes to 0 */
5206 
5207     return NO_ERROR;
5208 }
5209 
5210 
5211 #if defined (AVX_ENABLED)
5212 /*----------------------------------------------------------------
5213  |
5214  |   CondLikeScaler_NUC4_AVX: 4by4 nucleotide model with or without rate
5215  |       variation using AVX (or AVX + FMA) code
5216  |
5217  -----------------------------------------------------------------*/
CondLikeScaler_NUC4_AVX(TreeNode * p,int division,int chain)5218 int CondLikeScaler_NUC4_AVX (TreeNode *p, int division, int chain)
5219 {
5220     int             c, k;
5221     CLFlt           *scP, *lnScaler;
5222     __m256          *clPtr, **clP, *scP_AVX, m1;
5223     ModelInfo       *m;
5224 
5225     m = &modelSettings[division];
5226 
5227     /* find conditional likelihood pointers */
5228     clPtr = (__m256 *) m->condLikes[m->condLikeIndex[chain][p->index]];
5229     clP   = m->clP_AVX;
5230     for (k=0; k<m->numRateCats; k++)
5231     {
5232         clP[k] = clPtr;
5233         clPtr += m->numVecChars * m->numModelStates;
5234     }
5235 
5236     /* find node scalers */
5237     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5238     scP_AVX = (__m256 *) scP;
5239 
5240     /* find site scalers */
5241     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5242 
5243     /* rescale */
5244     for (c=0; c<m->numVecChars; c++)
5245     {
5246         m1 = _mm256_setzero_ps ();
5247 
5248         for (k=0; k<m->numRateCats; k++)
5249         {
5250             m1 = _mm256_max_ps (m1, clP[k][A]);
5251             m1 = _mm256_max_ps (m1, clP[k][C]);
5252             m1 = _mm256_max_ps (m1, clP[k][G]);
5253             m1 = _mm256_max_ps (m1, clP[k][T]);
5254         }
5255 
5256         for (k=0; k<m->numRateCats; k++)
5257         {
5258             *clP[k] = _mm256_div_ps (*clP[k], m1);
5259             clP[k]++;
5260             *clP[k] = _mm256_div_ps (*clP[k], m1);
5261             clP[k]++;
5262             *clP[k] = _mm256_div_ps (*clP[k], m1);
5263             clP[k]++;
5264             *clP[k] = _mm256_div_ps (*clP[k], m1);
5265             clP[k]++;
5266         }
5267 
5268         (*scP_AVX++) = m1;
5269     }
5270 
5271     /* update site scalers */
5272     for (c=0; c<m->numChars; c++)
5273         lnScaler[c] += (scP[c] = logf (scP[c]));    /* add log of new scaler into tree scaler  */
5274 
5275     m->unscaledNodes[chain][p->index] = 0;   /* set unscaled nodes to 0 */
5276 
5277     return NO_ERROR;
5278 
5279 }
5280 #endif
5281 
5282 
5283 #if defined (SSE_ENABLED)
5284 /*----------------------------------------------------------------
5285 |
5286 |   CondLikeScaler_NUC4_SSE: 4by4 nucleotide model with or without rate
5287 |       variation using SSE code
5288 |
5289 -----------------------------------------------------------------*/
CondLikeScaler_NUC4_SSE(TreeNode * p,int division,int chain)5290 int CondLikeScaler_NUC4_SSE (TreeNode *p, int division, int chain)
5291 {
5292     int             c, k;
5293     CLFlt           *scP, *lnScaler;
5294     __m128          *clPtr, **clP, *scP_SSE, m1;
5295     ModelInfo       *m;
5296 
5297     m = &modelSettings[division];
5298 
5299     /* find conditional likelihood pointers */
5300     clPtr = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index]];
5301     clP   = m->clP_SSE;
5302     for (k=0; k<m->numRateCats; k++)
5303         {
5304         clP[k] = clPtr;
5305         clPtr += m->numVecChars * m->numModelStates;
5306         }
5307 
5308     /* find node scalers */
5309     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5310     scP_SSE = (__m128 *) scP;
5311 
5312     /* find site scalers */
5313     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5314 
5315     /* rescale */
5316     for (c=0; c<m->numVecChars; c++)
5317         {
5318         m1 = _mm_setzero_ps ();
5319         for (k=0; k<m->numRateCats; k++)
5320             {
5321             m1 = _mm_max_ps (m1, clP[k][A]);
5322             m1 = _mm_max_ps (m1, clP[k][C]);
5323             m1 = _mm_max_ps (m1, clP[k][G]);
5324             m1 = _mm_max_ps (m1, clP[k][T]);
5325             }
5326 
5327         for (k=0; k<m->numRateCats; k++)
5328             {
5329             *clP[k] = _mm_div_ps (*clP[k], m1);
5330             clP[k]++;
5331             *clP[k] = _mm_div_ps (*clP[k], m1);
5332             clP[k]++;
5333             *clP[k] = _mm_div_ps (*clP[k], m1);
5334             clP[k]++;
5335             *clP[k] = _mm_div_ps (*clP[k], m1);
5336             clP[k]++;
5337             }
5338 
5339         (*scP_SSE++) = m1;
5340         }
5341 
5342     /* update site scalers */
5343     for (c=0; c<m->numChars; c++)
5344         lnScaler[c] += (scP[c] = (CLFlt)(log (scP[c])));    /* add log of new scaler into tree scaler  */
5345 
5346     m->unscaledNodes[chain][p->index] = 0;   /* number of unscaled nodes is 0 */
5347 
5348     return NO_ERROR;
5349 
5350 }
5351 #endif
5352 
5353 
5354 /*----------------------------------------------------------------
5355 |
5356 |   CondLikeScaler_NUC4_GibbsGamma: 4by4 nucleotide model with rate
5357 |       variation approximated by Gibbs sampling from gamma
5358 |
5359 -----------------------------------------------------------------*/
CondLikeScaler_NUC4_GibbsGamma(TreeNode * p,int division,int chain)5360 int CondLikeScaler_NUC4_GibbsGamma (TreeNode *p, int division, int chain)
5361 {
5362     int             c, i, j, nRateCats, *rateCat;
5363     CLFlt           scaler, *clP, *scP, *lnScaler;
5364     ModelInfo       *m;
5365 
5366     m = &modelSettings[division];
5367 
5368     /* find conditional likelihood pointer */
5369     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
5370 
5371     /* find node scalers */
5372     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5373 
5374     /* find site scalers */
5375     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5376 
5377     /* find rate category index and number of gamma categories */
5378     rateCat = m->tiIndex + chain * m->numChars;
5379     nRateCats = m->numRateCats;
5380 
5381     /* scale */
5382     i = j = 0;
5383     for (c=0; c<m->numChars; c++)
5384         {
5385         if (rateCat[c] < nRateCats)
5386             {
5387             scaler = 0.0;
5388             if (clP[i] > scaler)
5389                 scaler = clP[i];
5390             i++;
5391             if (clP[i] > scaler)
5392                 scaler = clP[i];
5393             i++;
5394             if (clP[i] > scaler)
5395                 scaler = clP[i];
5396             i++;
5397             if (clP[i] > scaler)
5398                 scaler = clP[i];
5399             i++;
5400 
5401             clP[j++] /= scaler;
5402             clP[j++] /= scaler;
5403             clP[j++] /= scaler;
5404             clP[j++] /= scaler;
5405 
5406             scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
5407             lnScaler[c] += scP[c];                  /* add into tree scaler  */
5408             }
5409         else
5410             {
5411             scP[c] = 0.0;   /* store node scaler */
5412             /* no need to add it to the lnScaler */
5413             i += 4;
5414             j += 4;
5415             }
5416         }
5417 
5418     m->unscaledNodes[chain][p->index] = 0;
5419 
5420     return NO_ERROR;
5421 }
5422 
5423 
5424 #if !defined (SSE_ENABLED) || 1
5425 /*----------------------------------------------------------------
5426 |
5427 |   CondLikeScaler_NY98: codon model with omega variation
5428 |
5429 -----------------------------------------------------------------*/
CondLikeScaler_NY98(TreeNode * p,int division,int chain)5430 int CondLikeScaler_NY98 (TreeNode *p, int division, int chain)
5431 {
5432     int             c, k, n, nStates;
5433     CLFlt           scaler, **clP, *clPtr, *scP, *lnScaler;
5434     ModelInfo       *m;
5435 
5436     m = &modelSettings[division];
5437     nStates = m->numModelStates;
5438 
5439     /* find conditional likelihood pointers */
5440     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
5441     clP   = m->clP;
5442     for (k=0; k<m->numOmegaCats; k++)
5443         {
5444         clP[k] = clPtr;
5445         clPtr += m->numChars * m->numModelStates;
5446         }
5447 
5448     /* find node scalers */
5449     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5450 
5451     /* find site scalers */
5452     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5453 
5454     /* rescale */
5455     for (c=0; c<m->numChars; c++)
5456         {
5457         scaler = 0.0;
5458         for (k=0; k<m->numOmegaCats; k++)
5459             {
5460             for (n=0; n<nStates; n++)
5461                 {
5462                 if (clP[k][n] > scaler)
5463                     scaler = clP[k][n];
5464                 }
5465             }
5466 
5467         for (k=0; k<m->numOmegaCats; k++)
5468             {
5469             for (n=0; n<nStates; n++)
5470                 {
5471                 clP[k][n] /= scaler;
5472                 }
5473             clP[k] += n;
5474             }
5475 
5476         scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
5477         lnScaler[c] += scP[c];                  /* add into tree scaler  */
5478         }
5479 
5480     m->unscaledNodes[chain][p->index] = 0;
5481 
5482     return (NO_ERROR);
5483 }
5484 #endif
5485 
5486 
5487 #if defined (SSE_ENABLED)
5488 /*----------------------------------------------------------------
5489 |
5490 |   CondLikeScaler_NY98_SSE: codon model with omega variation
5491 |
5492 -----------------------------------------------------------------*/
CondLikeScaler_NY98_SSE(TreeNode * p,int division,int chain)5493 int CondLikeScaler_NY98_SSE (TreeNode *p, int division, int chain)
5494 {
5495     int             c, k, n, nStates;
5496     CLFlt           *scP, *lnScaler;
5497     __m128          *clPtr, **clP, m1;
5498     ModelInfo       *m;
5499 
5500     m = &modelSettings[division];
5501     nStates = m->numModelStates;
5502 
5503     /* find conditional likelihood pointers */
5504     clPtr = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index]];
5505     clP   = m->clP_SSE;
5506     for (k=0; k<m->numOmegaCats; k++)
5507         {
5508         clP[k] = clPtr;
5509         clPtr += m->numVecChars * m->numModelStates;
5510         }
5511 
5512     /* find node scalers */
5513     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5514     //scP_SSE = (__m128 *) scP;
5515 
5516     /* find site scalers */
5517     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5518 
5519     /* rescale */
5520     for (c=0; c<m->numVecChars; c++)
5521         {
5522         //scaler = 0.0;
5523         m1 = _mm_setzero_ps ();
5524         for (k=0; k<m->numOmegaCats; k++)
5525             {
5526             for (n=0; n<nStates; n++)
5527                 {
5528                 m1 = _mm_max_ps (m1, clP[k][n]);
5529                 }
5530             }
5531         _mm_store_ps (scP,  m1);
5532         scP += m->numFloatsPerVec;
5533 
5534         for (k=0; k<m->numOmegaCats; k++)
5535             {
5536             for (n=0; n<nStates; n++)
5537                 {
5538                 *clP[k] = _mm_div_ps (*clP[k], m1);
5539                 clP[k]++;
5540                 }
5541             }
5542         }
5543 
5544     /* Reset scP to original position*/
5545     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5546     for (c=0; c<m->numChars; c++)
5547         {
5548         scP[c]       = (CLFlt) log (scP[c]);    /* store node scaler */
5549         lnScaler[c] += scP[c];                  /* add into tree scaler  */
5550         }
5551 
5552     m->unscaledNodes[chain][p->index] = 0;
5553 
5554     return (NO_ERROR);
5555 }
5556 #endif
5557 
5558 
5559 /*----------------------------------------------------------------
5560 |
5561 |   CondLikeScaler_Std: variable states model with or without
5562 |       rate variation
5563 |
5564 -----------------------------------------------------------------*/
CondLikeScaler_Std(TreeNode * p,int division,int chain)5565 int CondLikeScaler_Std (TreeNode *p, int division, int chain)
5566 {
5567     int             c, n, k, nStates, numReps;
5568     CLFlt           scaler, *clPtr, **clP, *scP, *lnScaler;
5569     ModelInfo       *m;
5570 
5571     m = &modelSettings[division];
5572 
5573     numReps=0;
5574     for (c=0; c<m->numChars; c++)
5575         {
5576         if (m->nStates[c] == 2)
5577             numReps += m->numBetaCats * 2;
5578         else
5579             numReps += m->nStates[c];
5580         }
5581 
5582     /* find conditional likelihood pointers */
5583     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
5584     clP   = m->clP;
5585     for (k=0; k<m->numRateCats; k++)
5586         {
5587         clP[k] = clPtr;
5588         clPtr += numReps;
5589         }
5590 
5591     /* find node scalers */
5592     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
5593 
5594     /* find site scalers */
5595     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5596 
5597     /* rescale */
5598     for (c=0; c<m->numChars; c++)
5599         {
5600         scaler = 0.0;
5601         nStates = m->nStates[c];
5602         if (nStates == 2)
5603             nStates = m->numBetaCats * 2;
5604 
5605         for (k=0; k<m->numRateCats; k++)
5606             {
5607             for (n=0; n<nStates; n++)
5608                 {
5609                 if (clP[k][n] > scaler)
5610                     scaler = clP[k][n];
5611                 }
5612             }
5613 
5614         for (k=0; k<m->numRateCats; k++)
5615             {
5616             for (n=0; n<nStates; n++)
5617                 clP[k][n] /= scaler;
5618             clP[k] += nStates;
5619             }
5620 
5621         scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
5622         lnScaler[c] += scP[c];                  /* add into tree scaler  */
5623         }
5624 
5625     m->unscaledNodes[chain][p->index] = 0;
5626 
5627     return NO_ERROR;
5628 }
5629 
5630 
5631 /*------------------------------------------------------------------
5632 |
5633 |   Likelihood_Adgamma: all n-state models with autocorrelated
5634 |        discrete gamma rate variation, NOT morph, restriction,
5635 |        codon or doublet models; just fill in rateProbs
5636 |
5637 -------------------------------------------------------------------*/
Likelihood_Adgamma(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)5638 int Likelihood_Adgamma (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
5639 {
5640     int             c, j, k, i, nStates, nStatesDiv2;
5641     MrBFlt          *bs, *swr, s01, s10, probOn, probOff, covBF[40];
5642     MrBFlt          like, *rP;
5643     CLFlt           *clP;
5644     ModelInfo       *m;
5645 
5646     /* NOTE: whichSitePats offsets numSitesOfPat by whichSitePats X numCompressedChars.
5647        This is done so we can use the character reweighting scheme for "heating" chains. This was easy to
5648        accomplish for all of the models except this one, which doesn't use numSitesOfPat when calculating
5649        likelihoods. Either we disallow autocorrelated rates when using MCMC with character reweighting, or
5650        we properly calculate likelihoods when some site patterns have increased or decreased weight. For
5651        now, we do not allow MCMCMC with character reweighting with this HMM; we bail out in the function
5652        FillNumSitesOfPat if we have Adgamma rate variation and reweighting. */
5653     k = whichSitePats;  /* FIXME: Not used (from clang static analyzer) */
5654 
5655     /* find model settings */
5656     m = &modelSettings[division];
5657 
5658     /* get the number of states */
5659     nStates = m->numModelStates;
5660     nStatesDiv2 = nStates / 2;
5661 
5662     /* find base frequencies */
5663     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5664 
5665     /* find conditional likelihood pointer */
5666     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
5667 
5668     /* find pointer to rate probabilities */
5669     rP = rateProbs[chain] + state[chain] * rateProbRowSize + m->rateProbStart;
5670 
5671     /* loop over characters and calculate rate probs */
5672     if (m->switchRates != NULL)
5673         {
5674         swr = GetParamVals (m->switchRates, chain, state[chain]);
5675         s01 = swr[0];
5676         s10 = swr[1];
5677         probOn = s01 / (s01 + s10);
5678         probOff =  1.0 - probOn;
5679         for (j=0; j<nStatesDiv2; j++)
5680             {
5681             covBF[j] = bs[j] * probOn;
5682             covBF[j+nStatesDiv2] = bs[j] * probOff;
5683             }
5684         bs = covBF;
5685         }
5686 
5687     for (c=i=0; c<m->numChars; c++)
5688         {
5689         for (k=0; k<m->numRateCats; k++)
5690             {
5691             like =  0.0;
5692             for (j=0; j<nStates; j++)
5693                 like += (*(clP++)) *  bs[j];
5694             rP[i++] = like;
5695             }
5696         }
5697 
5698     /* reset lnL, likelihood calculated later for this model */
5699     *lnL =  0.0;
5700 
5701     return (NO_ERROR);
5702 }
5703 
5704 
5705 /*------------------------------------------------------------------
5706 |
5707 |   Likelihood_Gen: general n-state models with or without rate
5708 |       variation
5709 |
5710 -------------------------------------------------------------------*/
Likelihood_Gen(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)5711 int Likelihood_Gen (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
5712 {
5713     int             c, j, k, nStates, hasPInvar;
5714     MrBFlt          s01, s10, probOn, probOff, *swr;
5715     MrBFlt          covBF[40], freq, *bs, like, likeI, pInvar=0.0, lnLike;
5716     CLFlt           *clPtr, **clP, *lnScaler, *nSitesOfPat, *clInvar=NULL;
5717     ModelInfo       *m;
5718 
5719     /* find model settings and nStates, pInvar, invar cond likes */
5720     m = &modelSettings[division];
5721     nStates = m->numModelStates;
5722     if (m->pInvar == NULL)
5723         {
5724         hasPInvar = NO;
5725         }
5726     else
5727         {
5728         hasPInvar = YES;
5729         pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
5730         clInvar = m->invCondLikes;
5731         }
5732 
5733     /* find conditional likelihood pointers */
5734     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
5735     clP = m->clP;
5736     for (k=0; k<m->numRateCats; k++)
5737         {
5738         clP[k] = clPtr;
5739         clPtr += m->numChars * m->numModelStates;
5740         }
5741 
5742 
5743     /* find base frequencies */
5744     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5745 
5746     /* if covarion model, adjust base frequencies */
5747     if (m->switchRates != NULL)
5748         {
5749         /* find the stationary frequencies */
5750         swr = GetParamVals(m->switchRates, chain, state[chain]);
5751         s01 = swr[0];
5752         s10 = swr[1];
5753         probOn = s01 / (s01 + s10);
5754         probOff =  1.0 - probOn;
5755 
5756         /* now adjust the base frequencies; on-state stored first in cond likes */
5757         for (j=0; j<nStates/2; j++)
5758             {
5759             covBF[j] = bs[j] * probOn;
5760             covBF[j+nStates/2] = bs[j] * probOff;
5761             }
5762 
5763         /* finally set bs pointer to adjusted values */
5764         bs = covBF;
5765         }
5766 
5767     /* find category frequencies */
5768     if (hasPInvar == NO)
5769         freq =  1.0 /  m->numRateCats;
5770     else
5771         freq = (1.0 - pInvar) /  m->numRateCats;
5772 
5773     /* find site scaler */
5774     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5775 
5776     /* find nSitesOfPat */
5777     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
5778 
5779     /* reset lnL */
5780     *lnL = 0.0;
5781 
5782     /* loop over characters */
5783     if (hasPInvar == NO)
5784         {
5785         for (c=0; c<m->numChars; c++)
5786             {
5787             like = 0.0;
5788             for (k=0; k<m->numRateCats; k++)
5789                 for (j=0; j<nStates; j++)
5790                     {
5791                     like += (*(clP[k]++)) * bs[j];
5792 #   ifdef DEBUG_LIKELIHOOD
5793                     // printf ("char=%d cat=%d j=%d like %E\n",c, k,j,like);
5794 #   endif
5795                     }
5796             like *= freq;
5797 
5798             /* check against LIKE_EPSILON (values close to zero are problematic) */
5799             if (like < LIKE_EPSILON)
5800                 {
5801 #   ifdef DEBUG_LIKELIHOOD
5802                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5803 #   endif
5804                 (*lnL) = MRBFLT_NEG_MAX;
5805                 abortMove = YES;
5806                 return ERROR;
5807                 }
5808             else
5809                 {
5810                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
5811                 }
5812             }
5813         }
5814     else
5815         {
5816         /* has invariable category */
5817         for (c=0; c<m->numChars; c++)
5818             {
5819             likeI = like = 0.0;
5820             for (k=0; k<m->numRateCats; k++)
5821                 for (j=0; j<nStates; j++)
5822                     {
5823                     like += (*(clP[k]++)) * bs[j];
5824                     }
5825             like *= freq;
5826             for (j=0; j<nStates; j++)
5827                 likeI += (*(clInvar++)) * bs[j] * pInvar;
5828             if (lnScaler[c] < -200.0)
5829                 {
5830                 /* we are not going to be able to exponentiate the scaling factor */
5831                 if (likeI > 1E-70)
5832                     {
5833                     /* forget about like; it is going to be insignificant compared to likeI */
5834                     lnLike = log(likeI);
5835                     }
5836                 else
5837                     {
5838                     /* treat likeI as if 0.0, that is, ignore it completely */
5839                     lnLike = log(like) + lnScaler[c];
5840                     }
5841                 }
5842             else
5843                 lnLike = log (like + (likeI / exp (lnScaler[c]))) + lnScaler[c];
5844 
5845             /* check against LIKE_EPSILON (values close to zero are problematic) */
5846             if (like < LIKE_EPSILON)
5847                 {
5848 #   ifdef DEBUG_LIKELIHOOD
5849                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5850 #   endif
5851                 (*lnL) = MRBFLT_NEG_MAX;
5852                 abortMove = YES;
5853                 return ERROR;
5854                 }
5855             else
5856                 {
5857                 (*lnL) += lnLike * nSitesOfPat[c];
5858                 }
5859             }
5860         }
5861 
5862     return NO_ERROR;
5863 }
5864 
5865 
5866 #if defined (SSE_ENABLED)
5867 //#   if 0
5868 //CLFlt DeleteME[1000];
5869 //int PrintOld_SSE (TreeNode *p, int division, int chain){
5870 //
5871 //    int             c, c1, j, k, nStates;
5872 //    //MrBFlt            *swr, likeI, pInvar=0.0, lnLike;
5873 //    CLFlt           *temp_vector;
5874 //    __m128          *clPtr, **clP;
5875 //    ModelInfo       *m;
5876 //
5877 //    m = &modelSettings[division];
5878 //    nStates = m->numModelStates;
5879 //    /* find conditional likelihood pointers */
5880 //
5881 //    temp_vector =  DeleteME;
5882 //
5883 //    clPtr = (__m128 *) (m->condLikes[m->condLikeIndex[chain][p->index]]);
5884 //    clP = m->clP_SSE;
5885 //    for (k=0; k<m->numRateCats; k++)
5886 //        {
5887 //        clP[k] = clPtr;
5888 //        clPtr += m->numVecChars * m->numModelStates;
5889 //        }
5890 //
5891 //    for (c=0; c<m->numChars; c++)
5892 //        {
5893 //        c1 = c / FLOATS_PER_VEC;
5894 //        for (k=0; k<m->numRateCats; k++)
5895 //            {
5896 //            for (j=0; j<nStates; j++)
5897 //                {
5898 //                *temp_vector++ = *(((CLFlt*)&clP[k][c1*nStates+j])+c % FLOATS_PER_VEC);
5899 //                }
5900 //            }
5901 //        }
5902 //    temp_vector=DeleteME;
5903 //
5904 //    return 1;
5905 //}
5906 //#   endif
5907 
5908 
5909 /*------------------------------------------------------------------
5910 |
5911 |   Likelihood_Gen_SSE: general n-state model with or without rate
5912 |       variation
5913 |
5914 -------------------------------------------------------------------*/
Likelihood_Gen_SSE(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)5915 int Likelihood_Gen_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
5916 {
5917     int             c, j, k, nStates, hasPInvar;
5918     MrBFlt          like, *bs;
5919     MrBFlt          s01, s10, probOn, probOff, *swr, covBF[40], freq, likeI, pInvar=0.0, lnLike;
5920     CLFlt           *lnScaler, *nSitesOfPat, *lnL_SSE, *lnLI_SSE;
5921     __m128          *clPtr, **clP, *clInvar=NULL;
5922     __m128          m1, mCatLike, mLike, mFreq;
5923     ModelInfo       *m;
5924 
5925     /* find model settings and nStates, pInvar, invar cond likes */
5926     m = &modelSettings[division];
5927     nStates = m->numModelStates;
5928     if (m->pInvar == NULL)
5929         {
5930         hasPInvar = NO;
5931         }
5932     else
5933         {
5934         hasPInvar = YES;
5935         pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
5936         clInvar = (__m128 *) (m->invCondLikes);
5937         }
5938 
5939     /* find conditional likelihood pointers */
5940     clPtr = (__m128 *) (m->condLikes[m->condLikeIndex[chain][p->index]]);
5941     clP = m->clP_SSE;
5942     for (k=0; k<m->numRateCats; k++)
5943         {
5944         clP[k] = clPtr;
5945         clPtr += m->numVecChars * m->numModelStates;
5946         }
5947     lnL_SSE  = m->lnL_Vec;
5948     lnLI_SSE = m->lnLI_Vec;
5949 
5950     /* find base frequencies */
5951     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5952 
5953     /* if covarion model, adjust base frequencies */
5954     if (m->switchRates != NULL)
5955         {
5956         /* find the stationary frequencies */
5957         swr = GetParamVals(m->switchRates, chain, state[chain]);
5958         s01 = swr[0];
5959         s10 = swr[1];
5960         probOn = s01 / (s01 + s10);
5961         probOff =  1.0 - probOn;
5962 
5963         /* now adjust the base frequencies; on-state stored first in cond likes */
5964         for (j=0; j<nStates/2; j++)
5965             {
5966             covBF[j] = bs[j] * probOn;
5967             covBF[j+nStates/2] = bs[j] * probOff;
5968             }
5969 
5970         /* finally set bs pointer to adjusted values */
5971         bs = covBF;
5972         }
5973 
5974     /* find category frequencies */
5975     if (hasPInvar == NO)
5976         freq =  1.0 /  m->numRateCats;
5977     else
5978         freq = (1.0 - pInvar) /  m->numRateCats;
5979 
5980     mFreq = _mm_set1_ps ((CLFlt)(freq));
5981 
5982     /* find site scaler */
5983     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5984 
5985     /* find nSitesOfPat */
5986     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
5987 
5988     /* reset lnL */
5989     *lnL = 0.0;
5990 
5991     for (c=0; c<m->numVecChars; c++)
5992         {
5993         mLike = _mm_setzero_ps ();
5994         for (k=0; k<m->numRateCats; k++)
5995             {
5996             mCatLike = _mm_setzero_ps ();
5997             for (j=0; j<nStates; j++)
5998                 {
5999                 m1 = _mm_mul_ps (clP[k][j], _mm_set1_ps ((CLFlt)bs[j]));
6000                 mCatLike = _mm_add_ps (mCatLike, m1);
6001                 }
6002             m1 = _mm_mul_ps (mCatLike, mFreq);
6003             mLike = _mm_add_ps (mLike, m1);
6004             clP[k] += nStates;
6005             }
6006         _mm_store_ps (lnL_SSE, mLike);
6007         lnL_SSE += m->numFloatsPerVec;
6008         }
6009 
6010     /* loop over characters */
6011     if (hasPInvar == NO)
6012         {
6013         for (c=0; c<m->numChars; c++)
6014             {
6015             like = m->lnL_Vec[c];
6016             /* check against LIKE_EPSILON (values close to zero are problematic) */
6017             if (like < LIKE_EPSILON)
6018                 {
6019 #   ifdef DEBUG_LIKELIHOOD
6020                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6021 #   endif
6022                 (*lnL) = MRBFLT_NEG_MAX;
6023                 abortMove = YES;
6024                 return ERROR;
6025                 }
6026             else
6027                 {
6028                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6029                 }
6030             }
6031         }
6032     else
6033         {
6034         /* has invariable category */
6035         for (c=0; c<m->numVecChars; c++)
6036             {
6037             mCatLike = _mm_setzero_ps ();
6038             for (j=0; j<nStates; j++)
6039                 {
6040                 m1 = _mm_mul_ps (clInvar[j], _mm_set1_ps ((CLFlt)bs[j]));
6041                 mCatLike = _mm_add_ps (mCatLike, m1);
6042                 }
6043             clInvar += nStates;
6044             _mm_store_ps (lnL_SSE, mCatLike);
6045             lnLI_SSE += m->numFloatsPerVec;
6046             }
6047 
6048         for (c=0; c<m->numChars; c++)
6049             {
6050             like  = m->lnL_Vec[c];
6051             likeI = m->lnLI_Vec[c];
6052             if (lnScaler[c] < -200.0)
6053                 {
6054                 /* we are not going to be able to exponentiate the scaling factor */
6055                 if (likeI > 1E-70)
6056                     {
6057                     /* forget about like; it is going to be insignificant compared to likeI */
6058                     lnLike = log(likeI);
6059                     }
6060                 else
6061                     {
6062                     /* treat likeI as if 0.0, that is, ignore it completely */
6063                     lnLike = log(like) + lnScaler[c];
6064                     }
6065                 }
6066             else
6067                 lnLike = log (like + (likeI / exp (lnScaler[c]))) + lnScaler[c];
6068 
6069             /* check against LIKE_EPSILON (values close to zero are problematic) */
6070             if (like < LIKE_EPSILON)
6071                 {
6072 #   ifdef DEBUG_LIKELIHOOD
6073                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6074 #   endif
6075                 (*lnL) = MRBFLT_NEG_MAX;
6076                 abortMove = YES;
6077                 return ERROR;
6078                 }
6079             else
6080                 {
6081                 (*lnL) += lnLike * nSitesOfPat[c];
6082                 }
6083             }
6084         }
6085 
6086     return NO_ERROR;
6087 
6088 }
6089 #endif
6090 
6091 
6092 /*------------------------------------------------------------------
6093 |
6094 |   Likelihood_Gen_GibbsGamma: general n-state models using
6095 |       Gibbs resampling of discrete gamma rate categories
6096 |
6097 -------------------------------------------------------------------*/
Likelihood_Gen_GibbsGamma(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)6098 int Likelihood_Gen_GibbsGamma (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
6099 {
6100     int             c, j, nStates, nGammaCats, *rateCat;
6101     MrBFlt          s01, s10, probOn, probOff, *swr;
6102     MrBFlt          covBF[40], *bs, like;
6103     CLFlt           *clP, *lnScaler, *nSitesOfPat, *clInvar=NULL;
6104     ModelInfo       *m;
6105 
6106     /* find model settings, nStates and invar cond likes */
6107     m = &modelSettings[division];
6108     nStates = m->numModelStates;
6109     clInvar = m->invCondLikes;
6110 
6111     /* find conditional likelihood pointer */
6112     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
6113 
6114     /* find base frequencies */
6115     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
6116 
6117     /* if covarion model, adjust base frequencies */
6118     if (m->switchRates != NULL)
6119         {
6120         /* find the stationary frequencies */
6121         swr = GetParamVals(m->switchRates, chain, state[chain]);
6122         s01 = swr[0];
6123         s10 = swr[1];
6124         probOn = s01 / (s01 + s10);
6125         probOff =  1.0 - probOn;
6126 
6127         /* now adjust the base frequencies; on-state stored first in cond likes */
6128         for (j=0; j<nStates/2; j++)
6129             {
6130             covBF[j] = bs[j] * probOn;
6131             covBF[j+nStates/2] = bs[j] * probOff;
6132             }
6133 
6134         /* finally set bs pointer to adjusted values */
6135         bs = covBF;
6136         }
6137 
6138     /* find site scaler */
6139     lnScaler = m->scalers[m->siteScalerIndex[chain]];
6140 
6141     /* find nSitesOfPat */
6142     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6143 
6144     /* find rate category index and number of gamma categories */
6145     rateCat = m->tiIndex + chain * m->numChars;
6146     nGammaCats = m->numRateCats;
6147 
6148     /* reset lnL */
6149     *lnL = 0.0;
6150 
6151     /* loop over characters */
6152     if (m->pInvar == NULL)
6153         {
6154         for (c=0; c<m->numChars; c++)
6155             {
6156             like = 0.0;
6157             for (j=0; j<nStates; j++)
6158                 {
6159                 like += (*(clP++)) * bs[j];
6160 #   ifdef DEBUG_LIKELIHOOD
6161                 // printf ("char=%d cat=%d j=%d like %E\n",c, k,j,like);
6162 #   endif
6163                 }
6164 
6165             /* check against LIKE_EPSILON (values close to zero are problematic) */
6166             if (like < LIKE_EPSILON)
6167                 {
6168 #   ifdef DEBUG_LIKELIHOOD
6169                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6170 #   endif
6171                 (*lnL) = MRBFLT_NEG_MAX;
6172                 abortMove = YES;
6173                 return ERROR;
6174                 }
6175             else
6176                 {
6177                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6178                 }
6179             }
6180         }
6181     else
6182         {
6183         /* has invariable category */
6184         for (c=0; c<m->numChars; c++)
6185             {
6186             like = 0.0;
6187             if (rateCat[c] < nGammaCats)
6188                 {
6189                 for (j=0; j<nStates; j++)
6190                     like += (*(clP++)) * bs[j];
6191                 clInvar += nStates;
6192                 }
6193             else
6194                 {
6195                 for (j=0; j<nStates; j++)
6196                     like += (*(clInvar++)) * bs[j];
6197                 clP += nStates;
6198                 }
6199 
6200             /* check against LIKE_EPSILON (values close to zero are problematic) */
6201             if (like < LIKE_EPSILON)
6202                 {
6203 #   ifdef DEBUG_LIKELIHOOD
6204                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6205 #   endif
6206                 (*lnL) = MRBFLT_NEG_MAX;
6207                 abortMove = YES;
6208                 return ERROR;
6209                 }
6210             else
6211                 {
6212                 (*lnL) += (log(like) + lnScaler[c]) * nSitesOfPat[c];
6213                 }
6214             }
6215         }
6216 
6217     return NO_ERROR;
6218 }
6219 
6220 
6221 /*------------------------------------------------------------------
6222 |
6223 |   Likelihood_NUC4: 4by4 nucleotide models with or without rate
6224 |       variation
6225 |
6226 -------------------------------------------------------------------*/
Likelihood_NUC4(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)6227 int Likelihood_NUC4 (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
6228 {
6229     int             c, k, hasPInvar;
6230     MrBFlt          freq, likeI, *bs, like, pInvar=0.0;
6231     CLFlt           *clPtr, **clP, *lnScaler, *nSitesOfPat, *clInvar=NULL;
6232     ModelInfo       *m;
6233 
6234     /* find model settings and pInvar, invar cond likes */
6235     m = &modelSettings[division];
6236     if (m->pInvar == NULL)
6237         {
6238         hasPInvar = NO;
6239         }
6240     else
6241         {
6242         hasPInvar = YES;
6243         pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
6244         clInvar = m->invCondLikes;
6245         }
6246 
6247     /* find conditional likelihood pointers */
6248     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
6249     clP = m->clP;
6250     for (k=0; k<m->numRateCats; k++)
6251         {
6252         clP[k] = clPtr;
6253         clPtr += m->numChars * m->numModelStates;
6254         }
6255 
6256     /* find base frequencies */
6257     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
6258 
6259     /* find category frequencies */
6260     if (hasPInvar == NO)
6261         freq =  1.0 /  m->numRateCats;
6262     else
6263         freq =  (1.0 - pInvar) /  m->numRateCats;
6264 
6265     /* find tree scaler */
6266     lnScaler = m->scalers[m->siteScalerIndex[chain]];
6267 
6268     /* find nSitesOfPat */
6269     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6270 
6271     /* reset lnL */
6272     *lnL = 0.0;
6273 
6274     /* loop over characters */
6275     if (hasPInvar == NO)
6276         {
6277         for (c=0; c<m->numChars; c++)
6278             {
6279             like = 0.0;
6280             for (k=0; k<m->numRateCats; k++)
6281                 {
6282                 like += (clP[k][A] * bs[A] + clP[k][C] * bs[C] + clP[k][G] * bs[G] + clP[k][T] * bs[T]);
6283                 clP[k] += 4;
6284                 }
6285             like *= freq;
6286 
6287             /* check against LIKE_EPSILON (values close to zero are problematic) */
6288             if (like < LIKE_EPSILON)
6289                 {
6290 #   ifdef DEBUG_LIKELIHOOD
6291                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6292 #   endif
6293                 (*lnL) = MRBFLT_NEG_MAX;
6294                 abortMove = YES;
6295                 return ERROR;
6296                 }
6297             else
6298                 {
6299                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6300                 }
6301             }
6302         }
6303     else
6304         {
6305         /* has invariable category */
6306         for (c=0; c<m->numChars; c++)
6307             {
6308             like = 0.0;
6309             for (k=0; k<m->numRateCats; k++)
6310                 {
6311                 like += (clP[k][A] * bs[A] + clP[k][C] * bs[C] + clP[k][G] * bs[G] + clP[k][T] * bs[T]);
6312                 clP[k] += 4;
6313                 }
6314             like *= freq;
6315             likeI = (clInvar[A] * bs[A] + clInvar[C] * bs[C] + clInvar[G] * bs[G] + clInvar[T] * bs[T]) * pInvar;
6316             if (lnScaler[c] < -200)
6317                 {
6318                 /* we are not going to be able to exponentiate the scaling factor */
6319                 if (likeI > 1E-70)
6320                     {
6321                     /* forget about like; it is going to be insignificant compared to likeI */
6322                     like = likeI;
6323                     }
6324                 else
6325                     {
6326                     /* treat likeI as if 0.0, that is, ignore it completely */
6327                     }
6328                 }
6329             else
6330                 like = like + (likeI / exp (lnScaler[c]));
6331 
6332             clInvar += 4;
6333 
6334             /* check against LIKE_EPSILON (values close to zero are problematic) */
6335             if (like < LIKE_EPSILON)
6336                 {
6337 #   ifdef DEBUG_LIKELIHOOD
6338                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6339 #   endif
6340                 (*lnL) = MRBFLT_NEG_MAX;
6341                 abortMove = YES;
6342                 return ERROR;
6343                 }
6344             else
6345                 {
6346                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6347                 }
6348             }
6349         }
6350 
6351 
6352     return NO_ERROR;
6353 }
6354 
6355 
6356 /*------------------------------------------------------------------
6357 |
6358 |   Likelihood_NUC4_GibbsGamma: 4by4 nucleotide models with rate
6359 |       variation using Gibbs sampling from gamma rate categories
6360 |
6361 -------------------------------------------------------------------*/
Likelihood_NUC4_GibbsGamma(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)6362 int Likelihood_NUC4_GibbsGamma (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
6363 {
6364     int             c, i, r, nGammaCats, *rateCat;
6365     MrBFlt          *bs, like;
6366     CLFlt           *clP, *lnScaler, *nSitesOfPat, *clInvar;
6367     ModelInfo       *m;
6368 
6369     /* find model settings and invar cond likes */
6370     m = &modelSettings[division];
6371     clInvar = m->invCondLikes;
6372 
6373     /* find conditional likelihood pointer */
6374     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
6375 
6376     /* find base frequencies */
6377     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
6378 
6379     /* find tree scaler */
6380     lnScaler = m->scalers[m->siteScalerIndex[chain]];
6381 
6382     /* find nSitesOfPat */
6383     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6384 
6385     /* find rate category index  and number of gamma categories */
6386     rateCat = m->tiIndex + chain * m->numChars;
6387     nGammaCats = m->numRateCats;
6388 
6389     /* reset lnL */
6390     *lnL = 0.0;
6391 
6392     /* loop over characters */
6393     if (m->pInvar == NULL)
6394         {
6395         for (c=i=0; c<m->numChars; c++)
6396             {
6397             like = (clP[A] * bs[A] + clP[C] * bs[C] + clP[G] * bs[G] + clP[T] * bs[T]);
6398             clP += 4;
6399 
6400             /* check against LIKE_EPSILON (values close to zero are problematic) */
6401             if (like < LIKE_EPSILON)
6402                 {
6403 #   ifdef DEBUG_LIKELIHOOD
6404                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6405 #   endif
6406                 (*lnL) = MRBFLT_NEG_MAX;
6407                 abortMove = YES;
6408                 return ERROR;
6409                 }
6410             else
6411                 {
6412                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6413                 }
6414             }
6415         }
6416     else
6417         {
6418         /* has invariable category */
6419         for (c=i=0; c<m->numChars; c++)
6420             {
6421             r = rateCat[c];
6422             if (r < nGammaCats)
6423                 like = (clP[A] * bs[A] + clP[C] * bs[C] + clP[G] * bs[G] + clP[T] * bs[T]);
6424             else
6425                 like = (clInvar[A] * bs[A] + clInvar[C] * bs[C] + clInvar[G] * bs[G] + clInvar[T] * bs[T]);
6426             clInvar += 4;
6427             clP += 4;
6428 
6429             /* check against LIKE_EPSILON (values close to zero are problematic) */
6430             if (like < LIKE_EPSILON)
6431                 {
6432 #   ifdef DEBUG_LIKELIHOOD
6433                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6434 #   endif
6435                 (*lnL) = MRBFLT_NEG_MAX;
6436                 abortMove = YES;
6437                 return ERROR;
6438                 }
6439             else
6440                 {
6441                 (*lnL) += (log (like) + lnScaler[c]) * nSitesOfPat[c];
6442                 }
6443             }
6444         }
6445 
6446     return NO_ERROR;
6447 }
6448 
6449 
6450 //#if defined (SSE_ENABLED)
6451 ///*------------------------------------------------------------------
6452 // |
// | Likelihood_NUC4_GibbsGamma_SSE: 4by4 nucleotide models with rate
6454 // |     variation using Gibbs sampling from gamma rate categories
6455 // |
6456 // -------------------------------------------------------------------*/
6457 //int Likelihood_NUC4_GibbsGamma_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
6458 //{
6459 //    int             c, i, r, nRateCats, *rateCat;
6460 //    MrBFlt          *bs, like;
6461 //    CLFlt           *lnScaler, *nSitesOfPat, *lnL_SSE, *lnLI_SSE;
6462 //    __m128          *clP, *clInvar=NULL;
6463 //    __m128          m1, mA, mC, mG, mT, mFreq, mPInvar, mLike;
6464 //    ModelInfo       *m;
6465 //
6466 //#if defined (FAST_LOG)
6467 //    int             k, index;
6468 //    MrBFlt          likeAdjust = 1.0, f;
6469 //#endif
6470 //
6471 //    /* find model settings and invar cond likes */
6472 //    m = &modelSettings[division];
6473 //    clInvar = (__m128 *)m->invCondLikes;
6474 //    /* find conditional likelihood pointer */
6475 //    clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index]];
6476 //
6477 //    lnL_SSE  = m->lnL_SSE;
6478 //    lnLI_SSE = m->lnLI_SSE;
6479 //
6480 //    /* find base frequencies */
6481 //    bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
6482 //
6483 //    /* find tree scaler */
6484 //    lnScaler = m->scalers[m->siteScalerIndex[chain]];
6485 //
6486 //    /* find nSitesOfPat */
6487 //    nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6488 //
6489 //    /* find rate category index  and number of rate categories */
6490 //    rateCat = m->tiIndex + chain * m->numChars;
6491 //    nRateCats = m->numRateCats;
6492 //
6493 //    /* reset lnL */
6494 //    *lnL = 0.0;
6495 //
6496 //    /* calculate variable likelihood */
6497 //    for (c=0; c<m->numVecChars; c++)
6498 //    {
6499 //        mLike = _mm_mul_ps (clP[A], mA);
6500 //        m1    = _mm_mul_ps (clP[C], mC);
6501 //        mLike = _mm_add_ps (mLike, m1);
6502 //        m1    = _mm_mul_ps (clP[G], mG);
6503 //        mLike = _mm_add_ps (mLike, m1);
6504 //        m1    = _mm_mul_ps (clP[T], mT);
6505 //        mLike = _mm_add_ps (mLike, m1);
6506 //
6507 //        clP += 4;
6508 //        _mm_store_ps (lnL_SSE, mLike);
6509 //        lnL_SSE += FLOATS_PER_VEC;
6510 //    }
6511 //
6512 //    /* calculate invariable likelihood */
6513 //    if (hasPInvar == YES)
6514 //    {
6515 //        for (c=0; c<m->numVecChars; c++)
6516 //        {
6517 //            mLike = _mm_mul_ps (clInvar[A], mA);
6518 //            m1    = _mm_mul_ps (clInvar[C], mC);
6519 //            mLike = _mm_add_ps (mLike, m1);
6520 //            m1    = _mm_mul_ps (clInvar[G], mG);
6521 //            mLike = _mm_add_ps (mLike, m1);
6522 //            m1    = _mm_mul_ps (clInvar[T], mT);
6523 //            mLike = _mm_add_ps (mLike, m1);
6524 //            mLike = _mm_mul_ps (mLike, mPInvar);
6525 //
6526 //            _mm_store_ps (lnLI_SSE, mLike);
6527 //            clInvar += 4;
6528 //            lnLI_SSE += FLOATS_PER_VEC;
6529 //        }
6530 //    }
6531 //
6532 //
6533 //    /* loop over characters */
6534 //    if (m->pInvar == NULL)
6535 //    {
6536 //        for (c=i=0; c<m->numChars; c++)
6537 //        {
6538 //            like = m->lnL_SSE[c];
6539 //            /* check against LIKE_EPSILON (values close to zero are problematic) */
6540 //            if (like < LIKE_EPSILON)
6541 //            {
6542 //                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30lf\n", spacer, division, c, like);
6543 //                (*lnL) = MRBFLT_NEG_MAX;
6544 //                return ERROR;
6545 //            }
6546 //            else
6547 //            {
6548 //#if defined (FAST_LOG)
6549 //                f = frexp (like, &index);
6550 //                index = 1-index;
6551 //                (*lnL) += (lnScaler[c] +  logValue[index]) * nSitesOfPat[c];
6552 //                for (k=0; k<(int)nSitesOfPat[c]; k++)
6553 //                    likeAdjust *= f;
6554 //#else
6555 //                (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6556 //#endif
6557 //            }
6558 //        }
6559 //    }
6560 //    else
6561 //    {
6562 //        /* has invariable category */
6563 //        for (c=i=0; c<m->numChars; c++)
6564 //        {
6565 //            r = rateCat[c];
6566 //            if (r < nRateCats)
6567 //                like = m->lnL_SSE[c];
6568 //            else
6569 //                like = m->lnLI_SSE[c];
6570 //
6571 //            /* check against LIKE_EPSILON (values close to zero are problematic) */
6572 //            if (like < LIKE_EPSILON)
6573 //            {
6574 //                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30lf\n", spacer, division, c, like);
6575 //                (*lnL) = MRBFLT_NEG_MAX;
6576 //                return ERROR;
6577 //            }
6578 //            else
6579 //            {
6580 //                (*lnL) += (log (like) + lnScaler[c]) * nSitesOfPat[c];
6581 //            }
6582 //        }
6583 //    }
6584 //
6585 //#if defined (FAST_LOG)
6586 //    (*lnL) += log (likeAdjust);
6587 //#endif
6588 //
6589 //    return NO_ERROR;
6590 //}
6591 //#endif
6592 
6593 
#if defined (FMA_ENABLED)
/*------------------------------------------------------------------
 |
 |   Likelihood_NUC4_FMA: 4by4 nucleotide models with or without rate
 |       variation using AVX + FMA code
 |
 |   Works in three passes:
 |     1. for every vector of site patterns, sum cond like * base
 |        frequency over all rate categories, multiply by the
 |        category frequency and store the result in m->lnL_Vec;
 |     2. if the model has pInvar, do the same for the invariable
 |        cond likes (multiplied by pInvar) into m->lnLI_Vec;
 |     3. loop over the individual site patterns and accumulate
 |        the weighted log likelihoods into *lnL.
 |
 |   p             : node whose conditional likelihoods are used
 |   division      : index into modelSettings
 |   chain         : chain index
 |   lnL           : out; reset to 0.0, then accumulated over patterns
 |   whichSitePats : selects the row of numSitesOfPat used as weights
 |
 |   Returns NO_ERROR on success. If a site likelihood is below
 |   LIKE_EPSILON, *lnL is set to MRBFLT_NEG_MAX, abortMove is set
 |   to YES and ERROR is returned.
 |
 -------------------------------------------------------------------*/
int Likelihood_NUC4_FMA (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
{
    int             c, k, hasPInvar;
    MrBFlt          freq, *bs, pInvar=0.0, like, likeI, lnScale;
    CLFlt           *lnScaler, *nSitesOfPat, *lnL_Vec, *lnLI_Vec;
    __m256          *clPtr, **clP, *clInvar=NULL;
    __m256          mA, mC, mG, mT, mFreq, mPInvar=_mm256_set1_ps(0.0f), mLike;
    ModelInfo       *m;

    /* find model settings and pInvar, invar cond likes */
    m = &modelSettings[division];
    if (m->pInvar == NULL)
    {
        hasPInvar = NO;
    }
    else
    {
        hasPInvar = YES;
        pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
        mPInvar = _mm256_set1_ps ((CLFlt)(pInvar));
        clInvar = (__m256 *) (m->invCondLikes);
    }

    /* find conditional likelihood pointers (one per rate category) */
    clPtr = (__m256 *) (m->condLikes[m->condLikeIndex[chain][p->index]]);
    clP = m->clP_AVX;
    for (k=0; k<m->numRateCats; k++)
    {
        clP[k] = clPtr;
        clPtr += m->numVecChars * m->numModelStates;
    }
    lnL_Vec  = m->lnL_Vec;
    lnLI_Vec = m->lnLI_Vec;

    /* find base frequencies and broadcast them to all vector lanes */
    bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
    mA = _mm256_set1_ps ((CLFlt)(bs[A]));
    mC = _mm256_set1_ps ((CLFlt)(bs[C]));
    mG = _mm256_set1_ps ((CLFlt)(bs[G]));
    mT = _mm256_set1_ps ((CLFlt)(bs[T]));

    /* find category frequencies */
    if (hasPInvar == NO)
        freq =  1.0 / m->numRateCats;
    else
        freq =  (1.0 - pInvar) / m->numRateCats;
    mFreq = _mm256_set1_ps ((CLFlt)(freq));

    /* find tree scaler */
    lnScaler = m->scalers[m->siteScalerIndex[chain]];

    /* find nSitesOfPat */
    nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;

    /* reset lnL */
    *lnL = 0.0;

    /* calculate variable likelihood */
    for (c=0; c<m->numVecChars; c++)
    {
        mLike = _mm256_setzero_ps ();
        for (k=0; k<m->numRateCats; k++)
        {
            mLike = _mm256_fmadd_ps (clP[k][A], mA, mLike);
            mLike = _mm256_fmadd_ps (clP[k][C], mC, mLike);
            mLike = _mm256_fmadd_ps (clP[k][G], mG, mLike);
            mLike = _mm256_fmadd_ps (clP[k][T], mT, mLike);
            clP[k] += 4;
        }
        mLike = _mm256_mul_ps (mLike, mFreq);
        /* aligned store; presumably lnL_Vec is allocated with suitable alignment */
        _mm256_store_ps (lnL_Vec, mLike);
        lnL_Vec += m->numFloatsPerVec;
    }

    /* calculate invariable likelihood */
    if (hasPInvar == YES)
    {
        for (c=0; c<m->numVecChars; c++)
        {
            mLike = _mm256_mul_ps (clInvar[A], mA);
            mLike = _mm256_fmadd_ps (clInvar[C], mC, mLike);
            mLike = _mm256_fmadd_ps (clInvar[G], mG, mLike);
            mLike = _mm256_fmadd_ps (clInvar[T], mT, mLike);
            mLike = _mm256_mul_ps (mLike, mPInvar);
            _mm256_store_ps (lnLI_Vec, mLike);
            clInvar += 4;
            lnLI_Vec += m->numFloatsPerVec;
        }
    }

    /* accumulate results */
    if (hasPInvar == NO)
    {
        for (c=0; c<m->numChars; c++)
        {
            like = m->lnL_Vec[c];
            /* check against LIKE_EPSILON (values close to zero are problematic) */
            if (like < LIKE_EPSILON)
            {
#   ifdef DEBUG_LIKELIHOOD
                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
#   endif
                (*lnL) = MRBFLT_NEG_MAX;
                abortMove = YES;
                return ERROR;
            }
            else
            {
                (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
            }
        }
    }
    else
    {
        /* has invariable category */
        for (c=0; c<m->numChars; c++)
        {
            like    = m->lnL_Vec[c];
            likeI   = m->lnLI_Vec[c];
            lnScale = lnScaler[c];      /* log scaler that belongs to the value finally held in like */
            if (lnScaler[c] < -200)
            {
                /* we are not going to be able to exponentiate the scaling factor */
                if (likeI > 1E-70)
                {
                    /* forget about like; it is going to be insignificant compared to likeI.
                       likeI is on the unscaled scale (the branch below divides it by
                       exp(lnScaler[c]) before adding it to the scaled like), so no
                       scaler may be added to log(likeI). */
                    like    = likeI;
                    lnScale = 0.0;
                }
                else
                {
                    /* treat likeI as if 0.0, that is, ignore it completely */
                }
            }
            else
                like = like + (likeI / exp (lnScaler[c]));

            /* NOTE(review): other NUC4 likelihood variants in this file contain the same
               lnScaler < -200 pattern; verify that they are handled consistently. */

            /* check against LIKE_EPSILON (values close to zero are problematic) */
            if (like < LIKE_EPSILON)
            {
#   ifdef DEBUG_LIKELIHOOD
                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
#   endif
                (*lnL) = MRBFLT_NEG_MAX;
                abortMove = YES;
                return ERROR;
            }
            else
            {
                (*lnL) += (lnScale +  log(like)) * nSitesOfPat[c];
            }
        }
    }

    return NO_ERROR;
}
#endif
6756 
6757 
#if defined (AVX_ENABLED)
/*------------------------------------------------------------------
 |
 |   Likelihood_NUC4_AVX: 4by4 nucleotide models with or without rate
 |       variation using AVX code
 |
 |   Works in three passes:
 |     1. for every vector of site patterns, sum cond like * base
 |        frequency over all rate categories, multiply by the
 |        category frequency and store the result in m->lnL_Vec;
 |     2. if the model has pInvar, do the same for the invariable
 |        cond likes (multiplied by pInvar) into m->lnLI_Vec;
 |     3. loop over the individual site patterns and accumulate
 |        the weighted log likelihoods into *lnL.
 |
 |   p             : node whose conditional likelihoods are used
 |   division      : index into modelSettings
 |   chain         : chain index
 |   lnL           : out; reset to 0.0, then accumulated over patterns
 |   whichSitePats : selects the row of numSitesOfPat used as weights
 |
 |   Returns NO_ERROR on success. If a site likelihood is below
 |   LIKE_EPSILON, *lnL is set to MRBFLT_NEG_MAX, abortMove is set
 |   to YES and ERROR is returned.
 |
 -------------------------------------------------------------------*/
int Likelihood_NUC4_AVX (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
{
    int             c, k, hasPInvar;
    MrBFlt          freq, *bs, pInvar=0.0, like, likeI, lnScale;
    CLFlt           *lnScaler, *nSitesOfPat, *lnL_Vec, *lnLI_Vec;
    __m256          *clPtr, **clP, *clInvar=NULL;
    __m256          m1, mA, mC, mG, mT, mFreq, mPInvar=_mm256_set1_ps(0.0f), mLike;
    ModelInfo       *m;

    /* find model settings and pInvar, invar cond likes */
    m = &modelSettings[division];
    if (m->pInvar == NULL)
    {
        hasPInvar = NO;
    }
    else
    {
        hasPInvar = YES;
        pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
        mPInvar = _mm256_set1_ps ((CLFlt)(pInvar));
        clInvar = (__m256 *) (m->invCondLikes);
    }

    /* find conditional likelihood pointers (one per rate category) */
    clPtr = (__m256 *) (m->condLikes[m->condLikeIndex[chain][p->index]]);
    clP = m->clP_AVX;
    for (k=0; k<m->numRateCats; k++)
    {
        clP[k] = clPtr;
        clPtr += m->numVecChars * m->numModelStates;
    }
    lnL_Vec  = m->lnL_Vec;
    lnLI_Vec = m->lnLI_Vec;

    /* find base frequencies and broadcast them to all vector lanes */
    bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
    mA = _mm256_set1_ps ((CLFlt)(bs[A]));
    mC = _mm256_set1_ps ((CLFlt)(bs[C]));
    mG = _mm256_set1_ps ((CLFlt)(bs[G]));
    mT = _mm256_set1_ps ((CLFlt)(bs[T]));

    /* find category frequencies */
    if (hasPInvar == NO)
        freq =  1.0 / m->numRateCats;
    else
        freq =  (1.0 - pInvar) / m->numRateCats;
    mFreq = _mm256_set1_ps ((CLFlt)(freq));

    /* find tree scaler */
    lnScaler = m->scalers[m->siteScalerIndex[chain]];

    /* find nSitesOfPat */
    nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;

    /* reset lnL */
    *lnL = 0.0;

    /* calculate variable likelihood */
    for (c=0; c<m->numVecChars; c++)
    {
        mLike = _mm256_setzero_ps ();
        for (k=0; k<m->numRateCats; k++)
        {
            m1    = _mm256_mul_ps (clP[k][A], mA);
            mLike = _mm256_add_ps (mLike, m1);
            m1    = _mm256_mul_ps (clP[k][C], mC);
            mLike = _mm256_add_ps (mLike, m1);
            m1    = _mm256_mul_ps (clP[k][G], mG);
            mLike = _mm256_add_ps (mLike, m1);
            m1    = _mm256_mul_ps (clP[k][T], mT);
            mLike = _mm256_add_ps (mLike, m1);
            clP[k] += 4;
        }
        mLike = _mm256_mul_ps (mLike, mFreq);
        /* aligned store; presumably lnL_Vec is allocated with suitable alignment */
        _mm256_store_ps (lnL_Vec, mLike);
        lnL_Vec += m->numFloatsPerVec;
    }

    /* calculate invariable likelihood */
    if (hasPInvar == YES)
    {
        for (c=0; c<m->numVecChars; c++)
        {
            mLike = _mm256_mul_ps (clInvar[A], mA);
            m1    = _mm256_mul_ps (clInvar[C], mC);
            mLike = _mm256_add_ps (mLike, m1);
            m1    = _mm256_mul_ps (clInvar[G], mG);
            mLike = _mm256_add_ps (mLike, m1);
            m1    = _mm256_mul_ps (clInvar[T], mT);
            mLike = _mm256_add_ps (mLike, m1);
            mLike = _mm256_mul_ps (mLike, mPInvar);

            _mm256_store_ps (lnLI_Vec, mLike);
            clInvar += 4;
            lnLI_Vec += m->numFloatsPerVec;
        }
    }

    /* accumulate results */
    if (hasPInvar == NO)
    {
        for (c=0; c<m->numChars; c++)
        {
            like = m->lnL_Vec[c];
            /* check against LIKE_EPSILON (values close to zero are problematic) */
            if (like < LIKE_EPSILON)
            {
#   ifdef DEBUG_LIKELIHOOD
                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
#   endif
                (*lnL) = MRBFLT_NEG_MAX;
                abortMove = YES;
                return ERROR;
            }
            else
            {
                (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
            }
        }
    }
    else
    {
        /* has invariable category */
        for (c=0; c<m->numChars; c++)
        {
            like    = m->lnL_Vec[c];
            likeI   = m->lnLI_Vec[c];
            lnScale = lnScaler[c];      /* log scaler that belongs to the value finally held in like */
            if (lnScaler[c] < -200)
            {
                /* we are not going to be able to exponentiate the scaling factor */
                if (likeI > 1E-70)
                {
                    /* forget about like; it is going to be insignificant compared to likeI.
                       likeI is on the unscaled scale (the branch below divides it by
                       exp(lnScaler[c]) before adding it to the scaled like), so no
                       scaler may be added to log(likeI). */
                    like    = likeI;
                    lnScale = 0.0;
                }
                else
                {
                    /* treat likeI as if 0.0, that is, ignore it completely */
                }
            }
            else
                like = like + (likeI / exp (lnScaler[c]));

            /* NOTE(review): other NUC4 likelihood variants in this file contain the same
               lnScaler < -200 pattern; verify that they are handled consistently. */

            /* check against LIKE_EPSILON (values close to zero are problematic) */
            if (like < LIKE_EPSILON)
            {
#   ifdef DEBUG_LIKELIHOOD
                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
#   endif
                (*lnL) = MRBFLT_NEG_MAX;
                abortMove = YES;
                return ERROR;
            }
            else
            {
                (*lnL) += (lnScale +  log(like)) * nSitesOfPat[c];
            }
        }
    }

    return NO_ERROR;
}
#endif
6928 
6929 
#if defined (SSE_ENABLED)
/*------------------------------------------------------------------
|
|   Likelihood_NUC4_SSE: 4by4 nucleotide models with or without rate
|       variation
|
|   Works in three passes:
|     1. for every vector of site patterns, sum cond like * base
|        frequency over all rate categories, multiply by the
|        category frequency and store the result in m->lnL_Vec;
|     2. if the model has pInvar, do the same for the invariable
|        cond likes (multiplied by pInvar) into m->lnLI_Vec;
|     3. loop over the individual site patterns and accumulate
|        the weighted log likelihoods into *lnL.
|
|   p             : node whose conditional likelihoods are used
|   division      : index into modelSettings
|   chain         : chain index
|   lnL           : out; reset to 0.0, then accumulated over patterns
|   whichSitePats : selects the row of numSitesOfPat used as weights
|
|   Returns NO_ERROR on success. If a site likelihood is below
|   LIKE_EPSILON, *lnL is set to MRBFLT_NEG_MAX, abortMove is set
|   to YES and ERROR is returned.
|
-------------------------------------------------------------------*/
int Likelihood_NUC4_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
{
    int             c, k, hasPInvar;
    MrBFlt          freq, *bs, pInvar=0.0, like, likeI, lnScale;
    CLFlt           *lnScaler, *nSitesOfPat, *lnL_Vec, *lnLI_Vec;
    __m128          *clPtr, **clP, *clInvar=NULL;
    __m128          m1, mA, mC, mG, mT, mFreq, mPInvar=_mm_set1_ps(0.0f), mLike;
    ModelInfo       *m;

    /* find model settings and pInvar, invar cond likes */
    m = &modelSettings[division];
    if (m->pInvar == NULL)
        {
        hasPInvar = NO;
        }
    else
        {
        hasPInvar = YES;
        pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
        mPInvar = _mm_set1_ps ((CLFlt)(pInvar));
        clInvar = (__m128 *) (m->invCondLikes);
        }

    /* find conditional likelihood pointers (one per rate category) */
    clPtr = (__m128 *) (m->condLikes[m->condLikeIndex[chain][p->index]]);
    clP = m->clP_SSE;
    for (k=0; k<m->numRateCats; k++)
        {
        clP[k] = clPtr;
        clPtr += m->numVecChars * m->numModelStates;
        }
    lnL_Vec  = m->lnL_Vec;
    lnLI_Vec = m->lnLI_Vec;

    /* find base frequencies and broadcast them to all vector lanes */
    bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
    mA = _mm_set1_ps ((CLFlt)(bs[A]));
    mC = _mm_set1_ps ((CLFlt)(bs[C]));
    mG = _mm_set1_ps ((CLFlt)(bs[G]));
    mT = _mm_set1_ps ((CLFlt)(bs[T]));

    /* find category frequencies */
    if (hasPInvar == NO)
        freq =  1.0 / m->numRateCats;
    else
        freq =  (1.0 - pInvar) / m->numRateCats;
    mFreq = _mm_set1_ps ((CLFlt)(freq));

    /* find tree scaler */
    lnScaler = m->scalers[m->siteScalerIndex[chain]];

    /* find nSitesOfPat */
    nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;

    /* reset lnL */
    *lnL = 0.0;

    /* calculate variable likelihood */
    for (c=0; c<m->numVecChars; c++)
        {
        mLike = _mm_setzero_ps ();
        for (k=0; k<m->numRateCats; k++)
            {
            m1    = _mm_mul_ps (clP[k][A], mA);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clP[k][C], mC);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clP[k][G], mG);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clP[k][T], mT);
            mLike = _mm_add_ps (mLike, m1);
            clP[k] += 4;
            }
        mLike = _mm_mul_ps (mLike, mFreq);
        /* aligned store; presumably lnL_Vec is allocated with suitable alignment */
        _mm_store_ps (lnL_Vec, mLike);
        lnL_Vec += m->numFloatsPerVec;
        }

    /* calculate invariable likelihood */
    if (hasPInvar == YES)
        {
        for (c=0; c<m->numVecChars; c++)
            {
            mLike = _mm_mul_ps (clInvar[A], mA);
            m1    = _mm_mul_ps (clInvar[C], mC);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clInvar[G], mG);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clInvar[T], mT);
            mLike = _mm_add_ps (mLike, m1);
            mLike = _mm_mul_ps (mLike, mPInvar);

            _mm_store_ps (lnLI_Vec, mLike);
            clInvar += 4;
            lnLI_Vec += m->numFloatsPerVec;
            }
        }

    /* accumulate results */
    if (hasPInvar == NO)
        {
        for (c=0; c<m->numChars; c++)
            {
            like = m->lnL_Vec[c];
            /* check against LIKE_EPSILON (values close to zero are problematic) */
            if (like < LIKE_EPSILON)
                {
#   ifdef DEBUG_LIKELIHOOD
                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
#   endif
                (*lnL) = MRBFLT_NEG_MAX;
                abortMove = YES;
                return ERROR;
                }
            else
                {
                (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
                }
            }
        }
    else
        {
        /* has invariable category */
        for (c=0; c<m->numChars; c++)
            {
            like    = m->lnL_Vec[c];
            likeI   = m->lnLI_Vec[c];
            lnScale = lnScaler[c];      /* log scaler that belongs to the value finally held in like */
            if (lnScaler[c] < -200)
                {
                /* we are not going to be able to exponentiate the scaling factor */
                if (likeI > 1E-70)
                    {
                    /* forget about like; it is going to be insignificant compared to likeI.
                       likeI is on the unscaled scale (the branch below divides it by
                       exp(lnScaler[c]) before adding it to the scaled like), so no
                       scaler may be added to log(likeI). */
                    like    = likeI;
                    lnScale = 0.0;
                    }
                else
                    {
                    /* treat likeI as if 0.0, that is, ignore it completely */
                    }
                }
            else
                like = like + (likeI / exp (lnScaler[c]));

            /* NOTE(review): other NUC4 likelihood variants in this file contain the same
               lnScaler < -200 pattern; verify that they are handled consistently. */

            /* check against LIKE_EPSILON (values close to zero are problematic) */
            if (like < LIKE_EPSILON)
                {
#   ifdef DEBUG_LIKELIHOOD
                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
#   endif
                (*lnL) = MRBFLT_NEG_MAX;
                abortMove = YES;
                return ERROR;
                }
            else
                {
                (*lnL) += (lnScale +  log(like)) * nSitesOfPat[c];
                }
            }
        }

    return NO_ERROR;
}
#endif
7100 
7101 
7102 /*------------------------------------------------------------------
7103 |
7104 |   Likelihood_NY98: Codon model with three selection categories,
7105 |       after Nielsen and Yang (1998).
7106 |
7107 -------------------------------------------------------------------*/
Likelihood_NY98(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)7108 int Likelihood_NY98 (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
7109 {
7110     int             c, j, k, nStates;
7111     MrBFlt          catLike, like, *bs, *omegaCatFreq;
7112     CLFlt           **clP,*clPtr, *lnScaler, *nSitesOfPat;
7113     ModelInfo       *m;
7114 
7115     m = &modelSettings[division];
7116 
7117     /* number of states */
7118     nStates = m->numModelStates;
7119 
7120     /* find conditional likelihood pointers */
7121     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
7122     clP   = m->clP;
7123     for (k=0; k<m->numOmegaCats; k++)
7124         {
7125         clP[k] = clPtr;
7126         clPtr += m->numChars * m->numModelStates;
7127         }
7128 
7129     /* find codon frequencies */
7130     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
7131 
7132     /* find category frequencies */
7133     omegaCatFreq = GetParamSubVals (m->omega, chain, state[chain]);
7134 
7135     /* find site scaler */
7136     lnScaler = m->scalers[m->siteScalerIndex[chain]];
7137 
7138     /* find nSitesOfPat */
7139     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
7140 
7141     *lnL = 0.0; /* reset lnL */
7142 
7143     for (c=m->numDummyChars; c<m->numChars; c++)
7144         {
7145         like = 0.0;
7146         for (k=0; k<m->numOmegaCats; k++)
7147             {
7148             catLike = 0.0;
7149             for (j=0; j<nStates; j++)
7150                 catLike += clP[k][j] * bs[j];
7151             like += catLike * omegaCatFreq[k];
7152             clP[k] += nStates;
7153             }
7154         /* check against LIKE_EPSILON (values close to zero are problematic) */
7155         if (like < LIKE_EPSILON)
7156             {
7157 #   ifdef DEBUG_LIKELIHOOD
7158             MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
7159 #   endif
7160             (*lnL) = MRBFLT_NEG_MAX;
7161             abortMove = YES;
7162             return ERROR;
7163             }
7164         else
7165             {
7166             (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
7167             }
7168         }
7169 
7170     return NO_ERROR;
7171 }
7172 
7173 
7174 #if defined (SSE_ENABLED)
7175 /*------------------------------------------------------------------
7176 |
7177 |   Likelihood_NY98_SSE: Codon model with three selection categories,
7178 |       after Nielsen and Yang (1998).
7179 |
7180 -------------------------------------------------------------------*/
Likelihood_NY98_SSE(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)7181 int Likelihood_NY98_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
7182 {
7183     int             c, j, k, nStates;
7184     MrBFlt          like, *bs, *omegaCatFreq;
7185     CLFlt           *lnScaler, *nSitesOfPat, *lnL_Vec;
7186     __m128          *clPtr, **clP;
7187     __m128          m1, mCatLike, mLike;
7188     ModelInfo       *m;
7189 
7190     m = &modelSettings[division];
7191 
7192     /* number of states */
7193     nStates = m->numModelStates;
7194 
7195     /* find conditional likelihood pointers */
7196     clPtr = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index]];
7197     clP   = m->clP_SSE;
7198     for (k=0; k<m->numOmegaCats; k++)
7199         {
7200         clP[k] = clPtr;
7201         clPtr += m->numVecChars * nStates;
7202         }
7203 
7204     /* find codon frequencies */
7205     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
7206 
7207     /* find category frequencies */
7208     omegaCatFreq = GetParamSubVals (m->omega, chain, state[chain]);
7209 
7210     /* find site scaler */
7211     lnScaler = m->scalers[m->siteScalerIndex[chain]];
7212 
7213     /* find nSitesOfPat */
7214     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
7215 
7216     *lnL = 0.0; /* reset lnL */
7217 
7218     lnL_Vec  = m->lnL_Vec;
7219     for (c=0; c<m->numVecChars; c++)
7220         {
7221         mLike = _mm_setzero_ps ();
7222         for (k=0; k<m->numOmegaCats; k++)
7223             {
7224             mCatLike = _mm_setzero_ps ();
7225             for (j=0; j<nStates; j++)
7226                 {
7227                 m1 = _mm_mul_ps (clP[k][j], _mm_set1_ps ((CLFlt)bs[j]));
7228                 mCatLike = _mm_add_ps (mCatLike, m1);
7229                 }
7230             m1 = _mm_mul_ps (mCatLike, _mm_set1_ps ((CLFlt)omegaCatFreq[k]));
7231             mLike = _mm_add_ps (mLike, m1);
7232             clP[k] += nStates;
7233             }
7234         _mm_store_ps (lnL_Vec, mLike);
7235         lnL_Vec += m->numFloatsPerVec;
7236         }
7237     for (c=m->numDummyChars; c<m->numChars; c++)
7238         {
7239         like = m->lnL_Vec[c];
7240         /* check against LIKE_EPSILON (values close to zero are problematic) */
7241         if (like < LIKE_EPSILON)
7242             {
7243 #   ifdef DEBUG_LIKELIHOOD
7244             MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
7245 #   endif
7246             (*lnL) = MRBFLT_NEG_MAX;
7247             abortMove = YES;
7248             return ERROR;
7249             }
7250         else
7251             {
7252             (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
7253             }
7254         }
7255 
7256     return NO_ERROR;
7257 }
7258 #endif
7259 
7260 
7261 /*------------------------------------------------------------------
7262 |
7263 |   Likelihood_Res: restriction site model with or without rate
7264 |       variation
7265 |
7266 -------------------------------------------------------------------*/
Likelihood_Res(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)7267 int Likelihood_Res (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
7268 {
7269     int             c, k;
7270     MrBFlt          *bs, freq, like, pUnobserved, pObserved;
7271     CLFlt           *clPtr, **clP, *lnScaler, *nSitesOfPat;
7272     ModelInfo       *m;
7273 
7274 
7275     m = &modelSettings[division];
7276 
7277     /* find conditional likelihood pointer */
7278     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
7279     clP = m->clP;
7280     for (k=0; k<m->numRateCats; k++)
7281         {
7282         clP[k] = clPtr;
7283         clPtr += m->numChars * m->numModelStates;
7284         }
7285 
7286     /* find base frequencies */
7287     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
7288 
7289     /* find category frequencies */
7290     freq =  1.0 /  m->numRateCats;
7291 
7292     /* find site scaler */
7293     lnScaler = m->scalers[m->siteScalerIndex[chain]];
7294 
7295     /* find nSitesOfPat */
7296     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
7297 
7298     *lnL = 0.0; /* reset lnL */
7299 
7300     pUnobserved = 0.0;
7301     for (c=0; c<m->numDummyChars; c++)
7302         {
7303         like = 0.0;
7304         for (k=0; k<m->numRateCats; k++)
7305             {
7306             like += (clP[k][0]*bs[0] + clP[k][1]*bs[1]) * freq;
7307             clP[k] += 2;
7308             }
7309         pUnobserved += like *  exp(lnScaler[c]);
7310         }
7311 
7312     pObserved =  1.0 - pUnobserved;
7313     if (pObserved < LIKE_EPSILON)
7314         {
7315 #   ifdef DEBUG_LIKELIHOOD
7316         MrBayesPrint ("%s   WARNING: p(Observed) < LIKE_EPSILON - for division %d p(Observed) = %1.30le\n", spacer, division+1, pObserved);
7317 #   endif
7318         (*lnL) = MRBFLT_NEG_MAX;
7319         abortMove = YES;
7320         return ERROR;
7321         }
7322 
7323     for (c=m->numDummyChars; c<m->numChars; c++)
7324         {
7325         like = 0.0;
7326         for (k=0; k<m->numRateCats; k++)
7327             {
7328             like += (clP[k][0]*bs[0] + clP[k][1]*bs[1]) * freq;
7329             clP[k] += 2;
7330             }
7331         /* check against LIKE_EPSILON (values close to zero are problematic) */
7332         if (like < LIKE_EPSILON)
7333             {
7334 #   ifdef DEBUG_LIKELIHOOD
7335             MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
7336 #   endif
7337             (*lnL) = MRBFLT_NEG_MAX;
7338             abortMove = YES;
7339             return ERROR;
7340             }
7341         else
7342             {
7343             (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
7344             }
7345         }
7346 
7347     /* correct for absent characters */
7348     (*lnL) -=  log(pObserved) * (m->numUncompressedChars);
7349 
7350     return NO_ERROR;
7351 }
7352 
7353 
7354 #if defined (SSE_ENABLED)
7355 /*------------------------------------------------------------------
7356 |
7357 |   Likelihood_Res_SSE: restriction site model with or without rate
7358 |       variation
7359 |
7360 -------------------------------------------------------------------*/
Likelihood_Res_SSE(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)7361 int Likelihood_Res_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
7362 {
7363     int             c, k;
7364     MrBFlt          freq, *bs, like, pUnobserved, pObserved;
7365     CLFlt           *lnScaler, *nSitesOfPat, *lnL_Vec;
7366     __m128          *clPtr, **clP;
7367     __m128          m1, mA, mB, mFreq, mLike;
7368     ModelInfo       *m;
7369 
7370     /* find model settings and pInvar, invar cond likes */
7371     m = &modelSettings[division];
7372 
7373     /* find conditional likelihood pointers */
7374     clPtr = (__m128 *) (m->condLikes[m->condLikeIndex[chain][p->index]]);
7375     clP = m->clP_SSE;
7376     for (k=0; k<m->numRateCats; k++)
7377         {
7378         clP[k] = clPtr;
7379         clPtr += m->numVecChars * m->numModelStates;
7380         }
7381     lnL_Vec  = m->lnL_Vec;
7382 
7383     /* find base frequencies */
7384     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
7385     mA = _mm_set1_ps ((CLFlt)(bs[0]));
7386     mB = _mm_set1_ps ((CLFlt)(bs[1]));
7387 
7388     freq =  1.0 / m->numRateCats;
7389     mFreq = _mm_set1_ps ((CLFlt)(freq));
7390 
7391     /* find tree scaler */
7392     lnScaler = m->scalers[m->siteScalerIndex[chain]];
7393 
7394     /* find nSitesOfPat */
7395     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
7396 
7397     /* reset lnL */
7398     *lnL = 0.0;
7399 
7400     /* calculate variable likelihood */
7401     for (c=0; c<m->numVecChars; c++)
7402         {
7403         mLike = _mm_setzero_ps ();
7404         for (k=0; k<m->numRateCats; k++)
7405             {
7406             m1    = _mm_mul_ps (clP[k][0], mA);
7407             mLike = _mm_add_ps (mLike, m1);
7408             m1    = _mm_mul_ps (clP[k][1], mB);
7409             mLike = _mm_add_ps (mLike, m1);
7410             clP[k] += 2;
7411             }
7412         mLike = _mm_mul_ps (mLike, mFreq);
7413         _mm_store_ps (lnL_Vec, mLike);
7414         lnL_Vec += m->numFloatsPerVec;
7415         }
7416 
7417     pUnobserved = 0.0;
7418     for (c=0; c<m->numDummyChars; c++)
7419         {
7420         like  = m->lnL_Vec[c];
7421         pUnobserved += like *  exp(lnScaler[c]);
7422         }
7423 
7424     pObserved =  1.0 - pUnobserved;
7425     if (pObserved < LIKE_EPSILON)
7426         {
7427 #   ifdef DEBUG_LIKELIHOOD
7428         MrBayesPrint ("%s   WARNING: p(Observed) < LIKE_EPSILON - for division %d p(Observed) = %1.30le\n", spacer, division+1, pObserved);
7429 #   endif
7430         (*lnL) = MRBFLT_NEG_MAX;
7431         abortMove = YES;
7432         return ERROR;
7433         }
7434 
7435     for (c=m->numDummyChars; c<m->numChars; c++)
7436         {
7437         like  = m->lnL_Vec[c];
7438         /* check against LIKE_EPSILON (values close to zero are problematic) */
7439         if (like < LIKE_EPSILON)
7440             {
7441 #   ifdef DEBUG_LIKELIHOOD
7442             MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
7443 #   endif
7444             (*lnL) = MRBFLT_NEG_MAX;
7445             abortMove = YES;
7446             return ERROR;
7447             }
7448         else
7449             {
7450             (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
7451             }
7452         }
7453 
7454     /* correct for absent characters */
7455     (*lnL) -=  log(pObserved) * (m->numUncompressedChars);
7456 
7457     return NO_ERROR;
7458 }
7459 #endif
7460 
7461 
7462 /*------------------------------------------------------------------
7463 |
7464 |   Likelihood_Std: variable states model with or without rate
7465 |       variation
7466 |
7467 -------------------------------------------------------------------*/
Likelihood_Std(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)7468 int Likelihood_Std (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
7469 {
7470     int             b, c, j, k, nBetaCats, nRateCats, nStates, numReps;
7471     MrBFlt          catLike, catFreq, rateFreq, like, *bs, *bsBase,
7472                     pUnobserved, pObserved;
7473     CLFlt           *clPtr, **clP, *lnScaler, *nSitesOfPat;
7474     ModelInfo       *m;
7475 
7476     m = &modelSettings[division];
7477 
7478     numReps=0;
7479     for (c=0; c<m->numChars; c++)
7480         {
7481         if (m->nStates[c] == 2)
7482             numReps += m->numBetaCats * 2;
7483         else
7484             numReps += m->nStates[c];
7485         }
7486     /* find conditional likelihood pointers */
7487     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
7488     clP   = m->clP;
7489     for (k=0; k<m->numRateCats; k++)
7490         {
7491         clP[k] = clPtr;
7492         clPtr += numReps;
7493         }
7494 
7495     /* find base frequencies */
7496     bsBase = GetParamStdStateFreqs (m->stateFreq, chain, state[chain]);
7497 
7498     /* find rate category number and frequencies */
7499     nRateCats = m->numRateCats;
7500     rateFreq = 1.0 / nRateCats;
7501 
7502     /* find site scaler */
7503     lnScaler = m->scalers[m->siteScalerIndex[chain]];
7504 
7505     /* find nSitesOfPat */
7506     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
7507 
7508     *lnL = 0.0; /* reset lnL */
7509 
7510     if (m->numBetaCats == 1)
7511         {
7512         pUnobserved = 0.0;
7513         catFreq = rateFreq;
7514         for (c=j=0; c<m->numDummyChars; c++)
7515             {
7516             like = 0.0;
7517             nStates = m->nStates[c];
7518             bs = bsBase + m->bsIndex[c];
7519             for (k=0; k<nRateCats; k++)
7520                 {
7521                 catLike = 0.0;
7522                 for (j=0; j<nStates; j++)
7523                     catLike += clP[k][j] * bs[j];
7524                 like += catLike * catFreq;
7525                 clP[k] += nStates;
7526                 }
7527             pUnobserved += like *  exp(lnScaler[c]);
7528             }
7529 
7530         pObserved =  1.0 - pUnobserved;
7531         if (pObserved < LIKE_EPSILON)
7532             pObserved = LIKE_EPSILON;
7533 
7534         for (c=m->numDummyChars; c<m->numChars; c++)
7535             {
7536             like = 0.0;
7537             nStates = m->nStates[c];
7538             bs = bsBase + m->bsIndex[c];
7539 
7540             for (k=0; k<nRateCats; k++)
7541                 {
7542                 catLike = 0.0;
7543                 for (j=0; j<nStates; j++)
7544                     catLike += clP[k][j] * bs[j];
7545                 like += catLike * catFreq;
7546                 clP[k] += nStates;
7547                 }
7548             /* check against LIKE_EPSILON (values close to zero are problematic) */
7549             if (like < LIKE_EPSILON)
7550                 {
7551 #   ifdef DEBUG_LIKELIHOOD
7552                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
7553 #   endif
7554                 (*lnL) = MRBFLT_NEG_MAX;
7555                 abortMove = YES;
7556                 return ERROR;
7557                 }
7558             else
7559                 {
7560                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
7561                 }
7562             }
7563         }
7564     else
7565         {
7566         pUnobserved = 0.0;
7567         for (c=j=0; c<m->numDummyChars; c++)
7568             {
7569             like = 0.0;
7570             nStates = m->nStates[c];
7571             bs = bsBase + m->bsIndex[c];
7572             if (nStates == 2)
7573                 {
7574                 nBetaCats = m->numBetaCats;
7575                 catFreq = rateFreq / nBetaCats;
7576                 }
7577             else
7578                 {
7579                 nBetaCats = 1;
7580                 catFreq = rateFreq;
7581                 }
7582             for (b=0; b<nBetaCats; b++)
7583                 {
7584                 for (k=0; k<nRateCats; k++)
7585                     {
7586                     catLike = 0.0;
7587                     for (j=0; j<nStates; j++)
7588                         catLike += clP[k][j] * bs[j];
7589                     like += catLike * catFreq;
7590                     clP[k] += nStates;
7591                     }
7592                 bs += nStates;
7593                 }
7594             pUnobserved += like *  exp(lnScaler[c]);
7595             }
7596 
7597         pObserved =  1.0 - pUnobserved;
7598         if (pObserved < LIKE_EPSILON)
7599             pObserved = LIKE_EPSILON;
7600 
7601         for (c=m->numDummyChars; c<m->numChars; c++)
7602             {
7603             like = 0.0;
7604             nStates = m->nStates[c];
7605             bs = bsBase + m->bsIndex[c];
7606             if (nStates == 2)
7607                 {
7608                 nBetaCats = m->numBetaCats;
7609                 catFreq = rateFreq / nBetaCats;
7610                 }
7611             else
7612                 {
7613                 nBetaCats = 1;
7614                 catFreq = rateFreq;
7615                 }
7616             for (b=0; b<nBetaCats; b++)
7617                 {
7618                 for (k=0; k<nRateCats; k++)
7619                     {
7620                     catLike = 0.0;
7621                     for (j=0; j<nStates; j++)
7622                         catLike += clP[k][j] * bs[j];
7623                     like += catLike * catFreq;
7624                     clP[k] += nStates;
7625                     }
7626                 bs += nStates;
7627                 }
7628             /* check against LIKE_EPSILON (values close to zero are problematic) */
7629             if (like < LIKE_EPSILON)
7630                 {
7631 #   ifdef DEBUG_LIKELIHOOD
7632                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
7633 #   endif
7634                 (*lnL) = MRBFLT_NEG_MAX;
7635                 abortMove = YES;
7636                 return ERROR;
7637                 }
7638             else
7639                 {
7640                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
7641                 }
7642             }
7643         }
7644 
7645     /* correct for absent characters */
7646     (*lnL) -=  log(pObserved) * (m->numUncompressedChars);
7647 
7648     return NO_ERROR;
7649 }
7650 
7651 
7652 /*------------------------------------------------------------------
7653 |
7654 |   Likelihood_Pars: likelihood under the Tuffley and Steel (1997)
7655 |       model for characters with constant number of states. The idea
7656 |       is described in:
7657 |
7658 |       Tuffley, C., and M. Steel. 1997. Links between maximum likelihood
7659 |          and maximum parsimony under a simple model of site substitution.
7660 |          Bull. Math. Bio. 59:581-607.
7661 |
7662 |       The likelihood under the Tuffley and Steel (1997) model is:
7663 |
7664 |       L = k^[-(T + n)]
7665 |
7666 |       where L is the likelihood
7667 |             k is the number of character states
7668 |             T is the parsimony tree length
7669 |             n is the number of characters
7670 |
7671 |   The parsimony calculator does not use character packing; this is
7672 |       to enable reweighting of characters
7673 |
7674 |   Note that this is an empirical Bayes approach in that it uses the
7675 |       maximum likelihood branch length.
7676 |
7677 -------------------------------------------------------------------*/
Likelihood_Pars(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)7678 int Likelihood_Pars (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
7679 {
7680     int             c, i, nStates;
7681     BitsLong        done, *pL, *pR, *pP, *pA, *oldpP, x;
7682     CLFlt           nParsChars, treeLength;
7683     CLFlt           length, *nSitesOfPat, *newNodeLength, oldNodeLength;
7684     Tree            *t;
7685     ModelInfo       *m;
7686 
7687     /* Find model settings */
7688     m = &modelSettings[division];
7689 
7690     /* Get tree */
7691     t = GetTree(m->brlens,chain,state[chain]);
7692 
7693     /* Get parsimony tree length */
7694     treeLength = (CLFlt) m->parsTreeLength[2 * chain + state[chain]];
7695 
7696     /* Get number of states */
7697     nStates = m->numStates;
7698 
7699     /* Get number of sites of pat */
7700     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
7701 
7702     /* Mark the nodes that can be stop nodes                 */
7703     /* (there must not be any touched side nodes below them) */
7704     p = t->root;
7705     p->marked = YES;
7706     for (i=t->nIntNodes-1; i>=0; i--)
7707         {
7708         p = t->intDownPass[i];
7709         p->marked = NO;
7710         if (p->upDateCl == YES && p->anc->marked == YES)
7711             {
7712             if (p->left->upDateCl == NO || p->right->upDateCl == NO)
7713                 p->marked = YES;
7714             }
7715         }
7716 
7717     /* Now make downpass node by node */
7718     for (i=0; i<t->nIntNodes; i++)
7719         {
7720         p = t->intDownPass[i];
7721 
7722         /* continue if no work needs to be done */
7723         if (p->upDateCl == NO)
7724             continue;
7725 
7726         /* flip space */
7727         FlipCondLikeSpace(m, chain, p->index);
7728 
7729         /* find parsimony sets for the node and its environment */
7730         pL    = m->parsSets[m->condLikeIndex[chain][p->left->index ]];
7731         pR    = m->parsSets[m->condLikeIndex[chain][p->right->index]];
7732         oldpP = m->parsSets[m->condLikeScratchIndex[p->index       ]];
7733         pP    = m->parsSets[m->condLikeIndex[chain][p->index       ]];
7734 
7735         /* find old and new node lengths */
7736         oldNodeLength =  m->parsNodeLens[m->condLikeScratchIndex[p->index]];
7737         newNodeLength = &m->parsNodeLens[m->condLikeIndex[chain][p->index]];
7738 
7739         if (t->isRooted == NO && p->anc->anc == NULL)
7740             {
7741             pA = m->parsSets[m->condLikeIndex[chain][p->anc->index]];
7742             length = 0.0;
7743             for (c=0; c<m->numChars; c++)
7744                 {
7745                 x = pL[c] & pR[c];
7746                 if (x == 0)
7747                     {
7748                     x = pL[c] | pR[c];
7749                     length += nSitesOfPat[c];
7750                     }
7751                 if ((x & pA[c]) == 0)
7752                     length += nSitesOfPat[c];
7753                 pP[c] = x;
7754                 }
7755             treeLength += (length - oldNodeLength);
7756             newNodeLength[0] = length;
7757             }
7758         else
7759             {
7760             length = 0.0;
7761             done = 0;
7762             for (c=0; c<m->numChars; c++)
7763                 {
7764                 x = pL[c] & pR[c];
7765                 if (x == 0)
7766                     {
7767                     x = pL[c] | pR[c];
7768                     length += nSitesOfPat[c];
7769                     }
7770                 pP[c] = x;
7771                 done |= (x^oldpP[c]);
7772                 }
7773             treeLength += (length - oldNodeLength);
7774             newNodeLength[0] = length;
7775             if (p->marked == YES && done == 0)
7776                 break;
7777             }
7778         }
7779 
7780     /* Count number of characters in the partition. It is calculated
7781        on the fly because this number is going to differ for
7782        different chains if character reweighting is used. */
7783     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
7784     nParsChars = 0.0;
7785     for (c=0; c<m->numChars; c++)
7786         nParsChars += nSitesOfPat[c];
7787 
7788     /* Calculate likelihood from parsimony tree length */
7789     *lnL = - ((treeLength + nParsChars) *  log (nStates));
7790 
7791     /* Store current parsimony tree length */
7792     m->parsTreeLength[2 * chain + state[chain]] = treeLength;
7793 
7794     return (NO_ERROR);
7795 }
7796 
7797 
#if 0
/* Likelihood_ParsCodon: placeholder for a codon parsimony calculator.
   This code is compiled out. It sets *lnL to 0.0, prints a message
   saying that the calculator is not implemented and returns ERROR. */
int Likelihood_ParsCodon (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
{
    /* the remaining arguments are unused; reference them to avoid warnings */
    (void) p;
    (void) division;
    (void) chain;
    (void) whichSitePats;

    *lnL = 0.0;

    MrBayesPrint ("%s   Parsimony calculator for codons not yet implemented\n", spacer);

    return ERROR;
}
#   endif
7816 
7817 
7818 /*------------------------------------------------------------------
7819 |
7820 |   Likelihood_Pars: likelihood under the Tuffley and Steel (1997)
7821 |       model for characters with constant number of states. The idea
7822 |       is described in:
7823 |
7824 |       Tuffley, C., and M. Steel. 1997. Links between maximum likelihood
7825 |          and maximum parsimony under a simple model of site substitution.
7826 |          Bull. Math. Bio. 59:581-607.
7827 |
7828 |       The likelihood under the Tuffley and Steel (1997) model is:
7829 |
7830 |       L = k^[-(T + n)]
7831 |
7832 |       where L is the likelihood
7833 |             k is the number of character states
7834 |             T is the parsimony tree length
7835 |             n is the number of characters
7836 |
7837 |   The parsimony calculator does not use character packing; this is
7838 |       to enable reweighting of characters
7839 |
7840 |   Note that this is an empirical Bayes approach in that it uses the
7841 |       maximum likelihood branch length.
7842 |
7843 |   This variant of the calculator assumes that the number of states
7844 |       is variable. It does not take state order into account.
7845 |
7846 -------------------------------------------------------------------*/
Likelihood_ParsStd(TreeNode * p,int division,int chain,MrBFlt * lnL,int whichSitePats)7847 int Likelihood_ParsStd (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
7848 {
7849     int             c, i, *nStates;
7850     BitsLong        *pL, *pR, *pP, *pA, x;
7851     CLFlt           *treeLength;
7852     CLFlt           *nSitesOfPat;
7853     Tree            *t;
7854     ModelInfo       *m;
7855 
7856     /* Find model settings */
7857     m = &modelSettings[division];
7858 
7859     /* Get tree */
7860     t = GetTree(m->brlens,chain,state[chain]);
7861 
7862     /* Allocate space for parsimony tree length */
7863     treeLength = (CLFlt *) SafeCalloc (m->numChars, sizeof (CLFlt));
7864 
7865     /* Get number of states */
7866     nStates = m->nStates;
7867 
7868     /* Get number of sites of pat */
7869     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
7870 
7871     /* Make downpass node by node; do not skip any nodes */
7872     for (i=0; i<t->nIntNodes; i++)
7873         {
7874         p = t->intDownPass[i];
7875 
7876         /* flip space */
7877         FlipCondLikeSpace(m, chain, p->index);
7878 
7879         /* find parsimony sets for the node and its environment */
7880         pL    = m->parsSets[m->condLikeIndex[chain][p->left->index ]];
7881         pR    = m->parsSets[m->condLikeIndex[chain][p->right->index]];
7882         pP    = m->parsSets[m->condLikeIndex[chain][p->index       ]];
7883 
7884         if (t->isRooted == NO && p->anc->anc == NULL)
7885             {
7886             pA = m->parsSets[m->condLikeIndex[chain][p->anc->index]];
7887             for (c=0; c<m->numChars; c++)
7888                 {
7889                 x = pL[c] & pR[c];
7890                 if (x == 0)
7891                     {
7892                     x = pL[c] | pR[c];
7893                     treeLength[c] += nSitesOfPat[c];
7894                     }
7895                 if ((x & pA[c]) == 0)
7896                     treeLength[c] += nSitesOfPat[c];
7897                 pP[c] = x;
7898                 }
7899             }
7900         else
7901             {
7902             for (c=0; c<m->numChars; c++)
7903                 {
7904                 x = pL[c] & pR[c];
7905                 if (x == 0)
7906                     {
7907                     x = pL[c] | pR[c];
7908                     treeLength[c] += nSitesOfPat[c];
7909                     }
7910                 pP[c] = x;
7911                 }
7912             }
7913         }
7914 
7915     /* Calculate the likelihood one character at a time */
7916     *lnL = 0.0;
7917     for (c=0; c<m->numChars; c++)
7918         {
7919         *lnL -= ((treeLength[c] + nSitesOfPat[c]) * log (nStates[c]));
7920         }
7921 
7922     /* Free space for parsimony character states */
7923     free (treeLength);
7924 
7925     return (NO_ERROR);
7926 }
7927 
#if defined(BEAGLE_V3_ENABLED)
/*-----------------------------------------------------------------
|
|   LaunchLogLikeForBeagleMultiPartition: calculate the log likelihood of the
|       new state of the chain for all divisions with Beagle
|
|   The divisions flagged with upDateCl are collected and handed to
|       LaunchBEAGLELogLikeMultiPartition in a single call; the stored
|       log likelihoods (m->lnLike) of the divisions that are not
|       flagged are then added to *lnL.
|
|   If UpDateCijk fails for a division, that division is left out of
|       the Beagle call and *lnL is MRBFLT_NEG_MAX on return, which
|       effectively aborts the move. The value is set again at the
|       very end because *lnL is passed to the Beagle call and added
|       to afterwards, so an assignment made before those steps is
|       not guaranteed to survive them.
|
-----------------------------------------------------------------*/
void LaunchLogLikeForBeagleMultiPartition(int chain, MrBFlt* lnL)
{
    int             d, divisionCount, cijkFailed;
    int             *divisions;
    ModelInfo       *m;

    /* indices of the divisions handed to Beagle */
    divisions = (int *) SafeCalloc (numCurrentDivisions, sizeof(int));
    divisionCount = 0;
    cijkFailed = NO;

    /* Cycle through divisions and recalculate tis and cond likes as necessary. */
    /* Code below does not try to avoid recalculating ti probs for divisions    */
    /* that could share ti probs with other divisions.                          */
    for (d=0; d<numCurrentDivisions; d++)
        {
#   if defined (BEST_MPI_ENABLED)
        if (isDivisionActive[d] == NO)
            continue;
#   endif
        m = &modelSettings[d];
        if (m->upDateCl == YES)
            {
            if (m->upDateCijk == YES)
                {
                if (UpDateCijk(d, chain) == ERROR)
                    {
                    (*lnL) = MRBFLT_NEG_MAX; /* effectively abort the move */
                    cijkFailed = YES;        /* remembered so the abort survives the steps below */
                    continue;
                    }
                m->upDateAll = YES;
                }
            divisions[divisionCount++] = d;
#if defined (DEBUG_MB_BEAGLE_MULTIPART)
            printf("divisions[%d] = %d\n", divisionCount-1, d);
#endif
            }
        }

    LaunchBEAGLELogLikeMultiPartition(divisions, divisionCount, chain, lnL);

    if (divisionCount != numCurrentDivisions)
        {
        for (d=0; d<numCurrentDivisions; d++)
            {
            m = &modelSettings[d];
            if (m->upDateCl == NO)
                {
                /* add log likelihood of divisions that were not updated */
                (*lnL) += m->lnLike[2*chain + state[chain]];
                }
            }
        }

    /* make sure a failed UpDateCijk still aborts the move */
    if (cijkFailed == YES)
        (*lnL) = MRBFLT_NEG_MAX;

    free(divisions);

    return;
}
#endif /* BEAGLE_V3_ENABLED */
7986 
7987 /*-----------------------------------------------------------------
7988 |
7989 |   LaunchLogLikeForDivision: calculate the log likelihood of the
7990 |       new state of the chain for a single division
7991 |
7992 -----------------------------------------------------------------*/
LaunchLogLikeForDivision(int chain,int d,MrBFlt * lnL)7993 void LaunchLogLikeForDivision(int chain, int d, MrBFlt* lnL)
7994 {
7995     int i;
7996     TreeNode        *p;
7997     ModelInfo       *m;
7998     Tree            *tree;
7999 #   if defined (TIMING_ANALIZ)
8000     clock_t         CPUTimeStart;
8001 #   endif
8002 
8003     m = &modelSettings[d];
8004     tree = GetTree(m->brlens, chain, state[chain]);
8005 
8006     if (m->upDateCijk == YES)
8007         {
8008         if (UpDateCijk(d, chain)== ERROR)
8009             {
8010             (*lnL) = MRBFLT_NEG_MAX; /* effectively abort the move */
8011             return;
8012             }
8013         m->upDateAll = YES;
8014         }
8015 
8016 #   if defined (BEAGLE_ENABLED)
8017     if (m->useBeagle == YES)
8018         {
8019         LaunchBEAGLELogLikeForDivision(chain, d, m, tree, lnL);
8020         return;
8021         }
8022 #   endif
8023 
8024     /* Flip and copy or reset site scalers */
8025     FlipSiteScalerSpace(m, chain);
8026     if (m->upDateAll == YES)
8027         ResetSiteScalers(m, chain);
8028     else
8029         CopySiteScalers(m, chain);
8030 
8031     if (m->parsModelId == NO)
8032         {
8033         for (i=0; i<tree->nIntNodes; i++)
8034             {
8035             p = tree->intDownPass[i];
8036 
8037             if (p->left->upDateTi == YES)
8038                 {
8039                 /* shift state of ti probs for node */
8040                 FlipTiProbsSpace (m, chain, p->left->index);
8041                 m->TiProbs (p->left, d, chain);
8042                 }
8043 
8044             if (p->right->upDateTi == YES)
8045                 {
8046                 /* shift state of ti probs for node */
8047                 FlipTiProbsSpace (m, chain, p->right->index);
8048                 m->TiProbs (p->right, d, chain);
8049                 }
8050 
8051             if (tree->isRooted == NO)
8052                 {
8053                 if (p->anc->anc == NULL /* && p->upDateTi == YES */)
8054                     {
8055                     /* shift state of ti probs for node */
8056                     FlipTiProbsSpace (m, chain, p->index);
8057                     m->TiProbs (p, d, chain);
8058                     }
8059                 }
8060 
8061             if (p->upDateCl == YES)
8062                 {
8063                 if (tree->isRooted == NO)
8064                     {
8065                     if (p->anc->anc == NULL)
8066                         {
8067                         TIME(m->CondLikeRoot (p, d, chain),CPUCondLikeRoot);
8068                         }
8069                     else
8070                         {
8071                         TIME(m->CondLikeDown (p, d, chain),CPUCondLikeDown);
8072                         }
8073                     }
8074                 else
8075                     {
8076                     TIME(m->CondLikeDown (p, d, chain),CPUCondLikeDown);
8077                     }
8078 
8079                 if (m->unscaledNodes[chain][p->index] == 0 && m->upDateAll == NO)
8080                     {
8081 #if defined (SSE_ENABLED)
8082                     if (m->useVec == VEC_SSE)
8083                         {
8084                         TIME(RemoveNodeScalers_SSE (p, d, chain),CPUScalersRemove);
8085                         }
8086 #if defined (AVX_ENABLED)
8087                     else if (m->useVec == VEC_AVX)
8088                         {
8089                         TIME(RemoveNodeScalers_AVX (p, d, chain),CPUScalersRemove);
8090                         }
8091 #endif
8092                     else
8093                         {
8094                         TIME(RemoveNodeScalers (p, d, chain),CPUScalersRemove);
8095                         }
8096 #   else
8097                     TIME(RemoveNodeScalers (p, d, chain),CPUScalersRemove);
8098 #   endif
8099                     }
8100                 FlipNodeScalerSpace (m, chain, p->index);
8101                 m->unscaledNodes[chain][p->index] = 1 + m->unscaledNodes[chain][p->left->index] + m->unscaledNodes[chain][p->right->index];
8102 
8103                 if (m->unscaledNodes[chain][p->index] >= m->rescaleFreq[chain] && p->anc->anc != NULL)
8104                     {
8105                     TIME(m->CondLikeScaler (p, d, chain),CPUScalers);
8106                     }
8107                 }
8108             }
8109         }
8110     TIME(m->Likelihood (tree->root->left, d, chain, lnL, (chainId[chain] % chainParams.numChains)),CPULilklihood);
8111     return;
8112 }
8113 
8114 
8115 /*----------------------------------------------------------------
8116 |
8117 |   RemoveNodeScalers: Remove node scalers
8118 |
8119 -----------------------------------------------------------------*/
RemoveNodeScalers(TreeNode * p,int division,int chain)8120 int RemoveNodeScalers (TreeNode *p, int division, int chain)
8121 {
8122     int             c;
8123     CLFlt           *scP, *lnScaler;
8124     ModelInfo       *m;
8125 
8126     m = &modelSettings[division];
8127     assert (m->unscaledNodes[chain][p->index] == 0);
8128 
8129     /* find scalers */
8130     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
8131 
8132     /* find site scalers */
8133     lnScaler = m->scalers[m->siteScalerIndex[chain]];
8134 
8135     /* remove scalers */
8136     for (c=0; c<m->numChars; c++)
8137         lnScaler[c] -= scP[c];
8138 
8139     return NO_ERROR;
8140 }
8141 
8142 
#if defined (AVX_ENABLED)
/*----------------------------------------------------------------
|
|   RemoveNodeScalers_AVX: Remove node scalers, AVX code
|
|   Same operation as RemoveNodeScalers, but the scaler arrays are
|   accessed as __m256 vectors and m->numVecChars vectors are
|   processed. Always returns NO_ERROR.
|
-----------------------------------------------------------------*/
int RemoveNodeScalers_AVX (TreeNode *p, int division, int chain)
{
    int             v, nVec;
    __m256          *nodeScalerVec, *siteScalerVec;
    ModelInfo       *m = &modelSettings[division];

    assert (m->unscaledNodes[chain][p->index] == 0);

    /* scalers belonging to this node, viewed as AVX vectors */
    nodeScalerVec = (__m256*)(m->scalers[m->nodeScalerIndex[chain][p->index]]);

    /* accumulated site scalers for the chain, viewed as AVX vectors */
    siteScalerVec = (__m256*)(m->scalers[m->siteScalerIndex[chain]]);

    /* subtract the node scalers vector by vector */
    nVec = m->numVecChars;
    for (v=0; v<nVec; v++)
        siteScalerVec[v] = _mm256_sub_ps (siteScalerVec[v], nodeScalerVec[v]);

    return NO_ERROR;
}
#endif
8174 
8175 
#if defined (SSE_ENABLED)
/*----------------------------------------------------------------
|
|   RemoveNodeScalers_SSE: Remove node scalers, SSE code
|
|   Same operation as RemoveNodeScalers, but the scaler arrays are
|   accessed as __m128 vectors and m->numVecChars vectors are
|   processed. Always returns NO_ERROR.
|
-----------------------------------------------------------------*/
int RemoveNodeScalers_SSE (TreeNode *p, int division, int chain)
{
    int             v, nVec;
    __m128          *nodeScalerVec, *siteScalerVec;
    ModelInfo       *m = &modelSettings[division];

    assert (m->unscaledNodes[chain][p->index] == 0);

    /* scalers belonging to this node, viewed as SSE vectors */
    nodeScalerVec = (__m128*)(m->scalers[m->nodeScalerIndex[chain][p->index]]);

    /* accumulated site scalers for the chain, viewed as SSE vectors */
    siteScalerVec = (__m128*)(m->scalers[m->siteScalerIndex[chain]]);

    /* subtract the node scalers vector by vector */
    nVec = m->numVecChars;
    for (v=0; v<nVec; v++)
        siteScalerVec[v] = _mm_sub_ps (siteScalerVec[v], nodeScalerVec[v]);

    return NO_ERROR;
}
#endif
8207 
8208 
/*----------------------------------------------------------------
|
|   SetBinaryQMatrix: fill the 2 X 2 rate matrix a[][] for a
|       two-state model from the state frequencies of the given
|       chain and division.
|
|   Off-diagonal entry a[i][j] is the frequency of state j times
|   1 / (2 * freq0 * freq1); each diagonal entry is the negative
|   of the off-diagonal entry in its row. Always returns NO_ERROR.
|
-----------------------------------------------------------------*/
int SetBinaryQMatrix (MrBFlt **a, int whichChain, int division)
{
    ModelInfo       *m = &modelSettings[division];
    MrBFlt          *freq, pi0, pi1, norm;

    assert (m->numModelStates == 2);

    /* current state frequencies */
    freq = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
    pi0  = freq[0];
    pi1  = freq[1];

    /* common scaling factor for all four entries */
    norm = 1.0 / (2*pi0*pi1);

    /* row 0: leaving state 0 */
    a[0][0] = -pi1*norm;
    a[0][1] =  pi1*norm;

    /* row 1: leaving state 1 */
    a[1][0] =  pi0*norm;
    a[1][1] = -pi0*norm;

    return (NO_ERROR);
}
8227 
8228 
/* NucPairRateIndex: map an unordered pair of nucleotide indices (0..3, in the
   order A, C, G, T used by the rate list documented in SetNucQMatrix) to the
   slot of that pair in a six-element rate vector (AC, AG, AT, CG, CT, GT).
   Any pair that is not matched explicitly falls into the last slot. */
static int NucPairRateIndex (int x, int y)
{
    int     lo = (x < y) ? x : y,
            hi = (x < y) ? y : x;

    if (lo == 0 && hi >= 1 && hi <= 3)
        return (hi - 1);            /* AC, AG, AT -> 0, 1, 2 */
    if (lo == 1 && (hi == 2 || hi == 3))
        return (hi + 1);            /* CG, CT     -> 3, 4    */
    return (5);                     /* GT (and anything else) */
}


/* DoubletPairRateIndex: same mapping, but keyed on the sum of the two
   nucleotide codes stored in doublet[] (the sums 3, 5, 9, 6, 10 select slots
   0..4, consistent with codes A=1, C=2, G=4, T=8). Any other sum selects the
   last slot. */
static int DoubletPairRateIndex (unsigned long codeSum)
{
    switch (codeSum)
        {
        case 3:     return (0);
        case 5:     return (1);
        case 9:     return (2);
        case 6:     return (3);
        case 10:    return (4);
        default:    return (5);
        }
}


/*----------------------------------------------------------------
|
|   SetNucQMatrix: set up the instantaneous rate matrix a[][] for
|       nucleotide-based models of one division and chain
|
|   a          : n X n matrix (array of row pointers) to fill in
|   n          : number of states; 4 (nucleotide), 8 (covarion),
|                16 (doublet), anything else is treated as a
|                codon model. Must equal m->numModelStates.
|   whichChain : chain whose parameter values are used
|   division   : index into modelSettings[] / modelParams[]
|   rateMult   : rate multiplier applied to the substitution part
|                of the covarion matrix (n == 8), or the
|                nonsynonymous/synonymous rate ratio for codon
|                models; not used for n == 4 and n == 16
|   rA, rS     : written for codon models only. With more than
|                one omega category they receive the unscaled
|                nonsynonymous and synonymous rate sums and the
|                matrix is left unscaled; otherwise both are 1.0.
|
|   Returns NO_ERROR.
|
-----------------------------------------------------------------*/
int SetNucQMatrix (MrBFlt **a, int n, int whichChain, int division, MrBFlt rateMult, MrBFlt *rA, MrBFlt *rS)
{
    int             i, j, k, nDiff, rtNum=0, firstDiffers, secondDiffers, isSynonymous;
    MrBFlt          scaler, mult, probOn, sum, *swr, s01, s10, s[4][4], nonsyn,
                    *rateValues=NULL, expandedRates[6], *bs, dN, dS;
    ModelInfo       *m;
    ModelParams     *mp;

    /* set up pointers to the appropriate model information */
    mp = &modelParams[division];
    m = &modelSettings[division];
    assert (m->numModelStates == n);

    /* All of the models that are set up in this function require the frequencies
       of the nucleotides (or doublets or codons) and six exchangeability rates:

          rateValues[0] = A <-> C rate
          rateValues[1] = A <-> G rate
          rateValues[2] = A <-> T rate
          rateValues[3] = C <-> G rate
          rateValues[4] = C <-> T rate
          rateValues[5] = G <-> T rate

       For nst=6 (and mixed) models these are the revMat parameter values. For
       nst=2 the single transition/transversion rate ratio (kappa) is expanded
       so that transitions (A <-> G, C <-> T) get kappa and transversions get
       1.0; for nst=1 all six rates are 1.0. The expansion is done here for
       every build and matrix size, in a local array, so that each branch below
       can index six rates regardless of nst and no heap allocation is needed.
       Multiplying by an expanded rate of exactly 1.0 leaves a value unchanged. */
    bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
    for (i=0; i<6; i++)
        expandedRates[i] = 1.0;
    if (m->nst == 2)
        {
        expandedRates[1] = expandedRates[4] = GetParamVals (m->tRatio, whichChain, state[whichChain])[0];
        rateValues = expandedRates;
        }
    else if (m->nst == 6 || m->nst == NST_MIXED)
        rateValues = GetParamVals (m->revMat, whichChain, state[whichChain]);
    else if (m->nst == 1)
        rateValues = expandedRates;

    /* any other nst value is not supported by this function */
    assert (rateValues != NULL);

    if (n == 4)
        {
        /* 4 X 4 model:

           Here, we set the rate matrix for the GTR model (Tavare, 1986). We
           need not only the 6 rates for this model (rateValues), but also the
           base frequencies (bs). */

        /* set diagonal of Q matrix to 0 */
        for (i=0; i<4; i++)
            a[i][i] = 0.0;

        /* initialize Q matrix */
        scaler = 0.0;
        for (i=0; i<4; i++)
            {
            for (j=i+1; j<4; j++)
                {
                mult = rateValues[NucPairRateIndex (i, j)];
                a[i][i] -= (a[i][j] = bs[j] * mult);
                a[j][j] -= (a[j][i] = bs[i] * mult);
                scaler += bs[i] * a[i][j];
                scaler += bs[j] * a[j][i];
                }
            }

        /* rescale Q matrix */
        scaler = 1.0 / scaler;
        for (i=0; i<4; i++)
            for (j=0; j<4; j++)
                a[i][j] *= scaler;
        }
    else if (n == 8) /* we have a 4 X 4 covarion model */
        {
        /* 8 X 8 covarion model:

           Here, we set the rate matrix for the covarion model (Tuffley and
           Steel, 1997). We need the rate parameters of the model
           (contained in rateValues), the frequencies of the four nucleotides,
           and the switching rates to completely specify the rate matrix. We
           first set up the 4 X 4 submatrix that represents changes (the upper
           left portion of the 8 X 8 matrix). Note that if we have rate
           variation across sites, we need to deal with the multiplication
           in the rate matrix (i.e., we cannot simply deal with rate variation
           by multiplying the branch length by a rate multiplier as we can
           with other models). Instead, we multiply the scaled rate matrix
           by the rate multiplier. */

        /* Get the switching rates. The rate of off->on is s01 and the rate
           of on->off is s10. The stationary probability of the switch process
           is prob1 = s01/(s01+s10) and prob0 = s10/(s01+s10). */
        swr = GetParamVals (m->switchRates, whichChain, state[whichChain]);
        s01 = swr[0];
        s10 = swr[1];
        probOn = s01 / (s01 + s10);

        /* set matrix a to 0 */
        for (i=0; i<8; i++)
            for (j=0; j<8; j++)
                a[i][j] = 0.0;

        /* set up the off-diagonal part of the 4 X 4 matrix representing
           substitutions (s[][]; upper left) */
        scaler = 0.0;
        for (i=0; i<4; i++)
            {
            for (j=i+1; j<4; j++)
                {
                mult = rateValues[NucPairRateIndex (i, j)];
                s[i][j] = bs[j] * mult;
                s[j][i] = bs[i] * mult;
                scaler += bs[i] * s[i][j] * probOn;
                scaler += bs[j] * s[j][i] * probOn;
                }
            }

        /* rescale off diagonal elements of s[][] matrix, then scale them by
           the rate factor (two separate multiplications per element) */
        scaler = 1.0 / scaler;
        for (i=0; i<4; i++)
            {
            for (j=0; j<4; j++)
                {
                if (i != j)
                    {
                    s[i][j] *= scaler;
                    s[i][j] *= rateMult;
                    }
                }
            }

        /* put in diagonal elements of s[][]; each row also loses rate s10
           to the switch-off process */
        for (i=0; i<4; i++)
            {
            sum = 0.0;
            for (j=0; j<4; j++)
                {
                if (i != j)
                    sum += s[i][j];
                }
            s[i][i] = -(sum + s10);
            }

        /* Now, put s[][] into top left portion of a matrix and fill in the
           other parts of the matrix with the appropriate switching rates. */
        for (i=0; i<4; i++)
            {
            for (j=0; j<4; j++)
                a[i][j] = s[i][j];
            a[i][i+4]   = s10;
            a[i+4][i]   = s01;
            a[i+4][i+4] = -s01;
            }
        }
    else if (n == 16)
        {
        /* 16 X 16 doublet model:

           We have a doublet model. The states are in the order AA, AC, AG, AT, CA, CC
           CG, CT, GA, GC, GG, GT, TA, TC, TG, TT. The rate matrix is straight-forward
           to set up. We simply multiply the rate parameter (e.g., the ti/tv rate
           ratio) by the doublet frequencies. */

        /* set diagonal of Q matrix to 0 */
        for (i=0; i<16; i++)
            a[i][i] = 0.0;

        scaler = 0.0;
        for (i=0; i<16; i++)
            {
            for (j=i+1; j<16; j++)
                {
                /* a position differs when the two codes share no bits */
                firstDiffers  = ((doublet[i].first  & doublet[j].first)  == 0);
                secondDiffers = ((doublet[i].second & doublet[j].second) == 0);

                if (firstDiffers && secondDiffers)
                    mult = 0.0;     /* no simultaneous change at both positions */
                else if (firstDiffers)
                    mult = rateValues[DoubletPairRateIndex (doublet[i].first + doublet[j].first)];
                else
                    mult = rateValues[DoubletPairRateIndex (doublet[i].second + doublet[j].second)];

                a[i][i] -= (a[i][j] = bs[j] * mult);
                a[j][j] -= (a[j][i] = bs[i] * mult);
                scaler += bs[i] * a[i][j];
                scaler += bs[j] * a[j][i];
                }
            }

        /* rescale Q matrix */
        scaler = 1.0 / scaler;
        for (i=0; i<16; i++)
            for (j=0; j<16; j++)
                a[i][j] *= scaler;
        }
    else
        {
        /* 64(ish) X 64(ish) codon model:

           Here, we set the rate matrix for the codon model (see Goldman and
           Yang, 1994). Note that we can specify any general type of codon
           model, with these constraints:

            a[i][j] = 0                      -> if i and j differ at 2 or 3 nucleotides
            a[i][j] = rateValues[0] * bs[j]  -> if synonymous A <-> C change
            a[i][j] = rateValues[1] * bs[j]  -> if synonymous A <-> G change
            a[i][j] = rateValues[2] * bs[j]  -> if synonymous A <-> T change
            a[i][j] = rateValues[3] * bs[j]  -> if synonymous C <-> G change
            a[i][j] = rateValues[4] * bs[j]  -> if synonymous C <-> T change
            a[i][j] = rateValues[5] * bs[j]  -> if synonymous G <-> T change

            a[i][j] = rateValues[0] * nonsyn * bs[j]  -> if nonsynonymous A <-> C change
            a[i][j] = rateValues[1] * nonsyn * bs[j]  -> if nonsynonymous A <-> G change
            a[i][j] = rateValues[2] * nonsyn * bs[j]  -> if nonsynonymous A <-> T change
            a[i][j] = rateValues[3] * nonsyn * bs[j]  -> if nonsynonymous C <-> G change
            a[i][j] = rateValues[4] * nonsyn * bs[j]  -> if nonsynonymous C <-> T change
            a[i][j] = rateValues[5] * nonsyn * bs[j]  -> if nonsynonymous G <-> T change

          Other models, such as the one used by Nielsen & Yang (1998) can be obtained
          from this model by restricting transitions and transversions to have the same rate.
          nonsyn is the nonsynonymous/synonymous rate ratio (often called the
          dN/dS ratio). If we are in this part of the function, then we rely on it
          being called with the "rateMult" parameter specifying the dN/dS ratio. Note
          that the size of the matrix will never be 64 X 64 as we only consider changes
          among coding triplets (i.e., we exclude the stop codons). */

        /* get the nonsynonymous/synonymous rate ratio */
        nonsyn = rateMult;

        /* set diagonal of Q matrix to 0 */
        for (i=0; i<n; i++)
            a[i][i] = 0.0;

        /* set dN and dS rates to zero */
        dN = dS = 0.0;

        scaler = 0.0;
        for (i=0; i<n; i++)
            {
            for (j=i+1; j<n; j++)
                {
                /* count differing codon positions; rtNum ends up describing
                   the last differing position, which is the only one when
                   the codons differ at a single position */
                nDiff = 0;
                for (k=0; k<3; k++)
                    {
                    if (mp->codonNucs[i][k] != mp->codonNucs[j][k])
                        {
                        nDiff++;
                        rtNum = NucPairRateIndex (mp->codonNucs[i][k], mp->codonNucs[j][k]);
                        }
                    }

                isSynonymous = (mp->codonAAs[i] == mp->codonAAs[j]);

                if (nDiff > 1)
                    {
                    mult = 0.0;
                    }
                else
                    {
                    mult = (isSynonymous ? 1.0 : nonsyn);
                    mult *= rateValues[rtNum];
                    }

                a[i][i] -= (a[i][j] = bs[j] * mult);
                a[j][j] -= (a[j][i] = bs[i] * mult);
                if (isSynonymous)
                    dS += (bs[i] * a[i][j] + bs[j] * a[j][i]);
                else
                    dN += (bs[i] * a[i][j] + bs[j] * a[j][i]);
                scaler += bs[i] * a[i][j];
                scaler += bs[j] * a[j][i];
                }
            }

        /* rescale Q matrix */
        if (m->nucModelId == NUCMODEL_CODON && m->numOmegaCats > 1)
            {
            /* If we have a positive selection model with multiple categories, then
               we do not rescale the rate matrix until we have finished generating
               all of the necessary rate matrices. The rescaling occurs in
               UpDateCijk. */
            (*rA) = dN;
            (*rS) = dS;
            }
        else
            {
            scaler = 1.0 / scaler;
            for (i=0; i<n; i++)
                for (j=0; j<n; j++)
                    a[i][j] *= scaler;
            (*rA) = (*rS) = 1.0;
            }
        }

    return (NO_ERROR);
}
8826 
8827 
/*----------------------------------------------------------------
|
|   SetProteinQMatrix: Fills a[][] with the instantaneous rate
|       matrix of the amino acid model used by division
|       'division' in chain 'whichChain'.
|
|   The model is m->aaModelId when that value is non-negative;
|   otherwise the model id is read from the m->aaModel parameter
|   of the chain.
|
|   n == 20: ordinary 20 X 20 amino acid model. The Poisson,
|       AAMODEL_EQ and AAMODEL_GTR matrices are built and rescaled
|       here; the fixed-rate models are copied entry by entry from
|       their global rate tables. rateMult is not used.
|
|   n == 40: covarion model. States 0-19 are the "on" states (the
|       upper-left 20 X 20 block, where substitutions occur) and
|       states 20-39 the matching "off" states. The off-diagonal
|       entries of the upper-left block are divided by the sum of
|       pi[i] * a[i][j] * probOn over all i != j and multiplied
|       by rateMult. State i switches off at rate s10
|       (a[i][20+i]) and state 20+i switches on at rate s01
|       (a[20+i][i]); all other entries outside the upper-left
|       block are 0.
|
|   Returns NO_ERROR on success, and ERROR when n is neither 20
|   nor 40 or when the amino acid model id is not recognized.
|
-----------------------------------------------------------------*/
int SetProteinQMatrix (MrBFlt **a, int n, int whichChain, int division, MrBFlt rateMult)
{
    int             i, j, k;
    int             aaModelID;
    MrBFlt          scaler, probOn, sum, *swr, s01, s10, *bs, *rt;
    ModelInfo       *m;

    /* set up pointers to the appropriate model information */
    m = &modelSettings[division];

    /* get amino acid model ID
        AAMODEL_POISSON         0
        AAMODEL_JONES           1
        AAMODEL_DAY             2
        AAMODEL_MTREV           3
        AAMODEL_MTMAM           4
        AAMODEL_WAG             5
        AAMODEL_RTREV           6
        AAMODEL_CPREV           7
        AAMODEL_VT              8
        AAMODEL_BLOSUM          9
        AAMODEL_LG             10
        AAMODEL_EQ             11
        AAMODEL_GTR            12 */

    if (m->aaModelId >= 0)
        aaModelID = m->aaModelId;
    else
        aaModelID = (int)*GetParamVals(m->aaModel, whichChain, state[whichChain]);

    /* Make certain that we have either 20 or 40 states. Anything
       else means we have a real problem. The message contains no
       conversion specifier because nothing is passed after the
       format string. */
    if (n != 20 && n != 40)
        {
        MrBayesPrint ("   ERROR: There should be 20 or 40 states for the aa model\n");
        return (ERROR);
        }

    if (n == 20)
        {
        /* We have a run-of-the-mill amino acid model (i.e., 20 X 20). */
        if (aaModelID == AAMODEL_POISSON)
            {
            /* all 19 off-diagonal rates of a row are equal and sum to 1 */
            scaler = 1.0 / 19.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = scaler;
                    a[j][i] = scaler;
                    }
                }
            for (i=0; i<20; i++)
                a[i][i] = -1.0;
            }
        else if (aaModelID == AAMODEL_EQ)
            {
            /* rate i -> j is the frequency of j; scaler accumulates
               bs[i] * a[i][j] over all off-diagonal entries */
            bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = 0.0;
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][i] -= (a[i][j] = bs[j]);
                    a[j][j] -= (a[j][i] = bs[i]);
                    scaler += bs[i] * a[i][j];
                    scaler += bs[j] * a[j][i];
                    }
                }
            scaler = 1.0 / scaler;
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] *= scaler;
            }
        else if (aaModelID == AAMODEL_GTR)
            {
            /* rate i -> j is rt[k] * bs[j], where k walks the upper
               triangle row by row and is shared by (i,j) and (j,i) */
            bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
            rt = GetParamVals (m->revMat, whichChain, state[whichChain]);
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = 0.0;
            scaler = 0.0;
            for (i=k=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][i] -= (a[i][j] = bs[j] * rt[k]);
                    a[j][j] -= (a[j][i] = bs[i] * rt[k]);
                    k++;
                    }
                }
            /* the diagonal holds minus the row sums, so this is the
               frequency-weighted total rate */
            for (i=0; i<20; i++)
                scaler += -(bs[i] * a[i][i]);
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] /= scaler;
            }
        else if (aaModelID == AAMODEL_JONES)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aaJones[i][j];
            }
        else if (aaModelID == AAMODEL_DAY)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aaDayhoff[i][j];
            }
        else if (aaModelID == AAMODEL_MTREV)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aaMtrev24[i][j];
            }
        else if (aaModelID == AAMODEL_MTMAM)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aaMtmam[i][j];
            }
        else if (aaModelID == AAMODEL_RTREV)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aartREV[i][j];
            }
        else if (aaModelID == AAMODEL_WAG)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aaWAG[i][j];
            }
        else if (aaModelID == AAMODEL_CPREV)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aacpREV[i][j];
            }
        else if (aaModelID == AAMODEL_VT)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aaVt[i][j];
            }
        else if (aaModelID == AAMODEL_BLOSUM)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aaBlosum[i][j];
            }
        else if (aaModelID == AAMODEL_LG)
            {
            for (i=0; i<20; i++)
                for (j=0; j<20; j++)
                    a[i][j] = aaLG[i][j];
            }
        else
            {
            MrBayesPrint ("   ERROR: Don't understand which amino acid model is needed\n");
            return (ERROR);
            }
#       if 0
        for (i=0; i<20; i++)
            {
            for (j=0; j<20; j++)
                printf ("%1.3lf ", a[i][j]);
            printf ("\n");
            }
#       endif
        }
    else
        {
        /* 40 X 40 covarion model:

           The upper-left 20 X 20 block holds the substitution rates among
           the "on" states; the other three blocks only hold the switching
           rates. Rate variation across sites cannot be handled by simply
           multiplying the branch length by a rate multiplier, as it can
           with other models, so the scaled substitution rates are
           multiplied by rateMult here instead. */

        /* Get the switching rates. The rate of off->on is s01 and the rate
           of on->off is s10. The stationary probability of the switch process
           is prob1 = s01/(s01+s10) and prob0 = s10/(s01+s10). */
        swr = GetParamVals (m->switchRates, whichChain, state[whichChain]);
        s01 = swr[0];
        s10 = swr[1];
        probOn = s01 / (s01 + s10);

        /* set matrix a[][] to 0 */
        for (i=0; i<40; i++)
            for (j=0; j<40; j++)
                a[i][j] = 0.0;

        /* Fill in the off-diagonal elements of the upper-left sub matrix
           (where substitutions occur). Every branch accumulates in scaler
           the sum of pi[i] * a[i][j] * probOn over all i != j; the
           diagonal is filled in afterwards. */
        if (aaModelID == AAMODEL_POISSON)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = 0.05;
                    a[j][i] = 0.05;
                    scaler += 0.05 * a[i][j] * probOn;
                    scaler += 0.05 * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_EQ)
            {
            bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = bs[j];
                    a[j][i] = bs[i];
                    scaler += bs[i] * a[i][j] * probOn;
                    scaler += bs[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_GTR)
            {
            /* rate i -> j is rt[k] * bs[j]; k walks the upper triangle
               row by row, exactly as in the 20-state GTR case, so the
               exchangeabilities in m->revMat are what gets scaled below */
            bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
            rt = GetParamVals (m->revMat, whichChain, state[whichChain]);
            scaler = 0.0;
            for (i=k=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = bs[j] * rt[k];
                    a[j][i] = bs[i] * rt[k];
                    scaler += bs[i] * a[i][j] * probOn;
                    scaler += bs[j] * a[j][i] * probOn;
                    k++;
                    }
                }
            }
        else if (aaModelID == AAMODEL_JONES)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aaJones[i][j];
                    a[j][i] = aaJones[j][i];
                    scaler += jonesPi[i] * a[i][j] * probOn;
                    scaler += jonesPi[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_DAY)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aaDayhoff[i][j];
                    a[j][i] = aaDayhoff[j][i];
                    scaler += dayhoffPi[i] * a[i][j] * probOn;
                    scaler += dayhoffPi[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_MTREV)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aaMtrev24[i][j];
                    a[j][i] = aaMtrev24[j][i];
                    scaler += mtrev24Pi[i] * a[i][j] * probOn;
                    scaler += mtrev24Pi[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_MTMAM)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aaMtmam[i][j];
                    a[j][i] = aaMtmam[j][i];
                    scaler += mtmamPi[i] * a[i][j] * probOn;
                    scaler += mtmamPi[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_RTREV)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aartREV[i][j];
                    a[j][i] = aartREV[j][i];
                    scaler += rtrevPi[i] * a[i][j] * probOn;
                    scaler += rtrevPi[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_WAG)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aaWAG[i][j];
                    a[j][i] = aaWAG[j][i];
                    scaler += wagPi[i] * a[i][j] * probOn;
                    scaler += wagPi[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_CPREV)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aacpREV[i][j];
                    a[j][i] = aacpREV[j][i];
                    scaler += cprevPi[i] * a[i][j] * probOn;
                    scaler += cprevPi[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_VT)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aaVt[i][j];
                    a[j][i] = aaVt[j][i];
                    scaler += vtPi[i] * a[i][j] * probOn;
                    scaler += vtPi[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_BLOSUM)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aaBlosum[i][j];
                    a[j][i] = aaBlosum[j][i];
                    scaler += blosPi[i] * a[i][j] * probOn;
                    scaler += blosPi[j] * a[j][i] * probOn;
                    }
                }
            }
        else if (aaModelID == AAMODEL_LG)
            {
            scaler = 0.0;
            for (i=0; i<20; i++)
                {
                for (j=i+1; j<20; j++)
                    {
                    a[i][j] = aaLG[i][j];
                    a[j][i] = aaLG[j][i];
                    scaler += lgPi[i] * a[i][j] * probOn;
                    scaler += lgPi[j] * a[j][i] * probOn;
                    }
                }
            }
        else
            {
            MrBayesPrint ("   ERROR: Don't understand which amino acid model is needed\n");
            return (ERROR);
            }

        /* Rescale the off-diagonal elements of the upper-left sub matrix,
           apply the rate factor, and put in the diagonal elements. An "on"
           state is left either by a substitution or by switching off, so
           its diagonal is minus (row sum of substitution rates + s10). */
        scaler = 1.0 / scaler;
        for (i=0; i<20; i++)
            {
            sum = 0.0;
            for (j=0; j<20; j++)
                {
                if (i != j)
                    {
                    a[i][j] *= scaler;
                    a[i][j] *= rateMult;
                    sum += a[i][j];
                    }
                }
            a[i][i] = -(sum + s10);
            }

        /* fill in the other three submatrices: an "off" state can only be
           left by switching on (rate s01) */
        for (i=20; i<40; i++)
            a[i][i] = -s01;
        for (i=0; i<20; i++)
            {
            a[i][20+i] = s10;
            a[20+i][i] = s01;
            }

        }

    return (NO_ERROR);
}
9283 
9284 
/*----------------------------------------------------------------
|
|   SetStdQMatrix: Builds the rate matrix a[][] (nStates X nStates)
|       for a standard character with stationary frequencies bs[].
|
|   The rate from state i to an allowed target state j is bs[j].
|   When cType is UNORD every other state is an allowed target;
|   for any other cType only the neighbouring states i-1 and i+1
|   are. Each diagonal entry is minus the sum of the other entries
|   in its row, and the whole matrix is finally divided by the sum
|   of bs[i] * a[i][j] over all allowed (i,j) pairs.
|
|   Always returns NO_ERROR.
|
-----------------------------------------------------------------*/
int SetStdQMatrix (MrBFlt **a, int nStates, MrBFlt *bs, int cType)
{
    int     from, to, lo, hi;
    MrBFlt  totalRate = 0.0;

    /* start from an all-zero matrix */
    for (from=0; from<nStates; from++)
        {
        for (to=0; to<nStates; to++)
            a[from][to] = 0.0;
        }

    /* fill in the rows one at a time */
    for (from=0; from<nStates; from++)
        {
        /* inclusive range of target states that can be reached from 'from' */
        if (cType == UNORD)
            {
            lo = 0;
            hi = nStates - 1;
            }
        else
            {
            lo = (from > 0) ? from - 1 : 0;
            hi = (from < nStates - 1) ? from + 1 : nStates - 1;
            }

        for (to=lo; to<=hi; to++)
            {
            if (to == from)
                continue;
            a[from][to] = bs[to];
            a[from][from] -= a[from][to];
            totalRate += bs[from] * a[from][to];
            }
        }

    /* divide every entry by the frequency-weighted total rate */
    for (from=0; from<nStates; from++)
        {
        for (to=0; to<nStates; to++)
            a[from][to] /= totalRate;
        }

#   if defined DEBUG_SETSTDQMATRIX
    for (from=0; from<nStates; from++)
        {
        for (to=0; to<nStates; to++)
            printf ("%0.5lf ", a[from][to]);
        printf ("\n");
        }
#   endif

    return (NO_ERROR);
}
9350 
9351 
/*----------------------------------------------------------------
|
|   TiProbs_Fels: Calculates the 4 X 4 transition probabilities
|       for the branch above node p in closed form, using only
|       the state frequencies of the division. The expressions
|       below reduce algebraically to
|
|           P(i->j) = pis[j] + (delta_ij - pis[j]) * exp(-beta * t)
|
|   One 4 X 4 matrix (16 values, row = starting state i, column =
|   ending state j) is written to tiP for each of the
|   m->numRateCats rate categories, one after the other.
|
|   t is the branch length times the base rate (divided by
|   1 - pInvar when the model has a pInvar parameter) times the
|   rate of the category. When the model has a cppEvents,
|   tk02BranchRates, igrBranchRates or mixedBrchRates parameter,
|   the value stored for this node in that parameter's sub-values
|   is used in place of p->length.
|
|   Always returns NO_ERROR.
|
-----------------------------------------------------------------*/
int TiProbs_Fels (TreeNode *p, int division, int chain)
{
    int         i, j, k, index;
    MrBFlt      t, x, beta, bigPi_j[4], pij, bigPij, u[4], z[4], w[4],
                *catRate, baseRate, theRate, *pis, length;
    CLFlt       *tiP;
    ModelInfo   *m;

    m = &modelSettings[division];

    /* find transition probabilities */
    tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];

    /* get base frequencies */
    pis = GetParamSubVals (m->stateFreq, chain, state[chain]);

    /* get base rate */
    baseRate = GetRate (division, chain);

    /* compensate for invariable sites if appropriate */
    if (m->pInvar != NULL)
        baseRate /= (1.0 - (*GetParamVals(m->pInvar, chain, state[chain])));

    /* get category rates; without rate variation a single rate of 1.0 is used */
    theRate = 1.0;
    if (m->shape != NULL)
        catRate = GetParamSubVals (m->shape, chain, state[chain]);
    else if (m->mixtureRates != NULL)
        catRate = GetParamSubVals (m->mixtureRates, chain, state[chain]);
    else
        catRate = &theRate;

    /* rescale beta: the denominator is the sum of pis[i]*pis[j] over all
       six unordered pairs of different states */
    beta =  (0.5 / ((pis[0] + pis[2])*(pis[1] + pis[3]) + ((pis[0]*pis[2]) + (pis[1]*pis[3]))));

    /* summed frequency of the pair of states {0,2} or {1,3} that
       contains the ending state j */
    bigPi_j[0] =  (pis[0] + pis[2]);
    bigPi_j[1] =  (pis[1] + pis[3]);
    bigPi_j[2] =  (pis[0] + pis[2]);
    bigPi_j[3] =  (pis[1] + pis[3]);

    /* these terms depend only on the frequencies of the ending state,
       so they are worked out once rather than for every matrix cell */
    for (j=0; j<4; j++)
        {
        bigPij = bigPi_j[j];
        pij    = pis[j];
        u[j]   = 1.0/bigPij -  1.0;
        z[j]   = (bigPij - pij) / bigPij;
        w[j]   = pij/bigPij;
        }

    /* find length */
    if (m->cppEvents != NULL)
        {
        length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
        }
    else if (m->tk02BranchRates != NULL)
        {
        length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
        }
    else if (m->igrBranchRates != NULL)
        {
        length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
        }
    else if (m->mixedBrchRates != NULL)
        {
        length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
        }
    else
        length = p->length;

    /* Numerical errors will ensue if we allow very large or very small branch
       lengths, which might occur in relaxed clock models. Below TIME_MIN the
       identity matrix is used and above TIME_MAX the stationary matrix. */

    /* fill in values */
    for (k=index=0; k<m->numRateCats; k++)
        {
        t =  length * baseRate * catRate[k];

        if (t < TIME_MIN)
            {
            /* Fill in identity matrix */
            for (i=0; i<4; i++)
                {
                for (j=0; j<4; j++)
                    {
                    if (i == j)
                        tiP[index++] = 1.0;
                    else
                        tiP[index++] = 0.0;
                    }
                }
            }
        else if (t > TIME_MAX)
            {
            /* Fill in stationary matrix */
            for (i=0; i<4; i++)
                for (j=0; j<4; j++)
                    tiP[index++] = (CLFlt) pis[j];
            }
        else
            {
            /* calculate probabilities; the decay term is the same for
               all 16 cells of this rate category */
            x =  exp(-beta * t);
            for (i=0; i<4; i++)
                {
                for (j=0; j<4; j++)
                    {
                    pij = pis[j];
                    if (i == j)
                        tiP[index++] = (CLFlt) (pij + pij * u[j] * x + z[j] * x);
                    else
                        tiP[index++] = (CLFlt) (pij + pij * u[j] * x - w[j] * x);
                    }
                }
            }
        }

    return NO_ERROR;
}
9465 
9466 
9467 /*----------------------------------------------------------------
9468 |
9469 |   TiProbs_Gen: Calculates transition probabilities for general
9470 |       models with or without rate variation. This function does
9471 |       not work with:
9472 |
9473 |       1. codon models with omega variation or
9474 |       2. covarion models with rate variation
9475 |
9476 |   In either of these cases, TiProbs_GenCov is used
9477 |
9478 -----------------------------------------------------------------*/
int TiProbs_Gen (TreeNode *p, int division, int chain)
{
    int             i, j, k, n, e, c, index, isNucleotide;
    MrBFlt          t, *catRates, rate, unitRate, *eigVals, *coef, *freqs,
                    expEig[64], total, perCharFactor, brLen;
    CLFlt           *tiP;
    ModelInfo       *m;

    /* One n X n matrix (row = starting state, column = ending state) is
       written to tiP for each of the m->numRateCats rate categories.
       Always returns NO_ERROR. */
    m = &modelSettings[division];
    n = m->numModelStates;

    /* factor that expresses branch lengths as expected number of
       substitutions per character: 2 for doublets, 3 for codons, else 1 */
    isNucleotide = (m->dataType == DNA || m->dataType == RNA);
    if (isNucleotide && m->nucModelId == NUCMODEL_DOUBLET)
        perCharFactor = 2.0;
    else if (isNucleotide && m->nucModelId == NUCMODEL_CODON)
        perCharFactor = 3.0;
    else
        perCharFactor = 1.0;

    /* where the transition probabilities of this node go */
    tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];

    /* base rate, compensated for invariable sites if appropriate */
    rate = GetRate (division, chain);
    if (m->pInvar != NULL)
        rate /= (1.0 - (*GetParamVals(m->pInvar, chain, state[chain])));

    /* category rates; a single rate of 1.0 when there is no rate variation */
    unitRate = 1.0;
    catRates = &unitRate;
    if (m->shape != NULL)
        catRates = GetParamSubVals (m->shape, chain, state[chain]);
    else if (m->mixtureRates != NULL)
        catRates = GetParamSubVals (m->mixtureRates, chain, state[chain]);

    /* the eigenvalues come first; the cijk coefficients start 2*n values later */
    eigVals = m->cijks[m->cijkIndex[chain]];
    coef    = eigVals + (2 * n);

    /* branch length: taken from the first branch-rate parameter that is
       present, otherwise from the node itself */
    brLen = p->length;
    if (m->cppEvents != NULL)
        brLen = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
    else if (m->tk02BranchRates != NULL)
        brLen = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
    else if (m->igrBranchRates != NULL)
        brLen = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
    else if (m->mixedBrchRates != NULL)
        brLen = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];

    /* one matrix per rate category */
    index = 0;
    for (k=0; k<m->numRateCats; k++)
        {
        t =  brLen * rate * catRates[k] * perCharFactor;

        if (t < TIME_MIN)
            {
            /* too short to matter: identity matrix */
            for (i=0; i<n; i++)
                for (j=0; j<n; j++)
                    tiP[index++] = (i == j) ? 1.0 : 0.0;
            continue;
            }

        if (t > TIME_MAX)
            {
            /* too long to matter: every row is the state frequencies */
            freqs = GetParamSubVals(m->stateFreq, chain, state[chain]);
            for (i=0; i<n; i++)
                for (j=0; j<n; j++)
                    tiP[index++] = (CLFlt) freqs[j];
            continue;
            }

        /* general case: P(i->j) = sum over e of cijk * exp(eigenvalue[e] * t) */
        for (e=0; e<n; e++)
            expEig[e] =  exp(eigVals[e] * t);

        c = 0;
        for (i=0; i<n; i++)
            {
            for (j=0; j<n; j++)
                {
                total = 0.0;
                for (e=0; e<n; e++)
                    total += coef[c++] * expEig[e];
                /* negative values are set to zero */
                if (total < 0.0)
                    total = 0.0;
                tiP[index++] = (CLFlt) total;
                }
            }
        }

#   if 0
    printf ("v = %lf (%d)\n", t, p->index);
    for (i=index=0; i<n; i++)
        {
        for (j=0; j<n; j++)
            printf ("%1.4lf ", tiP[index++]);
        printf ("\n");
        }
    printf ("\n");
#   endif

    return NO_ERROR;
}
9606 
9607 
9608 /*----------------------------------------------------------------
9609 |
9610 |   TiProbs_GenCov: Calculates transition probabilities for codon
9611 |       models with omega variation or covarion models with
9612 |       rate variation.
9613 |
9614 -----------------------------------------------------------------*/
TiProbs_GenCov(TreeNode * p,int division,int chain)9615 int TiProbs_GenCov (TreeNode *p, int division, int chain)
9616 {
9617     register int    i, j, k, n, s, index;
9618     int             sizeOfSingleCijk;
9619     MrBFlt          t, *eigenValues, *cijk, EigValexp[64], sum, *ptr, correctionFactor,
9620                     length, *bs;
9621     CLFlt           *tiP;
9622     ModelInfo       *m;
9623 
9624     m = &modelSettings[division];
9625     n = m->numModelStates;
9626 
9627     /* find the correction factor to make branch lengths
9628        in terms of expected number of substitutions per character */
9629     correctionFactor = 1.0;
9630     if (m->dataType == DNA || m->dataType == RNA)
9631         {
9632         if (m->nucModelId == NUCMODEL_DOUBLET)
9633             correctionFactor = 2.0;
9634         else if (m->nucModelId == NUCMODEL_CODON)
9635             correctionFactor = 3.0;
9636         }
9637 
9638     /* find transition probabilities */
9639     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
9640 
9641     /* get eigenvalues and cijk pointers */
9642     eigenValues = m->cijks[m->cijkIndex[chain]];
9643     cijk        = eigenValues + (2 * n);
9644 
9645     /* get offset size (we need to move the pointers to the appropriate
9646        cijk information for these models) */
9647     sizeOfSingleCijk = m->cijkLength / m->nCijkParts;
9648 
9649     /* find length */
9650     if (m->cppEvents != NULL)
9651         {
9652         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
9653         }
9654     else if (m->tk02BranchRates != NULL)
9655         {
9656         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
9657         }
9658     else if (m->igrBranchRates != NULL)
9659         {
9660         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
9661         }
9662     else if (m->mixedBrchRates != NULL)
9663         {
9664         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
9665         }
9666     else
9667         length = p->length;
9668 
9669     /* numerical errors will ensue if we allow very large or very small branch lengths,
9670        which might occur in relaxed clock models */
9671 
9672     /* fill in values */
9673     for (k=index=0; k<m->nCijkParts; k++)
9674         {
9675         t =  length * correctionFactor;
9676 
9677         if (t < TIME_MIN)
9678             {
9679             /* Fill in identity matrix */
9680             for (i=0; i<n; i++)
9681                 {
9682                 for (j=0; j<n; j++)
9683                     {
9684                     if (i == j)
9685                         tiP[index++] = 1.0;
9686                     else
9687                         tiP[index++] = 0.0;
9688                     }
9689                 }
9690             }
9691         else if (t > TIME_MAX)
9692             {
9693             /* Get base freq */
9694             bs = GetParamSubVals(m->stateFreq, chain, state[chain]);
9695             /* Fill in stationary matrix */
9696             for (i=0; i<n; i++)
9697                 for (j=0; j<n; j++)
9698                     tiP[index++] = (CLFlt) bs[j];
9699             }
9700         else
9701             {
9702             /* We actually need to do some work... */
9703             for (s=0; s<n; s++)
9704                 EigValexp[s] =  exp(eigenValues[s] * t);
9705 
9706             ptr = cijk;
9707             for (i=0; i<n; i++)
9708                 {
9709                 for (j=0; j<n; j++)
9710                     {
9711                     sum = 0.0;
9712                     for (s=0; s<n; s++)
9713                         sum += (*ptr++) * EigValexp[s];
9714                     tiP[index++] = (CLFlt) ((sum < 0.0) ? 0.0 : sum);
9715                     }
9716                 }
9717 
9718             /* increment pointers by m->cijkLength */
9719             if (k+1 < m->nCijkParts)
9720                 {
9721                 /* shift pointers */
9722                 eigenValues += sizeOfSingleCijk;
9723                 cijk        += sizeOfSingleCijk;
9724                 }
9725             }
9726         }
9727 
9728 #   if 0
9729     for (i=index=0; i<n; i++)
9730         {
9731         for (j=0; j<n; j++)
9732             printf ("%1.4lf ", tiP[index++]);
9733         printf ("\n");
9734         }
9735 #   endif
9736 
9737     return NO_ERROR;
9738 }
9739 
9740 
9741 /*-----------------------------------------------------------------
9742 |
9743 |   TiProbs_Hky: update transition probabilities for 4by4
9744 |       nucleotide model with nst == 2 (K80/HKY85)
9745 |       with or without rate variation
9746 |
9747 ------------------------------------------------------------------*/
TiProbs_Hky(TreeNode * p,int division,int chain)9748 int TiProbs_Hky (TreeNode *p, int division, int chain)
9749 {
9750     int         i, j, k, index;
9751     MrBFlt      t, kap, u, w, x, y, z, beta, bigPi_j[4], pij, bigPij, *pis,
9752                 *catRate, baseRate, theRate, length;
9753     CLFlt       *tiP;
9754     ModelInfo   *m;
9755 
9756     m = &modelSettings[division];
9757 
9758     /* find transition probabilities */
9759     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
9760 
9761     /* get kappa */
9762     kap =  *GetParamVals (m->tRatio, chain, state[chain]);
9763 
9764     /* get base frequencies */
9765     pis = GetParamSubVals (m->stateFreq, chain, state[chain]);
9766 
9767     /* get base rate */
9768     baseRate = GetRate (division, chain);
9769 
9770     /* compensate for invariable sites if appropriate */
9771     if (m->pInvar != NULL)
9772         baseRate /= (1.0 - (*GetParamVals(m->pInvar, chain, state[chain])));
9773 
9774     /* get category rates */
9775     theRate = 1.0;
9776     if (m->shape != NULL)
9777         catRate = GetParamSubVals (m->shape, chain, state[chain]);
9778     else if (m->mixtureRates != NULL)
9779         catRate = GetParamSubVals (m->mixtureRates, chain, state[chain]);
9780     else
9781         catRate = &theRate;
9782 
9783     /* rescale beta */
9784     beta =  0.5 / ((pis[0] + pis[2])*(pis[1] + pis[3]) + kap*((pis[0]*pis[2]) + (pis[1]*pis[3])));
9785 
9786     bigPi_j[0] = pis[0] + pis[2];
9787     bigPi_j[1] = pis[1] + pis[3];
9788     bigPi_j[2] = pis[0] + pis[2];
9789     bigPi_j[3] = pis[1] + pis[3];
9790 
9791     /* find length */
9792     if (m->cppEvents != NULL)
9793         {
9794         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
9795         }
9796     else if (m->tk02BranchRates != NULL)
9797         {
9798         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
9799         }
9800     else if (m->igrBranchRates != NULL)
9801         {
9802         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
9803         }
9804     else if (m->mixedBrchRates != NULL)
9805         {
9806         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
9807         }
9808     else
9809         length = p->length;
9810 
9811     /* numerical errors will ensue if we allow very large or very small branch lengths,
9812        which might occur in relaxed clock models */
9813 
9814     /* fill in values */
9815     for (k=index=0; k<m->numRateCats; k++)
9816         {
9817         t =  length * baseRate * catRate[k];
9818 
9819         if (t < TIME_MIN)
9820             {
9821             /* Fill in identity matrix */
9822             for (i=0; i<4; i++)
9823                 {
9824                 for (j=0; j<4; j++)
9825                     {
9826                     if (i == j)
9827                         tiP[index++] = 1.0;
9828                     else
9829                         tiP[index++] = 0.0;
9830                     }
9831                 }
9832             }
9833         else if (t > TIME_MAX)
9834             {
9835             /* Fill in stationary matrix */
9836             for (i=0; i<4; i++)
9837                 for (j=0; j<4; j++)
9838                     tiP[index++] = (CLFlt) pis[j];
9839             }
9840         else
9841             {
9842             /* calculate probabilities */
9843             for (i=0; i<4; i++)
9844                 {
9845                 for (j=0; j<4; j++)
9846                     {
9847                     bigPij = bigPi_j[j];
9848                     pij = pis[j];
9849                     u =  1.0/bigPij -  1.0;
9850                     w = -beta * (1.0 + bigPij * (kap -  1.0));
9851                     x =  exp(-beta * t);
9852                     y =  exp(w * t);
9853                     z = (bigPij - pij) / bigPij;
9854 
9855                     if (i == j)
9856                         tiP[index++] = (CLFlt) (pij + pij * u * x + z * y);
9857                     else if ((i == 0 && j == 2) || (i == 2 && j == 0) || (i == 1 && j == 3) || (i == 3 && j == 1))
9858                         tiP[index++] = (CLFlt) (pij + pij * u * x - (pij/bigPij) * y);
9859                     else
9860                         tiP[index++] = (CLFlt) (pij * (1.0 - x));
9861                     }
9862                 }
9863             }
9864         }
9865 
9866     return NO_ERROR;
9867 }
9868 
9869 
9870 /*-----------------------------------------------------------------
9871 |
9872 |   TiProbs_JukesCantor: update transition probabilities for 4by4
9873 |       nucleotide model with nst == 1 (Jukes-Cantor)
9874 |       with or without rate variation
9875 |
9876 ------------------------------------------------------------------*/
TiProbs_JukesCantor(TreeNode * p,int division,int chain)9877 int TiProbs_JukesCantor (TreeNode *p, int division, int chain)
9878 {
9879     /* calculate Jukes Cantor transition probabilities */
9880 
9881     int         i, j, k, index;
9882     MrBFlt      t, *catRate, baseRate, theRate, length;
9883     CLFlt       pNoChange, pChange, *tiP;
9884     ModelInfo   *m;
9885 
9886     m = &modelSettings[division];
9887 
9888     /* find transition probabilities */
9889     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
9890 
9891     /* get base rate */
9892     baseRate = GetRate (division, chain);
9893 
9894     /* compensate for invariable sites if appropriate */
9895     if (m->pInvar != NULL)
9896         baseRate /= (1.0 - (*GetParamVals(m->pInvar, chain, state[chain])));
9897 
9898     /* get category rates */
9899     theRate = 1.0;
9900     if (m->shape != NULL)
9901         catRate = GetParamSubVals (m->shape, chain, state[chain]);
9902     else if (m->mixtureRates != NULL)
9903         catRate = GetParamSubVals (m->mixtureRates, chain, state[chain]);
9904     else
9905         catRate = &theRate;
9906 
9907     /* find length */
9908     if (m->cppEvents != NULL)
9909         {
9910         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
9911         }
9912     else if (m->tk02BranchRates != NULL)
9913         {
9914         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
9915         }
9916     else if (m->igrBranchRates != NULL)
9917         {
9918         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
9919         }
9920     else if (m->mixedBrchRates != NULL)
9921         {
9922         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
9923         }
9924     else
9925         length = p->length;
9926 
9927     /* numerical errors will ensue if we allow very large or very small branch lengths,
9928        which might occur in relaxed clock models */
9929 
9930     /* fill in values */
9931     for (k=index=0; k<m->numRateCats; k++)
9932         {
9933         t = length * baseRate * catRate[k];
9934 
9935         if (t < TIME_MIN)
9936             {
9937             /* Fill in identity matrix */
9938             for (i=0; i<4; i++)
9939                 {
9940                 for (j=0; j<4; j++)
9941                     {
9942                     if (i == j)
9943                         tiP[index++] = 1.0;
9944                     else
9945                         tiP[index++] = 0.0;
9946                     }
9947                 }
9948             }
9949         else if (t > TIME_MAX)
9950             {
9951             /* Fill in stationary matrix */
9952             for (i=0; i<4; i++)
9953                 for (j=0; j<4; j++)
9954                     tiP[index++] = 0.25;
9955             }
9956         else
9957             {
9958             /* calculate probabilities */
9959             pChange   = (CLFlt) (0.25 - 0.25 * exp(-(4.0/3.0)*t));
9960             pNoChange = (CLFlt) (0.25 + 0.75 * exp(-(4.0/3.0)*t));
9961             for (i=0; i<4; i++)
9962                 {
9963                 for (j=0; j<4; j++)
9964                     {
9965                     if (i == j)
9966                         tiP[index++] = pNoChange;
9967                     else
9968                         tiP[index++] = pChange;
9969                     }
9970                 }
9971             }
9972         }
9973 
9974     return NO_ERROR;
9975 }
9976 
9977 
9978 /*-----------------------------------------------------------------
9979 |
9980 |   TiProbs_Res: update transition probabilities for binary
9981 |       restriction site model with or without rate variation
9982 |
9983 ------------------------------------------------------------------*/
TiProbs_Res(TreeNode * p,int division,int chain)9984 int TiProbs_Res (TreeNode *p, int division, int chain)
9985 {
9986     int         k, index;
9987     MrBFlt      baseRate, eV, mu, theRate, v,
9988                 *bs, *catRate, length;
9989     CLFlt       *tiP;
9990     ModelInfo   *m;
9991 
9992     /* find model settings for the division */
9993     m = &modelSettings[division];
9994 
9995     /* find transition probabilities */
9996     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
9997 
9998     /* get base rate */
9999     baseRate = GetRate (division, chain);
10000 
10001     /* get category rates */
10002     theRate = 1.0;
10003     if (m->shape != NULL)
10004         catRate = GetParamSubVals (m->shape, chain, state[chain]);
10005     else if (m->mixtureRates != NULL)
10006         catRate = GetParamSubVals (m->mixtureRates, chain, state[chain]);
10007     else
10008         catRate = &theRate;
10009 
10010     /* find base frequencies */
10011     bs = GetParamSubVals(m->stateFreq, chain, state[chain]);
10012 
10013     /* calculate scaling factor */
10014     mu =  1.0 / (2.0 * bs[0] * bs[1]);
10015 
10016     /* find length */
10017     if (m->cppEvents != NULL)
10018         {
10019         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
10020         }
10021     else if (m->tk02BranchRates != NULL)
10022         {
10023         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
10024         }
10025     else if (m->igrBranchRates != NULL)
10026         {
10027         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
10028         }
10029     else if (m->mixedBrchRates != NULL)
10030         {
10031         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
10032         }
10033     else
10034         length = p->length;
10035 
10036     /* numerical errors will ensue if we allow very large or very small branch lengths,
10037        which might occur in relaxed clock models */
10038 
10039     /* fill in values */
10040     for (k=index=0; k<m->numRateCats; k++)
10041         {
10042         v =  length * baseRate * catRate[k];
10043 
10044         if (v < TIME_MIN)
10045             {
10046             /* Fill in identity matrix */
10047             tiP[index++] = (CLFlt) (bs[0] + bs[1]);
10048             tiP[index++] = (CLFlt) (bs[1] - bs[1]);
10049             tiP[index++] = (CLFlt) (bs[0] - bs[0]);
10050             tiP[index++] = (CLFlt) (bs[1] + bs[0]);
10051             }
10052         else if (v > TIME_MAX)
10053             {
10054             /* Fill in stationary matrix */
10055             tiP[index++] = (CLFlt) bs[0];
10056             tiP[index++] = (CLFlt) bs[1];
10057             tiP[index++] = (CLFlt) bs[0];
10058             tiP[index++] = (CLFlt) bs[1];
10059             }
10060         else
10061             {
10062             /* calculate probabilities */
10063             eV =  exp(-mu * v);
10064             tiP[index++] = (CLFlt) (bs[0] + bs[1] * eV);
10065             tiP[index++] = (CLFlt) (bs[1] - bs[1] * eV);
10066             tiP[index++] = (CLFlt) (bs[0] - bs[0] * eV);
10067             tiP[index++] = (CLFlt) (bs[1] + bs[0] * eV);
10068             }
10069         }
10070 
10071     return NO_ERROR;
10072 }
10073 
10074 
10075 /*-----------------------------------------------------------------
10076 |
10077 |   TiProbs_Std: update transition probabilities for
10078 |       variable states model with or without rate variation
10079 |
10080 ------------------------------------------------------------------*/
TiProbs_Std(TreeNode * p,int division,int chain)10081 int TiProbs_Std (TreeNode *p, int division, int chain)
10082 {
10083     int         b, c, i, j, k, n, s, nStates, index=0, index2;
10084     MrBFlt      v, eV1, eV2, eV3, eV4, eV5, *catRate,
10085                 baseRate, theRate, pi, f1, f2, f3, f4, f5, f6, f7, root,
10086                 *eigenValues, *cijk, sum, *bs, mu, length;
10087     CLFlt       pNoChange, pChange, *tiP;
10088     ModelInfo   *m;
10089 #   if defined (DEBUG_TIPROBS_STD)
10090     int         index3;
10091 #   endif
10092 
10093     m = &modelSettings[division];
10094 
10095     /* find transition probabilities */
10096     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
10097 
10098     /* get base rate */
10099     baseRate = GetRate (division, chain);
10100 
10101     /* get category rates */
10102     theRate = 1.0;
10103     if (m->shape != NULL)
10104         catRate = GetParamSubVals (m->shape, chain, state[chain]);
10105     else if (m->mixtureRates != NULL)
10106         catRate = GetParamSubVals (m->mixtureRates, chain, state[chain]);
10107     else
10108         catRate = &theRate;
10109 
10110 #   if defined (DEBUG_TIPROBS_STD)
10111     /* find base frequencies */
10112     bs = GetParamStdStateFreqs (m->stateFreq, chain, state[chain]);
10113 #   endif
10114 
10115     /* find length */
10116     if (m->cppEvents != NULL)
10117         {
10118         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
10119         }
10120     else if (m->tk02BranchRates != NULL)
10121         {
10122         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
10123         }
10124     else if (m->igrBranchRates != NULL)
10125         {
10126         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
10127         }
10128     else if (m->mixedBrchRates != NULL)
10129         {
10130         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
10131         }
10132     else
10133         length = p->length;
10134 
10135     /* numerical errors will ensue if we allow very large or very small branch lengths, which might
10136        occur in relaxed clock models; an elegant solution would be to substitute the stationary
10137        probs and initial probs but for now we truncate lengths at small or large values TODO */
10138     if (length > BRLENS_MAX)
10139         length = BRLENS_MAX;
10140     else if (length < BRLENS_MIN)
10141         length = BRLENS_MIN;
10142 
10143     /* fill in values; this has to be done differently if state freqs are not equal */
10144     if (m->stateFreq->paramId == SYMPI_EQUAL)
10145         {
10146         /* equal state frequencies */
10147         /* fill in values for unordered characters */
10148         index = 0;
10149 #   if defined (DEBUG_TIPROBS_STD)
10150         index3 = 0;
10151 #   endif
10152         for (nStates=2; nStates<=10; nStates++)
10153             {
10154             if (m->isTiNeeded[nStates-2] == NO)
10155                 continue;
10156             for (k=0; k<m->numRateCats; k++)
10157                 {
10158                 /* calculate probabilities */
10159                 v =  length * catRate[k] * baseRate;
10160                 eV1 =  exp(-(nStates / (nStates -  1.0)) * v);
10161                 pChange   = (CLFlt) ((1.0 / nStates) - ((1.0 / nStates) * eV1));
10162                 pNoChange = (CLFlt) ((1.0 / nStates) + ((nStates - 1.0) / nStates) * eV1);
10163                 if (pChange<0.0)
10164                     pChange = (CLFlt) 0.0;
10165                 for (i=0; i<nStates; i++)
10166                     {
10167                     for (j=0; j<nStates; j++)
10168                         {
10169                         if (i == j)
10170                             tiP[index++] = pNoChange;
10171                         else
10172                             tiP[index++] = pChange;
10173                         }
10174                     }
10175 #   if defined (DEBUG_TIPROBS_STD)
10176                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
10177 #   endif
10178                 }
10179 #   if defined (DEBUG_TIPROBS_STD)
10180             index3 += nStates;
10181 #   endif
10182             }
10183 
10184         /* fill in values for 3-state ordered character */
10185         if (m->isTiNeeded[9] == YES)
10186             {
10187             nStates = 3;
10188             for (k=0; k<m->numRateCats; k++)
10189                 {
10190                 /* calculate probabilities */
10191                 v =  length * catRate[k] * baseRate;
10192                 eV1 =  exp (-(3.0 / 4.0) * v);
10193                 eV2 =  exp (-(9.0 / 4.0) * v);
10194 
10195                 /* pij(0,0) */
10196                 tiP[index] = (CLFlt) ((1.0 / 3.0) + (eV1 / 2.0) + (eV2 / 6.0));
10197                 /* pij(0,1) = pij(1,0) */
10198                 tiP[index+1] = tiP[index+3] = (CLFlt) ((1.0 / 3.0) - (eV2 / 3.0));
10199                 /* pij(0,2) */
10200                 tiP[index+2] = (CLFlt) ((1.0 / 3.0) - (eV1 / 2.0) + (eV2 / 6.0));
10201                 /* pij(1,1) */
10202                 tiP[index+4] = (CLFlt) ((1.0 / 3.0) + (2.0 * eV2 / 3.0));
10203 
10204                 /* fill in mirror part of matrix */
10205                 index += 5;
10206                 index2 = index - 2;
10207                 for (i=0; i<4; i++)
10208                     tiP[index++] = tiP[index2--];
10209 
10210                 /* make sure no value is negative */
10211                 for (i=index-(nStates*nStates); i<index; i++) {
10212                     if (tiP[i] < 0.0)
10213                         tiP[i] = (CLFlt) 0.0;
10214                 }
10215 #   if defined (DEBUG_TIPROBS_STD)
10216                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
10217 #   endif
10218                 }
10219 
10220 #   if defined (DEBUG_TIPROBS_STD)
10221             index3 += nStates;
10222 #   endif
10223             }
10224 
10225         /* 4-state ordered character */
10226         if (m->isTiNeeded[10] == YES)
10227             {
10228             nStates = 4;
10229             pi = 1.0 / 4.0;
10230             root =  sqrt (2.0);
10231             f1 = root +  1.0;
10232             f2 = root -  1.0;
10233 
10234             for (k=0; k<m->numRateCats; k++)
10235                 {
10236                 /* calculate probabilities */
10237                 v =  length * catRate[k] * baseRate;
10238                 eV1 =  1.0 / (exp ((4.0 * v) / 3.0));
10239                 eV2 =  exp ((2.0 * (root - 2.0) * v) / 3.0) / root;
10240                 eV3 =  1.0 / (root *  exp ((2.0 * (root + 2.0) * v) / 3.0));
10241 
10242                 /* pij(0,0) */
10243                 tiP[index] = (CLFlt) (pi * (1.0 + eV1 + (f1*eV2) + (f2*eV3)));
10244                 /* pij(0,1) = pij(1,0) */
10245                 tiP[index+1] = tiP[index+4] = (CLFlt) (pi * (1.0 - eV1 + eV2 - eV3));
10246                 /* pij(0,2) = tiP(1,3) */
10247                 tiP[index+2] = tiP[index+7] = (CLFlt) (pi * (1.0 - eV1 - eV2 + eV3));
10248                 /* pij(0,3) */
10249                 tiP[index+3] = (CLFlt) (pi * (1.0 + eV1 - (f1*eV2) - (f2*eV3)));
10250                 /* pij(1,1) */
10251                 tiP[index+5] = (CLFlt) (pi * (1.0 + eV1 + (f2*eV2) + (f1*eV3)));
10252                 /* pij(1,2) */
10253                 tiP[index+6] = (CLFlt) (pi * (1.0 + eV1 - (f2*eV2) - (f1*eV3)));
10254 
10255                 /* fill in mirror part of matrix */
10256                 index += 8;
10257                 index2 = index - 1;
10258                 for (i=0; i<8; i++)
10259                     tiP[index++] = tiP[index2--];
10260 
10261                 /* make sure no value is negative */
10262                 for (i=index-(nStates*nStates); i<index; i++) {
10263                     if (tiP[i] < 0.0)
10264                         tiP[i] = (CLFlt) 0.0;
10265                 }
10266 #   if defined (DEBUG_TIPROBS_STD)
10267                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
10268 #   endif
10269                 }
10270 #   if defined (DEBUG_TIPROBS_STD)
10271             index3 += nStates;
10272 #   endif
10273             }
10274 
10275         /* 5-state ordered character */
10276         if (m->isTiNeeded[11] == YES)
10277             {
10278             nStates = 5;
10279             pi = 1.0 / 5.0;
10280             root =  sqrt (5.0);
10281 
10282             f5 = root /  4.0;
10283             f1 =  0.75 + f5;;
10284             f2 =  1.25 + f5;
10285             f3 =  1.25 - f5;
10286             f4 =  0.75 - f5;
10287             f5 = f5 *  2.0;
10288             f6 = f5 +  0.5;
10289             f7 = f5 -  0.5;
10290 
10291             for (k=0; k<m->numRateCats; k++)
10292                 {
10293                 /* calculate probabilities */
10294                 v =  length * catRate[k] * baseRate;
10295                 v *=  5.0 /  16.0;
10296 
10297                 eV1 =  exp ((root -  3.0) * v);
10298                 eV2 =  exp (-(root +  3.0) * v);
10299                 eV3 =  exp ((root -  5.0) * v);
10300                 eV4 =  exp (-(root +  5.0) * v);
10301 
10302                 /* pij(0,0) */
10303                 tiP[index] = (CLFlt) (pi* (1.0 + (f1*eV3) + (f2*eV1) + (f3*eV2) + (f4*eV4)));
10304                 /* pij(0,1) = pij(1,0) */
10305                 tiP[index+1] = tiP[index+5] =
10306                     (CLFlt) (pi*(1.0 - (eV3/2.0) + (f5*eV1) - (f5*eV2) - (eV4/2.0)));
10307                 /* pij(0,2) = pij(2,0) */
10308                 tiP[index+2] = tiP[index+10] = (CLFlt) (pi*(1.0 - (f6*eV3) + (f7*eV4)));
10309                 /* pij(0,3) = pij(1,4) */
10310                 tiP[index+3] = tiP[index+9] =
10311                     (CLFlt) (pi*(1.0 - (eV3/2.0) - (f5*eV1) + (f5*eV2) - (eV4/2.0)));
10312                 /* pij(0,4) */
10313                 tiP[index+4] = (CLFlt) (pi*(1.0 + (f1*eV3) - (f2*eV1) - (f3*eV2) + (f4*eV4)));
10314                 /* pij(1,1) */
10315                 tiP[index+6] = (CLFlt) (pi*(1.0 + (f4*eV3) + (f3*eV1) + (f2*eV2) + (f1*eV4)));
10316                 /* pij(1,2) = pij(2,1) */
10317                 tiP[index+7] = tiP[index+11] = (CLFlt) (pi*(1.0 + (f7*eV3) - (f6*eV4)));
10318                 /* pij(1,3) */
10319                 tiP[index+8] = (CLFlt) (pi*(1.0 + (f4*eV3) - (f3*eV1) - (f2*eV2) + (f1*eV4)));
10320                 /* pij(2,2) */
10321                 tiP[index+12] = (CLFlt) (pi*(1.0 + (2.0*eV3) + (2.0*eV4)));
10322 
10323                 /* fill in mirror part of matrix */
10324                 index += 13;
10325                 index2 = index - 2;
10326                 for (i=0; i<12; i++)
10327                     tiP[index++] = tiP[index2--];
10328 
10329                 /* make sure no value is negative */
10330                 for (i=index-(nStates*nStates); i<index; i++) {
10331                     if (tiP[i] < 0.0)
10332                         tiP[i] = (CLFlt) 0.0;
10333                 }
10334 #   if defined (DEBUG_TIPROBS_STD)
10335                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
10336 #   endif
10337                 }
10338 #   if defined (DEBUG_TIPROBS_STD)
10339             index3 += nStates;
10340 #   endif
10341             }
10342 
10343         /* 6-state ordered character */
10344         if (m->isTiNeeded[12] == YES)
10345             {
10346             nStates = 6;
10347             pi =  1.0 /  6.0;
10348             root =  sqrt (3.0);
10349 
10350             f4 = (3.0 / (2.0 * root));
10351             f1 =  1.0 + f4;
10352             f2 =  1.0 - f4;
10353             f3 =  0.5 + f4;
10354             f4 =  0.5 - f4;
10355 
10356             for (k=0; k<m->numRateCats; k++)
10357                 {
10358                 /* calculate probabilities */
10359                 v =  length * catRate[k] * baseRate;
10360                 v /=  5.0;
10361 
10362                 eV1 =  exp (-9 * v);
10363                 eV2 =  exp (-6 * v);
10364                 eV3 =  exp (-3 * v);
10365                 eV4 =  exp (3.0 * (root - 2.0) * v);
10366                 eV5 =  exp (-3.0 * (root + 2.0) * v);
10367 
10368                 /* pij(0,0) */
10369                 tiP[index] = (CLFlt) (pi* (1.0 + (0.5*eV1) + eV2 + (1.5*eV3) + (f1*eV4) + (f2*eV5)));
10370                 /* pij(0,1) = pij(1,0) */
10371                 tiP[index+1] = tiP[index+6] = (CLFlt) (pi*(1.0 - eV1 - eV2 + (f3*eV4) + (f4*eV5)));
10372                 /* pij(0,2) = pij(2,0) */
10373                 tiP[index+2] = tiP[index+12] =
10374                     (CLFlt) (pi*(1.0 + (0.5*eV1) - eV2 - (1.5*eV3) + (0.5*eV4) + (0.5*eV5)));
10375                 /* pij(0,3) = pij(2,5) */
10376                 tiP[index+3] = tiP[index+17] =
10377                     (CLFlt) (pi*(1.0 + (0.5*eV1) + eV2 - (1.5*eV3) - (0.5*eV4) - (0.5*eV5)));
10378                 /* pij(0,4) = pij(1,5) */
10379                 tiP[index+4] = tiP[index+11] = (CLFlt) (pi*(1.0 - eV1 + eV2 - (f3*eV4) - (f4*eV5)));
10380                 /* pij(0,5) */
10381                 tiP[index+5] = (CLFlt) (pi*(1.0 + (0.5*eV1) - eV2 + (1.5*eV3) - (f1*eV4) - (f2*eV5)));
10382                 /* pij(1,1) */
10383                 tiP[index+7] = (CLFlt) (pi*(1.0 + (2.0*eV1) + eV2 + eV4 + eV5));
10384                 /* pij(1,2) = pij(2,1) */
10385                 tiP[index+8] = tiP[index+13] = (CLFlt) (pi*(1.0 - eV1 + eV2 - (f4*eV4) - (f3*eV5)));
10386                 /* pij(1,3) = pij(2,4) */
10387                 tiP[index+9] = tiP[index+16] = (CLFlt) (pi*(1.0 - eV1 - eV2 + (f4*eV4) + (f3*eV5)));
10388                 /* pij(1,4) */
10389                 tiP[index+10] = (CLFlt) (pi*(1.0 + (2.0*eV1) - eV2 - eV4 - eV5));
10390                 /* pij(2,2) */
10391                 tiP[index+14] = (CLFlt) (pi*(1.0 + (0.5*eV1) + eV2 + (1.5*eV3) + (f2*eV4) + (f1*eV5)));
10392                 /* pij(2,3) */
10393                 tiP[index+15] = (CLFlt) (pi*(1.0 + (0.5*eV1) - eV2 + (1.5*eV3) - (f2*eV4) - (f1*eV5)));
10394 
10395                 /* fill in mirror part of matrix */
10396                 index += 18;
10397                 index2 = index - 1;
10398                 for (i=0; i<18; i++)
10399                     tiP[index++] = tiP[index2--];
10400 
10401                 /* make sure no value is negative */
10402                 for (i=index-(nStates*nStates); i<index; i++) {
10403                     if (tiP[i] < 0.0)
10404                         tiP[i] = (CLFlt) 0.0;
10405                 }
10406 #   if defined (DEBUG_TIPROBS_STD)
10407                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
10408 #   endif
10409                 }
10410 #   if defined (DEBUG_TIPROBS_STD)
10411             index3 += nStates;
10412 #   endif
10413             }
10414         }
10415     else
10416         {
10417         /* unequal state frequencies */
10418         index = 0;
10419 
10420         /* first fill in for binary characters using beta categories if needed */
10421         if (m->isTiNeeded[0] == YES)
10422             {
10423             /* find base frequencies */
10424             bs = GetParamStdStateFreqs (m->stateFreq, chain, state[chain]);
10425 
10426             /* cycle through beta and gamma cats */
10427             for (b=0; b<m->numBetaCats; b++)
10428                 {
10429                 mu =  1.0 / (2.0 * bs[0] * bs[1]);
10430                 for (k=0; k<m->numRateCats; k++)
10431                     {
10432                     /* calculate probabilities */
10433                     v =  length * catRate[k] * baseRate;
10434                     eV1 =  exp(- mu * v);
10435                     tiP[index++] = (CLFlt) (bs[0] + (bs[1] * eV1));
10436                     tiP[index++] = (CLFlt) (bs[1] - (bs[1] * eV1));
10437                     tiP[index++] = (CLFlt) (bs[0] - (bs[0] * eV1));
10438                     tiP[index++] = (CLFlt) (bs[1] + (bs[0] * eV1));
10439                     }
10440                 /* update stationary state frequency pointer */
10441                 bs += 2;
10442                 }
10443             }
10444 
10445         /* now use general algorithm for the other cases */
10446         if (m->cijkLength > 0)
10447             {
10448             /* first update cijk if necessary */
10449             if (m->cijkLength > 0 && m->upDateCijk == YES)
10450                 {
10451                 if (UpDateCijk (division, chain) == ERROR)
10452                     return (ERROR);
10453                 }
10454 
10455             /* then get first set of eigenvalues */
10456             eigenValues = m->cijks[m->cijkIndex[chain]];
10457 
10458             /* and cycle through the relevant characters */
10459             for (c=0; c<m->stateFreq->nSympi; c++)
10460                 {
10461                 n = m->stateFreq->sympinStates[c];
10462 
10463                 /* fill in values */
10464                 for (k=0; k<m->numRateCats; k++)
10465                     {
10466                     v =  length * baseRate * catRate[k];
10467                     cijk = eigenValues + (2 * n);
10468 
10469                     for (i=0; i<n; i++)
10470                         {
10471                         for (j=0; j<n; j++)
10472                             {
10473                             sum = 0.0;
10474                             for (s=0; s<n; s++)
10475                                 sum += (*cijk++) * exp(eigenValues[s] * v);
10476                             tiP[index++] = (CLFlt) ((sum <  0.0) ?  0.0 : sum);
10477                             }
10478                         }
10479                     }
10480 
10481                 /* update eigenValues pointer */
10482                 eigenValues += (n * n * n) + (2 * n);
10483                 }
10484             }
10485         }
10486 
10487     return NO_ERROR;
10488 }
10489 
10490 
UpDateCijk(int whichPart,int whichChain)10491 int UpDateCijk (int whichPart, int whichChain)
10492 {
10493     int         c, i, j, k, n, n3, isComplex, sizeOfSingleCijk, cType, numQAllocated;
10494     MrBFlt      **q[100], **eigvecs, **inverseEigvecs;
10495     MrBFlt      *eigenValues, *eigvalsImag, *cijk;
10496     MrBFlt      *bs, *bsBase, *rateOmegaValues=NULL, rA=0.0, rS=0.0, posScaler, *omegaCatFreq=NULL;
10497     MrBComplex     **Ceigvecs, **CinverseEigvecs;
10498     ModelInfo   *m;
10499     Param       *p;
10500 #   if defined (BEAGLE_ENABLED)
10501     int         u, divisionOffset;
10502     double      *beagleEigvecs=NULL, *beagleInverseEigvecs=NULL;
10503 #   endif
10504 
10505     /* get a pointer to the model settings for this partition */
10506     m = &modelSettings[whichPart];
10507     assert (m->upDateCijk == YES);
10508 
10509     /* we should only go through here if we have cijk information available for the partition */
10510     if (m->cijkLength > 0)
10511         {
10512         /* flip cijk space */
10513         FlipCijkSpace(m, whichChain);
10514 
10515         /* figure out information on either omega values or rate values, if necessary */
10516         if (m->dataType == DNA || m->dataType == RNA)
10517             {
10518             if (m->nucModelId == NUCMODEL_CODON)                                                    /* we have a NY98 model     */
10519                 {
10520                 rateOmegaValues = GetParamVals(m->omega, whichChain, state[whichChain]);
10521                 if (m->numOmegaCats > 1)
10522                     omegaCatFreq = GetParamSubVals (m->omega, whichChain, state[whichChain]);
10523                 }
10524             else if (m->nCijkParts > 1 && m->nucModelId == NUCMODEL_4BY4 && m->numModelStates == 8)
10525                 {
10526                 /* we have a covarion (covariotide) model with rate variation */
10527                 if (m->shape != NULL)
10528                     rateOmegaValues = GetParamSubVals (m->shape, whichChain, state[whichChain]);
10529                 else if (m->mixtureRates != NULL)
10530                     rateOmegaValues = GetParamSubVals (m->mixtureRates, whichChain, state[whichChain]);
10531                 }
10532             }
10533         else if (m->dataType == PROTEIN)
10534             {
10535             if (m->nCijkParts > 1)
10536                 {
10537                 /* we have a covarion model with rate variation */
10538                 if (m->shape != NULL)
10539                     rateOmegaValues = GetParamSubVals (m->shape, whichChain, state[whichChain]);
10540                 else if (m->mixtureRates != NULL)
10541                     rateOmegaValues = GetParamSubVals (m->mixtureRates, whichChain, state[whichChain]);
10542                 }
10543             }
10544 #   if defined (BEAGLE_ENABLED)
10545         else if (m->dataType == RESTRICTION){}
10546 #   endif
10547         else if (m->dataType != STANDARD)
10548             {
10549             MrBayesPrint ("%s   ERROR: Should not be updating cijks!\n", spacer);
10550             return (ERROR);
10551             }
10552 
10553         if (m->dataType == STANDARD)
10554             {
10555             /* set pointers and other stuff needed */
10556             numQAllocated = 1;
10557             p = m->stateFreq;
10558             eigenValues = m->cijks[m->cijkIndex[whichChain]];
10559             q[0] = AllocateSquareDoubleMatrix (10);
10560             eigvecs = AllocateSquareDoubleMatrix (10);
10561             inverseEigvecs = AllocateSquareDoubleMatrix (10);
10562             Ceigvecs = AllocateSquareComplexMatrix (10);
10563             CinverseEigvecs = AllocateSquareComplexMatrix (10);
10564             bsBase = GetParamStdStateFreqs (m->stateFreq, whichChain, state[whichChain]);
10565 
10566             /* cycle over characters needing cijks */
10567             for (c=0; c<p->nSympi; c++)
10568                 {
10569                 n = p->sympinStates[c];
10570                 bs = bsBase + p->sympiBsIndex[c];
10571                 cType = p->sympiCType[c];
10572                 n3 = n * n * n;
10573                 eigvalsImag = eigenValues + n;
10574                 cijk = eigenValues + (2 * n);
10575                 if (SetStdQMatrix (q[0], n, bs, cType) == ERROR)
10576                     return (ERROR);
10577                 isComplex = GetEigens (n, q[0], eigenValues, eigvalsImag, eigvecs, inverseEigvecs, Ceigvecs, CinverseEigvecs);
10578                 if (isComplex == NO)
10579                     {
10580                     CalcCijk (n, cijk, eigvecs, inverseEigvecs);
10581                     }
10582                 else
10583                     {
10584                     if (isComplex == YES)
10585                         MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
10586                     else
10587                         MrBayesPrint ("%s   ERROR: Computing eigenvalues problem!\n", spacer);
10588                     goto errorExit;
10589                     }
10590                 eigenValues += (n3 + (2 * n));
10591                 }
10592             }
10593         else
10594             {
10595             /* all other data types */
10596             numQAllocated = m->nCijkParts;
10597             sizeOfSingleCijk = m->cijkLength / m->nCijkParts;
10598             n = m->numModelStates;
10599 #   if defined (BEAGLE_ENABLED)
10600             if (m->useBeagle == YES)
10601                 eigenValues = m->cijks[m->cijkIndex[whichChain]/m->nCijkParts];
10602             else
10603                 eigenValues = m->cijks[m->cijkIndex[whichChain]];
10604 #   else
10605             eigenValues = m->cijks[m->cijkIndex[whichChain]];
10606 #   endif
10607             eigvalsImag = eigenValues + n;
10608             cijk        = eigenValues + (2 * n);
10609             for (k=0; k<numQAllocated; k++)
10610                 q[k] = AllocateSquareDoubleMatrix (n);
10611             eigvecs = AllocateSquareDoubleMatrix (n);
10612             inverseEigvecs = AllocateSquareDoubleMatrix (n);
10613             Ceigvecs = AllocateSquareComplexMatrix (n);
10614             CinverseEigvecs = AllocateSquareComplexMatrix (n);
10615 
10616             if (m->nCijkParts == 1)
10617                 {
10618                 if (m->dataType == DNA || m->dataType == RNA)
10619                     {
10620                     if (m->nucModelId == NUCMODEL_CODON)
10621                         {
10622                         if (SetNucQMatrix (q[0], n, whichChain, whichPart, rateOmegaValues[0], &rA, &rS) == ERROR)
10623                             goto errorExit;
10624                         }
10625                     else
10626                         {
10627                         if (SetNucQMatrix (q[0], n, whichChain, whichPart, 1.0, &rA, &rS) == ERROR)
10628                             goto errorExit;
10629                         }
10630                     }
10631 #   if defined (BEAGLE_ENABLED)
10632                 else if (m->dataType == RESTRICTION)
10633                     {
10634                     SetBinaryQMatrix (q[0], whichChain, whichPart);
10635                     }
10636 #   endif
10637                 else
10638                     {
10639                     if (SetProteinQMatrix (q[0], n, whichChain, whichPart, 1.0) == ERROR)
10640                         goto errorExit;
10641                     }
10642                 isComplex = GetEigens (n, q[0], eigenValues, eigvalsImag, eigvecs, inverseEigvecs, Ceigvecs, CinverseEigvecs);
10643 #   if defined (BEAGLE_ENABLED)
10644                 if (isComplex == YES)
10645                     {
10646                     if (isComplex == YES)
10647                         MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
10648                     else
10649                         MrBayesPrint ("%s   ERROR: Computing eigenvalues problem!\n", spacer);
10650                     goto errorExit;
10651                     }
10652                 if (m->useBeagle == YES)
10653                     {
10654                     /* TODO: only allocate this space once at initialization */
10655                     beagleEigvecs = (double*) SafeCalloc (2*n*n, sizeof(double));
10656                     beagleInverseEigvecs = beagleEigvecs + n*n;
10657                     for (i=k=0; i<n; i++)
10658                         {
10659                         // eigenValues[i] = 0.1;
10660                         for (j=0; j<n; j++)
10661                             {
10662                             beagleEigvecs[k] = eigvecs[i][j];
10663                             beagleInverseEigvecs[k] = inverseEigvecs[i][j];
10664                             k++;
10665                             }
10666                         }
10667                     divisionOffset = 0;
10668                     if (m->useBeagleMultiPartitions == YES)
10669                         divisionOffset = (numLocalChains + 1) * m->nCijkParts * m->divisionIndex;
10670                     beagleSetEigenDecomposition(m->beagleInstance,
10671                                                 m->cijkIndex[whichChain] + divisionOffset,
10672                                                 beagleEigvecs,
10673                                                 beagleInverseEigvecs,
10674                                                 eigenValues);
10675                     free(beagleEigvecs);
10676                     }
10677                 else
10678                     {
10679                     CalcCijk (n, cijk, eigvecs, inverseEigvecs);
10680                     }
10681 #   else
10682                 if (isComplex == NO)
10683                     {
10684                     CalcCijk (n, cijk, eigvecs, inverseEigvecs);
10685                     }
10686                 else
10687                     {
10688                     MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
10689                     goto errorExit;
10690                     }
10691 #   endif
10692                 }
10693             else
10694                 {
10695                 /* Here, we calculate the rate matrices (Q) for various nucleotide and amino acid
10696                    data models. Usually, when the rate matrix is set in SetNucQMatrix, it is scaled
10697                    such that the average substitution rate is one. However, there is a complication
10698                    for positive selection models using codon rate matrices. First, we have more than
10699                    one matrix; in fact, we have as many rate matrices as there are omega values. Second,
10700                    the mean substitution rate still has to be one. And third, we want the synonymous
10701                    rate to be the same across the rate matrices. For positive selection models, the Q
10702                    matrix comes out of SetNucQMatrix unscaled. Once we have all m->nCijkParts rate
10703                    matrices, we then scale again, this time to ensure that the mean substitution rate is one. */
10704 
10705                 /* First, calculate rate matrices for each category: */
10706                 posScaler = 0.0;
10707                 for (k=0; k<m->nCijkParts; k++)
10708                     {
10709                     if (m->dataType == DNA || m->dataType == RNA)
10710                         {
10711                         if (SetNucQMatrix (q[k], n, whichChain, whichPart, rateOmegaValues[k], &rA, &rS) == ERROR)
10712                             goto errorExit;
10713                         }
10714                     else
10715                         {
10716                         if (SetProteinQMatrix (q[k], n, whichChain, whichPart, rateOmegaValues[k]) == ERROR)
10717                             goto errorExit;
10718                         }
10719                     if (m->nucModelId == NUCMODEL_CODON && m->numOmegaCats > 1)
10720                         posScaler += omegaCatFreq[k] * (rS + rA);
10721                     }
10722 
10723                 /* Then rescale the rate matrices, if this is a positive selection model: */
10724                 if (m->nucModelId == NUCMODEL_CODON && m->numOmegaCats > 1)
10725                     {
10726                     posScaler = 1.0 / posScaler;
10727                     for (k=0; k<m->nCijkParts; k++)
10728                         {
10729                         for (i=0; i<n; i++)
10730                             for (j=0; j<n; j++)
10731                                 q[k][i][j] *= posScaler;
10732                         }
10733                     }
10734 
10735                 /* Finally, calculate eigenvalues, etc.: */
10736 #   if defined (BEAGLE_ENABLED)
10737                 if (m->useBeagle == YES)
10738                     {
10739                     /* TODO: only allocate this space once at initialization */
10740                     beagleEigvecs = (double*) SafeCalloc (2*n*n, sizeof(double));
10741                     beagleInverseEigvecs = beagleEigvecs + n*n;
10742                     }
10743 #   endif
10744                 for (k=0; k<m->nCijkParts; k++)
10745                     {
10746                     isComplex = GetEigens (n, q[k], eigenValues, eigvalsImag, eigvecs, inverseEigvecs, Ceigvecs, CinverseEigvecs);
10747 #   if defined (BEAGLE_ENABLED)
10748                     if (isComplex == YES)
10749                         {
10750                         if (isComplex == YES)
10751                             MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
10752                         else
10753                             MrBayesPrint ("%s   ERROR: Computing eigenvalues problem!\n", spacer);
10754                         goto errorExit;
10755                         }
10756                     if (m->useBeagle == YES)
10757                         {
10758                         for (i=u=0; i<n; i++)
10759                             {
10760                             for (j=0; j<n; j++)
10761                                 {
10762                                 beagleEigvecs[u] = eigvecs[i][j];
10763                                 beagleInverseEigvecs[u] = inverseEigvecs[i][j];
10764                                 u++;
10765                                 }
10766                             }
10767                         divisionOffset = 0;
10768                         if (m->useBeagleMultiPartitions == YES)
10769                             divisionOffset = (numLocalChains + 1) * m->nCijkParts * m->divisionIndex;
10770                         beagleSetEigenDecomposition(m->beagleInstance,
10771                                                     m->cijkIndex[whichChain] + k + divisionOffset,
10772                                                     beagleEigvecs,
10773                                                     beagleInverseEigvecs,
10774                                                     eigenValues);
10775                         }
10776                     else
10777                         {
10778                         CalcCijk (n, cijk, eigvecs, inverseEigvecs);
10779                         }
10780 #   else
10781                     if (isComplex == NO)
10782                         {
10783                         CalcCijk (n, cijk, eigvecs, inverseEigvecs);
10784                         }
10785                     else
10786                         {
10787                         MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
10788                         goto errorExit;
10789                         }
10790 #   endif
10791                     /* shift pointers */
10792                     eigenValues += sizeOfSingleCijk;
10793                     eigvalsImag += sizeOfSingleCijk;
10794                     cijk        += sizeOfSingleCijk;
10795                     }
10796 #   if defined (BEAGLE_ENABLED)
10797                 free(beagleEigvecs);
10798 #   endif
10799                 }
10800             }
10801 
10802         for (k=0; k<numQAllocated; k++)
10803             FreeSquareDoubleMatrix (q[k]);
10804         FreeSquareDoubleMatrix (eigvecs);
10805         FreeSquareDoubleMatrix (inverseEigvecs);
10806         FreeSquareComplexMatrix (Ceigvecs);
10807         FreeSquareComplexMatrix (CinverseEigvecs);
10808         }
10809 
10810     return (NO_ERROR);
10811 
10812     errorExit:
10813         for (k=0; k<numQAllocated; k++)
10814             FreeSquareDoubleMatrix (q[k]);
10815         FreeSquareDoubleMatrix (eigvecs);
10816         FreeSquareDoubleMatrix (inverseEigvecs);
10817         FreeSquareComplexMatrix (Ceigvecs);
10818         FreeSquareComplexMatrix (CinverseEigvecs);
10819 
10820         return ERROR;
10821 }
10822 
10823