1 /**
2  * PLL (version 1.0.0) a software library for phylogenetic inference
3  * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
4  *
5  * Derived from
6  * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
7  * trees by Alexandros Stamatakis
8  *
9  * This program is free software: you can redistribute it and/or modify it
10  * under the terms of the GNU General Public License as published by the Free
11  * Software Foundation, either version 3 of the License, or (at your option)
12  * any later version.
13  *
14  * This program is distributed in the hope that it will be useful, but WITHOUT
15  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
17  * more details.
18  *
19  * You should have received a copy of the GNU General Public License along with
20  * this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  * For any other enquiries send an Email to Tomas Flouri
23  * Tomas.Flouri@h-its.org
24  *
25  * When publishing work that uses PLL please cite PLL
26  *
27  * @file parsimony.c
28  */
29 #include "mem_alloc.h"
30 
31 #ifndef WIN32
32 #include <sys/times.h>
33 #include <sys/types.h>
34 #include <sys/time.h>
35 #include <unistd.h>
36 #endif
37 
38 #include <limits.h>
39 #include <math.h>
40 #include <time.h>
41 #include <stdlib.h>
42 #include <stdio.h>
43 #include <ctype.h>
44 #include <string.h>
45 #include <stdint.h>
46 #include <assert.h>
47 
48 #if defined(__MIC_NATIVE)
49 
50 #include <immintrin.h>
51 
52 #define INTS_PER_VECTOR 16
53 #define LONG_INTS_PER_VECTOR 8
54 #define INT_TYPE __m512i
55 #define CAST double*
56 #define SET_ALL_BITS_ONE _mm512_set1_epi32(0xFFFFFFFF)
57 #define SET_ALL_BITS_ZERO _mm512_setzero_epi32()
58 #define VECTOR_LOAD _mm512_load_epi32
59 #define VECTOR_STORE  _mm512_store_epi32
60 #define VECTOR_BIT_AND _mm512_and_epi32
61 #define VECTOR_BIT_OR  _mm512_or_epi32
62 #define VECTOR_AND_NOT _mm512_andnot_epi32
63 
64 #elif defined(__AVX)
65 
66 #include <xmmintrin.h>
67 #include <immintrin.h>
68 #include <pmmintrin.h>
69 
70 #define ULINT_SIZE 64
71 #define INTS_PER_VECTOR 8
72 #define LONG_INTS_PER_VECTOR 4
73 #define INT_TYPE __m256d
74 #define CAST double*
75 #define SET_ALL_BITS_ONE (__m256d)_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
76 #define SET_ALL_BITS_ZERO (__m256d)_mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)
77 #define VECTOR_LOAD _mm256_load_pd
78 #define VECTOR_BIT_AND _mm256_and_pd
79 #define VECTOR_BIT_OR  _mm256_or_pd
80 #define VECTOR_STORE  _mm256_store_pd
81 #define VECTOR_AND_NOT _mm256_andnot_pd
82 
83 #elif (defined(__SSE3))
84 
85 #include <xmmintrin.h>
86 #include <pmmintrin.h>
87 
88 #define INTS_PER_VECTOR 4
89 #ifdef __i386__
90 #define ULINT_SIZE 32
91 #define LONG_INTS_PER_VECTOR 4
92 #else
93 #define ULINT_SIZE 64
94 #define LONG_INTS_PER_VECTOR 2
95 #endif
96 #define INT_TYPE __m128i
97 #define CAST __m128i*
98 #define SET_ALL_BITS_ONE _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
99 #define SET_ALL_BITS_ZERO _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000)
100 #define VECTOR_LOAD _mm_load_si128
101 #define VECTOR_BIT_AND _mm_and_si128
102 #define VECTOR_BIT_OR  _mm_or_si128
103 #define VECTOR_STORE  _mm_store_si128
104 #define VECTOR_AND_NOT _mm_andnot_si128
105 
106 #endif
107 
108 #include "pll.h"
109 #include "pllInternal.h"
110 
111 extern const unsigned int mask32[32];
112 
vectorPopcount(INT_TYPE v)113 static __inline unsigned int vectorPopcount(INT_TYPE v)
114 {
115   unsigned long
116     counts[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
117 
118   int
119     i,
120     sum = 0;
121 
122   VECTOR_STORE((CAST)counts, v);
123 
124   for(i = 0; i < LONG_INTS_PER_VECTOR; i++)
125      sum += __builtin_popcountl(counts[i]);
126 
127   return ((unsigned int)sum);
128 }
129 
storePerSiteScores(partitionList * pr,int model,INT_TYPE v,unsigned int offset)130 static __inline void storePerSiteScores (partitionList * pr, int model, INT_TYPE v, unsigned int offset)
131 {
132   unsigned long
133     counts[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
134   parsimonyNumber * buf;
135 
136   int
137     i,
138     j;
139 
140   VECTOR_STORE((CAST)counts, v);
141 
142   for (i = 0; i < LONG_INTS_PER_VECTOR; ++i)
143    {
144      buf = &(pr->partitionData[model]->perSiteParsScores[offset * PLL_PCF + i * ULINT_SIZE]);
145      for (j = 0; j < ULINT_SIZE; ++ j)
146         buf[j] += ((counts[i] >> j) & 1);
147    }
148 
149 }
150 
getxnodeLocal(nodeptr p)151 static void getxnodeLocal (nodeptr p)
152 {
153   nodeptr  s;
154 
155   if((s = p->next)->xPars || (s = s->next)->xPars)
156     {
157       p->xPars = s->xPars;
158       s->xPars = 0;
159     }
160 
161   assert(p->next->xPars || p->next->next->xPars || p->xPars);
162 
163 }
164 
computeTraversalInfoParsimony(nodeptr p,int * ti,int * counter,int maxTips,pllBoolean full)165 static void computeTraversalInfoParsimony(nodeptr p, int *ti, int *counter, int maxTips, pllBoolean full)
166 {
167   nodeptr
168     q = p->next->back,
169     r = p->next->next->back;
170 
171   if(! p->xPars)
172     getxnodeLocal(p);
173 
174   if(full)
175     {
176        if(q->number > maxTips)
177          computeTraversalInfoParsimony(q, ti, counter, maxTips, full);
178 
179       if(r->number > maxTips)
180         computeTraversalInfoParsimony(r, ti, counter, maxTips, full);
181     }
182   else
183     {
184       if(q->number > maxTips && !q->xPars)
185         computeTraversalInfoParsimony(q, ti, counter, maxTips, full);
186 
187       if(r->number > maxTips && !r->xPars)
188         computeTraversalInfoParsimony(r, ti, counter, maxTips, full);
189     }
190 
191 
192   ti[*counter]     = p->number;
193   ti[*counter + 1] = q->number;
194   ti[*counter + 2] = r->number;
195   *counter = *counter + 4;
196 }
197 
198 /* check whether site contains at least 2 different letters, i.e.
199    whether it will generate a score */
isInformative(pllInstance * tr,int dataType,int site)200 static pllBoolean isInformative(pllInstance *tr, int dataType, int site)
201 {
202   int
203     informativeCounter = 0,
204     check[256],
205     j,
206     undetermined = getUndetermined(dataType);
207 
208   const unsigned int
209     *bitVector = getBitVector(dataType);
210 
211   unsigned char
212     nucleotide;
213 
214 
215   for(j = 0; j < 256; j++)
216     check[j] = 0;
217 
218   for(j = 1; j <= tr->mxtips; j++)
219     {
220       nucleotide = tr->yVector[j][site];
221       check[nucleotide] = 1;
222       assert(bitVector[nucleotide] > 0);
223     }
224 
225   for(j = 0; j < undetermined; j++)
226     {
227       if(check[j] > 0)
228         informativeCounter++;
229     }
230 
231   if(informativeCounter > 1)
232     return PLL_TRUE;
233 
234   return PLL_FALSE;
235 }
236 
compressDNA(pllInstance * tr,partitionList * pr,int * informative,int perSiteScores)237 static void compressDNA(pllInstance *tr, partitionList *pr, int *informative, int perSiteScores)
238 {
239   size_t
240     totalNodes,
241     i,
242     model;
243 
244   totalNodes = 2 * (size_t)tr->mxtips;
245 
246 
247 
248   for(model = 0; model < (size_t) pr->numberOfPartitions; model++)
249     {
250       size_t
251         k,
252         states = (size_t)pr->partitionData[model]->states,
253         compressedEntries,
254         compressedEntriesPadded,
255         entries = 0,
256         lower = pr->partitionData[model]->lower,
257         upper = pr->partitionData[model]->upper;
258 
259       parsimonyNumber
260         **compressedTips = (parsimonyNumber **)rax_malloc(states * sizeof(parsimonyNumber*)),
261         *compressedValues = (parsimonyNumber *)rax_malloc(states * sizeof(parsimonyNumber));
262 
263       for(i = lower; i < upper; i++)
264         if(informative[i])
265           entries += (size_t)tr->aliaswgt[i];
266 
267       compressedEntries = entries / PLL_PCF;
268 
269       if(entries % PLL_PCF != 0)
270         compressedEntries++;
271 
272 #if (defined(__SSE3) || defined(__AVX))
273       if(compressedEntries % INTS_PER_VECTOR != 0)
274         compressedEntriesPadded = compressedEntries + (INTS_PER_VECTOR - (compressedEntries % INTS_PER_VECTOR));
275       else
276         compressedEntriesPadded = compressedEntries;
277 #else
278       compressedEntriesPadded = compressedEntries;
279 #endif
280 
281 
282       rax_posix_memalign ((void **) &(pr->partitionData[model]->parsVect), PLL_BYTE_ALIGNMENT, (size_t)compressedEntriesPadded * states * totalNodes * sizeof(parsimonyNumber));
283       if (perSiteScores)
284        {
285          rax_posix_memalign ((void **) &(pr->partitionData[model]->perSiteParsScores), PLL_BYTE_ALIGNMENT, (size_t)pr->partitionData[model]->width* sizeof (parsimonyNumber));
286          for (i = 0; i < (size_t)pr->partitionData[model]->width; ++i) pr->partitionData[model]->perSiteParsScores[i] = 0;
287        }
288 
289 
290       for(i = 0; i < compressedEntriesPadded * states * totalNodes; i++)
291         pr->partitionData[model]->parsVect[i] = 0;
292 
293       for(i = 0; i < (size_t)tr->mxtips; i++)
294         {
295           size_t
296             w = 0,
297             compressedIndex = 0,
298             compressedCounter = 0,
299             index = 0;
300 
301           for(k = 0; k < states; k++)
302             {
303               compressedTips[k] = &(pr->partitionData[model]->parsVect[(compressedEntriesPadded * states * (i + 1)) + (compressedEntriesPadded * k)]);
304               compressedValues[k] = 0;
305             }
306 
307           for(index = lower; index < (size_t)upper; index++)
308             {
309               if(informative[index])
310                 {
311                   const unsigned int
312                     *bitValue = getBitVector(pr->partitionData[model]->dataType);
313 
314                   parsimonyNumber
315                     value = bitValue[tr->yVector[i + 1][index]];
316 
317                   for(w = 0; w < (size_t)tr->aliaswgt[index]; w++)
318                     {
319                       for(k = 0; k < states; k++)
320                         {
321                           if(value & mask32[k])
322                             compressedValues[k] |= mask32[compressedCounter];
323                         }
324 
325                       compressedCounter++;
326 
327                       if(compressedCounter == PLL_PCF)
328                         {
329                           for(k = 0; k < states; k++)
330                             {
331                               compressedTips[k][compressedIndex] = compressedValues[k];
332                               compressedValues[k] = 0;
333                             }
334 
335                           compressedCounter = 0;
336                           compressedIndex++;
337                         }
338                     }
339                 }
340             }
341 
342           for(;compressedIndex < compressedEntriesPadded; compressedIndex++)
343             {
344               for(;compressedCounter < PLL_PCF; compressedCounter++)
345                 for(k = 0; k < states; k++)
346                   compressedValues[k] |= mask32[compressedCounter];
347 
348               for(k = 0; k < states; k++)
349                 {
350                   compressedTips[k][compressedIndex] = compressedValues[k];
351                   compressedValues[k] = 0;
352                 }
353 
354               compressedCounter = 0;
355             }
356         }
357 
358       pr->partitionData[model]->parsimonyLength = compressedEntriesPadded;
359 
360       rax_free(compressedTips);
361       rax_free(compressedValues);
362     }
363 
364   rax_posix_memalign ((void **) &(tr->parsimonyScore), PLL_BYTE_ALIGNMENT, sizeof(unsigned int) * totalNodes);
365 
366   for(i = 0; i < totalNodes; i++)
367     tr->parsimonyScore[i] = 0;
368 }
369 
determineUninformativeSites(pllInstance * tr,partitionList * pr,int * informative)370 static void determineUninformativeSites(pllInstance *tr, partitionList *pr, int *informative)
371 {
372   int
373     model,
374     number = 0,
375     i;
376 
377   /*
378      Not all characters are useful in constructing a parsimony tree.
379      Invariant characters, those that have the same state in all taxa,
380      are obviously useless and are ignored by the method. Characters in
381      which a state occurs in only one taxon are also ignored.
382      All these characters are called parsimony uninformative.
383 
384      Alternative definition: informative columns contain at least two types
385      of nucleotides, and each nucleotide must appear at least twice in each
386      column. Kind of a pain if we intend to check for this when using, e.g.,
387      amibiguous DNA encoding.
388   */
389 
390 
391   for(model = 0; model < pr->numberOfPartitions; model++)
392     {
393       for(i = pr->partitionData[model]->lower; i < pr->partitionData[model]->upper; i++)
394         {
395            if(isInformative(tr, pr->partitionData[model]->dataType, i))
396              informative[i] = 1;
397            else
398              {
399                informative[i] = 0;
400                number++;
401              }
402         }
403     }
404 
405   /* printf("Uninformative Patterns: %d\n", number); */
406 }
407 
pllInitParsimonyStructures(pllInstance * tr,partitionList * pr,pllBoolean perSiteScores)408 void pllInitParsimonyStructures(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores)
409 {
410   int
411     i,
412     *informative = (int *)rax_malloc(sizeof(int) * (size_t)tr->originalCrunchedLength);
413 
414   for (i = 0; i < pr->numberOfPartitions; ++ i)
415      rax_free (pr->partitionData[i]->parsVect);
416 
417   rax_free (tr->parsimonyScore);
418 
419   determineUninformativeSites(tr, pr, informative);
420 
421   compressDNA(tr, pr, informative, perSiteScores);
422 
423   for(i = tr->mxtips + 1; i <= tr->mxtips + tr->mxtips - 1; i++)
424     {
425       nodeptr
426         p = tr->nodep[i];
427 
428       p->xPars             = 1;
429       p->next->xPars       = 0;
430       p->next->next->xPars = 0;
431     }
432 
433   tr->ti = (int*)rax_malloc(sizeof(int) * 4 * (size_t)tr->mxtips);
434 
435   rax_free(informative);
436 }
437 
newviewParsimonyIterativeFast(pllInstance * tr,partitionList * pr,pllBoolean perSiteScores)438 static void newviewParsimonyIterativeFast(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores)
439 {
440   INT_TYPE
441     allOne = SET_ALL_BITS_ONE;
442 
443   int
444     model,
445     *ti = tr->ti,
446     count = ti[0],
447     index;
448 
449   for(index = 4; index < count; index += 4)
450     {
451       unsigned int
452         totalScore = 0;
453 
454       size_t
455         pNumber = (size_t)ti[index],
456         qNumber = (size_t)ti[index + 1],
457         rNumber = (size_t)ti[index + 2];
458 
459       for(model = 0; model < pr->numberOfPartitions; model++)
460         {
461           size_t
462             k,
463             states = pr->partitionData[model]->states,
464             width = pr->partitionData[model]->parsimonyLength;
465 
466           unsigned int
467             i;
468 
469           switch(states)
470             {
471             case 2:
472               {
473                 parsimonyNumber
474                   *left[2],
475                   *right[2],
476                   *this[2];
477 
478                 for(k = 0; k < 2; k++)
479                   {
480                     left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
481                     right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * rNumber) + width * k]);
482                     this[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
483                   }
484 
485                 for(i = 0; i < width; i += INTS_PER_VECTOR)
486                   {
487                     INT_TYPE
488                       s_r, s_l, v_N,
489                       l_A, l_C,
490                       v_A, v_C;
491 
492                     s_l = VECTOR_LOAD((CAST)(&left[0][i]));
493                     s_r = VECTOR_LOAD((CAST)(&right[0][i]));
494                     l_A = VECTOR_BIT_AND(s_l, s_r);
495                     v_A = VECTOR_BIT_OR(s_l, s_r);
496 
497                     s_l = VECTOR_LOAD((CAST)(&left[1][i]));
498                     s_r = VECTOR_LOAD((CAST)(&right[1][i]));
499                     l_C = VECTOR_BIT_AND(s_l, s_r);
500                     v_C = VECTOR_BIT_OR(s_l, s_r);
501 
502                     v_N = VECTOR_BIT_OR(l_A, l_C);
503 
504                     VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
505                     VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));
506 
507                     v_N = VECTOR_AND_NOT(v_N, allOne);
508 
509                     totalScore += vectorPopcount(v_N);
510                     if (perSiteScores)
511                        storePerSiteScores (pr, model, v_N, i);
512                   }
513               }
514               break;
515             case 4:
516               {
517                 parsimonyNumber
518                   *left[4],
519                   *right[4],
520                   *this[4];
521 
522                 for(k = 0; k < 4; k++)
523                   {
524                     left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
525                     right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * rNumber) + width * k]);
526                     this[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
527                   }
528                 for(i = 0; i < width; i += INTS_PER_VECTOR)
529                   {
530                     INT_TYPE
531                       s_r, s_l, v_N,
532                       l_A, l_C, l_G, l_T,
533                       v_A, v_C, v_G, v_T;
534 
535                     s_l = VECTOR_LOAD((CAST)(&left[0][i]));
536                     s_r = VECTOR_LOAD((CAST)(&right[0][i]));
537                     l_A = VECTOR_BIT_AND(s_l, s_r);
538                     v_A = VECTOR_BIT_OR(s_l, s_r);
539 
540                     s_l = VECTOR_LOAD((CAST)(&left[1][i]));
541                     s_r = VECTOR_LOAD((CAST)(&right[1][i]));
542                     l_C = VECTOR_BIT_AND(s_l, s_r);
543                     v_C = VECTOR_BIT_OR(s_l, s_r);
544 
545                     s_l = VECTOR_LOAD((CAST)(&left[2][i]));
546                     s_r = VECTOR_LOAD((CAST)(&right[2][i]));
547                     l_G = VECTOR_BIT_AND(s_l, s_r);
548                     v_G = VECTOR_BIT_OR(s_l, s_r);
549 
550                     s_l = VECTOR_LOAD((CAST)(&left[3][i]));
551                     s_r = VECTOR_LOAD((CAST)(&right[3][i]));
552                     l_T = VECTOR_BIT_AND(s_l, s_r);
553                     v_T = VECTOR_BIT_OR(s_l, s_r);
554 
555                     v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));
556 
557                     VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
558                     VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));
559                     VECTOR_STORE((CAST)(&this[2][i]), VECTOR_BIT_OR(l_G, VECTOR_AND_NOT(v_N, v_G)));
560                     VECTOR_STORE((CAST)(&this[3][i]), VECTOR_BIT_OR(l_T, VECTOR_AND_NOT(v_N, v_T)));
561 
562                     v_N = VECTOR_AND_NOT(v_N, allOne);
563 
564                     totalScore += vectorPopcount(v_N);
565 
566                     if (perSiteScores)
567                        storePerSiteScores (pr, model, v_N, i);
568                   }
569               }
570               break;
571             case 20:
572               {
573                 parsimonyNumber
574                   *left[20],
575                   *right[20],
576                   *this[20];
577 
578                 for(k = 0; k < 20; k++)
579                   {
580                     left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
581                     right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * rNumber) + width * k]);
582                     this[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
583                   }
584 
585                 for(i = 0; i < width; i += INTS_PER_VECTOR)
586                   {
587                     size_t j;
588 
589                     INT_TYPE
590                       s_r, s_l,
591                       v_N = SET_ALL_BITS_ZERO,
592                       l_A[20],
593                       v_A[20];
594 
595                     for(j = 0; j < 20; j++)
596                       {
597                         s_l = VECTOR_LOAD((CAST)(&left[j][i]));
598                         s_r = VECTOR_LOAD((CAST)(&right[j][i]));
599                         l_A[j] = VECTOR_BIT_AND(s_l, s_r);
600                         v_A[j] = VECTOR_BIT_OR(s_l, s_r);
601 
602                         v_N = VECTOR_BIT_OR(v_N, l_A[j]);
603                       }
604 
605                     for(j = 0; j < 20; j++)
606                       VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));
607 
608                     v_N = VECTOR_AND_NOT(v_N, allOne);
609 
610                     totalScore += vectorPopcount(v_N);
611 
612                     if (perSiteScores)
613                        storePerSiteScores (pr, model, v_N, i);
614                   }
615               }
616               break;
617             default:
618               {
619                 parsimonyNumber
620                   *left[32],
621                   *right[32],
622                   *this[32];
623 
624                 assert(states <= 32);
625 
626                 for(k = 0; k < states; k++)
627                   {
628                     left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
629                     right[k] = &(pr->partitionData[model]->parsVect[(width * states * rNumber) + width * k]);
630                     this[k]  = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
631                   }
632 
633                 for(i = 0; i < width; i += INTS_PER_VECTOR)
634                   {
635                     size_t j;
636 
637                     INT_TYPE
638                       s_r, s_l,
639                       v_N = SET_ALL_BITS_ZERO,
640                       l_A[32],
641                       v_A[32];
642 
643                     for(j = 0; j < states; j++)
644                       {
645                         s_l = VECTOR_LOAD((CAST)(&left[j][i]));
646                         s_r = VECTOR_LOAD((CAST)(&right[j][i]));
647                         l_A[j] = VECTOR_BIT_AND(s_l, s_r);
648                         v_A[j] = VECTOR_BIT_OR(s_l, s_r);
649 
650                         v_N = VECTOR_BIT_OR(v_N, l_A[j]);
651                       }
652 
653                     for(j = 0; j < states; j++)
654                       VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));
655 
656                     v_N = VECTOR_AND_NOT(v_N, allOne);
657 
658                     totalScore += vectorPopcount(v_N);
659 
660                     if (perSiteScores)
661                        storePerSiteScores (pr, model, v_N, i);
662                   }
663               }
664             }
665         }
666       tr->parsimonyScore[pNumber] = totalScore + tr->parsimonyScore[rNumber] + tr->parsimonyScore[qNumber];
667     }
668 }
669 
evaluateParsimonyIterativeFast(pllInstance * tr,partitionList * pr,pllBoolean perSiteScores)670 static unsigned int evaluateParsimonyIterativeFast(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores)
671 {
672   INT_TYPE
673     allOne = SET_ALL_BITS_ONE;
674 
675   size_t
676     pNumber = (size_t)tr->ti[1],
677     qNumber = (size_t)tr->ti[2];
678 
679   int
680     model;
681 
682   unsigned int
683     bestScore = tr->bestParsimony,
684     sum;
685 
686   if(tr->ti[0] > 4)
687     newviewParsimonyIterativeFast(tr, pr, perSiteScores);
688 
689   sum = tr->parsimonyScore[pNumber] + tr->parsimonyScore[qNumber];
690 
691   for(model = 0; model < pr->numberOfPartitions; model++)
692     {
693       size_t
694         k,
695         states = pr->partitionData[model]->states,
696         width  = pr->partitionData[model]->parsimonyLength,
697         i;
698 
699        switch(states)
700          {
701          case 2:
702            {
703              parsimonyNumber
704                *left[2],
705                *right[2];
706 
707              for(k = 0; k < 2; k++)
708                {
709                  left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
710                  right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
711                }
712 
713              for(i = 0; i < width; i += INTS_PER_VECTOR)
714                {
715                  INT_TYPE
716                    l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
717                    l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),
718                    v_N = VECTOR_BIT_OR(l_A, l_C);
719 
720                  v_N = VECTOR_AND_NOT(v_N, allOne);
721 
722                  sum += vectorPopcount(v_N);
723                   if (perSiteScores)
724                     storePerSiteScores (pr, model, v_N, i);
725                }
726            }
727            break;
728          case 4:
729            {
730              parsimonyNumber
731                *left[4],
732                *right[4];
733 
734              for(k = 0; k < 4; k++)
735                {
736                  left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
737                  right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
738                }
739 
740              for(i = 0; i < width; i += INTS_PER_VECTOR)
741                {
742                  INT_TYPE
743                    l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
744                    l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),
745                    l_G = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[2][i])), VECTOR_LOAD((CAST)(&right[2][i]))),
746                    l_T = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[3][i])), VECTOR_LOAD((CAST)(&right[3][i]))),
747                    v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));
748 
749                  v_N = VECTOR_AND_NOT(v_N, allOne);
750 
751                  sum += vectorPopcount(v_N);
752                   if (perSiteScores)
753                     storePerSiteScores (pr, model, v_N, i);
754                }
755            }
756            break;
757          case 20:
758            {
759              parsimonyNumber
760                *left[20],
761                *right[20];
762 
763               for(k = 0; k < 20; k++)
764                 {
765                   left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
766                   right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
767                 }
768 
769               for(i = 0; i < width; i += INTS_PER_VECTOR)
770                 {
771                   int
772                     j;
773 
774                   INT_TYPE
775                     l_A,
776                     v_N = SET_ALL_BITS_ZERO;
777 
778                   for(j = 0; j < 20; j++)
779                     {
780                       l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
781                       v_N = VECTOR_BIT_OR(l_A, v_N);
782                     }
783 
784                   v_N = VECTOR_AND_NOT(v_N, allOne);
785 
786                   sum += vectorPopcount(v_N);
787                   if (perSiteScores)
788                     storePerSiteScores (pr, model, v_N, i);
789                 }
790            }
791            break;
792          default:
793            {
794              parsimonyNumber
795                *left[32],
796                *right[32];
797 
798              assert(states <= 32);
799 
800              for(k = 0; k < states; k++)
801                {
802                  left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
803                  right[k] = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
804                }
805 
806              for(i = 0; i < width; i += INTS_PER_VECTOR)
807                {
808                  size_t
809                    j;
810 
811                  INT_TYPE
812                    l_A,
813                    v_N = SET_ALL_BITS_ZERO;
814 
815                  for(j = 0; j < states; j++)
816                    {
817                      l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
818                      v_N = VECTOR_BIT_OR(l_A, v_N);
819                    }
820 
821                  v_N = VECTOR_AND_NOT(v_N, allOne);
822 
823                  sum += vectorPopcount(v_N);
824                  if (perSiteScores)
825                    storePerSiteScores (pr, model, v_N, i);
826                }
827            }
828          }
829     }
830 
831   return sum;
832 }
833 
pllEvaluateParsimony(pllInstance * tr,partitionList * pr,nodeptr p,pllBoolean full,pllBoolean perSiteScores)834 unsigned int pllEvaluateParsimony(pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean full, pllBoolean perSiteScores)
835 {
836   volatile unsigned int result;
837   nodeptr q = p->back;
838   int
839     *ti = tr->ti,
840     counter = 4;
841 
842   ti[1] = p->number;
843   ti[2] = q->number;
844 
845   if(full)
846     {
847       if(p->number > tr->mxtips)
848         computeTraversalInfoParsimony(p, ti, &counter, tr->mxtips, full);
849       if(q->number > tr->mxtips)
850         computeTraversalInfoParsimony(q, ti, &counter, tr->mxtips, full);
851     }
852   else
853     {
854       if(p->number > tr->mxtips && !p->xPars)
855         computeTraversalInfoParsimony(p, ti, &counter, tr->mxtips, full);
856       if(q->number > tr->mxtips && !q->xPars)
857         computeTraversalInfoParsimony(q, ti, &counter, tr->mxtips, full);
858     }
859 
860   ti[0] = counter;
861 
862   result = evaluateParsimonyIterativeFast(tr, pr, perSiteScores);
863 
864   return result;
865 }
866