1 static char rcsid[] = "$Id: genome128_consec.c 222860 2020-06-15 07:08:07Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #include "genome128_consec.h"
13 
14 #include <stdio.h>
15 #include <stddef.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <ctype.h>		/* For tolower() */
19 
20 #include "assert.h"
21 #include "except.h"
22 #include "compress.h"
23 #include "popcount.h"
24 
25 #ifdef WORDS_BIGENDIAN
26 #include "bigendian.h"
27 #else
28 #include "littleendian.h"
29 #endif
30 
31 
32 /* Consecutive_matches_rightward and leftward */
33 /* Slower with shift and wrap, perhaps because we need to extract integers from the SIMD object */
34 /* #define USE_SHIFT_FIRST_MISMATCH 1 */
35 /* #define USE_WRAP_FIRST_MISMATCH 1 */
36 
37 /* Faster to use a straight shift, and _mm_bsrli_si128 is not defined in gcc 4.7 */
38 /* #define USE_SHIFT_HILO 1 */
39 
40 
41 #ifdef HAVE_SSE2
42 #define QUERY_NEXTCOL 1		/* high0, high1, high2, high3 */
43 #define QUERY_NEXTROW 8
44 #else
45 #define QUERY_NEXTCOL 3		/* high, low, flags */
46 /* #define QUERY_NEXTROW 0 */
47 #endif
48 
49 #define GENOME_NEXTCOL 1
50 #define GENOME_NEXTROW 8
51 
52 
53 #ifdef WORDS_BIGENDIAN
54 /* Do not use SIMD */
55 #elif defined(HAVE_SSE2)
56 #include <emmintrin.h>
57 #endif
58 #ifdef HAVE_SSSE3
59 #include <tmmintrin.h>
60 #endif
61 #ifdef HAVE_SSE4_1
62 #include <smmintrin.h>
63 #endif
64 #ifdef HAVE_AVX2
65 #include <immintrin.h>
66 #endif
67 #ifdef HAVE_AVX512
68 #include <immintrin.h>
69 #endif
70 
71 #if !defined(HAVE_SSE4_2)
72 /* Skip popcnt, which comes after SSE4.2 */
73 #elif defined(HAVE_POPCNT)
74 #include <immintrin.h>
75 #elif defined(HAVE_MM_POPCNT)
76 #include <nmmintrin.h>
77 #endif
78 
79 #if !defined(HAVE_SSE4_2)
80 /* Skip lzcnt and tzcnt, which come after SSE4.2 */
81 #elif defined(HAVE_LZCNT) || defined(HAVE_TZCNT)
82 #include <immintrin.h>
83 #endif
84 
85 #ifdef DEBUG
86 #define debug(x) x
87 #else
88 #define debug(x)
89 #endif
90 
91 /* Fragments */
92 #ifdef DEBUG1
93 #define debug1(x) x
94 #else
95 #define debug1(x)
96 #endif
97 
98 /* Genome_consecutive_matches_pair */
99 #ifdef DEBUG2
100 #define debug2(x) x
101 #else
102 #define debug2(x)
103 #endif
104 
105 /* clear_highbit and clear_lowbit */
106 #ifdef DEBUG3
107 #define debug3(x) x
108 #else
109 #define debug3(x)
110 #endif
111 
112 /* count_leading_zeroes and count_trailing_zeroes */
113 #ifdef DEBUG4
114 #define debug4(x) x
115 #else
116 #define debug4(x)
117 #endif
118 
119 /* mark mismatches */
120 #ifdef DEBUG5
121 #define debug5(x) x
122 #else
123 #define debug5(x)
124 #endif
125 
126 /* 32-bit shortcuts */
127 #ifdef DEBUG14
128 #define debug14(x) x
129 #else
130 #define debug14(x)
131 #endif
132 
133 
134 /* Genome_consec code starts here */
135 
136 #if defined(DEBUG) || defined(DEBUG5)
137 #ifdef HAVE_SSE4_1
138 static void
print_vector_hex(__m128i x)139 print_vector_hex (__m128i x) {
140   printf("%08X %08X %08X %08X\n",
141 	 _mm_extract_epi32(x,0),_mm_extract_epi32(x,1),_mm_extract_epi32(x,2),_mm_extract_epi32(x,3));
142   return;
143 }
144 
145 static void
print_vector_dec(__m128i x)146 print_vector_dec (__m128i x) {
147   printf("%u %u %u %u\n",
148 	 _mm_extract_epi32(x,0),_mm_extract_epi32(x,1),_mm_extract_epi32(x,2),_mm_extract_epi32(x,3));
149   return;
150 }
151 
152 #elif defined(HAVE_SSE2)
153 static void
print_vector_hex(__m128i x)154 print_vector_hex (__m128i x) {
155   printf("%08X %08X %08X %08X\n",
156 	 (_mm_extract_epi16(x,1) << 16) | (_mm_extract_epi16(x,0) & 0x0000FFFF),
157 	 (_mm_extract_epi16(x,3) << 16) | (_mm_extract_epi16(x,2) & 0x0000FFFF),
158 	 (_mm_extract_epi16(x,5) << 16) | (_mm_extract_epi16(x,4) & 0x0000FFFF),
159 	 (_mm_extract_epi16(x,7) << 16) | (_mm_extract_epi16(x,6) & 0x0000FFFF));
160   return;
161 }
162 
163 static void
print_vector_dec(__m128i x)164 print_vector_dec (__m128i x) {
165   printf("%u %u %u %u\n",
166 	 (_mm_extract_epi16(x,1) << 16) | (_mm_extract_epi16(x,0) & 0x0000FFFF),
167 	 (_mm_extract_epi16(x,3) << 16) | (_mm_extract_epi16(x,2) & 0x0000FFFF),
168 	 (_mm_extract_epi16(x,5) << 16) | (_mm_extract_epi16(x,4) & 0x0000FFFF),
169 	 (_mm_extract_epi16(x,7) << 16) | (_mm_extract_epi16(x,6) & 0x0000FFFF));
170   return;
171 }
172 #endif
173 
174 #ifdef HAVE_AVX2
175 static void
print_vector_256_hex(__m256i x)176 print_vector_256_hex (__m256i x) {
177   printf("%08X %08X %08X %08X %08X %08X %08X %08X\n",
178 	 _mm256_extract_epi32(x,0),_mm256_extract_epi32(x,1),_mm256_extract_epi32(x,2),_mm256_extract_epi32(x,3),
179 	 _mm256_extract_epi32(x,4),_mm256_extract_epi32(x,5),_mm256_extract_epi32(x,6),_mm256_extract_epi32(x,7));
180   return;
181 }
182 
183 static void
print_vector_256_dec(__m256i x)184 print_vector_256_dec (__m256i x) {
185   printf("%u %u %u %u %u %u %u %u\n",
186 	 _mm256_extract_epi32(x,0),_mm256_extract_epi32(x,1),_mm256_extract_epi32(x,2),_mm256_extract_epi32(x,3),
187 	 _mm256_extract_epi32(x,4),_mm256_extract_epi32(x,5),_mm256_extract_epi32(x,6),_mm256_extract_epi32(x,7));
188   return;
189 }
190 #endif
191 
192 #ifdef HAVE_AVX512
193 static void
print_vector_512_hex(__m512i x)194 print_vector_512_hex (__m512i x) {
195   unsigned int array[16];
196 
197   _mm512_store_si512((__m512i *) array,x);
198   printf("%08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X\n",
199 	 array[0],array[1],array[2],array[3],array[4],array[5],array[6],array[7],
200 	 array[8],array[9],array[10],array[11],array[12],array[13],array[14],array[15]);
201   return;
202 }
203 
204 static void
print_vector_512_dec(__m512i x)205 print_vector_512_dec (__m512i x) {
206   unsigned int array[16];
207 
208   _mm512_store_si512((__m512i *) array,x);
209   printf("%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u\n",
210 	 array[0],array[1],array[2],array[3],array[4],array[5],array[6],array[7],
211 	 array[8],array[9],array[10],array[11],array[12],array[13],array[14],array[15]);
212   return;
213 }
214 #endif
215 
216 #endif
217 
218 
219 #if 0
220 /* Note: outer unshuffle operation, as implemented below has twice the
221    instruction count as lookup of reduce_nt array */
222 static inline UINT4
223 reduce_nt_unshuffle (UINT4 xhigh, UINT4 xlow) {
224   UINT8 x, t;
225 
226   x = (UINT8) xhigh;
227   x <<= 32;
228   x |= xlow;
229 
230   t = (x ^ (x >> 1))  & 0x2222222222222222;  x = x ^ t ^ (t << 1);
231   t = (x ^ (x >> 2))  & 0x0C0C0C0C0C0C0C0C;  x = x ^ t ^ (t << 2);
232   t = (x ^ (x >> 4))  & 0x00F000F000F000F0;  x = x ^ t ^ (t << 4);
233   t = (x ^ (x >> 8))  & 0x0000FF000000FF00;  x = x ^ t ^ (t << 8);
234   t = (x ^ (x >> 16)) & 0x00000000FFFF0000;  x = x ^ t ^ (t << 16);
235 
236   return (UINT4) ((x >> 32) | x);
237 }
238 #endif
239 
240 
241 #if defined(DEBUG) || defined(DEBUG2) || defined(DEBUG5)
242 static void
write_chars(Genomecomp_T high,Genomecomp_T low,Genomecomp_T flags)243 write_chars (Genomecomp_T high, Genomecomp_T low, Genomecomp_T flags) {
244   char Buffer[33];
245   int i;
246 
247   Buffer[32] = '\0';
248   /* printf("%08X %08X %08X => ",high,low,flags); */
249 
250   for (i = 0; i < 32; i++) {
251     switch (((high & 0x01) << 1) | (low & 0x01)) {
252     case 0U: Buffer[i] = 'A'; break;
253     case 1U: Buffer[i] = 'C'; break;
254     case 2U: Buffer[i] = 'G'; break;
255     case 3U: Buffer[i] = 'T'; break;
256     default: abort();
257     }
258     high >>= 1;
259     low >>= 1;
260   }
261 
262   if (flags != 0U) {
263     for (i = 0; i < 32; i++) {
264       if (flags & 0x01) {
265 	Buffer[i] = 'N';
266       }
267       flags >>= 1;
268     }
269   }
270 
271   printf("%s",Buffer);
272   return;
273 }
274 #endif
275 
276 
277 
278 #if defined(DEBUG) || defined(DEBUG2) || defined(DEBUG5)
279 static void
print_blocks(Genomecomp_T * blocks,Univcoord_T startpos,Univcoord_T endpos)280 print_blocks (Genomecomp_T *blocks, Univcoord_T startpos, Univcoord_T endpos) {
281   /* Chrpos_T length = endpos - startpos; */
282   Genomecomp_T *ptr;
283   Univcoord_T startblocki, endblocki;
284   int startcolumni, endcolumni;
285   int startdiscard32, enddiscard32;
286   Genomecomp_T high, low, flags;
287   int i;
288 
289   /* sequence = (char *) CALLOC(length+1,sizeof(char)); */
290 
291 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
292   startcolumni = (startpos % 128) / 32;
293   startblocki = startpos/128U*12 + startcolumni;
294 
295   endcolumni = (endpos % 128) / 32;
296   endblocki = endpos/128U*12 + endcolumni;
297 #else
298   startcolumni = (startpos % 128) / 32;
299   endcolumni = (endpos % 128) / 32;
300 
301   startblocki = startpos/128U*12;
302   endblocki = endpos/128U*12;
303 #endif
304 
305   startdiscard32 = startpos % 32;
306   enddiscard32 = endpos % 32;
307 
308 
309   ptr = &(blocks[startblocki]);
310   while (ptr <= &(blocks[endblocki])) {
311 #if defined(WORDS_BIGENDIAN)
312     high = Bigendian_convert_uint(ptr[0]);
313     low = Bigendian_convert_uint(ptr[4]);
314     flags = Bigendian_convert_uint(ptr[8]);
315     printf("high: %08X  low: %08X  flags: %08X\t",high,low,flags);
316     write_chars(high,low,flags);
317     printf("\n");
318 
319     ptr += GENOME_NEXTCOL; if (++startcolumni == 4) {ptr += GENOME_NEXTROW; startcolumni = 0;}
320 #elif !defined(HAVE_SSE2)
321     high = ptr[0]; low = ptr[4]; flags = ptr[8];
322     printf("high: %08X  low: %08X  flags: %08X\t",high,low,flags);
323     write_chars(high,low,flags);
324     printf("\n");
325 
326     ptr += GENOME_NEXTCOL; if (++startcolumni == 4) {ptr += GENOME_NEXTROW; startcolumni = 0;}
327 
328 #else
329     if (startcolumni == 0) {
330       /*      high: 9F61B62A  low: 6D68A157  flags: 00000000 */
331       printf("                                              \t");
332       /* printf("%u\t",startblocki); */
333       for (i = 0; i < startdiscard32; i++) {
334 	printf("*");
335       }
336       for ( ; i < 32; i++) {
337 	printf(" ");
338       }
339       printf("\n");
340       startcolumni = -1;
341     }
342     high = ptr[0]; low = ptr[4]; flags = ptr[8];
343     printf("high: %08X  low: %08X  flags: %08X\t",high,low,flags);
344     write_chars(high,low,flags);
345     printf("\n");
346     if (ptr == &(blocks[endblocki]) && endcolumni == 0) {
347       /*      high: 9F61B62A  low: 6D68A157  flags: 00000000 */
348       printf("                                              \t");
349       /* printf("%u\t",startblocki); */
350       for (i = 0; i < enddiscard32; i++) {
351 	printf(" ");
352       }
353       for ( ; i < 32; i++) {
354 	printf("*");
355       }
356       printf("\n");
357     }
358 
359 
360     if (startcolumni == 1) {
361       /*      high: 9F61B62A  low: 6D68A157  flags: 00000000 */
362       printf("                                              \t");
363       /* printf("%u\t",startblocki); */
364       for (i = 0; i < startdiscard32; i++) {
365 	printf("*");
366       }
367       for ( ; i < 32; i++) {
368 	printf(" ");
369       }
370       printf("\n");
371       startcolumni = -1;
372     }
373     high = ptr[1]; low = ptr[5]; flags = ptr[9];
374     printf("high: %08X  low: %08X  flags: %08X\t",high,low,flags);
375     write_chars(high,low,flags);
376     printf("\n");
377     if (ptr == &(blocks[endblocki]) && endcolumni == 1) {
378       /*      high: 9F61B62A  low: 6D68A157  flags: 00000000 */
379       printf("                                              \t");
380       /* printf("%u\t",startblocki); */
381       for (i = 0; i < enddiscard32; i++) {
382 	printf(" ");
383       }
384       for ( ; i < 32; i++) {
385 	printf("*");
386       }
387       printf("\n");
388     }
389 
390     if (startcolumni == 2) {
391       /*      high: 9F61B62A  low: 6D68A157  flags: 00000000 */
392       printf("                                              \t");
393       /* printf("%u\t",startblocki); */
394       for (i = 0; i < startdiscard32; i++) {
395 	printf("*");
396       }
397       for ( ; i < 32; i++) {
398 	printf(" ");
399       }
400       printf("\n");
401       startcolumni = -1;
402     }
403     high = ptr[2]; low = ptr[6]; flags = ptr[10];
404     printf("high: %08X  low: %08X  flags: %08X\t",high,low,flags);
405     write_chars(high,low,flags);
406     printf("\n");
407     if (ptr == &(blocks[endblocki]) && endcolumni == 2) {
408       /*      high: 9F61B62A  low: 6D68A157  flags: 00000000 */
409       printf("                                              \t");
410       /* printf("%u\t",startblocki); */
411       for (i = 0; i < enddiscard32; i++) {
412 	printf(" ");
413       }
414       for ( ; i < 32; i++) {
415 	printf("*");
416       }
417       printf("\n");
418     }
419 
420     if (startcolumni == 3) {
421       /*      high: 9F61B62A  low: 6D68A157  flags: 00000000 */
422       printf("                                              \t");
423       /* printf("%u\t",startblocki); */
424       for (i = 0; i < startdiscard32; i++) {
425 	printf("*");
426       }
427       for ( ; i < 32; i++) {
428 	printf(" ");
429       }
430       printf("\n");
431       startcolumni = -1;
432     }
433     high = ptr[3]; low = ptr[7]; flags = ptr[11];
434     printf("high: %08X  low: %08X  flags: %08X\t",high,low,flags);
435     write_chars(high,low,flags);
436     printf("\n");
437     if (ptr == &(blocks[endblocki]) && endcolumni == 3) {
438       /*      high: 9F61B62A  low: 6D68A157  flags: 00000000 */
439       printf("                                              \t");
440       /* printf("%u\t",startblocki); */
441       for (i = 0; i < enddiscard32; i++) {
442 	printf(" ");
443       }
444       for ( ; i < 32; i++) {
445 	printf("*");
446       }
447       printf("\n");
448     }
449 
450     printf("\n");
451     ptr += 12;
452 #endif
453   }
454 
455   return;
456 }
457 #endif
458 
459 
460 #if defined(USE_SHIFT_HILO) && defined(HAVE_SSE2)
461 static inline void
read_128_shift_lo(__m128i * __restrict__ high,__m128i * __restrict__ low,__m128i * __restrict__ flags,UINT4 * __restrict__ ptr,int startcolumni)462 read_128_shift_lo (__m128i *__restrict__ high, __m128i *__restrict__ low, __m128i *__restrict__ flags, UINT4 *__restrict__ ptr,
463 		    int startcolumni) {
464   __m128i a, b, c;
465 
466   ptr -= startcolumni;
467   a = _mm_load_si128((__m128i *) ptr); ptr += 4;
468   b = _mm_load_si128((__m128i *) ptr); ptr += 4;
469   c = _mm_load_si128((__m128i *) ptr); ptr += 4;
470 
471   switch (startcolumni) {
472   case 0:
473     *high = _mm_bsrli_si128(a, 0);
474     *low = _mm_bsrli_si128(b, 0);
475     *flags = _mm_bsrli_si128(c, 0);
476     break;
477   case 1:
478     *high = _mm_bsrli_si128(a, 4);
479     *low = _mm_bsrli_si128(b, 4);
480     *flags = _mm_bsrli_si128(c, 4);
481     break;
482   case 2:
483     *high = _mm_bsrli_si128(a, 8);
484     *low = _mm_bsrli_si128(b, 8);
485     *flags = _mm_bsrli_si128(c, 8);
486     break;
487   default:
488     *high = _mm_bsrli_si128(a, 12);
489     *low = _mm_bsrli_si128(b, 12);
490     *flags = _mm_bsrli_si128(c, 12);
491     break;
492   }
493 
494   return;
495 }
496 
497 static inline void
read_128_shift_hi(__m128i * __restrict__ high,__m128i * __restrict__ low,__m128i * __restrict__ flags,UINT4 * __restrict__ ptr,int endcolumni)498 read_128_shift_hi (__m128i *__restrict__ high, __m128i *__restrict__ low, __m128i *__restrict__ flags, UINT4 *__restrict__ ptr,
499 		   int endcolumni) {
500   __m128i a, b, c;
501 
502   ptr -= endcolumni;
503   a = _mm_load_si128((__m128i *) ptr); ptr += 4;
504   b = _mm_load_si128((__m128i *) ptr); ptr += 4;
505   c = _mm_load_si128((__m128i *) ptr); ptr += 4;
506 
507   switch (endcolumni) {
508   case 0:
509     *high = _mm_bslli_si128(a, 12);
510     *low = _mm_bslli_si128(b, 12);
511     *flags = _mm_bslli_si128(c, 12);
512     break;
513   case 1:
514     *high = _mm_bslli_si128(a, 8);
515     *low = _mm_bslli_si128(b, 8);
516     *flags = _mm_bslli_si128(c, 8);
517     break;
518   case 2:
519     *high = _mm_bslli_si128(a, 4);
520     *low = _mm_bslli_si128(b, 4);
521     *flags = _mm_bslli_si128(c, 4);
522     break;
523   default:
524     *high = _mm_bslli_si128(a, 0);
525     *low = _mm_bslli_si128(b, 0);
526     *flags = _mm_bslli_si128(c, 0);
527     break;
528   }
529 
530   return;
531 }
532 #endif
533 
534 
535 #ifdef HAVE_SSSE3
536 static inline void
read_128_wrap_lo(__m128i * __restrict__ high,__m128i * __restrict__ low,__m128i * __restrict__ flags,UINT4 * __restrict__ ptr,int startcolumni)537 read_128_wrap_lo (__m128i *__restrict__ high, __m128i *__restrict__ low, __m128i *__restrict__ flags, UINT4 *__restrict__ ptr,
538 		  int startcolumni) {
539   __m128i a, b, c, d, e, f;
540 
541   ptr -= startcolumni;
542   a = _mm_load_si128((__m128i *) ptr); ptr += 4;
543   b = _mm_load_si128((__m128i *) ptr); ptr += 4;
544   c = _mm_load_si128((__m128i *) ptr); ptr += 4;
545   d = _mm_load_si128((__m128i *) ptr); ptr += 4;
546   e = _mm_load_si128((__m128i *) ptr); ptr += 4;
547   f = _mm_load_si128((__m128i *) ptr);
548 
549   switch (startcolumni) {
550   case 0:
551     *high = _mm_alignr_epi8(d, a, 0);
552     *low = _mm_alignr_epi8(e, b, 0);
553     *flags = _mm_alignr_epi8(f, c, 0);
554     break;
555   case 1:
556     *high = _mm_alignr_epi8(d, a, 4);
557     *low = _mm_alignr_epi8(e, b, 4);
558     *flags = _mm_alignr_epi8(f, c, 4);
559     break;
560   case 2:
561     *high = _mm_alignr_epi8(d, a, 8);
562     *low = _mm_alignr_epi8(e, b, 8);
563     *flags = _mm_alignr_epi8(f, c, 8);
564     break;
565   default:
566     *high = _mm_alignr_epi8(d, a, 12);
567     *low = _mm_alignr_epi8(e, b, 12);
568     *flags = _mm_alignr_epi8(f, c, 12);
569     break;
570   }
571 
572   return;
573 }
574 
575 static inline void
read_128_wrap_hi(__m128i * __restrict__ high,__m128i * __restrict__ low,__m128i * __restrict__ flags,UINT4 * __restrict__ ptr,int endcolumni)576 read_128_wrap_hi (__m128i *__restrict__ high, __m128i *__restrict__ low, __m128i *__restrict__ flags, UINT4 *__restrict__ ptr,
577 		  int endcolumni) {
578   __m128i a, b, c, d, e, f;
579 
580   ptr -= endcolumni;
581   ptr -= 12;
582   a = _mm_load_si128((__m128i *) ptr); ptr += 4;
583   b = _mm_load_si128((__m128i *) ptr); ptr += 4;
584   c = _mm_load_si128((__m128i *) ptr); ptr += 4;
585   d = _mm_load_si128((__m128i *) ptr); ptr += 4;
586   e = _mm_load_si128((__m128i *) ptr); ptr += 4;
587   f = _mm_load_si128((__m128i *) ptr);
588 
589   switch (endcolumni) {
590   case 0:
591     *high = _mm_alignr_epi8(d, a, 4);
592     *low = _mm_alignr_epi8(e, b, 4);
593     *flags = _mm_alignr_epi8(f, c, 4);
594     break;
595   case 1:
596     *high = _mm_alignr_epi8(d, a, 8);
597     *low = _mm_alignr_epi8(e, b, 8);
598     *flags = _mm_alignr_epi8(f, c, 8);
599     break;
600   case 2:
601     *high = _mm_alignr_epi8(d, a, 12);
602     *low = _mm_alignr_epi8(e, b, 12);
603     *flags = _mm_alignr_epi8(f, c, 12);
604     break;
605   default:
606     *high = _mm_alignr_epi8(d, a, 16);
607     *low = _mm_alignr_epi8(e, b, 16);
608     *flags = _mm_alignr_epi8(f, c, 16);
609     break;
610   }
611 
612   return;
613 }
614 #endif
615 
616 
617 #ifdef HAVE_AVX2
618 static inline void
read_256(__m256i * __restrict__ high,__m256i * __restrict__ low,__m256i * __restrict__ flags,UINT4 * __restrict__ ptr)619 read_256 (__m256i *__restrict__ high, __m256i *__restrict__ low, __m256i *__restrict__ flags, UINT4 *__restrict__ ptr) {
620   __m256i a, b, c;
621   a = _mm256_loadu_si256((__m256i *) ptr); /* query0_high, query0_low */
622   b = _mm256_loadu_si256((__m256i *) &(ptr[8])); /* query0_flags, query1_high */
623   c = _mm256_loadu_si256((__m256i *) &(ptr[16])); /* query1_low, query1_flags */
624 
625   *high = _mm256_permute2x128_si256(a, b, 0x30);
626   *low = _mm256_permute2x128_si256(a, c, 0x21);
627   *flags = _mm256_permute2x128_si256(b, c, 0x30);
628 
629   return;
630 }
631 #endif
632 
633 #ifdef HAVE_AVX512
634 static inline void
read_512(__m512i * __restrict__ high,__m512i * __restrict__ low,__m512i * __restrict__ flags,UINT4 * __restrict__ ptr)635 read_512 (__m512i *__restrict__ high, __m512i *__restrict__ low, __m512i *__restrict__ flags, UINT4 *__restrict__ ptr) {
636   __m512i a, b, c, d, e, f;
637   a = _mm512_loadu_si512((__m512i *) ptr); /* query0_high, query0_low, query0_flags, query1_high */
638   b = _mm512_loadu_si512((__m512i *) &(ptr[16])); /* query1_low, query1_flags, query2_high, query2_low */
639   c = _mm512_loadu_si512((__m512i *) &(ptr[32])); /* query2_flags, query3_high, query3_low, query3_flags */
640 
641   d = _mm512_permutex2var_epi32(a, _mm512_setr_epi32(0, 1, 2, 3, 12, 13, 14, 15,
642 						     4, 5, 6, 7, 16+0, 16+1, 16+2, 16+3), b);
643   e = _mm512_permutex2var_epi32(b, _mm512_setr_epi32(8, 9, 10, 11, 16+4, 16+5, 16+6, 16+7,
644 						     12, 13, 14, 15, 16+8, 16+9, 16+10, 16+11), c);
645   f = _mm512_permutex2var_epi32(a, _mm512_setr_epi32(8, 9, 10, 11, 16+4, 16+5, 16+6, 16+7,
646 						     12, 13, 14, 15, 16+8, 16+9, 16+10, 16+11), b);
647 
648   *high = _mm512_permutex2var_epi64(d, _mm512_setr_epi64(0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3), e);
649   *low = _mm512_permutex2var_epi64(d, _mm512_setr_epi64(4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7), e);
650   *flags = _mm512_permutex2var_epi64(f, _mm512_setr_epi64(0, 1, 2, 3, 8+0, 8+1, 8+6, 8+7), c);
651 
652   return;
653 }
654 #endif
655 
656 
657 /* These are global values, used for alignment.  Previously for
658    trimming, treated query N's as mismatches, but this is not correct
659    for query N's in the middle of the read.  Also, trimming query N's
660    can affect the --clip-overlap feature. */
661 static bool query_unk_mismatch_p = false;
662 static bool genome_unk_mismatch_p = true;
663 
664 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
665 #define STEP_SIZE 32
666 #else
667 /* Holds for SSE2, AVX2, and AVX512 */
668 #define STEP_SIZE 128
669 #endif
670 
671 
672 static UINT4
block_diff_standard_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)673 block_diff_standard_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
674 			bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
675   UINT4 diff;
676 
677   debug(printf("Comparing high: query %08X with genome %08X ",query_shifted[0],ref_ptr[0]));
678 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
679 #else
680   debug(printf("Comparing low: query %08X with genome %08X ",query_shifted[4],ref_ptr[4]));
681 #endif
682 
683 #ifdef WORDS_BIGENDIAN
684   diff = (query_shifted[0] ^ Bigendian_convert_uint(ref_ptr[0])) | (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
685 #elif !defined(HAVE_SSE2)
686   diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[1] ^ ref_ptr[4]);
687 #else
688   diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[4] ^ ref_ptr[4]);
689 #endif
690 
691   /* Query Ns */
692 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
693   if (query_unk_mismatch_local_p) {
694     /* Query: Considering N as a mismatch */
695     diff |= query_shifted[2];
696   } else {
697     /* Query: Considering N as a wildcard */
698     diff &= ~(query_shifted[2]);
699   }
700 #else
701   if (query_unk_mismatch_local_p) {
702     /* Query: Considering N as a mismatch */
703     diff |= query_shifted[8];
704   } else {
705     /* Query: Considering N as a wildcard */
706     diff &= ~(query_shifted[8]);
707   }
708 #endif
709 
710   /* Genome Ns */
711   if (genome_unk_mismatch_p) {
712     /* Genome: Considering N as a mismatch */
713 #ifdef WORDS_BIGENDIAN
714     diff |= Bigendian_convert_uint(ref_ptr[8]);
715 #else
716     diff |= ref_ptr[8];
717 #endif
718   } else {
719     /* Genome: Considering N as a wildcard */
720 #ifdef WORDS_BIGENDIAN
721     diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
722 #else
723     diff &= ~(ref_ptr[8]);
724 #endif
725   }
726 
727   debug(printf(" => diff %08X\n",diff));
728 
729   return diff;
730 }
731 
732 
733 #ifdef HAVE_SSE2
734 static __m128i
block_diff_standard_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)735 block_diff_standard_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
736 			 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
737   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
738 
739   _query_high = _mm_load_si128((__m128i *) query_shifted);
740   _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
741   _ref_high = _mm_load_si128((__m128i *) ref_ptr);
742   _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
743 
744   _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
745 
746   _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
747   if (query_unk_mismatch_local_p) {
748     _diff = _mm_or_si128(_query_flags, _diff);
749   } else {
750     _diff = _mm_andnot_si128(_query_flags, _diff);
751   }
752 
753   _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
754   if (genome_unk_mismatch_p) {
755     _diff = _mm_or_si128(_ref_flags, _diff);
756   } else {
757     _diff = _mm_andnot_si128(_ref_flags, _diff);
758   }
759 
760   return _diff;
761 }
762 
763 #ifdef USE_SHIFT_HILO
764 static __m128i
block_diff_standard_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)765 block_diff_standard_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
766 				  bool plusp, int genestrand, bool query_unk_mismatch_local_p,
767 				  int startcolumni) {
768   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
769 
770   read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
771   read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
772 
773   _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
774 
775   if (query_unk_mismatch_local_p) {
776     _diff = _mm_or_si128(_query_flags, _diff);
777   } else {
778     _diff = _mm_andnot_si128(_query_flags, _diff);
779   }
780 
781   if (genome_unk_mismatch_p) {
782     _diff = _mm_or_si128(_ref_flags, _diff);
783   } else {
784     _diff = _mm_andnot_si128(_ref_flags, _diff);
785   }
786 
787   return _diff;
788 }
789 
790 static __m128i
block_diff_standard_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)791 block_diff_standard_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
792 				  bool plusp, int genestrand, bool query_unk_mismatch_local_p,
793 				  int endcolumni) {
794   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
795 
796   read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
797   read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
798 
799   _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
800 
801   if (query_unk_mismatch_local_p) {
802     _diff = _mm_or_si128(_query_flags, _diff);
803   } else {
804     _diff = _mm_andnot_si128(_query_flags, _diff);
805   }
806 
807   if (genome_unk_mismatch_p) {
808     _diff = _mm_or_si128(_ref_flags, _diff);
809   } else {
810     _diff = _mm_andnot_si128(_ref_flags, _diff);
811   }
812 
813   return _diff;
814 }
815 #endif
816 #endif
817 
818 #ifdef HAVE_SSSE3
819 static __m128i
block_diff_standard_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)820 block_diff_standard_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
821 				 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
822 				 int startcolumni) {
823   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
824 
825   read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
826   read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
827 
828   _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
829 
830   if (query_unk_mismatch_local_p) {
831     _diff = _mm_or_si128(_query_flags, _diff);
832   } else {
833     _diff = _mm_andnot_si128(_query_flags, _diff);
834   }
835 
836   if (genome_unk_mismatch_p) {
837     _diff = _mm_or_si128(_ref_flags, _diff);
838   } else {
839     _diff = _mm_andnot_si128(_ref_flags, _diff);
840   }
841 
842   return _diff;
843 }
844 
845 static __m128i
block_diff_standard_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)846 block_diff_standard_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
847 				 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
848 				 int endcolumni) {
849   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
850 
851   read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
852   read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
853 
854   _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
855 
856   if (query_unk_mismatch_local_p) {
857     _diff = _mm_or_si128(_query_flags, _diff);
858   } else {
859     _diff = _mm_andnot_si128(_query_flags, _diff);
860   }
861 
862   if (genome_unk_mismatch_p) {
863     _diff = _mm_or_si128(_ref_flags, _diff);
864   } else {
865     _diff = _mm_andnot_si128(_ref_flags, _diff);
866   }
867 
868   return _diff;
869 }
870 #endif
871 
872 #ifdef HAVE_AVX2
873 static __m256i
block_diff_standard_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)874 block_diff_standard_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
875 			 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
876   __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
877 
878   read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
879   read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
880 
881   _diff = _mm256_or_si256(_mm256_xor_si256(_query_high, _ref_high), _mm256_xor_si256(_query_low, _ref_low));
882 
883   if (query_unk_mismatch_local_p) {
884     _diff = _mm256_or_si256(_query_flags, _diff);
885   } else {
886     _diff = _mm256_andnot_si256(_query_flags, _diff);
887   }
888 
889   if (genome_unk_mismatch_p) {
890     _diff = _mm256_or_si256(_ref_flags, _diff);
891   } else {
892     _diff = _mm256_andnot_si256(_ref_flags, _diff);
893   }
894 
895   return _diff;
896 }
897 #endif
898 
899 #ifdef HAVE_AVX512
900 static __m512i
block_diff_standard_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)901 block_diff_standard_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
902 			 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
903   __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
904 
905   read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
906   read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
907 
908   _diff = _mm512_or_si512(_mm512_xor_si512(_query_high, _ref_high), _mm512_xor_si512(_query_low, _ref_low));
909 
910   if (query_unk_mismatch_local_p) {
911     _diff = _mm512_or_si512(_query_flags, _diff);
912   } else {
913     _diff = _mm512_andnot_si512(_query_flags, _diff);
914   }
915 
916   if (genome_unk_mismatch_p) {
917     _diff = _mm512_or_si512(_ref_flags, _diff);
918   } else {
919     _diff = _mm512_andnot_si512(_ref_flags, _diff);
920   }
921 
922   return _diff;
923 }
924 #endif
925 
926 
927 /************************************************************************
928  *   CMET
929  ************************************************************************/
930 
931 static UINT4
block_diff_metct_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)932 block_diff_metct_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
933 		     bool query_unk_mismatch_local_p) {
934   UINT4 diff;
935 
936   /* sarrayp == true */
937   /* Convert everything to 3-nucleotide space */
938   diff = 0U;
939 
940   /* Compare reduced C->T nts  */
941 #ifdef WORDS_BIGENDIAN
942   diff |= ((query_shifted[0] | query_shifted[1]) ^ (Bigendian_convert_uint(ref_ptr[0]) | Bigendian_convert_uint(ref_ptr[4]))) |
943     (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
944 #elif !defined(HAVE_SSE2)
945   diff |= ((query_shifted[0] | query_shifted[1]) ^ (ref_ptr[0] | ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
946 #else
947   diff |= ((query_shifted[0] | query_shifted[4]) ^ (ref_ptr[0] | ref_ptr[4])) | (query_shifted[4] ^ ref_ptr[4]);
948 #endif
949   debug(printf(" => diff %08X\n",diff));
950 
951 
952   /* Flags: Considering N as a mismatch */
953 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
954   if (query_unk_mismatch_local_p) {
955     debug(printf("Marking query flags: query %08X ",query_shifted[8]));
956     diff |= query_shifted[8];
957   } else {
958     debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
959     diff &= ~(query_shifted[8]);
960   }
961 #else
962   if (query_unk_mismatch_local_p) {
963     debug(printf("Marking query flags: query %08X ",query_shifted[8]));
964     diff |= query_shifted[8];
965   } else {
966     debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
967     diff &= ~(query_shifted[8]);
968   }
969 #endif
970 
971   if (genome_unk_mismatch_p) {
972     debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
973 #ifdef WORDS_BIGENDIAN
974     diff |= Bigendian_convert_uint(ref_ptr[8]);
975 #else
976     diff |= (ref_ptr[8]);
977 #endif
978   } else {
979     debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
980 #ifdef WORDS_BIGENDIAN
981     diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
982 #else
983     diff &= ~(ref_ptr[8]);
984 #endif
985   }
986   debug(printf(" => diff %08X\n",diff));
987 
988   return diff;
989 }
990 
991 
992 #ifdef HAVE_SSE2
993 /* Convert C to T: high/low (A) 0 0 => new high 0; (C) 0 1 => 1; (G) 1 0 => 1; (T) 1 0 => 1 */
994 /* new high = high | low */
995 static __m128i
block_diff_metct_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)996 block_diff_metct_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
997 		      bool query_unk_mismatch_local_p) {
998   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
999 
1000   _query_high = _mm_load_si128((__m128i *) query_shifted);
1001   _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
1002   _ref_high = _mm_load_si128((__m128i *) ref_ptr);
1003   _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
1004 
1005   /* sarrayp == true */
1006   /* Ignore genome-T to query-C mismatches.  Convert everything to 3-nucleotide space */
1007   _diff = _mm_setzero_si128();
1008 
1009   /* Compare reduced C->T nts  */
1010   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1011   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1012 
1013   _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
1014   if (query_unk_mismatch_local_p) {
1015     _diff = _mm_or_si128(_query_flags, _diff);
1016   } else {
1017     _diff = _mm_andnot_si128(_query_flags, _diff);
1018   }
1019 
1020   _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
1021   if (genome_unk_mismatch_p) {
1022     _diff = _mm_or_si128(_ref_flags, _diff);
1023   } else {
1024     _diff = _mm_andnot_si128(_ref_flags, _diff);
1025   }
1026 
1027   return _diff;
1028 }
1029 
1030 #ifdef USE_SHIFT_HILO
1031 static __m128i
block_diff_metct_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1032 block_diff_metct_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1033 			       bool query_unk_mismatch_local_p, int startcolumni) {
1034   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1035 
1036   read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1037   read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1038 
1039   /* sarrayp == true */
1040   /* Ignore genome-T to query-C mismatches.  Convert everything to 3-nucleotide space */
1041   _diff = _mm_setzero_si128();
1042 
1043   /* Compare reduced C->T nts  */
1044   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1045   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1046 
1047   if (query_unk_mismatch_local_p) {
1048     _diff = _mm_or_si128(_query_flags, _diff);
1049   } else {
1050     _diff = _mm_andnot_si128(_query_flags, _diff);
1051   }
1052 
1053   if (genome_unk_mismatch_p) {
1054     _diff = _mm_or_si128(_ref_flags, _diff);
1055   } else {
1056     _diff = _mm_andnot_si128(_ref_flags, _diff);
1057   }
1058 
1059   return _diff;
1060 }
1061 
1062 static __m128i
block_diff_metct_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1063 block_diff_metct_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1064 			       bool query_unk_mismatch_local_p, int endcolumni) {
1065   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1066 
1067   read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1068   read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1069 
1070   /* sarrayp == true */
1071   /* Ignore genome-T to query-C mismatches.  Convert everything to 3-nucleotide space */
1072   _diff = _mm_setzero_si128();
1073 
1074   /* Compare reduced C->T nts  */
1075   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1076   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1077 
1078   if (query_unk_mismatch_local_p) {
1079     _diff = _mm_or_si128(_query_flags, _diff);
1080   } else {
1081     _diff = _mm_andnot_si128(_query_flags, _diff);
1082   }
1083 
1084   if (genome_unk_mismatch_p) {
1085     _diff = _mm_or_si128(_ref_flags, _diff);
1086   } else {
1087     _diff = _mm_andnot_si128(_ref_flags, _diff);
1088   }
1089 
1090   return _diff;
1091 }
1092 #endif
1093 #endif
1094 
1095 #ifdef HAVE_SSSE3
1096 /* Convert C to T: high/low (A) 0 0 => new high 0; (C) 0 1 => 1; (G) 1 0 => 1; (T) 1 0 => 1 */
1097 /* new high = high | low */
1098 static __m128i
block_diff_metct_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1099 block_diff_metct_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1100 			      bool query_unk_mismatch_local_p, int startcolumni) {
1101   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1102 
1103   read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1104   read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1105 
1106   /* sarrayp == true */
1107   /* Ignore genome-T to query-C mismatches.  Convert everything to 3-nucleotide space */
1108   _diff = _mm_setzero_si128();
1109 
1110   /* Compare reduced C->T nts  */
1111   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1112   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1113 
1114   if (query_unk_mismatch_local_p) {
1115     _diff = _mm_or_si128(_query_flags, _diff);
1116   } else {
1117     _diff = _mm_andnot_si128(_query_flags, _diff);
1118   }
1119 
1120   if (genome_unk_mismatch_p) {
1121     _diff = _mm_or_si128(_ref_flags, _diff);
1122   } else {
1123     _diff = _mm_andnot_si128(_ref_flags, _diff);
1124   }
1125 
1126   return _diff;
1127 }
1128 
1129 static __m128i
block_diff_metct_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1130 block_diff_metct_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1131 			      bool query_unk_mismatch_local_p, int endcolumni) {
1132   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1133 
1134   read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1135   read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1136 
1137   /* sarrayp == true */
1138   /* Ignore genome-T to query-C mismatches.  Convert everything to 3-nucleotide space */
1139   _diff = _mm_setzero_si128();
1140 
1141   /* Compare reduced C->T nts  */
1142   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1143   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1144 
1145   if (query_unk_mismatch_local_p) {
1146     _diff = _mm_or_si128(_query_flags, _diff);
1147   } else {
1148     _diff = _mm_andnot_si128(_query_flags, _diff);
1149   }
1150 
1151   if (genome_unk_mismatch_p) {
1152     _diff = _mm_or_si128(_ref_flags, _diff);
1153   } else {
1154     _diff = _mm_andnot_si128(_ref_flags, _diff);
1155   }
1156 
1157   return _diff;
1158 }
1159 #endif
1160 
1161 
1162 #ifdef HAVE_AVX2
1163 /* Convert C to T: high/low (A) 0 0 => new high 0; (C) 0 1 => 1; (G) 1 0 => 1; (T) 1 0 => 1 */
1164 /* new high = high | low */
1165 static __m256i
block_diff_metct_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1166 block_diff_metct_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1167 		      bool query_unk_mismatch_local_p) {
1168   __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1169 
1170   read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
1171   read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1172 
1173   /* sarrayp == true */
1174   /* Ignore genome-T to query-C mismatches.  Convert everything to 3-nucleotide space */
1175   _diff = _mm256_setzero_si256();
1176 
1177   /* Compare reduced C->T nts  */
1178   _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_mm256_or_si256(_query_high, _query_low), _mm256_or_si256(_ref_high, _ref_low)));
1179   _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_query_low, _ref_low));
1180 
1181   if (query_unk_mismatch_local_p) {
1182     _diff = _mm256_or_si256(_query_flags, _diff);
1183   } else {
1184     _diff = _mm256_andnot_si256(_query_flags, _diff);
1185   }
1186 
1187   if (genome_unk_mismatch_p) {
1188     _diff = _mm256_or_si256(_ref_flags, _diff);
1189   } else {
1190     _diff = _mm256_andnot_si256(_ref_flags, _diff);
1191   }
1192 
1193   return _diff;
1194 }
1195 #endif
1196 
1197 #ifdef HAVE_AVX512
1198 /* Convert C to T: high/low (A) 0 0 => new high 0; (C) 0 1 => 1; (G) 1 0 => 1; (T) 1 0 => 1 */
1199 /* new high = high | low */
1200 static __m512i
block_diff_metct_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1201 block_diff_metct_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1202 		      bool query_unk_mismatch_local_p) {
1203   __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1204 
1205   read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
1206   read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1207 
1208   /* sarrayp == true */
1209   /* Ignore genome-T to query-C mismatches.  Convert everything to 3-nucleotide space */
1210   _diff = _mm512_setzero_si512();
1211 
1212   /* Compare reduced C->T nts  */
1213   _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_mm512_or_si512(_query_high, _query_low), _mm512_or_si512(_ref_high, _ref_low)));
1214   _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_query_low, _ref_low));
1215 
1216   if (query_unk_mismatch_local_p) {
1217     _diff = _mm512_or_si512(_query_flags, _diff);
1218   } else {
1219     _diff = _mm512_andnot_si512(_query_flags, _diff);
1220   }
1221 
1222   if (genome_unk_mismatch_p) {
1223     _diff = _mm512_or_si512(_ref_flags, _diff);
1224   } else {
1225     _diff = _mm512_andnot_si512(_ref_flags, _diff);
1226   }
1227 
1228   return _diff;
1229 }
1230 #endif
1231 
1232 
1233 static UINT4
block_diff_metga_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1234 block_diff_metga_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1235 		     bool query_unk_mismatch_local_p) {
1236   UINT4 diff;
1237 
1238   /* sarrayp == true */
1239   /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
1240   diff = 0U;
1241 
1242   /* Compare reduced G->A nts  */
1243 #ifdef WORDS_BIGENDIAN
1244   diff |= ((query_shifted[0] & query_shifted[1]) ^ (Bigendian_convert_uint(ref_ptr[0]) & Bigendian_convert_uint(ref_ptr[4]))) |
1245     (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
1246 #elif !defined(HAVE_SSE2)
1247   diff |= ((query_shifted[0] & query_shifted[1]) ^ (ref_ptr[0] & ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
1248 #else
1249   diff |= ((query_shifted[0] & query_shifted[4]) ^ (ref_ptr[0] & ref_ptr[4])) | (query_shifted[4] ^ ref_ptr[4]);
1250 #endif
1251   debug(printf(" => diff %08X\n",diff));
1252 
1253 
1254   /* Flags: Considering N as a mismatch */
1255 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
1256   if (query_unk_mismatch_local_p) {
1257     debug(printf("Marking query flags: query %08X ",query_shifted[2]));
1258     diff |= query_shifted[2];
1259   } else {
1260     debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
1261     diff &= ~(query_shifted[2]);
1262   }
1263 #else
1264   if (query_unk_mismatch_local_p) {
1265     debug(printf("Marking query flags: query %08X ",query_shifted[8]));
1266     diff |= query_shifted[8];
1267   } else {
1268     debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
1269     diff &= ~(query_shifted[8]);
1270   }
1271 #endif
1272 
1273   if (genome_unk_mismatch_p) {
1274     debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
1275 #ifdef WORDS_BIGENDIAN
1276     diff |= Bigendian_convert_uint(ref_ptr[8]);
1277 #else
1278     diff |= (ref_ptr[8]);
1279 #endif
1280   } else {
1281     debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
1282 #ifdef WORDS_BIGENDIAN
1283     diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
1284 #else
1285     diff &= ~(ref_ptr[8]);
1286 #endif
1287   }
1288   debug(printf(" => diff %08X\n",diff));
1289 
1290   return diff;
1291 }
1292 
1293 
1294 #ifdef HAVE_SSE2
1295 /* Convert G to A: high/low (A) 0 0 => new high 0; (C) 0 1 => 0; (G) 1 0 => 0; (T) 1 0 => 1 */
1296 /* new high = high & low */
1297 static __m128i
block_diff_metga_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1298 block_diff_metga_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1299 		      bool query_unk_mismatch_local_p) {
1300   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1301 
1302   _query_high = _mm_load_si128((__m128i *) query_shifted);
1303   _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
1304   _ref_high = _mm_load_si128((__m128i *) ref_ptr);
1305   _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
1306 
1307   /* sarrayp == true */
1308   /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
1309   _diff = _mm_setzero_si128();
1310 
1311   /* Compare reduced G->A nts  */
1312   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1313   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1314 
1315   _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
1316   if (query_unk_mismatch_local_p) {
1317     _diff = _mm_or_si128(_query_flags, _diff);
1318   } else {
1319     _diff = _mm_andnot_si128(_query_flags, _diff);
1320   }
1321 
1322   _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
1323   if (genome_unk_mismatch_p) {
1324     _diff = _mm_or_si128(_ref_flags, _diff);
1325   } else {
1326     _diff = _mm_andnot_si128(_ref_flags, _diff);
1327   }
1328 
1329   return _diff;
1330 }
1331 
1332 #ifdef USE_SHIFT_HILO
1333 static __m128i
block_diff_metga_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1334 block_diff_metga_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1335 			       bool query_unk_mismatch_local_p, int startcolumni) {
1336   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1337 
1338   read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1339   read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1340 
1341   /* sarrayp == true */
1342   /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
1343   _diff = _mm_setzero_si128();
1344 
1345   /* Compare reduced G->A nts  */
1346   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1347   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1348 
1349   if (query_unk_mismatch_local_p) {
1350     _diff = _mm_or_si128(_query_flags, _diff);
1351   } else {
1352     _diff = _mm_andnot_si128(_query_flags, _diff);
1353   }
1354 
1355   if (genome_unk_mismatch_p) {
1356     _diff = _mm_or_si128(_ref_flags, _diff);
1357   } else {
1358     _diff = _mm_andnot_si128(_ref_flags, _diff);
1359   }
1360 
1361   return _diff;
1362 }
1363 
1364 static __m128i
block_diff_metga_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1365 block_diff_metga_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1366 			       bool query_unk_mismatch_local_p, int endcolumni) {
1367   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1368 
1369   read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1370   read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1371 
1372   /* sarrayp == true */
1373   /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
1374   _diff = _mm_setzero_si128();
1375 
1376   /* Compare reduced G->A nts  */
1377   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1378   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1379 
1380   if (query_unk_mismatch_local_p) {
1381     _diff = _mm_or_si128(_query_flags, _diff);
1382   } else {
1383     _diff = _mm_andnot_si128(_query_flags, _diff);
1384   }
1385 
1386   if (genome_unk_mismatch_p) {
1387     _diff = _mm_or_si128(_ref_flags, _diff);
1388   } else {
1389     _diff = _mm_andnot_si128(_ref_flags, _diff);
1390   }
1391 
1392   return _diff;
1393 }
1394 #endif
1395 #endif
1396 
1397 #ifdef HAVE_SSSE3
1398 /* Convert G to A: high/low (A) 0 0 => new high 0; (C) 0 1 => 0; (G) 1 0 => 0; (T) 1 0 => 1 */
1399 /* new high = high & low */
1400 static __m128i
block_diff_metga_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1401 block_diff_metga_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1402 			      bool query_unk_mismatch_local_p, int startcolumni) {
1403   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1404 
1405   read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1406   read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1407 
1408   /* sarrayp == true */
1409   /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
1410   _diff = _mm_setzero_si128();
1411 
1412   /* Compare reduced G->A nts  */
1413   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1414   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1415 
1416   if (query_unk_mismatch_local_p) {
1417     _diff = _mm_or_si128(_query_flags, _diff);
1418   } else {
1419     _diff = _mm_andnot_si128(_query_flags, _diff);
1420   }
1421 
1422   if (genome_unk_mismatch_p) {
1423     _diff = _mm_or_si128(_ref_flags, _diff);
1424   } else {
1425     _diff = _mm_andnot_si128(_ref_flags, _diff);
1426   }
1427 
1428   return _diff;
1429 }
1430 
1431 static __m128i
block_diff_metga_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1432 block_diff_metga_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1433 			      bool query_unk_mismatch_local_p, int endcolumni) {
1434   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1435 
1436   read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1437   read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1438 
1439   /* sarrayp == true */
1440   /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
1441   _diff = _mm_setzero_si128();
1442 
1443   /* Compare reduced G->A nts  */
1444   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1445   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1446 
1447   if (query_unk_mismatch_local_p) {
1448     _diff = _mm_or_si128(_query_flags, _diff);
1449   } else {
1450     _diff = _mm_andnot_si128(_query_flags, _diff);
1451   }
1452 
1453   if (genome_unk_mismatch_p) {
1454     _diff = _mm_or_si128(_ref_flags, _diff);
1455   } else {
1456     _diff = _mm_andnot_si128(_ref_flags, _diff);
1457   }
1458 
1459   return _diff;
1460 }
1461 #endif
1462 
1463 #ifdef HAVE_AVX2
1464 /* Convert G to A: high/low (A) 0 0 => new high 0; (C) 0 1 => 0; (G) 1 0 => 0; (T) 1 0 => 1 */
1465 /* new high = high & low */
1466 static __m256i
block_diff_metga_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1467 block_diff_metga_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1468 		      bool query_unk_mismatch_local_p) {
1469   __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1470 
1471   read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
1472   read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1473 
1474   /* sarrayp == true */
1475   /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
1476   _diff = _mm256_setzero_si256();
1477 
1478   /* Compare reduced G->A nts  */
1479   _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_mm256_and_si256(_query_high, _query_low), _mm256_and_si256(_ref_high, _ref_low)));
1480   _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_query_low, _ref_low));
1481 
1482   if (query_unk_mismatch_local_p) {
1483     _diff = _mm256_or_si256(_query_flags, _diff);
1484   } else {
1485     _diff = _mm256_andnot_si256(_query_flags, _diff);
1486   }
1487 
1488   if (genome_unk_mismatch_p) {
1489     _diff = _mm256_or_si256(_ref_flags, _diff);
1490   } else {
1491     _diff = _mm256_andnot_si256(_ref_flags, _diff);
1492   }
1493 
1494   return _diff;
1495 }
1496 #endif
1497 
1498 #ifdef HAVE_AVX512
1499 /* Convert G to A: high/low (A) 0 0 => new high 0; (C) 0 1 => 0; (G) 1 0 => 0; (T) 1 0 => 1 */
1500 /* new high = high & low */
1501 static __m512i
block_diff_metga_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1502 block_diff_metga_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1503 		      bool query_unk_mismatch_local_p) {
1504   __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1505 
1506   read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
1507   read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1508 
1509   /* sarrayp == true */
1510   /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
1511   _diff = _mm512_setzero_si512();
1512 
1513   /* Compare reduced G->A nts  */
1514   _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_mm512_and_si512(_query_high, _query_low), _mm512_and_si512(_ref_high, _ref_low)));
1515   _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_query_low, _ref_low));
1516 
1517   if (query_unk_mismatch_local_p) {
1518     _diff = _mm512_or_si512(_query_flags, _diff);
1519   } else {
1520     _diff = _mm512_andnot_si512(_query_flags, _diff);
1521   }
1522 
1523   if (genome_unk_mismatch_p) {
1524     _diff = _mm512_or_si512(_ref_flags, _diff);
1525   } else {
1526     _diff = _mm512_andnot_si512(_ref_flags, _diff);
1527   }
1528 
1529   return _diff;
1530 }
1531 #endif
1532 
1533 
1534 static UINT4
block_diff_cmet_sarray_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)1535 block_diff_cmet_sarray_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1536 			   bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
1537   if (genestrand == +2) {
1538     if (plusp) {
1539       return block_diff_metga_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1540     } else {
1541       return block_diff_metct_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1542     }
1543   } else {
1544     if (plusp) {
1545       return block_diff_metct_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1546     } else {
1547       return block_diff_metga_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1548     }
1549   }
1550 }
1551 
1552 #ifdef HAVE_SSE2
1553 static __m128i
block_diff_cmet_sarray_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)1554 block_diff_cmet_sarray_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1555 			    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
1556   if (genestrand == +2) {
1557     if (plusp) {
1558       return block_diff_metga_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1559     } else {
1560       return block_diff_metct_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1561     }
1562   } else {
1563     if (plusp) {
1564       return block_diff_metct_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1565     } else {
1566       return block_diff_metga_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1567     }
1568   }
1569 }
1570 
1571 #ifdef USE_SHIFT_HILO
1572 static __m128i
block_diff_cmet_sarray_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)1573 block_diff_cmet_sarray_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1574 				     bool plusp, int genestrand, bool query_unk_mismatch_local_p,
1575 				     int startcolumni) {
1576   if (genestrand == +2) {
1577     if (plusp) {
1578       return block_diff_metga_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1579 					   startcolumni);
1580     } else {
1581       return block_diff_metct_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1582 					   startcolumni);
1583     }
1584   } else {
1585     if (plusp) {
1586       return block_diff_metct_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1587 					   startcolumni);
1588     } else {
1589       return block_diff_metga_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1590 					   startcolumni);
1591     }
1592   }
1593 }
1594 
1595 static __m128i
block_diff_cmet_sarray_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)1596 block_diff_cmet_sarray_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1597 				     bool plusp, int genestrand, bool query_unk_mismatch_local_p,
1598 				     int endcolumni) {
1599   if (genestrand == +2) {
1600     if (plusp) {
1601       return block_diff_metga_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1602 					   endcolumni);
1603     } else {
1604       return block_diff_metct_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1605 					   endcolumni);
1606     }
1607   } else {
1608     if (plusp) {
1609       return block_diff_metct_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1610 					   endcolumni);
1611     } else {
1612       return block_diff_metga_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1613 					   endcolumni);
1614     }
1615   }
1616 }
1617 #endif
1618 #endif
1619 
1620 #ifdef HAVE_SSSE3
1621 static __m128i
block_diff_cmet_sarray_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)1622 block_diff_cmet_sarray_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1623 				    bool plusp, int genestrand, bool query_unk_mismatch_local_p,
1624 				    int startcolumni) {
1625   if (genestrand == +2) {
1626     if (plusp) {
1627       return block_diff_metga_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1628 					  startcolumni);
1629     } else {
1630       return block_diff_metct_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1631 					  startcolumni);
1632     }
1633   } else {
1634     if (plusp) {
1635       return block_diff_metct_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1636 					  startcolumni);
1637     } else {
1638       return block_diff_metga_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1639 					  startcolumni);
1640     }
1641   }
1642 }
1643 
1644 static __m128i
block_diff_cmet_sarray_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)1645 block_diff_cmet_sarray_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1646 				    bool plusp, int genestrand, bool query_unk_mismatch_local_p,
1647 				    int endcolumni) {
1648   if (genestrand == +2) {
1649     if (plusp) {
1650       return block_diff_metga_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1651 					  endcolumni);
1652     } else {
1653       return block_diff_metct_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1654 					  endcolumni);
1655     }
1656   } else {
1657     if (plusp) {
1658       return block_diff_metct_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1659 					  endcolumni);
1660     } else {
1661       return block_diff_metga_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1662 					  endcolumni);
1663     }
1664   }
1665 }
1666 #endif
1667 
1668 #ifdef HAVE_AVX2
1669 static __m256i
block_diff_cmet_sarray_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)1670 block_diff_cmet_sarray_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1671 			    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
1672   if (genestrand == +2) {
1673     if (plusp) {
1674       return block_diff_metga_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1675     } else {
1676       return block_diff_metct_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1677     }
1678   } else {
1679     if (plusp) {
1680       return block_diff_metct_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1681     } else {
1682       return block_diff_metga_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1683     }
1684   }
1685 }
1686 #endif
1687 
1688 #ifdef HAVE_AVX512
1689 static __m512i
block_diff_cmet_sarray_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)1690 block_diff_cmet_sarray_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1691 			    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
1692   if (genestrand == +2) {
1693     if (plusp) {
1694       return block_diff_metga_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1695     } else {
1696       return block_diff_metct_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1697     }
1698   } else {
1699     if (plusp) {
1700       return block_diff_metct_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1701     } else {
1702       return block_diff_metga_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1703     }
1704   }
1705 }
1706 #endif
1707 
1708 
1709 
1710 /************************************************************************
1711  *   ATOI
1712  ************************************************************************/
1713 
1714 static UINT4
block_diff_a2iag_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1715 block_diff_a2iag_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1716 		     bool query_unk_mismatch_local_p) {
1717   UINT4 diff;
1718 
1719   /* sarrayp == true */
1720   /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
1721   diff = 0U;
1722 
1723   /* Compare reduced A->G nts  */
1724 #ifdef WORDS_BIGENDIAN
1725   diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) | ~(Bigendian_convert_uint(ref_ptr[4])))) |
1726     (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
1727 #elif !defined(HAVE_SSE2)
1728   diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (ref_ptr[0] | ~(ref_ptr[4]))) | (query_shifted[1] ^ ref_ptr[4]);
1729   /* Because (a ^ b) = (~a ^ ~b), this is equivalent to
1730   diff |= ((~query_shifted[0] & query_shifted[1]) ^ (~ref_ptr[0] & ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
1731   */
1732 #else
1733   diff |= ((query_shifted[0] | ~(query_shifted[4])) ^ (ref_ptr[0] | ~(ref_ptr[4]))) | (query_shifted[4] ^ ref_ptr[4]);
1734 #endif
1735   debug(printf(" => diff %08X\n",diff));
1736 
1737   /* Flags: Considering N as a mismatch */
1738 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
1739   if (query_unk_mismatch_local_p) {
1740     debug(printf("Marking query flags: query %08X ",query_shifted[2]));
1741     diff |= query_shifted[2];
1742   } else {
1743     debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
1744     diff &= ~(query_shifted[2]);
1745   }
1746 #else
1747   if (query_unk_mismatch_local_p) {
1748     debug(printf("Marking query flags: query %08X ",query_shifted[8]));
1749     diff |= query_shifted[8];
1750   } else {
1751     debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
1752     diff &= ~(query_shifted[8]);
1753   }
1754 #endif
1755 
1756   if (genome_unk_mismatch_p) {
1757     debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
1758 #ifdef WORDS_BIGENDIAN
1759     diff |= Bigendian_convert_uint(ref_ptr[8]);
1760 #else
1761     diff |= (ref_ptr[8]);
1762 #endif
1763   } else {
1764     debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
1765 #ifdef WORDS_BIGENDIAN
1766     diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
1767 #else
1768     diff &= ~(ref_ptr[8]);
1769 #endif
1770   }
1771   debug(printf(" => diff %08X\n",diff));
1772 
1773   return diff;
1774 }
1775 
1776 
1777 #ifdef HAVE_SSE2
1778 /* Convert A->G: new high = high | ~low */
1779 static __m128i
block_diff_a2iag_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1780 block_diff_a2iag_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1781 		      bool query_unk_mismatch_local_p) {
1782   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1783 
1784   _query_high = _mm_load_si128((__m128i *) query_shifted);
1785   _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
1786   _ref_high = _mm_load_si128((__m128i *) ref_ptr);
1787   _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
1788 
1789   /* sarrayp == true */
1790   /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
1791   _diff = _mm_setzero_si128();
1792 
1793   /* Compare reduced A->G nts  */
1794   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1795   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1796 
1797   _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
1798   if (query_unk_mismatch_local_p) {
1799     _diff = _mm_or_si128(_query_flags, _diff);
1800   } else {
1801     _diff = _mm_andnot_si128(_query_flags, _diff);
1802   }
1803 
1804   _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
1805   if (genome_unk_mismatch_p) {
1806     _diff = _mm_or_si128(_ref_flags, _diff);
1807   } else {
1808     _diff = _mm_andnot_si128(_ref_flags, _diff);
1809   }
1810 
1811   return _diff;
1812 }
1813 
1814 #ifdef USE_SHIFT_HILO
1815 static __m128i
block_diff_a2iag_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1816 block_diff_a2iag_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1817 			       bool query_unk_mismatch_local_p, int startcolumni) {
1818   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1819 
1820   read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1821   read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1822 
1823   /* sarrayp == true */
1824   /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
1825   _diff = _mm_setzero_si128();
1826 
1827   /* Compare reduced A->G nts  */
1828   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1829   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1830 
1831   if (query_unk_mismatch_local_p) {
1832     _diff = _mm_or_si128(_query_flags, _diff);
1833   } else {
1834     _diff = _mm_andnot_si128(_query_flags, _diff);
1835   }
1836 
1837   if (genome_unk_mismatch_p) {
1838     _diff = _mm_or_si128(_ref_flags, _diff);
1839   } else {
1840     _diff = _mm_andnot_si128(_ref_flags, _diff);
1841   }
1842 
1843   return _diff;
1844 }
1845 
1846 static __m128i
block_diff_a2iag_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1847 block_diff_a2iag_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1848 			       bool query_unk_mismatch_local_p, int endcolumni) {
1849   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1850 
1851   read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1852   read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1853 
1854   /* sarrayp == true */
1855   /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
1856   _diff = _mm_setzero_si128();
1857 
1858   /* Compare reduced A->G nts  */
1859   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1860   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1861 
1862   if (query_unk_mismatch_local_p) {
1863     _diff = _mm_or_si128(_query_flags, _diff);
1864   } else {
1865     _diff = _mm_andnot_si128(_query_flags, _diff);
1866   }
1867 
1868   if (genome_unk_mismatch_p) {
1869     _diff = _mm_or_si128(_ref_flags, _diff);
1870   } else {
1871     _diff = _mm_andnot_si128(_ref_flags, _diff);
1872   }
1873 
1874   return _diff;
1875 }
1876 #endif
1877 #endif
1878 
1879 #ifdef HAVE_SSSE3
1880 /* Convert A->G: new high = high | ~low */
1881 static __m128i
block_diff_a2iag_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1882 block_diff_a2iag_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1883 			      bool query_unk_mismatch_local_p, int startcolumni) {
1884   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1885 
1886   read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1887   read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1888 
1889   /* sarrayp == true */
1890   /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
1891   _diff = _mm_setzero_si128();
1892 
1893   /* Compare reduced A->G nts  */
1894   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1895   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1896 
1897   if (query_unk_mismatch_local_p) {
1898     _diff = _mm_or_si128(_query_flags, _diff);
1899   } else {
1900     _diff = _mm_andnot_si128(_query_flags, _diff);
1901   }
1902 
1903   if (genome_unk_mismatch_p) {
1904     _diff = _mm_or_si128(_ref_flags, _diff);
1905   } else {
1906     _diff = _mm_andnot_si128(_ref_flags, _diff);
1907   }
1908 
1909   return _diff;
1910 }
1911 
1912 static __m128i
block_diff_a2iag_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1913 block_diff_a2iag_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1914 			      bool query_unk_mismatch_local_p, int endcolumni) {
1915   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1916 
1917   read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1918   read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1919 
1920   /* sarrayp == true */
1921   /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
1922   _diff = _mm_setzero_si128();
1923 
1924   /* Compare reduced A->G nts  */
1925   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1926   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1927 
1928   if (query_unk_mismatch_local_p) {
1929     _diff = _mm_or_si128(_query_flags, _diff);
1930   } else {
1931     _diff = _mm_andnot_si128(_query_flags, _diff);
1932   }
1933 
1934   if (genome_unk_mismatch_p) {
1935     _diff = _mm_or_si128(_ref_flags, _diff);
1936   } else {
1937     _diff = _mm_andnot_si128(_ref_flags, _diff);
1938   }
1939 
1940   return _diff;
1941 }
1942 #endif
1943 
1944 #ifdef HAVE_AVX2
1945 /* Convert A->G: new high = high | ~low */
1946 static __m256i
block_diff_a2iag_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1947 block_diff_a2iag_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1948 		      bool query_unk_mismatch_local_p) {
1949   __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1950 
1951   read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
1952   read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1953 
1954   /* sarrayp == true */
1955   /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
1956   _diff = _mm256_setzero_si256();
1957 
1958   /* Compare reduced A->G nts  */
1959   _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_mm256_andnot_si256(_query_high, _query_low), _mm256_andnot_si256(_ref_high, _ref_low)));
1960   _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_query_low, _ref_low));
1961 
1962   if (query_unk_mismatch_local_p) {
1963     _diff = _mm256_or_si256(_query_flags, _diff);
1964   } else {
1965     _diff = _mm256_andnot_si256(_query_flags, _diff);
1966   }
1967 
1968   if (genome_unk_mismatch_p) {
1969     _diff = _mm256_or_si256(_ref_flags, _diff);
1970   } else {
1971     _diff = _mm256_andnot_si256(_ref_flags, _diff);
1972   }
1973 
1974   return _diff;
1975 }
1976 #endif
1977 
1978 #ifdef HAVE_AVX512
1979 /* Convert A->G: new high = high | ~low */
1980 static __m512i
block_diff_a2iag_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1981 block_diff_a2iag_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1982 		      bool query_unk_mismatch_local_p) {
1983   __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1984 
1985   read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
1986   read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1987 
1988   /* sarrayp == true */
1989   /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
1990   _diff = _mm512_setzero_si512();
1991 
1992   /* Compare reduced A->G nts  */
1993   _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_mm512_andnot_si512(_query_high, _query_low), _mm512_andnot_si512(_ref_high, _ref_low)));
1994   _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_query_low, _ref_low));
1995 
1996   if (query_unk_mismatch_local_p) {
1997     _diff = _mm512_or_si512(_query_flags, _diff);
1998   } else {
1999     _diff = _mm512_andnot_si512(_query_flags, _diff);
2000   }
2001 
2002   if (genome_unk_mismatch_p) {
2003     _diff = _mm512_or_si512(_ref_flags, _diff);
2004   } else {
2005     _diff = _mm512_andnot_si512(_ref_flags, _diff);
2006   }
2007 
2008   return _diff;
2009 }
2010 #endif
2011 
2012 
2013 static UINT4
block_diff_a2itc_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)2014 block_diff_a2itc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2015 		     bool query_unk_mismatch_local_p) {
2016   UINT4 diff;
2017 
2018   /* sarrayp == true */
2019   /* Ignore genome-C to query-T mismatches */
2020   diff = 0U;
2021 
2022   /* Compare reduced T->C nts  */
2023 #ifdef WORDS_BIGENDIAN
2024   diff |= ((query_shifted[0] & ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) & ~(Bigendian_convert_uint(ref_ptr[4])))) |
2025     (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
2026 #elif !defined(HAVE_SSE2)
2027   diff |= ((query_shifted[0] & ~(query_shifted[1])) ^ (ref_ptr[0] & ~(ref_ptr[4]))) | (query_shifted[1] ^ ref_ptr[4]);
2028 #else
2029   diff |= ((query_shifted[0] & ~(query_shifted[4])) ^ (ref_ptr[0] & ~(ref_ptr[4]))) | (query_shifted[4] ^ ref_ptr[4]);
2030 #endif
2031   debug(printf(" => diff %08X\n",diff));
2032 
2033   /* Flags: Considering N as a mismatch */
2034 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
2035   if (query_unk_mismatch_local_p) {
2036     debug(printf("Marking query flags: query %08X ",query_shifted[8]));
2037     diff |= query_shifted[8];
2038   } else {
2039     debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
2040     diff &= ~(query_shifted[8]);
2041   }
2042 #else
2043   if (query_unk_mismatch_local_p) {
2044     debug(printf("Marking query flags: query %08X ",query_shifted[2]));
2045     diff |= query_shifted[2];
2046   } else {
2047     debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
2048     diff &= ~(query_shifted[2]);
2049   }
2050 #endif
2051 
2052   if (genome_unk_mismatch_p) {
2053     debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
2054 #ifdef WORDS_BIGENDIAN
2055     diff |= Bigendian_convert_uint(ref_ptr[8]);
2056 #else
2057     diff |= (ref_ptr[8]);
2058 #endif
2059   } else {
2060     debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
2061 #ifdef WORDS_BIGENDIAN
2062     diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
2063 #else
2064     diff &= ~(ref_ptr[8]);
2065 #endif
2066   }
2067   debug(printf(" => diff %08X\n",diff));
2068 
2069   return diff;
2070 }
2071 
2072 
2073 #ifdef HAVE_SSE2
2074 /* Convert T->C: new high = high & ~low */
2075 static __m128i
block_diff_a2itc_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)2076 block_diff_a2itc_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2077 		      bool query_unk_mismatch_local_p) {
2078   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2079 
2080   _query_high = _mm_load_si128((__m128i *) query_shifted);
2081   _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
2082   _ref_high = _mm_load_si128((__m128i *) ref_ptr);
2083   _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
2084 
2085   /* sarrayp == true */
2086   /* Ignore genome-C to query-T mismatches */
2087   _diff = _mm_setzero_si128();
2088 
2089   /* Compare reduced T->C nts  */
2090   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2091   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2092 
2093   _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
2094   if (query_unk_mismatch_local_p) {
2095     _diff = _mm_or_si128(_query_flags, _diff);
2096   } else {
2097     _diff = _mm_andnot_si128(_query_flags, _diff);
2098   }
2099 
2100   _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
2101   if (genome_unk_mismatch_p) {
2102     _diff = _mm_or_si128(_ref_flags, _diff);
2103   } else {
2104     _diff = _mm_andnot_si128(_ref_flags, _diff);
2105   }
2106 
2107   return _diff;
2108 }
2109 
2110 #ifdef USE_SHIFT_HILO
2111 static __m128i
block_diff_a2itc_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)2112 block_diff_a2itc_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2113 			       bool query_unk_mismatch_local_p, int startcolumni) {
2114   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2115 
2116   read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
2117   read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
2118 
2119   /* sarrayp == true */
2120   /* Ignore genome-C to query-T mismatches */
2121   _diff = _mm_setzero_si128();
2122 
2123   /* Compare reduced T->C nts  */
2124   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2125   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2126 
2127   if (query_unk_mismatch_local_p) {
2128     _diff = _mm_or_si128(_query_flags, _diff);
2129   } else {
2130     _diff = _mm_andnot_si128(_query_flags, _diff);
2131   }
2132 
2133   if (genome_unk_mismatch_p) {
2134     _diff = _mm_or_si128(_ref_flags, _diff);
2135   } else {
2136     _diff = _mm_andnot_si128(_ref_flags, _diff);
2137   }
2138 
2139   return _diff;
2140 }
2141 
2142 static __m128i
block_diff_a2itc_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)2143 block_diff_a2itc_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2144 			       bool query_unk_mismatch_local_p, int endcolumni) {
2145   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2146 
2147   read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
2148   read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
2149 
2150   /* sarrayp == true */
2151   /* Ignore genome-C to query-T mismatches */
2152   _diff = _mm_setzero_si128();
2153 
2154   /* Compare reduced T->C nts  */
2155   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2156   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2157 
2158   if (query_unk_mismatch_local_p) {
2159     _diff = _mm_or_si128(_query_flags, _diff);
2160   } else {
2161     _diff = _mm_andnot_si128(_query_flags, _diff);
2162   }
2163 
2164   if (genome_unk_mismatch_p) {
2165     _diff = _mm_or_si128(_ref_flags, _diff);
2166   } else {
2167     _diff = _mm_andnot_si128(_ref_flags, _diff);
2168   }
2169 
2170   return _diff;
2171 }
2172 #endif
2173 #endif
2174 
2175 #ifdef HAVE_SSSE3
2176 /* Convert T->C: new high = high & ~low */
2177 static __m128i
block_diff_a2itc_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)2178 block_diff_a2itc_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2179 			      bool query_unk_mismatch_local_p, int startcolumni) {
2180   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2181 
2182   read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
2183   read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
2184 
2185   /* sarrayp == true */
2186   /* Ignore genome-C to query-T mismatches */
2187   _diff = _mm_setzero_si128();
2188 
2189   /* Compare reduced T->C nts  */
2190   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2191   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2192 
2193   if (query_unk_mismatch_local_p) {
2194     _diff = _mm_or_si128(_query_flags, _diff);
2195   } else {
2196     _diff = _mm_andnot_si128(_query_flags, _diff);
2197   }
2198 
2199   if (genome_unk_mismatch_p) {
2200     _diff = _mm_or_si128(_ref_flags, _diff);
2201   } else {
2202     _diff = _mm_andnot_si128(_ref_flags, _diff);
2203   }
2204 
2205   return _diff;
2206 }
2207 
2208 static __m128i
block_diff_a2itc_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)2209 block_diff_a2itc_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2210 			      bool query_unk_mismatch_local_p, int endcolumni) {
2211   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2212 
2213   read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
2214   read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
2215 
2216   /* sarrayp == true */
2217   /* Ignore genome-C to query-T mismatches */
2218   _diff = _mm_setzero_si128();
2219 
2220   /* Compare reduced T->C nts  */
2221   _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2222   _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2223 
2224   if (query_unk_mismatch_local_p) {
2225     _diff = _mm_or_si128(_query_flags, _diff);
2226   } else {
2227     _diff = _mm_andnot_si128(_query_flags, _diff);
2228   }
2229 
2230   if (genome_unk_mismatch_p) {
2231     _diff = _mm_or_si128(_ref_flags, _diff);
2232   } else {
2233     _diff = _mm_andnot_si128(_ref_flags, _diff);
2234   }
2235 
2236   return _diff;
2237 }
2238 #endif
2239 
2240 #ifdef HAVE_AVX2
2241 /* Convert T->C: new high = high & ~low */
2242 static __m256i
block_diff_a2itc_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)2243 block_diff_a2itc_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2244 		      bool query_unk_mismatch_local_p) {
2245   __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2246 
2247   read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
2248   read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
2249 
2250   /* sarrayp == true */
2251   /* Ignore genome-C to query-T mismatches */
2252   _diff = _mm256_setzero_si256();
2253 
2254   /* Compare reduced T->C nts  */
2255   _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_mm256_andnot_si256(_query_low, _query_high), _mm256_andnot_si256(_ref_low, _ref_high)));
2256   _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_query_low, _ref_low));
2257 
2258   if (query_unk_mismatch_local_p) {
2259     _diff = _mm256_or_si256(_query_flags, _diff);
2260   } else {
2261     _diff = _mm256_andnot_si256(_query_flags, _diff);
2262   }
2263 
2264   if (genome_unk_mismatch_p) {
2265     _diff = _mm256_or_si256(_ref_flags, _diff);
2266   } else {
2267     _diff = _mm256_andnot_si256(_ref_flags, _diff);
2268   }
2269 
2270   return _diff;
2271 }
2272 #endif
2273 
2274 #ifdef HAVE_AVX512
2275 /* Convert T->C: new high = high & ~low */
2276 static __m512i
block_diff_a2itc_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)2277 block_diff_a2itc_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2278 		      bool query_unk_mismatch_local_p) {
2279   __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2280 
2281   read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
2282   read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
2283 
2284   /* sarrayp == true */
2285   /* Ignore genome-C to query-T mismatches */
2286   _diff = _mm512_setzero_si512();
2287 
2288   /* Compare reduced T->C nts  */
2289   _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_mm512_andnot_si512(_query_low, _query_high), _mm512_andnot_si512(_ref_low, _ref_high)));
2290   _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_query_low, _ref_low));
2291 
2292   if (query_unk_mismatch_local_p) {
2293     _diff = _mm512_or_si512(_query_flags, _diff);
2294   } else {
2295     _diff = _mm512_andnot_si512(_query_flags, _diff);
2296   }
2297 
2298   if (genome_unk_mismatch_p) {
2299     _diff = _mm512_or_si512(_ref_flags, _diff);
2300   } else {
2301     _diff = _mm512_andnot_si512(_ref_flags, _diff);
2302   }
2303 
2304   return _diff;
2305 }
2306 #endif
2307 
2308 
2309 static UINT4
block_diff_atoi_sarray_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2310 block_diff_atoi_sarray_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2311 			   bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2312   if (genestrand == +2) {
2313     if (plusp) {
2314       return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2315     } else {
2316       return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2317     }
2318   } else {
2319     if (plusp) {
2320       return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2321     } else {
2322       return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2323     }
2324   }
2325 }
2326 
2327 #ifdef HAVE_SSE2
2328 static __m128i
block_diff_atoi_sarray_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2329 block_diff_atoi_sarray_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2330 			    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2331   if (genestrand == +2) {
2332     if (plusp) {
2333       return block_diff_a2itc_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2334     } else {
2335       return block_diff_a2iag_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2336     }
2337   } else {
2338     if (plusp) {
2339       return block_diff_a2iag_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2340     } else {
2341       return block_diff_a2itc_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2342     }
2343   }
2344 }
2345 
2346 #ifdef USE_SHIFT_HILO
2347 static __m128i
block_diff_atoi_sarray_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)2348 block_diff_atoi_sarray_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2349 				     bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2350 				     int startcolumni) {
2351   if (genestrand == +2) {
2352     if (plusp) {
2353       return block_diff_a2itc_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2354 					   startcolumni);
2355     } else {
2356       return block_diff_a2iag_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2357 					   startcolumni);
2358     }
2359   } else {
2360     if (plusp) {
2361       return block_diff_a2iag_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2362 					   startcolumni);
2363     } else {
2364       return block_diff_a2itc_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2365 					   startcolumni);
2366     }
2367   }
2368 }
2369 
2370 static __m128i
block_diff_atoi_sarray_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)2371 block_diff_atoi_sarray_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2372 				     bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2373 				     int endcolumni) {
2374   if (genestrand == +2) {
2375     if (plusp) {
2376       return block_diff_a2itc_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2377 					   endcolumni);
2378     } else {
2379       return block_diff_a2iag_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2380 					   endcolumni);
2381     }
2382   } else {
2383     if (plusp) {
2384       return block_diff_a2iag_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2385 					   endcolumni);
2386     } else {
2387       return block_diff_a2itc_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2388 					   endcolumni);
2389     }
2390   }
2391 }
2392 #endif
2393 #endif
2394 
2395 #ifdef HAVE_SSSE3
2396 static __m128i
block_diff_atoi_sarray_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)2397 block_diff_atoi_sarray_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2398 				    bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2399 				    int startcolumni) {
2400   if (genestrand == +2) {
2401     if (plusp) {
2402       return block_diff_a2itc_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2403 					  startcolumni);
2404     } else {
2405       return block_diff_a2iag_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2406 					  startcolumni);
2407     }
2408   } else {
2409     if (plusp) {
2410       return block_diff_a2iag_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2411 					  startcolumni);
2412     } else {
2413       return block_diff_a2itc_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2414 					  startcolumni);
2415     }
2416   }
2417 }
2418 
2419 static __m128i
block_diff_atoi_sarray_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)2420 block_diff_atoi_sarray_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2421 				    bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2422 				    int endcolumni) {
2423   if (genestrand == +2) {
2424     if (plusp) {
2425       return block_diff_a2itc_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2426 					  endcolumni);
2427     } else {
2428       return block_diff_a2iag_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2429 					  endcolumni);
2430     }
2431   } else {
2432     if (plusp) {
2433       return block_diff_a2iag_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2434 					  endcolumni);
2435     } else {
2436       return block_diff_a2itc_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2437 					  endcolumni);
2438     }
2439   }
2440 }
2441 #endif
2442 
2443 #ifdef HAVE_AVX2
2444 static __m256i
block_diff_atoi_sarray_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2445 block_diff_atoi_sarray_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2446 			    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2447   if (genestrand == +2) {
2448     if (plusp) {
2449       return block_diff_a2itc_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2450     } else {
2451       return block_diff_a2iag_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2452     }
2453   } else {
2454     if (plusp) {
2455       return block_diff_a2iag_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2456     } else {
2457       return block_diff_a2itc_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2458     }
2459   }
2460 }
2461 #endif
2462 
2463 #ifdef HAVE_AVX512
2464 static __m512i
block_diff_atoi_sarray_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2465 block_diff_atoi_sarray_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2466 			    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2467   if (genestrand == +2) {
2468     if (plusp) {
2469       return block_diff_a2itc_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2470     } else {
2471       return block_diff_a2iag_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2472     }
2473   } else {
2474     if (plusp) {
2475       return block_diff_a2iag_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2476     } else {
2477       return block_diff_a2itc_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2478     }
2479   }
2480 }
2481 #endif
2482 
2483 
2484 
2485 /************************************************************************
2486  *  TTOC
2487  ************************************************************************/
2488 
2489 static UINT4
block_diff_ttoc_sarray_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2490 block_diff_ttoc_sarray_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2491 			   bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2492   if (genestrand == +2) {
2493     if (plusp) {
2494       return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2495     } else {
2496       return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2497     }
2498   } else {
2499     if (plusp) {
2500       return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2501     } else {
2502       return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2503     }
2504   }
2505 }
2506 
2507 #ifdef HAVE_SSE2
2508 static __m128i
block_diff_ttoc_sarray_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2509 block_diff_ttoc_sarray_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2510 			    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2511   if (genestrand == +2) {
2512     if (plusp) {
2513       return block_diff_a2iag_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2514     } else {
2515       return block_diff_a2itc_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2516     }
2517   } else {
2518     if (plusp) {
2519       return block_diff_a2itc_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2520     } else {
2521       return block_diff_a2iag_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2522     }
2523   }
2524 }
2525 
2526 #ifdef USE_SHIFT_HILO
2527 static __m128i
block_diff_ttoc_sarray_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)2528 block_diff_ttoc_sarray_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2529 				     bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2530 				     int startcolumni) {
2531   if (genestrand == +2) {
2532     if (plusp) {
2533       return block_diff_a2iag_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2534 					   startcolumni);
2535     } else {
2536       return block_diff_a2itc_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2537 					   startcolumni);
2538     }
2539   } else {
2540     if (plusp) {
2541       return block_diff_a2itc_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2542 					   startcolumni);
2543     } else {
2544       return block_diff_a2iag_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2545 					   startcolumni);
2546     }
2547   }
2548 }
2549 
2550 static __m128i
block_diff_ttoc_sarray_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)2551 block_diff_ttoc_sarray_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2552 				     bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2553 				     int endcolumni) {
2554   if (genestrand == +2) {
2555     if (plusp) {
2556       return block_diff_a2iag_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2557 					   endcolumni);
2558     } else {
2559       return block_diff_a2itc_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2560 					   endcolumni);
2561     }
2562   } else {
2563     if (plusp) {
2564       return block_diff_a2itc_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2565 					   endcolumni);
2566     } else {
2567       return block_diff_a2iag_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2568 					   endcolumni);
2569     }
2570   }
2571 }
2572 #endif
2573 #endif
2574 
2575 #ifdef HAVE_SSSE3
2576 static __m128i
block_diff_ttoc_sarray_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)2577 block_diff_ttoc_sarray_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2578 				    bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2579 				    int startcolumni) {
2580   if (genestrand == +2) {
2581     if (plusp) {
2582       return block_diff_a2iag_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2583 					  startcolumni);
2584     } else {
2585       return block_diff_a2itc_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2586 					  startcolumni);
2587     }
2588   } else {
2589     if (plusp) {
2590       return block_diff_a2itc_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2591 					  startcolumni);
2592     } else {
2593       return block_diff_a2iag_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2594 					  startcolumni);
2595     }
2596   }
2597 }
2598 
2599 static __m128i
block_diff_ttoc_sarray_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)2600 block_diff_ttoc_sarray_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2601 				    bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2602 				    int endcolumni) {
2603   if (genestrand == +2) {
2604     if (plusp) {
2605       return block_diff_a2iag_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2606 					  endcolumni);
2607     } else {
2608       return block_diff_a2itc_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2609 					  endcolumni);
2610     }
2611   } else {
2612     if (plusp) {
2613       return block_diff_a2itc_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2614 					  endcolumni);
2615     } else {
2616       return block_diff_a2iag_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2617 					  endcolumni);
2618     }
2619   }
2620 }
2621 #endif
2622 
2623 #ifdef HAVE_AVX2
2624 static __m256i
block_diff_ttoc_sarray_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2625 block_diff_ttoc_sarray_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2626 			    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2627   if (genestrand == +2) {
2628     if (plusp) {
2629       return block_diff_a2iag_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2630     } else {
2631       return block_diff_a2itc_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2632     }
2633   } else {
2634     if (plusp) {
2635       return block_diff_a2itc_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2636     } else {
2637       return block_diff_a2iag_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2638     }
2639   }
2640 }
2641 #endif
2642 
2643 #ifdef HAVE_AVX512
2644 static __m512i
block_diff_ttoc_sarray_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2645 block_diff_ttoc_sarray_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2646 			    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2647   if (genestrand == +2) {
2648     if (plusp) {
2649       return block_diff_a2iag_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2650     } else {
2651       return block_diff_a2itc_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2652     }
2653   } else {
2654     if (plusp) {
2655       return block_diff_a2itc_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2656     } else {
2657       return block_diff_a2iag_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2658     }
2659   }
2660 }
2661 #endif
2662 
2663 
2664 /* query_shifted, ref_ptr, plusp, genestrand, query_unk_mismatch_local_p */
2665 #ifdef HAVE_AVX512
2666 typedef __m512i (*Diffproc_512_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool);
2667 #endif
2668 
2669 #ifdef HAVE_AVX2
2670 typedef __m256i (*Diffproc_256_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool);
2671 #endif
2672 
2673 #ifdef HAVE_SSSE3
2674 typedef __m128i (*Diffproc_128_wrap_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool, int);
2675 #endif
2676 
2677 #ifdef HAVE_SSE2
2678 #ifdef USE_SHIFT_HILO
2679 typedef __m128i (*Diffproc_128_shift_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool, int);
2680 #endif
2681 
2682 typedef __m128i (*Diffproc_128_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool);
2683 #endif
2684 
2685 typedef UINT4 (*Diffproc_32_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool);
2686 
2687 
2688 /* For CMET and ATOI, ignores genome-to-query mismatches.  Used by
2689    Genome_consecutive procedures, called only by sarray-read.c */
2690 #ifdef HAVE_AVX512
2691 static Diffproc_512_T block_diff_sarray_512;
2692 #endif
2693 #ifdef HAVE_AVX2
2694 static Diffproc_256_T block_diff_sarray_256;
2695 #endif
2696 #ifdef HAVE_SSSE3
2697 static Diffproc_128_wrap_T block_diff_sarray_128_wrap_lo;
2698 static Diffproc_128_wrap_T block_diff_sarray_128_wrap_hi;
2699 #endif
2700 #ifdef HAVE_SSE2
2701 static Diffproc_128_T block_diff_sarray_128;
2702 #ifdef USE_SHIFT_HILO
2703 static Diffproc_128_shift_T block_diff_sarray_128_shift_lo;
2704 static Diffproc_128_shift_T block_diff_sarray_128_shift_hi;
2705 #endif
2706 #endif
2707 static Diffproc_32_T block_diff_sarray_32;
2708 
2709 #ifdef HAVE_AVX512
2710 static __m512i _BOUND_HIGH_512;
2711 static __m512i _BOUND_LOW_512;
2712 #endif
2713 #ifdef HAVE_AVX2
2714 static __m256i _BOUND_HIGH_256;
2715 static __m256i _BOUND_LOW_256;
2716 #endif
2717 #ifdef HAVE_SSE2
2718 static __m128i _BOUND_HIGH;
2719 static __m128i _BOUND_LOW;
2720 #endif
2721 
2722 void
Genome_consec_setup(bool query_unk_mismatch_p_in,bool genome_unk_mismatch_p_in,Mode_T mode)2723 Genome_consec_setup (bool query_unk_mismatch_p_in, bool genome_unk_mismatch_p_in,
2724 		     Mode_T mode) {
2725 
2726 #ifdef HAVE_AVX512
2727   _BOUND_HIGH_512 = _mm512_set_epi32(512,480,448,416, 384,352,320,288, 256,224,192,160, 128,96,64,32);
2728   _BOUND_LOW_512 = _mm512_set_epi32(480,448,416,384, 352,320,288,256, 224,192,160,128, 96,64,32,0);
2729 #endif
2730 #ifdef HAVE_AVX2
2731   _BOUND_HIGH_256 = _mm256_set_epi32(256,224,192,160,128,96,64,32);
2732   _BOUND_LOW_256 = _mm256_set_epi32(224,192,160,128,96,64,32,0);
2733 #endif
2734 #ifdef HAVE_SSE2
2735   _BOUND_HIGH = _mm_set_epi32(128,96,64,32);
2736   _BOUND_LOW = _mm_set_epi32(96,64,32,0);
2737 #endif
2738 
2739   query_unk_mismatch_p = query_unk_mismatch_p_in;
2740   genome_unk_mismatch_p = genome_unk_mismatch_p_in;
2741 
2742   switch (mode) {
2743   case STANDARD:
2744 #ifdef HAVE_AVX512
2745     block_diff_sarray_512 = block_diff_standard_512;
2746 #endif
2747 #ifdef HAVE_AVX2
2748     block_diff_sarray_256 = block_diff_standard_256;
2749 #endif
2750 #ifdef HAVE_SSSE3
2751     block_diff_sarray_128_wrap_lo = block_diff_standard_128_wrap_lo;
2752     block_diff_sarray_128_wrap_hi = block_diff_standard_128_wrap_hi;
2753 #endif
2754 #ifdef HAVE_SSE2
2755     block_diff_sarray_128 = block_diff_standard_128;
2756 #ifdef USE_SHIFT_HILO
2757     block_diff_sarray_128_shift_lo = block_diff_standard_128_shift_lo;
2758     block_diff_sarray_128_shift_hi = block_diff_standard_128_shift_hi;
2759 #endif
2760 #endif
2761     block_diff_sarray_32 = block_diff_standard_32;
2762     break;
2763 
2764   case CMET_STRANDED: case CMET_NONSTRANDED:
2765 #ifdef HAVE_AVX512
2766     block_diff_sarray_512 = block_diff_cmet_sarray_512;
2767 #endif
2768 #ifdef HAVE_AVX2
2769     block_diff_sarray_256 = block_diff_cmet_sarray_256;
2770 #endif
2771 #ifdef HAVE_SSSE3
2772     block_diff_sarray_128_wrap_lo = block_diff_cmet_sarray_128_wrap_lo;
2773     block_diff_sarray_128_wrap_hi = block_diff_cmet_sarray_128_wrap_hi;
2774 #endif
2775 #ifdef HAVE_SSE2
2776     block_diff_sarray_128 = block_diff_cmet_sarray_128;
2777 #ifdef USE_SHIFT_HILO
2778     block_diff_sarray_128_shift_lo = block_diff_cmet_128_shift_lo;
2779     block_diff_sarray_128_shift_hi = block_diff_cmet_128_shift_hi;
2780 #endif
2781 #endif
2782     block_diff_sarray_32 = block_diff_cmet_sarray_32;
2783     break;
2784 
2785   case ATOI_STRANDED: case ATOI_NONSTRANDED:
2786 #ifdef HAVE_AVX512
2787     block_diff_sarray_512 = block_diff_atoi_sarray_512;
2788 #endif
2789 #ifdef HAVE_AVX2
2790     block_diff_sarray_256 = block_diff_atoi_sarray_256;
2791 #endif
2792 #ifdef HAVE_SSSE3
2793     block_diff_sarray_128_wrap_lo = block_diff_atoi_sarray_128_wrap_lo;
2794     block_diff_sarray_128_wrap_hi = block_diff_atoi_sarray_128_wrap_hi;
2795 #endif
2796 #ifdef HAVE_SSE2
2797     block_diff_sarray_128 = block_diff_atoi_sarray_128;
2798 #ifdef USE_SHIFT_HILO
2799     block_diff_sarray_128_shift_lo = block_diff_atoi_128_shift_lo;
2800     block_diff_sarray_128_shift_hi = block_diff_atoi_128_shift_hi;
2801 #endif
2802 #endif
2803     block_diff_sarray_32 = block_diff_atoi_sarray_32;
2804     break;
2805 
2806   case TTOC_STRANDED: case TTOC_NONSTRANDED:
2807 #ifdef HAVE_AVX512
2808     block_diff_sarray_512 = block_diff_ttoc_sarray_512;
2809 #endif
2810 #ifdef HAVE_AVX2
2811     block_diff_sarray_256 = block_diff_ttoc_sarray_256;
2812 #endif
2813 #ifdef HAVE_SSSE3
2814     block_diff_sarray_128_wrap_lo = block_diff_ttoc_sarray_128_wrap_lo;
2815     block_diff_sarray_128_wrap_hi = block_diff_ttoc_sarray_128_wrap_hi;
2816 #endif
2817 #ifdef HAVE_SSE2
2818     block_diff_sarray_128 = block_diff_ttoc_sarray_128;
2819 #ifdef USE_SHIFT_HILO
2820     block_diff_sarray_128_shift_lo = block_diff_ttoc_128_shift_lo;
2821     block_diff_sarray_128_shift_hi = block_diff_ttoc_128_shift_hi;
2822 #endif
2823 #endif
2824     block_diff_sarray_32 = block_diff_ttoc_sarray_32;
2825     break;
2826 
2827   default: fprintf(stderr,"Mode %d not recognized\n",mode); abort();
2828   }
2829 
2830   return;
2831 }
2832 
2833 
2834 /************************************************************************/
2835 
2836 #ifdef HAVE_AVX512
2837 /* Need to implement.  Extract procedures not available. */
2838 #endif
2839 
2840 
2841 #ifdef HAVE_AVX2
2842 #define nonzero_p_256(diff) !_mm256_testz_si256(diff,diff)
2843 
2844 
2845 #if defined(HAVE_POPCNT)
2846 #define popcount_ones_256(_diff) (_popcnt64(_mm256_extract_epi64(_diff,0)) + _popcnt64(_mm256_extract_epi64(_diff,1)) + popcnt64(_mm256_extract_epi64(_diff,2)) + _popcnt64(_mm256_extract_epi64(_diff,3)))
2847 #elif defined(HAVE_MM_POPCNT)
2848 #define popcount_ones_256(_diff) (_mm_popcnt_u64(_mm256_extract_epi64(_diff,0)) + _mm_popcnt_u64(_mm256_extract_epi64(_diff,1)) + _mm_popcnt_u64(_mm256_extract_epi64(_diff,2)) + _mm_popcnt_u64(_mm256_extract_epi64(_diff,3)))
2849 #else
2850 #define popcount_ones_256(_diff) (__builtin_popcountll(_mm256_extract_epi64(_diff,0)) + __builtin_popcountll(_mm256_extract_epi64(_diff,1)) + __builtin_popcountll(_mm256_extract_epi64(_diff,2)) + __builtin_popcountll(_mm256_extract_epi64(_diff,3)))
2851 #endif
2852 
2853 static int
count_leading_zeroes_256(__m256i _diff)2854 count_leading_zeroes_256 (__m256i _diff) {
2855   debug4(printf("Entered count_leading_zeroes with "));
2856   debug4(print_vector_256_hex(_diff));
2857   UINT8 x;
2858 
2859 #ifdef HAVE_LZCNT
2860   if ((x = _mm256_extract_epi64(_diff,3)) != 0) {
2861     return (int) _lzcnt_u64(x);
2862   } else if ((x = _mm256_extract_epi64(_diff,2)) != 0) {
2863     return 64 + (int) _lzcnt_u64(x);
2864   } else if ((x = _mm256_extract_epi64(_diff,1)) != 0) {
2865     return 128 + (int) _lzcnt_u64(x);
2866   } else {
2867     return 192 + (int) _lzcnt_u64(_mm256_extract_epi64(_diff,0));
2868   }
2869 
2870 #elif defined(HAVE_BUILTIN_CLZ)
2871   if ((x = _mm256_extract_epi64(_diff,3)) != 0) {
2872     return (int) __builtin_clzll(x);
2873   } else if ((x = _mm256_extract_epi64(_diff,2)) != 0) {
2874     return 64 + (int) __builtin_clzll(x);
2875   } else if ((x = _mm256_extract_epi64(_diff,1)) != 0) {
2876     return 128 + (int) __builtin_clzll(x);
2877   } else {
2878     return 192 + (int) __builtin_clzll(_mm256_extract_epi64(_diff,0));
2879   }
2880 
2881 #else
2882   abort();
2883 #endif
2884 }
2885 
2886 static int
count_trailing_zeroes_256(__m256i _diff)2887 count_trailing_zeroes_256 (__m256i _diff) {
2888   debug4(printf("Entered count_trailing_zeroes with "));
2889   debug4(print_vector_256_hex(_diff));
2890   UINT8 x;
2891 
2892 #ifdef HAVE_TZCNT
2893   if ((x = _mm256_extract_epi64(_diff,0)) != 0) {
2894     return (int) _tzcnt_u64(x);
2895   } else if ((x = _mm256_extract_epi64(_diff,1)) != 0) {
2896     return 64 + (int) _tzcnt_u64(x);
2897   } else if ((x = _mm256_extract_epi64(_diff,2)) != 0) {
2898     return 128 + (int) _tzcnt_u64(x);
2899   } else {
2900     return 192 + (int) _tzcnt_u64(_mm256_extract_epi64(_diff,3));
2901   }
2902 
2903 #elif defined(HAVE_BUILTIN_CTZ)
2904   if ((x = _mm256_extract_epi64(_diff,0)) != 0) {
2905     return (int) __builtin_ctzll(x);
2906   } else if ((x = _mm256_extract_epi64(_diff,1)) != 0) {
2907     return 64 + (int) __builtin_ctzll(x);
2908   } else if ((x = _mm256_extract_epi64(_diff,2)) != 0) {
2909     return 128 + (int) __builtin_ctzll(x);
2910   } else {
2911     return 192 + (int) __builtin_ctzll(_mm256_extract_epi64(_diff,3));
2912   }
2913 
2914 #else
2915   abort();
2916 #endif
2917 
2918 }
2919 
2920 static __m256i
clear_highbit_256(__m256i _diff,int leading_zeroes)2921 clear_highbit_256 (__m256i _diff, int leading_zeroes) {
2922   __m256i _subtract, _relpos;
2923   int relpos;
2924 
2925   relpos = 255 - leading_zeroes;
2926   debug3(printf("Clearing high bit at relpos %d\n",relpos));
2927 
2928   _subtract = _mm256_slli_epi32(_mm256_set1_epi32(1), relpos % 32);
2929   _relpos = _mm256_set1_epi32(relpos);
2930   _subtract = _mm256_and_si256(_mm256_cmpgt_epi32(_BOUND_HIGH_256, _relpos), _subtract);
2931   _subtract = _mm256_andnot_si256(_mm256_cmpgt_epi32(_BOUND_LOW_256, _relpos), _subtract);
2932 
2933   debug3(printf("Subtract: "));
2934   debug3(print_vector_256_hex(_subtract));
2935 #if 0
2936   /* latency 1, throughput: 0.5 */
2937   return _mm256_sub_epi32(_diff, _subtract);
2938 #else
2939   /* _mm256_xor_si128 also works if all other bits are 0.  latency 1, throughput: 0.33 */
2940   return _mm256_xor_si256(_diff, _subtract);
2941 #endif
2942 }
2943 
2944 /* relpos is equal to trailing_zeroes */
2945 static __m256i
clear_lowbit_256(__m256i _diff,int relpos)2946 clear_lowbit_256 (__m256i _diff, int relpos) {
2947   __m256i _subtract, _relpos;
2948 
2949   debug3(printf("Clearing low bit at relpos %d\n",relpos));
2950 
2951   _subtract = _mm256_slli_epi32(_mm256_set1_epi32(1), relpos % 32);
2952   _relpos = _mm256_set1_epi32(relpos);
2953   _subtract = _mm256_and_si256(_mm256_cmpgt_epi32(_BOUND_HIGH_256, _relpos), _subtract);
2954   _subtract = _mm256_andnot_si256(_mm256_cmpgt_epi32(_BOUND_LOW_256, _relpos), _subtract);
2955 
2956   debug3(printf("Subtract: "));
2957   debug3(print_vector_256_hex(_subtract));
2958 #if 0
2959   /* latency 1, throughput: 0.5 */
2960   return _mm256_sub_epi32(_diff, _subtract);
2961 #else
2962   /* _mm256_xor_si128 also works if all other bits are 0.  latency 1, throughput: 0.33 */
2963   return _mm256_xor_si256(_diff, _subtract);
2964 #endif
2965 }
2966 
2967 #endif
2968 
2969 
2970 #ifdef HAVE_SSE2
2971 
2972 #ifdef HAVE_SSE4_1
2973 #define nonzero_p_128(diff) !_mm_testz_si128(diff,diff)
2974 #else
2975 #define nonzero_p_128(diff) _mm_movemask_epi8(_mm_cmpeq_epi8(diff,_mm_setzero_si128())) != 0xFFFF
2976 #endif
2977 
2978 #if (defined(USE_SHIFT_FIRST_MISMATCH) && defined(HAVE_SSE2)) || (defined(USE_WRAP_FIRST_MISMATCH) && defined(HAVE_SSSE3))
2979 static __m128i
clear_start_128(__m128i _diff,int startdiscard)2980 clear_start_128 (__m128i _diff, int startdiscard) {
2981   __m128i _mask, _startdiscard;
2982 #ifdef DEBUG
2983   __m128i _result;
2984 #endif
2985 
2986   debug(printf("Clearing start to startdiscard %d\n",startdiscard));
2987   debug(printf("Before: "));
2988   debug(print_vector_hex(_diff));
2989 
2990 #ifdef DEFECTIVE_SSE2_COMPILER
2991   _mask = _mm_sll_epi32(_mm_set1_epi32(~0U), _mm_setr_epi32(startdiscard % 32,0,0,0));
2992 #else
2993   _mask = _mm_slli_epi32(_mm_set1_epi32(~0U), startdiscard % 32);
2994 #endif
2995   _startdiscard = _mm_set1_epi32(startdiscard);
2996   _mask = _mm_or_si128(_mask, _mm_cmplt_epi32(_startdiscard, _BOUND_LOW));
2997   _mask = _mm_and_si128(_mask, _mm_cmplt_epi32(_startdiscard, _BOUND_HIGH));
2998 
2999 #ifdef DEBUG
3000   _result = _mm_and_si128(_mask, _diff);
3001   debug(printf("After:  "));
3002   print_vector_hex(_result);
3003 #endif
3004 
3005   return _mm_and_si128(_mask, _diff);
3006 }
3007 #endif
3008 
3009 #if (defined(USE_SHIFT_FIRST_MISMATCH) && defined(HAVE_SSE2)) || (defined(USE_WRAP_FIRST_MISMATCH) && defined(HAVE_SSSE3))
3010 static __m128i
clear_end_128(__m128i _diff,int enddiscard)3011 clear_end_128 (__m128i _diff, int enddiscard) {
3012   __m128i _mask, _enddiscard;
3013 #ifdef DEBUG
3014   __m128i _result;
3015 #endif
3016 
3017   debug(printf("Clearing end from enddiscard %d\n",enddiscard));
3018   debug(printf("Before: "));
3019   debug(print_vector_hex(_diff));
3020 
3021 #ifdef DEFECTIVE_SSE2_COMPILER
3022   _mask = _mm_sll_epi32(_mm_set1_epi32(~0U), _mm_setr_epi32(enddiscard % 32,0,0,0));
3023 #else
3024   _mask = _mm_slli_epi32(_mm_set1_epi32(~0U), enddiscard % 32);
3025 #endif
3026   _enddiscard = _mm_set1_epi32(enddiscard);
3027   _mask = _mm_or_si128(_mm_cmplt_epi32(_enddiscard, _BOUND_LOW), _mask);
3028   _mask = _mm_and_si128(_mm_cmplt_epi32(_enddiscard, _BOUND_HIGH), _mask);
3029 
3030 #ifdef DEBUG
3031   _result = _mm_andnot_si128(_mask, _diff);
3032   debug(printf("After:  "));
3033   print_vector_hex(_result);
3034 #endif
3035 
3036   return _mm_andnot_si128(_mask, _diff);
3037 }
3038 #endif
3039 
3040 
3041 
3042 #if !defined(HAVE_SSE4_2) || !defined(HAVE_MM_EXTRACT_EPI64)
3043 
3044 #if 0
3045 /* Naive method for pre-SSE4.2.  Requires four popcount operations. */
3046 static int
3047 popcount_ones_128 (__m128i _diff) {
3048   UINT4 diff[4];
3049 
3050   _mm_store_si128((__m128i *) diff,_diff);
3051 
3052   return __builtin_popcount(diff[0]) + __builtin_popcount(diff[1]) + __builtin_popcount(diff[2]) + __builtin_popcount(diff[3]);
3053 }
3054 #endif
3055 
3056 
3057 /************************************************************************
3058  *  Method for pre-SSE4.2: Using Harley's method to reduce number of
3059  *  popcount operations when we need to compute four 32-bit popcounts
3060  *  in a 128-bit register.
3061  *
3062  *  The naive method is to do _popcnt32(diff[0]) + _popcnt32(diff[1]) + _popcnt32(diff[2]) + _popcnt32(diff[3]);
3063  *
3064  *  Harley's method uses a carry-save adder (CSA) to reduce the number
3065  *  of popcount operations for 3 elements from 3 to 2.
3066  ************************************************************************/
3067 
3068 #define CSA(h,l, a,b,c, u,v) u = a ^ b; v = c; h = (a & b) | (u & v); l = u ^ v;
3069 
3070 static int
popcount_ones_128(__m128i _diff)3071 popcount_ones_128 (__m128i _diff) {
3072   UINT4 ones, twos, u, v;
3073   UINT4 diff[4];
3074 
3075   _mm_store_si128((__m128i *) diff,_diff);
3076 
3077   CSA(twos, ones, diff[0], diff[1], diff[2], u, v);
3078 
3079   return 2*__builtin_popcount(twos) + __builtin_popcount(ones) + __builtin_popcount(diff[3]);
3080 }
3081 
3082 
3083 #elif defined(HAVE_POPCNT)
3084 #define popcount_ones_128(_diff) (_popcnt64(_mm_extract_epi64(_diff,0)) + _popcnt64(_mm_extract_epi64(_diff,1)))
3085 #elif defined(HAVE_MM_POPCNT) && defined(HAVE_MM_POPCNT_U64)
3086 #define popcount_ones_128(_diff) (_mm_popcnt_u64(_mm_extract_epi64(_diff,0)) + _mm_popcnt_u64(_mm_extract_epi64(_diff,1)))
3087 #else
3088 #define popcount_ones_128(_diff) (__builtin_popcountll(_mm_extract_epi64(_diff,0)) + __builtin_popcountll(_mm_extract_epi64(_diff,1)))
3089 #endif
3090 
3091 
3092 static int
count_leading_zeroes_128(__m128i _diff)3093 count_leading_zeroes_128 (__m128i _diff) {
3094   debug4(printf("Entered count_leading_zeroes_128 with "));
3095   debug4(print_vector_hex(_diff));
3096 
3097 #if defined(HAVE_SSE4_2) && defined(HAVE_LZCNT) && defined(HAVE_MM_EXTRACT_EPI64)
3098   UINT8 x;
3099 
3100   if ((x = _mm_extract_epi64(_diff,1)) != 0) {
3101     return (int) _lzcnt_u64(x);
3102   } else {
3103     return 64 + (int) _lzcnt_u64(_mm_extract_epi64(_diff,0));
3104   }
3105 
3106 #elif defined(HAVE_SSE4_1) && defined(HAVE_MM_EXTRACT_EPI64)
3107   UINT8 x;
3108 
3109   if ((x = _mm_extract_epi64(_diff,1)) != 0) {
3110     return __builtin_clzll(x);
3111   } else {
3112     return 64 + __builtin_clzll(_mm_extract_epi64(_diff,0));
3113   }
3114 
3115 #else
3116   UINT4 x;
3117 
3118   if ((x = (_mm_extract_epi16(_diff,7) << 16) | (_mm_extract_epi16(_diff,6) & 0x0000FFFF)) != 0) {
3119     debug4(printf("word 3 is non-empty, so returning %d vs %d\n",__builtin_clz(x),__builtin_clz(((UINT4 *) &_diff)[3])));
3120     return __builtin_clz(x);
3121   } else if ((x = (_mm_extract_epi16(_diff,5) << 16) | (_mm_extract_epi16(_diff,4) & 0x0000FFFF)) != 0) {
3122     debug4(printf("word 2 is non-empty, so returning 32 + %d vs %d\n",__builtin_clz(x),__builtin_clz(((UINT4 *) &_diff)[2])));
3123     return 32 + __builtin_clz(x);
3124   } else if ((x = (_mm_extract_epi16(_diff,3) << 16) | (_mm_extract_epi16(_diff,2) & 0x0000FFFF)) != 0) {
3125     debug4(printf("word 1 is non-empty, so returning 64 + %d vs %d\n",__builtin_clz(x),__builtin_clz(((UINT4 *) &_diff)[1])));
3126     return 64 + __builtin_clz(x);
3127   } else {
3128     x = (_mm_extract_epi16(_diff,1) << 16) | (_mm_extract_epi16(_diff,0) & 0x0000FFFF);
3129     debug4(printf("word 0 is non-empty, so returning 96 + %d vs %d\n",__builtin_clz(x),__builtin_clz(((UINT4 *) &_diff)[0])));
3130     return 96 + __builtin_clz(x);
3131   }
3132 #endif
3133 }
3134 
3135 static int
count_trailing_zeroes_128(__m128i _diff)3136 count_trailing_zeroes_128 (__m128i _diff) {
3137   debug4(printf("Entered count_trailing_zeroes_128 with "));
3138   debug4(print_vector_hex(_diff));
3139 
3140 #if defined(HAVE_SSE4_2) && defined(HAVE_TZCNT) && defined(HAVE_MM_EXTRACT_EPI64)
3141   UINT8 x;
3142 
3143   if ((x = _mm_extract_epi64(_diff,0)) != 0) {
3144     return (int) _tzcnt_u64(x);
3145   } else {
3146     return 64 + (int) _tzcnt_u64(_mm_extract_epi64(_diff,1));
3147   }
3148 
3149 #elif defined(HAVE_SSE4_1) && defined(HAVE_MM_EXTRACT_EPI64)
3150   UINT8 x;
3151 
3152   if ((x = _mm_extract_epi64(_diff,0)) != 0) {
3153     return __builtin_ctzll(x);
3154   } else {
3155     return 64 + __builtin_ctzll(_mm_extract_epi64(_diff,1));
3156   }
3157 
3158 #else
3159   UINT4 x;
3160 
3161   if ((x = (_mm_extract_epi16(_diff,1) << 16) | (_mm_extract_epi16(_diff,0) & 0x0000FFFF)) != 0) {
3162     debug4(printf("word 0 is non-empty, so returning %d vs %d\n",__builtin_ctz(x),__builtin_ctz(((UINT4 *) &_diff)[0])));
3163     return __builtin_ctz(x);
3164   } else if ((x = (_mm_extract_epi16(_diff,3) << 16) | (_mm_extract_epi16(_diff,2) & 0x0000FFFF)) != 0) {
3165     debug4(printf("word 1 is non-empty, so returning 32 + %d vs %d\n",__builtin_ctz(x),__builtin_ctz(((UINT4 *) &_diff)[1])));
3166     return 32 + __builtin_ctz(x);
3167   } else if ((x = (_mm_extract_epi16(_diff,5) << 16) | (_mm_extract_epi16(_diff,4) & 0x0000FFFF)) != 0) {
3168     debug4(printf("word 2 is non-empty, so returning 64 + %d vs %d\n",__builtin_ctz(x),__builtin_ctz(((UINT4 *) &_diff)[2])));
3169     return 64 + __builtin_ctz(x);
3170   } else {
3171     x = (_mm_extract_epi16(_diff,7) << 16) | (_mm_extract_epi16(_diff,6) & 0x0000FFFF);
3172     debug4(printf("word 3 is non-empty, so returning 96 + %d vs %d\n",__builtin_ctz(x),__builtin_ctz(((UINT4 *) &_diff)[3])));
3173     return 96 + __builtin_ctz(x);
3174   }
3175 #endif
3176 }
3177 
3178 #endif
3179 
3180 
3181 /*                 76543210 */
3182 #define HIGH_BIT 0x80000000
3183 
3184 #define nonzero_p_32(diff) diff
3185 
3186 #define clear_start_32(diff,startdiscard) (diff & (~0U << (startdiscard)))
3187 #define clear_end_32(diff,enddiscard) (diff & ~(~0U << (enddiscard)))
3188 
3189 /* For trimming */
3190 #define set_start_32(diff,startdiscard) (diff | ~(~0U << startdiscard))
3191 #define set_end_32(diff,enddiscard) (diff | (~0U << enddiscard))
3192 
3193 /* For fragment functions that evaluate only the end 16-mer */
3194 #define clear_start_mask(startdiscard) (~0U << (startdiscard))
3195 #define clear_end_mask(enddiscard) (~(~0U << (enddiscard)))
3196 
3197 
3198 /* Same speed: clear_highbit(diff,relpos) (diff - (HIGH_BIT >> relpos)) */
3199 /* Note: xor assumes that bit at relpos was on */
3200 #define clear_highbit_32(diff,relpos) (diff ^ (HIGH_BIT >> relpos))
3201 
3202 /* Slower: clear_lowbit(diff,relpos) diff -= (1 << relpos) */
3203 #define clear_lowbit_32(diff,relpos) (diff & (diff - 1));
3204 
3205 
3206 #if !defined(HAVE_SSE4_2)
3207 #define popcount_ones_32(diff) (count_bits[diff & 0x0000FFFF] + count_bits[diff >> 16])
3208 #elif defined(HAVE_POPCNT)
3209 #define popcount_ones_32(diff) (_popcnt32(diff))
3210 #elif defined(HAVE_MM_POPCNT)
3211 #define popcount_ones_32(diff) (_mm_popcnt_u32(diff))
3212 #elif defined(HAVE_BUILTIN_POPCOUNT)
3213 #define popcount_ones_32(diff) (__builtin_popcount(diff))
3214 #else
3215 #define popcount_ones_32(diff) (count_bits[diff & 0x0000FFFF] + count_bits[diff >> 16])
3216 #endif
3217 
3218 #if !defined(HAVE_SSE4_2)
3219 #define count_leading_zeroes_32(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff])
3220 #elif defined(HAVE_LZCNT)
3221 #define count_leading_zeroes_32(diff) _lzcnt_u32(diff)
3222 #elif defined(HAVE_BUILTIN_CLZ)
3223 #define count_leading_zeroes_32(diff) __builtin_clz(diff)
3224 #else
3225 #define count_leading_zeroes_32(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff])
3226 #endif
3227 
3228 #if !defined(HAVE_SSE4_2)
3229 #define count_trailing_zeroes_32(diff) mod_37_bit_position[(-diff & diff) % 37]
3230 #elif defined(HAVE_TZCNT)
3231 #define count_trailing_zeroes_32(diff) _tzcnt_u32(diff)
3232 #elif defined(HAVE_BUILTIN_CTZ)
3233 #define count_trailing_zeroes_32(diff) __builtin_ctz(diff)
3234 #else
3235 /* lowbit = -diff & diff */
3236 #define count_trailing_zeroes_32(diff) mod_37_bit_position[(-diff & diff) % 37]
3237 #endif
3238 
3239 
3240 /* Counts matches from pos5 to pos3 up to first mismatch.  Modified from mismatches_left */
3241 int
Genome_consecutive_matches_rightward(Genome_T ome,Compress_T query_compress,Univcoord_T left,int pos5,int pos3,bool plusp,int genestrand)3242 Genome_consecutive_matches_rightward (Genome_T ome, Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
3243 				      bool plusp, int genestrand) {
3244   Genomecomp_T *ref_blocks = Genome_blocks(ome);
3245   int mismatch_position, offset, nshift;
3246   int startdiscard, enddiscard;
3247   Univcoord_T startblocki, endblocki, startblocki_32, endblocki_32;
3248   Genomecomp_T *query_shifted, *ptr, *endptr;
3249   int relpos;
3250   int startcolumni, endcolumni;
3251   UINT4 diff_32;
3252 #ifdef HAVE_SSE2
3253   __m128i diff_128;
3254 #endif
3255 #ifdef HAVE_AVX2
3256   __m256i diff_256;
3257 #endif
3258 
3259   debug(
3260 	printf("\n\n");
3261 	printf("Genome (in consecutive_matches_rightward):\n");
3262 	print_blocks(ref_blocks,left+pos5,left+pos3);
3263 	printf("\n");
3264 	);
3265 
3266 
3267   startblocki = (left+pos5)/128U*12;
3268   startcolumni = ((left+pos5) % 128) / 32;
3269   startblocki_32 = startblocki + startcolumni;
3270 
3271   endblocki = (left+pos3)/128U*12;
3272   endcolumni = ((left+pos3) % 128) / 32;
3273   endblocki_32 = endblocki + endcolumni;
3274 
3275   debug(printf("left = %u, pos5 = %d, pos3 = %d, startblocki = %u, endblocki = %u\n",
3276 	       left,pos5,pos3,startblocki,endblocki));
3277 
3278   nshift = left % STEP_SIZE;
3279   query_shifted = Compress_shift(query_compress,nshift);
3280   debug(printf("Query shifted %d:\n",nshift));
3281   debug(Compress_print_blocks(query_shifted,nshift,pos5,pos3));
3282   query_shifted += (nshift+pos5)/STEP_SIZE*COMPRESS_BLOCKSIZE;
3283 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
3284 #else
3285   query_shifted += startcolumni;
3286 #endif
3287 
3288   startdiscard = (left+pos5) % 32;
3289   enddiscard = (left+pos3) % 32;
3290   offset = -startdiscard + pos5;
3291   ptr = &(ref_blocks[startblocki_32]);
3292   endptr = &(ref_blocks[endblocki_32]);
3293 
3294   if (endblocki_32 == startblocki_32) {
3295     /* Single block */
3296     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
3297 
3298     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3299 				     plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3300     diff_32 = clear_start_32(diff_32,startdiscard);
3301     diff_32 = clear_end_32(diff_32,enddiscard);
3302 
3303     if (nonzero_p_32(diff_32)) {
3304       mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3305       debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3306       return (mismatch_position - pos5);
3307     } else {
3308       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3309       return (pos3 - pos5);
3310     }
3311 
3312   } else if (endblocki == startblocki) {
3313 #if defined(USE_SHIFT_FIRST_MISMATCH) && defined(HAVE_SSE2)
3314     /* Shift */
3315     enddiscard += (endcolumni - startcolumni)*32;
3316     assert(startdiscard == ((left+pos5) % 128) - startcolumni*32);
3317     assert(enddiscard == ((left+pos3) % 128) - startcolumni*32);
3318 
3319     diff_128 = (block_diff_sarray_128_shift_lo)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true,
3320 						startcolumni);
3321     diff_128 = clear_start_128(diff_128,startdiscard);
3322     diff_128 = clear_end_128(diff_128,enddiscard);
3323 
3324     if (nonzero_p_128(diff_128)) {
3325       mismatch_position = offset + (relpos = count_trailing_zeroes_128(diff_128));
3326       debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3327       return (mismatch_position - pos5);
3328     } else {
3329       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3330       return (pos3 - pos5);
3331     }
3332 
3333 #else
3334     /* Start block */
3335     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3336                                      plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3337     diff_32 = clear_start_32(diff_32,startdiscard);
3338 
3339     if (nonzero_p_32(diff_32)) {
3340       mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3341       debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3342       return (mismatch_position - pos5);
3343     }
3344     query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3345     offset += 32;
3346 
3347     /* Single row */
3348     while (++startcolumni < endcolumni) {
3349       diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3350 				       plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3351 
3352       if (nonzero_p_32(diff_32)) {
3353 	mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3354 	debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3355 	return (mismatch_position - pos5);
3356       }
3357       query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3358       offset += 32;
3359     }
3360 
3361     /* End block */
3362     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3363                                      plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3364     diff_32 = clear_end_32(diff_32,enddiscard);
3365 
3366     if (nonzero_p_32(diff_32)) {
3367       mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3368       debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3369       return (mismatch_position - pos5);
3370     } else {
3371       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3372       return (pos3 - pos5);
3373     }
3374 #endif
3375 
3376 #if defined(USE_WRAP_FIRST_MISMATCH) && defined(HAVE_SSSE3)
3377   } else if (endblocki == startblocki + 12 && endcolumni < startcolumni) {
3378     /* Wrap */
3379     enddiscard += (4 + endcolumni - startcolumni)*32;
3380     assert(startdiscard == ((left+pos5) % 128) - startcolumni*32);
3381     assert(enddiscard == ((left+pos3) % 128) + (4 - startcolumni)*32);
3382 
3383     diff_128 = (block_diff_sarray_128_wrap_lo)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true,
3384 					       startcolumni);
3385     diff_128 = clear_start_128(diff_128,startdiscard);
3386     diff_128 = clear_end_128(diff_128,enddiscard);
3387 
3388     if (nonzero_p_128(diff_128)) {
3389       mismatch_position = offset + (relpos = count_trailing_zeroes_128(diff_128));
3390       debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3391       return (mismatch_position - pos5);
3392     } else {
3393       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3394       return (pos3 - pos5);
3395     }
3396 
3397 #endif
3398 
3399   } else {
3400     /* Start block */
3401     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3402                                      plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3403     diff_32 = clear_start_32(diff_32,startdiscard);
3404 
3405     if (nonzero_p_32(diff_32)) {
3406       mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3407       debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3408       return (mismatch_position - pos5);
3409     }
3410     query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3411     offset += 32;
3412 
3413     /* Start row */
3414     while (++startcolumni < 4) {
3415       diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3416 				       plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3417       if (nonzero_p_32(diff_32)) {
3418 	mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3419 	debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3420 	return (mismatch_position - pos5);
3421       }
3422       query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3423       offset += 32;
3424     }
3425 #ifdef HAVE_SSE2
3426     query_shifted += QUERY_NEXTROW;
3427 #endif
3428     ptr += GENOME_NEXTROW;
3429 
3430 
3431     /* Middle rows */
3432 #ifdef HAVE_AVX2
3433     while (ptr + 24 <= endptr) {
3434       diff_256 = (block_diff_sarray_256)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3435 
3436       if (nonzero_p_256(diff_256)) {
3437 	mismatch_position = offset + (relpos = count_trailing_zeroes_256(diff_256));
3438 	debug(printf("returning %d - %d consecutive matches\n",mismatch_position,pos5));
3439 	return (mismatch_position - pos5);
3440       }
3441       query_shifted += 24; ptr += 24;
3442       offset += 256;
3443     }
3444 #endif
3445 
3446 #ifdef HAVE_SSE2
3447     while (ptr + 12 <= endptr) {
3448       diff_128 = (block_diff_sarray_128)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3449 
3450       if (nonzero_p_128(diff_128)) {
3451 	mismatch_position = offset + (relpos = count_trailing_zeroes_128(diff_128));
3452 	debug(printf("returning %d - %d consecutive matches\n",mismatch_position,pos5));
3453 	return (mismatch_position - pos5);
3454       }
3455       query_shifted += 12; ptr += 12;
3456       offset += 128;
3457     }
3458 #else
3459     while (ptr + 12 <= endptr) {
3460       for (startcolumni = 0; startcolumni < 4; startcolumni++) {
3461 	diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3462 					 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3463 	if (nonzero_p_32(diff_32)) {
3464 	  mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3465 	  debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3466 	  return (mismatch_position - pos5);
3467 	}
3468 	query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3469 	offset += 32;
3470       }
3471       /* query_shifted += QUERY_NEXTROW; */ ptr += GENOME_NEXTROW;
3472     }
3473 #endif
3474 
3475     /* End row */
3476     while (ptr < endptr) {
3477       diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3478 				       plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3479 
3480       if (nonzero_p_32(diff_32)) {
3481 	mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3482 	debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3483 	return (mismatch_position - pos5);
3484       }
3485       query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3486       offset += 32;
3487     }
3488 
3489     /* End block */
3490     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3491 				     plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3492     diff_32 = clear_end_32(diff_32,enddiscard);
3493 
3494     if (nonzero_p_32(diff_32)) {
3495       mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3496       debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3497       return (mismatch_position - pos5);
3498     } else {
3499       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3500       return (pos3 - pos5);
3501     }
3502   }
3503 }
3504 
3505 
3506 /* Counts matches from pos3 to pos5 up to first mismatch.  Modified from mismatches_right */
3507 int
Genome_consecutive_matches_leftward(Genome_T ome,Compress_T query_compress,Univcoord_T left,int pos5,int pos3,bool plusp,int genestrand)3508 Genome_consecutive_matches_leftward (Genome_T ome, Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
3509 				     bool plusp, int genestrand) {
3510   Genomecomp_T *ref_blocks = Genome_blocks(ome);
3511   int mismatch_position, offset, relpos, nshift;
3512   int startdiscard, enddiscard;
3513   Univcoord_T startblocki, endblocki, startblocki_32, endblocki_32;
3514   Genomecomp_T *query_shifted, *ptr, *startptr;
3515 #ifndef HAVE_BUILTIN_CLZ
3516   Genomecomp_T top;
3517 #endif
3518   int startcolumni, endcolumni;
3519   UINT4 diff_32;
3520 #ifdef HAVE_SSE2
3521   __m128i diff_128;
3522 #endif
3523 #ifdef HAVE_AVX2
3524   __m256i diff_256;
3525 #endif
3526 
3527   /* static int ncalls = 0; */
3528   /* printf("Number of calls to leftward: %d\n",++ncalls); */
3529 
3530   debug(
3531 	printf("\n\n");
3532 	printf("Genome (in consecutive_matches_leftward):\n");
3533 	print_blocks(ref_blocks,left+pos5,left+pos3);
3534 	printf("\n");
3535 	);
3536 
3537 
3538   startblocki = (left+pos5)/128U*12;
3539   startcolumni = ((left+pos5) % 128) / 32;
3540   startblocki_32 = startblocki + startcolumni;
3541 
3542   endblocki = (left+pos3)/128U*12;
3543   endcolumni = ((left+pos3) % 128) / 32;
3544   endblocki_32 = endblocki + endcolumni;
3545 
3546   debug(printf("left = %u, pos5 = %d, pos3 = %d, startblocki = %u, endblocki = %u\n",
3547 	       left,pos5,pos3,startblocki,endblocki));
3548 
3549   nshift = left % STEP_SIZE;
3550   query_shifted = Compress_shift(query_compress,nshift);
3551   debug(printf("Query shifted %d:\n",nshift));
3552   debug(Compress_print_blocks(query_shifted,nshift,pos5,pos3));
3553   query_shifted += (nshift+pos3)/STEP_SIZE*COMPRESS_BLOCKSIZE;
3554 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
3555 #else
3556   query_shifted += endcolumni;
3557 #endif
3558 
3559   startdiscard = (left+pos5) % 32;
3560   enddiscard = (left+pos3) % 32;
3561   offset = (pos3 - 1) - enddiscard + 32;
3562   ptr = &(ref_blocks[endblocki_32]);
3563   startptr = &(ref_blocks[startblocki_32]);
3564 
3565   if (startblocki_32 == endblocki_32) {
3566     /* Single block */
3567     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
3568 
3569     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3570 				     plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3571     diff_32 = clear_start_32(diff_32,startdiscard);
3572     diff_32 = clear_end_32(diff_32,enddiscard);
3573 
3574     if (nonzero_p_32(diff_32)) {
3575       mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3576       debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3577       return (pos3 - mismatch_position - 1);
3578     } else {
3579       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3580       return (pos3 - pos5);
3581     }
3582 
3583   } else if (startblocki == endblocki) {
3584 #if defined(USE_SHIFT_FIRST_MISMATCH) && defined(HAVE_SSE2)
3585     /* Shift */
3586     startdiscard += 96 - (endcolumni - startcolumni)*32;
3587     enddiscard += 96;
3588     assert(startdiscard == ((left+pos5) % 128) + (3 - endcolumni)*32);
3589     assert(enddiscard == ((left+pos3) % 128) + (3 - endcolumni)*32);
3590 
3591     diff_128 = (block_diff_sarray_128_shift_hi)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true,
3592 						endcolumni);
3593     diff_128 = clear_start_128(diff_128,startdiscard);
3594     diff_128 = clear_end_128(diff_128,enddiscard);
3595 
3596     if (nonzero_p_128(diff_128)) {
3597       mismatch_position = offset - (relpos = count_leading_zeroes_128(diff_128));
3598       debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3599       return (pos3 - mismatch_position - 1);
3600     } else {
3601       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3602       return (pos3 - pos5);
3603     }
3604 
3605 #else
3606     /* End block */
3607     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3608                                      plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3609     diff_32 = clear_end_32(diff_32,enddiscard);
3610 
3611     if (nonzero_p_32(diff_32)) {
3612       mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3613       debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3614       return (pos3 - mismatch_position - 1);
3615     }
3616     query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3617     offset -= 32;
3618 
3619     /* Single row */
3620     while (--endcolumni > startcolumni) {
3621       diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3622 				       plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3623 
3624       if (nonzero_p_32(diff_32)) {
3625 	mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3626 	debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3627 	return (pos3 - mismatch_position - 1);
3628       }
3629       query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3630       offset -= 32;
3631     }
3632 
3633     /* Start block */
3634     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3635                                      plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3636     diff_32 = clear_start_32(diff_32,startdiscard);
3637 
3638     if (nonzero_p_32(diff_32)) {
3639       mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3640       debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3641       return (pos3 - mismatch_position - 1);
3642     } else {
3643       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3644       return (pos3 - pos5);
3645     }
3646 #endif
3647 
3648 #if defined(USE_WRAP_FIRST_MISMATCH) && defined(HAVE_SSSE3)
3649   } else if (endblocki == startblocki + 12 && endcolumni < startcolumni) {
3650     /* Wrap */
3651     startdiscard += (startcolumni - endcolumni - 1)*32;
3652     enddiscard += 96;
3653     assert(startdiscard == ((left+pos5) % 128) - (endcolumni + 1)*32);
3654     assert(enddiscard == ((left+pos3) % 128) + (3 - endcolumni)*32);
3655 
3656     diff_128 = (block_diff_sarray_128_wrap_hi)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true,
3657 					       endcolumni);
3658     diff_128 = clear_start_128(diff_128,startdiscard);
3659     diff_128 = clear_end_128(diff_128,enddiscard);
3660 
3661     if (nonzero_p_128(diff_128)) {
3662       mismatch_position = offset - (relpos = count_leading_zeroes_128(diff_128));
3663       debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3664       return (pos3 - mismatch_position - 1);
3665     } else {
3666       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3667       return (pos3 - pos5);
3668     }
3669 #endif
3670 
3671   } else {
3672     /* End block */
3673     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3674 				     plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3675     diff_32 = clear_end_32(diff_32,enddiscard);
3676 
3677     if (nonzero_p_32(diff_32)) {
3678       mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3679       debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3680       return (pos3 - mismatch_position - 1);
3681     }
3682     query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3683     offset -= 32;
3684 
3685     /* End row */
3686     while (--endcolumni >= 0) {
3687       diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3688 				       plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3689 
3690       if (nonzero_p_32(diff_32)) {
3691 	mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3692 	debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3693 	return (pos3 - mismatch_position - 1);
3694       }
3695       query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3696       offset -= 32;
3697     }
3698 #ifdef HAVE_SSE2
3699     query_shifted -= QUERY_NEXTROW;
3700 #endif
3701     ptr -= GENOME_NEXTROW;
3702 
3703 
3704     /* Middle rows */
3705 #ifdef HAVE_AVX2
3706     while (ptr >= startptr + 24) {
3707       diff_256 = (block_diff_sarray_256)(&(query_shifted[-15]),&(ptr[-15]),plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3708 
3709       if (nonzero_p_256(diff_256)) {
3710 	mismatch_position = offset - (relpos = count_leading_zeroes_256(diff_256));
3711 	debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3712 	return (pos3 - mismatch_position - 1);
3713       }
3714       query_shifted -= 24; ptr -= 24;
3715       offset -= 256;
3716     }
3717 #endif
3718 
3719 #ifdef HAVE_SSE2
3720     while (ptr >= startptr + 12) {
3721       diff_128 = (block_diff_sarray_128)(&(query_shifted[-3]),&(ptr[-3]),plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3722 
3723       if (nonzero_p_128(diff_128)) {
3724 	mismatch_position = offset - (relpos = count_leading_zeroes_128(diff_128));
3725 	debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3726 	return (pos3 - mismatch_position - 1);
3727       }
3728       query_shifted -= 12; ptr -= 12;
3729       offset -= 128;
3730     }
3731 #else
3732     while (ptr >= startptr + 12) {
3733      for (endcolumni = 3; endcolumni >= 0; --endcolumni) {
3734        diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3735 					plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3736 
3737        if (nonzero_p_32(diff_32)) {
3738 	 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3739 	 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3740 	 return (pos3 - mismatch_position - 1);
3741        }
3742        query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3743        offset -= 32;
3744      }
3745      /* query_shifted -= QUERY_NEXTROW; */ ptr -= GENOME_NEXTROW;
3746     }
3747 #endif
3748 
3749     /* Start row */
3750     while (ptr > startptr) {
3751       diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3752 				       plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3753       if (nonzero_p_32(diff_32)) {
3754 	mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3755 	debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3756 	return (pos3 - mismatch_position - 1);
3757       }
3758       query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3759       offset -= 32;
3760     }
3761 
3762     /* Start block */
3763     diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3764                                      plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3765     diff_32 = clear_start_32(diff_32,startdiscard);
3766 
3767     if (nonzero_p_32(diff_32)) {
3768       mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3769       debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3770       return (pos3 - mismatch_position - 1);
3771     } else {
3772       debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3773       return (pos3 - pos5);
3774     }
3775   }
3776 }
3777 
3778 
3779 /* Modified from Genome_consecutive_matches_rightward.  Intended for
3780    writing LCP only, so coords are UINT4, and does not need SIMD
3781    instructions. */
3782 
3783 int
Genome_consecutive_matches_pair(Genome_T ome,UINT4 lefta,UINT4 leftb,UINT4 genomelength)3784 Genome_consecutive_matches_pair (Genome_T ome, UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
3785   Genomecomp_T *ref_blocks = Genome_blocks(ome);
3786   int mismatch_position, offset, nshift, rightshift, nshift1, nshift2, nshifta, nshiftb;
3787   int startdiscard, enddiscard, nblocks;
3788   UINT4 left1, left2;
3789   UINT4 startblocki_1, startblocki_2, endblocki;
3790   int startcolumni_1, startcolumni_2, endcolumni;
3791   Genomecomp_T *ptr1, *ptr2, *end, *ptr1_prev;
3792   Genomecomp_T diff;
3793   Genomecomp_T shifted1[3];
3794   int relpos;
3795 
3796   nshifta = lefta % 32;		/* Not STEP_SIZE */
3797   nshiftb = leftb % 32;		/* Not STEP_SIZE */
3798   if (nshifta < nshiftb) {
3799     left1 = lefta;
3800     left2 = leftb;
3801     nshift1 = nshifta;
3802     nshift2 = nshiftb;
3803 
3804   } else {
3805     left1 = leftb;
3806     left2 = lefta;
3807     nshift1 = nshiftb;
3808     nshift2 = nshifta;
3809   }
3810   nshift = nshift2 - nshift1;
3811   rightshift = 32 - nshift;	/* Not STEP_SIZE */
3812   startdiscard = nshift2;
3813 
3814   /* Non-SSE2 code */
3815   startcolumni_1 = (left1 % 128) / 32;
3816   startblocki_1 = left1/128U*12 + startcolumni_1;
3817 
3818   startcolumni_2 = (left2 % 128) / 32;
3819   startblocki_2 = left2/128U*12 + startcolumni_2;
3820 
3821   endcolumni = (genomelength % 128) / 32;
3822   endblocki = genomelength/128U*12 + endcolumni;
3823 
3824 
3825   ptr1 = &(ref_blocks[startblocki_1]);
3826   ptr2 = &(ref_blocks[startblocki_2]);
3827   end = &(ref_blocks[endblocki]);
3828 
3829   offset = -startdiscard;
3830 
3831   debug2(printf("\n\n"));
3832   debug2(printf("left1 = %u, left2 = %u, startblocki_1 = %u, startblocki_2 = %u, endblocki = %u\n",
3833 	       left1,left2,startblocki_1,startblocki_2,endblocki));
3834   debug2(printf("nshift1 = %d, nshift2 = %d, nshift = %d, startdiscard = %u\n",
3835 	       nshift1,nshift2,nshift,startdiscard));
3836 
3837   if (ptr1 == end) {
3838     /* Single block */
3839     enddiscard = genomelength % 32; /* Not STEP_SIZE */
3840     if (nshift + enddiscard < 32) { /* Not STEP_SIZE */
3841       enddiscard = nshift + enddiscard;
3842 
3843       ptr1 = &(ref_blocks[startblocki_1]);
3844       ptr2 = &(ref_blocks[startblocki_2]);
3845 #ifdef WORDS_BIGENDIAN
3846       shifted1[0] = Bigendian_convert_uint(ptr1[0]) << nshift;
3847       shifted1[1] = Bigendian_convert_uint(ptr1[4]) << nshift;
3848       shifted1[2] = Bigendian_convert_uint(ptr1[8]) << nshift;
3849 #else
3850       shifted1[0] = ptr1[0] << nshift;
3851       shifted1[1] = ptr1[4] << nshift;
3852       shifted1[2] = ptr1[8] << nshift;
3853 #endif
3854       debug2(Compress_print_one_block(ptr1));
3855       debug2(Compress_print_one_block(ptr2));
3856       debug2(Compress_print_one_block(shifted1));
3857 
3858 #ifdef WORDS_BIGENDIAN
3859       diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
3860 #else
3861       diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
3862 #endif
3863       diff = clear_start_32(diff,startdiscard);
3864       diff = clear_end_32(diff,enddiscard);
3865 
3866       if (diff /* != 0U */) {
3867 #ifdef HAVE_BUILTIN_CTZ
3868 	mismatch_position = offset + (relpos = __builtin_ctz(diff));
3869 #else
3870 	mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
3871 #endif
3872 	debug2(printf("case 1: returning %d matches\n",mismatch_position));
3873 	return mismatch_position;
3874       } else {
3875 	debug2(printf("case 2: returning %d - %d matches\n",enddiscard,startdiscard));
3876 	return (enddiscard - startdiscard);
3877       }
3878 
3879     } else {
3880       /* Two blocks */
3881       if (nshift > 0) {
3882 	enddiscard -= (32 - nshift); /* Not STEP_SIZE */
3883       }
3884 
3885       /* Block 1 */
3886       ptr1 = &(ref_blocks[startblocki_1]);
3887       ptr2 = &(ref_blocks[startblocki_2]);
3888 #ifdef WORDS_BIGENDIAN
3889       shifted1[0] = Bigendian_convert_uint(ptr1[0]) << nshift;
3890       shifted1[1] = Bigendian_convert_uint(ptr1[4]) << nshift;
3891       shifted1[2] = Bigendian_convert_uint(ptr1[8]) << nshift;
3892 #else
3893       shifted1[0] = ptr1[0] << nshift;
3894       shifted1[1] = ptr1[4] << nshift;
3895       shifted1[2] = ptr1[8] << nshift;
3896 #endif
3897       debug2(Compress_print_one_block(ptr1));
3898       debug2(Compress_print_one_block(ptr2));
3899       debug2(Compress_print_one_block(shifted1));
3900 
3901 #ifdef WORDS_BIGENDIAN
3902       diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
3903 #else
3904       diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
3905 #endif
3906       diff = clear_start_32(diff,startdiscard);
3907 
3908       if (diff /* != 0U */) {
3909 #ifdef HAVE_BUILTIN_CTZ
3910 	mismatch_position = offset + (relpos = __builtin_ctz(diff));
3911 #else
3912 	mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
3913 #endif
3914 	debug2(printf("case 3: returning %d matches\n",mismatch_position));
3915 	return mismatch_position;
3916       } else {
3917 	ptr1_prev = ptr1;
3918 	ptr1 += 1; if (++startcolumni_1 == 4) {ptr1 += 8; startcolumni_1 = 0;}
3919 	ptr2 += 1; if (++startcolumni_2 == 4) {ptr2 += 8; startcolumni_2 = 0;}
3920 	offset += 32; /* Not STEP_SIZE */
3921       }
3922 
3923       /* Block 2 */
3924       if (nshift == 0) {
3925 	/* rightshift of 32 is a no-op */
3926 #ifdef WORDS_BIGENDIAN
3927 	shifted1[0] = Bigendian_convert_uint(ptr1[0]); shifted1[1] = Bigendian_convert_uint(ptr1[4]); shifted1[2] = Bigendian_convert_uint(ptr1[8]);
3928 #else
3929 	shifted1[0] = ptr1[0]; shifted1[1] = ptr1[4]; shifted1[2] = ptr1[8];
3930 #endif
3931       } else {
3932 #ifdef WORDS_BIGENDIAN
3933 	shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift) | (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
3934 	shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift) | (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
3935 	shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift) | (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
3936 #else
3937 	shifted1[0] = (ptr1[0] << nshift) | (ptr1_prev[0] >> rightshift);
3938 	shifted1[1] = (ptr1[4] << nshift) | (ptr1_prev[4] >> rightshift);
3939 	shifted1[2] = (ptr1[8] << nshift) | (ptr1_prev[8] >> rightshift);
3940 #endif
3941       }
3942       debug2(Compress_print_one_block(ptr1));
3943       debug2(Compress_print_one_block(ptr2));
3944       debug2(Compress_print_one_block(shifted1));
3945 
3946 #ifdef WORDS_BIGENDIAN
3947       diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
3948 #else
3949       diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
3950 #endif
3951       diff = clear_end_32(diff,enddiscard);
3952 
3953       if (diff /* != 0U */) {
3954 #ifdef HAVE_BUILTIN_CTZ
3955 	mismatch_position = offset + (relpos = __builtin_ctz(diff));
3956 #else
3957 	mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
3958 #endif
3959 	debug2(printf("case 4: returning %d matches\n",mismatch_position));
3960 	return mismatch_position;
3961       } else {
3962 	debug2(printf("case 5: returning offset %d + enddiscard %d matches\n",offset,enddiscard));
3963 	return offset + enddiscard;
3964       }
3965     }
3966 
3967   } else if (ptr2 == end) {
3968     /* Single block */
3969     enddiscard = genomelength % 32; /* Not STEP_SIZE */
3970 
3971     ptr1 = &(ref_blocks[startblocki_1]);
3972     ptr2 = &(ref_blocks[startblocki_2]);
3973 #ifdef WORDS_BIGENDIAN
3974     shifted1[0] = Bigendian_convert_uint(ptr1[0]) << nshift;
3975     shifted1[1] = Bigendian_convert_uint(ptr1[4]) << nshift;
3976     shifted1[2] = Bigendian_convert_uint(ptr1[8]) << nshift;
3977 #else
3978     shifted1[0] = ptr1[0] << nshift;
3979     shifted1[1] = ptr1[4] << nshift;
3980     shifted1[2] = ptr1[8] << nshift;
3981 #endif
3982     debug2(Compress_print_one_block(ptr1));
3983     debug2(Compress_print_one_block(ptr2));
3984     debug2(Compress_print_one_block(shifted1));
3985 
3986 #ifdef WORDS_BIGENDIAN
3987     diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
3988 #else
3989     diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
3990 #endif
3991     diff = clear_start_32(diff,startdiscard);
3992     diff = clear_end_32(diff,enddiscard);
3993 
3994     if (diff /* != 0U */) {
3995 #ifdef HAVE_BUILTIN_CTZ
3996       mismatch_position = offset + (relpos = __builtin_ctz(diff));
3997 #else
3998       mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
3999 #endif
4000       debug2(printf("case 6: returning %d matches\n",mismatch_position));
4001       return mismatch_position;
4002     } else {
4003       debug2(printf("case 7: returning %d - %d matches\n",enddiscard,startdiscard));
4004       return (enddiscard - startdiscard);
4005     }
4006 
4007   } else {
4008 
4009     /* Startblock */
4010     ptr1 = &(ref_blocks[startblocki_1]);
4011     ptr2 = &(ref_blocks[startblocki_2]);
4012 #ifdef WORDS_BIGENDIAN
4013     shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift);
4014     shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift);
4015     shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift);
4016 #else
4017     shifted1[0] = (ptr1[0] << nshift);
4018     shifted1[1] = (ptr1[4] << nshift);
4019     shifted1[2] = (ptr1[8] << nshift);
4020 #endif
4021     debug2(Compress_print_one_block(ptr1));
4022     debug2(Compress_print_one_block(ptr2));
4023     debug2(Compress_print_one_block(shifted1));
4024 
4025 #ifdef WORDS_BIGENDIAN
4026     diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
4027 #else
4028     diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
4029 #endif
4030     diff = clear_start_32(diff,startdiscard);
4031 
4032     if (diff /* != 0U */) {
4033 #ifdef HAVE_BUILTIN_CTZ
4034       mismatch_position = offset + (relpos = __builtin_ctz(diff));
4035 #else
4036       mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
4037 #endif
4038       debug2(printf("case 8: returning %d matches\n",mismatch_position));
4039       return mismatch_position;
4040     } else {
4041       ptr1_prev = ptr1;
4042       ptr1 += 1; if (++startcolumni_1 == 4) {ptr1 += 8; startcolumni_1 = 0;}
4043       ptr2 += 1; if (++startcolumni_2 == 4) {ptr2 += 8; startcolumni_2 = 0;}
4044       offset += 32;		/* Not STEP_SIZE */
4045     }
4046 
4047     while (ptr1 < end && ptr2 < end) {
4048       if (nshift == 0) {
4049 	/* rightshift of 32 is a no-op */
4050 #ifdef WORDS_BIGENDIAN
4051 	shifted1[0] = Bigendian_convert_uint(ptr1[0]); shifted1[1] = Bigendian_convert_uint(ptr1[4]); shifted1[2] = Bigendian_convert_uint(ptr1[8]);
4052 #else
4053 	shifted1[0] = ptr1[0]; shifted1[1] = ptr1[4]; shifted1[2] = ptr1[8];
4054 #endif
4055       } else {
4056 #ifdef WORDS_BIGENDIAN
4057 	shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift) | (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
4058 	shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift) | (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
4059 	shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift) | (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
4060 #else
4061 	shifted1[0] = (ptr1[0] << nshift) | (ptr1_prev[0] >> rightshift);
4062 	shifted1[1] = (ptr1[4] << nshift) | (ptr1_prev[4] >> rightshift);
4063 	shifted1[2] = (ptr1[8] << nshift) | (ptr1_prev[8] >> rightshift);
4064 #endif
4065       }
4066       debug2(Compress_print_one_block(ptr1));
4067       debug2(Compress_print_one_block(ptr2));
4068       debug2(Compress_print_one_block(shifted1));
4069 
4070 #ifdef WORDS_BIGENDIAN
4071       diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
4072 #else
4073       diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
4074 #endif
4075       if (diff /* != 0U */) {
4076 #ifdef HAVE_BUILTIN_CTZ
4077 	mismatch_position = offset + (relpos = __builtin_ctz(diff));
4078 #else
4079 	mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
4080 #endif
4081 	debug2(printf("case 9: returning %d matches\n",mismatch_position));
4082 	return mismatch_position;
4083       } else {
4084 	ptr1_prev = ptr1;
4085 	ptr1 += 1; if (++startcolumni_1 == 4) {ptr1 += 8; startcolumni_1 = 0;}
4086 	ptr2 += 1; if (++startcolumni_2 == 4) {ptr2 += 8; startcolumni_2 = 0;}
4087 	offset += 32;		/* Not STEP_SIZE */
4088       }
4089     }
4090 
4091     /* Last block of entire genome */
4092     enddiscard = genomelength % 32; /* Not STEP_SIZE */
4093     if (ptr2 == end) {
4094       debug2(printf("ptr2 == end\n"));
4095       /* Keep enddiscard */
4096       nblocks = 1;
4097     } else if (nshift + enddiscard < 32) {
4098       debug2(printf("ptr1 == end and nshift %d + enddiscard %d < 32\n",nshift,enddiscard));
4099       enddiscard = nshift + enddiscard;
4100       nblocks = 1;
4101     } else if (nshift > 0) {
4102       debug2(printf("ptr1 == end and nshift %d + enddiscard %d >= 32\n",nshift,enddiscard));
4103       enddiscard -= (32 - nshift);
4104       nblocks = 2;
4105     } else {
4106       debug2(printf("ptr1 == end and nshift %d + enddiscard %d >= 32\n",nshift,enddiscard));
4107       /* Keep enddiscard */
4108       nblocks = 2;
4109     }
4110 
4111     /* Block 1 */
4112     if (nshift == 0) {
4113       /* rightshift of 32 is a no-op */
4114 #ifdef WORDS_BIGENDIAN
4115       shifted1[0] = Bigendian_convert_uint(ptr1[0]); shifted1[1] = Bigendian_convert_uint(ptr1[4]); shifted1[2] = Bigendian_convert_uint(ptr1[8]);
4116 #else
4117       shifted1[0] = ptr1[0]; shifted1[1] = ptr1[4]; shifted1[2] = ptr1[8];
4118 #endif
4119     } else {
4120 #ifdef WORDS_BIGENDIAN
4121       shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift) | (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
4122       shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift) | (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
4123       shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift) | (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
4124 #else
4125       shifted1[0] = (ptr1[0] << nshift) | (ptr1_prev[0] >> rightshift);
4126       shifted1[1] = (ptr1[4] << nshift) | (ptr1_prev[4] >> rightshift);
4127       shifted1[2] = (ptr1[8] << nshift) | (ptr1_prev[8] >> rightshift);
4128 #endif
4129     }
4130     debug2(Compress_print_one_block(ptr1));
4131     debug2(Compress_print_one_block(ptr2));
4132     debug2(Compress_print_one_block(shifted1));
4133 
4134 #ifdef WORDS_BIGENDIAN
4135     diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
4136 #else
4137     diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
4138 #endif
4139     if (nblocks == 1) {
4140       diff = clear_end_32(diff,enddiscard);
4141     }
4142 
4143     if (diff /* != 0U */) {
4144 #ifdef HAVE_BUILTIN_CTZ
4145       mismatch_position = offset + (relpos = __builtin_ctz(diff));
4146 #else
4147       mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
4148 #endif
4149       debug2(printf("case 10: returning %d matches\n",mismatch_position));
4150       return mismatch_position;
4151     } else if (nblocks == 1) {
4152       debug2(printf("case 11: returning offset %d + enddiscard %d matches\n",offset,enddiscard));
4153       return offset + enddiscard;
4154     } else {
4155       ptr1_prev = ptr1;
4156       ptr1 += 1; if (++startcolumni_1 == 4) {ptr1 += 8; startcolumni_1 = 0;}
4157       ptr2 += 1; if (++startcolumni_2 == 4) {ptr2 += 8; startcolumni_2 = 0;}
4158       offset += 32;		/* Not STEP_SIZE */
4159     }
4160 
4161     /* Block 2 */
4162 #ifdef WORDS_BIGENDIAN
4163     shifted1[0] = (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
4164     shifted1[1] = (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
4165     shifted1[2] = (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
4166 #else
4167     shifted1[0] = (ptr1_prev[0] >> rightshift);
4168     shifted1[1] = (ptr1_prev[4] >> rightshift);
4169     shifted1[2] = (ptr1_prev[8] >> rightshift);
4170 #endif
4171     debug2(Compress_print_one_block(ptr1));
4172     debug2(Compress_print_one_block(ptr2));
4173     debug2(Compress_print_one_block(shifted1));
4174 
4175 #ifdef WORDS_BIGENDIAN
4176     diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
4177 #else
4178     diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
4179 #endif
4180     diff = clear_end_32(diff,enddiscard);
4181 
4182     if (diff /* != 0U */) {
4183 #ifdef HAVE_BUILTIN_CTZ
4184       mismatch_position = offset + (relpos = __builtin_ctz(diff));
4185 #else
4186       mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
4187 #endif
4188       debug2(printf("case 12: returning %d matches\n",mismatch_position));
4189       return mismatch_position;
4190     } else {
4191       debug2(printf("case 13: returning offset %d + enddiscard %d matches\n",offset,enddiscard));
4192       return offset + enddiscard;
4193     }
4194   }
4195 }
4196 
4197