1 static char rcsid[] = "$Id: genome128_consec.c 222860 2020-06-15 07:08:07Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #include "genome128_consec.h"
13
14 #include <stdio.h>
15 #include <stddef.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <ctype.h> /* For tolower() */
19
20 #include "assert.h"
21 #include "except.h"
22 #include "compress.h"
23 #include "popcount.h"
24
25 #ifdef WORDS_BIGENDIAN
26 #include "bigendian.h"
27 #else
28 #include "littleendian.h"
29 #endif
30
31
32 /* Consecutive_matches_rightward and leftward */
33 /* Slower with shift and wrap, perhaps because we need to extract integers from the SIMD object */
34 /* #define USE_SHIFT_FIRST_MISMATCH 1 */
35 /* #define USE_WRAP_FIRST_MISMATCH 1 */
36
37 /* Faster to use a straight shift, and _mm_bsrli_si128 is not defined in gcc 4.7 */
38 /* #define USE_SHIFT_HILO 1 */
39
40
41 #ifdef HAVE_SSE2
42 #define QUERY_NEXTCOL 1 /* high0, high1, high2, high3 */
43 #define QUERY_NEXTROW 8
44 #else
45 #define QUERY_NEXTCOL 3 /* high, low, flags */
46 /* #define QUERY_NEXTROW 0 */
47 #endif
48
49 #define GENOME_NEXTCOL 1
50 #define GENOME_NEXTROW 8
51
52
53 #ifdef WORDS_BIGENDIAN
54 /* Do not use SIMD */
55 #elif defined(HAVE_SSE2)
56 #include <emmintrin.h>
57 #endif
58 #ifdef HAVE_SSSE3
59 #include <tmmintrin.h>
60 #endif
61 #ifdef HAVE_SSE4_1
62 #include <smmintrin.h>
63 #endif
64 #ifdef HAVE_AVX2
65 #include <immintrin.h>
66 #endif
67 #ifdef HAVE_AVX512
68 #include <immintrin.h>
69 #endif
70
71 #if !defined(HAVE_SSE4_2)
72 /* Skip popcnt, which comes after SSE4.2 */
73 #elif defined(HAVE_POPCNT)
74 #include <immintrin.h>
75 #elif defined(HAVE_MM_POPCNT)
76 #include <nmmintrin.h>
77 #endif
78
79 #if !defined(HAVE_SSE4_2)
80 /* Skip lzcnt and tzcnt, which come after SSE4.2 */
81 #elif defined(HAVE_LZCNT) || defined(HAVE_TZCNT)
82 #include <immintrin.h>
83 #endif
84
85 #ifdef DEBUG
86 #define debug(x) x
87 #else
88 #define debug(x)
89 #endif
90
91 /* Fragments */
92 #ifdef DEBUG1
93 #define debug1(x) x
94 #else
95 #define debug1(x)
96 #endif
97
98 /* Genome_consecutive_matches_pair */
99 #ifdef DEBUG2
100 #define debug2(x) x
101 #else
102 #define debug2(x)
103 #endif
104
105 /* clear_highbit and clear_lowbit */
106 #ifdef DEBUG3
107 #define debug3(x) x
108 #else
109 #define debug3(x)
110 #endif
111
112 /* count_leading_zeroes and count_trailing_zeroes */
113 #ifdef DEBUG4
114 #define debug4(x) x
115 #else
116 #define debug4(x)
117 #endif
118
119 /* mark mismatches */
120 #ifdef DEBUG5
121 #define debug5(x) x
122 #else
123 #define debug5(x)
124 #endif
125
126 /* 32-bit shortcuts */
127 #ifdef DEBUG14
128 #define debug14(x) x
129 #else
130 #define debug14(x)
131 #endif
132
133
134 /* Genome_consec code starts here */
135
136 #if defined(DEBUG) || defined(DEBUG5)
137 #ifdef HAVE_SSE4_1
138 static void
print_vector_hex(__m128i x)139 print_vector_hex (__m128i x) {
140 printf("%08X %08X %08X %08X\n",
141 _mm_extract_epi32(x,0),_mm_extract_epi32(x,1),_mm_extract_epi32(x,2),_mm_extract_epi32(x,3));
142 return;
143 }
144
145 static void
print_vector_dec(__m128i x)146 print_vector_dec (__m128i x) {
147 printf("%u %u %u %u\n",
148 _mm_extract_epi32(x,0),_mm_extract_epi32(x,1),_mm_extract_epi32(x,2),_mm_extract_epi32(x,3));
149 return;
150 }
151
152 #elif defined(HAVE_SSE2)
153 static void
print_vector_hex(__m128i x)154 print_vector_hex (__m128i x) {
155 printf("%08X %08X %08X %08X\n",
156 (_mm_extract_epi16(x,1) << 16) | (_mm_extract_epi16(x,0) & 0x0000FFFF),
157 (_mm_extract_epi16(x,3) << 16) | (_mm_extract_epi16(x,2) & 0x0000FFFF),
158 (_mm_extract_epi16(x,5) << 16) | (_mm_extract_epi16(x,4) & 0x0000FFFF),
159 (_mm_extract_epi16(x,7) << 16) | (_mm_extract_epi16(x,6) & 0x0000FFFF));
160 return;
161 }
162
163 static void
print_vector_dec(__m128i x)164 print_vector_dec (__m128i x) {
165 printf("%u %u %u %u\n",
166 (_mm_extract_epi16(x,1) << 16) | (_mm_extract_epi16(x,0) & 0x0000FFFF),
167 (_mm_extract_epi16(x,3) << 16) | (_mm_extract_epi16(x,2) & 0x0000FFFF),
168 (_mm_extract_epi16(x,5) << 16) | (_mm_extract_epi16(x,4) & 0x0000FFFF),
169 (_mm_extract_epi16(x,7) << 16) | (_mm_extract_epi16(x,6) & 0x0000FFFF));
170 return;
171 }
172 #endif
173
174 #ifdef HAVE_AVX2
175 static void
print_vector_256_hex(__m256i x)176 print_vector_256_hex (__m256i x) {
177 printf("%08X %08X %08X %08X %08X %08X %08X %08X\n",
178 _mm256_extract_epi32(x,0),_mm256_extract_epi32(x,1),_mm256_extract_epi32(x,2),_mm256_extract_epi32(x,3),
179 _mm256_extract_epi32(x,4),_mm256_extract_epi32(x,5),_mm256_extract_epi32(x,6),_mm256_extract_epi32(x,7));
180 return;
181 }
182
183 static void
print_vector_256_dec(__m256i x)184 print_vector_256_dec (__m256i x) {
185 printf("%u %u %u %u %u %u %u %u\n",
186 _mm256_extract_epi32(x,0),_mm256_extract_epi32(x,1),_mm256_extract_epi32(x,2),_mm256_extract_epi32(x,3),
187 _mm256_extract_epi32(x,4),_mm256_extract_epi32(x,5),_mm256_extract_epi32(x,6),_mm256_extract_epi32(x,7));
188 return;
189 }
190 #endif
191
192 #ifdef HAVE_AVX512
193 static void
print_vector_512_hex(__m512i x)194 print_vector_512_hex (__m512i x) {
195 unsigned int array[16];
196
197 _mm512_store_si512((__m512i *) array,x);
198 printf("%08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X %08X\n",
199 array[0],array[1],array[2],array[3],array[4],array[5],array[6],array[7],
200 array[8],array[9],array[10],array[11],array[12],array[13],array[14],array[15]);
201 return;
202 }
203
204 static void
print_vector_512_dec(__m512i x)205 print_vector_512_dec (__m512i x) {
206 unsigned int array[16];
207
208 _mm512_store_si512((__m512i *) array,x);
209 printf("%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u\n",
210 array[0],array[1],array[2],array[3],array[4],array[5],array[6],array[7],
211 array[8],array[9],array[10],array[11],array[12],array[13],array[14],array[15]);
212 return;
213 }
214 #endif
215
216 #endif
217
218
219 #if 0
220 /* Note: outer unshuffle operation, as implemented below has twice the
221 instruction count as lookup of reduce_nt array */
222 static inline UINT4
223 reduce_nt_unshuffle (UINT4 xhigh, UINT4 xlow) {
224 UINT8 x, t;
225
226 x = (UINT8) xhigh;
227 x <<= 32;
228 x |= xlow;
229
230 t = (x ^ (x >> 1)) & 0x2222222222222222; x = x ^ t ^ (t << 1);
231 t = (x ^ (x >> 2)) & 0x0C0C0C0C0C0C0C0C; x = x ^ t ^ (t << 2);
232 t = (x ^ (x >> 4)) & 0x00F000F000F000F0; x = x ^ t ^ (t << 4);
233 t = (x ^ (x >> 8)) & 0x0000FF000000FF00; x = x ^ t ^ (t << 8);
234 t = (x ^ (x >> 16)) & 0x00000000FFFF0000; x = x ^ t ^ (t << 16);
235
236 return (UINT4) ((x >> 32) | x);
237 }
238 #endif
239
240
241 #if defined(DEBUG) || defined(DEBUG2) || defined(DEBUG5)
242 static void
write_chars(Genomecomp_T high,Genomecomp_T low,Genomecomp_T flags)243 write_chars (Genomecomp_T high, Genomecomp_T low, Genomecomp_T flags) {
244 char Buffer[33];
245 int i;
246
247 Buffer[32] = '\0';
248 /* printf("%08X %08X %08X => ",high,low,flags); */
249
250 for (i = 0; i < 32; i++) {
251 switch (((high & 0x01) << 1) | (low & 0x01)) {
252 case 0U: Buffer[i] = 'A'; break;
253 case 1U: Buffer[i] = 'C'; break;
254 case 2U: Buffer[i] = 'G'; break;
255 case 3U: Buffer[i] = 'T'; break;
256 default: abort();
257 }
258 high >>= 1;
259 low >>= 1;
260 }
261
262 if (flags != 0U) {
263 for (i = 0; i < 32; i++) {
264 if (flags & 0x01) {
265 Buffer[i] = 'N';
266 }
267 flags >>= 1;
268 }
269 }
270
271 printf("%s",Buffer);
272 return;
273 }
274 #endif
275
276
277
278 #if defined(DEBUG) || defined(DEBUG2) || defined(DEBUG5)
279 static void
print_blocks(Genomecomp_T * blocks,Univcoord_T startpos,Univcoord_T endpos)280 print_blocks (Genomecomp_T *blocks, Univcoord_T startpos, Univcoord_T endpos) {
281 /* Chrpos_T length = endpos - startpos; */
282 Genomecomp_T *ptr;
283 Univcoord_T startblocki, endblocki;
284 int startcolumni, endcolumni;
285 int startdiscard32, enddiscard32;
286 Genomecomp_T high, low, flags;
287 int i;
288
289 /* sequence = (char *) CALLOC(length+1,sizeof(char)); */
290
291 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
292 startcolumni = (startpos % 128) / 32;
293 startblocki = startpos/128U*12 + startcolumni;
294
295 endcolumni = (endpos % 128) / 32;
296 endblocki = endpos/128U*12 + endcolumni;
297 #else
298 startcolumni = (startpos % 128) / 32;
299 endcolumni = (endpos % 128) / 32;
300
301 startblocki = startpos/128U*12;
302 endblocki = endpos/128U*12;
303 #endif
304
305 startdiscard32 = startpos % 32;
306 enddiscard32 = endpos % 32;
307
308
309 ptr = &(blocks[startblocki]);
310 while (ptr <= &(blocks[endblocki])) {
311 #if defined(WORDS_BIGENDIAN)
312 high = Bigendian_convert_uint(ptr[0]);
313 low = Bigendian_convert_uint(ptr[4]);
314 flags = Bigendian_convert_uint(ptr[8]);
315 printf("high: %08X low: %08X flags: %08X\t",high,low,flags);
316 write_chars(high,low,flags);
317 printf("\n");
318
319 ptr += GENOME_NEXTCOL; if (++startcolumni == 4) {ptr += GENOME_NEXTROW; startcolumni = 0;}
320 #elif !defined(HAVE_SSE2)
321 high = ptr[0]; low = ptr[4]; flags = ptr[8];
322 printf("high: %08X low: %08X flags: %08X\t",high,low,flags);
323 write_chars(high,low,flags);
324 printf("\n");
325
326 ptr += GENOME_NEXTCOL; if (++startcolumni == 4) {ptr += GENOME_NEXTROW; startcolumni = 0;}
327
328 #else
329 if (startcolumni == 0) {
330 /* high: 9F61B62A low: 6D68A157 flags: 00000000 */
331 printf(" \t");
332 /* printf("%u\t",startblocki); */
333 for (i = 0; i < startdiscard32; i++) {
334 printf("*");
335 }
336 for ( ; i < 32; i++) {
337 printf(" ");
338 }
339 printf("\n");
340 startcolumni = -1;
341 }
342 high = ptr[0]; low = ptr[4]; flags = ptr[8];
343 printf("high: %08X low: %08X flags: %08X\t",high,low,flags);
344 write_chars(high,low,flags);
345 printf("\n");
346 if (ptr == &(blocks[endblocki]) && endcolumni == 0) {
347 /* high: 9F61B62A low: 6D68A157 flags: 00000000 */
348 printf(" \t");
349 /* printf("%u\t",startblocki); */
350 for (i = 0; i < enddiscard32; i++) {
351 printf(" ");
352 }
353 for ( ; i < 32; i++) {
354 printf("*");
355 }
356 printf("\n");
357 }
358
359
360 if (startcolumni == 1) {
361 /* high: 9F61B62A low: 6D68A157 flags: 00000000 */
362 printf(" \t");
363 /* printf("%u\t",startblocki); */
364 for (i = 0; i < startdiscard32; i++) {
365 printf("*");
366 }
367 for ( ; i < 32; i++) {
368 printf(" ");
369 }
370 printf("\n");
371 startcolumni = -1;
372 }
373 high = ptr[1]; low = ptr[5]; flags = ptr[9];
374 printf("high: %08X low: %08X flags: %08X\t",high,low,flags);
375 write_chars(high,low,flags);
376 printf("\n");
377 if (ptr == &(blocks[endblocki]) && endcolumni == 1) {
378 /* high: 9F61B62A low: 6D68A157 flags: 00000000 */
379 printf(" \t");
380 /* printf("%u\t",startblocki); */
381 for (i = 0; i < enddiscard32; i++) {
382 printf(" ");
383 }
384 for ( ; i < 32; i++) {
385 printf("*");
386 }
387 printf("\n");
388 }
389
390 if (startcolumni == 2) {
391 /* high: 9F61B62A low: 6D68A157 flags: 00000000 */
392 printf(" \t");
393 /* printf("%u\t",startblocki); */
394 for (i = 0; i < startdiscard32; i++) {
395 printf("*");
396 }
397 for ( ; i < 32; i++) {
398 printf(" ");
399 }
400 printf("\n");
401 startcolumni = -1;
402 }
403 high = ptr[2]; low = ptr[6]; flags = ptr[10];
404 printf("high: %08X low: %08X flags: %08X\t",high,low,flags);
405 write_chars(high,low,flags);
406 printf("\n");
407 if (ptr == &(blocks[endblocki]) && endcolumni == 2) {
408 /* high: 9F61B62A low: 6D68A157 flags: 00000000 */
409 printf(" \t");
410 /* printf("%u\t",startblocki); */
411 for (i = 0; i < enddiscard32; i++) {
412 printf(" ");
413 }
414 for ( ; i < 32; i++) {
415 printf("*");
416 }
417 printf("\n");
418 }
419
420 if (startcolumni == 3) {
421 /* high: 9F61B62A low: 6D68A157 flags: 00000000 */
422 printf(" \t");
423 /* printf("%u\t",startblocki); */
424 for (i = 0; i < startdiscard32; i++) {
425 printf("*");
426 }
427 for ( ; i < 32; i++) {
428 printf(" ");
429 }
430 printf("\n");
431 startcolumni = -1;
432 }
433 high = ptr[3]; low = ptr[7]; flags = ptr[11];
434 printf("high: %08X low: %08X flags: %08X\t",high,low,flags);
435 write_chars(high,low,flags);
436 printf("\n");
437 if (ptr == &(blocks[endblocki]) && endcolumni == 3) {
438 /* high: 9F61B62A low: 6D68A157 flags: 00000000 */
439 printf(" \t");
440 /* printf("%u\t",startblocki); */
441 for (i = 0; i < enddiscard32; i++) {
442 printf(" ");
443 }
444 for ( ; i < 32; i++) {
445 printf("*");
446 }
447 printf("\n");
448 }
449
450 printf("\n");
451 ptr += 12;
452 #endif
453 }
454
455 return;
456 }
457 #endif
458
459
460 #if defined(USE_SHIFT_HILO) && defined(HAVE_SSE2)
461 static inline void
read_128_shift_lo(__m128i * __restrict__ high,__m128i * __restrict__ low,__m128i * __restrict__ flags,UINT4 * __restrict__ ptr,int startcolumni)462 read_128_shift_lo (__m128i *__restrict__ high, __m128i *__restrict__ low, __m128i *__restrict__ flags, UINT4 *__restrict__ ptr,
463 int startcolumni) {
464 __m128i a, b, c;
465
466 ptr -= startcolumni;
467 a = _mm_load_si128((__m128i *) ptr); ptr += 4;
468 b = _mm_load_si128((__m128i *) ptr); ptr += 4;
469 c = _mm_load_si128((__m128i *) ptr); ptr += 4;
470
471 switch (startcolumni) {
472 case 0:
473 *high = _mm_bsrli_si128(a, 0);
474 *low = _mm_bsrli_si128(b, 0);
475 *flags = _mm_bsrli_si128(c, 0);
476 break;
477 case 1:
478 *high = _mm_bsrli_si128(a, 4);
479 *low = _mm_bsrli_si128(b, 4);
480 *flags = _mm_bsrli_si128(c, 4);
481 break;
482 case 2:
483 *high = _mm_bsrli_si128(a, 8);
484 *low = _mm_bsrli_si128(b, 8);
485 *flags = _mm_bsrli_si128(c, 8);
486 break;
487 default:
488 *high = _mm_bsrli_si128(a, 12);
489 *low = _mm_bsrli_si128(b, 12);
490 *flags = _mm_bsrli_si128(c, 12);
491 break;
492 }
493
494 return;
495 }
496
497 static inline void
read_128_shift_hi(__m128i * __restrict__ high,__m128i * __restrict__ low,__m128i * __restrict__ flags,UINT4 * __restrict__ ptr,int endcolumni)498 read_128_shift_hi (__m128i *__restrict__ high, __m128i *__restrict__ low, __m128i *__restrict__ flags, UINT4 *__restrict__ ptr,
499 int endcolumni) {
500 __m128i a, b, c;
501
502 ptr -= endcolumni;
503 a = _mm_load_si128((__m128i *) ptr); ptr += 4;
504 b = _mm_load_si128((__m128i *) ptr); ptr += 4;
505 c = _mm_load_si128((__m128i *) ptr); ptr += 4;
506
507 switch (endcolumni) {
508 case 0:
509 *high = _mm_bslli_si128(a, 12);
510 *low = _mm_bslli_si128(b, 12);
511 *flags = _mm_bslli_si128(c, 12);
512 break;
513 case 1:
514 *high = _mm_bslli_si128(a, 8);
515 *low = _mm_bslli_si128(b, 8);
516 *flags = _mm_bslli_si128(c, 8);
517 break;
518 case 2:
519 *high = _mm_bslli_si128(a, 4);
520 *low = _mm_bslli_si128(b, 4);
521 *flags = _mm_bslli_si128(c, 4);
522 break;
523 default:
524 *high = _mm_bslli_si128(a, 0);
525 *low = _mm_bslli_si128(b, 0);
526 *flags = _mm_bslli_si128(c, 0);
527 break;
528 }
529
530 return;
531 }
532 #endif
533
534
535 #ifdef HAVE_SSSE3
536 static inline void
read_128_wrap_lo(__m128i * __restrict__ high,__m128i * __restrict__ low,__m128i * __restrict__ flags,UINT4 * __restrict__ ptr,int startcolumni)537 read_128_wrap_lo (__m128i *__restrict__ high, __m128i *__restrict__ low, __m128i *__restrict__ flags, UINT4 *__restrict__ ptr,
538 int startcolumni) {
539 __m128i a, b, c, d, e, f;
540
541 ptr -= startcolumni;
542 a = _mm_load_si128((__m128i *) ptr); ptr += 4;
543 b = _mm_load_si128((__m128i *) ptr); ptr += 4;
544 c = _mm_load_si128((__m128i *) ptr); ptr += 4;
545 d = _mm_load_si128((__m128i *) ptr); ptr += 4;
546 e = _mm_load_si128((__m128i *) ptr); ptr += 4;
547 f = _mm_load_si128((__m128i *) ptr);
548
549 switch (startcolumni) {
550 case 0:
551 *high = _mm_alignr_epi8(d, a, 0);
552 *low = _mm_alignr_epi8(e, b, 0);
553 *flags = _mm_alignr_epi8(f, c, 0);
554 break;
555 case 1:
556 *high = _mm_alignr_epi8(d, a, 4);
557 *low = _mm_alignr_epi8(e, b, 4);
558 *flags = _mm_alignr_epi8(f, c, 4);
559 break;
560 case 2:
561 *high = _mm_alignr_epi8(d, a, 8);
562 *low = _mm_alignr_epi8(e, b, 8);
563 *flags = _mm_alignr_epi8(f, c, 8);
564 break;
565 default:
566 *high = _mm_alignr_epi8(d, a, 12);
567 *low = _mm_alignr_epi8(e, b, 12);
568 *flags = _mm_alignr_epi8(f, c, 12);
569 break;
570 }
571
572 return;
573 }
574
575 static inline void
read_128_wrap_hi(__m128i * __restrict__ high,__m128i * __restrict__ low,__m128i * __restrict__ flags,UINT4 * __restrict__ ptr,int endcolumni)576 read_128_wrap_hi (__m128i *__restrict__ high, __m128i *__restrict__ low, __m128i *__restrict__ flags, UINT4 *__restrict__ ptr,
577 int endcolumni) {
578 __m128i a, b, c, d, e, f;
579
580 ptr -= endcolumni;
581 ptr -= 12;
582 a = _mm_load_si128((__m128i *) ptr); ptr += 4;
583 b = _mm_load_si128((__m128i *) ptr); ptr += 4;
584 c = _mm_load_si128((__m128i *) ptr); ptr += 4;
585 d = _mm_load_si128((__m128i *) ptr); ptr += 4;
586 e = _mm_load_si128((__m128i *) ptr); ptr += 4;
587 f = _mm_load_si128((__m128i *) ptr);
588
589 switch (endcolumni) {
590 case 0:
591 *high = _mm_alignr_epi8(d, a, 4);
592 *low = _mm_alignr_epi8(e, b, 4);
593 *flags = _mm_alignr_epi8(f, c, 4);
594 break;
595 case 1:
596 *high = _mm_alignr_epi8(d, a, 8);
597 *low = _mm_alignr_epi8(e, b, 8);
598 *flags = _mm_alignr_epi8(f, c, 8);
599 break;
600 case 2:
601 *high = _mm_alignr_epi8(d, a, 12);
602 *low = _mm_alignr_epi8(e, b, 12);
603 *flags = _mm_alignr_epi8(f, c, 12);
604 break;
605 default:
606 *high = _mm_alignr_epi8(d, a, 16);
607 *low = _mm_alignr_epi8(e, b, 16);
608 *flags = _mm_alignr_epi8(f, c, 16);
609 break;
610 }
611
612 return;
613 }
614 #endif
615
616
617 #ifdef HAVE_AVX2
618 static inline void
read_256(__m256i * __restrict__ high,__m256i * __restrict__ low,__m256i * __restrict__ flags,UINT4 * __restrict__ ptr)619 read_256 (__m256i *__restrict__ high, __m256i *__restrict__ low, __m256i *__restrict__ flags, UINT4 *__restrict__ ptr) {
620 __m256i a, b, c;
621 a = _mm256_loadu_si256((__m256i *) ptr); /* query0_high, query0_low */
622 b = _mm256_loadu_si256((__m256i *) &(ptr[8])); /* query0_flags, query1_high */
623 c = _mm256_loadu_si256((__m256i *) &(ptr[16])); /* query1_low, query1_flags */
624
625 *high = _mm256_permute2x128_si256(a, b, 0x30);
626 *low = _mm256_permute2x128_si256(a, c, 0x21);
627 *flags = _mm256_permute2x128_si256(b, c, 0x30);
628
629 return;
630 }
631 #endif
632
633 #ifdef HAVE_AVX512
634 static inline void
read_512(__m512i * __restrict__ high,__m512i * __restrict__ low,__m512i * __restrict__ flags,UINT4 * __restrict__ ptr)635 read_512 (__m512i *__restrict__ high, __m512i *__restrict__ low, __m512i *__restrict__ flags, UINT4 *__restrict__ ptr) {
636 __m512i a, b, c, d, e, f;
637 a = _mm512_loadu_si512((__m512i *) ptr); /* query0_high, query0_low, query0_flags, query1_high */
638 b = _mm512_loadu_si512((__m512i *) &(ptr[16])); /* query1_low, query1_flags, query2_high, query2_low */
639 c = _mm512_loadu_si512((__m512i *) &(ptr[32])); /* query2_flags, query3_high, query3_low, query3_flags */
640
641 d = _mm512_permutex2var_epi32(a, _mm512_setr_epi32(0, 1, 2, 3, 12, 13, 14, 15,
642 4, 5, 6, 7, 16+0, 16+1, 16+2, 16+3), b);
643 e = _mm512_permutex2var_epi32(b, _mm512_setr_epi32(8, 9, 10, 11, 16+4, 16+5, 16+6, 16+7,
644 12, 13, 14, 15, 16+8, 16+9, 16+10, 16+11), c);
645 f = _mm512_permutex2var_epi32(a, _mm512_setr_epi32(8, 9, 10, 11, 16+4, 16+5, 16+6, 16+7,
646 12, 13, 14, 15, 16+8, 16+9, 16+10, 16+11), b);
647
648 *high = _mm512_permutex2var_epi64(d, _mm512_setr_epi64(0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3), e);
649 *low = _mm512_permutex2var_epi64(d, _mm512_setr_epi64(4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7), e);
650 *flags = _mm512_permutex2var_epi64(f, _mm512_setr_epi64(0, 1, 2, 3, 8+0, 8+1, 8+6, 8+7), c);
651
652 return;
653 }
654 #endif
655
656
657 /* These are global values, used for alignment. Previously for
658 trimming, treated query N's as mismatches, but this is not correct
659 for query N's in the middle of the read. Also, trimming query N's
660 can affect the --clip-overlap feature. */
661 static bool query_unk_mismatch_p = false;
662 static bool genome_unk_mismatch_p = true;
663
664 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
665 #define STEP_SIZE 32
666 #else
667 /* Holds for SSE2, AVX2, and AVX512 */
668 #define STEP_SIZE 128
669 #endif
670
671
672 static UINT4
block_diff_standard_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)673 block_diff_standard_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
674 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
675 UINT4 diff;
676
677 debug(printf("Comparing high: query %08X with genome %08X ",query_shifted[0],ref_ptr[0]));
678 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
679 #else
680 debug(printf("Comparing low: query %08X with genome %08X ",query_shifted[4],ref_ptr[4]));
681 #endif
682
683 #ifdef WORDS_BIGENDIAN
684 diff = (query_shifted[0] ^ Bigendian_convert_uint(ref_ptr[0])) | (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
685 #elif !defined(HAVE_SSE2)
686 diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[1] ^ ref_ptr[4]);
687 #else
688 diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[4] ^ ref_ptr[4]);
689 #endif
690
691 /* Query Ns */
692 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
693 if (query_unk_mismatch_local_p) {
694 /* Query: Considering N as a mismatch */
695 diff |= query_shifted[2];
696 } else {
697 /* Query: Considering N as a wildcard */
698 diff &= ~(query_shifted[2]);
699 }
700 #else
701 if (query_unk_mismatch_local_p) {
702 /* Query: Considering N as a mismatch */
703 diff |= query_shifted[8];
704 } else {
705 /* Query: Considering N as a wildcard */
706 diff &= ~(query_shifted[8]);
707 }
708 #endif
709
710 /* Genome Ns */
711 if (genome_unk_mismatch_p) {
712 /* Genome: Considering N as a mismatch */
713 #ifdef WORDS_BIGENDIAN
714 diff |= Bigendian_convert_uint(ref_ptr[8]);
715 #else
716 diff |= ref_ptr[8];
717 #endif
718 } else {
719 /* Genome: Considering N as a wildcard */
720 #ifdef WORDS_BIGENDIAN
721 diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
722 #else
723 diff &= ~(ref_ptr[8]);
724 #endif
725 }
726
727 debug(printf(" => diff %08X\n",diff));
728
729 return diff;
730 }
731
732
733 #ifdef HAVE_SSE2
734 static __m128i
block_diff_standard_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)735 block_diff_standard_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
736 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
737 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
738
739 _query_high = _mm_load_si128((__m128i *) query_shifted);
740 _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
741 _ref_high = _mm_load_si128((__m128i *) ref_ptr);
742 _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
743
744 _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
745
746 _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
747 if (query_unk_mismatch_local_p) {
748 _diff = _mm_or_si128(_query_flags, _diff);
749 } else {
750 _diff = _mm_andnot_si128(_query_flags, _diff);
751 }
752
753 _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
754 if (genome_unk_mismatch_p) {
755 _diff = _mm_or_si128(_ref_flags, _diff);
756 } else {
757 _diff = _mm_andnot_si128(_ref_flags, _diff);
758 }
759
760 return _diff;
761 }
762
763 #ifdef USE_SHIFT_HILO
764 static __m128i
block_diff_standard_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)765 block_diff_standard_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
766 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
767 int startcolumni) {
768 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
769
770 read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
771 read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
772
773 _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
774
775 if (query_unk_mismatch_local_p) {
776 _diff = _mm_or_si128(_query_flags, _diff);
777 } else {
778 _diff = _mm_andnot_si128(_query_flags, _diff);
779 }
780
781 if (genome_unk_mismatch_p) {
782 _diff = _mm_or_si128(_ref_flags, _diff);
783 } else {
784 _diff = _mm_andnot_si128(_ref_flags, _diff);
785 }
786
787 return _diff;
788 }
789
790 static __m128i
block_diff_standard_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)791 block_diff_standard_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
792 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
793 int endcolumni) {
794 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
795
796 read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
797 read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
798
799 _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
800
801 if (query_unk_mismatch_local_p) {
802 _diff = _mm_or_si128(_query_flags, _diff);
803 } else {
804 _diff = _mm_andnot_si128(_query_flags, _diff);
805 }
806
807 if (genome_unk_mismatch_p) {
808 _diff = _mm_or_si128(_ref_flags, _diff);
809 } else {
810 _diff = _mm_andnot_si128(_ref_flags, _diff);
811 }
812
813 return _diff;
814 }
815 #endif
816 #endif
817
818 #ifdef HAVE_SSSE3
819 static __m128i
block_diff_standard_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)820 block_diff_standard_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
821 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
822 int startcolumni) {
823 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
824
825 read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
826 read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
827
828 _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
829
830 if (query_unk_mismatch_local_p) {
831 _diff = _mm_or_si128(_query_flags, _diff);
832 } else {
833 _diff = _mm_andnot_si128(_query_flags, _diff);
834 }
835
836 if (genome_unk_mismatch_p) {
837 _diff = _mm_or_si128(_ref_flags, _diff);
838 } else {
839 _diff = _mm_andnot_si128(_ref_flags, _diff);
840 }
841
842 return _diff;
843 }
844
845 static __m128i
block_diff_standard_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)846 block_diff_standard_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
847 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
848 int endcolumni) {
849 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
850
851 read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
852 read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
853
854 _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
855
856 if (query_unk_mismatch_local_p) {
857 _diff = _mm_or_si128(_query_flags, _diff);
858 } else {
859 _diff = _mm_andnot_si128(_query_flags, _diff);
860 }
861
862 if (genome_unk_mismatch_p) {
863 _diff = _mm_or_si128(_ref_flags, _diff);
864 } else {
865 _diff = _mm_andnot_si128(_ref_flags, _diff);
866 }
867
868 return _diff;
869 }
870 #endif
871
872 #ifdef HAVE_AVX2
873 static __m256i
block_diff_standard_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)874 block_diff_standard_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
875 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
876 __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
877
878 read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
879 read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
880
881 _diff = _mm256_or_si256(_mm256_xor_si256(_query_high, _ref_high), _mm256_xor_si256(_query_low, _ref_low));
882
883 if (query_unk_mismatch_local_p) {
884 _diff = _mm256_or_si256(_query_flags, _diff);
885 } else {
886 _diff = _mm256_andnot_si256(_query_flags, _diff);
887 }
888
889 if (genome_unk_mismatch_p) {
890 _diff = _mm256_or_si256(_ref_flags, _diff);
891 } else {
892 _diff = _mm256_andnot_si256(_ref_flags, _diff);
893 }
894
895 return _diff;
896 }
897 #endif
898
899 #ifdef HAVE_AVX512
900 static __m512i
block_diff_standard_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)901 block_diff_standard_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
902 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
903 __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
904
905 read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
906 read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
907
908 _diff = _mm512_or_si512(_mm512_xor_si512(_query_high, _ref_high), _mm512_xor_si512(_query_low, _ref_low));
909
910 if (query_unk_mismatch_local_p) {
911 _diff = _mm512_or_si512(_query_flags, _diff);
912 } else {
913 _diff = _mm512_andnot_si512(_query_flags, _diff);
914 }
915
916 if (genome_unk_mismatch_p) {
917 _diff = _mm512_or_si512(_ref_flags, _diff);
918 } else {
919 _diff = _mm512_andnot_si512(_ref_flags, _diff);
920 }
921
922 return _diff;
923 }
924 #endif
925
926
927 /************************************************************************
928 * CMET
929 ************************************************************************/
930
931 static UINT4
block_diff_metct_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)932 block_diff_metct_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
933 bool query_unk_mismatch_local_p) {
934 UINT4 diff;
935
936 /* sarrayp == true */
937 /* Convert everything to 3-nucleotide space */
938 diff = 0U;
939
940 /* Compare reduced C->T nts */
941 #ifdef WORDS_BIGENDIAN
942 diff |= ((query_shifted[0] | query_shifted[1]) ^ (Bigendian_convert_uint(ref_ptr[0]) | Bigendian_convert_uint(ref_ptr[4]))) |
943 (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
944 #elif !defined(HAVE_SSE2)
945 diff |= ((query_shifted[0] | query_shifted[1]) ^ (ref_ptr[0] | ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
946 #else
947 diff |= ((query_shifted[0] | query_shifted[4]) ^ (ref_ptr[0] | ref_ptr[4])) | (query_shifted[4] ^ ref_ptr[4]);
948 #endif
949 debug(printf(" => diff %08X\n",diff));
950
951
952 /* Flags: Considering N as a mismatch */
953 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
954 if (query_unk_mismatch_local_p) {
955 debug(printf("Marking query flags: query %08X ",query_shifted[8]));
956 diff |= query_shifted[8];
957 } else {
958 debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
959 diff &= ~(query_shifted[8]);
960 }
961 #else
962 if (query_unk_mismatch_local_p) {
963 debug(printf("Marking query flags: query %08X ",query_shifted[8]));
964 diff |= query_shifted[8];
965 } else {
966 debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
967 diff &= ~(query_shifted[8]);
968 }
969 #endif
970
971 if (genome_unk_mismatch_p) {
972 debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
973 #ifdef WORDS_BIGENDIAN
974 diff |= Bigendian_convert_uint(ref_ptr[8]);
975 #else
976 diff |= (ref_ptr[8]);
977 #endif
978 } else {
979 debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
980 #ifdef WORDS_BIGENDIAN
981 diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
982 #else
983 diff &= ~(ref_ptr[8]);
984 #endif
985 }
986 debug(printf(" => diff %08X\n",diff));
987
988 return diff;
989 }
990
991
992 #ifdef HAVE_SSE2
993 /* Convert C to T: high/low (A) 0 0 => new high 0; (C) 0 1 => 1; (G) 1 0 => 1; (T) 1 0 => 1 */
994 /* new high = high | low */
995 static __m128i
block_diff_metct_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)996 block_diff_metct_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
997 bool query_unk_mismatch_local_p) {
998 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
999
1000 _query_high = _mm_load_si128((__m128i *) query_shifted);
1001 _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
1002 _ref_high = _mm_load_si128((__m128i *) ref_ptr);
1003 _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
1004
1005 /* sarrayp == true */
1006 /* Ignore genome-T to query-C mismatches. Convert everything to 3-nucleotide space */
1007 _diff = _mm_setzero_si128();
1008
1009 /* Compare reduced C->T nts */
1010 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1011 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1012
1013 _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
1014 if (query_unk_mismatch_local_p) {
1015 _diff = _mm_or_si128(_query_flags, _diff);
1016 } else {
1017 _diff = _mm_andnot_si128(_query_flags, _diff);
1018 }
1019
1020 _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
1021 if (genome_unk_mismatch_p) {
1022 _diff = _mm_or_si128(_ref_flags, _diff);
1023 } else {
1024 _diff = _mm_andnot_si128(_ref_flags, _diff);
1025 }
1026
1027 return _diff;
1028 }
1029
1030 #ifdef USE_SHIFT_HILO
1031 static __m128i
block_diff_metct_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1032 block_diff_metct_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1033 bool query_unk_mismatch_local_p, int startcolumni) {
1034 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1035
1036 read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1037 read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1038
1039 /* sarrayp == true */
1040 /* Ignore genome-T to query-C mismatches. Convert everything to 3-nucleotide space */
1041 _diff = _mm_setzero_si128();
1042
1043 /* Compare reduced C->T nts */
1044 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1045 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1046
1047 if (query_unk_mismatch_local_p) {
1048 _diff = _mm_or_si128(_query_flags, _diff);
1049 } else {
1050 _diff = _mm_andnot_si128(_query_flags, _diff);
1051 }
1052
1053 if (genome_unk_mismatch_p) {
1054 _diff = _mm_or_si128(_ref_flags, _diff);
1055 } else {
1056 _diff = _mm_andnot_si128(_ref_flags, _diff);
1057 }
1058
1059 return _diff;
1060 }
1061
1062 static __m128i
block_diff_metct_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1063 block_diff_metct_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1064 bool query_unk_mismatch_local_p, int endcolumni) {
1065 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1066
1067 read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1068 read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1069
1070 /* sarrayp == true */
1071 /* Ignore genome-T to query-C mismatches. Convert everything to 3-nucleotide space */
1072 _diff = _mm_setzero_si128();
1073
1074 /* Compare reduced C->T nts */
1075 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1076 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1077
1078 if (query_unk_mismatch_local_p) {
1079 _diff = _mm_or_si128(_query_flags, _diff);
1080 } else {
1081 _diff = _mm_andnot_si128(_query_flags, _diff);
1082 }
1083
1084 if (genome_unk_mismatch_p) {
1085 _diff = _mm_or_si128(_ref_flags, _diff);
1086 } else {
1087 _diff = _mm_andnot_si128(_ref_flags, _diff);
1088 }
1089
1090 return _diff;
1091 }
1092 #endif
1093 #endif
1094
1095 #ifdef HAVE_SSSE3
1096 /* Convert C to T: high/low (A) 0 0 => new high 0; (C) 0 1 => 1; (G) 1 0 => 1; (T) 1 0 => 1 */
1097 /* new high = high | low */
1098 static __m128i
block_diff_metct_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1099 block_diff_metct_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1100 bool query_unk_mismatch_local_p, int startcolumni) {
1101 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1102
1103 read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1104 read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1105
1106 /* sarrayp == true */
1107 /* Ignore genome-T to query-C mismatches. Convert everything to 3-nucleotide space */
1108 _diff = _mm_setzero_si128();
1109
1110 /* Compare reduced C->T nts */
1111 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1112 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1113
1114 if (query_unk_mismatch_local_p) {
1115 _diff = _mm_or_si128(_query_flags, _diff);
1116 } else {
1117 _diff = _mm_andnot_si128(_query_flags, _diff);
1118 }
1119
1120 if (genome_unk_mismatch_p) {
1121 _diff = _mm_or_si128(_ref_flags, _diff);
1122 } else {
1123 _diff = _mm_andnot_si128(_ref_flags, _diff);
1124 }
1125
1126 return _diff;
1127 }
1128
1129 static __m128i
block_diff_metct_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1130 block_diff_metct_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1131 bool query_unk_mismatch_local_p, int endcolumni) {
1132 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1133
1134 read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1135 read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1136
1137 /* sarrayp == true */
1138 /* Ignore genome-T to query-C mismatches. Convert everything to 3-nucleotide space */
1139 _diff = _mm_setzero_si128();
1140
1141 /* Compare reduced C->T nts */
1142 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
1143 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1144
1145 if (query_unk_mismatch_local_p) {
1146 _diff = _mm_or_si128(_query_flags, _diff);
1147 } else {
1148 _diff = _mm_andnot_si128(_query_flags, _diff);
1149 }
1150
1151 if (genome_unk_mismatch_p) {
1152 _diff = _mm_or_si128(_ref_flags, _diff);
1153 } else {
1154 _diff = _mm_andnot_si128(_ref_flags, _diff);
1155 }
1156
1157 return _diff;
1158 }
1159 #endif
1160
1161
1162 #ifdef HAVE_AVX2
1163 /* Convert C to T: high/low (A) 0 0 => new high 0; (C) 0 1 => 1; (G) 1 0 => 1; (T) 1 0 => 1 */
1164 /* new high = high | low */
1165 static __m256i
block_diff_metct_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1166 block_diff_metct_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1167 bool query_unk_mismatch_local_p) {
1168 __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1169
1170 read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
1171 read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1172
1173 /* sarrayp == true */
1174 /* Ignore genome-T to query-C mismatches. Convert everything to 3-nucleotide space */
1175 _diff = _mm256_setzero_si256();
1176
1177 /* Compare reduced C->T nts */
1178 _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_mm256_or_si256(_query_high, _query_low), _mm256_or_si256(_ref_high, _ref_low)));
1179 _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_query_low, _ref_low));
1180
1181 if (query_unk_mismatch_local_p) {
1182 _diff = _mm256_or_si256(_query_flags, _diff);
1183 } else {
1184 _diff = _mm256_andnot_si256(_query_flags, _diff);
1185 }
1186
1187 if (genome_unk_mismatch_p) {
1188 _diff = _mm256_or_si256(_ref_flags, _diff);
1189 } else {
1190 _diff = _mm256_andnot_si256(_ref_flags, _diff);
1191 }
1192
1193 return _diff;
1194 }
1195 #endif
1196
1197 #ifdef HAVE_AVX512
1198 /* Convert C to T: high/low (A) 0 0 => new high 0; (C) 0 1 => 1; (G) 1 0 => 1; (T) 1 0 => 1 */
1199 /* new high = high | low */
1200 static __m512i
block_diff_metct_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1201 block_diff_metct_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1202 bool query_unk_mismatch_local_p) {
1203 __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1204
1205 read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
1206 read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1207
1208 /* sarrayp == true */
1209 /* Ignore genome-T to query-C mismatches. Convert everything to 3-nucleotide space */
1210 _diff = _mm512_setzero_si512();
1211
1212 /* Compare reduced C->T nts */
1213 _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_mm512_or_si512(_query_high, _query_low), _mm512_or_si512(_ref_high, _ref_low)));
1214 _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_query_low, _ref_low));
1215
1216 if (query_unk_mismatch_local_p) {
1217 _diff = _mm512_or_si512(_query_flags, _diff);
1218 } else {
1219 _diff = _mm512_andnot_si512(_query_flags, _diff);
1220 }
1221
1222 if (genome_unk_mismatch_p) {
1223 _diff = _mm512_or_si512(_ref_flags, _diff);
1224 } else {
1225 _diff = _mm512_andnot_si512(_ref_flags, _diff);
1226 }
1227
1228 return _diff;
1229 }
1230 #endif
1231
1232
1233 static UINT4
block_diff_metga_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1234 block_diff_metga_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1235 bool query_unk_mismatch_local_p) {
1236 UINT4 diff;
1237
1238 /* sarrayp == true */
1239 /* Ignore genome-A to query-G mismatches. Convert everything to 3-nucleotide space. */
1240 diff = 0U;
1241
1242 /* Compare reduced G->A nts */
1243 #ifdef WORDS_BIGENDIAN
1244 diff |= ((query_shifted[0] & query_shifted[1]) ^ (Bigendian_convert_uint(ref_ptr[0]) & Bigendian_convert_uint(ref_ptr[4]))) |
1245 (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
1246 #elif !defined(HAVE_SSE2)
1247 diff |= ((query_shifted[0] & query_shifted[1]) ^ (ref_ptr[0] & ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
1248 #else
1249 diff |= ((query_shifted[0] & query_shifted[4]) ^ (ref_ptr[0] & ref_ptr[4])) | (query_shifted[4] ^ ref_ptr[4]);
1250 #endif
1251 debug(printf(" => diff %08X\n",diff));
1252
1253
1254 /* Flags: Considering N as a mismatch */
1255 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
1256 if (query_unk_mismatch_local_p) {
1257 debug(printf("Marking query flags: query %08X ",query_shifted[2]));
1258 diff |= query_shifted[2];
1259 } else {
1260 debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
1261 diff &= ~(query_shifted[2]);
1262 }
1263 #else
1264 if (query_unk_mismatch_local_p) {
1265 debug(printf("Marking query flags: query %08X ",query_shifted[8]));
1266 diff |= query_shifted[8];
1267 } else {
1268 debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
1269 diff &= ~(query_shifted[8]);
1270 }
1271 #endif
1272
1273 if (genome_unk_mismatch_p) {
1274 debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
1275 #ifdef WORDS_BIGENDIAN
1276 diff |= Bigendian_convert_uint(ref_ptr[8]);
1277 #else
1278 diff |= (ref_ptr[8]);
1279 #endif
1280 } else {
1281 debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
1282 #ifdef WORDS_BIGENDIAN
1283 diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
1284 #else
1285 diff &= ~(ref_ptr[8]);
1286 #endif
1287 }
1288 debug(printf(" => diff %08X\n",diff));
1289
1290 return diff;
1291 }
1292
1293
1294 #ifdef HAVE_SSE2
1295 /* Convert G to A: high/low (A) 0 0 => new high 0; (C) 0 1 => 0; (G) 1 0 => 0; (T) 1 0 => 1 */
1296 /* new high = high & low */
1297 static __m128i
block_diff_metga_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1298 block_diff_metga_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1299 bool query_unk_mismatch_local_p) {
1300 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1301
1302 _query_high = _mm_load_si128((__m128i *) query_shifted);
1303 _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
1304 _ref_high = _mm_load_si128((__m128i *) ref_ptr);
1305 _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
1306
1307 /* sarrayp == true */
1308 /* Ignore genome-A to query-G mismatches. Convert everything to 3-nucleotide space. */
1309 _diff = _mm_setzero_si128();
1310
1311 /* Compare reduced G->A nts */
1312 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1313 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1314
1315 _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
1316 if (query_unk_mismatch_local_p) {
1317 _diff = _mm_or_si128(_query_flags, _diff);
1318 } else {
1319 _diff = _mm_andnot_si128(_query_flags, _diff);
1320 }
1321
1322 _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
1323 if (genome_unk_mismatch_p) {
1324 _diff = _mm_or_si128(_ref_flags, _diff);
1325 } else {
1326 _diff = _mm_andnot_si128(_ref_flags, _diff);
1327 }
1328
1329 return _diff;
1330 }
1331
1332 #ifdef USE_SHIFT_HILO
1333 static __m128i
block_diff_metga_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1334 block_diff_metga_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1335 bool query_unk_mismatch_local_p, int startcolumni) {
1336 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1337
1338 read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1339 read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1340
1341 /* sarrayp == true */
1342 /* Ignore genome-A to query-G mismatches. Convert everything to 3-nucleotide space. */
1343 _diff = _mm_setzero_si128();
1344
1345 /* Compare reduced G->A nts */
1346 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1347 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1348
1349 if (query_unk_mismatch_local_p) {
1350 _diff = _mm_or_si128(_query_flags, _diff);
1351 } else {
1352 _diff = _mm_andnot_si128(_query_flags, _diff);
1353 }
1354
1355 if (genome_unk_mismatch_p) {
1356 _diff = _mm_or_si128(_ref_flags, _diff);
1357 } else {
1358 _diff = _mm_andnot_si128(_ref_flags, _diff);
1359 }
1360
1361 return _diff;
1362 }
1363
1364 static __m128i
block_diff_metga_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1365 block_diff_metga_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1366 bool query_unk_mismatch_local_p, int endcolumni) {
1367 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1368
1369 read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1370 read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1371
1372 /* sarrayp == true */
1373 /* Ignore genome-A to query-G mismatches. Convert everything to 3-nucleotide space. */
1374 _diff = _mm_setzero_si128();
1375
1376 /* Compare reduced G->A nts */
1377 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1378 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1379
1380 if (query_unk_mismatch_local_p) {
1381 _diff = _mm_or_si128(_query_flags, _diff);
1382 } else {
1383 _diff = _mm_andnot_si128(_query_flags, _diff);
1384 }
1385
1386 if (genome_unk_mismatch_p) {
1387 _diff = _mm_or_si128(_ref_flags, _diff);
1388 } else {
1389 _diff = _mm_andnot_si128(_ref_flags, _diff);
1390 }
1391
1392 return _diff;
1393 }
1394 #endif
1395 #endif
1396
1397 #ifdef HAVE_SSSE3
1398 /* Convert G to A: high/low (A) 0 0 => new high 0; (C) 0 1 => 0; (G) 1 0 => 0; (T) 1 0 => 1 */
1399 /* new high = high & low */
1400 static __m128i
block_diff_metga_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1401 block_diff_metga_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1402 bool query_unk_mismatch_local_p, int startcolumni) {
1403 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1404
1405 read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1406 read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1407
1408 /* sarrayp == true */
1409 /* Ignore genome-A to query-G mismatches. Convert everything to 3-nucleotide space. */
1410 _diff = _mm_setzero_si128();
1411
1412 /* Compare reduced G->A nts */
1413 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1414 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1415
1416 if (query_unk_mismatch_local_p) {
1417 _diff = _mm_or_si128(_query_flags, _diff);
1418 } else {
1419 _diff = _mm_andnot_si128(_query_flags, _diff);
1420 }
1421
1422 if (genome_unk_mismatch_p) {
1423 _diff = _mm_or_si128(_ref_flags, _diff);
1424 } else {
1425 _diff = _mm_andnot_si128(_ref_flags, _diff);
1426 }
1427
1428 return _diff;
1429 }
1430
1431 static __m128i
block_diff_metga_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1432 block_diff_metga_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1433 bool query_unk_mismatch_local_p, int endcolumni) {
1434 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1435
1436 read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1437 read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1438
1439 /* sarrayp == true */
1440 /* Ignore genome-A to query-G mismatches. Convert everything to 3-nucleotide space. */
1441 _diff = _mm_setzero_si128();
1442
1443 /* Compare reduced G->A nts */
1444 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
1445 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1446
1447 if (query_unk_mismatch_local_p) {
1448 _diff = _mm_or_si128(_query_flags, _diff);
1449 } else {
1450 _diff = _mm_andnot_si128(_query_flags, _diff);
1451 }
1452
1453 if (genome_unk_mismatch_p) {
1454 _diff = _mm_or_si128(_ref_flags, _diff);
1455 } else {
1456 _diff = _mm_andnot_si128(_ref_flags, _diff);
1457 }
1458
1459 return _diff;
1460 }
1461 #endif
1462
1463 #ifdef HAVE_AVX2
1464 /* Convert G to A: high/low (A) 0 0 => new high 0; (C) 0 1 => 0; (G) 1 0 => 0; (T) 1 0 => 1 */
1465 /* new high = high & low */
1466 static __m256i
block_diff_metga_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1467 block_diff_metga_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1468 bool query_unk_mismatch_local_p) {
1469 __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1470
1471 read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
1472 read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1473
1474 /* sarrayp == true */
1475 /* Ignore genome-A to query-G mismatches. Convert everything to 3-nucleotide space. */
1476 _diff = _mm256_setzero_si256();
1477
1478 /* Compare reduced G->A nts */
1479 _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_mm256_and_si256(_query_high, _query_low), _mm256_and_si256(_ref_high, _ref_low)));
1480 _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_query_low, _ref_low));
1481
1482 if (query_unk_mismatch_local_p) {
1483 _diff = _mm256_or_si256(_query_flags, _diff);
1484 } else {
1485 _diff = _mm256_andnot_si256(_query_flags, _diff);
1486 }
1487
1488 if (genome_unk_mismatch_p) {
1489 _diff = _mm256_or_si256(_ref_flags, _diff);
1490 } else {
1491 _diff = _mm256_andnot_si256(_ref_flags, _diff);
1492 }
1493
1494 return _diff;
1495 }
1496 #endif
1497
1498 #ifdef HAVE_AVX512
1499 /* Convert G to A: high/low (A) 0 0 => new high 0; (C) 0 1 => 0; (G) 1 0 => 0; (T) 1 0 => 1 */
1500 /* new high = high & low */
1501 static __m512i
block_diff_metga_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1502 block_diff_metga_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1503 bool query_unk_mismatch_local_p) {
1504 __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1505
1506 read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
1507 read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1508
1509 /* sarrayp == true */
1510 /* Ignore genome-A to query-G mismatches. Convert everything to 3-nucleotide space. */
1511 _diff = _mm512_setzero_si512();
1512
1513 /* Compare reduced G->A nts */
1514 _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_mm512_and_si512(_query_high, _query_low), _mm512_and_si512(_ref_high, _ref_low)));
1515 _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_query_low, _ref_low));
1516
1517 if (query_unk_mismatch_local_p) {
1518 _diff = _mm512_or_si512(_query_flags, _diff);
1519 } else {
1520 _diff = _mm512_andnot_si512(_query_flags, _diff);
1521 }
1522
1523 if (genome_unk_mismatch_p) {
1524 _diff = _mm512_or_si512(_ref_flags, _diff);
1525 } else {
1526 _diff = _mm512_andnot_si512(_ref_flags, _diff);
1527 }
1528
1529 return _diff;
1530 }
1531 #endif
1532
1533
1534 static UINT4
block_diff_cmet_sarray_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)1535 block_diff_cmet_sarray_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1536 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
1537 if (genestrand == +2) {
1538 if (plusp) {
1539 return block_diff_metga_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1540 } else {
1541 return block_diff_metct_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1542 }
1543 } else {
1544 if (plusp) {
1545 return block_diff_metct_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1546 } else {
1547 return block_diff_metga_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1548 }
1549 }
1550 }
1551
1552 #ifdef HAVE_SSE2
1553 static __m128i
block_diff_cmet_sarray_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)1554 block_diff_cmet_sarray_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1555 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
1556 if (genestrand == +2) {
1557 if (plusp) {
1558 return block_diff_metga_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1559 } else {
1560 return block_diff_metct_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1561 }
1562 } else {
1563 if (plusp) {
1564 return block_diff_metct_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1565 } else {
1566 return block_diff_metga_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1567 }
1568 }
1569 }
1570
1571 #ifdef USE_SHIFT_HILO
1572 static __m128i
block_diff_cmet_sarray_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)1573 block_diff_cmet_sarray_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1574 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
1575 int startcolumni) {
1576 if (genestrand == +2) {
1577 if (plusp) {
1578 return block_diff_metga_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1579 startcolumni);
1580 } else {
1581 return block_diff_metct_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1582 startcolumni);
1583 }
1584 } else {
1585 if (plusp) {
1586 return block_diff_metct_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1587 startcolumni);
1588 } else {
1589 return block_diff_metga_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1590 startcolumni);
1591 }
1592 }
1593 }
1594
1595 static __m128i
block_diff_cmet_sarray_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)1596 block_diff_cmet_sarray_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1597 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
1598 int endcolumni) {
1599 if (genestrand == +2) {
1600 if (plusp) {
1601 return block_diff_metga_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1602 endcolumni);
1603 } else {
1604 return block_diff_metct_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1605 endcolumni);
1606 }
1607 } else {
1608 if (plusp) {
1609 return block_diff_metct_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1610 endcolumni);
1611 } else {
1612 return block_diff_metga_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1613 endcolumni);
1614 }
1615 }
1616 }
1617 #endif
1618 #endif
1619
1620 #ifdef HAVE_SSSE3
1621 static __m128i
block_diff_cmet_sarray_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)1622 block_diff_cmet_sarray_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1623 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
1624 int startcolumni) {
1625 if (genestrand == +2) {
1626 if (plusp) {
1627 return block_diff_metga_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1628 startcolumni);
1629 } else {
1630 return block_diff_metct_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1631 startcolumni);
1632 }
1633 } else {
1634 if (plusp) {
1635 return block_diff_metct_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1636 startcolumni);
1637 } else {
1638 return block_diff_metga_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1639 startcolumni);
1640 }
1641 }
1642 }
1643
1644 static __m128i
block_diff_cmet_sarray_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)1645 block_diff_cmet_sarray_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1646 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
1647 int endcolumni) {
1648 if (genestrand == +2) {
1649 if (plusp) {
1650 return block_diff_metga_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1651 endcolumni);
1652 } else {
1653 return block_diff_metct_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1654 endcolumni);
1655 }
1656 } else {
1657 if (plusp) {
1658 return block_diff_metct_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1659 endcolumni);
1660 } else {
1661 return block_diff_metga_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
1662 endcolumni);
1663 }
1664 }
1665 }
1666 #endif
1667
1668 #ifdef HAVE_AVX2
1669 static __m256i
block_diff_cmet_sarray_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)1670 block_diff_cmet_sarray_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1671 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
1672 if (genestrand == +2) {
1673 if (plusp) {
1674 return block_diff_metga_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1675 } else {
1676 return block_diff_metct_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1677 }
1678 } else {
1679 if (plusp) {
1680 return block_diff_metct_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1681 } else {
1682 return block_diff_metga_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1683 }
1684 }
1685 }
1686 #endif
1687
1688 #ifdef HAVE_AVX512
1689 static __m512i
block_diff_cmet_sarray_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)1690 block_diff_cmet_sarray_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1691 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
1692 if (genestrand == +2) {
1693 if (plusp) {
1694 return block_diff_metga_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1695 } else {
1696 return block_diff_metct_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1697 }
1698 } else {
1699 if (plusp) {
1700 return block_diff_metct_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1701 } else {
1702 return block_diff_metga_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
1703 }
1704 }
1705 }
1706 #endif
1707
1708
1709
1710 /************************************************************************
1711 * ATOI
1712 ************************************************************************/
1713
1714 static UINT4
block_diff_a2iag_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1715 block_diff_a2iag_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1716 bool query_unk_mismatch_local_p) {
1717 UINT4 diff;
1718
1719 /* sarrayp == true */
1720 /* Ignore genome-G to query-A mismatches. Convert everything to 3-nucleotide space. */
1721 diff = 0U;
1722
1723 /* Compare reduced A->G nts */
1724 #ifdef WORDS_BIGENDIAN
1725 diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) | ~(Bigendian_convert_uint(ref_ptr[4])))) |
1726 (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
1727 #elif !defined(HAVE_SSE2)
1728 diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (ref_ptr[0] | ~(ref_ptr[4]))) | (query_shifted[1] ^ ref_ptr[4]);
1729 /* Because (a ^ b) = (~a ^ ~b), this is equivalent to
1730 diff |= ((~query_shifted[0] & query_shifted[1]) ^ (~ref_ptr[0] & ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
1731 */
1732 #else
1733 diff |= ((query_shifted[0] | ~(query_shifted[4])) ^ (ref_ptr[0] | ~(ref_ptr[4]))) | (query_shifted[4] ^ ref_ptr[4]);
1734 #endif
1735 debug(printf(" => diff %08X\n",diff));
1736
1737 /* Flags: Considering N as a mismatch */
1738 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
1739 if (query_unk_mismatch_local_p) {
1740 debug(printf("Marking query flags: query %08X ",query_shifted[2]));
1741 diff |= query_shifted[2];
1742 } else {
1743 debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
1744 diff &= ~(query_shifted[2]);
1745 }
1746 #else
1747 if (query_unk_mismatch_local_p) {
1748 debug(printf("Marking query flags: query %08X ",query_shifted[8]));
1749 diff |= query_shifted[8];
1750 } else {
1751 debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
1752 diff &= ~(query_shifted[8]);
1753 }
1754 #endif
1755
1756 if (genome_unk_mismatch_p) {
1757 debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
1758 #ifdef WORDS_BIGENDIAN
1759 diff |= Bigendian_convert_uint(ref_ptr[8]);
1760 #else
1761 diff |= (ref_ptr[8]);
1762 #endif
1763 } else {
1764 debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
1765 #ifdef WORDS_BIGENDIAN
1766 diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
1767 #else
1768 diff &= ~(ref_ptr[8]);
1769 #endif
1770 }
1771 debug(printf(" => diff %08X\n",diff));
1772
1773 return diff;
1774 }
1775
1776
1777 #ifdef HAVE_SSE2
1778 /* Convert A->G: new high = high | ~low */
1779 static __m128i
block_diff_a2iag_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1780 block_diff_a2iag_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1781 bool query_unk_mismatch_local_p) {
1782 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1783
1784 _query_high = _mm_load_si128((__m128i *) query_shifted);
1785 _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
1786 _ref_high = _mm_load_si128((__m128i *) ref_ptr);
1787 _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
1788
1789 /* sarrayp == true */
1790 /* Ignore genome-G to query-A mismatches. Convert everything to 3-nucleotide space. */
1791 _diff = _mm_setzero_si128();
1792
1793 /* Compare reduced A->G nts */
1794 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1795 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1796
1797 _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
1798 if (query_unk_mismatch_local_p) {
1799 _diff = _mm_or_si128(_query_flags, _diff);
1800 } else {
1801 _diff = _mm_andnot_si128(_query_flags, _diff);
1802 }
1803
1804 _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
1805 if (genome_unk_mismatch_p) {
1806 _diff = _mm_or_si128(_ref_flags, _diff);
1807 } else {
1808 _diff = _mm_andnot_si128(_ref_flags, _diff);
1809 }
1810
1811 return _diff;
1812 }
1813
1814 #ifdef USE_SHIFT_HILO
1815 static __m128i
block_diff_a2iag_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1816 block_diff_a2iag_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1817 bool query_unk_mismatch_local_p, int startcolumni) {
1818 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1819
1820 read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1821 read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1822
1823 /* sarrayp == true */
1824 /* Ignore genome-G to query-A mismatches. Convert everything to 3-nucleotide space. */
1825 _diff = _mm_setzero_si128();
1826
1827 /* Compare reduced A->G nts */
1828 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1829 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1830
1831 if (query_unk_mismatch_local_p) {
1832 _diff = _mm_or_si128(_query_flags, _diff);
1833 } else {
1834 _diff = _mm_andnot_si128(_query_flags, _diff);
1835 }
1836
1837 if (genome_unk_mismatch_p) {
1838 _diff = _mm_or_si128(_ref_flags, _diff);
1839 } else {
1840 _diff = _mm_andnot_si128(_ref_flags, _diff);
1841 }
1842
1843 return _diff;
1844 }
1845
1846 static __m128i
block_diff_a2iag_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1847 block_diff_a2iag_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1848 bool query_unk_mismatch_local_p, int endcolumni) {
1849 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1850
1851 read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1852 read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1853
1854 /* sarrayp == true */
1855 /* Ignore genome-G to query-A mismatches. Convert everything to 3-nucleotide space. */
1856 _diff = _mm_setzero_si128();
1857
1858 /* Compare reduced A->G nts */
1859 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1860 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1861
1862 if (query_unk_mismatch_local_p) {
1863 _diff = _mm_or_si128(_query_flags, _diff);
1864 } else {
1865 _diff = _mm_andnot_si128(_query_flags, _diff);
1866 }
1867
1868 if (genome_unk_mismatch_p) {
1869 _diff = _mm_or_si128(_ref_flags, _diff);
1870 } else {
1871 _diff = _mm_andnot_si128(_ref_flags, _diff);
1872 }
1873
1874 return _diff;
1875 }
1876 #endif
1877 #endif
1878
1879 #ifdef HAVE_SSSE3
1880 /* Convert A->G: new high = high | ~low */
1881 static __m128i
block_diff_a2iag_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)1882 block_diff_a2iag_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1883 bool query_unk_mismatch_local_p, int startcolumni) {
1884 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1885
1886 read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
1887 read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
1888
1889 /* sarrayp == true */
1890 /* Ignore genome-G to query-A mismatches. Convert everything to 3-nucleotide space. */
1891 _diff = _mm_setzero_si128();
1892
1893 /* Compare reduced A->G nts */
1894 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1895 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1896
1897 if (query_unk_mismatch_local_p) {
1898 _diff = _mm_or_si128(_query_flags, _diff);
1899 } else {
1900 _diff = _mm_andnot_si128(_query_flags, _diff);
1901 }
1902
1903 if (genome_unk_mismatch_p) {
1904 _diff = _mm_or_si128(_ref_flags, _diff);
1905 } else {
1906 _diff = _mm_andnot_si128(_ref_flags, _diff);
1907 }
1908
1909 return _diff;
1910 }
1911
1912 static __m128i
block_diff_a2iag_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)1913 block_diff_a2iag_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1914 bool query_unk_mismatch_local_p, int endcolumni) {
1915 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1916
1917 read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
1918 read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
1919
1920 /* sarrayp == true */
1921 /* Ignore genome-G to query-A mismatches. Convert everything to 3-nucleotide space. */
1922 _diff = _mm_setzero_si128();
1923
1924 /* Compare reduced A->G nts */
1925 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_high, _query_low), _mm_andnot_si128(_ref_high, _ref_low)));
1926 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
1927
1928 if (query_unk_mismatch_local_p) {
1929 _diff = _mm_or_si128(_query_flags, _diff);
1930 } else {
1931 _diff = _mm_andnot_si128(_query_flags, _diff);
1932 }
1933
1934 if (genome_unk_mismatch_p) {
1935 _diff = _mm_or_si128(_ref_flags, _diff);
1936 } else {
1937 _diff = _mm_andnot_si128(_ref_flags, _diff);
1938 }
1939
1940 return _diff;
1941 }
1942 #endif
1943
1944 #ifdef HAVE_AVX2
1945 /* Convert A->G: new high = high | ~low */
1946 static __m256i
block_diff_a2iag_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1947 block_diff_a2iag_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1948 bool query_unk_mismatch_local_p) {
1949 __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1950
1951 read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
1952 read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1953
1954 /* sarrayp == true */
1955 /* Ignore genome-G to query-A mismatches. Convert everything to 3-nucleotide space. */
1956 _diff = _mm256_setzero_si256();
1957
1958 /* Compare reduced A->G nts */
1959 _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_mm256_andnot_si256(_query_high, _query_low), _mm256_andnot_si256(_ref_high, _ref_low)));
1960 _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_query_low, _ref_low));
1961
1962 if (query_unk_mismatch_local_p) {
1963 _diff = _mm256_or_si256(_query_flags, _diff);
1964 } else {
1965 _diff = _mm256_andnot_si256(_query_flags, _diff);
1966 }
1967
1968 if (genome_unk_mismatch_p) {
1969 _diff = _mm256_or_si256(_ref_flags, _diff);
1970 } else {
1971 _diff = _mm256_andnot_si256(_ref_flags, _diff);
1972 }
1973
1974 return _diff;
1975 }
1976 #endif
1977
1978 #ifdef HAVE_AVX512
1979 /* Convert A->G: new high = high | ~low */
1980 static __m512i
block_diff_a2iag_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)1981 block_diff_a2iag_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
1982 bool query_unk_mismatch_local_p) {
1983 __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
1984
1985 read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
1986 read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
1987
1988 /* sarrayp == true */
1989 /* Ignore genome-G to query-A mismatches. Convert everything to 3-nucleotide space. */
1990 _diff = _mm512_setzero_si512();
1991
1992 /* Compare reduced A->G nts */
1993 _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_mm512_andnot_si512(_query_high, _query_low), _mm512_andnot_si512(_ref_high, _ref_low)));
1994 _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_query_low, _ref_low));
1995
1996 if (query_unk_mismatch_local_p) {
1997 _diff = _mm512_or_si512(_query_flags, _diff);
1998 } else {
1999 _diff = _mm512_andnot_si512(_query_flags, _diff);
2000 }
2001
2002 if (genome_unk_mismatch_p) {
2003 _diff = _mm512_or_si512(_ref_flags, _diff);
2004 } else {
2005 _diff = _mm512_andnot_si512(_ref_flags, _diff);
2006 }
2007
2008 return _diff;
2009 }
2010 #endif
2011
2012
2013 static UINT4
block_diff_a2itc_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)2014 block_diff_a2itc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2015 bool query_unk_mismatch_local_p) {
2016 UINT4 diff;
2017
2018 /* sarrayp == true */
2019 /* Ignore genome-C to query-T mismatches */
2020 diff = 0U;
2021
2022 /* Compare reduced T->C nts */
2023 #ifdef WORDS_BIGENDIAN
2024 diff |= ((query_shifted[0] & ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) & ~(Bigendian_convert_uint(ref_ptr[4])))) |
2025 (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
2026 #elif !defined(HAVE_SSE2)
2027 diff |= ((query_shifted[0] & ~(query_shifted[1])) ^ (ref_ptr[0] & ~(ref_ptr[4]))) | (query_shifted[1] ^ ref_ptr[4]);
2028 #else
2029 diff |= ((query_shifted[0] & ~(query_shifted[4])) ^ (ref_ptr[0] & ~(ref_ptr[4]))) | (query_shifted[4] ^ ref_ptr[4]);
2030 #endif
2031 debug(printf(" => diff %08X\n",diff));
2032
2033 /* Flags: Considering N as a mismatch */
2034 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
2035 if (query_unk_mismatch_local_p) {
2036 debug(printf("Marking query flags: query %08X ",query_shifted[8]));
2037 diff |= query_shifted[8];
2038 } else {
2039 debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
2040 diff &= ~(query_shifted[8]);
2041 }
2042 #else
2043 if (query_unk_mismatch_local_p) {
2044 debug(printf("Marking query flags: query %08X ",query_shifted[2]));
2045 diff |= query_shifted[2];
2046 } else {
2047 debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
2048 diff &= ~(query_shifted[2]);
2049 }
2050 #endif
2051
2052 if (genome_unk_mismatch_p) {
2053 debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
2054 #ifdef WORDS_BIGENDIAN
2055 diff |= Bigendian_convert_uint(ref_ptr[8]);
2056 #else
2057 diff |= (ref_ptr[8]);
2058 #endif
2059 } else {
2060 debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
2061 #ifdef WORDS_BIGENDIAN
2062 diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
2063 #else
2064 diff &= ~(ref_ptr[8]);
2065 #endif
2066 }
2067 debug(printf(" => diff %08X\n",diff));
2068
2069 return diff;
2070 }
2071
2072
2073 #ifdef HAVE_SSE2
2074 /* Convert T->C: new high = high & ~low */
2075 static __m128i
block_diff_a2itc_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)2076 block_diff_a2itc_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2077 bool query_unk_mismatch_local_p) {
2078 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2079
2080 _query_high = _mm_load_si128((__m128i *) query_shifted);
2081 _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
2082 _ref_high = _mm_load_si128((__m128i *) ref_ptr);
2083 _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
2084
2085 /* sarrayp == true */
2086 /* Ignore genome-C to query-T mismatches */
2087 _diff = _mm_setzero_si128();
2088
2089 /* Compare reduced T->C nts */
2090 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2091 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2092
2093 _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
2094 if (query_unk_mismatch_local_p) {
2095 _diff = _mm_or_si128(_query_flags, _diff);
2096 } else {
2097 _diff = _mm_andnot_si128(_query_flags, _diff);
2098 }
2099
2100 _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
2101 if (genome_unk_mismatch_p) {
2102 _diff = _mm_or_si128(_ref_flags, _diff);
2103 } else {
2104 _diff = _mm_andnot_si128(_ref_flags, _diff);
2105 }
2106
2107 return _diff;
2108 }
2109
2110 #ifdef USE_SHIFT_HILO
2111 static __m128i
block_diff_a2itc_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)2112 block_diff_a2itc_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2113 bool query_unk_mismatch_local_p, int startcolumni) {
2114 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2115
2116 read_128_shift_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
2117 read_128_shift_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
2118
2119 /* sarrayp == true */
2120 /* Ignore genome-C to query-T mismatches */
2121 _diff = _mm_setzero_si128();
2122
2123 /* Compare reduced T->C nts */
2124 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2125 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2126
2127 if (query_unk_mismatch_local_p) {
2128 _diff = _mm_or_si128(_query_flags, _diff);
2129 } else {
2130 _diff = _mm_andnot_si128(_query_flags, _diff);
2131 }
2132
2133 if (genome_unk_mismatch_p) {
2134 _diff = _mm_or_si128(_ref_flags, _diff);
2135 } else {
2136 _diff = _mm_andnot_si128(_ref_flags, _diff);
2137 }
2138
2139 return _diff;
2140 }
2141
2142 static __m128i
block_diff_a2itc_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)2143 block_diff_a2itc_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2144 bool query_unk_mismatch_local_p, int endcolumni) {
2145 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2146
2147 read_128_shift_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
2148 read_128_shift_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
2149
2150 /* sarrayp == true */
2151 /* Ignore genome-C to query-T mismatches */
2152 _diff = _mm_setzero_si128();
2153
2154 /* Compare reduced T->C nts */
2155 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2156 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2157
2158 if (query_unk_mismatch_local_p) {
2159 _diff = _mm_or_si128(_query_flags, _diff);
2160 } else {
2161 _diff = _mm_andnot_si128(_query_flags, _diff);
2162 }
2163
2164 if (genome_unk_mismatch_p) {
2165 _diff = _mm_or_si128(_ref_flags, _diff);
2166 } else {
2167 _diff = _mm_andnot_si128(_ref_flags, _diff);
2168 }
2169
2170 return _diff;
2171 }
2172 #endif
2173 #endif
2174
2175 #ifdef HAVE_SSSE3
2176 /* Convert T->C: new high = high & ~low */
2177 static __m128i
block_diff_a2itc_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int startcolumni)2178 block_diff_a2itc_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2179 bool query_unk_mismatch_local_p, int startcolumni) {
2180 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2181
2182 read_128_wrap_lo(&_query_high,&_query_low,&_query_flags,query_shifted,startcolumni);
2183 read_128_wrap_lo(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,startcolumni);
2184
2185 /* sarrayp == true */
2186 /* Ignore genome-C to query-T mismatches */
2187 _diff = _mm_setzero_si128();
2188
2189 /* Compare reduced T->C nts */
2190 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2191 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2192
2193 if (query_unk_mismatch_local_p) {
2194 _diff = _mm_or_si128(_query_flags, _diff);
2195 } else {
2196 _diff = _mm_andnot_si128(_query_flags, _diff);
2197 }
2198
2199 if (genome_unk_mismatch_p) {
2200 _diff = _mm_or_si128(_ref_flags, _diff);
2201 } else {
2202 _diff = _mm_andnot_si128(_ref_flags, _diff);
2203 }
2204
2205 return _diff;
2206 }
2207
2208 static __m128i
block_diff_a2itc_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p,int endcolumni)2209 block_diff_a2itc_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2210 bool query_unk_mismatch_local_p, int endcolumni) {
2211 __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2212
2213 read_128_wrap_hi(&_query_high,&_query_low,&_query_flags,query_shifted,endcolumni);
2214 read_128_wrap_hi(&_ref_high,&_ref_low,&_ref_flags,ref_ptr,endcolumni);
2215
2216 /* sarrayp == true */
2217 /* Ignore genome-C to query-T mismatches */
2218 _diff = _mm_setzero_si128();
2219
2220 /* Compare reduced T->C nts */
2221 _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_andnot_si128(_query_low, _query_high), _mm_andnot_si128(_ref_low, _ref_high)));
2222 _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
2223
2224 if (query_unk_mismatch_local_p) {
2225 _diff = _mm_or_si128(_query_flags, _diff);
2226 } else {
2227 _diff = _mm_andnot_si128(_query_flags, _diff);
2228 }
2229
2230 if (genome_unk_mismatch_p) {
2231 _diff = _mm_or_si128(_ref_flags, _diff);
2232 } else {
2233 _diff = _mm_andnot_si128(_ref_flags, _diff);
2234 }
2235
2236 return _diff;
2237 }
2238 #endif
2239
2240 #ifdef HAVE_AVX2
2241 /* Convert T->C: new high = high & ~low */
2242 static __m256i
block_diff_a2itc_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)2243 block_diff_a2itc_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2244 bool query_unk_mismatch_local_p) {
2245 __m256i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2246
2247 read_256(&_query_high,&_query_low,&_query_flags,query_shifted);
2248 read_256(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
2249
2250 /* sarrayp == true */
2251 /* Ignore genome-C to query-T mismatches */
2252 _diff = _mm256_setzero_si256();
2253
2254 /* Compare reduced T->C nts */
2255 _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_mm256_andnot_si256(_query_low, _query_high), _mm256_andnot_si256(_ref_low, _ref_high)));
2256 _diff = _mm256_or_si256(_diff, _mm256_xor_si256(_query_low, _ref_low));
2257
2258 if (query_unk_mismatch_local_p) {
2259 _diff = _mm256_or_si256(_query_flags, _diff);
2260 } else {
2261 _diff = _mm256_andnot_si256(_query_flags, _diff);
2262 }
2263
2264 if (genome_unk_mismatch_p) {
2265 _diff = _mm256_or_si256(_ref_flags, _diff);
2266 } else {
2267 _diff = _mm256_andnot_si256(_ref_flags, _diff);
2268 }
2269
2270 return _diff;
2271 }
2272 #endif
2273
2274 #ifdef HAVE_AVX512
2275 /* Convert T->C: new high = high & ~low */
2276 static __m512i
block_diff_a2itc_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool query_unk_mismatch_local_p)2277 block_diff_a2itc_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2278 bool query_unk_mismatch_local_p) {
2279 __m512i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
2280
2281 read_512(&_query_high,&_query_low,&_query_flags,query_shifted);
2282 read_512(&_ref_high,&_ref_low,&_ref_flags,ref_ptr);
2283
2284 /* sarrayp == true */
2285 /* Ignore genome-C to query-T mismatches */
2286 _diff = _mm512_setzero_si512();
2287
2288 /* Compare reduced T->C nts */
2289 _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_mm512_andnot_si512(_query_low, _query_high), _mm512_andnot_si512(_ref_low, _ref_high)));
2290 _diff = _mm512_or_si512(_diff, _mm512_xor_si512(_query_low, _ref_low));
2291
2292 if (query_unk_mismatch_local_p) {
2293 _diff = _mm512_or_si512(_query_flags, _diff);
2294 } else {
2295 _diff = _mm512_andnot_si512(_query_flags, _diff);
2296 }
2297
2298 if (genome_unk_mismatch_p) {
2299 _diff = _mm512_or_si512(_ref_flags, _diff);
2300 } else {
2301 _diff = _mm512_andnot_si512(_ref_flags, _diff);
2302 }
2303
2304 return _diff;
2305 }
2306 #endif
2307
2308
2309 static UINT4
block_diff_atoi_sarray_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2310 block_diff_atoi_sarray_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2311 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2312 if (genestrand == +2) {
2313 if (plusp) {
2314 return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2315 } else {
2316 return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2317 }
2318 } else {
2319 if (plusp) {
2320 return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2321 } else {
2322 return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2323 }
2324 }
2325 }
2326
2327 #ifdef HAVE_SSE2
2328 static __m128i
block_diff_atoi_sarray_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2329 block_diff_atoi_sarray_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2330 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2331 if (genestrand == +2) {
2332 if (plusp) {
2333 return block_diff_a2itc_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2334 } else {
2335 return block_diff_a2iag_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2336 }
2337 } else {
2338 if (plusp) {
2339 return block_diff_a2iag_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2340 } else {
2341 return block_diff_a2itc_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2342 }
2343 }
2344 }
2345
2346 #ifdef USE_SHIFT_HILO
2347 static __m128i
block_diff_atoi_sarray_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)2348 block_diff_atoi_sarray_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2349 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2350 int startcolumni) {
2351 if (genestrand == +2) {
2352 if (plusp) {
2353 return block_diff_a2itc_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2354 startcolumni);
2355 } else {
2356 return block_diff_a2iag_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2357 startcolumni);
2358 }
2359 } else {
2360 if (plusp) {
2361 return block_diff_a2iag_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2362 startcolumni);
2363 } else {
2364 return block_diff_a2itc_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2365 startcolumni);
2366 }
2367 }
2368 }
2369
2370 static __m128i
block_diff_atoi_sarray_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)2371 block_diff_atoi_sarray_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2372 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2373 int endcolumni) {
2374 if (genestrand == +2) {
2375 if (plusp) {
2376 return block_diff_a2itc_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2377 endcolumni);
2378 } else {
2379 return block_diff_a2iag_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2380 endcolumni);
2381 }
2382 } else {
2383 if (plusp) {
2384 return block_diff_a2iag_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2385 endcolumni);
2386 } else {
2387 return block_diff_a2itc_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2388 endcolumni);
2389 }
2390 }
2391 }
2392 #endif
2393 #endif
2394
2395 #ifdef HAVE_SSSE3
2396 static __m128i
block_diff_atoi_sarray_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)2397 block_diff_atoi_sarray_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2398 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2399 int startcolumni) {
2400 if (genestrand == +2) {
2401 if (plusp) {
2402 return block_diff_a2itc_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2403 startcolumni);
2404 } else {
2405 return block_diff_a2iag_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2406 startcolumni);
2407 }
2408 } else {
2409 if (plusp) {
2410 return block_diff_a2iag_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2411 startcolumni);
2412 } else {
2413 return block_diff_a2itc_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2414 startcolumni);
2415 }
2416 }
2417 }
2418
2419 static __m128i
block_diff_atoi_sarray_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)2420 block_diff_atoi_sarray_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2421 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2422 int endcolumni) {
2423 if (genestrand == +2) {
2424 if (plusp) {
2425 return block_diff_a2itc_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2426 endcolumni);
2427 } else {
2428 return block_diff_a2iag_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2429 endcolumni);
2430 }
2431 } else {
2432 if (plusp) {
2433 return block_diff_a2iag_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2434 endcolumni);
2435 } else {
2436 return block_diff_a2itc_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2437 endcolumni);
2438 }
2439 }
2440 }
2441 #endif
2442
2443 #ifdef HAVE_AVX2
2444 static __m256i
block_diff_atoi_sarray_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2445 block_diff_atoi_sarray_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2446 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2447 if (genestrand == +2) {
2448 if (plusp) {
2449 return block_diff_a2itc_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2450 } else {
2451 return block_diff_a2iag_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2452 }
2453 } else {
2454 if (plusp) {
2455 return block_diff_a2iag_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2456 } else {
2457 return block_diff_a2itc_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2458 }
2459 }
2460 }
2461 #endif
2462
2463 #ifdef HAVE_AVX512
2464 static __m512i
block_diff_atoi_sarray_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2465 block_diff_atoi_sarray_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2466 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2467 if (genestrand == +2) {
2468 if (plusp) {
2469 return block_diff_a2itc_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2470 } else {
2471 return block_diff_a2iag_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2472 }
2473 } else {
2474 if (plusp) {
2475 return block_diff_a2iag_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2476 } else {
2477 return block_diff_a2itc_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2478 }
2479 }
2480 }
2481 #endif
2482
2483
2484
2485 /************************************************************************
2486 * TTOC
2487 ************************************************************************/
2488
2489 static UINT4
block_diff_ttoc_sarray_32(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2490 block_diff_ttoc_sarray_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2491 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2492 if (genestrand == +2) {
2493 if (plusp) {
2494 return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2495 } else {
2496 return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2497 }
2498 } else {
2499 if (plusp) {
2500 return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2501 } else {
2502 return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2503 }
2504 }
2505 }
2506
2507 #ifdef HAVE_SSE2
2508 static __m128i
block_diff_ttoc_sarray_128(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2509 block_diff_ttoc_sarray_128 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2510 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2511 if (genestrand == +2) {
2512 if (plusp) {
2513 return block_diff_a2iag_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2514 } else {
2515 return block_diff_a2itc_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2516 }
2517 } else {
2518 if (plusp) {
2519 return block_diff_a2itc_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2520 } else {
2521 return block_diff_a2iag_128(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2522 }
2523 }
2524 }
2525
2526 #ifdef USE_SHIFT_HILO
2527 static __m128i
block_diff_ttoc_sarray_128_shift_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)2528 block_diff_ttoc_sarray_128_shift_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2529 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2530 int startcolumni) {
2531 if (genestrand == +2) {
2532 if (plusp) {
2533 return block_diff_a2iag_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2534 startcolumni);
2535 } else {
2536 return block_diff_a2itc_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2537 startcolumni);
2538 }
2539 } else {
2540 if (plusp) {
2541 return block_diff_a2itc_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2542 startcolumni);
2543 } else {
2544 return block_diff_a2iag_128_shift_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2545 startcolumni);
2546 }
2547 }
2548 }
2549
2550 static __m128i
block_diff_ttoc_sarray_128_shift_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)2551 block_diff_ttoc_sarray_128_shift_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2552 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2553 int endcolumni) {
2554 if (genestrand == +2) {
2555 if (plusp) {
2556 return block_diff_a2iag_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2557 endcolumni);
2558 } else {
2559 return block_diff_a2itc_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2560 endcolumni);
2561 }
2562 } else {
2563 if (plusp) {
2564 return block_diff_a2itc_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2565 endcolumni);
2566 } else {
2567 return block_diff_a2iag_128_shift_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2568 endcolumni);
2569 }
2570 }
2571 }
2572 #endif
2573 #endif
2574
2575 #ifdef HAVE_SSSE3
2576 static __m128i
block_diff_ttoc_sarray_128_wrap_lo(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int startcolumni)2577 block_diff_ttoc_sarray_128_wrap_lo (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2578 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2579 int startcolumni) {
2580 if (genestrand == +2) {
2581 if (plusp) {
2582 return block_diff_a2iag_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2583 startcolumni);
2584 } else {
2585 return block_diff_a2itc_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2586 startcolumni);
2587 }
2588 } else {
2589 if (plusp) {
2590 return block_diff_a2itc_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2591 startcolumni);
2592 } else {
2593 return block_diff_a2iag_128_wrap_lo(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2594 startcolumni);
2595 }
2596 }
2597 }
2598
2599 static __m128i
block_diff_ttoc_sarray_128_wrap_hi(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p,int endcolumni)2600 block_diff_ttoc_sarray_128_wrap_hi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2601 bool plusp, int genestrand, bool query_unk_mismatch_local_p,
2602 int endcolumni) {
2603 if (genestrand == +2) {
2604 if (plusp) {
2605 return block_diff_a2iag_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2606 endcolumni);
2607 } else {
2608 return block_diff_a2itc_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2609 endcolumni);
2610 }
2611 } else {
2612 if (plusp) {
2613 return block_diff_a2itc_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2614 endcolumni);
2615 } else {
2616 return block_diff_a2iag_128_wrap_hi(query_shifted,ref_ptr,query_unk_mismatch_local_p,
2617 endcolumni);
2618 }
2619 }
2620 }
2621 #endif
2622
2623 #ifdef HAVE_AVX2
2624 static __m256i
block_diff_ttoc_sarray_256(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2625 block_diff_ttoc_sarray_256 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2626 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2627 if (genestrand == +2) {
2628 if (plusp) {
2629 return block_diff_a2iag_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2630 } else {
2631 return block_diff_a2itc_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2632 }
2633 } else {
2634 if (plusp) {
2635 return block_diff_a2itc_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2636 } else {
2637 return block_diff_a2iag_256(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2638 }
2639 }
2640 }
2641 #endif
2642
2643 #ifdef HAVE_AVX512
2644 static __m512i
block_diff_ttoc_sarray_512(Genomecomp_T * query_shifted,Genomecomp_T * ref_ptr,bool plusp,int genestrand,bool query_unk_mismatch_local_p)2645 block_diff_ttoc_sarray_512 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
2646 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
2647 if (genestrand == +2) {
2648 if (plusp) {
2649 return block_diff_a2iag_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2650 } else {
2651 return block_diff_a2itc_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2652 }
2653 } else {
2654 if (plusp) {
2655 return block_diff_a2itc_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2656 } else {
2657 return block_diff_a2iag_512(query_shifted,ref_ptr,query_unk_mismatch_local_p);
2658 }
2659 }
2660 }
2661 #endif
2662
2663
2664 /* query_shifted, ref_ptr, plusp, genestrand, query_unk_mismatch_local_p */
2665 #ifdef HAVE_AVX512
2666 typedef __m512i (*Diffproc_512_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool);
2667 #endif
2668
2669 #ifdef HAVE_AVX2
2670 typedef __m256i (*Diffproc_256_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool);
2671 #endif
2672
2673 #ifdef HAVE_SSSE3
2674 typedef __m128i (*Diffproc_128_wrap_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool, int);
2675 #endif
2676
2677 #ifdef HAVE_SSE2
2678 #ifdef USE_SHIFT_HILO
2679 typedef __m128i (*Diffproc_128_shift_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool, int);
2680 #endif
2681
2682 typedef __m128i (*Diffproc_128_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool);
2683 #endif
2684
2685 typedef UINT4 (*Diffproc_32_T) (Genomecomp_T *, Genomecomp_T *, bool, int, bool);
2686
2687
2688 /* For CMET and ATOI, ignores genome-to-query mismatches. Used by
2689 Genome_consecutive procedures, called only by sarray-read.c */
2690 #ifdef HAVE_AVX512
2691 static Diffproc_512_T block_diff_sarray_512;
2692 #endif
2693 #ifdef HAVE_AVX2
2694 static Diffproc_256_T block_diff_sarray_256;
2695 #endif
2696 #ifdef HAVE_SSSE3
2697 static Diffproc_128_wrap_T block_diff_sarray_128_wrap_lo;
2698 static Diffproc_128_wrap_T block_diff_sarray_128_wrap_hi;
2699 #endif
2700 #ifdef HAVE_SSE2
2701 static Diffproc_128_T block_diff_sarray_128;
2702 #ifdef USE_SHIFT_HILO
2703 static Diffproc_128_shift_T block_diff_sarray_128_shift_lo;
2704 static Diffproc_128_shift_T block_diff_sarray_128_shift_hi;
2705 #endif
2706 #endif
2707 static Diffproc_32_T block_diff_sarray_32;
2708
2709 #ifdef HAVE_AVX512
2710 static __m512i _BOUND_HIGH_512;
2711 static __m512i _BOUND_LOW_512;
2712 #endif
2713 #ifdef HAVE_AVX2
2714 static __m256i _BOUND_HIGH_256;
2715 static __m256i _BOUND_LOW_256;
2716 #endif
2717 #ifdef HAVE_SSE2
2718 static __m128i _BOUND_HIGH;
2719 static __m128i _BOUND_LOW;
2720 #endif
2721
2722 void
Genome_consec_setup(bool query_unk_mismatch_p_in,bool genome_unk_mismatch_p_in,Mode_T mode)2723 Genome_consec_setup (bool query_unk_mismatch_p_in, bool genome_unk_mismatch_p_in,
2724 Mode_T mode) {
2725
2726 #ifdef HAVE_AVX512
2727 _BOUND_HIGH_512 = _mm512_set_epi32(512,480,448,416, 384,352,320,288, 256,224,192,160, 128,96,64,32);
2728 _BOUND_LOW_512 = _mm512_set_epi32(480,448,416,384, 352,320,288,256, 224,192,160,128, 96,64,32,0);
2729 #endif
2730 #ifdef HAVE_AVX2
2731 _BOUND_HIGH_256 = _mm256_set_epi32(256,224,192,160,128,96,64,32);
2732 _BOUND_LOW_256 = _mm256_set_epi32(224,192,160,128,96,64,32,0);
2733 #endif
2734 #ifdef HAVE_SSE2
2735 _BOUND_HIGH = _mm_set_epi32(128,96,64,32);
2736 _BOUND_LOW = _mm_set_epi32(96,64,32,0);
2737 #endif
2738
2739 query_unk_mismatch_p = query_unk_mismatch_p_in;
2740 genome_unk_mismatch_p = genome_unk_mismatch_p_in;
2741
2742 switch (mode) {
2743 case STANDARD:
2744 #ifdef HAVE_AVX512
2745 block_diff_sarray_512 = block_diff_standard_512;
2746 #endif
2747 #ifdef HAVE_AVX2
2748 block_diff_sarray_256 = block_diff_standard_256;
2749 #endif
2750 #ifdef HAVE_SSSE3
2751 block_diff_sarray_128_wrap_lo = block_diff_standard_128_wrap_lo;
2752 block_diff_sarray_128_wrap_hi = block_diff_standard_128_wrap_hi;
2753 #endif
2754 #ifdef HAVE_SSE2
2755 block_diff_sarray_128 = block_diff_standard_128;
2756 #ifdef USE_SHIFT_HILO
2757 block_diff_sarray_128_shift_lo = block_diff_standard_128_shift_lo;
2758 block_diff_sarray_128_shift_hi = block_diff_standard_128_shift_hi;
2759 #endif
2760 #endif
2761 block_diff_sarray_32 = block_diff_standard_32;
2762 break;
2763
2764 case CMET_STRANDED: case CMET_NONSTRANDED:
2765 #ifdef HAVE_AVX512
2766 block_diff_sarray_512 = block_diff_cmet_sarray_512;
2767 #endif
2768 #ifdef HAVE_AVX2
2769 block_diff_sarray_256 = block_diff_cmet_sarray_256;
2770 #endif
2771 #ifdef HAVE_SSSE3
2772 block_diff_sarray_128_wrap_lo = block_diff_cmet_sarray_128_wrap_lo;
2773 block_diff_sarray_128_wrap_hi = block_diff_cmet_sarray_128_wrap_hi;
2774 #endif
2775 #ifdef HAVE_SSE2
2776 block_diff_sarray_128 = block_diff_cmet_sarray_128;
2777 #ifdef USE_SHIFT_HILO
2778 block_diff_sarray_128_shift_lo = block_diff_cmet_128_shift_lo;
2779 block_diff_sarray_128_shift_hi = block_diff_cmet_128_shift_hi;
2780 #endif
2781 #endif
2782 block_diff_sarray_32 = block_diff_cmet_sarray_32;
2783 break;
2784
2785 case ATOI_STRANDED: case ATOI_NONSTRANDED:
2786 #ifdef HAVE_AVX512
2787 block_diff_sarray_512 = block_diff_atoi_sarray_512;
2788 #endif
2789 #ifdef HAVE_AVX2
2790 block_diff_sarray_256 = block_diff_atoi_sarray_256;
2791 #endif
2792 #ifdef HAVE_SSSE3
2793 block_diff_sarray_128_wrap_lo = block_diff_atoi_sarray_128_wrap_lo;
2794 block_diff_sarray_128_wrap_hi = block_diff_atoi_sarray_128_wrap_hi;
2795 #endif
2796 #ifdef HAVE_SSE2
2797 block_diff_sarray_128 = block_diff_atoi_sarray_128;
2798 #ifdef USE_SHIFT_HILO
2799 block_diff_sarray_128_shift_lo = block_diff_atoi_128_shift_lo;
2800 block_diff_sarray_128_shift_hi = block_diff_atoi_128_shift_hi;
2801 #endif
2802 #endif
2803 block_diff_sarray_32 = block_diff_atoi_sarray_32;
2804 break;
2805
2806 case TTOC_STRANDED: case TTOC_NONSTRANDED:
2807 #ifdef HAVE_AVX512
2808 block_diff_sarray_512 = block_diff_ttoc_sarray_512;
2809 #endif
2810 #ifdef HAVE_AVX2
2811 block_diff_sarray_256 = block_diff_ttoc_sarray_256;
2812 #endif
2813 #ifdef HAVE_SSSE3
2814 block_diff_sarray_128_wrap_lo = block_diff_ttoc_sarray_128_wrap_lo;
2815 block_diff_sarray_128_wrap_hi = block_diff_ttoc_sarray_128_wrap_hi;
2816 #endif
2817 #ifdef HAVE_SSE2
2818 block_diff_sarray_128 = block_diff_ttoc_sarray_128;
2819 #ifdef USE_SHIFT_HILO
2820 block_diff_sarray_128_shift_lo = block_diff_ttoc_128_shift_lo;
2821 block_diff_sarray_128_shift_hi = block_diff_ttoc_128_shift_hi;
2822 #endif
2823 #endif
2824 block_diff_sarray_32 = block_diff_ttoc_sarray_32;
2825 break;
2826
2827 default: fprintf(stderr,"Mode %d not recognized\n",mode); abort();
2828 }
2829
2830 return;
2831 }
2832
2833
2834 /************************************************************************/
2835
2836 #ifdef HAVE_AVX512
2837 /* Need to implement. Extract procedures not available. */
2838 #endif
2839
2840
2841 #ifdef HAVE_AVX2
2842 #define nonzero_p_256(diff) !_mm256_testz_si256(diff,diff)
2843
2844
2845 #if defined(HAVE_POPCNT)
2846 #define popcount_ones_256(_diff) (_popcnt64(_mm256_extract_epi64(_diff,0)) + _popcnt64(_mm256_extract_epi64(_diff,1)) + popcnt64(_mm256_extract_epi64(_diff,2)) + _popcnt64(_mm256_extract_epi64(_diff,3)))
2847 #elif defined(HAVE_MM_POPCNT)
2848 #define popcount_ones_256(_diff) (_mm_popcnt_u64(_mm256_extract_epi64(_diff,0)) + _mm_popcnt_u64(_mm256_extract_epi64(_diff,1)) + _mm_popcnt_u64(_mm256_extract_epi64(_diff,2)) + _mm_popcnt_u64(_mm256_extract_epi64(_diff,3)))
2849 #else
2850 #define popcount_ones_256(_diff) (__builtin_popcountll(_mm256_extract_epi64(_diff,0)) + __builtin_popcountll(_mm256_extract_epi64(_diff,1)) + __builtin_popcountll(_mm256_extract_epi64(_diff,2)) + __builtin_popcountll(_mm256_extract_epi64(_diff,3)))
2851 #endif
2852
2853 static int
count_leading_zeroes_256(__m256i _diff)2854 count_leading_zeroes_256 (__m256i _diff) {
2855 debug4(printf("Entered count_leading_zeroes with "));
2856 debug4(print_vector_256_hex(_diff));
2857 UINT8 x;
2858
2859 #ifdef HAVE_LZCNT
2860 if ((x = _mm256_extract_epi64(_diff,3)) != 0) {
2861 return (int) _lzcnt_u64(x);
2862 } else if ((x = _mm256_extract_epi64(_diff,2)) != 0) {
2863 return 64 + (int) _lzcnt_u64(x);
2864 } else if ((x = _mm256_extract_epi64(_diff,1)) != 0) {
2865 return 128 + (int) _lzcnt_u64(x);
2866 } else {
2867 return 192 + (int) _lzcnt_u64(_mm256_extract_epi64(_diff,0));
2868 }
2869
2870 #elif defined(HAVE_BUILTIN_CLZ)
2871 if ((x = _mm256_extract_epi64(_diff,3)) != 0) {
2872 return (int) __builtin_clzll(x);
2873 } else if ((x = _mm256_extract_epi64(_diff,2)) != 0) {
2874 return 64 + (int) __builtin_clzll(x);
2875 } else if ((x = _mm256_extract_epi64(_diff,1)) != 0) {
2876 return 128 + (int) __builtin_clzll(x);
2877 } else {
2878 return 192 + (int) __builtin_clzll(_mm256_extract_epi64(_diff,0));
2879 }
2880
2881 #else
2882 abort();
2883 #endif
2884 }
2885
2886 static int
count_trailing_zeroes_256(__m256i _diff)2887 count_trailing_zeroes_256 (__m256i _diff) {
2888 debug4(printf("Entered count_trailing_zeroes with "));
2889 debug4(print_vector_256_hex(_diff));
2890 UINT8 x;
2891
2892 #ifdef HAVE_TZCNT
2893 if ((x = _mm256_extract_epi64(_diff,0)) != 0) {
2894 return (int) _tzcnt_u64(x);
2895 } else if ((x = _mm256_extract_epi64(_diff,1)) != 0) {
2896 return 64 + (int) _tzcnt_u64(x);
2897 } else if ((x = _mm256_extract_epi64(_diff,2)) != 0) {
2898 return 128 + (int) _tzcnt_u64(x);
2899 } else {
2900 return 192 + (int) _tzcnt_u64(_mm256_extract_epi64(_diff,3));
2901 }
2902
2903 #elif defined(HAVE_BUILTIN_CTZ)
2904 if ((x = _mm256_extract_epi64(_diff,0)) != 0) {
2905 return (int) __builtin_ctzll(x);
2906 } else if ((x = _mm256_extract_epi64(_diff,1)) != 0) {
2907 return 64 + (int) __builtin_ctzll(x);
2908 } else if ((x = _mm256_extract_epi64(_diff,2)) != 0) {
2909 return 128 + (int) __builtin_ctzll(x);
2910 } else {
2911 return 192 + (int) __builtin_ctzll(_mm256_extract_epi64(_diff,3));
2912 }
2913
2914 #else
2915 abort();
2916 #endif
2917
2918 }
2919
2920 static __m256i
clear_highbit_256(__m256i _diff,int leading_zeroes)2921 clear_highbit_256 (__m256i _diff, int leading_zeroes) {
2922 __m256i _subtract, _relpos;
2923 int relpos;
2924
2925 relpos = 255 - leading_zeroes;
2926 debug3(printf("Clearing high bit at relpos %d\n",relpos));
2927
2928 _subtract = _mm256_slli_epi32(_mm256_set1_epi32(1), relpos % 32);
2929 _relpos = _mm256_set1_epi32(relpos);
2930 _subtract = _mm256_and_si256(_mm256_cmpgt_epi32(_BOUND_HIGH_256, _relpos), _subtract);
2931 _subtract = _mm256_andnot_si256(_mm256_cmpgt_epi32(_BOUND_LOW_256, _relpos), _subtract);
2932
2933 debug3(printf("Subtract: "));
2934 debug3(print_vector_256_hex(_subtract));
2935 #if 0
2936 /* latency 1, throughput: 0.5 */
2937 return _mm256_sub_epi32(_diff, _subtract);
2938 #else
2939 /* _mm256_xor_si128 also works if all other bits are 0. latency 1, throughput: 0.33 */
2940 return _mm256_xor_si256(_diff, _subtract);
2941 #endif
2942 }
2943
2944 /* relpos is equal to trailing_zeroes */
2945 static __m256i
clear_lowbit_256(__m256i _diff,int relpos)2946 clear_lowbit_256 (__m256i _diff, int relpos) {
2947 __m256i _subtract, _relpos;
2948
2949 debug3(printf("Clearing low bit at relpos %d\n",relpos));
2950
2951 _subtract = _mm256_slli_epi32(_mm256_set1_epi32(1), relpos % 32);
2952 _relpos = _mm256_set1_epi32(relpos);
2953 _subtract = _mm256_and_si256(_mm256_cmpgt_epi32(_BOUND_HIGH_256, _relpos), _subtract);
2954 _subtract = _mm256_andnot_si256(_mm256_cmpgt_epi32(_BOUND_LOW_256, _relpos), _subtract);
2955
2956 debug3(printf("Subtract: "));
2957 debug3(print_vector_256_hex(_subtract));
2958 #if 0
2959 /* latency 1, throughput: 0.5 */
2960 return _mm256_sub_epi32(_diff, _subtract);
2961 #else
2962 /* _mm256_xor_si128 also works if all other bits are 0. latency 1, throughput: 0.33 */
2963 return _mm256_xor_si256(_diff, _subtract);
2964 #endif
2965 }
2966
2967 #endif
2968
2969
2970 #ifdef HAVE_SSE2
2971
2972 #ifdef HAVE_SSE4_1
2973 #define nonzero_p_128(diff) !_mm_testz_si128(diff,diff)
2974 #else
2975 #define nonzero_p_128(diff) _mm_movemask_epi8(_mm_cmpeq_epi8(diff,_mm_setzero_si128())) != 0xFFFF
2976 #endif
2977
2978 #if (defined(USE_SHIFT_FIRST_MISMATCH) && defined(HAVE_SSE2)) || (defined(USE_WRAP_FIRST_MISMATCH) && defined(HAVE_SSSE3))
2979 static __m128i
clear_start_128(__m128i _diff,int startdiscard)2980 clear_start_128 (__m128i _diff, int startdiscard) {
2981 __m128i _mask, _startdiscard;
2982 #ifdef DEBUG
2983 __m128i _result;
2984 #endif
2985
2986 debug(printf("Clearing start to startdiscard %d\n",startdiscard));
2987 debug(printf("Before: "));
2988 debug(print_vector_hex(_diff));
2989
2990 #ifdef DEFECTIVE_SSE2_COMPILER
2991 _mask = _mm_sll_epi32(_mm_set1_epi32(~0U), _mm_setr_epi32(startdiscard % 32,0,0,0));
2992 #else
2993 _mask = _mm_slli_epi32(_mm_set1_epi32(~0U), startdiscard % 32);
2994 #endif
2995 _startdiscard = _mm_set1_epi32(startdiscard);
2996 _mask = _mm_or_si128(_mask, _mm_cmplt_epi32(_startdiscard, _BOUND_LOW));
2997 _mask = _mm_and_si128(_mask, _mm_cmplt_epi32(_startdiscard, _BOUND_HIGH));
2998
2999 #ifdef DEBUG
3000 _result = _mm_and_si128(_mask, _diff);
3001 debug(printf("After: "));
3002 print_vector_hex(_result);
3003 #endif
3004
3005 return _mm_and_si128(_mask, _diff);
3006 }
3007 #endif
3008
3009 #if (defined(USE_SHIFT_FIRST_MISMATCH) && defined(HAVE_SSE2)) || (defined(USE_WRAP_FIRST_MISMATCH) && defined(HAVE_SSSE3))
3010 static __m128i
clear_end_128(__m128i _diff,int enddiscard)3011 clear_end_128 (__m128i _diff, int enddiscard) {
3012 __m128i _mask, _enddiscard;
3013 #ifdef DEBUG
3014 __m128i _result;
3015 #endif
3016
3017 debug(printf("Clearing end from enddiscard %d\n",enddiscard));
3018 debug(printf("Before: "));
3019 debug(print_vector_hex(_diff));
3020
3021 #ifdef DEFECTIVE_SSE2_COMPILER
3022 _mask = _mm_sll_epi32(_mm_set1_epi32(~0U), _mm_setr_epi32(enddiscard % 32,0,0,0));
3023 #else
3024 _mask = _mm_slli_epi32(_mm_set1_epi32(~0U), enddiscard % 32);
3025 #endif
3026 _enddiscard = _mm_set1_epi32(enddiscard);
3027 _mask = _mm_or_si128(_mm_cmplt_epi32(_enddiscard, _BOUND_LOW), _mask);
3028 _mask = _mm_and_si128(_mm_cmplt_epi32(_enddiscard, _BOUND_HIGH), _mask);
3029
3030 #ifdef DEBUG
3031 _result = _mm_andnot_si128(_mask, _diff);
3032 debug(printf("After: "));
3033 print_vector_hex(_result);
3034 #endif
3035
3036 return _mm_andnot_si128(_mask, _diff);
3037 }
3038 #endif
3039
3040
3041
3042 #if !defined(HAVE_SSE4_2) || !defined(HAVE_MM_EXTRACT_EPI64)
3043
3044 #if 0
3045 /* Naive method for pre-SSE4.2. Requires four popcount operations. */
3046 static int
3047 popcount_ones_128 (__m128i _diff) {
3048 UINT4 diff[4];
3049
3050 _mm_store_si128((__m128i *) diff,_diff);
3051
3052 return __builtin_popcount(diff[0]) + __builtin_popcount(diff[1]) + __builtin_popcount(diff[2]) + __builtin_popcount(diff[3]);
3053 }
3054 #endif
3055
3056
3057 /************************************************************************
3058 * Method for pre-SSE4.2: Using Harley's method to reduce number of
3059 * popcount operations when we need to compute four 32-bit popcounts
3060 * in a 128-bit register.
3061 *
3062 * The naive method is to do _popcnt32(diff[0]) + _popcnt32(diff[1]) + _popcnt32(diff[2]) + _popcnt32(diff[3]);
3063 *
3064 * Harley's method uses a carry-save adder (CSA) to reduce the number
3065 * of popcount operations for 3 elements from 3 to 2.
3066 ************************************************************************/
3067
3068 #define CSA(h,l, a,b,c, u,v) u = a ^ b; v = c; h = (a & b) | (u & v); l = u ^ v;
3069
3070 static int
popcount_ones_128(__m128i _diff)3071 popcount_ones_128 (__m128i _diff) {
3072 UINT4 ones, twos, u, v;
3073 UINT4 diff[4];
3074
3075 _mm_store_si128((__m128i *) diff,_diff);
3076
3077 CSA(twos, ones, diff[0], diff[1], diff[2], u, v);
3078
3079 return 2*__builtin_popcount(twos) + __builtin_popcount(ones) + __builtin_popcount(diff[3]);
3080 }
3081
3082
3083 #elif defined(HAVE_POPCNT)
3084 #define popcount_ones_128(_diff) (_popcnt64(_mm_extract_epi64(_diff,0)) + _popcnt64(_mm_extract_epi64(_diff,1)))
3085 #elif defined(HAVE_MM_POPCNT) && defined(HAVE_MM_POPCNT_U64)
3086 #define popcount_ones_128(_diff) (_mm_popcnt_u64(_mm_extract_epi64(_diff,0)) + _mm_popcnt_u64(_mm_extract_epi64(_diff,1)))
3087 #else
3088 #define popcount_ones_128(_diff) (__builtin_popcountll(_mm_extract_epi64(_diff,0)) + __builtin_popcountll(_mm_extract_epi64(_diff,1)))
3089 #endif
3090
3091
3092 static int
count_leading_zeroes_128(__m128i _diff)3093 count_leading_zeroes_128 (__m128i _diff) {
3094 debug4(printf("Entered count_leading_zeroes_128 with "));
3095 debug4(print_vector_hex(_diff));
3096
3097 #if defined(HAVE_SSE4_2) && defined(HAVE_LZCNT) && defined(HAVE_MM_EXTRACT_EPI64)
3098 UINT8 x;
3099
3100 if ((x = _mm_extract_epi64(_diff,1)) != 0) {
3101 return (int) _lzcnt_u64(x);
3102 } else {
3103 return 64 + (int) _lzcnt_u64(_mm_extract_epi64(_diff,0));
3104 }
3105
3106 #elif defined(HAVE_SSE4_1) && defined(HAVE_MM_EXTRACT_EPI64)
3107 UINT8 x;
3108
3109 if ((x = _mm_extract_epi64(_diff,1)) != 0) {
3110 return __builtin_clzll(x);
3111 } else {
3112 return 64 + __builtin_clzll(_mm_extract_epi64(_diff,0));
3113 }
3114
3115 #else
3116 UINT4 x;
3117
3118 if ((x = (_mm_extract_epi16(_diff,7) << 16) | (_mm_extract_epi16(_diff,6) & 0x0000FFFF)) != 0) {
3119 debug4(printf("word 3 is non-empty, so returning %d vs %d\n",__builtin_clz(x),__builtin_clz(((UINT4 *) &_diff)[3])));
3120 return __builtin_clz(x);
3121 } else if ((x = (_mm_extract_epi16(_diff,5) << 16) | (_mm_extract_epi16(_diff,4) & 0x0000FFFF)) != 0) {
3122 debug4(printf("word 2 is non-empty, so returning 32 + %d vs %d\n",__builtin_clz(x),__builtin_clz(((UINT4 *) &_diff)[2])));
3123 return 32 + __builtin_clz(x);
3124 } else if ((x = (_mm_extract_epi16(_diff,3) << 16) | (_mm_extract_epi16(_diff,2) & 0x0000FFFF)) != 0) {
3125 debug4(printf("word 1 is non-empty, so returning 64 + %d vs %d\n",__builtin_clz(x),__builtin_clz(((UINT4 *) &_diff)[1])));
3126 return 64 + __builtin_clz(x);
3127 } else {
3128 x = (_mm_extract_epi16(_diff,1) << 16) | (_mm_extract_epi16(_diff,0) & 0x0000FFFF);
3129 debug4(printf("word 0 is non-empty, so returning 96 + %d vs %d\n",__builtin_clz(x),__builtin_clz(((UINT4 *) &_diff)[0])));
3130 return 96 + __builtin_clz(x);
3131 }
3132 #endif
3133 }
3134
3135 static int
count_trailing_zeroes_128(__m128i _diff)3136 count_trailing_zeroes_128 (__m128i _diff) {
3137 debug4(printf("Entered count_trailing_zeroes_128 with "));
3138 debug4(print_vector_hex(_diff));
3139
3140 #if defined(HAVE_SSE4_2) && defined(HAVE_TZCNT) && defined(HAVE_MM_EXTRACT_EPI64)
3141 UINT8 x;
3142
3143 if ((x = _mm_extract_epi64(_diff,0)) != 0) {
3144 return (int) _tzcnt_u64(x);
3145 } else {
3146 return 64 + (int) _tzcnt_u64(_mm_extract_epi64(_diff,1));
3147 }
3148
3149 #elif defined(HAVE_SSE4_1) && defined(HAVE_MM_EXTRACT_EPI64)
3150 UINT8 x;
3151
3152 if ((x = _mm_extract_epi64(_diff,0)) != 0) {
3153 return __builtin_ctzll(x);
3154 } else {
3155 return 64 + __builtin_ctzll(_mm_extract_epi64(_diff,1));
3156 }
3157
3158 #else
3159 UINT4 x;
3160
3161 if ((x = (_mm_extract_epi16(_diff,1) << 16) | (_mm_extract_epi16(_diff,0) & 0x0000FFFF)) != 0) {
3162 debug4(printf("word 0 is non-empty, so returning %d vs %d\n",__builtin_ctz(x),__builtin_ctz(((UINT4 *) &_diff)[0])));
3163 return __builtin_ctz(x);
3164 } else if ((x = (_mm_extract_epi16(_diff,3) << 16) | (_mm_extract_epi16(_diff,2) & 0x0000FFFF)) != 0) {
3165 debug4(printf("word 1 is non-empty, so returning 32 + %d vs %d\n",__builtin_ctz(x),__builtin_ctz(((UINT4 *) &_diff)[1])));
3166 return 32 + __builtin_ctz(x);
3167 } else if ((x = (_mm_extract_epi16(_diff,5) << 16) | (_mm_extract_epi16(_diff,4) & 0x0000FFFF)) != 0) {
3168 debug4(printf("word 2 is non-empty, so returning 64 + %d vs %d\n",__builtin_ctz(x),__builtin_ctz(((UINT4 *) &_diff)[2])));
3169 return 64 + __builtin_ctz(x);
3170 } else {
3171 x = (_mm_extract_epi16(_diff,7) << 16) | (_mm_extract_epi16(_diff,6) & 0x0000FFFF);
3172 debug4(printf("word 3 is non-empty, so returning 96 + %d vs %d\n",__builtin_ctz(x),__builtin_ctz(((UINT4 *) &_diff)[3])));
3173 return 96 + __builtin_ctz(x);
3174 }
3175 #endif
3176 }
3177
3178 #endif
3179
3180
3181 /* 76543210 */
3182 #define HIGH_BIT 0x80000000
3183
3184 #define nonzero_p_32(diff) diff
3185
3186 #define clear_start_32(diff,startdiscard) (diff & (~0U << (startdiscard)))
3187 #define clear_end_32(diff,enddiscard) (diff & ~(~0U << (enddiscard)))
3188
3189 /* For trimming */
3190 #define set_start_32(diff,startdiscard) (diff | ~(~0U << startdiscard))
3191 #define set_end_32(diff,enddiscard) (diff | (~0U << enddiscard))
3192
3193 /* For fragment functions that evaluate only the end 16-mer */
3194 #define clear_start_mask(startdiscard) (~0U << (startdiscard))
3195 #define clear_end_mask(enddiscard) (~(~0U << (enddiscard)))
3196
3197
3198 /* Same speed: clear_highbit(diff,relpos) (diff - (HIGH_BIT >> relpos)) */
3199 /* Note: xor assumes that bit at relpos was on */
3200 #define clear_highbit_32(diff,relpos) (diff ^ (HIGH_BIT >> relpos))
3201
3202 /* Slower: clear_lowbit(diff,relpos) diff -= (1 << relpos) */
3203 #define clear_lowbit_32(diff,relpos) (diff & (diff - 1));
3204
3205
3206 #if !defined(HAVE_SSE4_2)
3207 #define popcount_ones_32(diff) (count_bits[diff & 0x0000FFFF] + count_bits[diff >> 16])
3208 #elif defined(HAVE_POPCNT)
3209 #define popcount_ones_32(diff) (_popcnt32(diff))
3210 #elif defined(HAVE_MM_POPCNT)
3211 #define popcount_ones_32(diff) (_mm_popcnt_u32(diff))
3212 #elif defined(HAVE_BUILTIN_POPCOUNT)
3213 #define popcount_ones_32(diff) (__builtin_popcount(diff))
3214 #else
3215 #define popcount_ones_32(diff) (count_bits[diff & 0x0000FFFF] + count_bits[diff >> 16])
3216 #endif
3217
3218 #if !defined(HAVE_SSE4_2)
3219 #define count_leading_zeroes_32(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff])
3220 #elif defined(HAVE_LZCNT)
3221 #define count_leading_zeroes_32(diff) _lzcnt_u32(diff)
3222 #elif defined(HAVE_BUILTIN_CLZ)
3223 #define count_leading_zeroes_32(diff) __builtin_clz(diff)
3224 #else
3225 #define count_leading_zeroes_32(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff])
3226 #endif
3227
3228 #if !defined(HAVE_SSE4_2)
3229 #define count_trailing_zeroes_32(diff) mod_37_bit_position[(-diff & diff) % 37]
3230 #elif defined(HAVE_TZCNT)
3231 #define count_trailing_zeroes_32(diff) _tzcnt_u32(diff)
3232 #elif defined(HAVE_BUILTIN_CTZ)
3233 #define count_trailing_zeroes_32(diff) __builtin_ctz(diff)
3234 #else
3235 /* lowbit = -diff & diff */
3236 #define count_trailing_zeroes_32(diff) mod_37_bit_position[(-diff & diff) % 37]
3237 #endif
3238
3239
3240 /* Counts matches from pos5 to pos3 up to first mismatch. Modified from mismatches_left */
3241 int
Genome_consecutive_matches_rightward(Genome_T ome,Compress_T query_compress,Univcoord_T left,int pos5,int pos3,bool plusp,int genestrand)3242 Genome_consecutive_matches_rightward (Genome_T ome, Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
3243 bool plusp, int genestrand) {
3244 Genomecomp_T *ref_blocks = Genome_blocks(ome);
3245 int mismatch_position, offset, nshift;
3246 int startdiscard, enddiscard;
3247 Univcoord_T startblocki, endblocki, startblocki_32, endblocki_32;
3248 Genomecomp_T *query_shifted, *ptr, *endptr;
3249 int relpos;
3250 int startcolumni, endcolumni;
3251 UINT4 diff_32;
3252 #ifdef HAVE_SSE2
3253 __m128i diff_128;
3254 #endif
3255 #ifdef HAVE_AVX2
3256 __m256i diff_256;
3257 #endif
3258
3259 debug(
3260 printf("\n\n");
3261 printf("Genome (in consecutive_matches_rightward):\n");
3262 print_blocks(ref_blocks,left+pos5,left+pos3);
3263 printf("\n");
3264 );
3265
3266
3267 startblocki = (left+pos5)/128U*12;
3268 startcolumni = ((left+pos5) % 128) / 32;
3269 startblocki_32 = startblocki + startcolumni;
3270
3271 endblocki = (left+pos3)/128U*12;
3272 endcolumni = ((left+pos3) % 128) / 32;
3273 endblocki_32 = endblocki + endcolumni;
3274
3275 debug(printf("left = %u, pos5 = %d, pos3 = %d, startblocki = %u, endblocki = %u\n",
3276 left,pos5,pos3,startblocki,endblocki));
3277
3278 nshift = left % STEP_SIZE;
3279 query_shifted = Compress_shift(query_compress,nshift);
3280 debug(printf("Query shifted %d:\n",nshift));
3281 debug(Compress_print_blocks(query_shifted,nshift,pos5,pos3));
3282 query_shifted += (nshift+pos5)/STEP_SIZE*COMPRESS_BLOCKSIZE;
3283 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
3284 #else
3285 query_shifted += startcolumni;
3286 #endif
3287
3288 startdiscard = (left+pos5) % 32;
3289 enddiscard = (left+pos3) % 32;
3290 offset = -startdiscard + pos5;
3291 ptr = &(ref_blocks[startblocki_32]);
3292 endptr = &(ref_blocks[endblocki_32]);
3293
3294 if (endblocki_32 == startblocki_32) {
3295 /* Single block */
3296 debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
3297
3298 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3299 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3300 diff_32 = clear_start_32(diff_32,startdiscard);
3301 diff_32 = clear_end_32(diff_32,enddiscard);
3302
3303 if (nonzero_p_32(diff_32)) {
3304 mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3305 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3306 return (mismatch_position - pos5);
3307 } else {
3308 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3309 return (pos3 - pos5);
3310 }
3311
3312 } else if (endblocki == startblocki) {
3313 #if defined(USE_SHIFT_FIRST_MISMATCH) && defined(HAVE_SSE2)
3314 /* Shift */
3315 enddiscard += (endcolumni - startcolumni)*32;
3316 assert(startdiscard == ((left+pos5) % 128) - startcolumni*32);
3317 assert(enddiscard == ((left+pos3) % 128) - startcolumni*32);
3318
3319 diff_128 = (block_diff_sarray_128_shift_lo)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true,
3320 startcolumni);
3321 diff_128 = clear_start_128(diff_128,startdiscard);
3322 diff_128 = clear_end_128(diff_128,enddiscard);
3323
3324 if (nonzero_p_128(diff_128)) {
3325 mismatch_position = offset + (relpos = count_trailing_zeroes_128(diff_128));
3326 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3327 return (mismatch_position - pos5);
3328 } else {
3329 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3330 return (pos3 - pos5);
3331 }
3332
3333 #else
3334 /* Start block */
3335 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3336 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3337 diff_32 = clear_start_32(diff_32,startdiscard);
3338
3339 if (nonzero_p_32(diff_32)) {
3340 mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3341 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3342 return (mismatch_position - pos5);
3343 }
3344 query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3345 offset += 32;
3346
3347 /* Single row */
3348 while (++startcolumni < endcolumni) {
3349 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3350 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3351
3352 if (nonzero_p_32(diff_32)) {
3353 mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3354 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3355 return (mismatch_position - pos5);
3356 }
3357 query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3358 offset += 32;
3359 }
3360
3361 /* End block */
3362 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3363 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3364 diff_32 = clear_end_32(diff_32,enddiscard);
3365
3366 if (nonzero_p_32(diff_32)) {
3367 mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3368 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3369 return (mismatch_position - pos5);
3370 } else {
3371 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3372 return (pos3 - pos5);
3373 }
3374 #endif
3375
3376 #if defined(USE_WRAP_FIRST_MISMATCH) && defined(HAVE_SSSE3)
3377 } else if (endblocki == startblocki + 12 && endcolumni < startcolumni) {
3378 /* Wrap */
3379 enddiscard += (4 + endcolumni - startcolumni)*32;
3380 assert(startdiscard == ((left+pos5) % 128) - startcolumni*32);
3381 assert(enddiscard == ((left+pos3) % 128) + (4 - startcolumni)*32);
3382
3383 diff_128 = (block_diff_sarray_128_wrap_lo)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true,
3384 startcolumni);
3385 diff_128 = clear_start_128(diff_128,startdiscard);
3386 diff_128 = clear_end_128(diff_128,enddiscard);
3387
3388 if (nonzero_p_128(diff_128)) {
3389 mismatch_position = offset + (relpos = count_trailing_zeroes_128(diff_128));
3390 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3391 return (mismatch_position - pos5);
3392 } else {
3393 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3394 return (pos3 - pos5);
3395 }
3396
3397 #endif
3398
3399 } else {
3400 /* Start block */
3401 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3402 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3403 diff_32 = clear_start_32(diff_32,startdiscard);
3404
3405 if (nonzero_p_32(diff_32)) {
3406 mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3407 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3408 return (mismatch_position - pos5);
3409 }
3410 query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3411 offset += 32;
3412
3413 /* Start row */
3414 while (++startcolumni < 4) {
3415 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3416 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3417 if (nonzero_p_32(diff_32)) {
3418 mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3419 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3420 return (mismatch_position - pos5);
3421 }
3422 query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3423 offset += 32;
3424 }
3425 #ifdef HAVE_SSE2
3426 query_shifted += QUERY_NEXTROW;
3427 #endif
3428 ptr += GENOME_NEXTROW;
3429
3430
3431 /* Middle rows */
3432 #ifdef HAVE_AVX2
3433 while (ptr + 24 <= endptr) {
3434 diff_256 = (block_diff_sarray_256)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3435
3436 if (nonzero_p_256(diff_256)) {
3437 mismatch_position = offset + (relpos = count_trailing_zeroes_256(diff_256));
3438 debug(printf("returning %d - %d consecutive matches\n",mismatch_position,pos5));
3439 return (mismatch_position - pos5);
3440 }
3441 query_shifted += 24; ptr += 24;
3442 offset += 256;
3443 }
3444 #endif
3445
3446 #ifdef HAVE_SSE2
3447 while (ptr + 12 <= endptr) {
3448 diff_128 = (block_diff_sarray_128)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3449
3450 if (nonzero_p_128(diff_128)) {
3451 mismatch_position = offset + (relpos = count_trailing_zeroes_128(diff_128));
3452 debug(printf("returning %d - %d consecutive matches\n",mismatch_position,pos5));
3453 return (mismatch_position - pos5);
3454 }
3455 query_shifted += 12; ptr += 12;
3456 offset += 128;
3457 }
3458 #else
3459 while (ptr + 12 <= endptr) {
3460 for (startcolumni = 0; startcolumni < 4; startcolumni++) {
3461 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3462 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3463 if (nonzero_p_32(diff_32)) {
3464 mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3465 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3466 return (mismatch_position - pos5);
3467 }
3468 query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3469 offset += 32;
3470 }
3471 /* query_shifted += QUERY_NEXTROW; */ ptr += GENOME_NEXTROW;
3472 }
3473 #endif
3474
3475 /* End row */
3476 while (ptr < endptr) {
3477 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3478 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3479
3480 if (nonzero_p_32(diff_32)) {
3481 mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3482 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3483 return (mismatch_position - pos5);
3484 }
3485 query_shifted += QUERY_NEXTCOL; ptr += GENOME_NEXTCOL;
3486 offset += 32;
3487 }
3488
3489 /* End block */
3490 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3491 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3492 diff_32 = clear_end_32(diff_32,enddiscard);
3493
3494 if (nonzero_p_32(diff_32)) {
3495 mismatch_position = offset + (relpos = count_trailing_zeroes_32(diff_32));
3496 debug(printf("Would return %d - %d consecutive matches\n",mismatch_position,pos5));
3497 return (mismatch_position - pos5);
3498 } else {
3499 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3500 return (pos3 - pos5);
3501 }
3502 }
3503 }
3504
3505
3506 /* Counts matches from pos3 to pos5 up to first mismatch. Modified from mismatches_right */
3507 int
Genome_consecutive_matches_leftward(Genome_T ome,Compress_T query_compress,Univcoord_T left,int pos5,int pos3,bool plusp,int genestrand)3508 Genome_consecutive_matches_leftward (Genome_T ome, Compress_T query_compress, Univcoord_T left, int pos5, int pos3,
3509 bool plusp, int genestrand) {
3510 Genomecomp_T *ref_blocks = Genome_blocks(ome);
3511 int mismatch_position, offset, relpos, nshift;
3512 int startdiscard, enddiscard;
3513 Univcoord_T startblocki, endblocki, startblocki_32, endblocki_32;
3514 Genomecomp_T *query_shifted, *ptr, *startptr;
3515 #ifndef HAVE_BUILTIN_CLZ
3516 Genomecomp_T top;
3517 #endif
3518 int startcolumni, endcolumni;
3519 UINT4 diff_32;
3520 #ifdef HAVE_SSE2
3521 __m128i diff_128;
3522 #endif
3523 #ifdef HAVE_AVX2
3524 __m256i diff_256;
3525 #endif
3526
3527 /* static int ncalls = 0; */
3528 /* printf("Number of calls to leftward: %d\n",++ncalls); */
3529
3530 debug(
3531 printf("\n\n");
3532 printf("Genome (in consecutive_matches_leftward):\n");
3533 print_blocks(ref_blocks,left+pos5,left+pos3);
3534 printf("\n");
3535 );
3536
3537
3538 startblocki = (left+pos5)/128U*12;
3539 startcolumni = ((left+pos5) % 128) / 32;
3540 startblocki_32 = startblocki + startcolumni;
3541
3542 endblocki = (left+pos3)/128U*12;
3543 endcolumni = ((left+pos3) % 128) / 32;
3544 endblocki_32 = endblocki + endcolumni;
3545
3546 debug(printf("left = %u, pos5 = %d, pos3 = %d, startblocki = %u, endblocki = %u\n",
3547 left,pos5,pos3,startblocki,endblocki));
3548
3549 nshift = left % STEP_SIZE;
3550 query_shifted = Compress_shift(query_compress,nshift);
3551 debug(printf("Query shifted %d:\n",nshift));
3552 debug(Compress_print_blocks(query_shifted,nshift,pos5,pos3));
3553 query_shifted += (nshift+pos3)/STEP_SIZE*COMPRESS_BLOCKSIZE;
3554 #if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
3555 #else
3556 query_shifted += endcolumni;
3557 #endif
3558
3559 startdiscard = (left+pos5) % 32;
3560 enddiscard = (left+pos3) % 32;
3561 offset = (pos3 - 1) - enddiscard + 32;
3562 ptr = &(ref_blocks[endblocki_32]);
3563 startptr = &(ref_blocks[startblocki_32]);
3564
3565 if (startblocki_32 == endblocki_32) {
3566 /* Single block */
3567 debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
3568
3569 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3570 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3571 diff_32 = clear_start_32(diff_32,startdiscard);
3572 diff_32 = clear_end_32(diff_32,enddiscard);
3573
3574 if (nonzero_p_32(diff_32)) {
3575 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3576 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3577 return (pos3 - mismatch_position - 1);
3578 } else {
3579 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3580 return (pos3 - pos5);
3581 }
3582
3583 } else if (startblocki == endblocki) {
3584 #if defined(USE_SHIFT_FIRST_MISMATCH) && defined(HAVE_SSE2)
3585 /* Shift */
3586 startdiscard += 96 - (endcolumni - startcolumni)*32;
3587 enddiscard += 96;
3588 assert(startdiscard == ((left+pos5) % 128) + (3 - endcolumni)*32);
3589 assert(enddiscard == ((left+pos3) % 128) + (3 - endcolumni)*32);
3590
3591 diff_128 = (block_diff_sarray_128_shift_hi)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true,
3592 endcolumni);
3593 diff_128 = clear_start_128(diff_128,startdiscard);
3594 diff_128 = clear_end_128(diff_128,enddiscard);
3595
3596 if (nonzero_p_128(diff_128)) {
3597 mismatch_position = offset - (relpos = count_leading_zeroes_128(diff_128));
3598 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3599 return (pos3 - mismatch_position - 1);
3600 } else {
3601 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3602 return (pos3 - pos5);
3603 }
3604
3605 #else
3606 /* End block */
3607 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3608 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3609 diff_32 = clear_end_32(diff_32,enddiscard);
3610
3611 if (nonzero_p_32(diff_32)) {
3612 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3613 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3614 return (pos3 - mismatch_position - 1);
3615 }
3616 query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3617 offset -= 32;
3618
3619 /* Single row */
3620 while (--endcolumni > startcolumni) {
3621 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3622 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3623
3624 if (nonzero_p_32(diff_32)) {
3625 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3626 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3627 return (pos3 - mismatch_position - 1);
3628 }
3629 query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3630 offset -= 32;
3631 }
3632
3633 /* Start block */
3634 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3635 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3636 diff_32 = clear_start_32(diff_32,startdiscard);
3637
3638 if (nonzero_p_32(diff_32)) {
3639 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3640 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3641 return (pos3 - mismatch_position - 1);
3642 } else {
3643 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3644 return (pos3 - pos5);
3645 }
3646 #endif
3647
3648 #if defined(USE_WRAP_FIRST_MISMATCH) && defined(HAVE_SSSE3)
3649 } else if (endblocki == startblocki + 12 && endcolumni < startcolumni) {
3650 /* Wrap */
3651 startdiscard += (startcolumni - endcolumni - 1)*32;
3652 enddiscard += 96;
3653 assert(startdiscard == ((left+pos5) % 128) - (endcolumni + 1)*32);
3654 assert(enddiscard == ((left+pos3) % 128) + (3 - endcolumni)*32);
3655
3656 diff_128 = (block_diff_sarray_128_wrap_hi)(query_shifted,ptr,plusp,genestrand,/*query_unk_mismatch_local_p*/true,
3657 endcolumni);
3658 diff_128 = clear_start_128(diff_128,startdiscard);
3659 diff_128 = clear_end_128(diff_128,enddiscard);
3660
3661 if (nonzero_p_128(diff_128)) {
3662 mismatch_position = offset - (relpos = count_leading_zeroes_128(diff_128));
3663 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3664 return (pos3 - mismatch_position - 1);
3665 } else {
3666 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3667 return (pos3 - pos5);
3668 }
3669 #endif
3670
3671 } else {
3672 /* End block */
3673 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3674 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3675 diff_32 = clear_end_32(diff_32,enddiscard);
3676
3677 if (nonzero_p_32(diff_32)) {
3678 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3679 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3680 return (pos3 - mismatch_position - 1);
3681 }
3682 query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3683 offset -= 32;
3684
3685 /* End row */
3686 while (--endcolumni >= 0) {
3687 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3688 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3689
3690 if (nonzero_p_32(diff_32)) {
3691 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3692 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3693 return (pos3 - mismatch_position - 1);
3694 }
3695 query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3696 offset -= 32;
3697 }
3698 #ifdef HAVE_SSE2
3699 query_shifted -= QUERY_NEXTROW;
3700 #endif
3701 ptr -= GENOME_NEXTROW;
3702
3703
3704 /* Middle rows */
3705 #ifdef HAVE_AVX2
3706 while (ptr >= startptr + 24) {
3707 diff_256 = (block_diff_sarray_256)(&(query_shifted[-15]),&(ptr[-15]),plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3708
3709 if (nonzero_p_256(diff_256)) {
3710 mismatch_position = offset - (relpos = count_leading_zeroes_256(diff_256));
3711 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3712 return (pos3 - mismatch_position - 1);
3713 }
3714 query_shifted -= 24; ptr -= 24;
3715 offset -= 256;
3716 }
3717 #endif
3718
3719 #ifdef HAVE_SSE2
3720 while (ptr >= startptr + 12) {
3721 diff_128 = (block_diff_sarray_128)(&(query_shifted[-3]),&(ptr[-3]),plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3722
3723 if (nonzero_p_128(diff_128)) {
3724 mismatch_position = offset - (relpos = count_leading_zeroes_128(diff_128));
3725 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3726 return (pos3 - mismatch_position - 1);
3727 }
3728 query_shifted -= 12; ptr -= 12;
3729 offset -= 128;
3730 }
3731 #else
3732 while (ptr >= startptr + 12) {
3733 for (endcolumni = 3; endcolumni >= 0; --endcolumni) {
3734 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3735 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3736
3737 if (nonzero_p_32(diff_32)) {
3738 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3739 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3740 return (pos3 - mismatch_position - 1);
3741 }
3742 query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3743 offset -= 32;
3744 }
3745 /* query_shifted -= QUERY_NEXTROW; */ ptr -= GENOME_NEXTROW;
3746 }
3747 #endif
3748
3749 /* Start row */
3750 while (ptr > startptr) {
3751 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3752 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3753 if (nonzero_p_32(diff_32)) {
3754 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3755 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3756 return (pos3 - mismatch_position - 1);
3757 }
3758 query_shifted -= QUERY_NEXTCOL; ptr -= GENOME_NEXTCOL;
3759 offset -= 32;
3760 }
3761
3762 /* Start block */
3763 diff_32 = (block_diff_sarray_32)(query_shifted,ptr,
3764 plusp,genestrand,/*query_unk_mismatch_local_p*/true);
3765 diff_32 = clear_start_32(diff_32,startdiscard);
3766
3767 if (nonzero_p_32(diff_32)) {
3768 mismatch_position = offset - (relpos = count_leading_zeroes_32(diff_32));
3769 debug(printf("returning %d - %d - 1 consecutive matches\n",pos3,mismatch_position));
3770 return (pos3 - mismatch_position - 1);
3771 } else {
3772 debug(printf("Would return %d - %d consecutive matches\n",pos3,pos5));
3773 return (pos3 - pos5);
3774 }
3775 }
3776 }
3777
3778
3779 /* Modified from Genome_consecutive_matches_rightward. Intended for
3780 writing LCP only, so coords are UINT4, and does not need SIMD
3781 instructions. */
3782
3783 int
Genome_consecutive_matches_pair(Genome_T ome,UINT4 lefta,UINT4 leftb,UINT4 genomelength)3784 Genome_consecutive_matches_pair (Genome_T ome, UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
3785 Genomecomp_T *ref_blocks = Genome_blocks(ome);
3786 int mismatch_position, offset, nshift, rightshift, nshift1, nshift2, nshifta, nshiftb;
3787 int startdiscard, enddiscard, nblocks;
3788 UINT4 left1, left2;
3789 UINT4 startblocki_1, startblocki_2, endblocki;
3790 int startcolumni_1, startcolumni_2, endcolumni;
3791 Genomecomp_T *ptr1, *ptr2, *end, *ptr1_prev;
3792 Genomecomp_T diff;
3793 Genomecomp_T shifted1[3];
3794 int relpos;
3795
3796 nshifta = lefta % 32; /* Not STEP_SIZE */
3797 nshiftb = leftb % 32; /* Not STEP_SIZE */
3798 if (nshifta < nshiftb) {
3799 left1 = lefta;
3800 left2 = leftb;
3801 nshift1 = nshifta;
3802 nshift2 = nshiftb;
3803
3804 } else {
3805 left1 = leftb;
3806 left2 = lefta;
3807 nshift1 = nshiftb;
3808 nshift2 = nshifta;
3809 }
3810 nshift = nshift2 - nshift1;
3811 rightshift = 32 - nshift; /* Not STEP_SIZE */
3812 startdiscard = nshift2;
3813
3814 /* Non-SSE2 code */
3815 startcolumni_1 = (left1 % 128) / 32;
3816 startblocki_1 = left1/128U*12 + startcolumni_1;
3817
3818 startcolumni_2 = (left2 % 128) / 32;
3819 startblocki_2 = left2/128U*12 + startcolumni_2;
3820
3821 endcolumni = (genomelength % 128) / 32;
3822 endblocki = genomelength/128U*12 + endcolumni;
3823
3824
3825 ptr1 = &(ref_blocks[startblocki_1]);
3826 ptr2 = &(ref_blocks[startblocki_2]);
3827 end = &(ref_blocks[endblocki]);
3828
3829 offset = -startdiscard;
3830
3831 debug2(printf("\n\n"));
3832 debug2(printf("left1 = %u, left2 = %u, startblocki_1 = %u, startblocki_2 = %u, endblocki = %u\n",
3833 left1,left2,startblocki_1,startblocki_2,endblocki));
3834 debug2(printf("nshift1 = %d, nshift2 = %d, nshift = %d, startdiscard = %u\n",
3835 nshift1,nshift2,nshift,startdiscard));
3836
3837 if (ptr1 == end) {
3838 /* Single block */
3839 enddiscard = genomelength % 32; /* Not STEP_SIZE */
3840 if (nshift + enddiscard < 32) { /* Not STEP_SIZE */
3841 enddiscard = nshift + enddiscard;
3842
3843 ptr1 = &(ref_blocks[startblocki_1]);
3844 ptr2 = &(ref_blocks[startblocki_2]);
3845 #ifdef WORDS_BIGENDIAN
3846 shifted1[0] = Bigendian_convert_uint(ptr1[0]) << nshift;
3847 shifted1[1] = Bigendian_convert_uint(ptr1[4]) << nshift;
3848 shifted1[2] = Bigendian_convert_uint(ptr1[8]) << nshift;
3849 #else
3850 shifted1[0] = ptr1[0] << nshift;
3851 shifted1[1] = ptr1[4] << nshift;
3852 shifted1[2] = ptr1[8] << nshift;
3853 #endif
3854 debug2(Compress_print_one_block(ptr1));
3855 debug2(Compress_print_one_block(ptr2));
3856 debug2(Compress_print_one_block(shifted1));
3857
3858 #ifdef WORDS_BIGENDIAN
3859 diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
3860 #else
3861 diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
3862 #endif
3863 diff = clear_start_32(diff,startdiscard);
3864 diff = clear_end_32(diff,enddiscard);
3865
3866 if (diff /* != 0U */) {
3867 #ifdef HAVE_BUILTIN_CTZ
3868 mismatch_position = offset + (relpos = __builtin_ctz(diff));
3869 #else
3870 mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
3871 #endif
3872 debug2(printf("case 1: returning %d matches\n",mismatch_position));
3873 return mismatch_position;
3874 } else {
3875 debug2(printf("case 2: returning %d - %d matches\n",enddiscard,startdiscard));
3876 return (enddiscard - startdiscard);
3877 }
3878
3879 } else {
3880 /* Two blocks */
3881 if (nshift > 0) {
3882 enddiscard -= (32 - nshift); /* Not STEP_SIZE */
3883 }
3884
3885 /* Block 1 */
3886 ptr1 = &(ref_blocks[startblocki_1]);
3887 ptr2 = &(ref_blocks[startblocki_2]);
3888 #ifdef WORDS_BIGENDIAN
3889 shifted1[0] = Bigendian_convert_uint(ptr1[0]) << nshift;
3890 shifted1[1] = Bigendian_convert_uint(ptr1[4]) << nshift;
3891 shifted1[2] = Bigendian_convert_uint(ptr1[8]) << nshift;
3892 #else
3893 shifted1[0] = ptr1[0] << nshift;
3894 shifted1[1] = ptr1[4] << nshift;
3895 shifted1[2] = ptr1[8] << nshift;
3896 #endif
3897 debug2(Compress_print_one_block(ptr1));
3898 debug2(Compress_print_one_block(ptr2));
3899 debug2(Compress_print_one_block(shifted1));
3900
3901 #ifdef WORDS_BIGENDIAN
3902 diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
3903 #else
3904 diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
3905 #endif
3906 diff = clear_start_32(diff,startdiscard);
3907
3908 if (diff /* != 0U */) {
3909 #ifdef HAVE_BUILTIN_CTZ
3910 mismatch_position = offset + (relpos = __builtin_ctz(diff));
3911 #else
3912 mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
3913 #endif
3914 debug2(printf("case 3: returning %d matches\n",mismatch_position));
3915 return mismatch_position;
3916 } else {
3917 ptr1_prev = ptr1;
3918 ptr1 += 1; if (++startcolumni_1 == 4) {ptr1 += 8; startcolumni_1 = 0;}
3919 ptr2 += 1; if (++startcolumni_2 == 4) {ptr2 += 8; startcolumni_2 = 0;}
3920 offset += 32; /* Not STEP_SIZE */
3921 }
3922
3923 /* Block 2 */
3924 if (nshift == 0) {
3925 /* rightshift of 32 is a no-op */
3926 #ifdef WORDS_BIGENDIAN
3927 shifted1[0] = Bigendian_convert_uint(ptr1[0]); shifted1[1] = Bigendian_convert_uint(ptr1[4]); shifted1[2] = Bigendian_convert_uint(ptr1[8]);
3928 #else
3929 shifted1[0] = ptr1[0]; shifted1[1] = ptr1[4]; shifted1[2] = ptr1[8];
3930 #endif
3931 } else {
3932 #ifdef WORDS_BIGENDIAN
3933 shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift) | (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
3934 shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift) | (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
3935 shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift) | (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
3936 #else
3937 shifted1[0] = (ptr1[0] << nshift) | (ptr1_prev[0] >> rightshift);
3938 shifted1[1] = (ptr1[4] << nshift) | (ptr1_prev[4] >> rightshift);
3939 shifted1[2] = (ptr1[8] << nshift) | (ptr1_prev[8] >> rightshift);
3940 #endif
3941 }
3942 debug2(Compress_print_one_block(ptr1));
3943 debug2(Compress_print_one_block(ptr2));
3944 debug2(Compress_print_one_block(shifted1));
3945
3946 #ifdef WORDS_BIGENDIAN
3947 diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
3948 #else
3949 diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
3950 #endif
3951 diff = clear_end_32(diff,enddiscard);
3952
3953 if (diff /* != 0U */) {
3954 #ifdef HAVE_BUILTIN_CTZ
3955 mismatch_position = offset + (relpos = __builtin_ctz(diff));
3956 #else
3957 mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
3958 #endif
3959 debug2(printf("case 4: returning %d matches\n",mismatch_position));
3960 return mismatch_position;
3961 } else {
3962 debug2(printf("case 5: returning offset %d + enddiscard %d matches\n",offset,enddiscard));
3963 return offset + enddiscard;
3964 }
3965 }
3966
3967 } else if (ptr2 == end) {
3968 /* Single block */
3969 enddiscard = genomelength % 32; /* Not STEP_SIZE */
3970
3971 ptr1 = &(ref_blocks[startblocki_1]);
3972 ptr2 = &(ref_blocks[startblocki_2]);
3973 #ifdef WORDS_BIGENDIAN
3974 shifted1[0] = Bigendian_convert_uint(ptr1[0]) << nshift;
3975 shifted1[1] = Bigendian_convert_uint(ptr1[4]) << nshift;
3976 shifted1[2] = Bigendian_convert_uint(ptr1[8]) << nshift;
3977 #else
3978 shifted1[0] = ptr1[0] << nshift;
3979 shifted1[1] = ptr1[4] << nshift;
3980 shifted1[2] = ptr1[8] << nshift;
3981 #endif
3982 debug2(Compress_print_one_block(ptr1));
3983 debug2(Compress_print_one_block(ptr2));
3984 debug2(Compress_print_one_block(shifted1));
3985
3986 #ifdef WORDS_BIGENDIAN
3987 diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
3988 #else
3989 diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
3990 #endif
3991 diff = clear_start_32(diff,startdiscard);
3992 diff = clear_end_32(diff,enddiscard);
3993
3994 if (diff /* != 0U */) {
3995 #ifdef HAVE_BUILTIN_CTZ
3996 mismatch_position = offset + (relpos = __builtin_ctz(diff));
3997 #else
3998 mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
3999 #endif
4000 debug2(printf("case 6: returning %d matches\n",mismatch_position));
4001 return mismatch_position;
4002 } else {
4003 debug2(printf("case 7: returning %d - %d matches\n",enddiscard,startdiscard));
4004 return (enddiscard - startdiscard);
4005 }
4006
4007 } else {
4008
4009 /* Startblock */
4010 ptr1 = &(ref_blocks[startblocki_1]);
4011 ptr2 = &(ref_blocks[startblocki_2]);
4012 #ifdef WORDS_BIGENDIAN
4013 shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift);
4014 shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift);
4015 shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift);
4016 #else
4017 shifted1[0] = (ptr1[0] << nshift);
4018 shifted1[1] = (ptr1[4] << nshift);
4019 shifted1[2] = (ptr1[8] << nshift);
4020 #endif
4021 debug2(Compress_print_one_block(ptr1));
4022 debug2(Compress_print_one_block(ptr2));
4023 debug2(Compress_print_one_block(shifted1));
4024
4025 #ifdef WORDS_BIGENDIAN
4026 diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
4027 #else
4028 diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
4029 #endif
4030 diff = clear_start_32(diff,startdiscard);
4031
4032 if (diff /* != 0U */) {
4033 #ifdef HAVE_BUILTIN_CTZ
4034 mismatch_position = offset + (relpos = __builtin_ctz(diff));
4035 #else
4036 mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
4037 #endif
4038 debug2(printf("case 8: returning %d matches\n",mismatch_position));
4039 return mismatch_position;
4040 } else {
4041 ptr1_prev = ptr1;
4042 ptr1 += 1; if (++startcolumni_1 == 4) {ptr1 += 8; startcolumni_1 = 0;}
4043 ptr2 += 1; if (++startcolumni_2 == 4) {ptr2 += 8; startcolumni_2 = 0;}
4044 offset += 32; /* Not STEP_SIZE */
4045 }
4046
4047 while (ptr1 < end && ptr2 < end) {
4048 if (nshift == 0) {
4049 /* rightshift of 32 is a no-op */
4050 #ifdef WORDS_BIGENDIAN
4051 shifted1[0] = Bigendian_convert_uint(ptr1[0]); shifted1[1] = Bigendian_convert_uint(ptr1[4]); shifted1[2] = Bigendian_convert_uint(ptr1[8]);
4052 #else
4053 shifted1[0] = ptr1[0]; shifted1[1] = ptr1[4]; shifted1[2] = ptr1[8];
4054 #endif
4055 } else {
4056 #ifdef WORDS_BIGENDIAN
4057 shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift) | (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
4058 shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift) | (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
4059 shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift) | (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
4060 #else
4061 shifted1[0] = (ptr1[0] << nshift) | (ptr1_prev[0] >> rightshift);
4062 shifted1[1] = (ptr1[4] << nshift) | (ptr1_prev[4] >> rightshift);
4063 shifted1[2] = (ptr1[8] << nshift) | (ptr1_prev[8] >> rightshift);
4064 #endif
4065 }
4066 debug2(Compress_print_one_block(ptr1));
4067 debug2(Compress_print_one_block(ptr2));
4068 debug2(Compress_print_one_block(shifted1));
4069
4070 #ifdef WORDS_BIGENDIAN
4071 diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
4072 #else
4073 diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
4074 #endif
4075 if (diff /* != 0U */) {
4076 #ifdef HAVE_BUILTIN_CTZ
4077 mismatch_position = offset + (relpos = __builtin_ctz(diff));
4078 #else
4079 mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
4080 #endif
4081 debug2(printf("case 9: returning %d matches\n",mismatch_position));
4082 return mismatch_position;
4083 } else {
4084 ptr1_prev = ptr1;
4085 ptr1 += 1; if (++startcolumni_1 == 4) {ptr1 += 8; startcolumni_1 = 0;}
4086 ptr2 += 1; if (++startcolumni_2 == 4) {ptr2 += 8; startcolumni_2 = 0;}
4087 offset += 32; /* Not STEP_SIZE */
4088 }
4089 }
4090
4091 /* Last block of entire genome */
4092 enddiscard = genomelength % 32; /* Not STEP_SIZE */
4093 if (ptr2 == end) {
4094 debug2(printf("ptr2 == end\n"));
4095 /* Keep enddiscard */
4096 nblocks = 1;
4097 } else if (nshift + enddiscard < 32) {
4098 debug2(printf("ptr1 == end and nshift %d + enddiscard %d < 32\n",nshift,enddiscard));
4099 enddiscard = nshift + enddiscard;
4100 nblocks = 1;
4101 } else if (nshift > 0) {
4102 debug2(printf("ptr1 == end and nshift %d + enddiscard %d >= 32\n",nshift,enddiscard));
4103 enddiscard -= (32 - nshift);
4104 nblocks = 2;
4105 } else {
4106 debug2(printf("ptr1 == end and nshift %d + enddiscard %d >= 32\n",nshift,enddiscard));
4107 /* Keep enddiscard */
4108 nblocks = 2;
4109 }
4110
4111 /* Block 1 */
4112 if (nshift == 0) {
4113 /* rightshift of 32 is a no-op */
4114 #ifdef WORDS_BIGENDIAN
4115 shifted1[0] = Bigendian_convert_uint(ptr1[0]); shifted1[1] = Bigendian_convert_uint(ptr1[4]); shifted1[2] = Bigendian_convert_uint(ptr1[8]);
4116 #else
4117 shifted1[0] = ptr1[0]; shifted1[1] = ptr1[4]; shifted1[2] = ptr1[8];
4118 #endif
4119 } else {
4120 #ifdef WORDS_BIGENDIAN
4121 shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift) | (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
4122 shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift) | (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
4123 shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift) | (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
4124 #else
4125 shifted1[0] = (ptr1[0] << nshift) | (ptr1_prev[0] >> rightshift);
4126 shifted1[1] = (ptr1[4] << nshift) | (ptr1_prev[4] >> rightshift);
4127 shifted1[2] = (ptr1[8] << nshift) | (ptr1_prev[8] >> rightshift);
4128 #endif
4129 }
4130 debug2(Compress_print_one_block(ptr1));
4131 debug2(Compress_print_one_block(ptr2));
4132 debug2(Compress_print_one_block(shifted1));
4133
4134 #ifdef WORDS_BIGENDIAN
4135 diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
4136 #else
4137 diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
4138 #endif
4139 if (nblocks == 1) {
4140 diff = clear_end_32(diff,enddiscard);
4141 }
4142
4143 if (diff /* != 0U */) {
4144 #ifdef HAVE_BUILTIN_CTZ
4145 mismatch_position = offset + (relpos = __builtin_ctz(diff));
4146 #else
4147 mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
4148 #endif
4149 debug2(printf("case 10: returning %d matches\n",mismatch_position));
4150 return mismatch_position;
4151 } else if (nblocks == 1) {
4152 debug2(printf("case 11: returning offset %d + enddiscard %d matches\n",offset,enddiscard));
4153 return offset + enddiscard;
4154 } else {
4155 ptr1_prev = ptr1;
4156 ptr1 += 1; if (++startcolumni_1 == 4) {ptr1 += 8; startcolumni_1 = 0;}
4157 ptr2 += 1; if (++startcolumni_2 == 4) {ptr2 += 8; startcolumni_2 = 0;}
4158 offset += 32; /* Not STEP_SIZE */
4159 }
4160
4161 /* Block 2 */
4162 #ifdef WORDS_BIGENDIAN
4163 shifted1[0] = (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
4164 shifted1[1] = (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
4165 shifted1[2] = (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
4166 #else
4167 shifted1[0] = (ptr1_prev[0] >> rightshift);
4168 shifted1[1] = (ptr1_prev[4] >> rightshift);
4169 shifted1[2] = (ptr1_prev[8] >> rightshift);
4170 #endif
4171 debug2(Compress_print_one_block(ptr1));
4172 debug2(Compress_print_one_block(ptr2));
4173 debug2(Compress_print_one_block(shifted1));
4174
4175 #ifdef WORDS_BIGENDIAN
4176 diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
4177 #else
4178 diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
4179 #endif
4180 diff = clear_end_32(diff,enddiscard);
4181
4182 if (diff /* != 0U */) {
4183 #ifdef HAVE_BUILTIN_CTZ
4184 mismatch_position = offset + (relpos = __builtin_ctz(diff));
4185 #else
4186 mismatch_position = offset + mod_37_bit_position[(-diff & diff) % 37];
4187 #endif
4188 debug2(printf("case 12: returning %d matches\n",mismatch_position));
4189 return mismatch_position;
4190 } else {
4191 debug2(printf("case 13: returning offset %d + enddiscard %d matches\n",offset,enddiscard));
4192 return offset + enddiscard;
4193 }
4194 }
4195 }
4196
4197