1 static char rcsid[] = "$Id: dynprog_simd.c 214361 2018-03-21 01:24:28Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "dynprog_simd.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <math.h> /* For ceil, log, pow */
11 #include <ctype.h> /* For tolower */
12
13 #ifdef HAVE_SSE2
14 #include <emmintrin.h>
15 #endif
16 #ifdef HAVE_SSE4_1
17 #include <smmintrin.h>
18 #endif
19 #ifdef HAVE_AVX2
20 #include <immintrin.h>
21 #endif
22
23 #include "mem.h"
24 #include "comp.h"
25 #include "assert.h"
26
27
/* Width-independent aliases for the SIMD intrinsics used in this file.
   Each _MM_* macro expands to the 256-bit (_mm256_*) intrinsic when
   HAVE_AVX2 is defined, and otherwise to the 128-bit (_mm_*) intrinsic when
   HAVE_SSE4_1 or HAVE_SSE2 is defined.  When none of these symbols is
   defined, no aliases are created. */
#ifdef HAVE_AVX2
/* 8-bit lane operations */
#define _MM_ADD_EPI8(x,y) _mm256_add_epi8(x,y)
#define _MM_ADDS_EPI8(x,y) _mm256_adds_epi8(x,y)
#define _MM_SUBS_EPI8(x,y) _mm256_subs_epi8(x,y)
#define _MM_CMPGT_EPI8(x,y) _mm256_cmpgt_epi8(x,y)
/* "Less than" is expressed as "greater than" with the operands swapped */
#define _MM_CMPLT_EPI8(x,y) _mm256_cmpgt_epi8(y,x) /* No _mm256_cmplt commands */
#define _MM_MAX_EPI8(x,y) _mm256_max_epi8(x,y)
#define _MM_MIN_EPI8(x,y) _mm256_min_epi8(x,y)
#define _MM_SET1_EPI8(x) _mm256_set1_epi8(x)

/* 16-bit lane operations */
#define _MM_ADD_EPI16(x,y) _mm256_add_epi16(x,y)
#define _MM_ADDS_EPI16(x,y) _mm256_adds_epi16(x,y)
#define _MM_SUBS_EPI16(x,y) _mm256_subs_epi16(x,y)
#define _MM_CMPGT_EPI16(x,y) _mm256_cmpgt_epi16(x,y)
/* "Less than" is expressed as "greater than" with the operands swapped */
#define _MM_CMPLT_EPI16(x,y) _mm256_cmpgt_epi16(y,x) /* No _mm256_cmplt commands */
#define _MM_MAX_EPI16(x,y) _mm256_max_epi16(x,y)
#define _MM_MIN_EPI16(x,y) _mm256_min_epi16(x,y)
#define _MM_SET1_EPI16(x) _mm256_set1_epi16(x)

/* Whole-register operations.  The byte-shift aliases are deliberately left
   commented out, for the reason given in their comments. */
#define _MM_SETZERO_SI _mm256_setzero_si256
/* #define _MM_SLLI_SI(x,y) _mm256_slli_si256(x,y) -- 256-bit version works within 128-bit lanes */
/* #define _MM_SRLI_SI(x,y) _mm256_srli_si256(x,y) -- 256-bit version works within 128-bit lanes */
#define _MM_ANDNOT_SI(x,y) _mm256_andnot_si256(x,y)
#define _MM_OR_SI(x,y) _mm256_or_si256(x,y)
#define _MM_AND_SI(x,y) _mm256_and_si256(x,y)

#elif defined(HAVE_SSE4_1) || defined(HAVE_SSE2)
/* 8-bit lane operations */
#define _MM_ADD_EPI8(x,y) _mm_add_epi8(x,y)
#define _MM_ADDS_EPI8(x,y) _mm_adds_epi8(x,y)
#define _MM_SUBS_EPI8(x,y) _mm_subs_epi8(x,y)
#define _MM_CMPGT_EPI8(x,y) _mm_cmpgt_epi8(x,y)
#define _MM_CMPLT_EPI8(x,y) _mm_cmplt_epi8(x,y)
#define _MM_MAX_EPI8(x,y) _mm_max_epi8(x,y)
#define _MM_MIN_EPI8(x,y) _mm_min_epi8(x,y)
#define _MM_SET1_EPI8(x) _mm_set1_epi8(x)

/* 16-bit lane operations */
#define _MM_ADD_EPI16(x,y) _mm_add_epi16(x,y)
#define _MM_ADDS_EPI16(x,y) _mm_adds_epi16(x,y)
#define _MM_SUBS_EPI16(x,y) _mm_subs_epi16(x,y)
#define _MM_CMPGT_EPI16(x,y) _mm_cmpgt_epi16(x,y)
#define _MM_CMPLT_EPI16(x,y) _mm_cmplt_epi16(x,y)
#define _MM_MAX_EPI16(x,y) _mm_max_epi16(x,y)
#define _MM_MIN_EPI16(x,y) _mm_min_epi16(x,y)
#define _MM_SET1_EPI16(x) _mm_set1_epi16(x)

/* Whole-register operations.  The byte-shift aliases stay commented out here
   as well, mirroring the AVX2 branch. */
#define _MM_SETZERO_SI _mm_setzero_si128
/* #define _MM_SLLI_SI(x,y) _mm_slli_si128(x,y) -- 256-bit version works within 128-bit lanes */
/* #define _MM_SRLI_SI(x,y) _mm_srli_si128(x,y) -- 256-bit version works within 128-bit lanes */
#define _MM_ANDNOT_SI(x,y) _mm_andnot_si128(x,y)
#define _MM_OR_SI(x,y) _mm_or_si128(x,y)
#define _MM_AND_SI(x,y) _mm_and_si128(x,y)
#endif
80
81
82
#define LAZY_INDEL 1 /* Don't advance to next coordinate on final indel, since could go over chromosome bounds. */

/* Compile-time debugging switches.  Each wrapper macro below expands to its
   argument when the corresponding symbol (CHECK1, DEBUG, DEBUG2, ...) is
   defined, and to nothing otherwise, so the wrapped statement is compiled
   only when that switch is enabled. */

/* Row 0 and column 0 directions */
/* Was useful in finding a saturation bug, but can fail because of saturation */
#ifdef CHECK1
#define check1(x) x
#else
#define check1(x)
#endif


#ifdef DEBUG
#define debug(x) x
#else
#define debug(x)
#endif

#ifdef DEBUG2
#define debug2(x) x
#else
#define debug2(x)
#endif

/* Fgap */
#ifdef DEBUG3
#define debug3(x) x
#else
#define debug3(x)
#endif

#ifdef DEBUG8
#define debug8(x) x
#else
#define debug8(x)
#endif

/* Compare SIMD with non-SIMD. Define in dynprog.h */
#ifdef DEBUG_SIMD
#define debug_simd(x) x
#else
#define debug_simd(x)
#endif

/* DEBUG15 also enables the print_vector_8/print_vector_16 helpers in this file */
#ifdef DEBUG15
#define debug15(x) x
#else
#define debug15(x)
#endif

/* Compare AVX2 with SSE42. Define in dynprog.h */
#ifdef DEBUG_AVX2
#define debug_avx2(x) x
#else
#define debug_avx2(x)
#endif

/* Checking genomic nt in traceback procedures */
#ifdef DEBUG17
#define debug17(x) x
#else
#define debug17(x)
#endif
145
146
147
148 #include "complement.h"
/* Floor used by the matrix printing routines in this file: any score below
   this value is printed as this value instead. */
#define NEG_INFINITY_DISPLAY -99
150
151
152 /************************************************************************
153 * Debugging procedures
154 ************************************************************************/
155
156 #ifdef DEBUG15
157 /* For debugging of SIMD procedures*/
158 #ifdef HAVE_AVX2
159 static void
print_vector_8(__m256i x,int r,int c,char * label)160 print_vector_8 (__m256i x, int r, int c, char *label) {
161 __m256i a[1];
162 Score8_T *s = a;
163
164 _mm_lfence(); /* Needed to print correct values */
165 _mm256_store_si256(a,x);
166 printf("%d,%d %s: %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
167 r,c,label,s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],
168 s[16],s[17],s[18],s[19],s[20],s[21],s[22],s[23],s[24],s[25],s[26],s[27],s[28],s[29],s[30],s[31]);
169 return;
170 }
171
172 static void
print_vector_16(__m256i x,int r,int c,char * label)173 print_vector_16 (__m256i x, int r, int c, char *label) {
174 __m256i a[1];
175 Score16_T *s = a;
176
177 _mm_lfence(); /* Needed to print correct values */
178 _mm256_store_si256(a,x);
179 printf("%d,%d %s: %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
180 r,c,label,s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15]);
181 return;
182 }
183
184 #else
185 static void
print_vector_8(__m128i x,int r,int c,char * label)186 print_vector_8 (__m128i x, int r, int c, char *label) {
187 __m128i a[1];
188 Score8_T *s = a;
189
190 _mm_lfence(); /* Needed to print correct values */
191 _mm_store_si128(a,x);
192 printf("%d,%d %s: %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
193 r,c,label,s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15]);
194 return;
195 }
196
197 static void
print_vector_16(__m128i x,int r,int c,char * label)198 print_vector_16 (__m128i x, int r, int c, char *label) {
199 __m128i a[1];
200 Score16_T *s = a;
201
202 _mm_lfence(); /* Needed to print correct values */
203 _mm_store_si128(a,x);
204 printf("%d,%d %s: %d %d %d %d %d %d %d %d\n",r,c,label,s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7]);
205 return;
206 }
207 #endif
208 #endif
209
210
211 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD) || defined(DEBUG2)
212 static void
Matrix8_print(Score8_T ** matrix,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int lband,int uband)213 Matrix8_print (Score8_T **matrix, int rlength, int glength, char *rsequence,
214 char *gsequence, char *gsequencealt,
215 bool revp, int lband, int uband) {
216 int i, j;
217 char g, g_alt;
218
219 #ifdef HAVE_SSE2
220 _mm_lfence();
221 #endif
222
223 /* j */
224 printf(" "); /* For i */
225 printf(" ");
226 for (j = 0; j <= glength; ++j) {
227 printf(" %2d ",j);
228 }
229 printf("\n");
230
231
232 if (gsequence) {
233 printf(" "); /* For i */
234 printf(" ");
235 for (j = 0; j <= glength; ++j) {
236 if (j == 0) {
237 printf(" ");
238 } else {
239 printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
240 }
241 }
242 printf("\n");
243 }
244
245 if (gsequencealt != gsequence) {
246 printf(" "); /* For i */
247 printf(" ");
248 for (j = 0; j <= glength; ++j) {
249 if (j == 0) {
250 printf(" ");
251 } else {
252 g = revp ? gsequence[-j+1] : gsequence[j-1];
253 g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
254 if (g == g_alt) {
255 printf(" %c ",' ');
256 } else {
257 printf(" %c ",g_alt);
258 }
259 }
260 }
261 printf("\n");
262 }
263
264 for (i = 0; i <= rlength; ++i) {
265 printf("%2d ",i);
266 if (i == 0) {
267 printf(" ");
268 } else {
269 printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
270 }
271 for (j = 0; j <= glength; ++j) {
272 if (j < i - lband) {
273 printf(" . ");
274 } else if (j > i + uband) {
275 printf(" . ");
276 } else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
277 printf("%3d ",NEG_INFINITY_DISPLAY);
278 } else {
279 printf("%3d ",matrix[j][i]);
280 }
281 }
282 printf("\n");
283 }
284 printf("\n");
285
286 return;
287 }
288
289 static void
Matrix8_print_ud(Score8_T ** matrix,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int band,bool upperp)290 Matrix8_print_ud (Score8_T **matrix, int rlength, int glength, char *rsequence,
291 char *gsequence, char *gsequencealt,
292 bool revp, int band, bool upperp) {
293 int i, j;
294 char g, g_alt;
295
296 #ifdef HAVE_SSE2
297 _mm_lfence();
298 #endif
299
300 /* j */
301 printf(" "); /* For i */
302 printf(" ");
303 for (j = 0; j <= glength; ++j) {
304 printf(" %2d ",j);
305 }
306 printf("\n");
307
308 if (gsequence) {
309 printf(" "); /* For i */
310 printf(" ");
311 for (j = 0; j <= glength; ++j) {
312 if (j == 0) {
313 printf(" ");
314 } else {
315 printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
316 }
317 }
318 printf("\n");
319 }
320
321 if (gsequencealt != gsequence) {
322 printf(" "); /* For i */
323 printf(" ");
324 for (j = 0; j <= glength; ++j) {
325 if (j == 0) {
326 printf(" ");
327 } else {
328 g = revp ? gsequence[-j+1] : gsequence[j-1];
329 g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
330 if (g == g_alt) {
331 printf(" %c ",' ');
332 } else {
333 printf(" %c ",g_alt);
334 }
335 }
336 }
337 printf("\n");
338 }
339
340 for (i = 0; i <= rlength; ++i) {
341 printf("%2d ",i);
342 if (i == 0) {
343 printf(" ");
344 } else {
345 printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
346 }
347 if (upperp == true) {
348 for (j = 0; j <= glength; ++j) {
349 if (j < i) {
350 printf(" . ");
351 } else if (j > i + band) {
352 printf(" . ");
353 } else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
354 printf("%3d ",NEG_INFINITY_DISPLAY);
355 } else {
356 printf("%3d ",matrix[j][i]);
357 }
358 }
359 } else {
360 for (j = 0; j <= glength; ++j) {
361 if (i < j) {
362 printf(" . ");
363 } else if (i > j + band) {
364 printf(" . ");
365 } else if (matrix[i][j] < NEG_INFINITY_DISPLAY) {
366 printf("%3d ",NEG_INFINITY_DISPLAY);
367 } else {
368 printf("%3d ",matrix[i][j]);
369 }
370 }
371 }
372 printf("\n");
373 }
374 printf("\n");
375
376 return;
377 }
378
379
380 static void
Matrix16_print(Score16_T ** matrix,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int lband,int uband)381 Matrix16_print (Score16_T **matrix, int rlength, int glength, char *rsequence,
382 char *gsequence, char *gsequencealt,
383 bool revp, int lband, int uband) {
384 int i, j;
385 char g, g_alt;
386
387 #ifdef HAVE_SSE2
388 _mm_lfence();
389 #endif
390
391 /* j */
392 if (rlength >= 100) {
393 printf(" ");
394 } else {
395 printf(" "); /* For i */
396 }
397 printf(" ");
398 if (glength >= 100) {
399 for (j = 0; j <= glength; ++j) {
400 printf(" %3d ",j);
401 }
402 } else {
403 for (j = 0; j <= glength; ++j) {
404 printf(" %2d ",j);
405 }
406 }
407 printf("\n");
408
409 if (gsequence) {
410 if (rlength >= 100) {
411 printf(" ");
412 } else {
413 printf(" "); /* For i */
414 }
415 printf(" ");
416 if (glength >= 100) {
417 for (j = 0; j <= glength; ++j) {
418 if (j == 0) {
419 printf(" ");
420 } else {
421 printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
422 }
423 }
424 } else {
425 for (j = 0; j <= glength; ++j) {
426 if (j == 0) {
427 printf(" ");
428 } else {
429 printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
430 }
431 }
432 }
433 printf("\n");
434 }
435
436 if (gsequencealt != gsequence) {
437 if (rlength >= 100) {
438 printf(" ");
439 } else {
440 printf(" "); /* For i */
441 }
442 printf(" ");
443 if (glength >= 100) {
444 for (j = 0; j <= glength; ++j) {
445 if (j == 0) {
446 printf(" ");
447 } else {
448 g = revp ? gsequence[-j+1] : gsequence[j-1];
449 g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
450 if (g == g_alt) {
451 printf(" %c ",' ');
452 } else {
453 printf(" %c ",g_alt);
454 }
455 }
456 }
457 } else {
458 for (j = 0; j <= glength; ++j) {
459 if (j == 0) {
460 printf(" ");
461 } else {
462 g = revp ? gsequence[-j+1] : gsequence[j-1];
463 g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
464 if (g == g_alt) {
465 printf(" %c ",' ');
466 } else {
467 printf(" %c ",g_alt);
468 }
469 }
470 }
471 }
472 printf("\n");
473 }
474
475 for (i = 0; i <= rlength; ++i) {
476 if (rlength >= 100) {
477 printf("%3d ",i);
478 } else {
479 printf("%2d ",i);
480 }
481 if (i == 0) {
482 printf(" ");
483 } else {
484 printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
485 }
486 if (glength >= 100) {
487 for (j = 0; j <= glength; ++j) {
488 if (j < i - lband) {
489 printf(" . ");
490 } else if (j > i + uband) {
491 printf(" . ");
492 } else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
493 printf(" %3d ",NEG_INFINITY_DISPLAY);
494 } else {
495 printf(" %3d ",matrix[j][i]);
496 }
497 }
498 } else {
499 for (j = 0; j <= glength; ++j) {
500 if (j < i - lband) {
501 printf(" . ");
502 } else if (j > i + uband) {
503 printf(" . ");
504 } else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
505 printf("%3d ",NEG_INFINITY_DISPLAY);
506 } else {
507 printf("%3d ",matrix[j][i]);
508 }
509 }
510 }
511 printf("\n");
512 }
513 printf("\n");
514
515 return;
516 }
517
518 static void
Matrix16_print_ud(Score16_T ** matrix,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int band,bool upperp)519 Matrix16_print_ud (Score16_T **matrix, int rlength, int glength, char *rsequence,
520 char *gsequence, char *gsequencealt,
521 bool revp, int band, bool upperp) {
522 int i, j;
523 char g, g_alt;
524
525 #ifdef HAVE_SSE2
526 _mm_lfence();
527 #endif
528
529 /* j */
530 printf(" "); /* For i */
531 printf(" ");
532 for (j = 0; j <= glength; ++j) {
533 printf(" %2d ",j);
534 }
535 printf("\n");
536
537 if (gsequence) {
538 printf(" "); /* For i */
539 printf(" ");
540 for (j = 0; j <= glength; ++j) {
541 if (j == 0) {
542 printf(" ");
543 } else {
544 printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
545 }
546 }
547 printf("\n");
548 }
549
550 if (gsequencealt != gsequence) {
551 printf(" "); /* For i */
552 printf(" ");
553 for (j = 0; j <= glength; ++j) {
554 if (j == 0) {
555 printf(" ");
556 } else {
557 g = revp ? gsequence[-j+1] : gsequence[j-1];
558 g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
559 if (g == g_alt) {
560 printf(" %c ",' ');
561 } else {
562 printf(" %c ",g_alt);
563 }
564 }
565 }
566 printf("\n");
567 }
568
569 for (i = 0; i <= rlength; ++i) {
570 printf("%2d ",i);
571 if (i == 0) {
572 printf(" ");
573 } else {
574 printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
575 }
576 if (upperp == true) {
577 for (j = 0; j <= glength; ++j) {
578 if (j < i) {
579 printf(" . ");
580 } else if (j > i + band) {
581 printf(" . ");
582 } else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
583 printf("%3d ",NEG_INFINITY_DISPLAY);
584 } else {
585 printf("%3d ",matrix[j][i]);
586 }
587 }
588 } else {
589 for (j = 0; j <= glength; ++j) {
590 if (i < j) {
591 printf(" . ");
592 } else if (i > j + band) {
593 printf(" . ");
594 } else if (matrix[i][j] < NEG_INFINITY_DISPLAY) {
595 printf("%3d ",NEG_INFINITY_DISPLAY);
596 } else {
597 printf("%3d ",matrix[i][j]);
598 }
599 }
600 }
601 printf("\n");
602 }
603 printf("\n");
604
605 return;
606 }
607 #endif
608
609 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD) || defined(DEBUG2)
610 static void
Directions8_print(Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,Direction8_T ** directions_Fgap,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int lband,int uband)611 Directions8_print (Direction8_T **directions_nogap, Direction8_T **directions_Egap, Direction8_T **directions_Fgap,
612 int rlength, int glength, char *rsequence, char *gsequence, char *gsequencealt,
613 bool revp, int lband, int uband) {
614 int i, j;
615 char g, g_alt;
616
617 #ifdef HAVE_SSE2
618 _mm_lfence();
619 #endif
620
621 /* j */
622 printf(" "); /* For i */
623 printf(" ");
624 for (j = 0; j <= glength; ++j) {
625 printf(" %2d ",j);
626 }
627 printf("\n");
628
629 if (gsequence) {
630 printf(" "); /* For i */
631 printf(" ");
632 for (j = 0; j <= glength; ++j) {
633 if (j == 0) {
634 printf(" ");
635 } else {
636 printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
637 }
638 }
639 printf("\n");
640 }
641
642 if (gsequencealt != gsequence) {
643 printf(" "); /* For i */
644 printf(" ");
645 for (j = 0; j <= glength; ++j) {
646 if (j == 0) {
647 printf(" ");
648 } else {
649 g = revp ? gsequence[-j+1] : gsequence[j-1];
650 g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
651 if (g == g_alt) {
652 printf(" %c ",' ');
653 } else {
654 printf(" %c ",g_alt);
655 }
656 }
657 }
658 printf("\n");
659 }
660
661 for (i = 0; i <= rlength; ++i) {
662 printf("%2d ",i);
663 if (i == 0) {
664 printf(" ");
665 } else {
666 printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
667 }
668 for (j = 0; j <= glength; ++j) {
669 if (j < i - lband) {
670 printf(" ");
671 } else if (j > i + uband) {
672 printf(" ");
673 } else {
674 if (directions_Egap[j][i] == DIAG) {
675 printf("D");
676 } else {
677 /* Must be HORIZ */
678 printf("H");
679 }
680 printf("|");
681 if (directions_nogap[j][i] == DIAG) {
682 printf("D");
683 } else if (directions_nogap[j][i] == HORIZ) {
684 printf("H");
685 } else {
686 /* Must be VERT */
687 printf("V");
688 }
689 printf("|");
690 if (directions_Fgap[j][i] == DIAG) {
691 printf("D");
692 } else {
693 /* Must be VERT */
694 printf("V");
695 }
696 }
697 printf(" ");
698 }
699 printf("\n");
700 }
701 printf("\n");
702
703 return;
704 }
705
706 static void
Directions8_print_ud(Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int band,bool upperp)707 Directions8_print_ud (Direction8_T **directions_nogap, Direction8_T **directions_Egap,
708 int rlength, int glength, char *rsequence, char *gsequence, char *gsequencealt,
709 bool revp, int band, bool upperp) {
710 int i, j;
711 char g, g_alt;
712
713 #ifdef HAVE_SSE2
714 _mm_lfence();
715 #endif
716
717 /* j */
718 printf(" "); /* For i */
719 printf(" ");
720 for (j = 0; j <= glength; ++j) {
721 printf(" %2d ",j);
722 }
723 printf("\n");
724
725 if (gsequence) {
726 printf(" "); /* For i */
727 printf(" ");
728 for (j = 0; j <= glength; ++j) {
729 if (j == 0) {
730 printf(" ");
731 } else {
732 printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
733 }
734 }
735 printf("\n");
736 }
737
738 if (gsequencealt != gsequence) {
739 printf(" "); /* For i */
740 printf(" ");
741 for (j = 0; j <= glength; ++j) {
742 if (j == 0) {
743 printf(" ");
744 } else {
745 g = revp ? gsequence[-j+1] : gsequence[j-1];
746 g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
747 if (g == g_alt) {
748 printf(" %c ",' ');
749 } else {
750 printf(" %c ",g_alt);
751 }
752 }
753 }
754 printf("\n");
755 }
756
757 for (i = 0; i <= rlength; ++i) {
758 printf("%2d ",i);
759 if (i == 0) {
760 printf(" ");
761 } else {
762 printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
763 }
764 if (upperp == true) {
765 for (j = 0; j <= glength; ++j) {
766 if (j < i) {
767 printf(" ");
768 } else if (j > i + band) {
769 printf(" ");
770 } else {
771 if (directions_Egap[j][i] == DIAG) {
772 printf("D");
773 } else {
774 printf("-");
775 }
776 printf("|");
777 if (directions_nogap[j][i] == DIAG) {
778 printf("D");
779 } else {
780 printf("-");
781 }
782 printf("| "); /* For Fgap */
783 }
784 printf(" ");
785 }
786 } else {
787 for (j = 0; j <= glength; ++j) {
788 if (i < j) {
789 printf(" ");
790 } else if (i > j + band) {
791 printf(" ");
792 } else {
793 printf(" |"); /* For Fgap */
794 if (directions_nogap[i][j] == DIAG) {
795 printf("D");
796 } else {
797 printf("-");
798 }
799 printf("|");
800 if (directions_Egap[i][j] == DIAG) {
801 printf("D");
802 } else {
803 printf("-");
804 }
805 }
806 printf(" ");
807 }
808 }
809 printf("\n");
810 }
811 printf("\n");
812
813 return;
814 }
815
816
817 static void
Directions16_print(Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,Direction16_T ** directions_Fgap,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int lband,int uband)818 Directions16_print (Direction16_T **directions_nogap, Direction16_T **directions_Egap, Direction16_T **directions_Fgap,
819 int rlength, int glength, char *rsequence, char *gsequence, char *gsequencealt,
820 bool revp, int lband, int uband) {
821 int i, j;
822 char g, g_alt;
823
824 #ifdef HAVE_SSE2
825 _mm_lfence();
826 #endif
827
828 /* j */
829 printf(" "); /* For i */
830 printf(" ");
831 for (j = 0; j <= glength; ++j) {
832 printf(" %3d ",j);
833 }
834 printf("\n");
835
836 if (gsequence) {
837 printf(" "); /* For i */
838 printf(" ");
839 for (j = 0; j <= glength; ++j) {
840 if (j == 0) {
841 printf(" ");
842 } else {
843 printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
844 }
845 }
846 printf("\n");
847 }
848
849 if (gsequencealt != gsequence) {
850 printf(" "); /* For i */
851 printf(" ");
852 for (j = 0; j <= glength; ++j) {
853 if (j == 0) {
854 printf(" ");
855 } else {
856 g = revp ? gsequence[-j+1] : gsequence[j-1];
857 g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
858 if (g == g_alt) {
859 printf(" %c ",' ');
860 } else {
861 printf(" %c ",g_alt);
862 }
863 }
864 }
865 printf("\n");
866 }
867
868 for (i = 0; i <= rlength; ++i) {
869 printf("%2d ",i);
870 if (i == 0) {
871 printf(" ");
872 } else {
873 printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
874 }
875 for (j = 0; j <= glength; ++j) {
876 if (j < i - lband) {
877 printf(" ");
878 } else if (j > i + uband) {
879 printf(" ");
880 } else {
881 if (directions_Egap[j][i] == DIAG) {
882 printf("D");
883 } else {
884 /* Must be HORIZ */
885 printf("H");
886 }
887 printf("|");
888 if (directions_nogap[j][i] == DIAG) {
889 printf("D");
890 } else if (directions_nogap[j][i] == HORIZ) {
891 printf("H");
892 } else {
893 /* Must be VERT */
894 printf("V");
895 }
896 printf("|");
897 if (directions_Fgap[j][i] == DIAG) {
898 printf("D");
899 } else {
900 /* Must be VERT */
901 printf("V");
902 }
903 }
904 printf(" ");
905 }
906 printf("\n");
907 }
908 printf("\n");
909
910 return;
911 }
912
913 static void
Directions16_print_ud(Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int band,bool upperp)914 Directions16_print_ud (Direction16_T **directions_nogap, Direction16_T **directions_Egap,
915 int rlength, int glength, char *rsequence, char *gsequence, char *gsequencealt,
916 bool revp, int band, bool upperp) {
917 int i, j;
918 char g, g_alt;
919
920 #ifdef HAVE_SSE2
921 _mm_lfence();
922 #endif
923
924 /* j */
925 printf(" "); /* For i */
926 printf(" ");
927 for (j = 0; j <= glength; ++j) {
928 printf(" %2d ",j);
929 }
930 printf("\n");
931
932 if (gsequence) {
933 printf(" "); /* For i */
934 printf(" ");
935 for (j = 0; j <= glength; ++j) {
936 if (j == 0) {
937 printf(" ");
938 } else {
939 printf(" %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
940 }
941 }
942 printf("\n");
943 }
944
945 if (gsequencealt != gsequence) {
946 printf(" "); /* For i */
947 printf(" ");
948 for (j = 0; j <= glength; ++j) {
949 if (j == 0) {
950 printf(" ");
951 } else {
952 g = revp ? gsequence[-j+1] : gsequence[j-1];
953 g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
954 if (g == g_alt) {
955 printf(" %c ",' ');
956 } else {
957 printf(" %c ",g_alt);
958 }
959 }
960 }
961 printf("\n");
962 }
963
964 for (i = 0; i <= rlength; ++i) {
965 printf("%2d ",i);
966 if (i == 0) {
967 printf(" ");
968 } else {
969 printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
970 }
971 if (upperp == true) {
972 for (j = 0; j <= glength; ++j) {
973 if (j < i) {
974 printf(" ");
975 } else if (j > i + band) {
976 printf(" ");
977 } else {
978 if (directions_Egap[j][i] == DIAG) {
979 printf("D");
980 } else {
981 printf("-");
982 }
983 printf("|");
984 if (directions_nogap[j][i] == DIAG) {
985 printf("D");
986 } else {
987 printf("-");
988 }
989 }
990 printf(" "); /* For Fgap */
991 printf(" ");
992 }
993 } else {
994 for (j = 0; j <= glength; ++j) {
995 printf(" "); /* For Fgap */
996 if (i < j) {
997 printf(" ");
998 } else if (i > j + band) {
999 printf(" ");
1000 } else {
1001 if (directions_nogap[i][j] == DIAG) {
1002 printf("D");
1003 } else {
1004 printf("-");
1005 }
1006 printf("|");
1007 if (directions_Egap[i][j] == DIAG) {
1008 printf("D");
1009 } else {
1010 printf("-");
1011 }
1012 }
1013 printf(" ");
1014 }
1015 }
1016 printf("\n");
1017 }
1018 printf("\n");
1019
1020 return;
1021 }
1022 #endif
1023
1024
1025
1026 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
1027 static void
banded_matrix8_compare(Score8_T ** matrix1,Score8_T ** matrix2,int rlength,int glength,int lband,int uband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1028 banded_matrix8_compare (Score8_T **matrix1,
1029 #ifdef DEBUG_AVX2
1030 Score8_T **matrix2,
1031 #else
1032 Score32_T **matrix2,
1033 #endif
1034 int rlength, int glength, int lband, int uband, char *rsequence, char *gsequence, char *gsequence_alt,
1035 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1036 bool revp) {
1037 int r, c, rlo, rhigh;
1038
1039 for (c = 1; c <= glength; c++) {
1040 if ((rlo = c - uband) < 1) {
1041 rlo = 1;
1042 };
1043
1044 if ((rhigh = c + lband) > rlength) {
1045 rhigh = rlength;
1046 }
1047
1048 for (r = rlo; r <= rhigh; r++) {
1049 if (matrix1[c][r] <= NEG_INFINITY_8 + 30 && matrix2[c][r] <= NEG_INFINITY_8 + 30) {
1050 /* Okay: both essentially negative infinity */
1051 } else if (matrix1[c][r] != matrix2[c][r]) {
1052 printf("At %d,%d, value %d != value %d\n",r,c,matrix1[c][r],matrix2[c][r]);
1053
1054 Matrix8_print(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1055 revp,lband,uband);
1056 #ifdef DEBUG_AVX2
1057 Matrix8_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1058 revp,lband,uband);
1059 #elif defined(DEBUG_SIMD)
1060 Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1061 goffset,chroffset,chrhigh,watsonp,revp,lband,uband);
1062 #endif
1063 abort();
1064 }
1065 }
1066 }
1067
1068 return;
1069 }
1070
1071 static void
banded_matrix8_compare_upper(Score8_T ** matrix1,Score8_T ** matrix2,int rlength,int glength,int uband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1072 banded_matrix8_compare_upper (Score8_T **matrix1,
1073 #ifdef DEBUG_AVX2
1074 Score8_T **matrix2,
1075 #else
1076 Score32_T **matrix2,
1077 #endif
1078 int rlength, int glength,
1079 int uband, char *rsequence, char *gsequence, char *gsequence_alt,
1080 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1081 bool revp) {
1082 int r, c, rlo, rhigh;
1083
1084 for (c = 1; c <= glength; c++) {
1085 if ((rlo = c - uband) < 1) {
1086 rlo = 1;
1087 };
1088
1089 if ((rhigh = c) > rlength) {
1090 rhigh = rlength;
1091 }
1092
1093 for (r = rlo; r <= rhigh; r++) {
1094 if (matrix1[c][r] <= NEG_INFINITY_8 + 30 && matrix2[c][r] <= NEG_INFINITY_8 + 30) {
1095 /* Okay */
1096 } else if (matrix1[c][r] != matrix2[c][r]) {
1097 printf("At %d,%d, value %d != value %d\n",r,c,matrix1[c][r],matrix2[c][r]);
1098
1099 Matrix8_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1100 revp,uband,/*upperp*/true);
1101 #ifdef DEBUG_AVX2
1102 Matrix8_print_ud(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1103 revp,uband,/*upperp*/true);
1104 #else
1105 Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1106 goffset,chroffset,chrhigh,watsonp,revp,/*lband*/0,uband);
1107 #endif
1108 abort();
1109 }
1110 }
1111 }
1112
1113 return;
1114 }
1115
1116 static void
banded_matrix8_compare_lower(Score8_T ** matrix1,Score8_T ** matrix2,int rlength,int glength,int lband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1117 banded_matrix8_compare_lower (Score8_T **matrix1,
1118 #ifdef DEBUG_AVX2
1119 Score8_T **matrix2,
1120 #else
1121 Score32_T **matrix2,
1122 #endif
1123 int rlength, int glength,
1124 int lband, char *rsequence, char *gsequence, char *gsequence_alt,
1125 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1126 bool revp) {
1127 int r, c, rlo, rhigh;
1128
1129 for (c = 1; c <= glength; c++) {
1130 if ((rlo = c) < 1) {
1131 rlo = 1;
1132 };
1133
1134 if ((rhigh = c + lband) > rlength) {
1135 rhigh = rlength;
1136 }
1137
1138 for (r = rlo; r <= rhigh; r++) {
1139 #ifdef DEBUG_AVX2
1140 if (matrix1[r][c] <= NEG_INFINITY_8 + 30 && matrix2[r][c] <= NEG_INFINITY_8 + 30) {
1141 /* Okay */
1142 } else if (matrix1[r][c] != matrix2[r][c]) {
1143 printf("At %d,%d, value %d != value %d\n",r,c,matrix1[r][c],matrix2[r][c]);
1144
1145 Matrix8_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1146 revp,lband,/*upperp*/false);
1147 Matrix8_print_ud(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1148 revp,lband,/*upperp*/false);
1149 abort();
1150 }
1151 #else
1152 if (matrix1[r][c] <= NEG_INFINITY_8 + 30 && matrix2[c][r] <= NEG_INFINITY_8 + 30) {
1153 /* Okay */
1154 } else if (matrix1[r][c] != matrix2[c][r]) {
1155 printf("At %d,%d, value %d != value %d\n",r,c,matrix1[r][c],matrix2[c][r]);
1156
1157 Matrix8_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1158 revp,lband,/*upperp*/false);
1159 Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1160 goffset,chroffset,chrhigh,watsonp,revp,lband,/*uband*/0);
1161 abort();
1162 }
1163 #endif
1164 }
1165 }
1166
1167 return;
1168 }
1169
1170
1171 static void
banded_matrix16_compare(Score16_T ** matrix1,Score16_T ** matrix2,int rlength,int glength,int lband,int uband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1172 banded_matrix16_compare (Score16_T **matrix1,
1173 #ifdef DEBUG_AVX2
1174 Score16_T **matrix2,
1175 #elif defined(DEBUG_SIMD)
1176 Score32_T **matrix2,
1177 #endif
1178 int rlength, int glength, int lband, int uband, char *rsequence, char *gsequence, char *gsequence_alt,
1179 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1180 bool revp) {
1181 int r, c, rlo, rhigh;
1182
1183 for (c = 1; c <= glength; c++) {
1184 if ((rlo = c - uband) < 1) {
1185 rlo = 1;
1186 };
1187
1188 if ((rhigh = c + lband) > rlength) {
1189 rhigh = rlength;
1190 }
1191
1192 for (r = rlo; r <= rhigh; r++) {
1193 if (matrix1[c][r] <= NEG_INFINITY_16 + 30 && matrix2[c][r] <= NEG_INFINITY_16 + 30) {
1194 /* Okay: both essentially negative infinity */
1195 } else if (matrix1[c][r] != matrix2[c][r]) {
1196 printf("At %d,%d, value %d != value %d\n",r,c,matrix1[c][r],matrix2[c][r]);
1197
1198 Matrix16_print(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1199 revp,lband,uband);
1200 #ifdef DEBUG_AVX2
1201 Matrix16_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1202 revp,lband,uband);
1203 #elif defined(DEBUG_SIMD)
1204 Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1205 goffset,chroffset,chrhigh,watsonp,revp,lband,uband);
1206 #endif
1207 abort();
1208 }
1209 }
1210 }
1211
1212 return;
1213 }
1214
1215 static void
banded_matrix16_compare_upper(Score16_T ** matrix1,Score16_T ** matrix2,int rlength,int glength,int uband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1216 banded_matrix16_compare_upper (Score16_T **matrix1,
1217 #ifdef DEBUG_AVX2
1218 Score16_T **matrix2,
1219 #else
1220 Score32_T **matrix2,
1221 #endif
1222 int rlength, int glength,
1223 int uband, char *rsequence, char *gsequence, char *gsequence_alt,
1224 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1225 bool revp) {
1226 int r, c, rlo, rhigh;
1227
1228 for (c = 1; c <= glength; c++) {
1229 if ((rlo = c - uband) < 1) {
1230 rlo = 1;
1231 };
1232
1233 if ((rhigh = c) > rlength) {
1234 rhigh = rlength;
1235 }
1236
1237 for (r = rlo; r <= rhigh; r++) {
1238 if (matrix1[c][r] <= NEG_INFINITY_16 + 30 && matrix2[c][r] <= NEG_INFINITY_16 + 30) {
1239 /* Okay */
1240 } else if (matrix1[c][r] != matrix2[c][r]) {
1241 printf("At %d,%d, value %d != value %d\n",r,c,matrix1[c][r],matrix2[c][r]);
1242
1243 Matrix16_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1244 revp,uband,/*upperp*/true);
1245 #ifdef DEBUG_AVX2
1246 Matrix16_print_ud(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1247 revp,uband,/*upperp*/true);
1248 #else
1249 Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1250 goffset,chroffset,chrhigh,watsonp,revp,/*lband*/0,uband);
1251 #endif
1252 abort();
1253 }
1254 }
1255 }
1256
1257 return;
1258 }
1259
1260 static void
banded_matrix16_compare_lower(Score16_T ** matrix1,Score16_T ** matrix2,int rlength,int glength,int lband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1261 banded_matrix16_compare_lower (Score16_T **matrix1,
1262 #ifdef DEBUG_AVX2
1263 Score16_T **matrix2,
1264 #else
1265 Score32_T **matrix2,
1266 #endif
1267 int rlength, int glength,
1268 int lband, char *rsequence, char *gsequence, char *gsequence_alt,
1269 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1270 bool revp) {
1271 int r, c, rlo, rhigh;
1272
1273 for (c = 1; c <= glength; c++) {
1274 if ((rlo = c) < 1) {
1275 rlo = 1;
1276 };
1277
1278 if ((rhigh = c + lband) > rlength) {
1279 rhigh = rlength;
1280 }
1281
1282 for (r = rlo; r <= rhigh; r++) {
1283 #ifdef DEBUG_AVX2
1284 if (matrix1[r][c] <= NEG_INFINITY_16 + 30 && matrix2[r][c] <= NEG_INFINITY_16 + 30) {
1285 /* Okay */
1286 } else if (matrix1[r][c] != matrix2[r][c]) {
1287 printf("At %d,%d, value %d != value %d\n",r,c,matrix1[r][c],matrix2[r][c]);
1288
1289 Matrix16_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1290 revp,lband,/*upperp*/false);
1291 Matrix16_print_ud(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1292 revp,lband,/*upperp*/false);
1293 abort();
1294 }
1295 #else
1296 if (matrix1[r][c] <= NEG_INFINITY_16 + 30 && matrix2[c][r] <= NEG_INFINITY_16 + 30) {
1297 /* Okay */
1298 } else if (matrix1[r][c] != matrix2[c][r]) {
1299 printf("At %d,%d, value %d != value %d\n",r,c,matrix1[r][c],matrix2[c][r]);
1300
1301 Matrix16_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1302 revp,lband,/*upperp*/false);
1303 Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1304 goffset,chroffset,chrhigh,watsonp,revp,lband,/*uband*/0);
1305 abort();
1306 }
1307 #endif
1308 }
1309 }
1310
1311 return;
1312 }
1313
1314 #endif
1315
1316 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
1317 static void
banded_directions8_compare_nogap(Score8_T ** matrix,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband,int uband)1318 banded_directions8_compare_nogap (Score8_T **matrix, Direction8_T **directions1,
1319 #ifdef DEBUG_AVX2
1320 Direction8_T **directions2,
1321 #elif defined(DEBUG_SIMD)
1322 Direction32_T **directions2,
1323 #endif
1324 int rlength, int glength, int lband, int uband) {
1325 int r, c, rlo, rhigh;
1326
1327 for (c = 1; c <= glength; c++) {
1328 if ((rlo = c - uband) < 1) {
1329 rlo = 1;
1330 };
1331
1332 if ((rhigh = c + lband) > rlength) {
1333 rhigh = rlength;
1334 }
1335
1336 for (r = rlo; r <= rhigh; r++) {
1337 if (matrix[c][r] < NEG_INFINITY_8 + 30) {
1338 /* Don't check */
1339
1340 } else if (directions1[c][r] == 0) {
1341 if (directions2[c][r] == 0) {
1342 } else {
1343 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1344 abort();
1345 }
1346
1347 } else if (directions1[c][r] == 1) {
1348 if (directions2[c][r] == 1) {
1349 } else {
1350 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1351 abort();
1352 }
1353
1354 } else {
1355 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1356 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1357 abort();
1358 }
1359 }
1360 }
1361 }
1362
1363 return;
1364 }
1365
1366 static void
banded_directions8_compare_nogap_upper(Score8_T ** matrix,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int uband)1367 banded_directions8_compare_nogap_upper (Score8_T **matrix, Direction8_T **directions1,
1368 #ifdef DEBUG_AVX2
1369 Direction8_T **directions2,
1370 #elif defined(DEBUG_SIMD)
1371 Direction32_T **directions2,
1372 #endif
1373 int rlength, int glength, int uband) {
1374 int r, c, rlo, rhigh;
1375
1376 for (c = 1; c <= glength; c++) {
1377 if ((rlo = c - uband) < 1) {
1378 rlo = 1;
1379 };
1380
1381 if ((rhigh = c) > rlength) {
1382 rhigh = rlength;
1383 }
1384
1385 for (r = rlo; r <= rhigh; r++) {
1386 if (matrix[c][r] < NEG_INFINITY_8 + 30) {
1387 /* Don't check */
1388
1389 } else if (directions1[c][r] == 0) {
1390 if (directions2[c][r] == 0) {
1391 } else {
1392 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1393 abort();
1394 }
1395
1396 } else if (directions1[c][r] == 1) {
1397 if (directions2[c][r] == 1) {
1398 } else {
1399 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1400 abort();
1401 }
1402
1403 } else {
1404 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1405 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1406 abort();
1407 }
1408 }
1409 }
1410 }
1411
1412 return;
1413 }
1414
1415 static void
banded_directions8_compare_nogap_lower(Score8_T ** matrix,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband)1416 banded_directions8_compare_nogap_lower (Score8_T **matrix, Direction8_T **directions1,
1417 #ifdef DEBUG_AVX2
1418 Direction8_T **directions2,
1419 #elif defined(DEBUG_SIMD)
1420 Direction32_T **directions2,
1421 #endif
1422 int rlength, int glength, int lband) {
1423 int r, c, rlo, rhigh;
1424
1425 for (c = 1; c <= glength; c++) {
1426 if ((rlo = c) < 1) {
1427 rlo = 1;
1428 };
1429
1430 if ((rhigh = c + lband) > rlength) {
1431 rhigh = rlength;
1432 }
1433
1434 #ifdef DEBUG_AVX2
1435 for (r = rlo; r <= rhigh; r++) {
1436 if (matrix[c][r] < NEG_INFINITY_8 + 30) {
1437 /* Don't check */
1438
1439 } else if (directions1[r][c] == 0) {
1440 if (directions2[r][c] == 0) {
1441 } else {
1442 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1443 abort();
1444 }
1445
1446 } else if (directions1[r][c] == 1) {
1447 if (directions2[r][c] == 1) {
1448 } else {
1449 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1450 abort();
1451 }
1452
1453 } else {
1454 if (directions2[r][c] == 0 || directions2[r][c] == 0) {
1455 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1456 abort();
1457 }
1458 }
1459 }
1460
1461 #else
1462 for (r = rlo; r <= rhigh; r++) {
1463 if (matrix[c][r] < NEG_INFINITY_8 + 30) {
1464 /* Don't check */
1465
1466 } else if (directions1[r][c] == 0) {
1467 if (directions2[c][r] == 0) {
1468 } else {
1469 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1470 abort();
1471 }
1472
1473 } else if (directions1[r][c] == 1) {
1474 if (directions2[c][r] == 1) {
1475 } else {
1476 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1477 abort();
1478 }
1479
1480 } else {
1481 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1482 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1483 abort();
1484 }
1485 }
1486 }
1487 #endif
1488
1489 }
1490
1491 return;
1492 }
1493
1494
1495 static void
banded_directions16_compare_nogap(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband,int uband)1496 banded_directions16_compare_nogap (Direction16_T **directions1,
1497 #ifdef DEBUG_AVX2
1498 Direction16_T **directions2,
1499 #elif defined(DEBUG_SIMD)
1500 Direction32_T **directions2,
1501 #endif
1502 int rlength, int glength, int lband, int uband) {
1503 int r, c, rlo, rhigh;
1504
1505 for (c = 1; c <= glength; c++) {
1506 if ((rlo = c - uband) < 1) {
1507 rlo = 1;
1508 };
1509
1510 if ((rhigh = c + lband) > rlength) {
1511 rhigh = rlength;
1512 }
1513
1514 for (r = rlo; r <= rhigh; r++) {
1515 if (directions1[c][r] == 0) {
1516 if (directions2[c][r] == 0) {
1517 } else {
1518 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1519 abort();
1520 }
1521
1522 } else if (directions1[c][r] == 1) {
1523 if (directions2[c][r] == 1) {
1524 } else {
1525 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1526 abort();
1527 }
1528
1529 } else {
1530 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1531 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1532 abort();
1533 }
1534 }
1535 }
1536 }
1537
1538 return;
1539 }
1540
1541 static void
banded_directions16_compare_nogap_upper(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int uband)1542 banded_directions16_compare_nogap_upper (Direction16_T **directions1,
1543 #ifdef DEBUG_AVX2
1544 Direction16_T **directions2,
1545 #else
1546 Direction32_T **directions2,
1547 #endif
1548 int rlength, int glength, int uband) {
1549 int r, c, rlo, rhigh;
1550
1551 for (c = 1; c <= glength; c++) {
1552 if ((rlo = c - uband) < 1) {
1553 rlo = 1;
1554 };
1555
1556 if ((rhigh = c) > rlength) {
1557 rhigh = rlength;
1558 }
1559
1560 for (r = rlo; r <= rhigh; r++) {
1561 if (directions1[c][r] == 0) {
1562 if (directions2[c][r] == 0) {
1563 } else {
1564 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1565 abort();
1566 }
1567
1568 } else if (directions1[c][r] == 1) {
1569 if (directions2[c][r] == 1) {
1570 } else {
1571 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1572 abort();
1573 }
1574
1575 } else {
1576 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1577 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1578 abort();
1579 }
1580 }
1581 }
1582 }
1583
1584 return;
1585 }
1586
1587 static void
banded_directions16_compare_nogap_lower(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband)1588 banded_directions16_compare_nogap_lower (Direction16_T **directions1,
1589 #ifdef DEBUG_AVX2
1590 Direction16_T **directions2,
1591 #else
1592 Direction32_T **directions2,
1593 #endif
1594 int rlength, int glength, int lband) {
1595 int r, c, rlo, rhigh;
1596
1597 for (c = 1; c <= glength; c++) {
1598 if ((rlo = c) < 1) {
1599 rlo = 1;
1600 };
1601
1602 if ((rhigh = c + lband) > rlength) {
1603 rhigh = rlength;
1604 }
1605
1606 for (r = rlo; r <= rhigh; r++) {
1607 #ifdef DEBUG_AVX2
1608 if (directions1[r][c] == 0) {
1609 if (directions2[r][c] == 0) {
1610 } else {
1611 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1612 abort();
1613 }
1614
1615 } else if (directions1[r][c] == 1) {
1616 if (directions2[r][c] == 1) {
1617 } else {
1618 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1619 abort();
1620 }
1621
1622 } else {
1623 if (directions2[r][c] == 0 || directions2[r][c] == 0) {
1624 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1625 abort();
1626 }
1627 }
1628 #else
1629 if (directions1[r][c] == 0) {
1630 if (directions2[c][r] == 0) {
1631 } else {
1632 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1633 abort();
1634 }
1635
1636 } else if (directions1[r][c] == 1) {
1637 if (directions2[c][r] == 1) {
1638 } else {
1639 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1640 abort();
1641 }
1642
1643 } else {
1644 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1645 printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1646 abort();
1647 }
1648 }
1649 #endif
1650 }
1651 }
1652
1653 return;
1654 }
1655 #endif
1656
1657 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
1658 static void
banded_directions8_compare_Egap(Score8_T ** matrix1,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband,int uband)1659 banded_directions8_compare_Egap (Score8_T **matrix1, Direction8_T **directions1,
1660 #ifdef DEBUG_AVX2
1661 Direction8_T **directions2,
1662 #else
1663 Direction32_T **directions2,
1664 #endif
1665 int rlength, int glength, int lband, int uband) {
1666 int r, c, rlo, rhigh, last_check;
1667
1668 for (c = 1; c <= glength; c++) {
1669 if ((rlo = c - uband) < 1) {
1670 rlo = 1;
1671 };
1672
1673 if ((rhigh = c + lband) <= rlength) {
1674 /* Don't check rhigh. Egap direction derives from a comparison
1675 of NEG_INFINITY values, and we should never reach here from
1676 directions_nogap anyway. */
1677 last_check = rhigh - 1;
1678
1679 } else {
1680 /* Do check rhigh, which contains instructions for the bottom row */
1681 rhigh = rlength;
1682 last_check = rhigh;
1683 }
1684
1685 for (r = rlo; r <= last_check; r++) {
1686 if (matrix1[c][r] < NEG_INFINITY_8 + 30) {
1687 /* Don't check */
1688
1689 } else if (directions1[c][r] == 0) {
1690 if (directions2[c][r] == 0) {
1691 } else {
1692 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1693 abort();
1694 }
1695
1696 } else if (directions1[c][r] == 1) {
1697 if (directions2[c][r] == 1) {
1698 } else {
1699 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1700 abort();
1701 }
1702
1703 } else {
1704 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1705 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1706 abort();
1707 }
1708 }
1709 }
1710 }
1711
1712 return;
1713 }
1714
1715 static void
banded_directions8_compare_Egap_upper(Score8_T ** matrix1,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int uband)1716 banded_directions8_compare_Egap_upper (Score8_T **matrix1, Direction8_T **directions1,
1717 #ifdef DEBUG_AVX2
1718 Direction8_T **directions2,
1719 #else
1720 Direction32_T **directions2,
1721 #endif
1722 int rlength, int glength, int uband) {
1723 int r, c, rlo, rhigh, last_check;
1724
1725 return;
1726 for (c = 1; c <= glength; c++) {
1727 if ((rlo = c - uband) < 1) {
1728 rlo = 1;
1729 };
1730
1731 if ((rhigh = c) <= rlength) {
1732 /* Don't check rhigh. Egap direction derives from a comparison
1733 of NEG_INFINITY values, and we should never reach here from
1734 directions_nogap anyway. */
1735 last_check = rhigh - 1;
1736
1737 } else {
1738 /* Do check rhigh, which contains instructions for the bottom row */
1739 rhigh = rlength;
1740 last_check = rhigh;
1741 }
1742
1743 for (r = rlo; r <= last_check; r++) {
1744 if (matrix1[c][r] < NEG_INFINITY_8 + 30) {
1745 /* Don't check */
1746
1747 } else if (directions1[c][r] == 0) {
1748 if (directions2[c][r] == 0) {
1749 } else {
1750 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1751 abort();
1752 }
1753
1754 } else if (directions1[c][r] == 1) {
1755 if (directions2[c][r] == 1) {
1756 } else {
1757 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1758 abort();
1759 }
1760
1761 } else {
1762 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1763 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1764 abort();
1765 }
1766 }
1767 }
1768 }
1769
1770 return;
1771 }
1772
1773 static void
banded_directions8_compare_Egap_lower(Score8_T ** matrix1,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband)1774 banded_directions8_compare_Egap_lower (Score8_T **matrix1, Direction8_T **directions1,
1775 #ifdef DEBUG_AVX2
1776 Direction8_T **directions2,
1777 #else
1778 Direction32_T **directions2,
1779 #endif
1780 int rlength, int glength, int lband) {
1781 int r, c, rlo, rhigh, last_check;
1782
1783 return;
1784 for (c = 1; c <= glength; c++) {
1785 if ((rlo = c) < 1) {
1786 rlo = 1;
1787 };
1788
1789 if ((rhigh = c + lband) <= rlength) {
1790 /* Don't check rhigh. Egap direction derives from a comparison
1791 of NEG_INFINITY values, and we should never reach here from
1792 directions_nogap anyway. */
1793 last_check = rhigh - 1;
1794
1795 } else {
1796 /* Do check rhigh, which contains instructions for the bottom row */
1797 rhigh = rlength;
1798 last_check = rhigh;
1799 }
1800
1801 for (r = rlo; r <= last_check; r++) {
1802 #ifdef DEBUG_AVX2
1803 if (matrix1[r][c] < NEG_INFINITY_8 + 30) {
1804 /* Don't check */
1805
1806 } else if (directions1[r][c] == 0) {
1807 if (directions2[r][c] == 0) {
1808 } else {
1809 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1810 abort();
1811 }
1812
1813 } else if (directions1[r][c] == 1) {
1814 if (directions2[r][c] == 1) {
1815 } else {
1816 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1817 abort();
1818 }
1819
1820 } else {
1821 if (directions2[r][c] == 0 || directions2[r][c] == 0) {
1822 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1823 abort();
1824 }
1825 }
1826 #else
1827 if (matrix1[r][c] < NEG_INFINITY_8 + 30) {
1828 /* Don't check */
1829
1830 } else if (directions1[r][c] == 0) {
1831 if (directions2[c][r] == 0) {
1832 } else {
1833 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1834 abort();
1835 }
1836
1837 } else if (directions1[r][c] == 1) {
1838 if (directions2[c][r] == 1) {
1839 } else {
1840 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1841 abort();
1842 }
1843
1844 } else {
1845 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1846 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1847 abort();
1848 }
1849 }
1850 #endif
1851 }
1852 }
1853
1854 return;
1855 }
1856
1857
1858 static void
banded_directions16_compare_Egap(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband,int uband)1859 banded_directions16_compare_Egap (Direction16_T **directions1,
1860 #ifdef DEBUG_AVX2
1861 Direction16_T **directions2,
1862 #else
1863 Direction32_T **directions2,
1864 #endif
1865 int rlength, int glength, int lband, int uband) {
1866 int r, c, rlo, rhigh, last_check;
1867
1868 for (c = 1; c <= glength; c++) {
1869 if ((rlo = c - uband) < 1) {
1870 rlo = 1;
1871 };
1872
1873 if ((rhigh = c + lband) <= rlength) {
1874 /* Don't check rhigh. Egap direction derives from a comparison
1875 of NEG_INFINITY values, and we should never reach here from
1876 directions_nogap anyway. */
1877 last_check = rhigh - 1;
1878
1879 } else {
1880 /* Do check rhigh, which contains instructions for the bottom row */
1881 rhigh = rlength;
1882 last_check = rhigh;
1883 }
1884
1885 for (r = rlo; r <= last_check; r++) {
1886 if (directions1[c][r] == 0) {
1887 if (directions2[c][r] == 0) {
1888 } else {
1889 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1890 abort();
1891 }
1892 } else if (directions1[c][r] == 1) {
1893 if (directions2[c][r] == 1) {
1894 } else {
1895 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1896 abort();
1897 }
1898
1899 } else {
1900 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1901 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1902 abort();
1903 }
1904 }
1905 }
1906 }
1907
1908 return;
1909 }
1910
1911 static void
banded_directions16_compare_Egap_upper(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int uband)1912 banded_directions16_compare_Egap_upper (Direction16_T **directions1,
1913 #ifdef DEBUG_AVX2
1914 Direction16_T **directions2,
1915 #else
1916 Direction32_T **directions2,
1917 #endif
1918 int rlength, int glength, int uband) {
1919 int r, c, rlo, rhigh, last_check;
1920
1921 return;
1922 for (c = 1; c <= glength; c++) {
1923 if ((rlo = c - uband) < 1) {
1924 rlo = 1;
1925 };
1926
1927 if ((rhigh = c) <= rlength) {
1928 /* Don't check rhigh. Egap direction derives from a comparison
1929 of NEG_INFINITY values, and we should never reach here from
1930 directions_nogap anyway. */
1931 last_check = rhigh - 1;
1932
1933 } else {
1934 /* Do check rhigh, which contains instructions for the bottom row */
1935 rhigh = rlength;
1936 last_check = rhigh;
1937 }
1938
1939 for (r = rlo; r <= last_check; r++) {
1940 if (directions1[c][r] == 0) {
1941 if (directions2[c][r] == 0) {
1942 } else {
1943 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1944 abort();
1945 }
1946 } else if (directions1[c][r] == 1) {
1947 if (directions2[c][r] == 1) {
1948 } else {
1949 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1950 abort();
1951 }
1952
1953 } else {
1954 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1955 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1956 abort();
1957 }
1958 }
1959 }
1960 }
1961
1962 return;
1963 }
1964
1965 static void
banded_directions16_compare_Egap_lower(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband)1966 banded_directions16_compare_Egap_lower (Direction16_T **directions1,
1967 #ifdef DEBUG_AVX2
1968 Direction16_T **directions2,
1969 #else
1970 Direction32_T **directions2,
1971 #endif
1972 int rlength, int glength, int lband) {
1973 int r, c, rlo, rhigh, last_check;
1974
1975 return;
1976 for (c = 1; c <= glength; c++) {
1977 if ((rlo = c) < 1) {
1978 rlo = 1;
1979 };
1980
1981 if ((rhigh = c + lband) <= rlength) {
1982 /* Don't check rhigh. Egap direction derives from a comparison
1983 of NEG_INFINITY values, and we should never reach here from
1984 directions_nogap anyway. */
1985 last_check = rhigh - 1;
1986
1987 } else {
1988 /* Do check rhigh, which contains instructions for the bottom row */
1989 rhigh = rlength;
1990 last_check = rhigh;
1991 }
1992
1993 for (r = rlo; r <= last_check; r++) {
1994 #ifdef DEBUG_AVX2
1995 if (directions1[r][c] == 0) {
1996 if (directions2[r][c] == 0) {
1997 } else {
1998 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1999 abort();
2000 }
2001 } else if (directions1[r][c] == 1) {
2002 if (directions2[r][c] == 1) {
2003 } else {
2004 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
2005 abort();
2006 }
2007
2008 } else {
2009 if (directions2[r][c] == 0 || directions2[r][c] == 0) {
2010 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
2011 abort();
2012 }
2013 }
2014 #else
2015 if (directions1[r][c] == 0) {
2016 if (directions2[c][r] == 0) {
2017 } else {
2018 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
2019 abort();
2020 }
2021 } else if (directions1[r][c] == 1) {
2022 if (directions2[c][r] == 1) {
2023 } else {
2024 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
2025 abort();
2026 }
2027
2028 } else {
2029 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
2030 printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
2031 abort();
2032 }
2033 }
2034 #endif
2035 }
2036 }
2037
2038 return;
2039 }
2040 #endif
2041
2042
2043 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
2044 static void
banded_directions8_compare_Fgap(Score8_T ** matrix1,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband,int uband)2045 banded_directions8_compare_Fgap (Score8_T **matrix1, Direction8_T **directions1,
2046 #ifdef DEBUG_AVX2
2047 Direction8_T **directions2,
2048 #else
2049 Direction32_T **directions2,
2050 #endif
2051 int rlength, int glength, int lband, int uband) {
2052 int r, c, rlo, rhigh, first_check;
2053
2054 for (c = 1; c <= glength; c++) {
2055 if ((rlo = c - uband) < 1) {
2056 first_check = rlo = 1;
2057 } else {
2058 first_check = rlo + 1;
2059 }
2060
2061 if ((rhigh = c + lband) > rlength) {
2062 rhigh = rlength;
2063 }
2064
2065 for (r = first_check; r <= rhigh; r++) {
2066 if (matrix1[c][r] < NEG_INFINITY_8 + 30) {
2067 /* Don't check */
2068
2069 } else if (directions1[c][r] == 0) {
2070 if (directions2[c][r] == 0) {
2071 } else {
2072 printf("At %d,%d, Fgap dir %d != dir %d. Score is %d\n",
2073 r,c,directions1[c][r],directions2[c][r],matrix1[c][r]);
2074 abort();
2075 }
2076
2077 } else if (directions1[c][r] == 1) {
2078 if (directions2[c][r] == 1) {
2079 } else {
2080 printf("At %d,%d, Fgap dir %d != dir %d. Score is %d\n",
2081 r,c,directions1[c][r],directions2[c][r],matrix1[c][r]);
2082 abort();
2083 }
2084
2085 } else {
2086 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
2087 printf("At %d,%d, Fgap dir %d != dir %d. Score is %d\n",
2088 r,c,directions1[c][r],directions2[c][r],matrix1[c][r]);
2089 abort();
2090 }
2091 }
2092 }
2093 }
2094
2095 return;
2096 }
2097
2098 static void
banded_directions16_compare_Fgap(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband,int uband)2099 banded_directions16_compare_Fgap (Direction16_T **directions1,
2100 #ifdef DEBUG_AVX2
2101 Direction16_T **directions2,
2102 #else
2103 Direction32_T **directions2,
2104 #endif
2105 int rlength, int glength, int lband, int uband) {
2106 int r, c, rlo, rhigh, first_check;
2107
2108 for (c = 1; c <= glength; c++) {
2109 if ((rlo = c - uband) < 1) {
2110 first_check = rlo = 1;
2111 } else {
2112 first_check = rlo + 1;
2113 }
2114
2115 if ((rhigh = c + lband) > rlength) {
2116 rhigh = rlength;
2117 }
2118
2119 for (r = first_check; r <= rhigh; r++) {
2120 if (directions1[c][r] == 0) {
2121 if (directions2[c][r] == 0) {
2122 } else {
2123 printf("At %d,%d, Fgap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
2124 abort();
2125 }
2126 } else if (directions1[c][r] == 1) {
2127 if (directions2[c][r] == 1) {
2128 } else {
2129 printf("At %d,%d, Fgap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
2130 abort();
2131 }
2132
2133 } else {
2134 if (directions2[c][r] == 0 || directions2[c][r] == 0) {
2135 printf("At %d,%d, Fgap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
2136 abort();
2137 }
2138 }
2139 }
2140 }
2141
2142 return;
2143 }
2144 #endif
2145
2146
2147 /************************************************************************
2148 * End of debugging procedures
2149 ************************************************************************/
2150
2151
2152
2153 #if defined(HAVE_SSE4_1) || defined(HAVE_SSE2)
2154 /* Makes a matrix of dimensions 0..rlength x 0..glength inclusive */
2155 static Score8_T **
aligned_score8_alloc(int rlength,int glength,void ** ptrs,void * space)2156 aligned_score8_alloc (int rlength, int glength, void **ptrs, void *space) {
2157 Score8_T **matrix, *ptr;
2158 int c;
2159
2160 matrix = (Score8_T **) ptrs;
2161
2162 ptr = (Score8_T *) space;
2163 matrix[0] = ptr; /* Want aligned row to be r = 0, 16, ... */
2164 for (c = 1; c <= glength; c++) {
2165 ptr += rlength;
2166 matrix[c] = ptr; /* Want aligned row to be r = 0, 16, ... */
2167 }
2168 #if defined(DEBUG2) && (defined(DEBUG_AVX2) || defined(DEBUG_SIMD))
2169 memset((void *) matrix[0],0,(glength+1)*rlength*sizeof(Score8_T));
2170 #endif
2171
2172 return matrix;
2173 }
2174
2175 /* No initialization to DIAG (0), for directions_Egap and directions_nogap */
2176 static Score8_T **
aligned_directions8_alloc(int rlength,int glength,void ** ptrs,void * space)2177 aligned_directions8_alloc (int rlength, int glength, void **ptrs, void *space) {
2178 Score8_T **matrix, *ptr;
2179 int c;
2180
2181 matrix = (Score8_T **) ptrs;
2182
2183 ptr = (Score8_T *) space;
2184 matrix[0] = ptr; /* Want aligned row to be r = 0, 16, ... */
2185 for (c = 1; c <= glength; c++) {
2186 ptr += rlength;
2187 matrix[c] = ptr; /* Want aligned row to be r = 0, 16, ... */
2188 }
2189 #if defined(DEBUG2) && (defined(DEBUG_AVX2) || defined(DEBUG_SIMD))
2190 memset((void *) matrix[0],/*DIAG*/0,(glength+1)*rlength*sizeof(Score8_T));
2191 #endif
2192
2193 return matrix;
2194 }
2195
2196 /* Initialization to DIAG (0), for directions_Fgap */
2197 static Score8_T **
aligned_directions8_calloc(int rlength,int glength,void ** ptrs,void * space)2198 aligned_directions8_calloc (int rlength, int glength, void **ptrs, void *space) {
2199 Score8_T **matrix, *ptr;
2200 int c;
2201
2202 matrix = (Score8_T **) ptrs;
2203
2204 ptr = (Score8_T *) space;
2205 matrix[0] = ptr; /* Want aligned row to be r = 0, 16, ... */
2206 for (c = 1; c <= glength; c++) {
2207 ptr += rlength;
2208 matrix[c] = ptr; /* Want aligned row to be r = 0, 16, ... */
2209 }
2210 memset((void *) matrix[0],/*DIAG*/0,(glength+1)*rlength*sizeof(Score8_T));
2211
2212 return matrix;
2213 }
2214
2215
2216
2217 /* Makes a matrix of dimensions 0..rlength x 0..glength inclusive */
2218 static Score16_T **
aligned_score16_alloc(int rlength,int glength,void ** ptrs,void * space)2219 aligned_score16_alloc (int rlength, int glength, void **ptrs, void *space) {
2220 Score16_T **matrix, *ptr;
2221 int c;
2222
2223 matrix = (Score16_T **) ptrs;
2224
2225 ptr = (Score16_T *) space;
2226 matrix[0] = ptr; /* Want aligned row to be r = 0, 8, 16, ... */
2227 for (c = 1; c <= glength; c++) {
2228 ptr += rlength;
2229 matrix[c] = ptr; /* Want aligned row to be r = 0, 8, 16, ... */
2230 }
2231 #ifdef DEBUG2
2232 memset((void *) matrix[0],0,(glength+1)*rlength*sizeof(Score16_T));
2233 #endif
2234
2235 return matrix;
2236 }
2237
2238 /* No initialization to DIAG (0), for directions_Egap and directions_nogap */
2239 static Score16_T **
aligned_directions16_alloc(int rlength,int glength,void ** ptrs,void * space)2240 aligned_directions16_alloc (int rlength, int glength, void **ptrs, void *space) {
2241 Score16_T **matrix, *ptr;
2242 int c;
2243
2244 matrix = (Score16_T **) ptrs;
2245
2246 ptr = (Score16_T *) space;
2247 matrix[0] = ptr; /* Want aligned row to be r = 0, 8, 16, ... */
2248 for (c = 1; c <= glength; c++) {
2249 ptr += rlength;
2250 matrix[c] = ptr; /* Want aligned row to be r = 0, 8, 16, ... */
2251 }
2252 #ifdef DEBUG2
2253 memset((void *) matrix[0],/*DIAG*/0,(glength+1)*rlength*sizeof(Score16_T));
2254 #endif
2255
2256 return matrix;
2257 }
2258
2259 /* Initialization to DIAG (0), for directions_Fgap */
2260 static Score16_T **
aligned_directions16_calloc(int rlength,int glength,void ** ptrs,void * space)2261 aligned_directions16_calloc (int rlength, int glength, void **ptrs, void *space) {
2262 Score16_T **matrix, *ptr;
2263 int c;
2264
2265 matrix = (Score16_T **) ptrs;
2266
2267 ptr = (Score16_T *) space;
2268 matrix[0] = ptr; /* Want aligned row to be r = 0, 8, 16, ... */
2269 for (c = 1; c <= glength; c++) {
2270 ptr += rlength;
2271 matrix[c] = ptr; /* Want aligned row to be r = 0, 8, 16, ... */
2272 }
2273 memset((void *) matrix[0],/*DIAG*/0,(glength+1)*rlength*sizeof(Score16_T));
2274
2275 return matrix;
2276 }
2277 #endif
2278
2279
2280 #define T Dynprog_T
2281
2282
2283 #ifdef DEBUG_AVX2
2284 Score8_T **
Dynprog_simd_8_nonavx2(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,Direction8_T *** directions_Fgap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,int uband,bool jump_late_p,bool revp)2285 Dynprog_simd_8_nonavx2 (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
2286 Direction8_T ***directions_Fgap,
2287 T this, char *rsequence, char *gsequence, char *gsequence_alt,
2288 int rlength, int glength,
2289 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
2290 Mismatchtype_T mismatchtype, int open, int extend,
2291 int lband, int uband, bool jump_late_p, bool revp) {
2292 int c_gap, last_nogap, score, *FF; /* Need to have the ability to go past NEG_INFINITY */
2293 Score8_T **matrix, *score_column;
2294 __m128i pairscores_std, pairscores_alt;
2295 #ifndef HAVE_SSE4_1
2296 __m128i pairscores_best, all_128;
2297 #endif
2298 __m128i H_nogap_r, X_prev_nogap, E_r_gap, T1;
2299 __m128i gap_open, gap_extend, extend_ladder, extend_chunk, complement_dummy;
2300 __m128i dir_horiz;
2301 __m128i ramp, ramp_chunk, lband_vector, filter, ones;
2302 int rlength_ceil, lband_ceil, r, c;
2303 int rlo, rhigh, rlo_calc, rhigh_calc;
2304 int na1, na2, na2_alt;
2305 Score8_T *pairscores_col0;
2306 Score8_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr, pairscore, pairscore0;
2307 Pairdistance_T **pairdistance_array_type;
2308
2309
2310 debug2(printf("Dynprog_simd_8. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
2311 debug15(printf("Dynprog_simd_8. jump_late_p %d, open %d, extend %d, lband %d, uband %d\n",
2312 jump_late_p,open,extend,lband,uband));
2313
2314 rlength_ceil = (int) ((rlength + SIMD_NCHARS_NONAVX2)/SIMD_NCHARS_NONAVX2) * SIMD_NCHARS_NONAVX2;
2315
2316 #ifdef HAVE_SSE4_1
2317 pairdistance_array_type = pairdistance_array[mismatchtype];
2318 #else
2319 /* Need to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
2320 pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
2321 all_128 = _mm_set1_epi8(128);
2322 #endif
2323
2324 debug(printf("Dynprog_simd_8: "));
2325 debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
2326 debug(printf("Query length rounded up to %d\n",rlength_ceil));
2327
2328 matrix = aligned_score8_alloc(rlength_ceil,glength,
2329 this->aligned_std.one.matrix_ptrs,this->aligned_std.one.matrix_space);
2330 *directions_nogap = aligned_directions8_alloc(rlength_ceil,glength,
2331 this->aligned_std.one.directions_ptrs_0,this->aligned_std.one.directions_space_0);
2332 *directions_Egap = aligned_directions8_alloc(rlength_ceil,glength,
2333 this->aligned_std.one.directions_ptrs_1,this->aligned_std.one.directions_space_1);
2334 /* Need to calloc to save time in F loop */
2335 *directions_Fgap = aligned_directions8_calloc(rlength_ceil,glength,
2336 this->aligned_std.one.directions_ptrs_2,this->aligned_std.one.directions_space_2);
2337
2338 #if 0
2339 /* Row 0 initialization */
2340 /* penalty = open; */
2341 for (c = 1; c <= uband && c <= glength; c++) {
2342 /* penalty += extend; */
2343 (*directions_Egap)[c][0] = HORIZ;
2344 (*directions_nogap)[c][0] = HORIZ;
2345 }
2346 #endif
2347 #if 0
2348 /* Already initialized to DIAG. Actually no longer initializing directions_Egap */
2349 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
2350 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
2351 #endif
2352
2353 #if 0
2354 /* Column 0 initialization */
2355 /* penalty = open; */
2356 for (r = 1; r <= SIMD_NCHARS_NONAVX2 && r <= rlength; r++) {
2357 /* penalty += extend; */
2358 (*directions_nogap)[0][r] = VERT;
2359 }
2360 #endif
2361
2362
2363 /* Load pairscores. Store match - mismatch */
2364 pairscores[0] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2365 pairscores[1] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2366 pairscores[2] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2367 pairscores[3] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2368 pairscores[4] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2369
2370 lband_ceil = (int) ((lband + SIMD_NCHARS_NONAVX2)/SIMD_NCHARS_NONAVX2) * SIMD_NCHARS_NONAVX2;
2371 pairscores_col0 = (Score8_T *) _mm_malloc(lband_ceil * sizeof(Score8_T),16);
2372
2373
2374 #if 0
2375 /* Should not be necessary */
2376 memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score8_T));
2377 memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score8_T));
2378 memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score8_T));
2379 memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score8_T));
2380 memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score8_T));
2381 #endif
2382
2383 /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
2384 #ifdef HAVE_SSE4_1
2385 pairscores_col0[0] = (Score8_T) 0;
2386 /* Initialization just to lband causes errors in dir_horiz for Egap */
2387 #ifdef ZERO_INITIAL_GAP_PENALTY
2388 for (r = 1; r < lband_ceil; r++) {
2389 pairscores_col0[r] = (Score8_T) 0;
2390 }
2391 #else
2392 for (r = 1; r < lband_ceil; r++) {
2393 pairscores_col0[r] = (Score8_T) NEG_INFINITY_8;
2394 }
2395 #endif
2396 #else
2397 pairscores_col0[0] = (Score8_T) 0+128;
2398 /* Initialization just to lband causes errors in dir_horiz for Egap */
2399 #ifdef ZERO_INITIAL_GAP_PENALTY
2400 for (r = 1; r < lband_ceil; r++) {
2401 pairscores_col0[r] = (Score8_T) 0+128;
2402 }
2403 #else
2404 for (r = 1; r < lband_ceil; r++) {
2405 pairscores_col0[r] = (Score8_T) NEG_INFINITY_8+128;
2406 }
2407 #endif
2408 #endif
2409
2410
2411 /* Row 0 */
2412 r = 0; na1 = 'N';
2413 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
2414 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
2415 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
2416 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
2417 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
2418
2419 if (revp == false) {
2420 for (r = 1; r <= rlength; r++) {
2421 na1 = (int) rsequence[r-1];
2422 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
2423 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
2424 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
2425 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
2426 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
2427 }
2428 } else {
2429 for (r = 1; r <= rlength; r++) {
2430 na1 = (int) rsequence[1-r];
2431 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
2432 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
2433 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
2434 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
2435 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
2436 }
2437 }
2438
2439 #if 0
2440 /* Should not be necessary */
2441 memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2442 memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2443 memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2444 memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2445 memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2446 #endif
2447
2448 complement_dummy = _mm_set1_epi8(-1);
2449
2450 FF = (int *) MALLOCA((glength + 1) * sizeof(int));
2451
2452 gap_open = _mm_set1_epi8((Score8_T) open);
2453 gap_extend = _mm_set1_epi8((Score8_T) extend);
2454
2455 #ifndef INFINITE_INITIAL_GAP_PENALTY
2456 lband_vector = _mm_set1_epi8(lband);
2457 ramp = _mm_setr_epi8(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
2458 extend_ladder = _mm_setr_epi8(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend,
2459 9*extend,10*extend,11*extend,12*extend,13*extend,14*extend,15*extend,16*extend);
2460 ramp_chunk = _mm_set1_epi8(SIMD_NCHARS_NONAVX2);
2461 extend_chunk = _mm_set1_epi8(SIMD_NCHARS_NONAVX2*extend);
2462 #endif
2463
2464 if (jump_late_p) {
2465 for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS_NONAVX2) {
2466 if ((rhigh = rlo + SIMD_NCHARS_NONAVX2 - 1) > rlength) {
2467 rhigh = rlength;
2468 }
2469
2470 if ((c = rlo - lband) < 0) {
2471 c = 0;
2472
2473 #if defined(ZERO_INITIAL_GAP_PENALTY)
2474 /* Initial H in column 0 determined by zeroed out H. E needs to equal gap_open for column 1. */
2475 E_r_gap = _mm_set1_epi8(NEG_INFINITY_8-open);
2476 filter = _mm_cmpgt_epi8(ramp,lband_vector);
2477 H_nogap_r = _mm_and_si128(filter,E_r_gap); /* Use zeros for score */
2478
2479 E_r_gap = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
2480 E_r_gap = _mm_adds_epi8(E_r_gap,gap_open);
2481
2482 ramp = _mm_adds_epi8(ramp,ramp_chunk); /* Prepare for next block */
2483 extend_ladder = _mm_adds_epi8(extend_ladder,extend_chunk); /* Prepare for next block */
2484 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
2485 /* dir_horiz tests if E >= H. To fill in first column of each
2486 row block with non-diags, make E == H. */
2487 E_r_gap = _mm_set1_epi8(NEG_INFINITY_8);
2488 H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
2489 #else
2490 /* Initial H in column 0 determined by E, which needs to equal
2491 gap_open + extend_ladder for column 1. H is free to be set
2492 equal to E. */
2493 H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* To compensate for T1 = H + open */
2494 filter = _mm_cmpgt_epi8(ramp,lband_vector);
2495 H_nogap_r = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),_mm_and_si128(filter,H_nogap_r));
2496 E_r_gap = _mm_adds_epi8(H_nogap_r,gap_open);
2497 ramp = _mm_adds_epi8(ramp,ramp_chunk); /* Prepare for next block */
2498 extend_ladder = _mm_adds_epi8(extend_ladder,extend_chunk); /* Prepare for next block */
2499 #endif
2500 } else {
2501 E_r_gap = _mm_set1_epi8(NEG_INFINITY_8);
2502 H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
2503 }
2504
2505 for ( ; c <= rhigh + uband && c <= glength; c++) {
2506 score_column = matrix[c];
2507
2508 if (c == 0) {
2509 pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
2510
2511 #ifdef ZERO_INITIAL_GAP_PENALTY
2512 X_prev_nogap = _mm_set1_epi8(0);
2513 #else
2514 if (rlo == 0) {
2515 X_prev_nogap = _mm_set1_epi8(0);
2516 } else {
2517 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
2518 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2519 }
2520 #endif
2521
2522 } else {
2523 na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
2524 na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
2525 pairscores_std_ptr = pairscores[na2];
2526 pairscores_alt_ptr = pairscores[na2_alt];
2527
2528 if (rlo == 0) {
2529 #ifdef ZERO_INITIAL_GAP_PENALTY
2530 X_prev_nogap = _mm_set1_epi8(0);
2531 #else
2532 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
2533 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2534 #endif
2535 } else {
2536 /* second or greater block of 8 */
2537 X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
2538 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2539 }
2540 }
2541
2542 debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
2543 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
2544
2545 /* EGAP */
2546 T1 = _mm_adds_epi8(H_nogap_r, gap_open);
2547 dir_horiz = _mm_cmplt_epi8(E_r_gap,T1); /* E < H */
2548 dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy); /* E >= H, for jump late */
2549 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
2550 debug15(print_vector_8(T1,rlo,c,"T1"));
2551 debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
2552
2553 #ifdef HAVE_SSE4_1
2554 E_r_gap = _mm_max_epi8(E_r_gap, T1); /* Compare H + open with vert */
2555 #else
2556 E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
2557 #endif
2558 E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
2559 debug15(print_vector_8(E_r_gap,rlo,c,"E"));
2560
2561
2562 /* NOGAP */
2563 T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_NONAVX2);
2564 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
2565 H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
2566 X_prev_nogap = T1;
2567
2568 /* Add pairscores, allowing for alternate genomic nt */
2569 #ifdef HAVE_SSE4_1
2570 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
2571 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
2572 H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
2573 #else
2574 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
2575 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
2576 pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
2577 H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
2578 #endif
2579 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
2580 debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
2581
2582 dir_horiz = _mm_cmplt_epi8(E_r_gap,H_nogap_r); /* E < H */
2583 dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy); /* E >= H, for jump late */
2584 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
2585 debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
2586
2587
2588 #ifdef HAVE_SSE4_1
2589 H_nogap_r = _mm_max_epi8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
2590 #else
2591 /* Compare H + pairscores with horiz + extend */
2592 H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
2593 #endif
2594 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
2595 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
2596
2597
2598 /* F loop */
2599 if ((rlo_calc = rlo) < c - uband) {
2600 rlo_calc = c - uband;
2601 }
2602 if ((rhigh_calc = rhigh) >= c + lband) {
2603 rhigh_calc = c + lband;
2604 if (c > 0) {
2605 /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
2606 pairscore = pairscores[na2][rhigh_calc];
2607 if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
2608 pairscore = pairscore0;
2609 }
2610 #ifndef HAVE_SSE4_1
2611 pairscore -= 128;
2612 #endif
2613 if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_8) {
2614 score_column[rhigh_calc] = NEG_INFINITY_8; /* Saturation */
2615 } else if (score > POS_INFINITY_8) {
2616 /* Should never get here, because we limit size of matrix using 8-bit quantities */
2617 score_column[rhigh_calc] = POS_INFINITY_8; /* Saturation */
2618 } else {
2619 score_column[rhigh_calc] = (Score8_T) score;
2620 }
2621 (*directions_Egap)[c][rhigh_calc] = DIAG;
2622 (*directions_nogap)[c][rhigh_calc] = DIAG;
2623 }
2624 }
2625
2626 debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
2627 rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
2628
2629 if (rlo == 0) {
2630 c_gap = NEG_INFINITY_INT;
2631 last_nogap = NEG_INFINITY_INT;
2632 } else if (c >= rlo + uband) {
2633 c_gap = NEG_INFINITY_INT;
2634 last_nogap = NEG_INFINITY_INT;
2635 } else {
2636 debug3(printf("At c %d, uband %d, reading c_gap %d\n",c,uband,FF[c]));
2637 c_gap = FF[c];
2638 last_nogap = (int) score_column[rlo_calc-1];
2639 }
2640
2641 if ((r = rlo_calc) == c - uband) {
2642 /* Handle top value as a special case to prevent going outside of uband */
2643 /* FGAP */
2644 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
2645 r,c,c_gap + extend,last_nogap + open + extend));
2646 score = last_nogap + open /* + extend */;
2647 c_gap = score + extend;
2648 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
2649
2650 /* NOGAP */
2651 last_nogap = (int) score_column[r];
2652 r++;
2653 }
2654
2655 /* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
2656 for ( ; r <= rhigh_calc; r++) {
2657 /* FGAP */
2658 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
2659 r,c,c_gap + extend,last_nogap + open + extend));
2660 if (c_gap /* + extend */ >= (score = last_nogap + open /* + extend */)) { /* Use >= for jump late */
2661 c_gap += extend;
2662 (*directions_Fgap)[c][r] = VERT;
2663 } else {
2664 c_gap = score + extend;
2665 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
2666 }
2667
2668 /* NOGAP */
2669 last_nogap = (int) score_column[r];
2670 debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
2671 if (c_gap >= last_nogap) { /* Use >= for jump late */
2672 last_nogap = c_gap;
2673 score_column[r] = (c_gap < NEG_INFINITY_8) ? NEG_INFINITY_8 : (Score8_T) c_gap; /* Saturation */
2674 (*directions_nogap)[c][r] = VERT;
2675 }
2676 }
2677
2678 FF[c] = c_gap;
2679 debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
2680 H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
2681 }
2682 }
2683
2684 } else {
2685 /* jump early */
2686 #if defined(ZERO_INITIAL_GAP_PENALTY) || defined(INFINITE_INITIAL_GAP_PENALTY)
2687 /* No need for ones */
2688 #else
2689 ones = _mm_set1_epi8(1);
2690 #endif
2691 for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS_NONAVX2) {
2692 if ((rhigh = rlo + SIMD_NCHARS_NONAVX2 - 1) > rlength) {
2693 rhigh = rlength;
2694 }
2695
2696 if ((c = rlo - lband) < 0) {
2697 c = 0;
2698
2699 #if defined(ZERO_INITIAL_GAP_PENALTY)
2700 /* Initial H in column 0 determined by zeroed out H. E needs to equal gap_open for column 1. */
2701 E_r_gap = _mm_set1_epi8(NEG_INFINITY_8-open);
2702 filter = _mm_cmpgt_epi8(ramp,lband_vector);
2703 H_nogap_r = _mm_and_si128(filter,E_r_gap); /* Use zeros for score */
2704
2705 E_r_gap = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
2706 E_r_gap = _mm_adds_epi8(E_r_gap,gap_open);
2707
2708 ramp = _mm_adds_epi8(ramp,ramp_chunk); /* Prepare for next block */
2709 extend_ladder = _mm_adds_epi8(extend_ladder,extend_chunk); /* Prepare for next block */
2710 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
2711 /* dir_horiz tests if E > H. To fill in first column of each
2712 row block with non-diags, make E > H. */
2713 E_r_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
2714 H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
2715 #else
2716 /* Initial H in column 0 determined by E, which needs to equal
2717 gap_open + extend_ladder for column 1. H is free to be set
2718 less than E. */
2719 H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open+1); /* To compensate for T1 = H + open */
2720 filter = _mm_cmpgt_epi8(ramp,lband_vector);
2721 H_nogap_r = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),_mm_and_si128(filter,H_nogap_r));
2722 E_r_gap = _mm_adds_epi8(H_nogap_r,gap_open);
2723 H_nogap_r = _mm_subs_epi8(H_nogap_r,ones); /* To ensure H < E */
2724 ramp = _mm_adds_epi8(ramp,ramp_chunk); /* Prepare for next block */
2725 extend_ladder = _mm_adds_epi8(extend_ladder,extend_chunk); /* Prepare for next block */
2726 #endif
2727 } else {
2728 E_r_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
2729 H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
2730 }
2731
2732 for ( ; c <= rhigh + uband && c <= glength; c++) {
2733 score_column = matrix[c];
2734
2735 if (c == 0) {
2736 pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
2737
2738 #ifdef ZERO_INITIAL_GAP_PENALTY
2739 X_prev_nogap = _mm_set1_epi8(0);
2740 #else
2741 if (rlo == 0) {
2742 X_prev_nogap = _mm_set1_epi8(0);
2743 } else {
2744 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
2745 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2746 }
2747 #endif
2748
2749 } else {
2750 na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
2751 na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
2752 pairscores_std_ptr = pairscores[na2];
2753 pairscores_alt_ptr = pairscores[na2_alt];
2754
2755 if (rlo == 0) {
2756 #ifdef ZERO_INITIAL_GAP_PENALTY
2757 X_prev_nogap = _mm_set1_epi8(0);
2758 #else
2759 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
2760 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2761 #endif
2762 } else {
2763 /* second or greater block of 8 */
2764 X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
2765 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2766 }
2767 }
2768
2769 debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
2770 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
2771
2772 /* EGAP */
2773 T1 = _mm_adds_epi8(H_nogap_r, gap_open);
2774 dir_horiz = _mm_cmpgt_epi8(E_r_gap,T1); /* E > H, for jump early */
2775 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
2776 debug15(print_vector_8(T1,rlo,c,"T1"));
2777 debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
2778
2779 #ifdef HAVE_SSE4_1
2780 E_r_gap = _mm_max_epi8(E_r_gap, T1); /* Compare H + open with vert */
2781 #else
2782 /* Compare H + open with vert */
2783 E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
2784 #endif
2785 E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
2786 debug15(print_vector_8(E_r_gap,rlo,c,"E"));
2787
2788
2789 /* NOGAP */
2790 T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_NONAVX2);
2791 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
2792 H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
2793 X_prev_nogap = T1;
2794
2795 /* Add pairscores, allowing for alternate genomic nt */
2796 #ifdef HAVE_SSE4_1
2797 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
2798 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
2799 H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
2800 #else
2801 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
2802 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
2803 pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
2804 H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
2805 #endif
2806 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
2807 debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
2808
2809 dir_horiz = _mm_cmpgt_epi8(E_r_gap,H_nogap_r); /* E > H, for jump early */
2810 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
2811 debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
2812
2813
2814 #ifdef HAVE_SSE4_1
2815 H_nogap_r = _mm_max_epi8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
2816 #else
2817 /* Compare H + pairscores with horiz + extend */
2818 H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
2819 #endif
2820 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
2821 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
2822
2823
2824 /* F loop */
2825 if ((rlo_calc = rlo) < c - uband) {
2826 rlo_calc = c - uband;
2827 }
2828 if ((rhigh_calc = rhigh) >= c + lband) {
2829 rhigh_calc = c + lband;
2830 if (c > 0) {
2831 /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
2832 pairscore = pairscores[na2][rhigh_calc];
2833 if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
2834 pairscore = pairscore0;
2835 }
2836 #ifndef HAVE_SSE4_1
2837 pairscore -= 128;
2838 #endif
2839 if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_8) {
2840 score_column[rhigh_calc] = NEG_INFINITY_8; /* Saturation */
2841 } else if (score > POS_INFINITY_8) {
2842 /* Should never get here, because we limit size of matrix using 8-bit quantities */
2843 score_column[rhigh_calc] = POS_INFINITY_8; /* Saturation */
2844 } else {
2845 score_column[rhigh_calc] = (Score8_T) score;
2846 }
2847 (*directions_Egap)[c][rhigh_calc] = DIAG;
2848 (*directions_nogap)[c][rhigh_calc] = DIAG;
2849 }
2850 }
2851
2852 debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
2853 rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
2854
2855 if (rlo == 0) {
2856 c_gap = NEG_INFINITY_INT;
2857 last_nogap = NEG_INFINITY_INT;
2858 } else if (c >= rlo + uband) {
2859 c_gap = NEG_INFINITY_INT;
2860 last_nogap = NEG_INFINITY_INT;
2861 } else {
2862 c_gap = FF[c];
2863 last_nogap = (int) score_column[rlo_calc-1];
2864 debug3(printf("LAST_NOGAP gets score_column[%d-1], or %d\n",rlo_calc,last_nogap));
2865 }
2866
2867 if ((r = rlo_calc) == c - uband) {
2868 /* Handle top value as a special case to prevent going outside of uband */
2869 /* FGAP */
2870 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
2871 r,c,c_gap + extend,last_nogap + open + extend));
2872 score = last_nogap + open /* + extend */;
2873 c_gap = score + extend;
2874 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
2875
2876 /* NOGAP */
2877 last_nogap = (int) score_column[r];
2878 r++;
2879 }
2880
2881 /* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
2882 for ( ; r <= rhigh_calc; r++) {
2883 /* FGAP */
2884 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
2885 r,c,c_gap + extend,last_nogap + open + extend));
2886 if (c_gap /* + extend */ > (score = last_nogap + open /* + extend */)) { /* Use > for jump early */
2887 c_gap += extend;
2888 (*directions_Fgap)[c][r] = VERT;
2889 } else {
2890 c_gap = score + extend;
2891 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
2892 }
2893
2894 /* NOGAP */
2895 last_nogap = (int) score_column[r];
2896 debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
2897 if (c_gap > last_nogap) { /* Use > for jump early */
2898 last_nogap = c_gap;
2899 score_column[r] = (c_gap < NEG_INFINITY_8) ? NEG_INFINITY_8 : (Score8_T) c_gap; /* Saturation */
2900 debug3(printf("Stored at score_column[%d]: %d\n",r,(Score8_T) score_column[r]));
2901 (*directions_nogap)[c][r] = VERT;
2902 }
2903 }
2904
2905 FF[c] = c_gap;
2906 debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
2907 H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
2908 }
2909 }
2910 }
2911
2912
2913 #ifdef CHECK1
2914 /* Row 0 and column 0 directions fail anyway due to saturation */
2915 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
2916 (*directions_Egap)[1][0] = HORIZ;
2917 (*directions_Fgap)[0][1] = VERT;
2918 #endif
2919
2920 #ifdef DEBUG2
2921 printf("SIMD: Dynprog_simd_8\n");
2922 Matrix8_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
2923 revp,lband,uband);
2924 Directions8_print(*directions_nogap,*directions_Egap,*directions_Fgap,
2925 rlength,glength,rsequence,gsequence,gsequence_alt,
2926 revp,lband,uband);
2927 #endif
2928
2929 #ifdef CHECK1
2930 /* Check for row 0 directions */
2931 for (c = 1; c <= uband && c <= glength; c++) {
2932 assert((*directions_Egap)[c][0] != DIAG);
2933 assert((*directions_nogap)[c][0] != DIAG);
2934 }
2935 /* Check for column 0 directions */
2936 for (r = 1; r <= lband && r <= rlength; r++) {
2937 assert((*directions_Fgap)[0][r] != DIAG);
2938 assert((*directions_nogap)[0][r] != DIAG);
2939 }
2940 #endif
2941
2942 FREEA(FF);
2943 _mm_free(pairscores_col0);
2944 _mm_free(pairscores[4]);
2945 _mm_free(pairscores[3]);
2946 _mm_free(pairscores[2]);
2947 _mm_free(pairscores[1]);
2948 _mm_free(pairscores[0]);
2949
2950 return matrix;
2951 }
2952 #endif
2953
2954
2955
2956 #if defined(HAVE_SSE2)
2957 /* Modified from Dynprog_simd_8_upper. Operates by columns. */
2958 Score8_T **
Dynprog_simd_8(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,Direction8_T *** directions_Fgap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,int uband,bool jump_late_p,bool revp)2959 Dynprog_simd_8 (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
2960 Direction8_T ***directions_Fgap,
2961 T this, char *rsequence, char *gsequence, char *gsequence_alt,
2962 int rlength, int glength,
2963 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
2964 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
2965 #endif
2966 Mismatchtype_T mismatchtype, int open, int extend,
2967 int lband, int uband, bool jump_late_p, bool revp) {
2968 int c_gap, last_nogap, score, *FF; /* Need to have the ability to go past NEG_INFINITY */
2969 Score8_T **matrix, *score_column;
2970 #ifdef HAVE_AVX2
2971 __m256i pairscores_std, pairscores_alt;
2972 __m256i H_nogap_r, X_prev_nogap, E_r_gap, T1;
2973 __m256i gap_open, gap_extend, complement_dummy;
2974 __m256i dir_horiz;
2975 #if defined(ZERO_INITIAL_GAP_PENALTY)
2976 __m256i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter;
2977 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
2978 #else
2979 __m256i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter, ones;
2980 #endif
2981
2982 #else
2983 __m128i pairscores_std, pairscores_alt;
2984 __m128i H_nogap_r, X_prev_nogap, E_r_gap, T1;
2985 __m128i gap_open, gap_extend, complement_dummy;
2986 __m128i dir_horiz;
2987 #if defined(ZERO_INITIAL_GAP_PENALTY)
2988 __m128i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter;
2989 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
2990 #else
2991 __m128i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter, ones;
2992 #endif
2993 #endif
2994 #ifndef HAVE_SSE4_1
2995 __m128i pairscores_best, all_128;
2996 #endif
2997 int rlength_ceil, lband_ceil, r, c;
2998 int rlo, rhigh, rlo_calc, rhigh_calc;
2999 int na1, na2, na2_alt;
3000 Score8_T *pairscores_col0;
3001 Score8_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr, pairscore, pairscore0;
3002 Pairdistance_T **pairdistance_array_type;
3003
3004 #ifdef DEBUG_AVX2
3005 Score8_T **matrix_std;
3006 Direction8_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
3007 #elif defined(DEBUG_SIMD)
3008 Score32_T **matrix_std;
3009 Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
3010 #endif
3011
3012
3013 debug2(printf("Dynprog_simd_8. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
3014 debug15(printf("Dynprog_simd_8. jump_late_p %d, open %d, extend %d, lband %d, uband %d\n",
3015 jump_late_p,open,extend,lband,uband));
3016
3017 rlength_ceil = (int) ((rlength + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
3018
3019 #ifdef HAVE_SSE4_1
3020 pairdistance_array_type = pairdistance_array[mismatchtype];
3021 #else
3022 /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
3023 pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
3024 all_128 = _mm_set1_epi8(128);
3025 #endif
3026
3027 debug(printf("Dynprog_simd_8: "));
3028 debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
3029 debug(printf("Query length rounded up to %d\n",rlength_ceil));
3030
3031 matrix = aligned_score8_alloc(rlength_ceil,glength,
3032 this->aligned.one.matrix_ptrs,this->aligned.one.matrix_space);
3033 *directions_nogap = aligned_directions8_alloc(rlength_ceil,glength,
3034 this->aligned.one.directions_ptrs_0,this->aligned.one.directions_space_0);
3035 *directions_Egap = aligned_directions8_alloc(rlength_ceil,glength,
3036 this->aligned.one.directions_ptrs_1,this->aligned.one.directions_space_1);
3037 /* Need to calloc to save time in F loop */
3038 *directions_Fgap = aligned_directions8_calloc(rlength_ceil,glength,
3039 this->aligned.one.directions_ptrs_2,this->aligned.one.directions_space_2);
3040
3041 #if 0
3042 /* Row 0 initialization */
3043 /* penalty = open; */
3044 for (c = 1; c <= uband && c <= glength; c++) {
3045 /* penalty += extend; */
3046 (*directions_Egap)[c][0] = HORIZ;
3047 (*directions_nogap)[c][0] = HORIZ;
3048 }
3049 #endif
3050 #if 0
3051 /* Already initialized to DIAG. Actually no longer initializing directions_Egap */
3052 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
3053 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
3054 #endif
3055
3056 #if 0
3057 /* Column 0 initialization */
3058 /* penalty = open; */
3059 for (r = 1; r <= SIMD_NCHARS && r <= rlength; r++) {
3060 /* penalty += extend; */
3061 (*directions_nogap)[0][r] = VERT;
3062 }
3063 #endif
3064
3065
3066 /* Load pairscores. Store match - mismatch */
3067 pairscores[0] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3068 pairscores[1] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3069 pairscores[2] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3070 pairscores[3] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3071 pairscores[4] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3072
3073 lband_ceil = (int) ((lband + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
3074 pairscores_col0 = (Score8_T *) _mm_malloc(lband_ceil * sizeof(Score8_T),ALIGN_SIZE);
3075
3076
3077 #if 0
3078 /* Should not be necessary */
3079 memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score8_T));
3080 memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score8_T));
3081 memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score8_T));
3082 memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score8_T));
3083 memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score8_T));
3084 #endif
3085
3086 /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
3087 #ifdef HAVE_SSE4_1
3088 pairscores_col0[0] = (Score8_T) 0;
3089 /* Initialization just to lband causes errors in dir_horiz for Egap */
3090 #ifdef ZERO_INITIAL_GAP_PENALTY
3091 for (r = 1; r < lband_ceil; r++) {
3092 pairscores_col0[r] = (Score8_T) 0;
3093 }
3094 #else
3095 for (r = 1; r < lband_ceil; r++) {
3096 pairscores_col0[r] = (Score8_T) NEG_INFINITY_8;
3097 }
3098 #endif
3099 #else
3100 pairscores_col0[0] = (Score8_T) 0+128;
3101 /* Initialization just to lband causes errors in dir_horiz for Egap */
3102 #ifdef ZERO_INITIAL_GAP_PENALTY
3103 for (r = 1; r < lband_ceil; r++) {
3104 pairscores_col0[r] = (Score8_T) 0+128;
3105 }
3106 #else
3107 for (r = 1; r < lband_ceil; r++) {
3108 pairscores_col0[r] = (Score8_T) NEG_INFINITY_8+128;
3109 }
3110 #endif
3111 #endif
3112
3113
3114 /* Row 0 */
3115 r = 0; na1 = 'N';
3116 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3117 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3118 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3119 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3120 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3121
3122 if (revp == false) {
3123 for (r = 1; r <= rlength; r++) {
3124 na1 = (int) rsequence[r-1];
3125 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3126 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3127 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3128 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3129 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3130 }
3131 } else {
3132 for (r = 1; r <= rlength; r++) {
3133 na1 = (int) rsequence[1-r];
3134 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3135 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3136 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3137 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3138 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3139 }
3140 }
3141
3142 #if 0
3143 /* Should not be necessary */
3144 memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3145 memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3146 memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3147 memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3148 memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3149 #endif
3150
3151 complement_dummy = _MM_SET1_EPI8(-1);
3152
3153 FF = (int *) MALLOCA((glength + 1) * sizeof(int));
3154
3155 gap_open = _MM_SET1_EPI8((Score8_T) open);
3156 gap_extend = _MM_SET1_EPI8((Score8_T) extend);
3157
3158
3159 #ifndef INFINITE_INITIAL_GAP_PENALTY
3160 #ifdef HAVE_AVX2
3161 ramp = _mm256_setr_epi8(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
3162 extend_ladder = _mm256_setr_epi8(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend,
3163 9*extend,10*extend,11*extend,12*extend,13*extend,14*extend,15*extend,16*extend);
3164 #else
3165 ramp = _mm_setr_epi8(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
3166 extend_ladder = _mm_setr_epi8(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend,
3167 9*extend,10*extend,11*extend,12*extend,13*extend,14*extend,15*extend,16*extend);
3168 #endif
3169 lband_vector = _MM_SET1_EPI8(lband);
3170 ramp_chunk = _MM_SET1_EPI8(SIMD_NCHARS);
3171 extend_chunk = _MM_SET1_EPI8(SIMD_NCHARS*extend);
3172 #endif
3173
3174 if (jump_late_p) {
3175 for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS) {
3176 if ((rhigh = rlo + SIMD_NCHARS - 1) > rlength) {
3177 rhigh = rlength;
3178 }
3179
3180 if ((c = rlo - lband) < 0) {
3181 c = 0;
3182
3183 #if defined(ZERO_INITIAL_GAP_PENALTY)
3184 /* Initial H in column 0 determined by zeroed out H. E needs to equal gap_open for column 1. */
3185 E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8-open);
3186 filter = _MM_CMPGT_EPI8(ramp,lband_vector);
3187 H_nogap_r = _MM_AND_SI(filter,E_r_gap); /* Use zeros for score */
3188
3189 E_r_gap = _MM_OR_SI(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
3190 E_r_gap = _MM_ADDS_EPI8(E_r_gap,gap_open);
3191
3192 ramp = _MM_ADDS_EPI8(ramp,ramp_chunk); /* Prepare for next block */
3193 extend_ladder = _MM_ADDS_EPI8(extend_ladder,extend_chunk); /* Prepare for next block */
3194 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
3195 /* dir_horiz tests if E >= H. To fill in first column of each
3196 row block with non-diags, make E == H. */
3197 E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8);
3198 H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3199 #else
3200 /* Initial H in column 0 determined by E, which needs to equal
3201 gap_open + extend_ladder for column 1. H is free to be set
3202 equal to E. */
3203 H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* To compensate for T1 = H + open */
3204 filter = _MM_CMPGT_EPI8(ramp,lband_vector);
3205 H_nogap_r = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),_MM_AND_SI(filter,H_nogap_r));
3206 E_r_gap = _MM_ADDS_EPI8(H_nogap_r,gap_open);
3207 ramp = _MM_ADDS_EPI8(ramp,ramp_chunk); /* Prepare for next block */
3208 extend_ladder = _MM_ADDS_EPI8(extend_ladder,extend_chunk); /* Prepare for next block */
3209 #endif
3210 } else {
3211 E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8);
3212 H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3213 }
3214
3215 for ( ; c <= rhigh + uband && c <= glength; c++) {
3216 score_column = matrix[c];
3217
3218 if (c == 0) {
3219 pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
3220
3221 #ifdef ZERO_INITIAL_GAP_PENALTY
3222 X_prev_nogap = _MM_SETZERO_SI();
3223 #elif defined(HAVE_AVX2)
3224 if (rlo == 0) {
3225 X_prev_nogap = _mm256_setzero_si256();
3226 } else {
3227 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
3228 }
3229 #else
3230 if (rlo == 0) {
3231 X_prev_nogap = _mm_setzero_si128();
3232 } else {
3233 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
3234 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3235 }
3236 #endif
3237
3238 } else {
3239 na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
3240 na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
3241 pairscores_std_ptr = pairscores[na2];
3242 pairscores_alt_ptr = pairscores[na2_alt];
3243
3244 #ifdef HAVE_AVX2
3245 if (rlo == 0) {
3246 #ifdef ZERO_INITIAL_GAP_PENALTY
3247 X_prev_nogap = _MM_SETZERO_SI();
3248 #else
3249 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
3250 #endif
3251 } else {
3252 /* second or greater block of 8 */
3253 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_CHAR_INSERT);
3254 }
3255
3256 #else
3257 if (rlo == 0) {
3258 #ifdef ZERO_INITIAL_GAP_PENALTY
3259 X_prev_nogap = _MM_SETZERO_SI();
3260 #else
3261 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
3262 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3263 #endif
3264 } else {
3265 /* second or greater block of 8 */
3266 X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
3267 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3268 }
3269 #endif
3270 }
3271
3272 debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
3273 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
3274
3275 /* EGAP */
3276 T1 = _MM_ADDS_EPI8(H_nogap_r, gap_open);
3277 dir_horiz = _MM_CMPLT_EPI8(E_r_gap,T1); /* E < H */
3278 dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy); /* E >= H, for jump late */
3279 #ifdef HAVE_AVX2
3280 _mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3281 #else
3282 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3283 #endif
3284 debug15(print_vector_8(T1,rlo,c,"T1"));
3285 debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
3286
3287 #ifdef HAVE_SSE4_1
3288 E_r_gap = _MM_MAX_EPI8(E_r_gap, T1); /* Compare H + open with vert */
3289 #else
3290 E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
3291 #endif
3292 E_r_gap = _MM_ADDS_EPI8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3293 debug15(print_vector_8(E_r_gap,rlo,c,"E"));
3294
3295
3296 /* NOGAP */
3297 #ifdef HAVE_AVX2
3298 T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_r,SIMD_NCHARS-1),LAST_CHAR_INSERT);
3299 X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_r,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
3300 H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_CHAR);
3301 #else
3302 T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_SHIFT);
3303 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
3304 #endif
3305 H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
3306 X_prev_nogap = T1;
3307
3308 /* Add pairscores, allowing for alternate genomic nt */
3309 #ifdef HAVE_AVX2
3310 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
3311 pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
3312 H_nogap_r = _mm256_adds_epi8(H_nogap_r, _mm256_max_epi8(pairscores_std,pairscores_alt));
3313 #elif defined(HAVE_SSE4_1)
3314 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
3315 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
3316 H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
3317 #else
3318 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
3319 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
3320 pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
3321 H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
3322 #endif
3323 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
3324 debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
3325
3326 dir_horiz = _MM_CMPLT_EPI8(E_r_gap,H_nogap_r); /* E < H */
3327 dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy); /* E >= H, for jump late */
3328 #ifdef HAVE_AVX2
3329 _mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
3330 #else
3331 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
3332 #endif
3333 debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
3334
3335
3336 #ifdef HAVE_SSE4_1
3337 H_nogap_r = _MM_MAX_EPI8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
3338 #else
3339 /* Compare H + pairscores with horiz + extend */
3340 H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
3341 #endif
3342 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
3343 #ifdef HAVE_AVX2
3344 _mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
3345 #else
3346 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
3347 #endif
3348
3349 /* F loop */
3350 if ((rlo_calc = rlo) < c - uband) {
3351 rlo_calc = c - uband;
3352 }
3353 if ((rhigh_calc = rhigh) >= c + lband) {
3354 rhigh_calc = c + lband;
3355 if (c > 0) {
3356 /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
3357 pairscore = pairscores[na2][rhigh_calc];
3358 if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
3359 pairscore = pairscore0;
3360 }
3361 #ifndef HAVE_SSE4_1
3362 pairscore -= 128;
3363 #endif
3364 if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_8) {
3365 score_column[rhigh_calc] = NEG_INFINITY_8; /* Saturation */
3366 } else if (score > POS_INFINITY_8) {
3367 /* Should never get here, because we limit size of matrix using 8-bit quantities */
3368 score_column[rhigh_calc] = POS_INFINITY_8; /* Saturation */
3369 } else {
3370 score_column[rhigh_calc] = (Score8_T) score;
3371 }
3372 (*directions_Egap)[c][rhigh_calc] = DIAG;
3373 (*directions_nogap)[c][rhigh_calc] = DIAG;
3374 }
3375 }
3376
3377 debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
3378 rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
3379
3380 if (rlo == 0) {
3381 c_gap = NEG_INFINITY_INT;
3382 last_nogap = NEG_INFINITY_INT;
3383 } else if (c >= rlo + uband) {
3384 c_gap = NEG_INFINITY_INT;
3385 last_nogap = NEG_INFINITY_INT;
3386 } else {
3387 debug3(printf("At c %d, uband %d, reading c_gap %d\n",c,uband,FF[c]));
3388 c_gap = FF[c];
3389 last_nogap = (int) score_column[rlo_calc-1];
3390 }
3391
3392 if ((r = rlo_calc) == c - uband) {
3393 /* Handle top value as a special case to prevent going outside of uband */
3394 /* FGAP */
3395 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
3396 r,c,c_gap + extend,last_nogap + open + extend));
3397 score = last_nogap + open /* + extend */;
3398 c_gap = score + extend;
3399 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
3400
3401 /* NOGAP */
3402 last_nogap = (int) score_column[r];
3403 r++;
3404 }
3405
3406 /* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
3407 for ( ; r <= rhigh_calc; r++) {
3408 /* FGAP */
3409 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
3410 r,c,c_gap + extend,last_nogap + open + extend));
3411 if (c_gap /* + extend */ >= (score = last_nogap + open /* + extend */)) { /* Use >= for jump late */
3412 c_gap += extend;
3413 (*directions_Fgap)[c][r] = VERT;
3414 } else {
3415 c_gap = score + extend;
3416 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
3417 }
3418
3419 /* NOGAP */
3420 last_nogap = (int) score_column[r];
3421 debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
3422 if (c_gap >= last_nogap) { /* Use >= for jump late */
3423 last_nogap = c_gap;
3424 score_column[r] = (c_gap < NEG_INFINITY_8) ? NEG_INFINITY_8 : (Score8_T) c_gap; /* Saturation */
3425 (*directions_nogap)[c][r] = VERT;
3426 }
3427 }
3428
3429 FF[c] = c_gap;
3430 debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
3431 #ifdef HAVE_AVX2
3432 H_nogap_r = _mm256_load_si256((__m256i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
3433 #else
3434 H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
3435 #endif
3436 }
3437 }
3438
3439 } else {
3440 /* jump early */
3441 #if defined(ZERO_INITIAL_GAP_PENALTY) || defined(INFINITE_INITIAL_GAP_PENALTY)
3442 /* No need for ones */
3443 #else
3444 ones = _MM_SET1_EPI8(1);
3445 #endif
3446 for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS) {
3447 if ((rhigh = rlo + SIMD_NCHARS - 1) > rlength) {
3448 rhigh = rlength;
3449 }
3450
3451 if ((c = rlo - lband) < 0) {
3452 c = 0;
3453
3454 #if defined(ZERO_INITIAL_GAP_PENALTY)
3455 /* Initial H in column 0 determined by zeroed out H. E needs to equal gap_open for column 1. */
3456 E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8-open);
3457 filter = _MM_CMPGT_EPI8(ramp,lband_vector);
3458 H_nogap_r = _MM_AND_SI(filter,E_r_gap); /* Use zeros for score */
3459
3460 E_r_gap = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),H_nogap_r);
3461 E_r_gap = _MM_ADDS_EPI8(E_r_gap,gap_open);
3462
3463 ramp = _MM_ADDS_EPI8(ramp,ramp_chunk); /* Prepare for next block */
3464 extend_ladder = _MM_ADDS_EPI8(extend_ladder,extend_chunk); /* Prepare for next block */
3465 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
3466 /* dir_horiz tests if E > H. To fill in first column of each
3467 row block with non-diags, make E > H. */
3468 E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8+1);
3469 H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3470 #else
3471 /* Initial H in column 0 determined by E, which needs to equal
3472 gap_open + extend_ladder for column 1. H is free to be set
3473 less than E. */
3474 H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open+1); /* To compensate for T1 = H + open */
3475 filter = _MM_CMPGT_EPI8(ramp,lband_vector);
3476 H_nogap_r = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),_MM_AND_SI(filter,H_nogap_r));
3477 E_r_gap = _MM_ADDS_EPI8(H_nogap_r,gap_open);
3478 H_nogap_r = _MM_SUBS_EPI8(H_nogap_r,ones); /* To ensure H < E */
3479 ramp = _MM_ADDS_EPI8(ramp,ramp_chunk); /* Prepare for next block */
3480 extend_ladder = _MM_ADDS_EPI8(extend_ladder,extend_chunk); /* Prepare for next block */
3481 #endif
3482 } else {
3483 E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8+1);
3484 H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3485 }
3486
3487 for ( ; c <= rhigh + uband && c <= glength; c++) {
3488 score_column = matrix[c];
3489
3490 if (c == 0) {
3491 pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
3492
3493 #ifdef ZERO_INITIAL_GAP_PENALTY
3494 X_prev_nogap = _MM_SETZERO_SI();
3495 #elif defined(HAVE_AVX2)
3496 if (rlo == 0) {
3497 X_prev_nogap = _mm256_setzero_si256();
3498 } else {
3499 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
3500 }
3501 #else
3502 if (rlo == 0) {
3503 X_prev_nogap = _mm_setzero_si128();
3504 } else {
3505 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
3506 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3507 }
3508 #endif
3509
3510 } else {
3511 na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
3512 na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
3513 pairscores_std_ptr = pairscores[na2];
3514 pairscores_alt_ptr = pairscores[na2_alt];
3515
3516 #ifdef HAVE_AVX2
3517 if (rlo == 0) {
3518 #ifdef ZERO_INITIAL_GAP_PENALTY
3519 X_prev_nogap = _MM_SETZERO_SI();
3520 #else
3521 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
3522 #endif
3523 } else {
3524 /* second or greater block of 8 */
3525 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_CHAR_INSERT);
3526 }
3527
3528 #else
3529 if (rlo == 0) {
3530 #ifdef ZERO_INITIAL_GAP_PENALTY
3531 X_prev_nogap = _MM_SETZERO_SI();
3532 #else
3533 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
3534 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3535 #endif
3536 } else {
3537 /* second or greater block of 8 */
3538 X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
3539 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3540 }
3541 #endif
3542
3543 }
3544
3545 debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
3546 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
3547
3548 /* EGAP */
3549 T1 = _MM_ADDS_EPI8(H_nogap_r, gap_open);
3550 dir_horiz = _MM_CMPGT_EPI8(E_r_gap,T1); /* E > H, for jump early */
3551 #ifdef HAVE_AVX2
3552 _mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3553 #else
3554 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3555 #endif
3556 debug15(print_vector_8(T1,rlo,c,"T1"));
3557 debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
3558
3559 #ifdef HAVE_SSE4_1
3560 E_r_gap = _MM_MAX_EPI8(E_r_gap, T1); /* Compare H + open with vert */
3561 #else
3562 /* Compare H + open with vert */
3563 E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
3564 #endif
3565 E_r_gap = _MM_ADDS_EPI8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3566 debug15(print_vector_8(E_r_gap,rlo,c,"E"));
3567
3568
3569 /* NOGAP */
3570 #ifdef HAVE_AVX2
3571 T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_r,SIMD_NCHARS-1),LAST_CHAR_INSERT);
3572 X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_r,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
3573 H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_CHAR);
3574 #else
3575 T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_SHIFT);
3576 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
3577 #endif
3578 H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
3579 X_prev_nogap = T1;
3580
3581 /* Add pairscores, allowing for alternate genomic nt */
3582 #ifdef HAVE_AVX2
3583 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
3584 pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
3585 H_nogap_r = _mm256_adds_epi8(H_nogap_r, _mm256_max_epi8(pairscores_std,pairscores_alt));
3586 #elif defined(HAVE_SSE4_1)
3587 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
3588 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
3589 H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
3590 #else
3591 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
3592 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
3593 pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
3594 H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
3595 #endif
3596 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
3597 debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
3598
3599 dir_horiz = _MM_CMPGT_EPI8(E_r_gap,H_nogap_r); /* E > H, for jump early */
3600 #ifdef HAVE_AVX2
3601 _mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
3602 #else
3603 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
3604 #endif
3605 debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
3606
3607
3608 #ifdef HAVE_SSE4_1
3609 H_nogap_r = _MM_MAX_EPI8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
3610 #else
3611 /* Compare H + pairscores with horiz + extend */
3612 H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
3613 #endif
3614 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
3615 #ifdef HAVE_AVX2
3616 _mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
3617 #else
3618 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
3619 #endif
3620
3621
3622 /* F loop */
3623 if ((rlo_calc = rlo) < c - uband) {
3624 rlo_calc = c - uband;
3625 }
3626 if ((rhigh_calc = rhigh) >= c + lband) {
3627 rhigh_calc = c + lband;
3628 if (c > 0) {
3629 /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
3630 pairscore = pairscores[na2][rhigh_calc];
3631 if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
3632 pairscore = pairscore0;
3633 }
3634 #ifndef HAVE_SSE4_1
3635 pairscore -= 128;
3636 #endif
3637 if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_8) {
3638 score_column[rhigh_calc] = NEG_INFINITY_8; /* Saturation */
3639 } else if (score > POS_INFINITY_8) {
3640 /* Should never get here, because we limit size of matrix using 8-bit quantities */
3641 score_column[rhigh_calc] = POS_INFINITY_8; /* Saturation */
3642 } else {
3643 score_column[rhigh_calc] = (Score8_T) score;
3644 }
3645 (*directions_Egap)[c][rhigh_calc] = DIAG;
3646 (*directions_nogap)[c][rhigh_calc] = DIAG;
3647 }
3648 }
3649
3650 debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
3651 rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
3652
3653 if (rlo == 0) {
3654 c_gap = NEG_INFINITY_INT;
3655 last_nogap = NEG_INFINITY_INT;
3656 } else if (c >= rlo + uband) {
3657 c_gap = NEG_INFINITY_INT;
3658 last_nogap = NEG_INFINITY_INT;
3659 } else {
3660 c_gap = FF[c];
3661 last_nogap = (int) score_column[rlo_calc-1];
3662 debug3(printf("LAST_NOGAP gets score_column[%d-1], or %d\n",rlo_calc,last_nogap));
3663 }
3664
3665 if ((r = rlo_calc) == c - uband) {
3666 /* Handle top value as a special case to prevent going outside of uband */
3667 /* FGAP */
3668 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
3669 r,c,c_gap + extend,last_nogap + open + extend));
3670 score = last_nogap + open /* + extend */;
3671 c_gap = score + extend;
3672 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
3673
3674 /* NOGAP */
3675 last_nogap = (int) score_column[r];
3676 r++;
3677 }
3678
3679 /* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
3680 for ( ; r <= rhigh_calc; r++) {
3681 /* FGAP */
3682 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
3683 r,c,c_gap + extend,last_nogap + open + extend));
3684 if (c_gap /* + extend */ > (score = last_nogap + open /* + extend */)) { /* Use > for jump early */
3685 c_gap += extend;
3686 (*directions_Fgap)[c][r] = VERT;
3687 } else {
3688 c_gap = score + extend;
3689 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
3690 }
3691
3692 /* NOGAP */
3693 last_nogap = (int) score_column[r];
3694 debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
3695 if (c_gap > last_nogap) { /* Use > for jump early */
3696 last_nogap = c_gap;
3697 score_column[r] = (c_gap < NEG_INFINITY_8) ? NEG_INFINITY_8 : (Score8_T) c_gap; /* Saturation */
3698 debug3(printf("Stored at score_column[%d]: %d\n",r,(Score8_T) score_column[r]));
3699 (*directions_nogap)[c][r] = VERT;
3700 }
3701 }
3702
3703 FF[c] = c_gap;
3704 debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
3705 #ifdef HAVE_AVX2
3706 H_nogap_r = _mm256_load_si256((__m256i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
3707 #else
3708 H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
3709 #endif
3710 }
3711 }
3712 }
3713
3714
3715 #ifdef CHECK1
3716 /* Row 0 and column 0 directions fail anyway due to saturation */
3717 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
3718 (*directions_Egap)[1][0] = HORIZ;
3719 (*directions_Fgap)[0][1] = VERT;
3720 #endif
3721
3722 #ifdef DEBUG2
3723 printf("SIMD: Dynprog_simd_8\n");
3724 Matrix8_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
3725 revp,lband,uband);
3726 Directions8_print(*directions_nogap,*directions_Egap,*directions_Fgap,
3727 rlength,glength,rsequence,gsequence,gsequence_alt,
3728 revp,lband,uband);
3729 #endif
3730
3731 #ifdef CHECK1
3732 /* Check for row 0 directions */
3733 for (c = 1; c <= uband && c <= glength; c++) {
3734 assert((*directions_Egap)[c][0] != DIAG);
3735 assert((*directions_nogap)[c][0] != DIAG);
3736 }
3737 /* Check for column 0 directions */
3738 for (r = 1; r <= lband && r <= rlength; r++) {
3739 assert((*directions_Fgap)[0][r] != DIAG);
3740 assert((*directions_nogap)[0][r] != DIAG);
3741 }
3742 #endif
3743
3744 #ifdef DEBUG_AVX2
3745 matrix_std = Dynprog_simd_8_nonavx2(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
3746 this,rsequence,gsequence,gsequence_alt,
3747 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
3748 open,extend,lband,uband,jump_late_p,revp);
3749 #elif defined(DEBUG_SIMD)
3750 matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
3751 this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
3752 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
3753 open,extend,lband,uband,jump_late_p,revp,/*saturation*/NEG_INFINITY_8,
3754 /*upperp*/true,/*lowerp*/true);
3755 #endif
3756
3757 #ifdef DEBUG2
3758 printf("Banded %s\n",revp ? "rev" : "fwd");
3759 Matrix8_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,revp,lband,uband);
3760 Directions8_print(*directions_nogap,*directions_Egap,*directions_Fgap,
3761 rlength,glength,rsequence,gsequence,gsequence_alt,revp,lband,uband);
3762 #endif
3763
3764 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
3765 banded_matrix8_compare(matrix,matrix_std,rlength,glength,lband,uband,
3766 rsequence,gsequence,gsequence_alt,
3767 goffset,chroffset,chrhigh,watsonp,revp);
3768
3769 banded_directions8_compare_nogap(matrix,*directions_nogap,directions_nogap_std,rlength,glength,lband,uband);
3770 banded_directions8_compare_Egap(matrix,*directions_Egap,directions_Egap_std,rlength,glength,lband,uband);
3771 banded_directions8_compare_Fgap(matrix,*directions_Fgap,directions_Fgap_std,rlength,glength,lband,uband);
3772 #endif
3773
3774 FREEA(FF);
3775 _mm_free(pairscores_col0);
3776 _mm_free(pairscores[4]);
3777 _mm_free(pairscores[3]);
3778 _mm_free(pairscores[2]);
3779 _mm_free(pairscores[1]);
3780 _mm_free(pairscores[0]);
3781
3782 return matrix;
3783 }
3784 #endif
3785
3786
3787 #ifdef DEBUG_AVX2
3788 /* Designed for computation above the main diagonal, so no F loop or bottom masking needed */
3789 /* Operates by columns */
/* Dynprog_simd_8_upper_nonavx2: 8-bit banded alignment scores for the
   cells on and above the main diagonal, computed with 128-bit (__m128i)
   vectors.  Only compiled when DEBUG_AVX2 is defined; presumably a
   reference result to compare against the AVX2 build of
   Dynprog_simd_8_upper (assumption -- confirm at the call site).

   Layout: the result is indexed matrix[c][r], with c the genomic column
   (0..glength) and r the query row (0..rlength).  Rows are handled in
   blocks of SIMD_NCHARS_NONAVX2; for each block [rlo,rhigh] the columns
   rlo .. min(rhigh + uband, glength) are filled, one vector per column.

   Outputs:
     *directions_nogap, *directions_Egap -- set to arrays obtained from
        aligned_directions8_alloc.  Each computed cell receives the byte
        produced by the SIMD comparison of E against H (all ones when the
        gap score is chosen), except that cells on the main diagonal are
        overwritten with DIAG.
     return value -- the score matrix obtained from aligned_score8_alloc
        using the aligned_std buffers inside `this`.

   Inputs:
     rsequence, rlength -- query; read as rsequence[r-1], or as
        rsequence[1-r] when revp is true.
     gsequence, gsequence_alt, glength -- genomic sequence and its
        alternate, indexed by column in the same way; the larger of the
        two pair scores is used for each cell.
     mismatchtype -- selects the pair-score table.
     open, extend -- gap penalties; they are added to scores with
        saturating adds (so presumably non-positive -- confirm).
     uband -- number of columns computed to the right of each row block.
     jump_late_p -- if true, a tie E == H is recorded as a gap (E >= H);
        otherwise only E > H is.
     goffset, chroffset, chrhigh, watsonp -- not referenced in this body.

   The five temporary pairscores arrays are released before returning. */
3790 Score8_T **
Dynprog_simd_8_upper_nonavx2(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int uband,bool jump_late_p,bool revp)3791 Dynprog_simd_8_upper_nonavx2 (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
3792 T this, char *rsequence, char *gsequence, char *gsequence_alt,
3793 int rlength, int glength,
3794 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
3795 Mismatchtype_T mismatchtype, int open, int extend,
3796 int uband, bool jump_late_p, bool revp) {
3797 Score8_T **matrix, *score_column;
3798 __m128i pairscores_std, pairscores_alt;
/* Without SSE4.1 there is no signed 8-bit min/max, so that build keeps
   extra constants for biasing values by 128 and using the unsigned ops. */
3799 #ifdef HAVE_SSE4_1
3800 __m128i E_infinity;
3801 #else
3802 __m128i E_infinity_plus_128;
3803 __m128i pairscores_best, all_128;
3804 #endif
3805 __m128i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, T1;
3806 __m128i gap_open, gap_extend, complement_dummy;
3807 __m128i dir_horiz;
3808 int rlength_ceil, r, c;
3809 int rlo, rhigh;
3810 int na1, na2, na2_alt;
3811 Score8_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr;
3812 Pairdistance_T **pairdistance_array_type;
3813
3814
3815 debug2(printf("Dynprog_simd_8_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
3816 debug15(printf("Dynprog_simd_8_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
3817
/* Smallest multiple of SIMD_NCHARS_NONAVX2 that is strictly greater than
   rlength, so rows 0..rlength fit in whole vectors. */
3818 rlength_ceil = (int) ((rlength + SIMD_NCHARS_NONAVX2)/SIMD_NCHARS_NONAVX2) * SIMD_NCHARS_NONAVX2;
3819
3820 #ifdef HAVE_SSE4_1
3821 pairdistance_array_type = pairdistance_array[mismatchtype];
3822 #else
3823 /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
3824 pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
3825 all_128 = _mm_set1_epi8(128);
3826 #endif
3827
3828 debug(printf("compute_scores_simd_8_bycols (upper): "));
3829 debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
3830 debug(printf("Query length rounded up to %d\n",rlength_ceil));
3831
/* Score and direction arrays come from the aligned_std buffers held in
   `this` (the function visible after this one uses this->aligned instead). */
3832 matrix = aligned_score8_alloc(rlength_ceil,glength,
3833 this->aligned_std.two.upper_matrix_ptrs,this->aligned_std.two.upper_matrix_space);
3834 *directions_nogap = aligned_directions8_alloc(rlength_ceil,glength,
3835 this->aligned_std.two.upper_directions_ptrs_0,this->aligned_std.two.upper_directions_space_0);
3836 *directions_Egap = aligned_directions8_alloc(rlength_ceil,glength,
3837 this->aligned_std.two.upper_directions_ptrs_1,this->aligned_std.two.upper_directions_space_1);
3838
3839 #if 0
3840 /* Row 0 initialization */
3841 /* penalty = open; */
3842 for (c = 1; c <= uband && c <= glength; c++) {
3843 /* penalty += extend; */
3844 (*directions_Egap)[c][0] = HORIZ;
3845 (*directions_nogap)[c][0] = HORIZ;
3846 }
3847 #endif
3848 #if 0
3849 /* Already initialized to DIAG. Actually no longer initializing directions_Egap */
3850 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
3851 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
3852 #endif
3853 #if 0
3854 /* Column 0 initialization */
3855 /* penalty = open; */
3856 for (r = 1; r <= SIMD_NCHARS_NONAVX2 && r <= rlength; r++) {
3857 /* penalty += extend; */
3858 (*directions_nogap)[0][r] = VERT;
3859 }
3860 #endif
3861
3862
/* pairscores[k][r] is the score of query row r against nucleotide k,
   with k = 0..4 standing for A, C, G, T, N.  Allocated with 16-byte
   alignment because the columns below are read with _mm_load_si128.
   NOTE(review): the _mm_malloc results are used without a NULL check. */
3863 /* Load pairscores. Store match - mismatch */
3864 pairscores[0] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3865 pairscores[1] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3866 pairscores[2] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3867 pairscores[3] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3868 pairscores[4] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3869
3870 #if 0
3871 /* Should not be necessary */
3872 memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score8_T));
3873 memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score8_T));
3874 memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score8_T));
3875 memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score8_T));
3876 memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score8_T));
3877 #endif
3878
/* Row 0 has no query character; it is scored as if the query held 'N'. */
3879 /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
3880 r = 0; na1 = 'N';
3881 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3882 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3883 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3884 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3885 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3886
/* Rows 1..rlength: forward reads rsequence[r-1]; reverse (revp) reads
   rsequence[1-r], i.e. walks backward from the given pointer. */
3887 if (revp == false) {
3888 for (r = 1; r <= rlength; r++) {
3889 na1 = (int) rsequence[r-1];
3890 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3891 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3892 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3893 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3894 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3895 }
3896 } else {
3897 for (r = 1; r <= rlength; r++) {
3898 na1 = (int) rsequence[1-r];
3899 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3900 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3901 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3902 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3903 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3904 }
3905 }
3906
/* With the memset below disabled, entries rlength+1 .. rlength_ceil-1 stay
   uninitialized, yet the last row block loads them as part of a full
   vector.  NOTE(review): the original author considers this harmless
   ("Should not be necessary"); the affected lanes are rows past rlength. */
3907 #if 0
3908 /* Should not be necessary */
3909 memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3910 memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3911 memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3912 memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3913 memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3914 #endif
3915
/* All-ones vector: _mm_andnot_si128(x, complement_dummy) yields ~x, which
   is how "E < H" is turned into "E >= H" in the jump-late branch. */
3916 complement_dummy = _mm_set1_epi8(-1);
3917
3918 gap_open = _mm_set1_epi8((Score8_T) open);
3919 gap_extend = _mm_set1_epi8((Score8_T) extend);
3920
3921
3922 #ifdef HAVE_SSE4_1
3923 E_infinity = _mm_set1_epi8(POS_INFINITY_8);
3924 #else
3925 E_infinity_plus_128 = _mm_set1_epi8(POS_INFINITY_8+128);
3926 #endif
/* The two branches below run the same recurrence; they differ only in
   the initial E value and in how ties between E and H are recorded. */
3927 if (jump_late_p) {
/* One pass per block of SIMD_NCHARS_NONAVX2 query rows [rlo, rhigh]. */
3928 for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS_NONAVX2) {
3929 if ((rhigh = rlo + SIMD_NCHARS_NONAVX2 - 1) > rlength) {
3930 rhigh = rlength;
3931 }
3932
3933 /* dir_horiz tests if E >= H . To fill in first column of each
3934 row block with non-diags, could make E == H. But irrelevant,
3935 because these are below the diagonal. */
/* E_mask starts as all 1s and is shifted up one lane per column; its use
   is described in the "E_mask works at the wraparound" comment that
   follows this function. */
3936 E_mask = _mm_set1_epi8(1);
3937
3938 E_r_gap = _mm_set1_epi8(NEG_INFINITY_8);
3939 H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3940
/* Columns start on the diagonal (c == rlo) and extend uband past the
   block's last row, limited to glength. */
3941 for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
3942 score_column = matrix[c];
3943
/* Map the genomic character (and its alternate) for this column to a
   pairscores row index; column 0 has no character and uses index 4. */
3944 if (c == 0) {
3945 na2 = na2_alt = 4; /* 'N' */
3946 } else {
3947 na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
3948 na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
3949 }
3950 pairscores_std_ptr = pairscores[na2];
3951 pairscores_alt_ptr = pairscores[na2_alt];
3952
/* X_prev_nogap supplies the value OR'ed into H after H is shifted by one
   lane in the NOGAP step.  The byte shift by LAST_CHAR_NONAVX2 presumably
   leaves only the lowest byte lane populated (constant defined elsewhere
   -- confirm). */
3953 if (c == 0) {
3954 X_prev_nogap = _mm_set1_epi8(0);
3955 } else if (rlo == 0) {
3956 #ifdef ZERO_INITIAL_GAP_PENALTY
3957 X_prev_nogap = _mm_set1_epi8(0);
3958 #else
3959 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
3960 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
3961 #endif
3962 } else {
3963 /* second or greater block of 8 */
3964 X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
3965 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
3966 }
3967
/* Clamp E with the mask: E = min(E, E_mask + infinity).  The non-SSE4.1
   form adds 128, takes the unsigned min, and subtracts 128 again. */
3968 debug15(print_vector_8(E_mask,rlo,c,"E_mask"));
3969 #ifdef HAVE_SSE4_1
3970 E_r_gap = _mm_min_epi8(E_r_gap,_mm_add_epi8(E_mask,E_infinity));
3971 #else
3972 E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
3973 #endif
3974 debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
3975 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
3976
/* Egap direction: T1 = H + open (saturating); record E >= T1 as all-ones
   bytes in directions_Egap for this column. */
3977 /* EGAP */
3978 T1 = _mm_adds_epi8(H_nogap_r, gap_open);
3979 dir_horiz = _mm_cmplt_epi8(E_r_gap,T1); /* E < H */
3980 dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy); /* E >= H, for jump late */
3981 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3982 debug15(print_vector_8(T1,rlo,c,"T1"));
3983 debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
3984
/* Egap score: E = min(max(E, T1) + extend, mask).  Three variants: signed
   ops (SSE4.1), biased unsigned ops (the "#elif 1" branch in use), and a
   disabled shortcut that its own comment says overflows. */
3985 #ifdef HAVE_SSE4_1
3986 E_r_gap = _mm_max_epi8(E_r_gap, T1); /* Compare H + open with vert */
3987 E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3988 E_r_gap = _mm_min_epi8(E_r_gap,_mm_add_epi8(E_mask,E_infinity));
3989 #elif 1
3990 E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
3991 E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3992 E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
3993 #else
3994 /* Try to avoid unnecessary shifts by 128, but overflows */
3995 E_r_gap = _mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128));
3996 E_r_gap = _mm_add_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3997 E_r_gap = _mm_sub_epi8(_mm_min_epu8(E_r_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
3998 #endif
3999 debug15(print_vector_8(E_r_gap,rlo,c,"E"));
4000
4001
/* Diagonal step: shift H by ONE_CHAR so each row lane sees the previous
   column's value from the row above, fill the vacated lane from
   X_prev_nogap, and keep the lane shifted out (T1) as the carry-in for
   the next column.  (Lane positions depend on ONE_CHAR and
   LAST_CHAR_NONAVX2, which are defined elsewhere -- confirm.) */
4002 /* NOGAP */
4003 T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_NONAVX2);
4004 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
4005 H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
4006 X_prev_nogap = T1;
4007
/* H += max(pair score for gsequence, pair score for gsequence_alt),
   using a saturating add. */
4008 /* Add pairscores, allowing for alternate genomic nt */
4009 #ifdef HAVE_SSE4_1
4010 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
4011 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
4012 debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4013 H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
4014 #else
4015 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
4016 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
4017 pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
4018 debug15(print_vector_8(pairscores_best,rlo,c,"pairscores_std"));
4019 H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
4020 #endif
4021 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
4022 debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
4023
/* Nogap direction: record E >= H (ties count as a gap in jump-late). */
4024 dir_horiz = _mm_cmplt_epi8(E_r_gap,H_nogap_r); /* E < H */
4025 dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy); /* E >= H, for jump late */
4026 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4027 debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
4028
/* Final cell score H = max(H, E), stored into this column of the matrix. */
4029 #ifdef HAVE_SSE4_1
4030 H_nogap_r = _mm_max_epi8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
4031 #else
4032 /* Compare H + pairscores with horiz + extend */
4033 H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
4034 #endif
4035 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
4036 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
4037
4038
/* Only applies while column c still lies inside this row block, i.e. the
   diagonal cell (c,c) was part of the vector just stored. */
4039 /* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
4040 if (rhigh >= c) {
4041 (*directions_Egap)[c][c] = DIAG;
4042 (*directions_nogap)[c][c] = DIAG;
4043 }
4044
4045 /* No need for F loop here */
4046 E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
4047 }
4048 }
4049
4050 } else {
/* Same recurrence as the jump-late branch above, except that E starts at
   NEG_INFINITY_8+1 and directions are recorded with a strict E > H. */
4051 /* jump early */
4052 for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS_NONAVX2) {
4053 if ((rhigh = rlo + SIMD_NCHARS_NONAVX2 - 1) > rlength) {
4054 rhigh = rlength;
4055 }
4056
4057 /* dir_horiz tests if E > H. To fill in first column of each
4058 row block with non-diags, could make E > H. But irrelevant,
4059 because these are below the diagonal. */
4060 E_mask = _mm_set1_epi8(1);
4061
4062 E_r_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
4063 H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
4064
4065 for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
4066 score_column = matrix[c];
4067
4068 if (c == 0) {
4069 na2 = na2_alt = 4; /* 'N' */;
4070 } else {
4071 na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
4072 na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
4073 }
4074 pairscores_std_ptr = pairscores[na2];
4075 pairscores_alt_ptr = pairscores[na2_alt];
4076
/* Carry-in for the lane vacated by the H shift (see jump-late branch). */
4077 if (c == 0) {
4078 X_prev_nogap = _mm_set1_epi8(0);
4079 } else if (rlo == 0) {
4080 #ifdef ZERO_INITIAL_GAP_PENALTY
4081 X_prev_nogap = _mm_set1_epi8(0);
4082 #else
4083 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
4084 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
4085 #endif
4086 } else {
4087 /* second or greater block of 8 */
4088 X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
4089 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
4090 }
4091
4092 debug15(print_vector_8(E_mask,rlo,c,"E_mask"));
4093 #ifdef HAVE_SSE4_1
4094 E_r_gap = _mm_min_epi8(E_r_gap,_mm_add_epi8(E_mask,E_infinity));
4095 #else
4096 E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4097 #endif
4098 debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
4099 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
4100
/* _mm_cmpgt_epi8 gives the strict comparison directly, so no complement
   step is needed here. */
4101 /* EGAP */
4102 T1 = _mm_adds_epi8(H_nogap_r, gap_open);
4103 dir_horiz = _mm_cmpgt_epi8(E_r_gap,T1); /* E > H, for jump early */
4104 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4105 debug15(print_vector_8(T1,rlo,c,"T1"));
4106 debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
4107
/* E = min(max(E, T1) + extend, mask), in the same three variants as above. */
4108 /* Compare H + open with vert */
4109 #ifdef HAVE_SSE4_1
4110 E_r_gap = _mm_max_epi8(E_r_gap, T1); /* Compare H + open with vert */
4111 E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4112 E_r_gap = _mm_min_epi8(E_r_gap,_mm_add_epi8(E_mask,E_infinity));
4113 #elif 1
4114 E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
4115 E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4116 E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4117 #else
4118 /* Try to avoid unnecessary shifts by 128, but overflows */
4119 E_r_gap = _mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128));
4120 E_r_gap = _mm_add_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4121 E_r_gap = _mm_sub_epi8(_mm_min_epu8(E_r_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4122 #endif
4123 debug15(print_vector_8(E_r_gap,rlo,c,"E"));
4124
4125
4126 /* NOGAP */
4127 T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_NONAVX2);
4128 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
4129 H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
4130 X_prev_nogap = T1;
4131
4132 /* Add pairscores, allowing for alternate genomic nt */
4133 #ifdef HAVE_SSE4_1
4134 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
4135 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
4136 debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4137 H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
4138 #else
4139 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
4140 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
4141 pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
4142 debug15(print_vector_8(pairscores_best,rlo,c,"pairscores_std"));
4143 H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
4144 #endif
4145 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
4146 debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
4147
4148 dir_horiz = _mm_cmpgt_epi8(E_r_gap,H_nogap_r); /* E > H, for jump early */
4149 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4150 debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
4151
4152
4153 #ifdef HAVE_SSE4_1
4154 H_nogap_r = _mm_max_epi8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
4155 #else
4156 /* Compare H + pairscores with horiz + extend */
4157 H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
4158 #endif
4159 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
4160 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
4161
4162 /* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
4163 if (rhigh >= c) {
4164 (*directions_Egap)[c][c] = DIAG;
4165 (*directions_nogap)[c][c] = DIAG;
4166 }
4167
4168 /* No need for F loop here */
4169 E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
4170 }
4171 }
4172 }
4173
4174 #ifdef CHECK1
4175 /* Row 0 and column 0 directions fail anyway due to saturation */
4176 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
4177 (*directions_Egap)[1][0] = HORIZ;
4178 #endif
4179
4180 #ifdef DEBUG2
4181 printf("SIMD: Dynprog_simd_8_upper\n");
4182 Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
4183 revp,uband,/*upperp*/true);
4184 Directions8_print_ud(*directions_nogap,*directions_Egap,
4185 rlength,glength,rsequence,gsequence,gsequence_alt,
4186 revp,uband,/*upperp*/true);
4187 #endif
4188
4189 #ifdef CHECK1
4190 /* Check for row 0 directions */
4191 for (c = 1; c <= uband && c <= glength; c++) {
4192 assert((*directions_Egap)[c][0] != DIAG);
4193 assert((*directions_nogap)[c][0] != DIAG);
4194 }
4195 #endif
4196
/* Release the temporary pair-score rows; the matrix and direction arrays
   are not freed here and are handed back to the caller. */
4197 _mm_free(pairscores[4]);
4198 _mm_free(pairscores[3]);
4199 _mm_free(pairscores[2]);
4200 _mm_free(pairscores[1]);
4201 _mm_free(pairscores[0]);
4202
4203 return matrix;
4204 }
4205 #endif
4206
4207
4208 /* E_mask works at the wraparound from POS_INFINITY to NEG_INFINITY.
4209 It is designed to prevent a horizontal/vertical jump into the empty
4210 triangle, by setting horizontal/vertical scores to be as small as
4211 possible, e.g., -128. However, it is possible that H is also -128,
4212 so we still need to fix the directions along the main diagonal.
4213
4214 E_mask shifted: 0 0 0 0 1 1 1 1
4215 add E_infinity: 127 127 127 127 127 127 127 127
4216 resulting mask: 127 127 127 127 -128 -128 -128 -128
4217
4218 To deal with non-SSE4.1 systems, which lack _mm_min_epi8, we need
4219 to add 128 to E and mask, then take _mm_min_epu8, then subtract
4220 128, as follows:
4221
4222 E_mask shifted: 0 0 0 0 1 1 1 1
4223 add E_inf+128: 255 255 255 255 255 255 255 255
4224 resulting mask: 255 255 255 255 0 0 0 0
4225 (compare w/E+128)
4226
4227 */
4228
4229
4230 #ifdef HAVE_SSE2
4231 /* Designed for computation above the main diagonal, so no F loop or bottom masking needed */
4232 /* Operates by columns */
4233 Score8_T **
Dynprog_simd_8_upper(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int uband,bool jump_late_p,bool revp)4234 Dynprog_simd_8_upper (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
4235 T this, char *rsequence, char *gsequence, char *gsequence_alt,
4236 int rlength, int glength,
4237 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
4238 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
4239 #endif
4240 Mismatchtype_T mismatchtype, int open, int extend,
4241 int uband, bool jump_late_p, bool revp) {
4242 Score8_T **matrix, *score_column;
4243 #ifdef HAVE_AVX2
4244 __m256i E_infinity;
4245 #elif defined(HAVE_SSE4_1)
4246 __m128i E_infinity;
4247 #else
4248 __m128i E_infinity_plus_128;
4249 __m128i pairscores_best, all_128;
4250 #endif
4251 #ifdef HAVE_AVX2
4252 __m256i pairscores_std, pairscores_alt;
4253 __m256i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, T1;
4254 __m256i gap_open, gap_extend, complement_dummy;
4255 __m256i dir_horiz;
4256 Score8_T save;
4257 #else
4258 __m128i pairscores_std, pairscores_alt;
4259 __m128i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, T1;
4260 __m128i gap_open, gap_extend, complement_dummy;
4261 __m128i dir_horiz;
4262 #endif
4263 int rlength_ceil, r, c;
4264 int rlo, rhigh;
4265 int na1, na2, na2_alt;
4266 Score8_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr;
4267 Pairdistance_T **pairdistance_array_type;
4268
4269 #ifdef DEBUG_AVX2
4270 Score8_T **matrix_std;
4271 Direction8_T **directions_nogap_std, **directions_Egap_std;
4272 char na2_single;
4273 #elif defined(DEBUG_SIMD)
4274 Score32_T **matrix_std;
4275 Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
4276 char na2_single;
4277 #endif
4278
4279
4280 debug2(printf("Dynprog_simd_8_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
4281 debug15(printf("Dynprog_simd_8_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
4282
4283 rlength_ceil = (int) ((rlength + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
4284
4285 #ifdef HAVE_SSE4_1
4286 pairdistance_array_type = pairdistance_array[mismatchtype];
4287 #else
4288 /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
4289 pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
4290 all_128 = _mm_set1_epi8(128);
4291 #endif
4292
4293 debug(printf("compute_scores_simd_8_bycols (upper): "));
4294 debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
4295 debug(printf("Query length rounded up to %d\n",rlength_ceil));
4296
4297 matrix = aligned_score8_alloc(rlength_ceil,glength,
4298 this->aligned.two.upper_matrix_ptrs,this->aligned.two.upper_matrix_space);
4299 *directions_nogap = aligned_directions8_alloc(rlength_ceil,glength,
4300 this->aligned.two.upper_directions_ptrs_0,this->aligned.two.upper_directions_space_0);
4301 *directions_Egap = aligned_directions8_alloc(rlength_ceil,glength,
4302 this->aligned.two.upper_directions_ptrs_1,this->aligned.two.upper_directions_space_1);
4303
4304 #if 0
4305 /* Row 0 initialization */
4306 /* penalty = open; */
4307 for (c = 1; c <= uband && c <= glength; c++) {
4308 /* penalty += extend; */
4309 (*directions_Egap)[c][0] = HORIZ;
4310 (*directions_nogap)[c][0] = HORIZ;
4311 }
4312 #endif
4313 #if 0
4314 /* Already initialized to DIAG. Actually no longer initializing directions_Egap */
4315 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
4316 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
4317 #endif
4318 #if 0
4319 /* Column 0 initialization */
4320 /* penalty = open; */
4321 for (r = 1; r <= SIMD_NCHARS && r <= rlength; r++) {
4322 /* penalty += extend; */
4323 (*directions_nogap)[0][r] = VERT;
4324 }
4325 #endif
4326
4327
4328 /* Load pairscores. Store match - mismatch */
4329 pairscores[0] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4330 pairscores[1] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4331 pairscores[2] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4332 pairscores[3] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4333 pairscores[4] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4334
4335 #if 0
4336 /* Should not be necessary */
4337 memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score8_T));
4338 memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score8_T));
4339 memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score8_T));
4340 memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score8_T));
4341 memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score8_T));
4342 #endif
4343
4344 /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
4345 r = 0; na1 = 'N';
4346 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
4347 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
4348 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
4349 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
4350 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
4351
4352 if (revp == false) {
4353 for (r = 1; r <= rlength; r++) {
4354 na1 = (int) rsequence[r-1];
4355 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
4356 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
4357 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
4358 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
4359 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
4360 }
4361 } else {
4362 for (r = 1; r <= rlength; r++) {
4363 na1 = (int) rsequence[1-r];
4364 pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
4365 pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
4366 pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
4367 pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
4368 pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
4369 }
4370 }
4371
4372 #if 0
4373 /* Should not be necessary */
4374 memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4375 memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4376 memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4377 memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4378 memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4379 #endif
4380
4381 complement_dummy = _MM_SET1_EPI8(-1);
4382
4383 gap_open = _MM_SET1_EPI8((Score8_T) open);
4384 gap_extend = _MM_SET1_EPI8((Score8_T) extend);
4385
4386 #ifdef HAVE_SSE4_1
4387 E_infinity = _MM_SET1_EPI8(POS_INFINITY_8);
4388 #else
4389 E_infinity_plus_128 = _mm_set1_epi8(POS_INFINITY_8+128);
4390 #endif
4391 if (jump_late_p) {
4392 for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS) {
4393 if ((rhigh = rlo + SIMD_NCHARS - 1) > rlength) {
4394 rhigh = rlength;
4395 }
4396
4397 /* dir_horiz tests if E >= H . To fill in first column of each
4398 row block with non-diags, could make E == H. But irrelevant,
4399 because these are below the diagonal. */
4400 E_mask = _MM_SET1_EPI8(1);
4401
4402 /* Holds for all INITIAL_GAP_PENALTY */
4403 E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8);
4404 H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
4405
4406 for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
4407 score_column = matrix[c];
4408
4409 if (c == 0) {
4410 na2 = na2_alt = 4; /* 'N' */
4411 } else {
4412 na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
4413 na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
4414 }
4415 pairscores_std_ptr = pairscores[na2];
4416 pairscores_alt_ptr = pairscores[na2_alt];
4417
4418 if (c == 0) {
4419 X_prev_nogap = _MM_SETZERO_SI();
4420 } else if (rlo == 0) {
4421 #ifdef ZERO_INITIAL_GAP_PENALTY
4422 X_prev_nogap = _MM_SETZERO_SI();
4423 #elif defined(HAVE_AVX2)
4424 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
4425 #else
4426 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
4427 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
4428 #endif
4429 } else {
4430 /* second or greater block of 8 */
4431 #ifdef ZERO_INITIAL_GAP_PENALTY
4432 X_prev_nogap = _MM_SETZERO_SI();
4433 #elif defined(HAVE_AVX2)
4434 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_CHAR_INSERT);
4435 #else
4436 X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
4437 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
4438 #endif
4439 }
4440
4441 debug15(print_vector_8(E_mask,rlo,c,"E_mask"));
4442 #ifdef HAVE_SSE4_1
4443 E_r_gap = _MM_MIN_EPI8(E_r_gap,_MM_ADD_EPI8(E_mask,E_infinity));
4444 #else
4445 E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4446 #endif
4447 debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
4448 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
4449
4450 /* EGAP */
4451 T1 = _MM_ADDS_EPI8(H_nogap_r, gap_open);
4452 dir_horiz = _MM_CMPLT_EPI8(E_r_gap,T1); /* E < H */
4453 dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy); /* E >= H, for jump late */
4454 #ifdef HAVE_AVX2
4455 _mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4456 #else
4457 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4458 #endif
4459 debug15(print_vector_8(T1,rlo,c,"T1"));
4460 debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
4461
4462 #ifdef HAVE_SSE4_1
4463 E_r_gap = _MM_MAX_EPI8(E_r_gap, T1); /* Compare H + open with vert */
4464 E_r_gap = _MM_ADDS_EPI8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4465 E_r_gap = _MM_MIN_EPI8(E_r_gap,_MM_ADD_EPI8(E_mask,E_infinity));
4466 #elif 1
4467 E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
4468 E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4469 E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4470 #else
4471 /* Try to avoid unnecessary shifts by 128, but overflows */
4472 E_r_gap = _mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128));
4473 E_r_gap = _mm_add_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4474 E_r_gap = _mm_sub_epi8(_mm_min_epu8(E_r_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4475 #endif
4476 debug15(print_vector_8(E_r_gap,rlo,c,"E"));
4477
4478
4479 /* NOGAP */
4480 #ifdef HAVE_AVX2
4481 T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_r,SIMD_NCHARS-1),LAST_CHAR_INSERT);
4482 X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_r,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
4483 H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_CHAR);
4484 #else
4485 T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_SHIFT);
4486 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
4487 #endif
4488 H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
4489 X_prev_nogap = T1;
4490
4491 /* Add pairscores, allowing for alternate genomic nt */
4492 #ifdef HAVE_AVX2
4493 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
4494 pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
4495 debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4496 H_nogap_r = _MM_ADDS_EPI8(H_nogap_r, _MM_MAX_EPI8(pairscores_std,pairscores_alt));
4497 #elif defined(HAVE_SSE4_1)
4498 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
4499 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
4500 debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4501 H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
4502 #else
4503 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
4504 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
4505 pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
4506 debug15(print_vector_8(pairscores_best,rlo,c,"pairscores_std"));
4507 H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
4508 #endif
4509 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
4510 debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
4511
4512 dir_horiz = _MM_CMPLT_EPI8(E_r_gap,H_nogap_r); /* E < H */
4513 dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy); /* E >= H, for jump late */
4514 #ifdef HAVE_AVX2
4515 _mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4516 #else
4517 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4518 #endif
4519 debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
4520
4521 #ifdef HAVE_SSE4_1
4522 H_nogap_r = _MM_MAX_EPI8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
4523 #else
4524 /* Compare H + pairscores with horiz + extend */
4525 H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
4526 #endif
4527 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
4528 #ifdef HAVE_AVX2
4529 _mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
4530 #else
4531 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
4532 #endif
4533
4534
4535 /* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
4536 if (rhigh >= c) {
4537 (*directions_Egap)[c][c] = DIAG;
4538 (*directions_nogap)[c][c] = DIAG;
4539 }
4540
4541 /* No need for F loop here */
4542 #ifdef HAVE_AVX2
4543 save = _mm256_extract_epi8(E_mask,15);
4544 E_mask = _mm256_slli_si256(E_mask,ONE_CHAR);
4545 E_mask = _mm256_insert_epi8(E_mask,save,16);
4546 #else
4547 E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
4548 #endif
4549 }
4550 }
4551
4552 } else {
4553 /* jump early */
4554 for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS) {
4555 if ((rhigh = rlo + SIMD_NCHARS - 1) > rlength) {
4556 rhigh = rlength;
4557 }
4558
4559 /* dir_horiz tests if E > H. To fill in first column of each
4560 row block with non-diags, could make E > H. But irrelevant,
4561 because these are below the diagonal. */
4562 E_mask = _MM_SET1_EPI8(1);
4563
4564 /* Holds for all INITIAL_GAP_PENALTY */
4565 E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8+1);
4566 H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
4567
4568 for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
4569 score_column = matrix[c];
4570
4571 if (c == 0) {
4572 na2 = na2_alt = 4; /* 'N' */;
4573 } else {
4574 na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
4575 na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
4576 }
4577 pairscores_std_ptr = pairscores[na2];
4578 pairscores_alt_ptr = pairscores[na2_alt];
4579
4580 if (c == 0) {
4581 X_prev_nogap = _MM_SETZERO_SI();
4582 } else if (rlo == 0) {
4583 #ifdef ZERO_INITIAL_GAP_PENALTY
4584 X_prev_nogap = _MM_SETZERO_SI();
4585 #elif defined(HAVE_AVX2)
4586 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
4587 #else
4588 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
4589 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
4590 #endif
4591 } else {
4592 /* second or greater block of 8 */
4593 #ifdef ZERO_INITIAL_GAP_PENALTY
4594 X_prev_nogap = _MM_SETZERO_SI();
4595 #elif defined(HAVE_AVX2)
4596 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_CHAR_INSERT);
4597 #else
4598 X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
4599 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
4600 #endif
4601 }
4602
4603 debug15(print_vector_8(E_mask,rlo,c,"E_mask"));
4604 #ifdef HAVE_SSE4_1
4605 E_r_gap = _MM_MIN_EPI8(E_r_gap,_MM_ADD_EPI8(E_mask,E_infinity));
4606 #else
4607 E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4608 #endif
4609 debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
4610 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
4611
4612 /* EGAP */
4613 T1 = _MM_ADDS_EPI8(H_nogap_r, gap_open);
4614 dir_horiz = _MM_CMPGT_EPI8(E_r_gap,T1); /* E > H, for jump early */
4615 #ifdef HAVE_AVX2
4616 _mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4617 #else
4618 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4619 #endif
4620 debug15(print_vector_8(T1,rlo,c,"T1"));
4621 debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
4622
4623 /* Compare H + open with vert */
4624 #ifdef HAVE_SSE4_1
4625 E_r_gap = _MM_MAX_EPI8(E_r_gap, T1); /* Compare H + open with vert */
4626 E_r_gap = _MM_ADDS_EPI8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4627 E_r_gap = _MM_MIN_EPI8(E_r_gap,_MM_ADD_EPI8(E_mask,E_infinity));
4628 #elif 1
4629 E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
4630 E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4631 E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4632 #else
4633 /* Try to avoid unnecessary shifts by 128, but overflows */
4634 E_r_gap = _mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128));
4635 E_r_gap = _mm_add_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4636 E_r_gap = _mm_sub_epi8(_mm_min_epu8(E_r_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4637 #endif
4638 debug15(print_vector_8(E_r_gap,rlo,c,"E"));
4639
4640
4641 /* NOGAP */
4642 #ifdef HAVE_AVX2
4643 T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_r,SIMD_NCHARS-1),LAST_CHAR_INSERT);
4644 X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_r,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
4645 H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_CHAR);
4646 #else
4647 T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_SHIFT);
4648 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
4649 #endif
4650 H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
4651 X_prev_nogap = T1;
4652
4653 /* Add pairscores, allowing for alternate genomic nt */
4654 #ifdef HAVE_AVX2
4655 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
4656 pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
4657 debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4658 H_nogap_r = _MM_ADDS_EPI8(H_nogap_r, _MM_MAX_EPI8(pairscores_std,pairscores_alt));
4659 #elif defined(HAVE_SSE4_1)
4660 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
4661 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
4662 debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4663 H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
4664 #else
4665 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
4666 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
4667 pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
4668 debug15(print_vector_8(pairscores_best,rlo,c,"pairscores_std"));
4669 H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
4670 #endif
4671 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
4672 debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
4673
4674 dir_horiz = _MM_CMPGT_EPI8(E_r_gap,H_nogap_r); /* E > H, for jump early */
4675 #ifdef HAVE_AVX2
4676 _mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4677 #else
4678 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4679 #endif
4680 debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
4681
4682
4683 #ifdef HAVE_SSE4_1
4684 H_nogap_r = _MM_MAX_EPI8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
4685 #else
4686 /* Compare H + pairscores with horiz + extend */
4687 H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
4688 #endif
4689 debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
4690 #ifdef HAVE_AVX2
4691 _mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
4692 #else
4693 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
4694 #endif
4695
4696 /* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
4697 if (rhigh >= c) {
4698 (*directions_Egap)[c][c] = DIAG;
4699 (*directions_nogap)[c][c] = DIAG;
4700 }
4701
4702 /* No need for F loop here */
4703 #ifdef HAVE_AVX2
4704 save = _mm256_extract_epi8(E_mask,15);
4705 E_mask = _mm256_slli_si256(E_mask,ONE_CHAR);
4706 E_mask = _mm256_insert_epi8(E_mask,save,16);
4707 #else
4708 E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
4709 #endif
4710 }
4711 }
4712 }
4713
4714
4715 #ifdef CHECK1
4716 /* Row 0 and column 0 directions fail anyway due to saturation */
4717 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
4718 (*directions_Egap)[1][0] = HORIZ;
4719 #endif
4720
4721 #ifdef DEBUG2
4722 printf("SIMD: Dynprog_simd_8_upper\n");
4723 Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
4724 revp,uband,/*upperp*/true);
4725 Directions8_print_ud(*directions_nogap,*directions_Egap,
4726 rlength,glength,rsequence,gsequence,gsequence_alt,
4727 revp,uband,/*upperp*/true);
4728 #endif
4729
4730 #ifdef CHECK1
4731 /* Check for row 0 directions */
4732 for (c = 1; c <= uband && c <= glength; c++) {
4733 assert((*directions_Egap)[c][0] != DIAG);
4734 assert((*directions_nogap)[c][0] != DIAG);
4735 }
4736 #endif
4737
4738 #ifdef DEBUG_AVX2
4739 matrix_std = Dynprog_simd_8_upper_nonavx2(&directions_nogap_std,&directions_Egap_std,
4740 this,rsequence,gsequence,gsequence_alt,
4741 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
4742 open,extend,uband,jump_late_p,revp);
4743
4744 #elif defined(DEBUG_SIMD)
4745 matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
4746 this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
4747 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
4748 open,extend,/*lband*/0,uband,jump_late_p,revp,/*saturation*/NEG_INFINITY_8,
4749 /*upperp*/true,/*lowerp*/false);
4750 #endif
4751
4752 #ifdef DEBUG2
4753 printf("Banded %s\n",revp ? "rev" : "fwd");
4754 Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,revp,uband,/*upperp*/true);
4755 Directions8_print_ud(*directions_nogap,*directions_Egap,
4756 rlength,glength,rsequence,gsequence,gsequence_alt,revp,uband,/*upperp*/true);
4757 #endif
4758
4759 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
4760 banded_matrix8_compare_upper(matrix,matrix_std,rlength,glength,uband,
4761 rsequence,gsequence,gsequence_alt,
4762 goffset,chroffset,chrhigh,watsonp,revp);
4763
4764 banded_directions8_compare_nogap_upper(matrix,*directions_nogap,directions_nogap_std,rlength,glength,uband);
4765
4766 banded_directions8_compare_Egap_upper(matrix,*directions_Egap,directions_Egap_std,rlength,glength,uband);
4767 #endif
4768
4769 _mm_free(pairscores[4]);
4770 _mm_free(pairscores[3]);
4771 _mm_free(pairscores[2]);
4772 _mm_free(pairscores[1]);
4773 _mm_free(pairscores[0]);
4774
4775 return matrix;
4776 }
4777 #endif
4778
4779
4780 #ifdef DEBUG_AVX2
/*
 * Dynprog_simd_8_lower_nonavx2
 *
 * 128-bit (__m128i) implementation of the 8-bit banded dynamic-programming
 * fill for the cells on and below the main diagonal.  The definition sits
 * inside an #ifdef DEBUG_AVX2 section.
 *
 * Layout: one vector lane per genomic column.  Columns are processed in
 * blocks of SIMD_NCHARS_NONAVX2 (clo..chigh), and within a block the rows
 * are visited top to bottom, so the gap state carried from one iteration
 * to the next (E_c_gap) runs down a column, i.e. it is a vertical gap.
 *
 * Outputs:
 *   return value       score matrix obtained from aligned_score8_alloc;
 *                      written here as matrix[r][c] (r = query row,
 *                      c = genomic column)
 *   *directions_nogap  per-cell compare mask of E against H + pairscore
 *   *directions_Egap   per-cell compare mask of E against H + open
 *   The three arrays come from the aligned_*_alloc helpers, which are handed
 *   the this->aligned_std.two.lower_* buffers; they are not freed here.
 *   Only the temporary pairscores[] tables are allocated and freed locally.
 *
 * Inputs:
 *   rsequence, rlength   query; row r reads rsequence[r-1], or
 *                        rsequence[1-r] when revp is true
 *   gsequence, gsequence_alt, glength
 *                        genomic segment and its alternate; column c reads
 *                        index c-1, or 1-c when revp is true
 *   mismatchtype         selects the pairdistance table
 *   open, extend         gap penalties, broadcast into every lane
 *   lband                how many rows below the diagonal are filled
 *   jump_late_p          true: a direction is flagged when E >= H;
 *                        false: only when E > H
 *   goffset, chroffset, chrhigh, watsonp
 *                        not referenced anywhere in this function body
 */
4781 /* Designed for computation below the main diagonal, so no F loop or bottom masking needed */
4782 /* Operates by rows */
4783 Score8_T **
Dynprog_simd_8_lower_nonavx2(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,bool jump_late_p,bool revp)4784 Dynprog_simd_8_lower_nonavx2 (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
4785 T this, char *rsequence, char *gsequence, char *gsequence_alt,
4786 int rlength, int glength,
4787 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
4788 Mismatchtype_T mismatchtype, int open, int extend,
4789 int lband, bool jump_late_p, bool revp) {
/* score_column points at matrix[r], i.e. one query row of the score matrix */
4790 Score8_T **matrix, *score_column;
4791 __m128i pairscores_std;
4792 #ifdef HAVE_SSE4_1
4793 __m128i E_infinity;
4794 #else
4795 __m128i pairscores_best, all_128, E_infinity_plus_128;
4796 #endif
4797 __m128i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, T1;
4798 __m128i gap_open, gap_extend, complement_dummy;
4799 __m128i dir_vert;
4800 int glength_ceil, r, c;
4801 int clo, chigh;
4802 int na1, na2, na2_alt;
/* pairscores[0..4] are per-column score tables for query nt A, C, G, T, N */
4803 Score8_T *pairscores[5], *pairscores_ptr;
4804 Pairdistance_T **pairdistance_array_type, score1, score2;
4805
4806
4807 debug2(printf("Dynprog_simd_8_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
4808 debug15(printf("Dynprog_simd_8_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
4809
/* Smallest multiple of SIMD_NCHARS_NONAVX2 strictly greater than glength,
   so that columns 0..glength are covered by whole vectors */
4810 glength_ceil = (int) ((glength + SIMD_NCHARS_NONAVX2)/SIMD_NCHARS_NONAVX2) * SIMD_NCHARS_NONAVX2;
4811
4812 #ifdef HAVE_SSE4_1
4813 pairdistance_array_type = pairdistance_array[mismatchtype];
4814 #else
4815 /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
/* Without SSE4.1 every signed max/min below is emulated as:
   add all_128 to both operands, take the unsigned max/min, subtract all_128 */
4816 pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
4817 all_128 = _mm_set1_epi8(128);
4818 #endif
4819
4820 debug(printf("compute_scores_simd_8_byrows (lower): "));
4821 debug(printf("Lengths are %d and %d, so band is %d on left\n",rlength,glength,lband));
4822 debug(printf("Genome length rounded up to %d\n",glength_ceil));
4823
4824 matrix = aligned_score8_alloc(glength_ceil,rlength,
4825 this->aligned_std.two.lower_matrix_ptrs,this->aligned_std.two.lower_matrix_space);
4826 *directions_nogap = aligned_directions8_alloc(glength_ceil,rlength,
4827 this->aligned_std.two.lower_directions_ptrs_0,this->aligned_std.two.lower_directions_space_0);
4828 *directions_Egap = aligned_directions8_alloc(glength_ceil,rlength,
4829 this->aligned_std.two.lower_directions_ptrs_1,this->aligned_std.two.lower_directions_space_1);
4830
4831 #if 0
4832 /* Column 0 initialization */
4833 /* penalty = open; */
4834 for (r = 1; r <= lband && r <= rlength; r++) {
4835 /* penalty += extend; */
4836 (*directions_Egap)[r][0] = VERT;
4837 (*directions_nogap)[r][0] = VERT;
4838 }
4839 #endif
4840 #if 0
4841 /* Already initialized to DIAG. Actually no longer initializing directions_Egap */
4842 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
4843 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
4844 #endif
4845 #if 0
4846 /* Row 0 initialization */
4847 /* penalty = open; */
4848 for (c = 1; c <= SIMD_NCHARS_NONAVX2 && c <= glength; c++) {
4849 /* penalty += extend; */
4850 (*directions_nogap)[0][c] = HORIZ;
4851 }
4852 #endif
4853
4854
4855 /* Load pairscores. Store match - mismatch */
/* One table per query nucleotide, indexed by genomic column 0..glength.
   16-byte alignment because the tables are read with _mm_load_si128 below.
   NOTE(review): the _mm_malloc results are used without a NULL check. */
4856 pairscores[0] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4857 pairscores[1] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4858 pairscores[2] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4859 pairscores[3] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4860 pairscores[4] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4861
4862 #if 0
4863 /* Should not be necessary */
4864 memset((void *) pairscores[0],0,glength_ceil*sizeof(Score8_T));
4865 memset((void *) pairscores[1],0,glength_ceil*sizeof(Score8_T));
4866 memset((void *) pairscores[2],0,glength_ceil*sizeof(Score8_T));
4867 memset((void *) pairscores[3],0,glength_ceil*sizeof(Score8_T));
4868 memset((void *) pairscores[4],0,glength_ceil*sizeof(Score8_T));
4869 #endif
4870
4871 /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
/* Column 0 has no genomic character; the tables are indexed with 4 there.
   NOTE(review): in the non-SSE4.1 branch these column-0 entries have 128
   subtracted, whereas columns >= 1 store the plus-128 table values as is
   and the fill loop subtracts all_128 after loading -- confirm intended. */
4872 c = 0; na2 = na2_alt = 4; /* 'N' */
4873 #ifdef HAVE_SSE4_1
4874 pairscores[0][c] = (Score8_T) pairdistance_array_type[(int) 'A'][na2];
4875 pairscores[1][c] = (Score8_T) pairdistance_array_type[(int) 'C'][na2];
4876 pairscores[2][c] = (Score8_T) pairdistance_array_type[(int) 'G'][na2];
4877 pairscores[3][c] = (Score8_T) pairdistance_array_type[(int) 'T'][na2];
4878 pairscores[4][c] = (Score8_T) pairdistance_array_type[(int) 'N'][na2];
4879 #else
4880 pairscores[0][c] = (Score8_T) pairdistance_array_type[(int) 'A'][na2] - 128;
4881 pairscores[1][c] = (Score8_T) pairdistance_array_type[(int) 'C'][na2] - 128;
4882 pairscores[2][c] = (Score8_T) pairdistance_array_type[(int) 'G'][na2] - 128;
4883 pairscores[3][c] = (Score8_T) pairdistance_array_type[(int) 'T'][na2] - 128;
4884 pairscores[4][c] = (Score8_T) pairdistance_array_type[(int) 'N'][na2] - 128;
4885 #endif
4886
/* Columns 1..glength: for each query nucleotide keep the better of the
   scores against the standard and the alternate genomic character.
   The two loops differ only in the index used (c-1 forward, 1-c reverse).
   NOTE(review): in the ternaries below the (Score8_T) cast binds to the
   comparison (score1 > score2), not to the selected value; the conversion
   to Score8_T happens on assignment. */
4887 if (revp == false) {
4888 for (c = 1; c <= glength; c++) {
4889 na2 = gsequence[c-1];
4890 na2_alt = gsequence_alt[c-1];
4891 /* Take max here */
4892 score1 = pairdistance_array_type[(int) 'A'][na2];
4893 score2 = pairdistance_array_type[(int) 'A'][na2_alt];
4894 pairscores[0][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4895
4896 score1 = pairdistance_array_type[(int) 'C'][na2];
4897 score2 = pairdistance_array_type[(int) 'C'][na2_alt];
4898 pairscores[1][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4899
4900 score1 = pairdistance_array_type[(int) 'G'][na2];
4901 score2 = pairdistance_array_type[(int) 'G'][na2_alt];
4902 pairscores[2][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4903
4904 score1 = pairdistance_array_type[(int) 'T'][na2];
4905 score2 = pairdistance_array_type[(int) 'T'][na2_alt];
4906 pairscores[3][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4907
4908 score1 = pairdistance_array_type[(int) 'N'][na2];
4909 score2 = pairdistance_array_type[(int) 'N'][na2_alt];
4910 pairscores[4][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4911 }
4912 } else {
4913 for (c = 1; c <= glength; c++) {
4914 na2 = gsequence[1-c];
4915 na2_alt = gsequence_alt[1-c];
4916 /* Take max here */
4917 score1 = pairdistance_array_type[(int) 'A'][na2];
4918 score2 = pairdistance_array_type[(int) 'A'][na2_alt];
4919 pairscores[0][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4920
4921 score1 = pairdistance_array_type[(int) 'C'][na2];
4922 score2 = pairdistance_array_type[(int) 'C'][na2_alt];
4923 pairscores[1][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4924
4925 score1 = pairdistance_array_type[(int) 'G'][na2];
4926 score2 = pairdistance_array_type[(int) 'G'][na2_alt];
4927 pairscores[2][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4928
4929 score1 = pairdistance_array_type[(int) 'T'][na2];
4930 score2 = pairdistance_array_type[(int) 'T'][na2_alt];
4931 pairscores[3][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4932
4933 score1 = pairdistance_array_type[(int) 'N'][na2];
4934 score2 = pairdistance_array_type[(int) 'N'][na2_alt];
4935 pairscores[4][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4936 }
4937 }
4938
/* Table entries glength+1 .. glength_ceil-1 are never written (the memset
   below is disabled), so vector loads in the last column block pick up
   unset values in the lanes past glength. */
4939 #if 0
4940 /* Should not be necessary */
4941 memset((void *) &(pairscores[0][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4942 memset((void *) &(pairscores[1][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4943 memset((void *) &(pairscores[2][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4944 memset((void *) &(pairscores[3][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4945 memset((void *) &(pairscores[4][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4946 #endif
4947
/* All-ones vector: andnot(x, complement_dummy) yields the bitwise NOT of x */
4948 complement_dummy = _mm_set1_epi8(-1);
4949
4950 gap_open = _mm_set1_epi8((Score8_T) open);
4951 gap_extend = _mm_set1_epi8((Score8_T) extend);
4952
4953 #ifdef HAVE_SSE4_1
4954 E_infinity = _mm_set1_epi8(POS_INFINITY_8);
4955 #else
4956 E_infinity_plus_128 = _mm_set1_epi8(POS_INFINITY_8+128);
4957 #endif
/* The two branches below are the same fill; they differ in the initial
   value of E_c_gap and in whether a tie between E and H is flagged
   (jump late: E >= H) or not (jump early: E > H). */
4958 if (jump_late_p) {
/* One pass per block of SIMD_NCHARS_NONAVX2 columns, clo..chigh */
4959 for (clo = 0; clo <= glength; clo += SIMD_NCHARS_NONAVX2) {
4960 if ((chigh = clo + SIMD_NCHARS_NONAVX2 - 1) > glength) {
4961 chigh = glength;
4962 }
4963
4964 /* dir_vert tests if E >= H. To fill in first row of each
4965 column block with non-diags, make E == H. */
/* E_mask starts as 1 in every lane and is shifted up by one byte after
   each row, so at row r the lanes for columns >= r still hold 1 */
4966 E_mask = _mm_set1_epi8(1);
4967
4968 E_c_gap = _mm_set1_epi8(NEG_INFINITY_8);
4969 H_nogap_c = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
4970
/* Rows run from the diagonal of the block's first column down to lband
   rows below the block's last column, bounded by rlength */
4971 for (r = clo; r <= chigh + lband && r <= rlength; r++) {
4972 score_column = matrix[r];
4973
4974 if (r == 0) {
4975 na1 = 4; /* 'N' */
4976 } else {
4977 na1 = revp ? nt_to_int_array[rsequence[1-r]] : nt_to_int_array[rsequence[r-1]];
4978 }
4979 pairscores_ptr = pairscores[na1];
4980
/* X_prev_nogap supplies lane 0 of the diagonally shifted H vector, i.e. the
   diagonal predecessor of cell (r, clo).
   NOTE(review): the byte shift by LAST_CHAR_NONAVX2 presumably leaves the
   value in lane 0 only -- confirm against the macro's definition. */
4981 if (r == 0) {
4982 X_prev_nogap = _mm_set1_epi8(0);
4983 } else if (clo == 0) {
4984 #ifdef ZERO_INITIAL_GAP_PENALTY
4985 X_prev_nogap = _mm_set1_epi8(0);
4986 #else
4987 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
4988 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
4989 #endif
4990 } else {
4991 /* second or later column block */
4992 X_prev_nogap = _mm_set1_epi8(matrix[r-1][clo-1]); /* H at row r-1, column clo-1 (last column of the previous block) */
4993 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
4994 }
4995
4996 debug15(print_vector_8(E_mask,clo,r,"E_mask"));
/* Clamp E to E_mask + E_infinity (wrapping add).
   NOTE(review): presumably POS_INFINITY_8 + 1 wraps to the most negative
   8-bit value, pinning E in the lanes where E_mask is still 1 and leaving
   the other lanes unchanged -- confirm against POS_INFINITY_8. */
4997 #ifdef HAVE_SSE4_1
4998 E_c_gap = _mm_min_epi8(E_c_gap,_mm_add_epi8(E_mask,E_infinity));
4999 #else
5000 E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5001 #endif
5002 debug15(print_vector_8(E_c_gap,clo,r,"E_c_gap"));
5003 debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c load"));
5004
5005 /* EGAP */
/* H_nogap_c still holds row r-1 here, so T1 = H[r-1][c] + open (saturating) */
5006 T1 = _mm_adds_epi8(H_nogap_c, gap_open);
5007 dir_vert = _mm_cmplt_epi8(E_c_gap,T1); /* E < H */
5008 dir_vert = _mm_andnot_si128(dir_vert,complement_dummy); /* E >= H, for jump late */
5009 _mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
5010 debug15(print_vector_8(T1,clo,r,"T1"));
5011 debug15(print_vector_8(dir_vert,clo,r,"dir_Egap"));
5012
/* "#elif 1" is always taken when HAVE_SSE4_1 is undefined, so the final
   #else variant is never compiled */
5013 #ifdef HAVE_SSE4_1
5014 E_c_gap = _mm_max_epi8(E_c_gap, T1); /* Compare H + open with vert */
5015 E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Add the extension penalty to get E for this row */
5016 E_c_gap = _mm_min_epi8(E_c_gap,_mm_add_epi8(E_mask,E_infinity));
5017 #elif 1
5018 E_c_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
5019 E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Add the extension penalty to get E for this row */
5020 E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5021 #else
5022 /* Try to avoid unnecessary shifts by 128, but overflows */
5023 E_c_gap = _mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128));
5024 E_c_gap = _mm_add_epi8(E_c_gap, gap_extend); /* Add the extension penalty to get E for this row */
5025 E_c_gap = _mm_sub_epi8(_mm_min_epu8(E_c_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5026 #endif
5027 debug15(print_vector_8(E_c_gap,clo,r,"E"));
5028
5029
5030 /* NOGAP */
/* Shift H of row r-1 up by one lane and insert X_prev_nogap in lane 0, so
   each lane now holds its diagonal predecessor H[r-1][c-1].  X_prev_nogap
   is reassigned at the top of every row iteration. */
5031 T1 = _mm_srli_si128(H_nogap_c,LAST_CHAR_NONAVX2);
5032 H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_CHAR);
5033 H_nogap_c = _mm_or_si128(H_nogap_c, X_prev_nogap);
5034 X_prev_nogap = T1;
5035
5036 /* Add pairscores. No alternate chars for query sequence */
5037 #ifdef HAVE_SSE4_1
5038 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
5039 debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5040 H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_std);
5041 #else
5042 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo])); /* Has 128 added already */
5043 pairscores_best = _mm_sub_epi8(pairscores_std, all_128);
5044 debug15(print_vector_8(pairscores_best,clo,r,"pairscores_std"));
5045 H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_best);
5046 #endif
5047 _mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
5048 debug15(print_vector_8(H_nogap_c,clo,r,"H"));
5049
5050 dir_vert = _mm_cmplt_epi8(E_c_gap,H_nogap_c); /* E < H */
5051 dir_vert = _mm_andnot_si128(dir_vert,complement_dummy); /* E >= H, for jump late */
5052 _mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
5053 debug15(print_vector_8(dir_vert,clo,r,"dir_nogap"));
5054
5055
5056 #ifdef HAVE_SSE4_1
5057 H_nogap_c = _mm_max_epi8(H_nogap_c, E_c_gap); /* Compare H + pairscores with vert + extend */
5058 #else
5059 /* Compare H + pairscores with vert + extend */
5060 H_nogap_c = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_c, all_128), _mm_add_epi8(E_c_gap, all_128)), all_128);
5061 #endif
5062 debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c store"));
/* Store H for columns clo.. of row r; the vector is reused as the
   previous-row H in the next iteration */
5063 _mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
5064
5065
5066 /* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
5067 if (chigh >= r) {
5068 (*directions_Egap)[r][r] = DIAG;
5069 (*directions_nogap)[r][r] = DIAG;
5070 }
5071
5072 /* No need for F loop here */
5073 E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
5074 }
5075 }
5076
5077 } else {
5078 /* jump early */
/* Same structure as the jump-late branch; only the E_c_gap seed and the
   strict E > H comparisons differ */
5079 for (clo = 0; clo <= glength; clo += SIMD_NCHARS_NONAVX2) {
5080 if ((chigh = clo + SIMD_NCHARS_NONAVX2 - 1) > glength) {
5081 chigh = glength;
5082 }
5083
5084 /* dir_vert tests if E > H. To fill in first row of each
5085 column block with non-diags, make E > H. */
5086 E_mask = _mm_set1_epi8(1);
5087
5088 E_c_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
5089 H_nogap_c = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
5090
5091 for (r = clo; r <= chigh + lband && r <= rlength; r++) {
5092 score_column = matrix[r];
5093
5094 if (r == 0) {
5095 na1 = 4; /* 'N' */
5096 } else {
5097 na1 = revp ? nt_to_int_array[rsequence[1-r]] : nt_to_int_array[rsequence[r-1]];
5098 }
5099 pairscores_ptr = pairscores[na1];
5100
/* Lane-0 seed for the diagonally shifted H vector */
5101 if (r == 0) {
5102 X_prev_nogap = _mm_set1_epi8(0);
5103 } else if (clo == 0) {
5104 #ifdef ZERO_INITIAL_GAP_PENALTY
5105 X_prev_nogap = _mm_set1_epi8(0);
5106 #else
5107 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
5108 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
5109 #endif
5110 } else {
5111 /* second or later column block */
5112 X_prev_nogap = _mm_set1_epi8(matrix[r-1][clo-1]); /* H at row r-1, column clo-1 (last column of the previous block) */
5113 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
5114 }
5115
5116 debug15(print_vector_8(E_mask,clo,r,"E_mask"));
/* Clamp E to E_mask + E_infinity (wrapping add) */
5117 #ifdef HAVE_SSE4_1
5118 E_c_gap = _mm_min_epi8(E_c_gap,_mm_add_epi8(E_mask,E_infinity));
5119 #else
5120 E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5121 #endif
5122 debug15(print_vector_8(E_c_gap,clo,r,"E_c_gap"));
5123 debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c load"));
5124
5125 /* EGAP */
/* T1 = H[r-1][c] + open (saturating) */
5126 T1 = _mm_adds_epi8(H_nogap_c, gap_open);
5127 dir_vert = _mm_cmpgt_epi8(E_c_gap,T1); /* E > H, for jump early */
5128 _mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
5129 debug15(print_vector_8(T1,clo,r,"T1"));
5130 debug15(print_vector_8(dir_vert,clo,r,"dir_Egap"));
5131
5132 /* Compare H + open with vert */
5133 #ifdef HAVE_SSE4_1
5134 E_c_gap = _mm_max_epi8(E_c_gap, T1); /* Compare H + open with vert */
5135 E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Add the extension penalty to get E for this row */
5136 E_c_gap = _mm_min_epi8(E_c_gap,_mm_add_epi8(E_mask,E_infinity));
5137 #elif 1
5138 E_c_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
5139 E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Add the extension penalty to get E for this row */
5140 E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5141 #else
5142 /* Try to avoid unnecessary shifts by 128, but overflows */
5143 E_c_gap = _mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128));
5144 E_c_gap = _mm_add_epi8(E_c_gap, gap_extend); /* Add the extension penalty to get E for this row */
5145 E_c_gap = _mm_sub_epi8(_mm_min_epu8(E_c_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5146 #endif
5147 debug15(print_vector_8(E_c_gap,clo,r,"E"));
5148
5149
5150 /* NOGAP */
/* Shift H of row r-1 up by one lane and insert the lane-0 seed, giving
   each lane its diagonal predecessor */
5151 T1 = _mm_srli_si128(H_nogap_c,LAST_CHAR_NONAVX2);
5152 H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_CHAR);
5153 H_nogap_c = _mm_or_si128(H_nogap_c, X_prev_nogap);
5154 X_prev_nogap = T1;
5155
5156 /* Add pairscores. No alternate chars for query sequence */
5157 #ifdef HAVE_SSE4_1
5158 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
5159 debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5160 H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_std);
5161 #else
5162 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo])); /* Has 128 added already */
5163 pairscores_best = _mm_sub_epi8(pairscores_std, all_128);
5164 debug15(print_vector_8(pairscores_best,clo,r,"pairscores_std"));
5165 H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_best);
5166 #endif
5167 _mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
5168 debug15(print_vector_8(H_nogap_c,clo,r,"H"));
5169
5170 dir_vert = _mm_cmpgt_epi8(E_c_gap,H_nogap_c); /* E > H, for jump early */
5171 _mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
5172 debug15(print_vector_8(dir_vert,clo,r,"dir_nogap"));
5173
5174
5175 #ifdef HAVE_SSE4_1
5176 H_nogap_c = _mm_max_epi8(H_nogap_c, E_c_gap); /* Compare H + pairscores with vert + extend */
5177 #else
5178 /* Compare H + pairscores with vert + extend */
5179 H_nogap_c = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_c, all_128), _mm_add_epi8(E_c_gap, all_128)), all_128);
5180 #endif
5181 debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c store"));
5182 _mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
5183
5184 /* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
5185 if (chigh >= r) {
5186 (*directions_Egap)[r][r] = DIAG;
5187 (*directions_nogap)[r][r] = DIAG;
5188 }
5189
5190 /* No need for F loop here */
5191 E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
5192 }
5193 }
5194 }
5195
5196 #ifdef CHECK1
5197 /* Row 0 and column 0 directions fail anyway due to saturation */
5198 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
5199 (*directions_Egap)[1][0] = VERT;
5200 #endif
5201
5202 #ifdef DEBUG2
5203 printf("SIMD: Dynprog_simd_8_lower\n");
5204 Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
5205 revp,lband,/*upperp*/false);
5206 Directions8_print_ud(*directions_nogap,*directions_Egap,
5207 rlength,glength,rsequence,gsequence,gsequence_alt,
5208 revp,lband,/*upperp*/false);
5209 #endif
5210
5211 #ifdef CHECK1
5212 /* Check for column 0 directions */
5213 for (r = 1; r <= lband && r <= rlength; r++) {
5214 assert((*directions_Egap)[r][0] != DIAG);
5215 assert((*directions_nogap)[r][0] != DIAG);
5216 }
5217 #endif
5218
/* Release the temporary per-nucleotide tables; matrix and the direction
   arrays are not freed here and remain available to the caller */
5219 _mm_free(pairscores[4]);
5220 _mm_free(pairscores[3]);
5221 _mm_free(pairscores[2]);
5222 _mm_free(pairscores[1]);
5223 _mm_free(pairscores[0]);
5224
5225 return matrix;
5226 }
5227 #endif
5228
5229
5230 #ifdef HAVE_SSE2
5231 /* Designed for computation below the main diagonal, so no F loop or bottom masking needed */
5232 /* Operates by rows */
5233 Score8_T **
Dynprog_simd_8_lower(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,bool jump_late_p,bool revp)5234 Dynprog_simd_8_lower (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
5235 T this, char *rsequence, char *gsequence, char *gsequence_alt,
5236 int rlength, int glength,
5237 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
5238 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
5239 #endif
5240 Mismatchtype_T mismatchtype, int open, int extend,
5241 int lband, bool jump_late_p, bool revp) {
5242 Score8_T **matrix, *score_column;
5243 #ifdef HAVE_AVX2
5244 __m256i E_infinity;
5245 #elif defined(HAVE_SSE4_1)
5246 __m128i E_infinity;
5247 #else
5248 __m128i pairscores_best, all_128, E_infinity_plus_128;
5249 #endif
5250 #ifdef HAVE_AVX2
5251 __m256i pairscores_std;
5252 __m256i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, T1;
5253 __m256i gap_open, gap_extend, complement_dummy;
5254 __m256i dir_vert;
5255 Score8_T save;
5256 #else
5257 __m128i pairscores_std;
5258 __m128i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, T1;
5259 __m128i gap_open, gap_extend, complement_dummy;
5260 __m128i dir_vert;
5261 #endif
5262 int glength_ceil, r, c;
5263 int clo, chigh;
5264 int na1, na2, na2_alt;
5265 Score8_T *pairscores[5], *pairscores_ptr;
5266 Pairdistance_T **pairdistance_array_type, score1, score2;
5267
5268 #ifdef DEBUG_AVX2
5269 Score8_T **matrix_std;
5270 Direction8_T **directions_nogap_std, **directions_Egap_std;
5271 char na2_single;
5272 #elif defined(DEBUG_SIMD)
5273 Score32_T **matrix_std;
5274 Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
5275 char na2_single;
5276 #endif
5277
5278
5279 debug2(printf("Dynprog_simd_8_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
5280 debug15(printf("Dynprog_simd_8_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
5281
5282 glength_ceil = (int) ((glength + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
5283
5284 #ifdef HAVE_SSE4_1
5285 pairdistance_array_type = pairdistance_array[mismatchtype];
5286 #else
5287 /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
5288 pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
5289 all_128 = _mm_set1_epi8(128);
5290 #endif
5291
5292 debug(printf("compute_scores_simd_8_byrows (lower): "));
5293 debug(printf("Lengths are %d and %d, so band is %d on left\n",rlength,glength,lband));
5294 debug(printf("Genome length rounded up to %d\n",glength_ceil));
5295
5296 matrix = aligned_score8_alloc(glength_ceil,rlength,
5297 this->aligned.two.lower_matrix_ptrs,this->aligned.two.lower_matrix_space);
5298 *directions_nogap = aligned_directions8_alloc(glength_ceil,rlength,
5299 this->aligned.two.lower_directions_ptrs_0,this->aligned.two.lower_directions_space_0);
5300 *directions_Egap = aligned_directions8_alloc(glength_ceil,rlength,
5301 this->aligned.two.lower_directions_ptrs_1,this->aligned.two.lower_directions_space_1);
5302
5303 #if 0
5304 /* Column 0 initialization */
5305 /* penalty = open; */
5306 for (r = 1; r <= lband && r <= rlength; r++) {
5307 /* penalty += extend; */
5308 (*directions_Egap)[r][0] = VERT;
5309 (*directions_nogap)[r][0] = VERT;
5310 }
5311 #endif
5312 #if 0
5313 /* Already initialized to DIAG. Actually no longer initializing directions_Egap */
5314 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
5315 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
5316 #endif
5317 #if 0
5318 /* Row 0 initialization */
5319 /* penalty = open; */
5320 for (c = 1; c <= SIMD_NCHARS && c <= glength; c++) {
5321 /* penalty += extend; */
5322 (*directions_nogap)[0][c] = HORIZ;
5323 }
5324 #endif
5325
5326
5327 /* Load pairscores. Store match - mismatch */
5328 pairscores[0] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5329 pairscores[1] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5330 pairscores[2] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5331 pairscores[3] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5332 pairscores[4] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5333
5334 #if 0
5335 /* Should not be necessary */
5336 memset((void *) pairscores[0],0,glength_ceil*sizeof(Score8_T));
5337 memset((void *) pairscores[1],0,glength_ceil*sizeof(Score8_T));
5338 memset((void *) pairscores[2],0,glength_ceil*sizeof(Score8_T));
5339 memset((void *) pairscores[3],0,glength_ceil*sizeof(Score8_T));
5340 memset((void *) pairscores[4],0,glength_ceil*sizeof(Score8_T));
5341 #endif
5342
5343 /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
5344 c = 0; na2 = na2_alt = 4; /* 'N' */
5345 #ifdef HAVE_SSE4_1
5346 pairscores[0][c] = (Score8_T) pairdistance_array_type[(int) 'A'][na2];
5347 pairscores[1][c] = (Score8_T) pairdistance_array_type[(int) 'C'][na2];
5348 pairscores[2][c] = (Score8_T) pairdistance_array_type[(int) 'G'][na2];
5349 pairscores[3][c] = (Score8_T) pairdistance_array_type[(int) 'T'][na2];
5350 pairscores[4][c] = (Score8_T) pairdistance_array_type[(int) 'N'][na2];
5351 #else
5352 pairscores[0][c] = (Score8_T) pairdistance_array_type[(int) 'A'][na2] - 128;
5353 pairscores[1][c] = (Score8_T) pairdistance_array_type[(int) 'C'][na2] - 128;
5354 pairscores[2][c] = (Score8_T) pairdistance_array_type[(int) 'G'][na2] - 128;
5355 pairscores[3][c] = (Score8_T) pairdistance_array_type[(int) 'T'][na2] - 128;
5356 pairscores[4][c] = (Score8_T) pairdistance_array_type[(int) 'N'][na2] - 128;
5357 #endif
5358
5359 if (revp == false) {
5360 for (c = 1; c <= glength; c++) {
5361 na2 = gsequence[c-1];
5362 na2_alt = gsequence_alt[c-1];
5363 /* Take max here */
5364 score1 = pairdistance_array_type[(int) 'A'][na2];
5365 score2 = pairdistance_array_type[(int) 'A'][na2_alt];
5366 pairscores[0][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5367
5368 score1 = pairdistance_array_type[(int) 'C'][na2];
5369 score2 = pairdistance_array_type[(int) 'C'][na2_alt];
5370 pairscores[1][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5371
5372 score1 = pairdistance_array_type[(int) 'G'][na2];
5373 score2 = pairdistance_array_type[(int) 'G'][na2_alt];
5374 pairscores[2][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5375
5376 score1 = pairdistance_array_type[(int) 'T'][na2];
5377 score2 = pairdistance_array_type[(int) 'T'][na2_alt];
5378 pairscores[3][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5379
5380 score1 = pairdistance_array_type[(int) 'N'][na2];
5381 score2 = pairdistance_array_type[(int) 'N'][na2_alt];
5382 pairscores[4][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5383 }
5384 } else {
5385 for (c = 1; c <= glength; c++) {
5386 na2 = gsequence[1-c];
5387 na2_alt = gsequence_alt[1-c];
5388 /* Take max here */
5389 score1 = pairdistance_array_type[(int) 'A'][na2];
5390 score2 = pairdistance_array_type[(int) 'A'][na2_alt];
5391 pairscores[0][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5392
5393 score1 = pairdistance_array_type[(int) 'C'][na2];
5394 score2 = pairdistance_array_type[(int) 'C'][na2_alt];
5395 pairscores[1][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5396
5397 score1 = pairdistance_array_type[(int) 'G'][na2];
5398 score2 = pairdistance_array_type[(int) 'G'][na2_alt];
5399 pairscores[2][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5400
5401 score1 = pairdistance_array_type[(int) 'T'][na2];
5402 score2 = pairdistance_array_type[(int) 'T'][na2_alt];
5403 pairscores[3][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5404
5405 score1 = pairdistance_array_type[(int) 'N'][na2];
5406 score2 = pairdistance_array_type[(int) 'N'][na2_alt];
5407 pairscores[4][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5408 }
5409 }
5410
5411 #if 0
5412 /* Should not be necessary */
5413 memset((void *) &(pairscores[0][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5414 memset((void *) &(pairscores[1][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5415 memset((void *) &(pairscores[2][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5416 memset((void *) &(pairscores[3][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5417 memset((void *) &(pairscores[4][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5418 #endif
5419
5420 complement_dummy = _MM_SET1_EPI8(-1);
5421
5422 gap_open = _MM_SET1_EPI8((Score8_T) open);
5423 gap_extend = _MM_SET1_EPI8((Score8_T) extend);
5424
5425 #ifdef HAVE_SSE4_1
5426 E_infinity = _MM_SET1_EPI8(POS_INFINITY_8);
5427 #else
5428 E_infinity_plus_128 = _mm_set1_epi8(POS_INFINITY_8+128);
5429 #endif
5430 if (jump_late_p) {
5431 for (clo = 0; clo <= glength; clo += SIMD_NCHARS) {
5432 if ((chigh = clo + SIMD_NCHARS - 1) > glength) {
5433 chigh = glength;
5434 }
5435
5436 /* dir_vert tests if E >= H. To fill in first row of each
5437 column block with non-diags, make E == H. */
5438 E_mask = _MM_SET1_EPI8(1);
5439
5440 /* Holds for all INITIAL_GAP_PENALTY */
5441 E_c_gap = _MM_SET1_EPI8(NEG_INFINITY_8);
5442 H_nogap_c = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
5443
5444 for (r = clo; r <= chigh + lband && r <= rlength; r++) {
5445 score_column = matrix[r];
5446
5447 if (r == 0) {
5448 na1 = 4; /* 'N' */
5449 } else {
5450 na1 = revp ? nt_to_int_array[(int) rsequence[1-r]] : nt_to_int_array[(int) rsequence[r-1]];
5451 }
5452 pairscores_ptr = pairscores[na1];
5453
5454 if (r == 0) {
5455 X_prev_nogap = _MM_SETZERO_SI();
5456 } else if (clo == 0) {
5457 #ifdef ZERO_INITIAL_GAP_PENALTY
5458 X_prev_nogap = _MM_SETZERO_SI();
5459 #elif defined(HAVE_AVX2)
5460 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
5461 #else
5462 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
5463 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
5464 #endif
5465 } else {
5466 /* second or greater block of 8 */
5467 #ifdef ZERO_INITIAL_GAP_PENALTY
5468 X_prev_nogap = _MM_SETZERO_SI();
5469 #elif defined(HAVE_AVX2)
5470 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[r-1][clo-1],LAST_CHAR_INSERT);
5471 #else
5472 X_prev_nogap = _mm_set1_epi8(matrix[r-1][clo-1]); /* get H from previous block and previous column */
5473 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
5474 #endif
5475 }
5476
5477 debug15(print_vector_8(E_mask,clo,r,"E_mask"));
5478 #ifdef HAVE_SSE4_1
5479 E_c_gap = _MM_MIN_EPI8(E_c_gap,_MM_ADD_EPI8(E_mask,E_infinity));
5480 #else
5481 E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5482 #endif
5483 debug15(print_vector_8(E_c_gap,clo,r,"E_c_gap"));
5484 debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c load"));
5485
5486 /* EGAP */
5487 T1 = _MM_ADDS_EPI8(H_nogap_c, gap_open);
5488 dir_vert = _MM_CMPLT_EPI8(E_c_gap,T1); /* E < H */
5489 dir_vert = _MM_ANDNOT_SI(dir_vert,complement_dummy); /* E >= H, for jump late */
5490 #ifdef HAVE_AVX2
5491 _mm256_store_si256((__m256i *) &((*directions_Egap)[r][clo]),dir_vert);
5492 #else
5493 _mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
5494 #endif
5495 debug15(print_vector_8(T1,clo,r,"T1"));
5496 debug15(print_vector_8(dir_vert,clo,r,"dir_Egap"));
5497
5498 #ifdef HAVE_SSE4_1
5499 E_c_gap = _MM_MAX_EPI8(E_c_gap, T1); /* Compare H + open with horiz */
5500 E_c_gap = _MM_ADDS_EPI8(E_c_gap, gap_extend); /* Compute scores for Egap (horiz + open) */
5501 E_c_gap = _MM_MIN_EPI8(E_c_gap,_MM_ADD_EPI8(E_mask,E_infinity));
5502 #elif 1
5503 E_c_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
5504 E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (horiz + open) */
5505 E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5506 #else
5507 /* Try to avoid unnecessary shifts by 128, but overflows */
5508 E_c_gap = _mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128));
5509 E_c_gap = _mm_add_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (horiz + open) */
5510 E_c_gap = _mm_sub_epi8(_mm_min_epu8(E_c_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5511 #endif
5512 debug15(print_vector_8(E_c_gap,clo,r,"E"));
5513
5514
5515 /* NOGAP */
5516 #ifdef HAVE_AVX2
5517 T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_c,SIMD_NCHARS-1),LAST_CHAR_INSERT);
5518 X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_c,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
5519 H_nogap_c = _mm256_slli_si256(H_nogap_c,ONE_CHAR);
5520 #else
5521 T1 = _mm_srli_si128(H_nogap_c,LAST_CHAR_SHIFT);
5522 H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_CHAR);
5523 #endif
5524 H_nogap_c = _MM_OR_SI(H_nogap_c, X_prev_nogap);
5525 X_prev_nogap = T1;
5526
5527 /* Add pairscores. No alternate chars for query sequence */
5528 #ifdef HAVE_AVX2
5529 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_ptr[clo]));
5530 debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5531 H_nogap_c = _MM_ADDS_EPI8(H_nogap_c, pairscores_std);
5532 #elif defined(HAVE_SSE4_1)
5533 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
5534 debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5535 H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_std);
5536 #else
5537 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo])); /* Has 128 added already */
5538 pairscores_best = _mm_sub_epi8(pairscores_std, all_128);
5539 debug15(print_vector_8(pairscores_best,clo,r,"pairscores_std"));
5540 H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_best);
5541 #endif
5542 _mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
5543 debug15(print_vector_8(H_nogap_c,clo,r,"H"));
5544
5545 dir_vert = _MM_CMPLT_EPI8(E_c_gap,H_nogap_c); /* E < H */
5546 dir_vert = _MM_ANDNOT_SI(dir_vert,complement_dummy); /* E >= H, for jump late */
5547 #ifdef HAVE_AVX2
5548 _mm256_store_si256((__m256i *) &((*directions_nogap)[r][clo]),dir_vert);
5549 #else
5550 _mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
5551 #endif
5552 debug15(print_vector_8(dir_vert,clo,r,"dir_nogap"));
5553
5554
5555 #ifdef HAVE_SSE4_1
5556 H_nogap_c = _MM_MAX_EPI8(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
5557 #else
5558 /* Compare H + pairscores with horiz + extend */
5559 H_nogap_c = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_c, all_128), _mm_add_epi8(E_c_gap, all_128)), all_128);
5560 #endif
5561 debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c store"));
5562 #ifdef HAVE_AVX2
5563 _mm256_store_si256((__m256i *) &(score_column[clo]), H_nogap_c);
5564 #else
5565 _mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
5566 #endif
5567
5568
5569 /* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
5570 if (chigh >= r) {
5571 (*directions_Egap)[r][r] = DIAG;
5572 (*directions_nogap)[r][r] = DIAG;
5573 }
5574
5575 /* No need for F loop here */
5576 #ifdef HAVE_AVX2
5577 save = _mm256_extract_epi8(E_mask,15);
5578 E_mask = _mm256_slli_si256(E_mask,ONE_CHAR);
5579 E_mask = _mm256_insert_epi8(E_mask,save,16);
5580 #else
5581 E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
5582 #endif
5583 }
5584 }
5585
5586 } else {
5587 /* jump early */
5588 for (clo = 0; clo <= glength; clo += SIMD_NCHARS) {
5589 if ((chigh = clo + SIMD_NCHARS - 1) > glength) {
5590 chigh = glength;
5591 }
5592
5593 /* dir_vert tests if E > H. To fill in first row of each
5594 column block with non-diags, make E > H. */
5595 E_mask = _MM_SET1_EPI8(1);
5596
5597 /* Holds for all INITIAL_GAP_PENALTY */
5598 E_c_gap = _MM_SET1_EPI8(NEG_INFINITY_8+1);
5599 H_nogap_c = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
5600
5601 for (r = clo; r <= chigh + lband && r <= rlength; r++) {
5602 score_column = matrix[r];
5603
5604 if (r == 0) {
5605 na1 = 4; /* 'N' */
5606 } else {
5607 na1 = revp ? nt_to_int_array[(int) rsequence[1-r]] : nt_to_int_array[(int) rsequence[r-1]];
5608 }
5609 pairscores_ptr = pairscores[na1];
5610
5611 if (r == 0) {
5612 X_prev_nogap = _MM_SETZERO_SI();
5613 } else if (clo == 0) {
5614 #ifdef ZERO_INITIAL_GAP_PENALTY
5615 X_prev_nogap = _MM_SETZERO_SI();
5616 #elif defined(HAVE_AVX2)
5617 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
5618 #else
5619 X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
5620 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
5621 #endif
5622 } else {
5623 /* second or greater block of 8 */
5624 #ifdef ZERO_INITIAL_GAP_PENALTY
5625 X_prev_nogap = _MM_SETZERO_SI();
5626 #elif defined(HAVE_AVX2)
5627 X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[r-1][clo-1],LAST_CHAR_INSERT);
5628 #else
5629 X_prev_nogap = _mm_set1_epi8(matrix[r-1][clo-1]); /* get H from previous block and previous column */
5630 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
5631 #endif
5632 }
5633
5634 debug15(print_vector_8(E_mask,clo,r,"E_mask"));
5635 #ifdef HAVE_SSE4_1
5636 E_c_gap = _MM_MIN_EPI8(E_c_gap,_MM_ADD_EPI8(E_mask,E_infinity));
5637 #else
5638 E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5639 #endif
5640 debug15(print_vector_8(E_c_gap,clo,r,"E_c_gap"));
5641 debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c load"));
5642
5643 /* EGAP */
5644 T1 = _MM_ADDS_EPI8(H_nogap_c, gap_open);
5645 dir_vert = _MM_CMPGT_EPI8(E_c_gap,T1); /* E > H, for jump early */
5646 #ifdef HAVE_AVX2
5647 _mm256_store_si256((__m256i *) &((*directions_Egap)[r][clo]),dir_vert);
5648 #else
5649 _mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
5650 #endif
5651 debug15(print_vector_8(T1,clo,r,"T1"));
5652 debug15(print_vector_8(dir_vert,clo,r,"dir_Egap"));
5653
5654 /* Compare H + open with vert */
5655 #ifdef HAVE_SSE4_1
5656 E_c_gap = _MM_MAX_EPI8(E_c_gap, T1); /* Compare H + open with vert */
5657 E_c_gap = _MM_ADDS_EPI8(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
5658 E_c_gap = _MM_MIN_EPI8(E_c_gap,_MM_ADD_EPI8(E_mask,E_infinity));
5659 #elif 1
5660 E_c_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
5661 E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
5662 E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5663 #else
5664 /* Try to avoid unnecessary shifts by 128, but overflows */
5665 E_c_gap = _mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128));
5666 E_c_gap = _mm_add_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
5667 E_c_gap = _mm_sub_epi8(_mm_min_epu8(E_c_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5668 #endif
5669 debug15(print_vector_8(E_c_gap,clo,r,"E"));
5670
5671
5672 /* NOGAP */
5673 #ifdef HAVE_AVX2
5674 T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_c,SIMD_NCHARS-1),LAST_CHAR_INSERT);
5675 X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_c,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
5676 H_nogap_c = _mm256_slli_si256(H_nogap_c,ONE_CHAR);
5677 #else
5678 T1 = _mm_srli_si128(H_nogap_c,LAST_CHAR_SHIFT);
5679 H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_CHAR);
5680 #endif
5681 H_nogap_c = _MM_OR_SI(H_nogap_c, X_prev_nogap);
5682 X_prev_nogap = T1;
5683
5684 /* Add pairscores. No alternate chars for query sequence */
5685 #ifdef HAVE_AVX2
5686 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_ptr[clo]));
5687 debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5688 H_nogap_c = _MM_ADDS_EPI8(H_nogap_c, pairscores_std);
5689 #elif defined(HAVE_SSE4_1)
5690 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
5691 debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5692 H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_std);
5693 #else
5694 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo])); /* Has 128 added already */
5695 pairscores_best = _mm_sub_epi8(pairscores_std, all_128);
5696 debug15(print_vector_8(pairscores_best,clo,r,"pairscores_std"));
5697 H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_best);
5698 #endif
5699 _mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
5700 debug15(print_vector_8(H_nogap_c,clo,r,"H"));
5701
5702 dir_vert = _MM_CMPGT_EPI8(E_c_gap,H_nogap_c); /* E > H, for jump early */
5703 #ifdef HAVE_AVX2
5704 _mm256_store_si256((__m256i *) &((*directions_nogap)[r][clo]),dir_vert);
5705 #else
5706 _mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
5707 #endif
5708 debug15(print_vector_8(dir_vert,clo,r,"dir_nogap"));
5709
5710
5711 #ifdef HAVE_SSE4_1
5712 H_nogap_c = _MM_MAX_EPI8(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
5713 #else
5714 /* Compare H + pairscores with horiz + extend */
5715 H_nogap_c = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_c, all_128), _mm_add_epi8(E_c_gap, all_128)), all_128);
5716 #endif
5717 debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c store"));
5718 #ifdef HAVE_AVX2
5719 _mm256_store_si256((__m256i *) &(score_column[clo]), H_nogap_c);
5720 #else
5721 _mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
5722 #endif
5723
5724 /* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
5725 if (chigh >= r) {
5726 (*directions_Egap)[r][r] = DIAG;
5727 (*directions_nogap)[r][r] = DIAG;
5728 }
5729
5730 /* No need for F loop here */
5731 #ifdef HAVE_AVX2
5732 save = _mm256_extract_epi8(E_mask,15);
5733 E_mask = _mm256_slli_si256(E_mask,ONE_CHAR);
5734 E_mask = _mm256_insert_epi8(E_mask,save,16);
5735 #else
5736 E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
5737 #endif
5738 }
5739 }
5740 }
5741
5742 #ifdef CHECK1
5743 /* Row 0 and column 0 directions fail anyway due to saturation */
5744 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
5745 (*directions_Egap)[1][0] = VERT;
5746 #endif
5747
5748 #ifdef DEBUG2
5749 printf("SIMD: Dynprog_simd_8_lower\n");
5750 Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
5751 revp,lband,/*upperp*/false);
5752 Directions8_print_ud(*directions_nogap,*directions_Egap,
5753 rlength,glength,rsequence,gsequence,gsequence_alt,
5754 revp,lband,/*upperp*/false);
5755 #endif
5756
5757 #ifdef CHECK1
5758 /* Check for column 0 directions */
5759 for (r = 1; r <= lband && r <= rlength; r++) {
5760 assert((*directions_Egap)[r][0] != DIAG);
5761 assert((*directions_nogap)[r][0] != DIAG);
5762 }
5763 #endif
5764
5765 #ifdef DEBUG_AVX2
5766 matrix_std = Dynprog_simd_8_lower_nonavx2(&directions_nogap_std,&directions_Egap_std,
5767 this,rsequence,gsequence,gsequence_alt,
5768 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
5769 open,extend,lband,jump_late_p,revp);
5770 #elif defined(DEBUG_SIMD)
5771 matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
5772 this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
5773 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
5774 open,extend,lband,/*uband*/0,jump_late_p,revp,/*saturation*/NEG_INFINITY_8,
5775 /*upperp*/false,/*lowerp*/true);
5776 #endif
5777
5778 #ifdef DEBUG2
5779 printf("Banded %s\n",revp ? "rev" : "fwd");
5780 Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
5781 revp,lband,/*upperp*/false);
5782 Directions8_print_ud(*directions_nogap,*directions_Egap,
5783 rlength,glength,rsequence,gsequence,gsequence_alt,
5784 revp,lband,/*upperp*/false);
5785 #endif
5786
5787 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
5788 banded_matrix8_compare_lower(matrix,matrix_std,rlength,glength,lband,
5789 rsequence,gsequence,gsequence_alt,
5790 goffset,chroffset,chrhigh,watsonp,revp);
5791
5792 banded_directions8_compare_nogap_lower(matrix,*directions_nogap,directions_nogap_std,rlength,glength,lband);
5793
5794 banded_directions8_compare_Egap_lower(matrix,*directions_Egap,directions_Egap_std,rlength,glength,lband);
5795 #endif
5796
5797 _mm_free(pairscores[4]);
5798 _mm_free(pairscores[3]);
5799 _mm_free(pairscores[2]);
5800 _mm_free(pairscores[1]);
5801 _mm_free(pairscores[0]);
5802
5803 return matrix;
5804 }
5805 #endif
5806
5807
5808 #ifdef DEBUG_AVX2
5809 Score16_T **
Dynprog_simd_16_nonavx2(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,Direction16_T *** directions_Fgap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,int uband,bool jump_late_p,bool revp)5810 Dynprog_simd_16_nonavx2 (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
5811 Direction16_T ***directions_Fgap,
5812 T this, char *rsequence, char *gsequence, char *gsequence_alt,
5813 int rlength, int glength,
5814 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
5815 Mismatchtype_T mismatchtype, int open, int extend,
5816 int lband, int uband, bool jump_late_p, bool revp) {
5817 int c_gap, last_nogap, score, *FF; /* Need to have the ability to go past NEG_INFINITY */
5818 Score16_T **matrix, *score_column;
5819 __m128i pairscores_std, pairscores_alt;
5820 __m128i H_nogap_r, X_prev_nogap, E_r_gap, T1;
5821 __m128i gap_open, gap_extend, extend_ladder, extend_chunk, complement_dummy;
5822 __m128i dir_horiz;
5823 __m128i ramp, ramp_chunk, lband_vector, filter, ones;
5824 int rlength_ceil, lband_ceil, r, c;
5825 int rlo, rhigh, rlo_calc, rhigh_calc;
5826 int na1, na2, na2_alt;
5827 Score16_T *pairscores_col0;
5828 Score16_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr, pairscore, pairscore0;
5829 Pairdistance_T **pairdistance_array_type;
5830
5831
5832 debug2(printf("Dynprog_simd_16. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
5833 debug15(printf("Dynprog_simd_16. jump_late_p %d, open %d, extend %d, lband %d, uband %d\n",
5834 jump_late_p,open,extend,lband,uband));
5835
5836 rlength_ceil = (int) ((rlength + SIMD_NSHORTS_NONAVX2)/SIMD_NSHORTS_NONAVX2) * SIMD_NSHORTS_NONAVX2;
5837 pairdistance_array_type = pairdistance_array[mismatchtype];
5838
5839 debug(printf("compute_scores_simd_16_bycols (upper): "));
5840 debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
5841 debug(printf("Query length rounded up to %d\n",rlength_ceil));
5842
5843 matrix = aligned_score16_alloc(rlength_ceil,glength,
5844 this->aligned_std.one.matrix_ptrs,this->aligned_std.one.matrix_space);
5845 *directions_nogap = aligned_directions16_alloc(rlength_ceil,glength,
5846 this->aligned_std.one.directions_ptrs_0,this->aligned_std.one.directions_space_0);
5847 *directions_Egap = aligned_directions16_alloc(rlength_ceil,glength,
5848 this->aligned_std.one.directions_ptrs_1,this->aligned_std.one.directions_space_1);
5849 /* Need to calloc to save time in F loop */
5850 *directions_Fgap = aligned_directions16_calloc(rlength_ceil,glength,
5851 this->aligned_std.one.directions_ptrs_2,this->aligned_std.one.directions_space_2);
5852
5853 #if 0
5854 /* Row 0 initialization */
5855 /* penalty = open; */
5856 for (c = 1; c <= uband && c <= glength; c++) {
5857 /* penalty += extend; */
5858 (*directions_Egap)[c][0] = HORIZ;
5859 (*directions_nogap)[c][0] = HORIZ;
5860 }
5861 #endif
5862 #if 0
5863 /* Already initialized to DIAG. Actually, no longer initializing directions_Egap */
5864 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
5865 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
5866 #endif
5867 #if 0
5868 /* Column 0 initialization */
5869 /* penalty = open; */
5870 for (r = 1; r <= SIMD_NSHORTS_NONAVX2 && r <= rlength; r++) {
5871 /* penalty += extend; */
5872 (*directions_nogap)[0][r] = VERT;
5873 }
5874 #endif
5875
5876
5877 /* Load pairscores. Store match - mismatch */
5878 pairscores[0] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5879 pairscores[1] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5880 pairscores[2] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5881 pairscores[3] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5882 pairscores[4] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5883
5884 lband_ceil = (int) ((lband + SIMD_NSHORTS_NONAVX2)/SIMD_NSHORTS_NONAVX2) * SIMD_NSHORTS_NONAVX2;
5885 pairscores_col0 = (Score16_T *) _mm_malloc(lband_ceil * sizeof(Score16_T),16);
5886
5887 #if 0
5888 /* Should not be necessary */
5889 memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score16_T));
5890 memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score16_T));
5891 memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score16_T));
5892 memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score16_T));
5893 memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score16_T));
5894 #endif
5895
5896
5897 pairscores_col0[0] = (Score16_T) 0;
5898 /* Initialization just to lband causes errors in dir_horiz for Egap */
5899 #ifdef ZERO_INITIAL_GAP_PENALTY
5900 for (r = 1; r < lband_ceil; r++) {
5901 pairscores_col0[r] = (Score16_T) 0;
5902 }
5903 #else
5904 for (r = 1; r < lband_ceil; r++) {
5905 pairscores_col0[r] = (Score16_T) NEG_INFINITY_16;
5906 }
5907 #endif
5908
5909 r = 0; na1 = 'N';
5910 pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
5911 pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
5912 pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
5913 pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
5914 pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
5915
5916 if (revp == false) {
5917 for (r = 1; r <= rlength; r++) {
5918 na1 = (int) rsequence[r-1];
5919 pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
5920 pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
5921 pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
5922 pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
5923 pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
5924 }
5925 } else {
5926 for (r = 1; r <= rlength; r++) {
5927 na1 = (int) rsequence[1-r];
5928 pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
5929 pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
5930 pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
5931 pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
5932 pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
5933 }
5934 }
5935
5936 #if 0
5937 /* Should not be necessary */
5938 memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5939 memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5940 memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5941 memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5942 memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5943 #endif
5944
5945 complement_dummy = _mm_set1_epi16(-1);
5946
5947 FF = (int *) MALLOCA((glength + 1) * sizeof(int));
5948
5949 gap_open = _mm_set1_epi16((Score16_T) open);
5950 gap_extend = _mm_set1_epi16((Score16_T) extend);
5951
5952
5953 #ifndef INFINITE_INITIAL_GAP_PENALTY
5954 lband_vector = _mm_set1_epi16(lband);
5955 ramp = _mm_setr_epi16(1,2,3,4,5,6,7,8);
5956 extend_ladder = _mm_setr_epi16(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend);
5957 ramp_chunk = _mm_set1_epi16(SIMD_NSHORTS_NONAVX2);
5958 extend_chunk = _mm_set1_epi16(SIMD_NSHORTS_NONAVX2*extend);
5959 #endif
5960
5961 if (jump_late_p) {
5962 for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS_NONAVX2) {
5963 if ((rhigh = rlo + SIMD_NSHORTS_NONAVX2 - 1) > rlength) {
5964 rhigh = rlength;
5965 }
5966
5967 if ((c = rlo - lband) < 0) {
5968 c = 0;
5969
5970 #if defined(ZERO_INITIAL_GAP_PENALTY)
5971 /* Initial H in column 0 determined by zeroed out H. E needs to equal gap_open for column 1. */
5972 E_r_gap = _mm_set1_epi16(NEG_INFINITY_16-open);
5973 filter = _mm_cmpgt_epi16(ramp,lband_vector);
5974 H_nogap_r = _mm_and_si128(filter,E_r_gap); /* Use zeros for score */
5975
5976 E_r_gap = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
5977 E_r_gap = _mm_adds_epi16(E_r_gap,gap_open);
5978
5979 ramp = _mm_adds_epi16(ramp,ramp_chunk); /* Prepare for next block */
5980 extend_ladder = _mm_adds_epi16(extend_ladder,extend_chunk); /* Prepare for next block */
5981 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
5982 /* dir_horiz tests if E >= H. To fill in first column of each
5983 row block with non-diags, make E == H. */
5984 E_r_gap = _mm_set1_epi16(NEG_INFINITY_16);
5985 H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
5986 #else
5987 /* Initial H in column 0 determined by E, which needs to equal
5988 gap_open + extend_ladder for column 1. H is free to be set
5989 equal to E. */
5990 H_nogap_r = _mm_set1_epi16(NEG_INFINITY_8-open); /* To compensate for T1 = H + open */
5991 filter = _mm_cmpgt_epi16(ramp,lband_vector);
5992 H_nogap_r = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),_mm_and_si128(filter,H_nogap_r));
5993 E_r_gap = _mm_adds_epi16(H_nogap_r,gap_open);
5994 ramp = _mm_adds_epi16(ramp,ramp_chunk); /* Prepare for next block */
5995 extend_ladder = _mm_adds_epi16(extend_ladder,extend_chunk); /* Prepare for next block */
5996 #endif
5997
5998 } else {
5999 E_r_gap = _mm_set1_epi16(NEG_INFINITY_16);
6000 H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6001 }
6002
6003 for ( ; c <= rhigh + uband && c <= glength; c++) {
6004 score_column = matrix[c];
6005
6006 if (c == 0) {
6007 pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
6008
6009 #ifdef ZERO_INITIAL_GAP_PENALTY
6010 X_prev_nogap = _mm_set1_epi16(0);
6011 #else
6012 if (rlo == 0) {
6013 X_prev_nogap = _mm_set1_epi16(0);
6014 } else {
6015 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6016 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6017 }
6018 #endif
6019
6020 } else {
6021 na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
6022 na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
6023 pairscores_std_ptr = pairscores[na2];
6024 pairscores_alt_ptr = pairscores[na2_alt];
6025
6026 if (rlo == 0) {
6027 #ifdef ZERO_INITIAL_GAP_PENALTY
6028 X_prev_nogap = _mm_set1_epi16(0);
6029 #else
6030 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6031 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6032 #endif
6033 } else {
6034 /* second or greater block of 16 */
6035 X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
6036 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6037 }
6038 }
6039
6040 debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
6041 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
6042
6043 /* EGAP */
6044 T1 = _mm_adds_epi16(H_nogap_r, gap_open);
6045 dir_horiz = _mm_cmplt_epi16(E_r_gap,T1); /* E < H */
6046 dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy); /* E >= H, for jump late */
6047 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
6048 debug15(print_vector_16(T1,rlo,c,"T1"));
6049 debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
6050
6051 E_r_gap = _mm_max_epi16(E_r_gap, T1); /* Compare H + open with vert */
6052 E_r_gap = _mm_adds_epi16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
6053 debug15(print_vector_16(E_r_gap,rlo,c,"E"));
6054
6055
6056 /* NOGAP */
6057 T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_NONAVX2);
6058 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
6059 H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
6060 X_prev_nogap = T1;
6061
6062 /* Add pairscores, allowing for alternate genomic nt */
6063 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
6064 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
6065 H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
6066 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
6067 debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
6068
6069 dir_horiz = _mm_cmplt_epi16(E_r_gap,H_nogap_r); /* E < H */
6070 dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy); /* E >= H, for jump late */
6071 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
6072 debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
6073
6074 H_nogap_r = _mm_max_epi16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
6075 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
6076 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
6077
6078
6079 /* F loop */
6080 if ((rlo_calc = rlo) <= c - uband) {
6081 rlo_calc = c - uband;
6082 }
6083 if ((rhigh_calc = rhigh) >= c + lband) {
6084 rhigh_calc = c + lband;
6085 if (c > 0) {
6086 /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
6087 pairscore = pairscores[na2][rhigh_calc];
6088 if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
6089 pairscore = pairscore0;
6090 }
6091 /* No need to fix for non-SSE4.1: pairscore -= 128; */
6092 if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_16) {
6093 score_column[rhigh_calc] = NEG_INFINITY_16; /* Saturation */
6094 } else if (score > POS_INFINITY_16) {
6095 score_column[rhigh_calc] = POS_INFINITY_16; /* Saturation */
6096 } else {
6097 score_column[rhigh_calc] = (Score16_T) score;
6098 }
6099 (*directions_Egap)[c][rhigh_calc] = DIAG;
6100 (*directions_nogap)[c][rhigh_calc] = DIAG;
6101 }
6102 }
6103
6104 debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
6105 rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
6106
6107 if (rlo == 0) {
6108 c_gap = NEG_INFINITY_INT;
6109 last_nogap = NEG_INFINITY_INT;
6110 } else if (c >= rlo + uband) {
6111 c_gap = NEG_INFINITY_INT;
6112 last_nogap = NEG_INFINITY_INT;
6113 } else {
6114 debug3(printf("At c %d, uband %d, reading c_gap %d\n",c,uband,FF[c]));
6115 c_gap = FF[c];
6116 last_nogap = (int) score_column[rlo_calc-1];
6117 }
6118
6119 if ((r = rlo_calc) == c - uband) {
6120 /* Handle top value as a special case to prevent going outside of uband */
6121 /* FGAP */
6122 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6123 r,c,c_gap + extend,last_nogap + open + extend));
6124 score = last_nogap + open /* + extend */;
6125 c_gap = score + extend;
6126 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6127
6128 /* NOGAP */
6129 last_nogap = (int) score_column[r];
6130 r++;
6131 }
6132
6133 /* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
6134 for ( ; r <= rhigh_calc; r++) {
6135 /* FGAP */
6136 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6137 r,c,c_gap + extend,last_nogap + open + extend));
6138 if (c_gap /* + extend */ >= (score = last_nogap + open /* + extend */)) { /* Use >= for jump late */
6139 c_gap += extend;
6140 (*directions_Fgap)[c][r] = VERT;
6141 } else {
6142 c_gap = score + extend;
6143 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6144 }
6145
6146 /* NOGAP */
6147 last_nogap = (int) score_column[r];
6148 debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
6149 if (c_gap >= last_nogap) { /* Use >= for jump late */
6150 last_nogap = c_gap;
6151 score_column[r] = (c_gap < NEG_INFINITY_16) ? NEG_INFINITY_16 : (Score16_T) c_gap; /* Saturation */
6152 (*directions_nogap)[c][r] = VERT;
6153 }
6154 }
6155
6156 FF[c] = c_gap;
6157 debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
6158 H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
6159 }
6160 }
6161
6162 } else {
6163 /* jump early */
6164 #if defined(ZERO_INITIAL_GAP_PENALTY) || defined(INFINITE_INITIAL_GAP_PENALTY)
6165 /* No need for ones */
6166 #else
6167 ones = _mm_set1_epi16(1);
6168 #endif
6169 for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS_NONAVX2) {
6170 if ((rhigh = rlo + SIMD_NSHORTS_NONAVX2 - 1) > rlength) {
6171 rhigh = rlength;
6172 }
6173
6174 if ((c = rlo - lband) < 0) {
6175 c = 0;
6176
6177 #if defined(ZERO_INITIAL_GAP_PENALTY)
6178 /* Initial H in column 0 determined by zeroed out H. E needs to equal gap_open for column 1. */
6179 E_r_gap = _mm_set1_epi16(NEG_INFINITY_16-open);
6180 filter = _mm_cmpgt_epi16(ramp,lband_vector);
6181 H_nogap_r = _mm_and_si128(filter,E_r_gap); /* Use zeros for score */
6182
6183 E_r_gap = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
6184 E_r_gap = _mm_adds_epi16(E_r_gap,gap_open);
6185
6186 ramp = _mm_adds_epi16(ramp,ramp_chunk); /* Prepare for next block */
6187 extend_ladder = _mm_adds_epi16(extend_ladder,extend_chunk); /* Prepare for next block */
6188 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
6189 /* dir_horiz tests if E > H. To fill in first column of each
6190 row block with non-diags, make E > H. */
6191 E_r_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
6192 H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6193 #else
6194 /* Initial H in column 0 determined by E, which needs to equal
6195 gap_open + extend_ladder for column 1. H is free to be set
6196 less than E. */
6197 H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open+1); /* To compensate for T1 = H + open */
6198 filter = _mm_cmpgt_epi16(ramp,lband_vector);
6199 H_nogap_r = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),_mm_and_si128(filter,H_nogap_r));
6200 E_r_gap = _mm_adds_epi16(H_nogap_r,gap_open);
6201 H_nogap_r = _mm_subs_epi16(H_nogap_r,ones); /* To ensure H < E */
6202 ramp = _mm_adds_epi16(ramp,ramp_chunk); /* Prepare for next block */
6203 extend_ladder = _mm_adds_epi16(extend_ladder,extend_chunk); /* Prepare for next block */
6204 #endif
6205
6206 } else {
6207 E_r_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
6208 H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6209 }
6210
6211 for ( ; c <= rhigh + uband && c <= glength; c++) {
6212 score_column = matrix[c];
6213
6214 if (c == 0) {
6215 pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
6216
6217 #ifdef ZERO_INITIAL_GAP_PENALTY
6218 X_prev_nogap = _mm_set1_epi16(0);
6219 #else
6220 if (rlo == 0) {
6221 X_prev_nogap = _mm_set1_epi16(0);
6222 } else {
6223 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6224 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6225 }
6226 #endif
6227
6228 } else {
6229 na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
6230 na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
6231 pairscores_std_ptr = pairscores[na2];
6232 pairscores_alt_ptr = pairscores[na2_alt];
6233
6234 if (rlo == 0) {
6235 #ifdef ZERO_INITIAL_GAP_PENALTY
6236 X_prev_nogap = _mm_set1_epi16(0);
6237 #else
6238 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6239 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6240 #endif
6241 } else {
6242 /* second or greater block of 16 */
6243 X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
6244 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6245 }
6246 }
6247
6248 debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
6249 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
6250
6251 /* EGAP */
6252 T1 = _mm_adds_epi16(H_nogap_r, gap_open);
6253 dir_horiz = _mm_cmpgt_epi16(E_r_gap,T1); /* E > H, for jump early */
6254 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
6255 debug15(print_vector_16(T1,rlo,c,"T1"));
6256 debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
6257
6258 E_r_gap = _mm_max_epi16(E_r_gap, T1); /* Compare H + open with vert */
6259 E_r_gap = _mm_adds_epi16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
6260 debug15(print_vector_16(E_r_gap,rlo,c,"E"));
6261
6262
6263 /* NOGAP */
6264 T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_NONAVX2);
6265 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
6266 H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
6267 X_prev_nogap = T1;
6268
6269 /* Add pairscores, allowing for alternate genomic nt */
6270 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
6271 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
6272 H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
6273 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
6274 debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
6275
6276 dir_horiz = _mm_cmpgt_epi16(E_r_gap,H_nogap_r); /* E > H, for jump early */
6277 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
6278 debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
6279
6280 H_nogap_r = _mm_max_epi16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
6281 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
6282 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
6283
6284
6285 /* F loop */
6286 if ((rlo_calc = rlo) < c - uband) {
6287 rlo_calc = c - uband;
6288 }
6289 if ((rhigh_calc = rhigh) >= c + lband) {
6290 rhigh_calc = c + lband;
6291 if (c > 0) {
6292 /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
6293 pairscore = pairscores[na2][rhigh_calc];
6294 if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
6295 pairscore = pairscore0;
6296 }
6297 /* No need to fix for non-SSE4.1: pairscore -= 128; */
6298 if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_16) {
6299 score_column[rhigh_calc] = NEG_INFINITY_16; /* Saturation */
6300 } else if (score > POS_INFINITY_16) {
6301 score_column[rhigh_calc] = POS_INFINITY_16; /* Saturation */
6302 } else {
6303 score_column[rhigh_calc] = (Score16_T) score;
6304 }
6305 (*directions_Egap)[c][rhigh_calc] = DIAG;
6306 (*directions_nogap)[c][rhigh_calc] = DIAG;
6307 }
6308 }
6309
6310 debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
6311 rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
6312
6313 if (rlo == 0) {
6314 c_gap = NEG_INFINITY_INT;
6315 last_nogap = NEG_INFINITY_INT;
6316 } else if (c >= rlo + uband) {
6317 c_gap = NEG_INFINITY_INT;
6318 last_nogap = NEG_INFINITY_INT;
6319 } else {
6320 c_gap = FF[c];
6321 last_nogap = (int) score_column[rlo_calc-1];
6322 }
6323
6324 if ((r = rlo_calc) == c - uband) {
6325 /* Handle top value as a special case to prevent going outside of uband */
6326 /* FGAP */
6327 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6328 r,c,c_gap + extend,last_nogap + open + extend));
6329 score = last_nogap + open /* + extend */;
6330 c_gap = score + extend;
6331 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6332
6333 /* NOGAP */
6334 last_nogap = (int) score_column[r];
6335 r++;
6336 }
6337
6338 /* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
6339 for ( ; r <= rhigh_calc; r++) {
6340 /* FGAP */
6341 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6342 r,c,c_gap + extend,last_nogap + open + extend));
6343 if (c_gap /* + extend */ > (score = last_nogap + open /* + extend */)) { /* Use > for jump early */
6344 c_gap += extend;
6345 (*directions_Fgap)[c][r] = VERT;
6346 } else {
6347 c_gap = score + extend;
6348 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6349 }
6350
6351 /* NOGAP */
6352 last_nogap = (int) score_column[r];
6353 debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
6354 if (c_gap > last_nogap) { /* Use > for jump early */
6355 last_nogap = c_gap;
6356 score_column[r] = (c_gap < NEG_INFINITY_16) ? NEG_INFINITY_16 : (Score16_T) c_gap; /* Saturation */
6357 (*directions_nogap)[c][r] = VERT;
6358 }
6359 }
6360
6361 FF[c] = c_gap;
6362 debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
6363 H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
6364 }
6365 }
6366 }
6367
6368
6369 #ifdef CHECK1
6370 /* Row 0 and column 0 directions fail anyway due to saturation */
6371 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
6372 (*directions_Egap)[1][0] = HORIZ;
6373 (*directions_Fgap)[0][1] = VERT;
6374 #endif
6375
6376
6377 #ifdef DEBUG2
6378 printf("SIMD: Dynprog_simd_16\n");
6379 Matrix16_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
6380 revp,lband,uband);
6381 Directions16_print(*directions_nogap,*directions_Egap,*directions_Fgap,
6382 rlength,glength,rsequence,gsequence,gsequence_alt,revp,lband,uband);
6383 #endif
6384
6385 #ifdef CHECK1
6386 /* Check for row 0 directions */
6387 for (c = 1; c <= uband && c <= glength; c++) {
6388 assert((*directions_Egap)[c][0] != DIAG);
6389 assert((*directions_nogap)[c][0] != DIAG);
6390 }
6391 /* Check for column 0 directions */
6392 for (r = 1; r <= lband && r <= rlength; r++) {
6393 assert((*directions_Fgap)[0][r] != DIAG);
6394 assert((*directions_nogap)[0][r] != DIAG);
6395 }
6396 #endif
6397
6398 FREEA(FF);
6399 _mm_free(pairscores_col0);
6400 _mm_free(pairscores[4]);
6401 _mm_free(pairscores[3]);
6402 _mm_free(pairscores[2]);
6403 _mm_free(pairscores[1]);
6404 _mm_free(pairscores[0]);
6405
6406 return matrix;
6407 }
6408 #endif
6409
6410
6411
6412
6413 #if defined(HAVE_SSE2)
6414 /* Modified from Dynprog_simd_16_upper. Operates by columns. */
6415 Score16_T **
Dynprog_simd_16(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,Direction16_T *** directions_Fgap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,int uband,bool jump_late_p,bool revp)6416 Dynprog_simd_16 (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
6417 Direction16_T ***directions_Fgap,
6418 T this, char *rsequence, char *gsequence, char *gsequence_alt,
6419 int rlength, int glength,
6420 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
6421 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
6422 #endif
6423 Mismatchtype_T mismatchtype, int open, int extend,
6424 int lband, int uband, bool jump_late_p, bool revp) {
6425 int c_gap, last_nogap, score, *FF; /* Need to have the ability to go past NEG_INFINITY */
6426 Score16_T **matrix, *score_column;
6427 #ifdef HAVE_AVX2
6428 __m256i pairscores_std, pairscores_alt;
6429 __m256i H_nogap_r, X_prev_nogap, E_r_gap, T1;
6430 __m256i gap_open, gap_extend, complement_dummy;
6431 __m256i dir_horiz;
6432 #if defined(ZERO_INITIAL_GAP_PENALTY)
6433 __m256i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter;
6434 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
6435 #else
6436 __m256i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter, ones;
6437 #endif
6438
6439 #else
6440 __m128i pairscores_std, pairscores_alt;
6441 __m128i H_nogap_r, X_prev_nogap, E_r_gap, T1;
6442 __m128i gap_open, gap_extend, complement_dummy;
6443 __m128i dir_horiz;
6444 #if defined(ZERO_INITIAL_GAP_PENALTY)
6445 __m128i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter;
6446 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
6447 #else
6448 __m128i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter, ones;
6449 #endif
6450
6451 #endif
6452
6453 int rlength_ceil, lband_ceil, r, c;
6454 int rlo, rhigh, rlo_calc, rhigh_calc;
6455 int na1, na2, na2_alt;
6456 Score16_T *pairscores_col0;
6457 Score16_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr, pairscore, pairscore0;
6458 Pairdistance_T **pairdistance_array_type;
6459
6460 #if defined(DEBUG_AVX2)
6461 Score16_T **matrix_std;
6462 Direction16_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
6463 #elif defined(DEBUG_SIMD)
6464 Score32_T **matrix_std;
6465 Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
6466 #endif
6467
6468
6469 debug2(printf("Dynprog_simd_16. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
6470 debug15(printf("Dynprog_simd_16. jump_late_p %d, open %d, extend %d, lband %d, uband %d\n",
6471 jump_late_p,open,extend,lband,uband));
6472
6473 rlength_ceil = (int) ((rlength + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
6474 pairdistance_array_type = pairdistance_array[mismatchtype];
6475
6476 debug(printf("compute_scores_simd_16_bycols (upper): "));
6477 debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
6478 debug(printf("Query length rounded up to %d\n",rlength_ceil));
6479
6480 matrix = aligned_score16_alloc(rlength_ceil,glength,
6481 this->aligned.one.matrix_ptrs,this->aligned.one.matrix_space);
6482 *directions_nogap = aligned_directions16_alloc(rlength_ceil,glength,
6483 this->aligned.one.directions_ptrs_0,this->aligned.one.directions_space_0);
6484 *directions_Egap = aligned_directions16_alloc(rlength_ceil,glength,
6485 this->aligned.one.directions_ptrs_1,this->aligned.one.directions_space_1);
6486 /* Need to calloc to save time in F loop */
6487 *directions_Fgap = aligned_directions16_calloc(rlength_ceil,glength,
6488 this->aligned.one.directions_ptrs_2,this->aligned.one.directions_space_2);
6489
6490 #if 0
6491 /* Row 0 initialization */
6492 /* penalty = open; */
6493 for (c = 1; c <= uband && c <= glength; c++) {
6494 /* penalty += extend; */
6495 (*directions_Egap)[c][0] = HORIZ;
6496 (*directions_nogap)[c][0] = HORIZ;
6497 }
6498 #endif
6499 #if 0
6500 /* Already initialized to DIAG. Actually, no longer initializing directions_Egap */
6501 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
6502 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
6503 #endif
6504 #if 0
6505 /* Column 0 initialization */
6506 /* penalty = open; */
6507 for (r = 1; r <= SIMD_NSHORTS && r <= rlength; r++) {
6508 /* penalty += extend; */
6509 (*directions_nogap)[0][r] = VERT;
6510 }
6511 #endif
6512
6513
6514 /* Load pairscores. Store match - mismatch */
6515 pairscores[0] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
6516 pairscores[1] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
6517 pairscores[2] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
6518 pairscores[3] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
6519 pairscores[4] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
6520
6521 lband_ceil = (int) ((lband + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
6522 pairscores_col0 = (Score16_T *) _mm_malloc(lband_ceil * sizeof(Score16_T),ALIGN_SIZE);
6523
6524 #if 0
6525 /* Should not be necessary */
6526 memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score16_T));
6527 memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score16_T));
6528 memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score16_T));
6529 memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score16_T));
6530 memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score16_T));
6531 #endif
6532
6533
6534 pairscores_col0[0] = (Score16_T) 0;
6535 /* Initialization just to lband causes errors in dir_horiz for Egap */
6536 #ifdef ZERO_INITIAL_GAP_PENALTY
6537 for (r = 1; r < lband_ceil; r++) {
6538 pairscores_col0[r] = (Score16_T) 0;
6539 }
6540 #else
6541 for (r = 1; r < lband_ceil; r++) {
6542 pairscores_col0[r] = (Score16_T) NEG_INFINITY_16;
6543 }
6544 #endif
6545
6546 r = 0; na1 = 'N';
6547 pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
6548 pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
6549 pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
6550 pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
6551 pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
6552
6553 if (revp == false) {
6554 for (r = 1; r <= rlength; r++) {
6555 na1 = (int) rsequence[r-1];
6556 pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
6557 pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
6558 pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
6559 pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
6560 pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
6561 }
6562 } else {
6563 for (r = 1; r <= rlength; r++) {
6564 na1 = (int) rsequence[1-r];
6565 pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
6566 pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
6567 pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
6568 pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
6569 pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
6570 }
6571 }
6572
6573 #if 0
6574 /* Should not be necessary */
6575 memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
6576 memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
6577 memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
6578 memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
6579 memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
6580 #endif
6581
6582 complement_dummy = _MM_SET1_EPI16(-1);
6583
6584 FF = (int *) MALLOCA((glength + 1) * sizeof(int));
6585
6586 gap_open = _MM_SET1_EPI16((Score16_T) open);
6587 gap_extend = _MM_SET1_EPI16((Score16_T) extend);
6588
6589 #ifndef INFINITE_INITIAL_GAP_PENALTY
6590 #ifdef HAVE_AVX2
6591 ramp = _mm256_setr_epi16(1,2,3,4,5,6,7,8);
6592 extend_ladder = _mm256_setr_epi16(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend);
6593 #else
6594 ramp = _mm_setr_epi16(1,2,3,4,5,6,7,8);
6595 extend_ladder = _mm_setr_epi16(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend);
6596 #endif
6597 lband_vector = _MM_SET1_EPI16(lband);
6598 ramp_chunk = _MM_SET1_EPI16(SIMD_NSHORTS);
6599 extend_chunk = _MM_SET1_EPI16(SIMD_NSHORTS*extend);
6600 #endif
6601
6602 if (jump_late_p) {
6603 for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS) {
6604 if ((rhigh = rlo + SIMD_NSHORTS - 1) > rlength) {
6605 rhigh = rlength;
6606 }
6607
6608 if ((c = rlo - lband) < 0) {
6609 c = 0;
6610
6611 #if defined(ZERO_INITIAL_GAP_PENALTY)
6612 /* Initial H in column 0 determined by zeroed out H. E needs to equal gap_open for column 1. */
6613 E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16-open);
6614 filter = _MM_CMPGT_EPI16(ramp,lband_vector);
6615 H_nogap_r = _MM_AND_SI(filter,E_r_gap); /* Use zeros for score */
6616
6617 E_r_gap = _MM_OR_SI(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
6618 E_r_gap = _MM_ADDS_EPI16(E_r_gap,gap_open);
6619
6620 ramp = _MM_ADDS_EPI16(ramp,ramp_chunk); /* Prepare for next block */
6621 extend_ladder = _MM_ADDS_EPI16(extend_ladder,extend_chunk); /* Prepare for next block */
6622 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
6623 /* dir_horiz tests if E >= H. To fill in first column of each
6624 row block with non-diags, make E == H. */
6625 E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16);
6626 H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6627 #else
6628 /* Initial H in column 0 determined by E, which needs to equal
6629 gap_open + extend_ladder for column 1. H is free to be set
6630 equal to E. */
6631 H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_8-open); /* To compensate for T1 = H + open */
6632 filter = _MM_CMPGT_EPI16(ramp,lband_vector);
6633 H_nogap_r = _MM_OR_SI(_mm_andnot_si128(filter,extend_ladder),_mm_and_si128(filter,H_nogap_r));
6634 E_r_gap = _MM_ADDS_EPI16(H_nogap_r,gap_open);
6635 ramp = _MM_ADDS_EPI16(ramp,ramp_chunk); /* Prepare for next block */
6636 extend_ladder = _MM_ADDS_EPI16(extend_ladder,extend_chunk); /* Prepare for next block */
6637 #endif
6638
6639 } else {
6640 E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16);
6641 H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6642 }
6643
6644 for ( ; c <= rhigh + uband && c <= glength; c++) {
6645 score_column = matrix[c];
6646
6647 if (c == 0) {
6648 pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
6649
6650 #ifdef ZERO_INITIAL_GAP_PENALTY
6651 X_prev_nogap = _MM_SETZERO_SI();
6652 #elif defined(HAVE_AVX2)
6653 if (rlo == 0) {
6654 X_prev_nogap = _mm256_setzero_si256();
6655 } else {
6656 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
6657 }
6658 #else
6659 if (rlo == 0) {
6660 X_prev_nogap = _mm_setzero_si128();
6661 } else {
6662 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6663 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
6664 }
6665 #endif
6666
6667 } else {
6668 na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
6669 na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
6670 pairscores_std_ptr = pairscores[na2];
6671 pairscores_alt_ptr = pairscores[na2_alt];
6672
6673 #ifdef HAVE_AVX2
6674 if (rlo == 0) {
6675 #ifdef ZERO_INITIAL_GAP_PENALTY
6676 X_prev_nogap = _MM_SETZERO_SI();
6677 #else
6678 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
6679 #endif
6680 } else {
6681 /* second or greater block of 16 */
6682 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_SHORT_INSERT);
6683 }
6684
6685 #else
6686 if (rlo == 0) {
6687 #ifdef ZERO_INITIAL_GAP_PENALTY
6688 X_prev_nogap = _MM_SETZERO_SI();
6689 #else
6690 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6691 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
6692 #endif
6693 } else {
6694 /* second or greater block of 16 */
6695 X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
6696 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
6697 }
6698 #endif
6699 }
6700
6701 debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
6702 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
6703
6704 /* EGAP */
6705 T1 = _MM_ADDS_EPI16(H_nogap_r, gap_open);
6706 dir_horiz = _MM_CMPLT_EPI16(E_r_gap,T1); /* E < H */
6707 dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy); /* E >= H, for jump late */
6708 #ifdef HAVE_AVX2
6709 _mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
6710 #else
6711 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
6712 #endif
6713 debug15(print_vector_16(T1,rlo,c,"T1"));
6714 debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
6715
6716 E_r_gap = _MM_MAX_EPI16(E_r_gap, T1); /* Compare H + open with vert */
6717 E_r_gap = _MM_ADDS_EPI16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
6718 debug15(print_vector_16(E_r_gap,rlo,c,"E"));
6719
6720
6721 /* NOGAP */
6722 #ifdef HAVE_AVX2
6723 T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_r,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
6724 X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_r,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
6725 H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_SHORT);
6726 #else
6727 T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_SHIFT);
6728 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
6729 #endif
6730 H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
6731 X_prev_nogap = T1;
6732
6733 /* Add pairscores, allowing for alternate genomic nt */
6734 #ifdef HAVE_AVX2
6735 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
6736 pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
6737 H_nogap_r = _mm256_adds_epi16(H_nogap_r, _mm256_max_epi16(pairscores_std,pairscores_alt));
6738 #else
6739 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
6740 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
6741 H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
6742 #endif
6743 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
6744 debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
6745
6746 dir_horiz = _MM_CMPLT_EPI16(E_r_gap,H_nogap_r); /* E < H */
6747 dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy); /* E >= H, for jump late */
6748 #ifdef HAVE_AVX2
6749 _mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
6750 #else
6751 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
6752 #endif
6753 debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
6754
6755 H_nogap_r = _MM_MAX_EPI16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
6756 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
6757 #ifdef HAVE_AVX2
6758 _mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
6759 #else
6760 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
6761 #endif
6762
6763
6764 /* F loop */
6765 if ((rlo_calc = rlo) <= c - uband) {
6766 rlo_calc = c - uband;
6767 }
6768 if ((rhigh_calc = rhigh) >= c + lband) {
6769 rhigh_calc = c + lband;
6770 if (c > 0) {
6771 /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
6772 pairscore = pairscores[na2][rhigh_calc];
6773 if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
6774 pairscore = pairscore0;
6775 }
6776 /* No need to fix for non-SSE4.1: pairscore -= 128; */
6777 if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_16) {
6778 score_column[rhigh_calc] = NEG_INFINITY_16; /* Saturation */
6779 } else if (score > POS_INFINITY_16) {
6780 score_column[rhigh_calc] = POS_INFINITY_16; /* Saturation */
6781 } else {
6782 score_column[rhigh_calc] = (Score16_T) score;
6783 }
6784 (*directions_Egap)[c][rhigh_calc] = DIAG;
6785 (*directions_nogap)[c][rhigh_calc] = DIAG;
6786 }
6787 }
6788
6789 debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
6790 rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
6791
6792 if (rlo == 0) {
6793 c_gap = NEG_INFINITY_INT;
6794 last_nogap = NEG_INFINITY_INT;
6795 } else if (c >= rlo + uband) {
6796 c_gap = NEG_INFINITY_INT;
6797 last_nogap = NEG_INFINITY_INT;
6798 } else {
6799 debug3(printf("At c %d, uband %d, reading c_gap %d\n",c,uband,FF[c]));
6800 c_gap = FF[c];
6801 last_nogap = (int) score_column[rlo_calc-1];
6802 }
6803
6804 if ((r = rlo_calc) == c - uband) {
6805 /* Handle top value as a special case to prevent going outside of uband */
6806 /* FGAP */
6807 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6808 r,c,c_gap + extend,last_nogap + open + extend));
6809 score = last_nogap + open /* + extend */;
6810 c_gap = score + extend;
6811 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6812
6813 /* NOGAP */
6814 last_nogap = (int) score_column[r];
6815 r++;
6816 }
6817
6818 /* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
6819 for ( ; r <= rhigh_calc; r++) {
6820 /* FGAP */
6821 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6822 r,c,c_gap + extend,last_nogap + open + extend));
6823 if (c_gap /* + extend */ >= (score = last_nogap + open /* + extend */)) { /* Use >= for jump late */
6824 c_gap += extend;
6825 (*directions_Fgap)[c][r] = VERT;
6826 } else {
6827 c_gap = score + extend;
6828 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6829 }
6830
6831 /* NOGAP */
6832 last_nogap = (int) score_column[r];
6833 debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
6834 if (c_gap >= last_nogap) { /* Use >= for jump late */
6835 last_nogap = c_gap;
6836 score_column[r] = (c_gap < NEG_INFINITY_16) ? NEG_INFINITY_16 : (Score16_T) c_gap; /* Saturation */
6837 (*directions_nogap)[c][r] = VERT;
6838 }
6839 }
6840
6841 FF[c] = c_gap;
6842 debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
6843 #ifdef HAVE_AVX2
6844 H_nogap_r = _mm256_load_si256((__m256i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
6845 #else
6846 H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
6847 #endif
6848 }
6849 }
6850
6851 } else {
6852 /* jump early */
6853 #if defined(ZERO_INITIAL_GAP_PENALTY) || defined(INFINITE_INITIAL_GAP_PENALTY)
6854 /* No need for ones */
6855 #else
6856 ones = _MM_SET1_EPI16(1);
6857 #endif
6858 for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS) {
6859 if ((rhigh = rlo + SIMD_NSHORTS - 1) > rlength) {
6860 rhigh = rlength;
6861 }
6862
6863 if ((c = rlo - lband) < 0) {
6864 c = 0;
6865
6866 #if defined(ZERO_INITIAL_GAP_PENALTY)
6867 /* Initial H in column 0 determined by zeroed out H. E needs to equal gap_open for column 1. */
6868 E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16-open);
6869 filter = _MM_CMPGT_EPI16(ramp,lband_vector);
6870 H_nogap_r = _MM_AND_SI(filter,E_r_gap); /* Use zeros for score */
6871
6872 E_r_gap = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),H_nogap_r);
6873 E_r_gap = _MM_ADDS_EPI16(E_r_gap,gap_open);
6874
6875 ramp = _MM_ADDS_EPI16(ramp,ramp_chunk); /* Prepare for next block */
6876 extend_ladder = _MM_ADDS_EPI16(extend_ladder,extend_chunk); /* Prepare for next block */
6877 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
6878 /* dir_horiz tests if E > H. To fill in first column of each
6879 row block with non-diags, make E > H. */
6880 E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16+1);
6881 H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6882 #else
6883 /* Initial H in column 0 determined by E, which needs to equal
6884 gap_open + extend_ladder for column 1. H is free to be set
6885 less than E. */
6886 H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open+1); /* To compensate for T1 = H + open */
6887 filter = _MM_CMPGT_EPI16(ramp,lband_vector);
6888 H_nogap_r = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),_MM_AND_SI(filter,H_nogap_r));
6889 E_r_gap = _MM_ADDS_EPI16(H_nogap_r,gap_open);
6890 H_nogap_r = _MM_SUBS_EPI16(H_nogap_r,ones); /* To ensure H < E */
6891 ramp = _MM_ADDS_EPI16(ramp,ramp_chunk); /* Prepare for next block */
6892 extend_ladder = _MM_ADDS_EPI16(extend_ladder,extend_chunk); /* Prepare for next block */
6893 #endif
6894
6895 } else {
6896 E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16+1);
6897 H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6898 }
6899
6900 for ( ; c <= rhigh + uband && c <= glength; c++) {
6901 score_column = matrix[c];
6902
6903 if (c == 0) {
6904 pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
6905
6906 #ifdef ZERO_INITIAL_GAP_PENALTY
6907 X_prev_nogap = _MM_SETZERO_SI();
6908 #elif defined(HAVE_AVX2)
6909 if (rlo == 0) {
6910 X_prev_nogap = _mm256_setzero_si256();
6911 } else {
6912 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
6913 }
6914 #else
6915 if (rlo == 0) {
6916 X_prev_nogap = _mm_setzero_si128();
6917 } else {
6918 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6919 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
6920 }
6921 #endif
6922
6923 } else {
6924 na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
6925 na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
6926 pairscores_std_ptr = pairscores[na2];
6927 pairscores_alt_ptr = pairscores[na2_alt];
6928
6929 #ifdef HAVE_AVX2
6930 if (rlo == 0) {
6931 #ifdef ZERO_INITIAL_GAP_PENALTY
6932 X_prev_nogap = _MM_SETZERO_SI();
6933 #else
6934 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
6935 #endif
6936 } else {
6937 /* second or greater block of 16 */
6938 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_SHORT_INSERT);
6939 }
6940
6941 #else
6942 if (rlo == 0) {
6943 #ifdef ZERO_INITIAL_GAP_PENALTY
6944 X_prev_nogap = _MM_SETZERO_SI();
6945 #else
6946 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6947 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
6948 #endif
6949 } else {
6950 /* second or greater block of 16 */
6951 X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
6952 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
6953 }
6954 #endif
6955 }
6956
6957 debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
6958 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
6959
6960 /* EGAP */
6961 T1 = _MM_ADDS_EPI16(H_nogap_r, gap_open);
6962 dir_horiz = _MM_CMPGT_EPI16(E_r_gap,T1); /* E > H, for jump early */
6963 #ifdef HAVE_AVX2
6964 _mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
6965 #else
6966 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
6967 #endif
6968 debug15(print_vector_16(T1,rlo,c,"T1"));
6969 debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
6970
6971 E_r_gap = _MM_MAX_EPI16(E_r_gap, T1); /* Compare H + open with vert */
6972 E_r_gap = _MM_ADDS_EPI16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
6973 debug15(print_vector_16(E_r_gap,rlo,c,"E"));
6974
6975
6976 /* NOGAP */
6977 #ifdef HAVE_AVX2
6978 T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_r,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
6979 X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_r,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
6980 H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_SHORT);
6981 #else
6982 T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_SHIFT);
6983 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
6984 #endif
6985 H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
6986 X_prev_nogap = T1;
6987
6988 /* Add pairscores, allowing for alternate genomic nt */
6989 #ifdef HAVE_AVX2
6990 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
6991 pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
6992 H_nogap_r = _mm256_adds_epi16(H_nogap_r, _mm256_max_epi16(pairscores_std,pairscores_alt));
6993 #else
6994 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
6995 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
6996 H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
6997 #endif
6998 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
6999 debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
7000
7001 dir_horiz = _MM_CMPGT_EPI16(E_r_gap,H_nogap_r); /* E > H, for jump early */
7002 #ifdef HAVE_AVX2
7003 _mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7004 #else
7005 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7006 #endif
7007 debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
7008
7009 H_nogap_r = _MM_MAX_EPI16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
7010 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
7011 #ifdef HAVE_AVX2
7012 _mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
7013 #else
7014 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
7015 #endif
7016
7017
7018 /* F loop */
7019 if ((rlo_calc = rlo) < c - uband) {
7020 rlo_calc = c - uband;
7021 }
7022 if ((rhigh_calc = rhigh) >= c + lband) {
7023 rhigh_calc = c + lband;
7024 if (c > 0) {
7025 /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
7026 pairscore = pairscores[na2][rhigh_calc];
7027 if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
7028 pairscore = pairscore0;
7029 }
7030 /* No need to fix for non-SSE4.1: pairscore -= 128; */
7031 if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_16) {
7032 score_column[rhigh_calc] = NEG_INFINITY_16; /* Saturation */
7033 } else if (score > POS_INFINITY_16) {
7034 score_column[rhigh_calc] = POS_INFINITY_16; /* Saturation */
7035 } else {
7036 score_column[rhigh_calc] = (Score16_T) score;
7037 }
7038 (*directions_Egap)[c][rhigh_calc] = DIAG;
7039 (*directions_nogap)[c][rhigh_calc] = DIAG;
7040 }
7041 }
7042
7043 debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
7044 rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
7045
7046 if (rlo == 0) {
7047 c_gap = NEG_INFINITY_INT;
7048 last_nogap = NEG_INFINITY_INT;
7049 } else if (c >= rlo + uband) {
7050 c_gap = NEG_INFINITY_INT;
7051 last_nogap = NEG_INFINITY_INT;
7052 } else {
7053 c_gap = FF[c];
7054 last_nogap = (int) score_column[rlo_calc-1];
7055 }
7056
7057 if ((r = rlo_calc) == c - uband) {
7058 /* Handle top value as a special case to prevent going outside of uband */
7059 /* FGAP */
7060 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
7061 r,c,c_gap + extend,last_nogap + open + extend));
7062 score = last_nogap + open /* + extend */;
7063 c_gap = score + extend;
7064 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
7065
7066 /* NOGAP */
7067 last_nogap = (int) score_column[r];
7068 r++;
7069 }
7070
7071 /* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
7072 for ( ; r <= rhigh_calc; r++) {
7073 /* FGAP */
7074 debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
7075 r,c,c_gap + extend,last_nogap + open + extend));
7076 if (c_gap /* + extend */ > (score = last_nogap + open /* + extend */)) { /* Use > for jump early */
7077 c_gap += extend;
7078 (*directions_Fgap)[c][r] = VERT;
7079 } else {
7080 c_gap = score + extend;
7081 /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
7082 }
7083
7084 /* NOGAP */
7085 last_nogap = (int) score_column[r];
7086 debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
7087 if (c_gap > last_nogap) { /* Use > for jump early */
7088 last_nogap = c_gap;
7089 score_column[r] = (c_gap < NEG_INFINITY_16) ? NEG_INFINITY_16 : (Score16_T) c_gap; /* Saturation */
7090 (*directions_nogap)[c][r] = VERT;
7091 }
7092 }
7093
7094 FF[c] = c_gap;
7095 debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
7096 #ifdef HAVE_AVX2
7097 H_nogap_r = _mm256_load_si256((__m256i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
7098 #else
7099 H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
7100 #endif
7101 }
7102 }
7103 }
7104
7105
7106 #ifdef CHECK1
7107 /* Row 0 and column 0 directions fail anyway due to saturation */
7108 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
7109 (*directions_Egap)[1][0] = HORIZ;
7110 (*directions_Fgap)[0][1] = VERT;
7111 #endif
7112
7113
7114 #ifdef DEBUG2
7115 printf("SIMD: Dynprog_simd_16\n");
7116 Matrix16_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
7117 revp,lband,uband);
7118 Directions16_print(*directions_nogap,*directions_Egap,*directions_Fgap,
7119 rlength,glength,rsequence,gsequence,gsequence_alt,
7120 revp,lband,uband);
7121 #endif
7122
7123 #ifdef CHECK1
7124 /* Check for row 0 directions */
7125 for (c = 1; c <= uband && c <= glength; c++) {
7126 assert((*directions_Egap)[c][0] != DIAG);
7127 assert((*directions_nogap)[c][0] != DIAG);
7128 }
7129 /* Check for column 0 directions */
7130 for (r = 1; r <= lband && r <= rlength; r++) {
7131 assert((*directions_Fgap)[0][r] != DIAG);
7132 assert((*directions_nogap)[0][r] != DIAG);
7133 }
7134 #endif
7135
7136 #ifdef DEBUG_AVX2
7137 matrix_std = Dynprog_simd_16_nonavx2(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
7138 this,rsequence,gsequence,gsequence_alt,
7139 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
7140 open,extend,lband,uband,jump_late_p,revp);
7141 #elif defined(DEBUG_SIMD)
7142 matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
7143 this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
7144 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
7145 open,extend,lband,uband,jump_late_p,revp,/*saturation*/NEG_INFINITY_16,
7146 /*upperp*/true,/*lowerp*/true);
7147 #endif
7148
7149 #ifdef DEBUG2
7150 printf("Banded\n");
7151 Matrix16_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
7152 revp,lband,uband);
7153 Directions16_print(*directions_nogap,*directions_Egap,*directions_Fgap,
7154 rlength,glength,rsequence,gsequence,gsequence_alt,revp,lband,uband);
7155 #endif
7156
7157 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
7158 banded_matrix16_compare(matrix,matrix_std,rlength,glength,lband,uband,
7159 rsequence,gsequence,gsequence_alt,
7160 goffset,chroffset,chrhigh,watsonp,revp);
7161
7162 banded_directions16_compare_nogap(*directions_nogap,directions_nogap_std,rlength,glength,lband,uband);
7163 banded_directions16_compare_Egap(*directions_Egap,directions_Egap_std,rlength,glength,lband,uband);
7164 banded_directions16_compare_Fgap(*directions_Fgap,directions_Fgap_std,rlength,glength,lband,uband);
7165 #endif
7166
7167 FREEA(FF);
7168 _mm_free(pairscores_col0);
7169 _mm_free(pairscores[4]);
7170 _mm_free(pairscores[3]);
7171 _mm_free(pairscores[2]);
7172 _mm_free(pairscores[1]);
7173 _mm_free(pairscores[0]);
7174
7175 return matrix;
7176 }
7177 #endif
7178
7179
#ifdef DEBUG_AVX2
/* Fills position r of all five per-nucleotide score arrays with the
   pairdistance of query character na1 against A, C, G, T and N
   (pairscores[0..4], in that order). */
static void
upper_nonavx2_set_pairscores (Score16_T *pairscores[5], int r, int na1,
                              Pairdistance_T **pairdistance_array_type) {
  pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
  pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
  pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
  pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
  pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
}


/* 128-bit (SSE2, __m128i) version of the 16-bit upper-triangle dynamic
   programming fill.  Compiled only under DEBUG_AVX2; presumably it
   serves as the reference against which the AVX2 implementation is
   compared -- confirm against the caller.

   Designed for computation above the diagonal, so no F loop or bottom
   masking is needed.  Operates by columns: for each block of
   SIMD_NSHORTS_NONAVX2 rows [rlo..rhigh], columns c = rlo up to
   min(rhigh + uband, glength) are processed left to right.

   directions_nogap, directions_Egap: output; set to direction matrices
       allocated from the upper_directions buffers of "this".
   rsequence, gsequence, gsequence_alt: query, genomic and alternate
       genomic characters; indexed backwards (seq[1-i]) when revp is
       true, forwards (seq[i-1]) otherwise.
   rlength, glength: number of rows and columns (exclusive of row/col 0).
   goffset, chroffset, chrhigh, watsonp: not referenced in this function.
   mismatchtype: selects the pairdistance table.
   open, extend: gap open and gap extend scores added with saturation.
   uband: number of columns to the right of the diagonal to compute.
   jump_late_p: if true, ties between E and H choose the gap (E >= H);
       otherwise the gap is chosen only when strictly better (E > H).

   Returns the score matrix, indexed matrix[c][r], allocated from the
   upper_matrix buffers of "this". */
Score16_T **
Dynprog_simd_16_upper_nonavx2 (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
                               T this, char *rsequence, char *gsequence, char *gsequence_alt,
                               int rlength, int glength,
                               int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
                               Mismatchtype_T mismatchtype, int open, int extend,
                               int uband, bool jump_late_p, bool revp) {
  Score16_T **matrix, *score_column;
  __m128i pairscores_std, pairscores_alt;
  __m128i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, E_infinity, E_clamp, T1;
  __m128i gap_open, gap_extend, complement_dummy;
  __m128i dir_horiz;
  int rlength_ceil, r, c, i;
  int rlo, rhigh;
  int na1, na2, na2_alt;
  Score16_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr;
  Pairdistance_T **pairdistance_array_type;


  debug2(printf("Dynprog_simd_16_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
  debug15(printf("Dynprog_simd_16_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));

  /* Round rlength + 1 (rows 0..rlength) up to a whole number of vectors */
  rlength_ceil = (int) ((rlength + SIMD_NSHORTS_NONAVX2)/SIMD_NSHORTS_NONAVX2) * SIMD_NSHORTS_NONAVX2;
  pairdistance_array_type = pairdistance_array[mismatchtype];

  debug(printf("compute_scores_simd_16_bycols (upper): "));
  debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
  debug(printf("Query length rounded up to %d\n",rlength_ceil));

  matrix = aligned_score16_alloc(rlength_ceil,glength,
                                 this->aligned_std.two.upper_matrix_ptrs,this->aligned_std.two.upper_matrix_space);
  *directions_nogap = aligned_directions16_alloc(rlength_ceil,glength,
                                                 this->aligned_std.two.upper_directions_ptrs_0,this->aligned_std.two.upper_directions_space_0);
  *directions_Egap = aligned_directions16_alloc(rlength_ceil,glength,
                                                this->aligned_std.two.upper_directions_ptrs_1,this->aligned_std.two.upper_directions_space_1);

  /* No explicit row 0 / column 0 direction initialization is done here;
     the SIMD loop below writes every direction cell it visits. */


  /* Load pairscores, one 16-byte-aligned array per genomic nt (A, C, G, T, N).
     Entries beyond rlength are deliberately left unset (the original
     notes that clearing them should not be necessary). */
  for (i = 0; i < 5; i++) {
    pairscores[i] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
  }

  /* Row 0 is scored as query 'N' */
  upper_nonavx2_set_pairscores(pairscores,/*r*/0,/*na1*/'N',pairdistance_array_type);

  for (r = 1; r <= rlength; r++) {
    na1 = (int) (revp ? rsequence[1-r] : rsequence[r-1]);
    upper_nonavx2_set_pairscores(pairscores,r,na1,pairdistance_array_type);
  }

  complement_dummy = _mm_set1_epi16(-1);

  gap_open = _mm_set1_epi16((Score16_T) open);
  gap_extend = _mm_set1_epi16((Score16_T) extend);

  E_infinity = _mm_set1_epi16(POS_INFINITY_16);

  for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS_NONAVX2) {
    if ((rhigh = rlo + SIMD_NSHORTS_NONAVX2 - 1) > rlength) {
      rhigh = rlength;
    }

    /* dir_horiz tests if E >= H (jump late) or E > H (jump early).  The
       initial E value is irrelevant for filling the first column of each
       row block with non-diags, because these are above the diagonal. */
    E_mask = _mm_set1_epi16(1);

    if (jump_late_p) {
      E_r_gap = _mm_set1_epi16(NEG_INFINITY_16);
    } else {
      E_r_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
    }
    H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */

    for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
      score_column = matrix[c];

      if (c == 0) {
        na2 = na2_alt = 4; /* 'N' */
      } else {
        /* Cast to int for consistency with Dynprog_simd_16 and to avoid
           subscripting with a plain char */
        na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
        na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
      }
      pairscores_std_ptr = pairscores[na2];
      pairscores_alt_ptr = pairscores[na2_alt];

      /* X_prev_nogap supplies the value shifted into the lowest lane of H
         (the diagonal predecessor of row rlo).  It is recomputed for every
         column, so no carry-over between columns is needed. */
      if (c == 0) {
        X_prev_nogap = _mm_set1_epi16(0);
      } else if (rlo == 0) {
#ifdef ZERO_INITIAL_GAP_PENALTY
        X_prev_nogap = _mm_set1_epi16(0);
#else
        X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
        X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
#endif
      } else {
        /* second or greater block of 16 */
        X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
        X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
      }

      /* Upper bound applied to E in each lane.  The add is non-saturating:
         lanes where E_mask is still 1 presumably wrap POS_INFINITY_16 + 1
         to the most negative value (assumes POS_INFINITY_16 is the largest
         16-bit value -- TODO confirm), which forces E to negative infinity
         there; lanes already shifted to 0 leave E unchanged. */
      E_clamp = _mm_add_epi16(E_mask,E_infinity);

      debug15(print_vector_16(E_mask,rlo,c,"E_mask"));
      E_r_gap = _mm_min_epi16(E_r_gap,E_clamp);
      debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
      debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));

      /* EGAP */
      T1 = _mm_adds_epi16(H_nogap_r, gap_open);
      if (jump_late_p) {
        dir_horiz = _mm_cmplt_epi16(E_r_gap,T1); /* E < H */
        dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy); /* E >= H, for jump late */
      } else {
        dir_horiz = _mm_cmpgt_epi16(E_r_gap,T1); /* E > H, for jump early */
      }
      _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
      debug15(print_vector_16(T1,rlo,c,"T1"));
      debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));

      E_r_gap = _mm_max_epi16(E_r_gap, T1); /* Compare H + open with vert */
      E_r_gap = _mm_adds_epi16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
      E_r_gap = _mm_min_epi16(E_r_gap,E_clamp);
      debug15(print_vector_16(E_r_gap,rlo,c,"E"));


      /* NOGAP: shift H down one row and bring in the diagonal predecessor */
      H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
      H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);

      /* Add pairscores, allowing for alternate genomic nt */
      pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
      pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
      H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
      _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
      debug15(print_vector_16(H_nogap_r,rlo,c,"H"));

      if (jump_late_p) {
        dir_horiz = _mm_cmplt_epi16(E_r_gap,H_nogap_r); /* E < H */
        dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy); /* E >= H, for jump late */
      } else {
        dir_horiz = _mm_cmpgt_epi16(E_r_gap,H_nogap_r); /* E > H, for jump early */
      }
      _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
      debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));

      H_nogap_r = _mm_max_epi16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
      debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
      _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);


      /* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
      if (rhigh >= c) {
        (*directions_Egap)[c][c] = DIAG;
        (*directions_nogap)[c][c] = DIAG;
      }

      /* No need for F loop here */
      E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
    }
  }

#ifdef CHECK1
  /* Row 0 and column 0 directions fail anyway due to saturation */
  /* Handle (0,1) and (1,0) directions, otherwise DIAG */
  (*directions_Egap)[1][0] = HORIZ;
#endif

#ifdef DEBUG2
  printf("SIMD: Dynprog_simd_16_upper\n");
  Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
                    revp,uband,/*upperp*/true);
  Directions16_print_ud(*directions_nogap,*directions_Egap,
                        rlength,glength,rsequence,gsequence,gsequence_alt,
                        revp,uband,/*upperp*/true);
#endif

#ifdef CHECK1
  /* Check for row 0 directions */
  for (c = 1; c <= uband && c <= glength; c++) {
    assert((*directions_Egap)[c][0] != DIAG);
    assert((*directions_nogap)[c][0] != DIAG);
  }
#endif

  /* Release in reverse order of allocation */
  for (i = 4; i >= 0; i--) {
    _mm_free(pairscores[i]);
  }

  return matrix;
}
#endif
7521
7522
7523 #ifdef HAVE_SSE2
7524 /* Designed for computation above the diagonal, so no F loop or bottom masking needed */
7525 /* Operates by columns */
7526 Score16_T **
Dynprog_simd_16_upper(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int uband,bool jump_late_p,bool revp)7527 Dynprog_simd_16_upper (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
7528 T this, char *rsequence, char *gsequence, char *gsequence_alt,
7529 int rlength, int glength,
7530 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
7531 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
7532 #endif
7533 Mismatchtype_T mismatchtype, int open, int extend,
7534 int uband, bool jump_late_p, bool revp) {
7535 Score16_T **matrix, *score_column;
7536 #ifdef HAVE_AVX2
7537 __m256i pairscores_std, pairscores_alt;
7538 __m256i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, E_infinity, T1;
7539 __m256i gap_open, gap_extend, complement_dummy;
7540 __m256i dir_horiz;
7541 Score16_T save;
7542 #else
7543 __m128i pairscores_std, pairscores_alt;
7544 __m128i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, E_infinity, T1;
7545 __m128i gap_open, gap_extend, complement_dummy;
7546 __m128i dir_horiz;
7547 #endif
7548 int rlength_ceil, r, c;
7549 int rlo, rhigh;
7550 int na1, na2, na2_alt;
7551 Score16_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr;
7552 Pairdistance_T **pairdistance_array_type;
7553
7554 #ifdef DEBUG_AVX2
7555 Score16_T **matrix_std;
7556 Direction16_T **directions_nogap_std, **directions_Egap_std;
7557 char na2_single;
7558 #elif defined(DEBUG_SIMD)
7559 Score32_T **matrix_std;
7560 Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
7561 char na2_single;
7562 #endif
7563
7564
7565 debug2(printf("Dynprog_simd_16_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
7566 debug15(printf("Dynprog_simd_16_upper. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
7567
7568 rlength_ceil = (int) ((rlength + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
7569 pairdistance_array_type = pairdistance_array[mismatchtype];
7570
7571 debug(printf("compute_scores_simd_16_bycols (upper): "));
7572 debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
7573 debug(printf("Query length rounded up to %d\n",rlength_ceil));
7574
7575 matrix = aligned_score16_alloc(rlength_ceil,glength,
7576 this->aligned.two.upper_matrix_ptrs,this->aligned.two.upper_matrix_space);
7577 *directions_nogap = aligned_directions16_alloc(rlength_ceil,glength,
7578 this->aligned.two.upper_directions_ptrs_0,this->aligned.two.upper_directions_space_0);
7579 *directions_Egap = aligned_directions16_alloc(rlength_ceil,glength,
7580 this->aligned.two.upper_directions_ptrs_1,this->aligned.two.upper_directions_space_1);
7581
7582 #if 0
7583 /* Row 0 initialization */
7584 /* penalty = open; */
7585 for (c = 1; c <= uband && c <= glength; c++) {
7586 /* penalty += extend; */
7587 (*directions_Egap)[c][0] = HORIZ;
7588 (*directions_nogap)[c][0] = HORIZ;
7589 }
7590 #endif
7591 #if 0
7592 /* Already initialized to DIAG. Actually, no longer initializing directions_Egap */
7593 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
7594 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
7595 #endif
7596 #if 0
7597 /* Column 0 initialization */
7598 /* penalty = open; */
7599 for (r = 1; r <= SIMD_NSHORTS && r <= rlength; r++) {
7600 /* penalty += extend; */
7601 (*directions_nogap)[0][r] = VERT;
7602 }
7603 #endif
7604
7605
7606 /* Load pairscores. Store match - mismatch */
7607 pairscores[0] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7608 pairscores[1] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7609 pairscores[2] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7610 pairscores[3] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7611 pairscores[4] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7612
7613 #if 0
7614 /* Should not be necessary */
7615 memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score16_T));
7616 memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score16_T));
7617 memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score16_T));
7618 memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score16_T));
7619 memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score16_T));
7620 #endif
7621
7622 r = 0; na1 = 'N';
7623 pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
7624 pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
7625 pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
7626 pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
7627 pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
7628
7629 if (revp == false) {
7630 for (r = 1; r <= rlength; r++) {
7631 na1 = (int) rsequence[r-1];
7632 pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
7633 pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
7634 pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
7635 pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
7636 pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
7637 }
7638 } else {
7639 for (r = 1; r <= rlength; r++) {
7640 na1 = (int) rsequence[1-r];
7641 pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
7642 pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
7643 pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
7644 pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
7645 pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
7646 }
7647 }
7648
7649 #if 0
7650 /* Should not be necessary */
7651 memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7652 memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7653 memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7654 memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7655 memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7656 #endif
7657
7658 complement_dummy = _MM_SET1_EPI16(-1);
7659
7660 gap_open = _MM_SET1_EPI16((Score16_T) open);
7661 gap_extend = _MM_SET1_EPI16((Score16_T) extend);
7662
7663 E_infinity = _MM_SET1_EPI16(POS_INFINITY_16);
7664 if (jump_late_p) {
7665 for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS) {
7666 if ((rhigh = rlo + SIMD_NSHORTS - 1) > rlength) {
7667 rhigh = rlength;
7668 }
7669
7670 /* dir_horiz tests if E >= H. To fill in first column of each
7671 row block with non-diags, could make E == H. But irrelevant,
7672 because these are above the diagonal. */
7673 E_mask = _MM_SET1_EPI16(1);
7674
7675 /* Holds for all INITIAL_GAP_PENALTY */
7676 E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16);
7677 H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
7678
7679 for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
7680 score_column = matrix[c];
7681
7682 if (c == 0) {
7683 na2 = na2_alt = 4; /* 'N' */
7684 } else {
7685 na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
7686 na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
7687 }
7688 pairscores_std_ptr = pairscores[na2];
7689 pairscores_alt_ptr = pairscores[na2_alt];
7690
7691 if (c == 0) {
7692 X_prev_nogap = _MM_SETZERO_SI();
7693 } else if (rlo == 0) {
7694 #ifdef ZERO_INITIAL_GAP_PENALTY
7695 X_prev_nogap = _MM_SETZERO_SI();
7696 #elif defined(HAVE_AVX2)
7697 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
7698 #else
7699 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
7700 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
7701 #endif
7702 } else {
7703 /* second or greater block of 16 */
7704 #ifdef ZERO_INITIAL_GAP_PENALTY
7705 X_prev_nogap = _MM_SETZERO_SI();
7706 #elif defined(HAVE_AVX2)
7707 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_SHORT_INSERT);
7708 #else
7709 X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
7710 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
7711 #endif
7712 }
7713
7714 debug15(print_vector_16(E_mask,rlo,c,"E_mask"));
7715 E_r_gap = _MM_MIN_EPI16(E_r_gap,_MM_ADD_EPI16(E_mask,E_infinity));
7716 debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
7717 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
7718
7719 /* EGAP */
7720 T1 = _MM_ADDS_EPI16(H_nogap_r, gap_open);
7721 dir_horiz = _MM_CMPLT_EPI16(E_r_gap,T1); /* E < H */
7722 dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy); /* E >= H, for jump late */
7723 #ifdef HAVE_AVX2
7724 _mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7725 #else
7726 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7727 #endif
7728 debug15(print_vector_16(T1,rlo,c,"T1"));
7729 debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
7730
7731 E_r_gap = _MM_MAX_EPI16(E_r_gap, T1); /* Compare H + open with vert */
7732 E_r_gap = _MM_ADDS_EPI16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
7733 E_r_gap = _MM_MIN_EPI16(E_r_gap,_MM_ADD_EPI16(E_mask,E_infinity));
7734 debug15(print_vector_16(E_r_gap,rlo,c,"E"));
7735
7736
7737 /* NOGAP */
7738 #ifdef HAVE_AVX2
7739 T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_r,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
7740 X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_r,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
7741 H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_SHORT);
7742 #else
7743 T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_SHIFT);
7744 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
7745 #endif
7746 H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
7747 X_prev_nogap = T1;
7748
7749 /* Add pairscores, allowing for alternate genomic nt */
7750 #ifdef HAVE_AVX2
7751 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
7752 pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
7753 H_nogap_r = _mm256_adds_epi16(H_nogap_r, _mm256_max_epi16(pairscores_std,pairscores_alt));
7754 #else
7755 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
7756 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
7757 H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
7758 #endif
7759 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
7760 debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
7761
7762 dir_horiz = _MM_CMPLT_EPI16(E_r_gap,H_nogap_r); /* E < H */
7763 dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy); /* E >= H, for jump late */
7764 #ifdef HAVE_AVX2
7765 _mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7766 #else
7767 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7768 #endif
7769 debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
7770
7771 H_nogap_r = _MM_MAX_EPI16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
7772 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
7773 #ifdef HAVE_AVX2
7774 _mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
7775 #else
7776 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
7777 #endif
7778
7779 /* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
7780 if (rhigh >= c) {
7781 (*directions_Egap)[c][c] = DIAG;
7782 (*directions_nogap)[c][c] = DIAG;
7783 }
7784
7785 /* No need for F loop here */
7786 #ifdef HAVE_AVX2
7787 save = _mm256_extract_epi16(E_mask,7);
7788 E_mask = _mm256_slli_si256(E_mask,ONE_SHORT);
7789 E_mask = _mm256_insert_epi16(E_mask,save,8);
7790 #else
7791 E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
7792 #endif
7793 }
7794 }
7795
7796 } else {
7797 /* jump early */
7798 for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS) {
7799 if ((rhigh = rlo + SIMD_NSHORTS - 1) > rlength) {
7800 rhigh = rlength;
7801 }
7802
7803 /* dir_horiz tests if E > H. To fill in first column of each
7804 row block with non-diags, could make E > H. But irrelevant,
7805 because these are above the diagonal. */
7806 E_mask = _MM_SET1_EPI16(1);
7807
7808 /* Holds for all INITIAL_GAP_PENALTY */
7809 E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16+1);
7810 H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
7811
7812 for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
7813 score_column = matrix[c];
7814
7815 if (c == 0) {
7816 na2 = na2_alt = 4; /* 'N' */
7817 } else {
7818 na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
7819 na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
7820 }
7821 pairscores_std_ptr = pairscores[na2];
7822 pairscores_alt_ptr = pairscores[na2_alt];
7823
7824 if (c == 0) {
7825 X_prev_nogap = _MM_SETZERO_SI();
7826 } else if (rlo == 0) {
7827 #ifdef ZERO_INITIAL_GAP_PENALTY
7828 X_prev_nogap = _MM_SETZERO_SI();
7829 #elif defined(HAVE_AVX2)
7830 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
7831 #else
7832 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
7833 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
7834 #endif
7835 } else {
7836 /* second or greater block of 16 */
7837 #ifdef HAVE_AVX2
7838 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_SHORT_INSERT);
7839 #else
7840 X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
7841 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
7842 #endif
7843 }
7844
7845 debug15(print_vector_16(E_mask,rlo,c,"E_mask"));
7846 E_r_gap = _MM_MIN_EPI16(E_r_gap,_MM_ADD_EPI16(E_mask,E_infinity));
7847 debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
7848 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
7849
7850 /* EGAP */
7851 T1 = _MM_ADDS_EPI16(H_nogap_r, gap_open);
7852 dir_horiz = _MM_CMPGT_EPI16(E_r_gap,T1); /* E > H, for jump early */
7853 #ifdef HAVE_AVX2
7854 _mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7855 #else
7856 _mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7857 #endif
7858 debug15(print_vector_16(T1,rlo,c,"T1"));
7859 debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
7860
7861 E_r_gap = _MM_MAX_EPI16(E_r_gap, T1); /* Compare H + open with vert */
7862 E_r_gap = _MM_ADDS_EPI16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
7863 E_r_gap = _MM_MIN_EPI16(E_r_gap,_MM_ADD_EPI16(E_mask,E_infinity));
7864 debug15(print_vector_16(E_r_gap,rlo,c,"E"));
7865
7866
7867 /* NOGAP */
7868 #ifdef HAVE_AVX2
7869 T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_r,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
7870 X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_r,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
7871 H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_SHORT);
7872 #else
7873 T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_SHIFT);
7874 H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
7875 #endif
7876 H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
7877 X_prev_nogap = T1;
7878
7879 /* Add pairscores, allowing for alternate genomic nt */
7880 #ifdef HAVE_AVX2
7881 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
7882 pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
7883 H_nogap_r = _mm256_adds_epi16(H_nogap_r, _mm256_max_epi16(pairscores_std,pairscores_alt));
7884 #else
7885 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
7886 pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
7887 H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
7888 #endif
7889 _mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
7890 debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
7891
7892 dir_horiz = _MM_CMPGT_EPI16(E_r_gap,H_nogap_r); /* E > H, for jump early */
7893 #ifdef HAVE_AVX2
7894 _mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7895 #else
7896 _mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7897 #endif
7898 debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
7899
7900 H_nogap_r = _MM_MAX_EPI16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
7901 debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
7902 #ifdef HAVE_AVX2
7903 _mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
7904 #else
7905 _mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
7906 #endif
7907
7908 /* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
7909 if (rhigh >= c) {
7910 (*directions_Egap)[c][c] = DIAG;
7911 (*directions_nogap)[c][c] = DIAG;
7912 }
7913
7914 /* No need for F loop here */
7915 #ifdef HAVE_AVX2
7916 save = _mm256_extract_epi16(E_mask,7);
7917 E_mask = _mm256_slli_si256(E_mask,ONE_SHORT);
7918 E_mask = _mm256_insert_epi16(E_mask,save,8);
7919 #else
7920 E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
7921 #endif
7922 }
7923 }
7924 }
7925
7926 #ifdef CHECK1
7927 /* Row 0 and column 0 directions fail anyway due to saturation */
7928 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
7929 (*directions_Egap)[1][0] = HORIZ;
7930 #endif
7931
7932 #ifdef DEBUG2
7933 printf("SIMD: Dynprog_simd_16_upper\n");
7934 Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
7935 revp,uband,/*upperp*/true);
7936 Directions16_print_ud(*directions_nogap,*directions_Egap,
7937 rlength,glength,rsequence,gsequence,gsequence_alt,
7938 revp,uband,/*upperp*/true);
7939 #endif
7940
7941 #ifdef CHECK1
7942 /* Check for row 0 directions */
7943 for (c = 1; c <= uband && c <= glength; c++) {
7944 assert((*directions_Egap)[c][0] != DIAG);
7945 assert((*directions_nogap)[c][0] != DIAG);
7946 }
7947 #endif
7948
7949 #ifdef DEBUG_AVX2
7950 matrix_std = Dynprog_simd_16_upper_nonavx2(&directions_nogap_std,&directions_Egap_std,
7951 this,rsequence,gsequence,gsequence_alt,
7952 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
7953 open,extend,uband,jump_late_p,revp);
7954 #elif defined(DEBUG_SIMD)
7955 matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
7956 this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
7957 rlength,glength,
7958 goffset,chroffset,chrhigh,watsonp,mismatchtype,
7959 open,extend,/*lband*/0,uband,jump_late_p,revp,/*saturation*/NEG_INFINITY_16,
7960 /*upperp*/true,/*lowerp*/false);
7961 #endif
7962
7963 #ifdef DEBUG2
7964 printf("Banded\n");
7965 Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
7966 revp,uband,/*upperp*/true);
7967 Directions16_print_ud(*directions_nogap,*directions_Egap,
7968 rlength,glength,rsequence,gsequence,gsequence_alt,revp,uband,/*upperp*/true);
7969 #endif
7970
7971 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
7972 banded_matrix16_compare_upper(matrix,matrix_std,rlength,glength,uband,
7973 rsequence,gsequence,gsequence_alt,
7974 goffset,chroffset,chrhigh,watsonp,revp);
7975
7976 banded_directions16_compare_nogap_upper(*directions_nogap,directions_nogap_std,rlength,glength,uband);
7977
7978 banded_directions16_compare_Egap_upper(*directions_Egap,directions_Egap_std,rlength,glength,uband);
7979 #endif
7980
7981 _mm_free(pairscores[4]);
7982 _mm_free(pairscores[3]);
7983 _mm_free(pairscores[2]);
7984 _mm_free(pairscores[1]);
7985 _mm_free(pairscores[0]);
7986
7987 return matrix;
7988 }
7989 #endif
7990
7991
7992 #ifdef DEBUG_AVX2
7993 /* Designed for computation below the diagonal, so no F loop or bottom masking needed */
7994 /* Operates by rows */
7995 Score16_T **
Dynprog_simd_16_lower_nonavx2(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,bool jump_late_p,bool revp)7996 Dynprog_simd_16_lower_nonavx2 (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
7997 T this, char *rsequence, char *gsequence, char *gsequence_alt,
7998 int rlength, int glength,
7999 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
8000 Mismatchtype_T mismatchtype, int open, int extend,
8001 int lband, bool jump_late_p, bool revp) {
8002 Score16_T **matrix, *score_column;
8003 __m128i pairscores_std;
8004 __m128i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, E_infinity, T1;
8005 __m128i gap_open, gap_extend, complement_dummy;
8006 __m128i dir_vert;
8007 int glength_ceil, r, c;
8008 int clo, chigh;
8009 int na1, na2, na2_alt;
8010 Score16_T *pairscores[5], *pairscores_ptr;
8011 Pairdistance_T **pairdistance_array_type, score1, score2;
8012
8013 debug2(printf("Dynprog_simd_16_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
8014 debug15(printf("Dynprog_simd_16_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
8015
8016 glength_ceil = (int) ((glength + SIMD_NSHORTS_NONAVX2)/SIMD_NSHORTS_NONAVX2) * SIMD_NSHORTS_NONAVX2;
8017 pairdistance_array_type = pairdistance_array[mismatchtype];
8018
8019 debug(printf("compute_scores_simd_16_byrows (lower): "));
8020 debug(printf("Lengths are %d and %d, so band is %d on left\n",rlength,glength,lband));
8021 debug(printf("Genome length rounded up to %d\n",glength_ceil));
8022
8023 matrix = aligned_score16_alloc(glength_ceil,rlength,
8024 this->aligned_std.two.lower_matrix_ptrs,this->aligned_std.two.lower_matrix_space);
8025 *directions_nogap = aligned_directions16_alloc(glength_ceil,rlength,
8026 this->aligned_std.two.lower_directions_ptrs_0,this->aligned_std.two.lower_directions_space_0);
8027 *directions_Egap = aligned_directions16_alloc(glength_ceil,rlength,
8028 this->aligned_std.two.lower_directions_ptrs_1,this->aligned_std.two.lower_directions_space_1);
8029
8030 #if 0
8031 /* Column 0 initialization */
8032 /* penalty = open; */
8033 for (r = 1; r <= lband && r <= rlength; r++) {
8034 /* penalty += extend; */
8035 (*directions_Egap)[r][0] = VERT;
8036 (*directions_nogap)[r][0] = VERT;
8037 }
8038 #endif
8039 #if 0
8040 /* Already initialized to DIAG. Actually, no longer initializing directions_Egap */
8041 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
8042 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
8043 #endif
8044 #if 0
8045 /* Row 0 initialization */
8046 /* penalty = open; */
8047 for (c = 1; c <= SIMD_NSHORTS_NONAVX2 && c <= glength; c++) {
8048 /* penalty += extend; */
8049 (*directions_nogap)[0][c] = HORIZ;
8050 }
8051 #endif
8052
8053
8054 /* Load pairscores. Store match - mismatch */
8055 pairscores[0] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8056 pairscores[1] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8057 pairscores[2] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8058 pairscores[3] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8059 pairscores[4] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8060
8061 #if 0
8062 /* Should not be necessary */
8063 memset((void *) pairscores[0],0,glength_ceil*sizeof(Score16_T));
8064 memset((void *) pairscores[1],0,glength_ceil*sizeof(Score16_T));
8065 memset((void *) pairscores[2],0,glength_ceil*sizeof(Score16_T));
8066 memset((void *) pairscores[3],0,glength_ceil*sizeof(Score16_T));
8067 memset((void *) pairscores[4],0,glength_ceil*sizeof(Score16_T));
8068 #endif
8069
8070 c = 0; na2 = na2_alt = 'N';
8071 pairscores[0][c] = (Score16_T) pairdistance_array_type[(int) 'A'][na2];
8072 pairscores[1][c] = (Score16_T) pairdistance_array_type[(int) 'C'][na2];
8073 pairscores[2][c] = (Score16_T) pairdistance_array_type[(int) 'G'][na2];
8074 pairscores[3][c] = (Score16_T) pairdistance_array_type[(int) 'T'][na2];
8075 pairscores[4][c] = (Score16_T) pairdistance_array_type[(int) 'N'][na2];
8076
8077 if (revp == false) {
8078 for (c = 1; c <= glength; c++) {
8079 na2 = gsequence[c-1];
8080 na2_alt = gsequence_alt[c-1];
8081 /* Take max here */
8082 score1 = pairdistance_array_type[(int) 'A'][na2];
8083 score2 = pairdistance_array_type[(int) 'A'][na2_alt];
8084 pairscores[0][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8085
8086 score1 = pairdistance_array_type[(int) 'C'][na2];
8087 score2 = pairdistance_array_type[(int) 'C'][na2_alt];
8088 pairscores[1][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8089
8090 score1 = pairdistance_array_type[(int) 'G'][na2];
8091 score2 = pairdistance_array_type[(int) 'G'][na2_alt];
8092 pairscores[2][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8093
8094 score1 = pairdistance_array_type[(int) 'T'][na2];
8095 score2 = pairdistance_array_type[(int) 'T'][na2_alt];
8096 pairscores[3][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8097
8098 score1 = pairdistance_array_type[(int) 'N'][na2];
8099 score2 = pairdistance_array_type[(int) 'N'][na2_alt];
8100 pairscores[4][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8101 }
8102 } else {
8103 for (c = 1; c <= glength; c++) {
8104 na2 = gsequence[1-c];
8105 na2_alt = gsequence_alt[1-c];
8106 /* Take max here */
8107 score1 = pairdistance_array_type[(int) 'A'][na2];
8108 score2 = pairdistance_array_type[(int) 'A'][na2_alt];
8109 pairscores[0][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8110
8111 score1 = pairdistance_array_type[(int) 'C'][na2];
8112 score2 = pairdistance_array_type[(int) 'C'][na2_alt];
8113 pairscores[1][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8114
8115 score1 = pairdistance_array_type[(int) 'G'][na2];
8116 score2 = pairdistance_array_type[(int) 'G'][na2_alt];
8117 pairscores[2][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8118
8119 score1 = pairdistance_array_type[(int) 'T'][na2];
8120 score2 = pairdistance_array_type[(int) 'T'][na2_alt];
8121 pairscores[3][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8122
8123 score1 = pairdistance_array_type[(int) 'N'][na2];
8124 score2 = pairdistance_array_type[(int) 'N'][na2_alt];
8125 pairscores[4][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8126 }
8127 }
8128
8129 #if 0
8130 /* Should not be necessary */
8131 memset((void *) &(pairscores[0][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8132 memset((void *) &(pairscores[1][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8133 memset((void *) &(pairscores[2][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8134 memset((void *) &(pairscores[3][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8135 memset((void *) &(pairscores[4][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8136 #endif
8137
8138 complement_dummy = _mm_set1_epi16(-1);
8139
8140 gap_open = _mm_set1_epi16((Score16_T) open);
8141 gap_extend = _mm_set1_epi16((Score16_T) extend);
8142
8143 E_infinity = _mm_set1_epi16(POS_INFINITY_16);
8144 if (jump_late_p) {
8145 for (clo = 0; clo <= glength; clo += SIMD_NSHORTS_NONAVX2) {
8146 if ((chigh = clo + SIMD_NSHORTS_NONAVX2 - 1) > glength) {
8147 chigh = glength;
8148 }
8149
8150 /* dir_vert tests if E >= H. To fill in first row of each
8151 column block with non-diags, make E == H. */
8152 E_mask = _mm_set1_epi16(1);
8153
8154 E_c_gap = _mm_set1_epi16(NEG_INFINITY_16);
8155 H_nogap_c = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
8156
8157 for (r = clo; r <= chigh + lband && r <= rlength; r++) {
8158 score_column = matrix[r];
8159
8160 if (r == 0) {
8161 na1 = 4; /* 'N' */
8162 } else {
8163 na1 = revp ? nt_to_int_array[rsequence[1-r]] : nt_to_int_array[rsequence[r-1]];
8164 }
8165 pairscores_ptr = pairscores[na1];
8166
8167 if (r == 0) {
8168 X_prev_nogap = _mm_set1_epi16(0);
8169 } else if (clo == 0) {
8170 #ifdef ZERO_INITIAL_GAP_PENALTY
8171 X_prev_nogap = _mm_set1_epi16(0);
8172 #else
8173 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
8174 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
8175 #endif
8176 } else {
8177 /* second or greater block of 16 */
8178 X_prev_nogap = _mm_set1_epi16(matrix[r-1][clo-1]); /* get H from previous block and previous column */
8179 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
8180 }
8181
8182 debug15(print_vector_16(E_mask,clo,r,"E_mask"));
8183 E_c_gap = _mm_min_epi16(E_c_gap,_mm_add_epi16(E_mask,E_infinity));
8184 debug15(print_vector_16(E_c_gap,clo,r,"E_c_gap"));
8185 debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c load"));
8186
8187 /* EGAP */
8188 T1 = _mm_adds_epi16(H_nogap_c, gap_open);
8189 dir_vert = _mm_cmplt_epi16(E_c_gap,T1); /* E < H */
8190 dir_vert = _mm_andnot_si128(dir_vert,complement_dummy); /* E >= H, for jump late */
8191 _mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
8192 debug15(print_vector_16(T1,clo,r,"T1"));
8193 debug15(print_vector_16(dir_vert,clo,r,"dir_Egap"));
8194
8195 E_c_gap = _mm_max_epi16(E_c_gap, T1); /* Compare H + open with vert */
8196 E_c_gap = _mm_adds_epi16(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
8197 E_c_gap = _mm_min_epi16(E_c_gap,_mm_add_epi16(E_mask,E_infinity));
8198 debug15(print_vector_16(E_c_gap,clo,r,"E"));
8199
8200
8201 /* NOGAP */
8202 T1 = _mm_srli_si128(H_nogap_c,LAST_SHORT_NONAVX2);
8203 H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_SHORT);
8204 H_nogap_c = _mm_or_si128(H_nogap_c, X_prev_nogap);
8205 X_prev_nogap = T1;
8206
8207 /* Add pairscores. No alternate chars for query sequence. */
8208 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
8209 H_nogap_c = _mm_adds_epi16(H_nogap_c, pairscores_std);
8210 _mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
8211 debug15(print_vector_16(H_nogap_c,clo,r,"H"));
8212
8213 dir_vert = _mm_cmplt_epi16(E_c_gap,H_nogap_c); /* E < H */
8214 dir_vert = _mm_andnot_si128(dir_vert,complement_dummy); /* E >= H, for jump late */
8215 _mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
8216 debug15(print_vector_16(dir_vert,clo,r,"dir_nogap"));
8217
8218 H_nogap_c = _mm_max_epi16(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
8219 debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c store"));
8220 _mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
8221
8222
8223 /* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
8224 if (chigh >= r) {
8225 (*directions_Egap)[r][r] = DIAG;
8226 (*directions_nogap)[r][r] = DIAG;
8227 }
8228
8229 /* No need for F loop here */
8230 E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
8231 }
8232 }
8233
8234 } else {
8235 /* jump early */
8236 for (clo = 0; clo <= glength; clo += SIMD_NSHORTS_NONAVX2) {
8237 if ((chigh = clo + SIMD_NSHORTS_NONAVX2 - 1) > glength) {
8238 chigh = glength;
8239 }
8240
8241 /* dir_vert tests if E > H. To fill in first row of each
8242 column block with non-diags, make E > H. */
8243 E_mask = _mm_set1_epi16(1);
8244
8245 E_c_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
8246 H_nogap_c = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
8247
8248 for (r = clo; r <= chigh + lband && r <= rlength; r++) {
8249 score_column = matrix[r];
8250
8251 if (r == 0) {
8252 na1 = 4; /* 'N' */
8253 } else {
8254 na1 = revp ? nt_to_int_array[rsequence[1-r]] : nt_to_int_array[rsequence[r-1]];
8255 }
8256 pairscores_ptr = pairscores[na1];
8257
8258 if (r == 0) {
8259 X_prev_nogap = _mm_set1_epi16(0);
8260 } else if (clo == 0) {
8261 #ifdef ZERO_INITIAL_GAP_PENALTY
8262 X_prev_nogap = _mm_set1_epi16(0);
8263 #else
8264 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
8265 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
8266 #endif
8267 } else {
8268 /* second or greater block of 16 */
8269 X_prev_nogap = _mm_set1_epi16(matrix[r-1][clo-1]); /* get H from previous block and previous column */
8270 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
8271 }
8272
8273 debug15(print_vector_16(E_mask,clo,r,"E_mask"));
8274 E_c_gap = _mm_min_epi16(E_c_gap,_mm_add_epi16(E_mask,E_infinity));
8275 debug15(print_vector_16(E_c_gap,clo,r,"E_c_gap"));
8276 debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c load"));
8277
8278 /* EGAP */
8279 T1 = _mm_adds_epi16(H_nogap_c, gap_open);
8280 dir_vert = _mm_cmpgt_epi16(E_c_gap,T1); /* E > H, for jump early */
8281 _mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
8282 debug15(print_vector_16(T1,clo,r,"T1"));
8283 debug15(print_vector_16(dir_vert,clo,r,"dir_Egap"));
8284
8285 E_c_gap = _mm_max_epi16(E_c_gap, T1); /* Compare H + open with vert */
8286 E_c_gap = _mm_adds_epi16(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
8287 E_c_gap = _mm_min_epi16(E_c_gap,_mm_add_epi16(E_mask,E_infinity));
8288 debug15(print_vector_16(E_c_gap,clo,r,"E"));
8289
8290
8291 /* NOGAP */
8292 T1 = _mm_srli_si128(H_nogap_c,LAST_SHORT_NONAVX2);
8293 H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_SHORT);
8294 H_nogap_c = _mm_or_si128(H_nogap_c, X_prev_nogap);
8295 X_prev_nogap = T1;
8296
8297 /* Add pairscores. No alternate chars for query sequence */
8298 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
8299 H_nogap_c = _mm_adds_epi16(H_nogap_c, pairscores_std);
8300 _mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
8301 debug15(print_vector_16(H_nogap_c,clo,r,"H"));
8302
8303 dir_vert = _mm_cmpgt_epi16(E_c_gap,H_nogap_c); /* E > H, for jump early */
8304 _mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
8305 debug15(print_vector_16(dir_vert,clo,r,"dir_nogap"));
8306
8307 H_nogap_c = _mm_max_epi16(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
8308 debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c store"));
8309 _mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
8310
8311
8312 /* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
8313 if (chigh >= r) {
8314 (*directions_Egap)[r][r] = DIAG;
8315 (*directions_nogap)[r][r] = DIAG;
8316 }
8317
8318 /* No need for F loop here */
8319 E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
8320 }
8321 }
8322 }
8323
8324
8325 #ifdef CHECK1
8326 /* Row 0 and column 0 directions fail anyway due to saturation */
8327 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
8328 (*directions_Egap)[1][0] = VERT;
8329 #endif
8330
8331 #ifdef DEBUG2
8332 printf("SIMD: Dynprog_simd_16_lower\n");
8333 Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
8334 revp,lband,/*upperp*/false);
8335 Directions16_print_ud(*directions_nogap,*directions_Egap,
8336 rlength,glength,rsequence,gsequence,gsequence_alt,
8337 revp,lband,/*upperp*/false);
8338 #endif
8339
8340 #ifdef CHECK1
8341 /* Check for column 0 directions */
8342 for (r = 1; r <= lband && r <= rlength; r++) {
8343 assert((*directions_Egap)[r][0] != DIAG);
8344 assert((*directions_nogap)[r][0] != DIAG);
8345 }
8346 #endif
8347
8348 _mm_free(pairscores[4]);
8349 _mm_free(pairscores[3]);
8350 _mm_free(pairscores[2]);
8351 _mm_free(pairscores[1]);
8352 _mm_free(pairscores[0]);
8353
8354 return matrix;
8355 }
8356 #endif
8357
8358
8359 #ifdef HAVE_SSE2
8360 /* Designed for computation below the diagonal, so no F loop or bottom masking needed */
8361 /* Operates by rows */
8362 Score16_T **
Dynprog_simd_16_lower(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,bool jump_late_p,bool revp)8363 Dynprog_simd_16_lower (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
8364 T this, char *rsequence, char *gsequence, char *gsequence_alt,
8365 int rlength, int glength,
8366 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
8367 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
8368 #endif
8369 Mismatchtype_T mismatchtype, int open, int extend,
8370 int lband, bool jump_late_p, bool revp) {
8371 Score16_T **matrix, *score_column;
8372 #ifdef HAVE_AVX2
8373 __m256i pairscores_std;
8374 __m256i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, E_infinity, T1;
8375 __m256i gap_open, gap_extend, complement_dummy;
8376 __m256i dir_vert;
8377 Score16_T save;
8378 #else
8379 __m128i pairscores_std;
8380 __m128i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, E_infinity, T1;
8381 __m128i gap_open, gap_extend, complement_dummy;
8382 __m128i dir_vert;
8383 #endif
8384 int glength_ceil, r, c;
8385 int clo, chigh;
8386 int na1, na2, na2_alt;
8387 Score16_T *pairscores[5], *pairscores_ptr;
8388 Pairdistance_T **pairdistance_array_type, score1, score2;
8389
8390 #ifdef DEBUG_AVX2
8391 Score16_T **matrix_std;
8392 Direction16_T **directions_nogap_std, **directions_Egap_std;
8393 char na2_single;
8394 #elif defined(DEBUG_SIMD)
8395 Score32_T **matrix_std;
8396 Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
8397 char na2_single;
8398 #endif
8399
8400
8401 debug2(printf("Dynprog_simd_16_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
8402 debug15(printf("Dynprog_simd_16_lower. jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
8403
8404 glength_ceil = (int) ((glength + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
8405 pairdistance_array_type = pairdistance_array[mismatchtype];
8406
8407 debug(printf("compute_scores_simd_16_byrows (lower): "));
8408 debug(printf("Lengths are %d and %d, so band is %d on left\n",rlength,glength,lband));
8409 debug(printf("Genome length rounded up to %d\n",glength_ceil));
8410
8411 matrix = aligned_score16_alloc(glength_ceil,rlength,
8412 this->aligned.two.lower_matrix_ptrs,this->aligned.two.lower_matrix_space);
8413 *directions_nogap = aligned_directions16_alloc(glength_ceil,rlength,
8414 this->aligned.two.lower_directions_ptrs_0,this->aligned.two.lower_directions_space_0);
8415 *directions_Egap = aligned_directions16_alloc(glength_ceil,rlength,
8416 this->aligned.two.lower_directions_ptrs_1,this->aligned.two.lower_directions_space_1);
8417
8418 #if 0
8419 /* Column 0 initialization */
8420 /* penalty = open; */
8421 for (r = 1; r <= lband && r <= rlength; r++) {
8422 /* penalty += extend; */
8423 (*directions_Egap)[r][0] = VERT;
8424 (*directions_nogap)[r][0] = VERT;
8425 }
8426 #endif
8427 #if 0
8428 /* Already initialized to DIAG. Actually, no longer initializing directions_Egap */
8429 (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
8430 (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
8431 #endif
8432 #if 0
8433 /* Row 0 initialization */
8434 /* penalty = open; */
8435 for (c = 1; c <= SIMD_NSHORTS && c <= glength; c++) {
8436 /* penalty += extend; */
8437 (*directions_nogap)[0][c] = HORIZ;
8438 }
8439 #endif
8440
8441
8442 /* Load pairscores. Store match - mismatch */
8443 pairscores[0] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8444 pairscores[1] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8445 pairscores[2] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8446 pairscores[3] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8447 pairscores[4] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8448
8449 #if 0
8450 /* Should not be necessary */
8451 memset((void *) pairscores[0],0,glength_ceil*sizeof(Score16_T));
8452 memset((void *) pairscores[1],0,glength_ceil*sizeof(Score16_T));
8453 memset((void *) pairscores[2],0,glength_ceil*sizeof(Score16_T));
8454 memset((void *) pairscores[3],0,glength_ceil*sizeof(Score16_T));
8455 memset((void *) pairscores[4],0,glength_ceil*sizeof(Score16_T));
8456 #endif
8457
8458 c = 0; na2 = na2_alt = 'N';
8459 pairscores[0][c] = (Score16_T) pairdistance_array_type[(int) 'A'][na2];
8460 pairscores[1][c] = (Score16_T) pairdistance_array_type[(int) 'C'][na2];
8461 pairscores[2][c] = (Score16_T) pairdistance_array_type[(int) 'G'][na2];
8462 pairscores[3][c] = (Score16_T) pairdistance_array_type[(int) 'T'][na2];
8463 pairscores[4][c] = (Score16_T) pairdistance_array_type[(int) 'N'][na2];
8464
8465 if (revp == false) {
8466 for (c = 1; c <= glength; c++) {
8467 na2 = gsequence[c-1];
8468 na2_alt = gsequence_alt[c-1];
8469 /* Take max here */
8470 score1 = pairdistance_array_type[(int) 'A'][na2];
8471 score2 = pairdistance_array_type[(int) 'A'][na2_alt];
8472 pairscores[0][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8473
8474 score1 = pairdistance_array_type[(int) 'C'][na2];
8475 score2 = pairdistance_array_type[(int) 'C'][na2_alt];
8476 pairscores[1][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8477
8478 score1 = pairdistance_array_type[(int) 'G'][na2];
8479 score2 = pairdistance_array_type[(int) 'G'][na2_alt];
8480 pairscores[2][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8481
8482 score1 = pairdistance_array_type[(int) 'T'][na2];
8483 score2 = pairdistance_array_type[(int) 'T'][na2_alt];
8484 pairscores[3][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8485
8486 score1 = pairdistance_array_type[(int) 'N'][na2];
8487 score2 = pairdistance_array_type[(int) 'N'][na2_alt];
8488 pairscores[4][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8489 }
8490 } else {
8491 for (c = 1; c <= glength; c++) {
8492 na2 = gsequence[1-c];
8493 na2_alt = gsequence_alt[1-c];
8494 /* Take max here */
8495 score1 = pairdistance_array_type[(int) 'A'][na2];
8496 score2 = pairdistance_array_type[(int) 'A'][na2_alt];
8497 pairscores[0][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8498
8499 score1 = pairdistance_array_type[(int) 'C'][na2];
8500 score2 = pairdistance_array_type[(int) 'C'][na2_alt];
8501 pairscores[1][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8502
8503 score1 = pairdistance_array_type[(int) 'G'][na2];
8504 score2 = pairdistance_array_type[(int) 'G'][na2_alt];
8505 pairscores[2][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8506
8507 score1 = pairdistance_array_type[(int) 'T'][na2];
8508 score2 = pairdistance_array_type[(int) 'T'][na2_alt];
8509 pairscores[3][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8510
8511 score1 = pairdistance_array_type[(int) 'N'][na2];
8512 score2 = pairdistance_array_type[(int) 'N'][na2_alt];
8513 pairscores[4][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8514 }
8515 }
8516
8517 #if 0
8518 /* Should not be necessary */
8519 memset((void *) &(pairscores[0][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8520 memset((void *) &(pairscores[1][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8521 memset((void *) &(pairscores[2][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8522 memset((void *) &(pairscores[3][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8523 memset((void *) &(pairscores[4][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8524 #endif
8525
8526 complement_dummy = _MM_SET1_EPI16(-1);
8527
8528 gap_open = _MM_SET1_EPI16((Score16_T) open);
8529 gap_extend = _MM_SET1_EPI16((Score16_T) extend);
8530
8531 E_infinity = _MM_SET1_EPI16(POS_INFINITY_16);
8532 if (jump_late_p) {
8533 for (clo = 0; clo <= glength; clo += SIMD_NSHORTS) {
8534 if ((chigh = clo + SIMD_NSHORTS - 1) > glength) {
8535 chigh = glength;
8536 }
8537
8538 /* dir_vert tests if E >= H. To fill in first row of each
8539 column block with non-diags, make E == H. */
8540 E_mask = _MM_SET1_EPI16(1);
8541
8542 /* Holds for all INITIAL_GAP_PENALTY */
8543 E_c_gap = _MM_SET1_EPI16(NEG_INFINITY_16);
8544 H_nogap_c = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
8545
8546 for (r = clo; r <= chigh + lband && r <= rlength; r++) {
8547 score_column = matrix[r];
8548
8549 if (r == 0) {
8550 na1 = 4; /* 'N' */
8551 } else {
8552 na1 = revp ? nt_to_int_array[(int) rsequence[1-r]] : nt_to_int_array[(int) rsequence[r-1]];
8553 }
8554 pairscores_ptr = pairscores[na1];
8555
8556 if (r == 0) {
8557 X_prev_nogap = _MM_SETZERO_SI();
8558 } else if (clo == 0) {
8559 #ifdef ZERO_INITIAL_GAP_PENALTY
8560 X_prev_nogap = _MM_SETZERO_SI();
8561 #elif defined(HAVE_AVX2)
8562 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
8563 #else
8564 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
8565 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
8566 #endif
8567 } else {
8568 /* second or greater block of 16 */
8569 #ifdef ZERO_INITIAL_GAP_PENALTY
8570 X_prev_nogap = _MM_SETZERO_SI();
8571 #elif defined(HAVE_AVX2)
8572 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[r-1][clo-1],LAST_SHORT_INSERT);
8573 #else
8574 X_prev_nogap = _mm_set1_epi16(matrix[r-1][clo-1]); /* get H from previous block and previous column */
8575 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
8576 #endif
8577 }
8578
8579 debug15(print_vector_16(E_mask,clo,r,"E_mask"));
8580 E_c_gap = _MM_MIN_EPI16(E_c_gap,_MM_ADD_EPI16(E_mask,E_infinity));
8581 debug15(print_vector_16(E_c_gap,clo,r,"E_c_gap"));
8582 debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c load"));
8583
8584 /* EGAP */
8585 T1 = _MM_ADDS_EPI16(H_nogap_c, gap_open);
8586 dir_vert = _MM_CMPLT_EPI16(E_c_gap,T1); /* E < H */
8587 dir_vert = _MM_ANDNOT_SI(dir_vert,complement_dummy); /* E >= H, for jump late */
8588 #ifdef HAVE_AVX2
8589 _mm256_store_si256((__m256i *) &((*directions_Egap)[r][clo]),dir_vert);
8590 #else
8591 _mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
8592 #endif
8593 debug15(print_vector_16(T1,clo,r,"T1"));
8594 debug15(print_vector_16(dir_vert,clo,r,"dir_Egap"));
8595
8596 E_c_gap = _MM_MAX_EPI16(E_c_gap, T1); /* Compare H + open with vert */
8597 E_c_gap = _MM_ADDS_EPI16(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
8598 E_c_gap = _MM_MIN_EPI16(E_c_gap,_MM_ADD_EPI16(E_mask,E_infinity));
8599 debug15(print_vector_16(E_c_gap,clo,r,"E"));
8600
8601
8602 /* NOGAP */
8603 #ifdef HAVE_AVX2
8604 T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_c,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
8605 X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_c,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
8606 H_nogap_c = _mm256_slli_si256(H_nogap_c,ONE_SHORT);
8607 #else
8608 T1 = _mm_srli_si128(H_nogap_c,LAST_SHORT_SHIFT);
8609 H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_SHORT);
8610 #endif
8611 H_nogap_c = _MM_OR_SI(H_nogap_c, X_prev_nogap);
8612 X_prev_nogap = T1;
8613
8614 /* Add pairscores. No alternate chars for query sequence. */
8615 #ifdef HAVE_AVX2
8616 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_ptr[clo]));
8617 H_nogap_c = _mm256_adds_epi16(H_nogap_c, pairscores_std);
8618 #else
8619 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
8620 H_nogap_c = _mm_adds_epi16(H_nogap_c, pairscores_std);
8621 #endif
8622 _mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
8623 debug15(print_vector_16(H_nogap_c,clo,r,"H"));
8624
8625 dir_vert = _MM_CMPLT_EPI16(E_c_gap,H_nogap_c); /* E < H */
8626 dir_vert = _MM_ANDNOT_SI(dir_vert,complement_dummy); /* E >= H, for jump late */
8627 #ifdef HAVE_AVX2
8628 _mm256_store_si256((__m256i *) &((*directions_nogap)[r][clo]),dir_vert);
8629 #else
8630 _mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
8631 #endif
8632 debug15(print_vector_16(dir_vert,clo,r,"dir_nogap"));
8633
8634 H_nogap_c = _MM_MAX_EPI16(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
8635 debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c store"));
8636 #ifdef HAVE_AVX2
8637 _mm256_store_si256((__m256i *) &(score_column[clo]), H_nogap_c);
8638 #else
8639 _mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
8640 #endif
8641
8642
8643 /* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
8644 if (chigh >= r) {
8645 (*directions_Egap)[r][r] = DIAG;
8646 (*directions_nogap)[r][r] = DIAG;
8647 }
8648
8649 /* No need for F loop here */
8650 #ifdef HAVE_AVX2
8651 save = _mm256_extract_epi16(E_mask,7);
8652 E_mask = _mm256_slli_si256(E_mask,ONE_SHORT);
8653 E_mask = _mm256_insert_epi16(E_mask,save,8);
8654 #else
8655 E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
8656 #endif
8657 }
8658 }
8659
8660 } else {
8661 /* jump early */
8662 for (clo = 0; clo <= glength; clo += SIMD_NSHORTS) {
8663 if ((chigh = clo + SIMD_NSHORTS - 1) > glength) {
8664 chigh = glength;
8665 }
8666
8667 /* dir_vert tests if E > H. To fill in first row of each
8668 column block with non-diags, make E > H. */
8669 E_mask = _MM_SET1_EPI16(1);
8670
8671 /* Holds for all INITIAL_GAP_PENALTY */
8672 E_c_gap = _MM_SET1_EPI16(NEG_INFINITY_16+1);
8673 H_nogap_c = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
8674
8675 for (r = clo; r <= chigh + lband && r <= rlength; r++) {
8676 score_column = matrix[r];
8677
8678 if (r == 0) {
8679 na1 = 4; /* 'N' */
8680 } else {
8681 na1 = revp ? nt_to_int_array[(int) rsequence[1-r]] : nt_to_int_array[(int) rsequence[r-1]];
8682 }
8683 pairscores_ptr = pairscores[na1];
8684
8685 if (r == 0) {
8686 X_prev_nogap = _MM_SETZERO_SI();
8687 } else if (clo == 0) {
8688 #ifdef ZERO_INITIAL_GAP_PENALTY
8689 X_prev_nogap = _MM_SETZERO_SI();
8690 #elif defined(HAVE_AVX2)
8691 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
8692 #else
8693 X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
8694 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
8695 #endif
8696 } else {
8697 /* second or greater block of 16 */
8698 #ifdef ZERO_INITIAL_GAP_PENALTY
8699 X_prev_nogap = _MM_SETZERO_SI();
8700 #elif defined(HAVE_AVX2)
8701 X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[r-1][clo-1],LAST_SHORT_INSERT);
8702 #else
8703 X_prev_nogap = _mm_set1_epi16(matrix[r-1][clo-1]); /* get H from previous block and previous column */
8704 X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
8705 #endif
8706 }
8707
8708 debug15(print_vector_16(E_mask,clo,r,"E_mask"));
8709 E_c_gap = _MM_MIN_EPI16(E_c_gap,_MM_ADD_EPI16(E_mask,E_infinity));
8710 debug15(print_vector_16(E_c_gap,clo,r,"E_c_gap"));
8711 debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c load"));
8712
8713 /* EGAP */
8714 T1 = _MM_ADDS_EPI16(H_nogap_c, gap_open);
8715 dir_vert = _MM_CMPGT_EPI16(E_c_gap,T1); /* E > H, for jump early */
8716 #ifdef HAVE_AVX2
8717 _mm256_store_si256((__m256i *) &((*directions_Egap)[r][clo]),dir_vert);
8718 #else
8719 _mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
8720 #endif
8721 debug15(print_vector_16(T1,clo,r,"T1"));
8722 debug15(print_vector_16(dir_vert,clo,r,"dir_Egap"));
8723
8724 E_c_gap = _MM_MAX_EPI16(E_c_gap, T1); /* Compare H + open with vert */
8725 E_c_gap = _MM_ADDS_EPI16(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
8726 E_c_gap = _MM_MIN_EPI16(E_c_gap,_MM_ADD_EPI16(E_mask,E_infinity));
8727 debug15(print_vector_16(E_c_gap,clo,r,"E"));
8728
8729
8730 /* NOGAP */
8731 #ifdef HAVE_AVX2
8732 T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_c,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
8733 X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_c,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
8734 H_nogap_c = _mm256_slli_si256(H_nogap_c,ONE_SHORT);
8735 #else
8736 T1 = _mm_srli_si128(H_nogap_c,LAST_SHORT_SHIFT);
8737 H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_SHORT);
8738 #endif
8739 H_nogap_c = _MM_OR_SI(H_nogap_c, X_prev_nogap);
8740 X_prev_nogap = T1;
8741
8742 /* Add pairscores. No alternate chars for query sequence */
8743 #ifdef HAVE_AVX2
8744 pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_ptr[clo]));
8745 H_nogap_c = _mm256_adds_epi16(H_nogap_c, pairscores_std);
8746 #else
8747 pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
8748 H_nogap_c = _mm_adds_epi16(H_nogap_c, pairscores_std);
8749 #endif
8750 _mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
8751 debug15(print_vector_16(H_nogap_c,clo,r,"H"));
8752
8753 dir_vert = _MM_CMPGT_EPI16(E_c_gap,H_nogap_c); /* E > H, for jump early */
8754 #ifdef HAVE_AVX2
8755 _mm256_store_si256((__m256i *) &((*directions_nogap)[r][clo]),dir_vert);
8756 #else
8757 _mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
8758 #endif
8759 debug15(print_vector_16(dir_vert,clo,r,"dir_nogap"));
8760
8761 H_nogap_c = _MM_MAX_EPI16(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
8762 debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c store"));
8763 #ifdef HAVE_AVX2
8764 _mm256_store_si256((__m256i *) &(score_column[clo]), H_nogap_c);
8765 #else
8766 _mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
8767 #endif
8768
8769
8770 /* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
8771 if (chigh >= r) {
8772 (*directions_Egap)[r][r] = DIAG;
8773 (*directions_nogap)[r][r] = DIAG;
8774 }
8775
8776 /* No need for F loop here */
8777 #ifdef HAVE_AVX2
8778 save = _mm256_extract_epi16(E_mask,7);
8779 E_mask = _mm256_slli_si256(E_mask,ONE_SHORT);
8780 E_mask = _mm256_insert_epi16(E_mask,save,8);
8781 #else
8782 E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
8783 #endif
8784 }
8785 }
8786 }
8787
8788
8789 #ifdef CHECK1
8790 /* Row 0 and column 0 directions fail anyway due to saturation */
8791 /* Handle (0,1) and (1,0) directions, otherwise DIAG */
8792 (*directions_Egap)[1][0] = VERT;
8793 #endif
8794
8795 #ifdef DEBUG2
8796 printf("SIMD: Dynprog_simd_16_lower\n");
8797 Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
8798 revp,lband,/*upperp*/false);
8799 Directions16_print_ud(*directions_nogap,*directions_Egap,
8800 rlength,glength,rsequence,gsequence,gsequence_alt,
8801 revp,lband,/*upperp*/false);
8802 #endif
8803
8804 #ifdef CHECK1
8805 /* Check for column 0 directions */
8806 for (r = 1; r <= lband && r <= rlength; r++) {
8807 assert((*directions_Egap)[r][0] != DIAG);
8808 assert((*directions_nogap)[r][0] != DIAG);
8809 }
8810 #endif
8811
8812 #ifdef DEBUG_AVX2
8813 matrix_std = Dynprog_simd_16_lower_nonavx2(&directions_nogap_std,&directions_Egap_std,
8814 this,rsequence,gsequence,gsequence_alt,
8815 rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
8816 open,extend,lband,jump_late_p,revp);
8817 #elif defined(DEBUG_SIMD)
8818 matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
8819 this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
8820 rlength,glength,
8821 goffset,chroffset,chrhigh,watsonp,mismatchtype,
8822 open,extend,lband,/*uband*/0,jump_late_p,revp,/*saturation*/NEG_INFINITY_16,
8823 /*upperp*/false,/*lowerp*/true);
8824 #endif
8825
8826 #ifdef DEBUG2
8827 printf("Banded\n");
8828 Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
8829 revp,lband,/*upperp*/false);
8830 Directions16_print_ud(*directions_nogap,*directions_Egap,
8831 rlength,glength,rsequence,gsequence,gsequence_alt,
8832 revp,lband,/*upperp*/false);
8833 #endif
8834
8835 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
8836 banded_matrix16_compare_lower(matrix,matrix_std,rlength,glength,lband,
8837 rsequence,gsequence,gsequence_alt,
8838 goffset,chroffset,chrhigh,watsonp,revp);
8839
8840 banded_directions16_compare_nogap_lower(*directions_nogap,directions_nogap_std,rlength,glength,lband);
8841
8842 banded_directions16_compare_Egap_lower(*directions_Egap,directions_Egap_std,rlength,glength,lband);
8843 #endif
8844
8845 _mm_free(pairscores[4]);
8846 _mm_free(pairscores[3]);
8847 _mm_free(pairscores[2]);
8848 _mm_free(pairscores[1]);
8849 _mm_free(pairscores[0]);
8850
8851 return matrix;
8852 }
8853 #endif
8854
8855
8856 #ifdef DEBUG17
8857 static char complCode[128] = COMPLEMENT_LC;
8858
8859 static char
get_genomic_nt(char * g_alt,int genomicpos,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)8860 get_genomic_nt (char *g_alt, int genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
8861 bool watsonp) {
8862 char c2, c2_alt;
8863 Univcoord_T pos;
8864
8865 #if 0
8866 /* If the read has a deletion, then we will extend beyond 0 or genomiclength, so do not restrict. */
8867 if (genomicpos < 0) {
8868 return '*';
8869
8870 } else if (genomicpos >= genomiclength) {
8871 return '*';
8872
8873 }
8874 #endif
8875
8876 if (watsonp) {
8877 if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
8878 *g_alt = '*';
8879 return '*';
8880
8881 } else if (pos >= chrhigh) {
8882 *g_alt = '*';
8883 return '*';
8884
8885 #if 0
8886 } else if (genome) {
8887 /* Not necessary, because Genome_get_char_blocks should work */
8888 debug8(printf("At %u, genomicnt is %c\n",
8889 genomicpos,Genome_get_char(genome,pos)));
8890 return Genome_get_char(genome,pos);
8891 #endif
8892
8893 } else {
8894 /* GMAP with user-supplied genomic segment */
8895 debug8(printf("At %u, genomicnt is %c\n",
8896 genomicpos,Genome_get_char_blocks(pos)));
8897 return Genome_get_char_blocks(&(*g_alt),pos);
8898 }
8899
8900 } else {
8901 if ((pos = chrhigh - genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
8902 *g_alt = '*';
8903 return '*';
8904
8905 } else if (pos >= chrhigh) {
8906 *g_alt = '*';
8907 return '*';
8908
8909 #if 0
8910 } else if (genome) {
8911 /* Not necessary, because Genome_get_char_blocks should work */
8912 c2 = Genome_get_char(genome,pos);
8913 #endif
8914
8915 } else {
8916 /* GMAP with user-supplied genomic segment */
8917 c2 = Genome_get_char_blocks(&c2_alt,pos);
8918 }
8919 debug8(printf("At %u, genomicnt is %c\n",genomicpos,complCode[(int) c2]));
8920 *g_alt = complCode[(int) c2_alt];
8921 return complCode[(int) c2];
8922 }
8923 }
8924 #endif
8925
8926
8927 #if defined(HAVE_SSE4_1) || defined(HAVE_SSE2)
8928 List_T
Dynprog_traceback_8(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,Direction8_T ** directions_Fgap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,int dynprogindex)8929 Dynprog_traceback_8 (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
8930 Direction8_T **directions_nogap, Direction8_T **directions_Egap, Direction8_T **directions_Fgap,
8931 int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
8932 int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
8933 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
8934 int dynprogindex) {
8935 char c1, c1_uc, c2, c2_alt;
8936 int dist;
8937 bool add_dashes_p;
8938 int querycoord, genomecoord;
8939 Direction8_T dir;
8940 #ifdef DEBUG17
8941 char c2_single;
8942 #endif
8943
8944 debug(printf("Starting traceback_8 at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
8945
8946 while (r > 0 && c > 0) { /* dir != STOP */
8947 if ((dir = directions_nogap[c][r]) == HORIZ) {
8948 dist = 1;
8949 while (c > 0 && directions_Egap[c--][r] != DIAG) {
8950 dist++;
8951 }
8952 #if 0
8953 if (c == 0) {
8954 /* Directions in column 0 can sometimes be DIAG */
8955 dir = VERT;
8956 } else {
8957 printf("| "); /* For Fgap */
8958 dir = directions_nogap[c][r];
8959 }
8960 #endif
8961
8962 debug(printf("H%d: ",dist));
8963 pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,r,c+dist,dist,/*genomesequence*/NULL,
8964 queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
8965 watsonp,dynprogindex);
8966 if (add_dashes_p == true) {
8967 *nopens += 1;
8968 *nindels += dist;
8969 }
8970 debug(printf("\n"));
8971
8972 } else if (dir == VERT) {
8973 dist = 1;
8974 while (r > 0 && directions_Fgap[c][r--] != DIAG) {
8975 dist++;
8976 }
8977 #if 0
8978 if (r == 0) {
8979 /* Directions in row 0 can sometimes be DIAG */
8980 dir = HORIZ;
8981 } else {
8982 dir = directions_nogap[c][r];
8983 }
8984 #endif
8985
8986 debug(printf("V%d: ",dist));
8987 pairs = Pairpool_add_queryskip(pairs,r+dist,c,dist,rsequence,
8988 queryoffset,genomeoffset,pairpool,revp,
8989 dynprogindex);
8990 *nopens += 1;
8991 *nindels += dist;
8992 debug(printf("\n"));
8993
8994 } else if (dir == DIAG) {
8995 querycoord = r-1;
8996 genomecoord = c-1;
8997 if (revp == true) {
8998 querycoord = -querycoord;
8999 genomecoord = -genomecoord;
9000 }
9001
9002 c1 = rsequence[querycoord];
9003 c1_uc = rsequenceuc[querycoord];
9004 c2 = gsequence[genomecoord];
9005 c2_alt = gsequence_alt[genomecoord];
9006 #ifdef DEBUG17
9007 c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9008 if (c2 != c2_single) {
9009 abort();
9010 }
9011 #endif
9012
9013 #ifdef EXTRACT_GENOMICSEG
9014 assert(c2 == genomesequence[genomecoord]);
9015 #endif
9016
9017 if (c2 == '*') {
9018 /* Don't push pairs past end of chromosome */
9019 debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u, chroffset %u, chrhigh %u, watsonp %d\n",
9020 genomeoffset,genomecoord,chroffset,chrhigh,watsonp));
9021
9022 } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9023 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9024 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9025 *nmatches += 1;
9026 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9027 c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9028
9029 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9030 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9031 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9032 *nmatches += 1;
9033 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9034 c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9035
9036 } else {
9037 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9038 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9039 *nmismatches += 1;
9040 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9041 c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9042 }
9043
9044 r--; c--;
9045
9046 } else {
9047 fprintf(stderr,"Bad dir at r %d, c %d\n",r,c);
9048 abort();
9049 }
9050 }
9051
9052 if (r == 0 && c == 0) {
9053 /* Finished with a diagonal step */
9054
9055 } else if (c == 0) {
9056 dist = r;
9057 debug(printf("V%d: ",dist));
9058 pairs = Pairpool_add_queryskip(pairs,r,/*c*/0+LAZY_INDEL,dist,rsequence,
9059 queryoffset,genomeoffset,pairpool,revp,
9060 dynprogindex);
9061 *nopens += 1;
9062 *nindels += dist;
9063 debug(printf("\n"));
9064
9065 } else {
9066 assert(r == 0);
9067 dist = c;
9068 debug(printf("H%d: ",dist));
9069 pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,/*r*/0+LAZY_INDEL,c,dist,/*genomesequence*/NULL,
9070 queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9071 watsonp,dynprogindex);
9072 if (add_dashes_p == true) {
9073 *nopens += 1;
9074 *nindels += dist;
9075 }
9076 debug(printf("\n"));
9077 }
9078
9079 return pairs;
9080 }
9081 #endif
9082
9083
9084 #if defined(HAVE_SSE4_1) || defined(HAVE_SSE2)
9085 List_T
Dynprog_traceback_8_upper(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,int dynprogindex)9086 Dynprog_traceback_8_upper (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9087 Direction8_T **directions_nogap, Direction8_T **directions_Egap,
9088 int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9089 int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
9090 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
9091 int dynprogindex) {
9092 char c1, c1_uc, c2, c2_alt;
9093 int dist;
9094 bool add_dashes_p;
9095 int querycoord, genomecoord;
9096 Direction8_T dir;
9097 #ifdef DEBUG17
9098 char c2_single;
9099 #endif
9100
9101 debug(printf("Starting traceback_8_upper at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9102
9103 while (r > 0 && c > 0) { /* dir != STOP */
9104 if ((dir = directions_nogap[c][r]) != DIAG) {
9105 /* Must be HORIZ */
9106 dist = 1;
9107 /* Should not need to check for c > r if the Egap diagonal above the main is populated with DIAG */
9108 while (/* c > r && */ directions_Egap[c--][r] != DIAG) {
9109 dist++;
9110 }
9111 assert(c >= r);
9112
9113 debug(printf("H%d: ",dist));
9114 pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,r,c+dist,dist,/*genomesequence*/NULL,
9115 queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9116 watsonp,dynprogindex);
9117 if (add_dashes_p == true) {
9118 *nopens += 1;
9119 *nindels += dist;
9120 }
9121 debug(printf("\n"));
9122
9123 } else {
9124 querycoord = r-1;
9125 genomecoord = c-1;
9126 if (revp == true) {
9127 querycoord = -querycoord;
9128 genomecoord = -genomecoord;
9129 }
9130
9131 c1 = rsequence[querycoord];
9132 c1_uc = rsequenceuc[querycoord];
9133 c2 = gsequence[genomecoord];
9134 c2_alt = gsequence_alt[genomecoord];
9135 #ifdef DEBUG17
9136 c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9137 if (c2 != c2_single) {
9138 abort();
9139 }
9140 #endif
9141
9142 #ifdef EXTRACT_GENOMICSEG
9143 assert(c2 == genomesequence[genomecoord]);
9144 #endif
9145
9146 if (c2 == '*') {
9147 /* Don't push pairs past end of chromosome */
9148 debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u, chroffset %u, chrhigh %u, watsonp %d\n",
9149 genomeoffset,genomecoord,chroffset,chrhigh,watsonp));
9150
9151 } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9152 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9153 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9154 *nmatches += 1;
9155 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9156 c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9157
9158 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9159 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9160 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9161 *nmatches += 1;
9162 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9163 c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9164
9165 } else {
9166 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9167 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9168 *nmismatches += 1;
9169 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9170 c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9171 }
9172
9173 r--; c--;
9174 }
9175 }
9176
9177 assert(r == 0);
9178 if (/* r == 0 && */ c == 0) {
9179 /* Finished with a diagonal step */
9180
9181 } else {
9182 assert(c != 0);
9183 assert(r == 0);
9184 dist = c;
9185 debug(printf("H%d: ",dist));
9186 pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,/*r*/0+LAZY_INDEL,c,dist,/*genomesequence*/NULL,
9187 queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9188 watsonp,dynprogindex);
9189 if (add_dashes_p == true) {
9190 *nopens += 1;
9191 *nindels += dist;
9192 }
9193 debug(printf("\n"));
9194 }
9195
9196 return pairs;
9197 }
9198
9199 List_T
Dynprog_traceback_8_lower(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,int genestrand,bool revp,int dynprogindex)9200 Dynprog_traceback_8_lower (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9201 Direction8_T **directions_nogap, Direction8_T **directions_Egap,
9202 int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9203 int queryoffset, int genomeoffset, Pairpool_T pairpool,
9204 int genestrand, bool revp, int dynprogindex) {
9205 char c1, c1_uc, c2, c2_alt;
9206 int dist;
9207 int querycoord, genomecoord;
9208 Direction8_T dir;
9209 #ifdef DEBUG17
9210 char c2_single;
9211 #endif
9212
9213 debug(printf("Starting traceback_8_lower at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9214
9215 while (r > 0 && c > 0) { /* dir != STOP */
9216 if ((dir = directions_nogap[r][c]) != DIAG) {
9217 /* Must be VERT */
9218 dist = 1;
9219 /* Should not need to check for r > c if the Egap diagonal below the main is populated with DIAG */
9220 while (/* r > c && */ directions_Egap[r--][c] != DIAG) {
9221 dist++;
9222 }
9223 assert(r >= c);
9224
9225 debug(printf("V%d: ",dist));
9226 pairs = Pairpool_add_queryskip(pairs,r+dist,c,dist,rsequence,
9227 queryoffset,genomeoffset,pairpool,revp,
9228 dynprogindex);
9229 *nopens += 1;
9230 *nindels += dist;
9231 debug(printf("\n"));
9232
9233 } else {
9234 querycoord = r-1;
9235 genomecoord = c-1;
9236 if (revp == true) {
9237 querycoord = -querycoord;
9238 genomecoord = -genomecoord;
9239 }
9240
9241 c1 = rsequence[querycoord];
9242 c1_uc = rsequenceuc[querycoord];
9243 c2 = gsequence[genomecoord];
9244 c2_alt = gsequence_alt[genomecoord];
9245 #ifdef DEBUG17
9246 c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9247 if (c2 != c2_single) {
9248 abort();
9249 }
9250 #endif
9251
9252 #ifdef EXTRACT_GENOMICSEG
9253 assert(c2 == genomesequence[genomecoord]);
9254 #endif
9255
9256 if (c2 == '*') {
9257 /* Don't push pairs past end of chromosome */
9258 debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u\n",
9259 genomeoffset,genomecoord));
9260
9261 } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9262 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9263 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9264 *nmatches += 1;
9265 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9266 c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9267
9268 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9269 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9270 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9271 *nmatches += 1;
9272 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9273 c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9274
9275 } else {
9276 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9277 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9278 *nmismatches += 1;
9279 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9280 c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9281 }
9282
9283 r--; c--;
9284 }
9285 }
9286
9287 assert(c == 0);
9288 if (r == 0 /* && c == 0 */) {
9289 /* Finished with a diagonal step */
9290
9291 } else {
9292 assert(r != 0);
9293 assert(c == 0);
9294 dist = r;
9295 debug(printf("V%d: ",dist));
9296 pairs = Pairpool_add_queryskip(pairs,r,/*c*/0+LAZY_INDEL,dist,rsequence,
9297 queryoffset,genomeoffset,pairpool,revp,
9298 dynprogindex);
9299 *nopens += 1;
9300 *nindels += dist;
9301 debug(printf("\n"));
9302 }
9303
9304 return pairs;
9305 }
9306
9307
9308 List_T
Dynprog_traceback_16(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,Direction16_T ** directions_Fgap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,int dynprogindex)9309 Dynprog_traceback_16 (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9310 Direction16_T **directions_nogap, Direction16_T **directions_Egap, Direction16_T **directions_Fgap,
9311 int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9312 int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
9313 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
9314 int dynprogindex) {
9315 char c1, c1_uc, c2, c2_alt;
9316 int dist;
9317 bool add_dashes_p;
9318 int querycoord, genomecoord;
9319 Direction16_T dir;
9320 #ifdef DEBUG17
9321 char c2_single;
9322 #endif
9323
9324 debug(printf("Starting traceback_16 at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9325
9326 while (r > 0 && c > 0) { /* dir != STOP */
9327 if ((dir = directions_nogap[c][r]) == HORIZ) {
9328 dist = 1;
9329 while (c > 0 && directions_Egap[c--][r] != DIAG) {
9330 dist++;
9331 }
9332 #if 0
9333 if (c == 0) {
9334 /* Directions in column 0 can sometimes be DIAG */
9335 dir = VERT;
9336 } else {
9337 dir = directions_nogap[c][r];
9338 }
9339 #endif
9340
9341 debug(printf("H%d: ",dist));
9342 pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,r,c+dist,dist,/*genomesequence*/NULL,
9343 queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9344 watsonp,dynprogindex);
9345 if (add_dashes_p == true) {
9346 *nopens += 1;
9347 *nindels += dist;
9348 }
9349 debug(printf("\n"));
9350
9351 } else if (dir == VERT) {
9352 dist = 1;
9353 while (r > 0 && directions_Fgap[c][r--] != DIAG) {
9354 dist++;
9355 }
9356 #if 0
9357 if (r == 0) {
9358 /* Directions in row 0 can sometimes be DIAG */
9359 dir = HORIZ;
9360 } else {
9361 dir = directions_nogap[c][r];
9362 }
9363 #endif
9364
9365 debug(printf("V%d: ",dist));
9366 debug(printf("New dir at %d,%d is %d\n",c,r,dir));
9367 pairs = Pairpool_add_queryskip(pairs,r+dist,c,dist,rsequence,
9368 queryoffset,genomeoffset,pairpool,revp,
9369 dynprogindex);
9370 *nopens += 1;
9371 *nindels += dist;
9372 debug(printf("\n"));
9373
9374 } else if (dir == DIAG) {
9375 querycoord = r-1;
9376 genomecoord = c-1;
9377 if (revp == true) {
9378 querycoord = -querycoord;
9379 genomecoord = -genomecoord;
9380 }
9381
9382 c1 = rsequence[querycoord];
9383 c1_uc = rsequenceuc[querycoord];
9384 c2 = gsequence[genomecoord];
9385 c2_alt = gsequence_alt[genomecoord];
9386 #ifdef DEBUG17
9387 c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9388 if (c2 != c2_single) {
9389 abort();
9390 }
9391 #endif
9392
9393 #ifdef EXTRACT_GENOMICSEG
9394 assert(c2 == genomesequence[genomecoord]);
9395 #endif
9396
9397 if (c2 == '*') {
9398 /* Don't push pairs past end of chromosome */
9399 debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u, chroffset %u, chrhigh %u, watsonp %d\n",
9400 genomeoffset,genomecoord,chroffset,chrhigh,watsonp));
9401
9402 } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9403 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9404 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9405 *nmatches += 1;
9406 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9407 c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9408
9409 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9410 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9411 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9412 *nmatches += 1;
9413 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9414 c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9415
9416 } else {
9417 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9418 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9419 *nmismatches += 1;
9420 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9421 c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9422 }
9423
9424 r--; c--;
9425
9426 } else {
9427 fprintf(stderr,"Bad dir at r %d, c %d\n",r,c);
9428 abort();
9429 }
9430 }
9431
9432 if (r == 0 && c == 0) {
9433 /* Finished with a diagonal step */
9434
9435 } else if (c == 0) {
9436 dist = r;
9437 debug(printf("V%d: ",dist));
9438 pairs = Pairpool_add_queryskip(pairs,r,/*c*/0+LAZY_INDEL,dist,rsequence,
9439 queryoffset,genomeoffset,pairpool,revp,
9440 dynprogindex);
9441 *nopens += 1;
9442 *nindels += dist;
9443 debug(printf("\n"));
9444
9445 } else {
9446 assert(r == 0);
9447 dist = c;
9448 debug(printf("H%d: ",dist));
9449 pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,/*r*/0+LAZY_INDEL,c,dist,/*genomesequence*/NULL,
9450 queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9451 watsonp,dynprogindex);
9452 if (add_dashes_p == true) {
9453 *nopens += 1;
9454 *nindels += dist;
9455 }
9456 debug(printf("\n"));
9457 }
9458
9459 return pairs;
9460 }
9461
9462
9463 List_T
Dynprog_traceback_16_upper(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,int dynprogindex)9464 Dynprog_traceback_16_upper (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9465 Direction16_T **directions_nogap, Direction16_T **directions_Egap,
9466 int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9467 int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
9468 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
9469 int dynprogindex) {
9470 char c1, c1_uc, c2, c2_alt;
9471 int dist;
9472 bool add_dashes_p;
9473 int querycoord, genomecoord;
9474 Direction16_T dir;
9475 #ifdef DEBUG17
9476 char c2_single;
9477 #endif
9478
9479 debug(printf("Starting traceback_16_upper at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9480
9481 while (r > 0 && c > 0) { /* dir != STOP */
9482 if ((dir = directions_nogap[c][r]) != DIAG) {
9483 /* Must be HORIZ */
9484 dist = 1;
9485 /* Should not need to check for c > r if the Egap diagonal above the main is populated with DIAG */
9486 while (/* c > r && */ directions_Egap[c--][r] != DIAG) {
9487 dist++;
9488 }
9489 assert(c >= r);
9490
9491 debug(printf("H%d: ",dist));
9492 pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,r,c+dist,dist,/*genomesequence*/NULL,
9493 queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9494 watsonp,dynprogindex);
9495 if (add_dashes_p == true) {
9496 *nopens += 1;
9497 *nindels += dist;
9498 }
9499 debug(printf("\n"));
9500
9501 } else {
9502 querycoord = r-1;
9503 genomecoord = c-1;
9504 if (revp == true) {
9505 querycoord = -querycoord;
9506 genomecoord = -genomecoord;
9507 }
9508
9509 c1 = rsequence[querycoord];
9510 c1_uc = rsequenceuc[querycoord];
9511 c2 = gsequence[genomecoord];
9512 c2_alt = gsequence_alt[genomecoord];
9513 #ifdef DEBUG17
9514 c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9515 if (c2 != c2_single) {
9516 abort();
9517 }
9518 #endif
9519
9520 #ifdef EXTRACT_GENOMICSEG
9521 assert(c2 == genomesequence[genomecoord]);
9522 #endif
9523
9524 if (c2 == '*') {
9525 /* Don't push pairs past end of chromosome */
9526 debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u, chroffset %u, chrhigh %u, watsonp %d\n",
9527 genomeoffset,genomecoord,chroffset,chrhigh,watsonp));
9528
9529 } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9530 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9531 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9532 *nmatches += 1;
9533 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9534 c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9535
9536 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9537 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9538 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9539 *nmatches += 1;
9540 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9541 c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9542
9543 } else {
9544 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9545 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9546 *nmismatches += 1;
9547 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9548 c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9549 }
9550
9551 r--; c--;
9552 }
9553 }
9554
9555 assert(r == 0);
9556 if (/* r == 0 && */ c == 0) {
9557 /* Finished with a diagonal step */
9558
9559 } else {
9560 assert(c != 0);
9561 assert(r == 0);
9562 dist = c;
9563 debug(printf("H%d: ",dist));
9564 pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,/*r*/0+LAZY_INDEL,c,dist,/*genomesequence*/NULL,
9565 queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9566 watsonp,dynprogindex);
9567 if (add_dashes_p == true) {
9568 *nopens += 1;
9569 *nindels += dist;
9570 }
9571 debug(printf("\n"));
9572 }
9573
9574 return pairs;
9575 }
9576
9577 List_T
Dynprog_traceback_16_lower(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,int genestrand,bool revp,int dynprogindex)9578 Dynprog_traceback_16_lower (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9579 Direction16_T **directions_nogap, Direction16_T **directions_Egap,
9580 int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9581 int queryoffset, int genomeoffset, Pairpool_T pairpool,
9582 int genestrand, bool revp, int dynprogindex) {
9583 char c1, c1_uc, c2, c2_alt;
9584 int dist;
9585 int querycoord, genomecoord;
9586 Direction16_T dir;
9587 #ifdef DEBUG17
9588 char c2_single;
9589 #endif
9590
9591 debug(printf("Starting traceback_16_lower at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9592
9593 while (r > 0 && c > 0) { /* dir != STOP */
9594 if ((dir = directions_nogap[r][c]) != DIAG) {
9595 /* Must be VERT */
9596 dist = 1;
9597 /* Should not need to check for r > c if the Egap diagonal below the main is populated with DIAG */
9598 while (/* r > c && */ directions_Egap[r--][c] != DIAG) {
9599 dist++;
9600 }
9601 assert(r >= c);
9602
9603 debug(printf("V%d: ",dist));
9604 pairs = Pairpool_add_queryskip(pairs,r+dist,c,dist,rsequence,
9605 queryoffset,genomeoffset,pairpool,revp,
9606 dynprogindex);
9607 *nopens += 1;
9608 *nindels += dist;
9609 debug(printf("\n"));
9610
9611 } else {
9612 querycoord = r-1;
9613 genomecoord = c-1;
9614 if (revp == true) {
9615 querycoord = -querycoord;
9616 genomecoord = -genomecoord;
9617 }
9618
9619 c1 = rsequence[querycoord];
9620 c1_uc = rsequenceuc[querycoord];
9621 c2 = gsequence[genomecoord];
9622 c2_alt = gsequence_alt[genomecoord];
9623 #ifdef DEBUG17
9624 c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9625 if (c2 != c2_single) {
9626 abort();
9627 }
9628 #endif
9629
9630 #ifdef EXTRACT_GENOMICSEG
9631 assert(c2 == genomesequence[genomecoord]);
9632 #endif
9633
9634 if (c2 == '*') {
9635 /* Don't push pairs past end of chromosome */
9636 debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u\n",
9637 genomeoffset,genomecoord));
9638
9639 } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9640 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9641 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9642 *nmatches += 1;
9643 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9644 c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9645
9646 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9647 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9648 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9649 *nmatches += 1;
9650 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9651 c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9652
9653 } else {
9654 debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9655 r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9656 *nmismatches += 1;
9657 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9658 c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9659 }
9660
9661 r--; c--;
9662 }
9663 }
9664
9665 assert(c == 0);
9666 if (r == 0 /* && c == 0 */) {
9667 /* Finished with a diagonal step */
9668
9669 } else {
9670 assert(r != 0);
9671 assert(c == 0);
9672 dist = r;
9673 debug(printf("V%d: ",dist));
9674 pairs = Pairpool_add_queryskip(pairs,r,/*c*/0+LAZY_INDEL,dist,rsequence,
9675 queryoffset,genomeoffset,pairpool,revp,
9676 dynprogindex);
9677 *nopens += 1;
9678 *nindels += dist;
9679 debug(printf("\n"));
9680 }
9681
9682 return pairs;
9683 }
9684 #endif
9685
9686
9687