1 static char rcsid[] = "$Id: dynprog_simd.c 214361 2018-03-21 01:24:28Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "dynprog_simd.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <math.h>		/* For ceil, log, pow */
11 #include <ctype.h>		/* For tolower */
12 
13 #ifdef HAVE_SSE2
14 #include <emmintrin.h>
15 #endif
16 #ifdef HAVE_SSE4_1
17 #include <smmintrin.h>
18 #endif
19 #ifdef HAVE_AVX2
20 #include <immintrin.h>
21 #endif
22 
23 #include "mem.h"
24 #include "comp.h"
25 #include "assert.h"
26 
27 
28 #ifdef HAVE_AVX2
29 #define _MM_ADD_EPI8(x,y) _mm256_add_epi8(x,y)
30 #define _MM_ADDS_EPI8(x,y) _mm256_adds_epi8(x,y)
31 #define _MM_SUBS_EPI8(x,y) _mm256_subs_epi8(x,y)
32 #define _MM_CMPGT_EPI8(x,y) _mm256_cmpgt_epi8(x,y)
33 #define _MM_CMPLT_EPI8(x,y) _mm256_cmpgt_epi8(y,x) /* No _mm256_cmplt commands */
34 #define _MM_MAX_EPI8(x,y) _mm256_max_epi8(x,y)
35 #define _MM_MIN_EPI8(x,y) _mm256_min_epi8(x,y)
36 #define _MM_SET1_EPI8(x) _mm256_set1_epi8(x)
37 
38 #define _MM_ADD_EPI16(x,y) _mm256_add_epi16(x,y)
39 #define _MM_ADDS_EPI16(x,y) _mm256_adds_epi16(x,y)
40 #define _MM_SUBS_EPI16(x,y) _mm256_subs_epi16(x,y)
41 #define _MM_CMPGT_EPI16(x,y) _mm256_cmpgt_epi16(x,y)
42 #define _MM_CMPLT_EPI16(x,y) _mm256_cmpgt_epi16(y,x) /* No _mm256_cmplt commands */
43 #define _MM_MAX_EPI16(x,y) _mm256_max_epi16(x,y)
44 #define _MM_MIN_EPI16(x,y) _mm256_min_epi16(x,y)
45 #define _MM_SET1_EPI16(x) _mm256_set1_epi16(x)
46 
47 #define _MM_SETZERO_SI _mm256_setzero_si256
48 /* #define _MM_SLLI_SI(x,y) _mm256_slli_si256(x,y) -- 256-bit version works within 128-bit lanes */
49 /* #define _MM_SRLI_SI(x,y) _mm256_srli_si256(x,y) -- 256-bit version works within 128-bit lanes */
50 #define _MM_ANDNOT_SI(x,y) _mm256_andnot_si256(x,y)
51 #define _MM_OR_SI(x,y) _mm256_or_si256(x,y)
52 #define _MM_AND_SI(x,y) _mm256_and_si256(x,y)
53 
54 #elif defined(HAVE_SSE4_1) || defined(HAVE_SSE2)
55 #define _MM_ADD_EPI8(x,y) _mm_add_epi8(x,y)
56 #define _MM_ADDS_EPI8(x,y) _mm_adds_epi8(x,y)
57 #define _MM_SUBS_EPI8(x,y) _mm_subs_epi8(x,y)
58 #define _MM_CMPGT_EPI8(x,y) _mm_cmpgt_epi8(x,y)
59 #define _MM_CMPLT_EPI8(x,y) _mm_cmplt_epi8(x,y)
60 #define _MM_MAX_EPI8(x,y) _mm_max_epi8(x,y)
61 #define _MM_MIN_EPI8(x,y) _mm_min_epi8(x,y)
62 #define _MM_SET1_EPI8(x) _mm_set1_epi8(x)
63 
64 #define _MM_ADD_EPI16(x,y) _mm_add_epi16(x,y)
65 #define _MM_ADDS_EPI16(x,y) _mm_adds_epi16(x,y)
66 #define _MM_SUBS_EPI16(x,y) _mm_subs_epi16(x,y)
67 #define _MM_CMPGT_EPI16(x,y) _mm_cmpgt_epi16(x,y)
68 #define _MM_CMPLT_EPI16(x,y) _mm_cmplt_epi16(x,y)
69 #define _MM_MAX_EPI16(x,y) _mm_max_epi16(x,y)
70 #define _MM_MIN_EPI16(x,y) _mm_min_epi16(x,y)
71 #define _MM_SET1_EPI16(x) _mm_set1_epi16(x)
72 
73 #define _MM_SETZERO_SI _mm_setzero_si128
74 /* #define _MM_SLLI_SI(x,y) _mm_slli_si128(x,y) -- 256-bit version works within 128-bit lanes */
75 /* #define _MM_SRLI_SI(x,y) _mm_srli_si128(x,y) -- 256-bit version works within 128-bit lanes */
76 #define _MM_ANDNOT_SI(x,y) _mm_andnot_si128(x,y)
77 #define _MM_OR_SI(x,y) _mm_or_si128(x,y)
78 #define _MM_AND_SI(x,y) _mm_and_si128(x,y)
79 #endif
80 
81 
82 
83 #define LAZY_INDEL 1		/* Don't advance to next coordinate on final indel, since could go over chromosome bounds. */
84 
85 /* Row 0 and column 0 directions */
86 /* Was useful in finding a saturation bug, but can fail because of saturation */
87 #ifdef CHECK1
88 #define check1(x) x
89 #else
90 #define check1(x)
91 #endif
92 
93 
94 #ifdef DEBUG
95 #define debug(x) x
96 #else
97 #define debug(x)
98 #endif
99 
100 #ifdef DEBUG2
101 #define debug2(x) x
102 #else
103 #define debug2(x)
104 #endif
105 
106 /* Fgap */
107 #ifdef DEBUG3
108 #define debug3(x) x
109 #else
110 #define debug3(x)
111 #endif
112 
113 #ifdef DEBUG8
114 #define debug8(x) x
115 #else
116 #define debug8(x)
117 #endif
118 
119 /* Compare SIMD with non-SIMD.  Define in dynprog.h */
120 #ifdef DEBUG_SIMD
121 #define debug_simd(x) x
122 #else
123 #define debug_simd(x)
124 #endif
125 
126 #ifdef DEBUG15
127 #define debug15(x) x
128 #else
129 #define debug15(x)
130 #endif
131 
132 /* Compare AVX2 with SSE42.  Define in dynprog.h */
133 #ifdef DEBUG_AVX2
134 #define debug_avx2(x) x
135 #else
136 #define debug_avx2(x)
137 #endif
138 
139 /* Checking genomic nt in traceback procedures */
140 #ifdef DEBUG17
141 #define debug17(x) x
142 #else
143 #define debug17(x)
144 #endif
145 
146 
147 
148 #include "complement.h"
149 #define NEG_INFINITY_DISPLAY -99
150 
151 
152 /************************************************************************
153  *   Debugging procedures
154  ************************************************************************/
155 
156 #ifdef DEBUG15
157 /* For debugging of SIMD procedures*/
158 #ifdef HAVE_AVX2
159 static void
print_vector_8(__m256i x,int r,int c,char * label)160 print_vector_8 (__m256i x, int r, int c, char *label) {
161   __m256i a[1];
162   Score8_T *s = a;
163 
164   _mm_lfence();			/* Needed to print correct values */
165   _mm256_store_si256(a,x);
166   printf("%d,%d %s: %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
167 	 r,c,label,s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],
168 	 s[16],s[17],s[18],s[19],s[20],s[21],s[22],s[23],s[24],s[25],s[26],s[27],s[28],s[29],s[30],s[31]);
169   return;
170 }
171 
172 static void
print_vector_16(__m256i x,int r,int c,char * label)173 print_vector_16 (__m256i x, int r, int c, char *label) {
174   __m256i a[1];
175   Score16_T *s = a;
176 
177   _mm_lfence();			/* Needed to print correct values */
178   _mm256_store_si256(a,x);
179   printf("%d,%d %s: %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
180 	 r,c,label,s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15]);
181   return;
182 }
183 
184 #else
185 static void
print_vector_8(__m128i x,int r,int c,char * label)186 print_vector_8 (__m128i x, int r, int c, char *label) {
187   __m128i a[1];
188   Score8_T *s = a;
189 
190   _mm_lfence();			/* Needed to print correct values */
191   _mm_store_si128(a,x);
192   printf("%d,%d %s: %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
193 	 r,c,label,s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15]);
194   return;
195 }
196 
197 static void
print_vector_16(__m128i x,int r,int c,char * label)198 print_vector_16 (__m128i x, int r, int c, char *label) {
199   __m128i a[1];
200   Score16_T *s = a;
201 
202   _mm_lfence();			/* Needed to print correct values */
203   _mm_store_si128(a,x);
204   printf("%d,%d %s: %d %d %d %d %d %d %d %d\n",r,c,label,s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7]);
205   return;
206 }
207 #endif
208 #endif
209 
210 
211 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD) || defined(DEBUG2)
212 static void
Matrix8_print(Score8_T ** matrix,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int lband,int uband)213 Matrix8_print (Score8_T **matrix, int rlength, int glength, char *rsequence,
214 	       char *gsequence, char *gsequencealt,
215 	       bool revp, int lband, int uband) {
216   int i, j;
217   char g, g_alt;
218 
219 #ifdef HAVE_SSE2
220   _mm_lfence();
221 #endif
222 
223   /* j */
224   printf("   ");		/* For i */
225   printf("  ");
226   for (j = 0; j <= glength; ++j) {
227     printf(" %2d ",j);
228   }
229   printf("\n");
230 
231 
232   if (gsequence) {
233     printf("   ");		/* For i */
234     printf("  ");
235     for (j = 0; j <= glength; ++j) {
236       if (j == 0) {
237 	printf("    ");
238       } else {
239 	printf("  %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
240       }
241     }
242     printf("\n");
243   }
244 
245   if (gsequencealt != gsequence) {
246     printf("   ");		/* For i */
247     printf("  ");
248     for (j = 0; j <= glength; ++j) {
249       if (j == 0) {
250 	printf("    ");
251       } else {
252 	g = revp ? gsequence[-j+1] : gsequence[j-1];
253 	g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
254 	if (g == g_alt) {
255 	  printf("  %c ",' ');
256 	} else {
257 	  printf("  %c ",g_alt);
258 	}
259       }
260     }
261     printf("\n");
262   }
263 
264   for (i = 0; i <= rlength; ++i) {
265     printf("%2d ",i);
266     if (i == 0) {
267       printf("  ");
268     } else {
269       printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
270     }
271     for (j = 0; j <= glength; ++j) {
272       if (j < i - lband) {
273 	printf("  . ");
274       } else if (j > i + uband) {
275 	printf("  . ");
276       } else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
277 	printf("%3d ",NEG_INFINITY_DISPLAY);
278       } else {
279 	printf("%3d ",matrix[j][i]);
280       }
281     }
282     printf("\n");
283   }
284   printf("\n");
285 
286   return;
287 }
288 
289 static void
Matrix8_print_ud(Score8_T ** matrix,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int band,bool upperp)290 Matrix8_print_ud (Score8_T **matrix, int rlength, int glength, char *rsequence,
291 		  char *gsequence, char *gsequencealt,
292 		  bool revp, int band, bool upperp) {
293   int i, j;
294   char g, g_alt;
295 
296 #ifdef HAVE_SSE2
297   _mm_lfence();
298 #endif
299 
300   /* j */
301   printf("   ");		/* For i */
302   printf("  ");
303   for (j = 0; j <= glength; ++j) {
304     printf(" %2d ",j);
305   }
306   printf("\n");
307 
308   if (gsequence) {
309     printf("   ");		/* For i */
310     printf("  ");
311     for (j = 0; j <= glength; ++j) {
312       if (j == 0) {
313 	printf("    ");
314       } else {
315 	printf("  %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
316       }
317     }
318     printf("\n");
319   }
320 
321   if (gsequencealt != gsequence) {
322     printf("   ");		/* For i */
323     printf("  ");
324     for (j = 0; j <= glength; ++j) {
325       if (j == 0) {
326 	printf("    ");
327       } else {
328 	g = revp ? gsequence[-j+1] : gsequence[j-1];
329 	g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
330 	if (g == g_alt) {
331 	  printf("  %c ",' ');
332 	} else {
333 	  printf("  %c ",g_alt);
334 	}
335       }
336     }
337     printf("\n");
338   }
339 
340   for (i = 0; i <= rlength; ++i) {
341     printf("%2d ",i);
342     if (i == 0) {
343       printf("  ");
344     } else {
345       printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
346     }
347     if (upperp == true) {
348       for (j = 0; j <= glength; ++j) {
349 	if (j < i) {
350 	  printf("  . ");
351 	} else if (j > i + band) {
352 	  printf("  . ");
353 	} else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
354 	  printf("%3d ",NEG_INFINITY_DISPLAY);
355 	} else {
356 	  printf("%3d ",matrix[j][i]);
357 	}
358       }
359     } else {
360       for (j = 0; j <= glength; ++j) {
361 	if (i < j) {
362 	  printf("  . ");
363 	} else if (i > j + band) {
364 	  printf("  . ");
365 	} else if (matrix[i][j] < NEG_INFINITY_DISPLAY) {
366 	  printf("%3d ",NEG_INFINITY_DISPLAY);
367 	} else {
368 	  printf("%3d ",matrix[i][j]);
369 	}
370       }
371     }
372     printf("\n");
373   }
374   printf("\n");
375 
376   return;
377 }
378 
379 
380 static void
Matrix16_print(Score16_T ** matrix,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int lband,int uband)381 Matrix16_print (Score16_T **matrix, int rlength, int glength, char *rsequence,
382 		char *gsequence, char *gsequencealt,
383 		bool revp, int lband, int uband) {
384   int i, j;
385   char g, g_alt;
386 
387 #ifdef HAVE_SSE2
388   _mm_lfence();
389 #endif
390 
391   /* j */
392   if (rlength >= 100) {
393     printf("    ");
394   } else {
395     printf("   ");		/* For i */
396   }
397   printf("  ");
398   if (glength >= 100) {
399     for (j = 0; j <= glength; ++j) {
400       printf(" %3d ",j);
401     }
402   } else {
403     for (j = 0; j <= glength; ++j) {
404       printf(" %2d ",j);
405     }
406   }
407   printf("\n");
408 
409   if (gsequence) {
410     if (rlength >= 100) {
411       printf("    ");
412     } else {
413       printf("   ");		/* For i */
414     }
415     printf("  ");
416     if (glength >= 100) {
417       for (j = 0; j <= glength; ++j) {
418 	if (j == 0) {
419 	  printf("     ");
420 	} else {
421 	  printf("   %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
422 	}
423       }
424     } else {
425       for (j = 0; j <= glength; ++j) {
426 	if (j == 0) {
427 	  printf("    ");
428 	} else {
429 	  printf("  %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
430 	}
431       }
432     }
433     printf("\n");
434   }
435 
436   if (gsequencealt != gsequence) {
437     if (rlength >= 100) {
438       printf("    ");
439     } else {
440       printf("   ");		/* For i */
441     }
442     printf("  ");
443     if (glength >= 100) {
444       for (j = 0; j <= glength; ++j) {
445 	if (j == 0) {
446 	  printf("     ");
447 	} else {
448 	  g = revp ? gsequence[-j+1] : gsequence[j-1];
449 	  g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
450 	  if (g == g_alt) {
451 	    printf("   %c ",' ');
452 	  } else {
453 	    printf("   %c ",g_alt);
454 	  }
455 	}
456       }
457     } else {
458       for (j = 0; j <= glength; ++j) {
459 	if (j == 0) {
460 	  printf("    ");
461 	} else {
462 	  g = revp ? gsequence[-j+1] : gsequence[j-1];
463 	  g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
464 	  if (g == g_alt) {
465 	    printf("  %c ",' ');
466 	  } else {
467 	    printf("  %c ",g_alt);
468 	  }
469 	}
470       }
471     }
472     printf("\n");
473   }
474 
475   for (i = 0; i <= rlength; ++i) {
476     if (rlength >= 100) {
477       printf("%3d ",i);
478     } else {
479       printf("%2d ",i);
480     }
481     if (i == 0) {
482       printf("  ");
483     } else {
484       printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
485     }
486     if (glength >= 100) {
487       for (j = 0; j <= glength; ++j) {
488 	if (j < i - lband) {
489 	  printf("   . ");
490 	} else if (j > i + uband) {
491 	  printf("   . ");
492 	} else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
493 	  printf(" %3d ",NEG_INFINITY_DISPLAY);
494 	} else {
495 	  printf(" %3d ",matrix[j][i]);
496 	}
497       }
498     } else {
499       for (j = 0; j <= glength; ++j) {
500 	if (j < i - lband) {
501 	  printf("  . ");
502 	} else if (j > i + uband) {
503 	  printf("  . ");
504 	} else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
505 	  printf("%3d ",NEG_INFINITY_DISPLAY);
506 	} else {
507 	  printf("%3d ",matrix[j][i]);
508 	}
509       }
510     }
511     printf("\n");
512   }
513   printf("\n");
514 
515   return;
516 }
517 
518 static void
Matrix16_print_ud(Score16_T ** matrix,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int band,bool upperp)519 Matrix16_print_ud (Score16_T **matrix, int rlength, int glength, char *rsequence,
520 		   char *gsequence, char *gsequencealt,
521 		   bool revp, int band, bool upperp) {
522   int i, j;
523   char g, g_alt;
524 
525 #ifdef HAVE_SSE2
526   _mm_lfence();
527 #endif
528 
529   /* j */
530   printf("   ");		/* For i */
531   printf("  ");
532   for (j = 0; j <= glength; ++j) {
533     printf(" %2d ",j);
534   }
535   printf("\n");
536 
537   if (gsequence) {
538     printf("   ");		/* For i */
539     printf("  ");
540     for (j = 0; j <= glength; ++j) {
541       if (j == 0) {
542 	printf("    ");
543       } else {
544 	printf("  %c ",revp ? gsequence[-j+1] : gsequence[j-1]);
545       }
546     }
547     printf("\n");
548   }
549 
550   if (gsequencealt != gsequence) {
551     printf("   ");		/* For i */
552     printf("  ");
553     for (j = 0; j <= glength; ++j) {
554       if (j == 0) {
555 	printf("    ");
556       } else {
557 	g = revp ? gsequence[-j+1] : gsequence[j-1];
558 	g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
559 	if (g == g_alt) {
560 	  printf("  %c ",' ');
561 	} else {
562 	  printf("  %c ",g_alt);
563 	}
564       }
565     }
566     printf("\n");
567   }
568 
569   for (i = 0; i <= rlength; ++i) {
570     printf("%2d ",i);
571     if (i == 0) {
572       printf("  ");
573     } else {
574       printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
575     }
576     if (upperp == true) {
577       for (j = 0; j <= glength; ++j) {
578 	if (j < i) {
579 	  printf("  . ");
580 	} else if (j > i + band) {
581 	  printf("  . ");
582 	} else if (matrix[j][i] < NEG_INFINITY_DISPLAY) {
583 	  printf("%3d ",NEG_INFINITY_DISPLAY);
584 	} else {
585 	  printf("%3d ",matrix[j][i]);
586 	}
587       }
588     } else {
589       for (j = 0; j <= glength; ++j) {
590 	if (i < j) {
591 	  printf("  . ");
592 	} else if (i > j + band) {
593 	  printf("  . ");
594 	} else if (matrix[i][j] < NEG_INFINITY_DISPLAY) {
595 	  printf("%3d ",NEG_INFINITY_DISPLAY);
596 	} else {
597 	  printf("%3d ",matrix[i][j]);
598 	}
599       }
600     }
601     printf("\n");
602   }
603   printf("\n");
604 
605   return;
606 }
607 #endif
608 
609 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD) || defined(DEBUG2)
610 static void
Directions8_print(Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,Direction8_T ** directions_Fgap,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int lband,int uband)611 Directions8_print (Direction8_T **directions_nogap, Direction8_T **directions_Egap, Direction8_T **directions_Fgap,
612 		   int rlength, int glength, char *rsequence, char *gsequence, char *gsequencealt,
613 		   bool revp, int lband, int uband) {
614   int i, j;
615   char g, g_alt;
616 
617 #ifdef HAVE_SSE2
618   _mm_lfence();
619 #endif
620 
621   /* j */
622   printf("   ");		/* For i */
623   printf("  ");
624   for (j = 0; j <= glength; ++j) {
625     printf(" %2d   ",j);
626   }
627   printf("\n");
628 
629   if (gsequence) {
630     printf("   ");		/* For i */
631     printf("  ");
632     for (j = 0; j <= glength; ++j) {
633       if (j == 0) {
634 	printf("      ");
635       } else {
636 	printf("  %c   ",revp ? gsequence[-j+1] : gsequence[j-1]);
637       }
638     }
639     printf("\n");
640   }
641 
642   if (gsequencealt != gsequence) {
643     printf("   ");		/* For i */
644     printf("  ");
645     for (j = 0; j <= glength; ++j) {
646       if (j == 0) {
647 	printf("      ");
648       } else {
649 	g = revp ? gsequence[-j+1] : gsequence[j-1];
650 	g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
651 	if (g == g_alt) {
652 	  printf("  %c   ",' ');
653 	} else {
654 	  printf("  %c   ",g_alt);
655 	}
656       }
657     }
658     printf("\n");
659   }
660 
661   for (i = 0; i <= rlength; ++i) {
662     printf("%2d ",i);
663     if (i == 0) {
664       printf("  ");
665     } else {
666       printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
667     }
668     for (j = 0; j <= glength; ++j) {
669       if (j < i - lband) {
670 	printf("     ");
671       } else if (j > i + uband) {
672 	printf("     ");
673       } else {
674 	if (directions_Egap[j][i] == DIAG) {
675 	  printf("D");
676 	} else {
677 	  /* Must be HORIZ */
678 	  printf("H");
679 	}
680 	printf("|");
681 	if (directions_nogap[j][i] == DIAG) {
682 	  printf("D");
683 	} else if (directions_nogap[j][i] == HORIZ) {
684 	  printf("H");
685 	} else {
686 	  /* Must be VERT */
687 	  printf("V");
688 	}
689 	printf("|");
690 	if (directions_Fgap[j][i] == DIAG) {
691 	  printf("D");
692 	} else {
693 	  /* Must be VERT */
694 	  printf("V");
695 	}
696       }
697       printf(" ");
698     }
699     printf("\n");
700   }
701   printf("\n");
702 
703   return;
704 }
705 
706 static void
Directions8_print_ud(Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int band,bool upperp)707 Directions8_print_ud (Direction8_T **directions_nogap, Direction8_T **directions_Egap,
708 		      int rlength, int glength, char *rsequence, char *gsequence, char *gsequencealt,
709 		      bool revp, int band, bool upperp) {
710   int i, j;
711   char g, g_alt;
712 
713 #ifdef HAVE_SSE2
714   _mm_lfence();
715 #endif
716 
717   /* j */
718   printf("   ");		/* For i */
719   printf("  ");
720   for (j = 0; j <= glength; ++j) {
721     printf(" %2d   ",j);
722   }
723   printf("\n");
724 
725   if (gsequence) {
726     printf("   ");		/* For i */
727     printf("  ");
728     for (j = 0; j <= glength; ++j) {
729       if (j == 0) {
730 	printf("      ");
731       } else {
732 	printf("  %c   ",revp ? gsequence[-j+1] : gsequence[j-1]);
733       }
734     }
735     printf("\n");
736   }
737 
738   if (gsequencealt != gsequence) {
739     printf("   ");		/* For i */
740     printf("  ");
741     for (j = 0; j <= glength; ++j) {
742       if (j == 0) {
743 	printf("      ");
744       } else {
745 	g = revp ? gsequence[-j+1] : gsequence[j-1];
746 	g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
747 	if (g == g_alt) {
748 	  printf("  %c   ",' ');
749 	} else {
750 	  printf("  %c   ",g_alt);
751 	}
752       }
753     }
754     printf("\n");
755   }
756 
757   for (i = 0; i <= rlength; ++i) {
758     printf("%2d ",i);
759     if (i == 0) {
760       printf("  ");
761     } else {
762       printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
763     }
764     if (upperp == true) {
765       for (j = 0; j <= glength; ++j) {
766 	if (j < i) {
767 	  printf("     ");
768 	} else if (j > i + band) {
769 	  printf("     ");
770 	} else {
771 	  if (directions_Egap[j][i] == DIAG) {
772 	    printf("D");
773 	  } else {
774 	    printf("-");
775 	  }
776 	  printf("|");
777 	  if (directions_nogap[j][i] == DIAG) {
778 	    printf("D");
779 	  } else {
780 	    printf("-");
781 	  }
782 	  printf("| ");		/* For Fgap */
783 	}
784 	printf(" ");
785       }
786     } else {
787       for (j = 0; j <= glength; ++j) {
788 	if (i < j) {
789 	  printf("     ");
790 	} else if (i > j + band) {
791 	  printf("     ");
792 	} else {
793 	  printf(" |");		/* For Fgap */
794 	  if (directions_nogap[i][j] == DIAG) {
795 	    printf("D");
796 	  } else {
797 	    printf("-");
798 	  }
799 	  printf("|");
800 	  if (directions_Egap[i][j] == DIAG) {
801 	    printf("D");
802 	  } else {
803 	    printf("-");
804 	  }
805 	}
806 	printf(" ");
807       }
808     }
809     printf("\n");
810   }
811   printf("\n");
812 
813   return;
814 }
815 
816 
817 static void
Directions16_print(Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,Direction16_T ** directions_Fgap,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int lband,int uband)818 Directions16_print (Direction16_T **directions_nogap, Direction16_T **directions_Egap, Direction16_T **directions_Fgap,
819 		    int rlength, int glength, char *rsequence, char *gsequence, char *gsequencealt,
820 		    bool revp, int lband, int uband) {
821   int i, j;
822   char g, g_alt;
823 
824 #ifdef HAVE_SSE2
825   _mm_lfence();
826 #endif
827 
828   /* j */
829   printf("   ");		/* For i */
830   printf("  ");
831   for (j = 0; j <= glength; ++j) {
832     printf(" %3d  ",j);
833   }
834   printf("\n");
835 
836   if (gsequence) {
837     printf("   ");		/* For i */
838     printf("  ");
839     for (j = 0; j <= glength; ++j) {
840       if (j == 0) {
841 	printf("      ");
842       } else {
843 	printf("  %c   ",revp ? gsequence[-j+1] : gsequence[j-1]);
844       }
845     }
846     printf("\n");
847   }
848 
849   if (gsequencealt != gsequence) {
850     printf("   ");		/* For i */
851     printf("  ");
852     for (j = 0; j <= glength; ++j) {
853       if (j == 0) {
854 	printf("      ");
855       } else {
856 	g = revp ? gsequence[-j+1] : gsequence[j-1];
857 	g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
858 	if (g == g_alt) {
859 	  printf("  %c   ",' ');
860 	} else {
861 	  printf("  %c   ",g_alt);
862 	}
863       }
864     }
865     printf("\n");
866   }
867 
868   for (i = 0; i <= rlength; ++i) {
869     printf("%2d ",i);
870     if (i == 0) {
871       printf("  ");
872     } else {
873       printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
874     }
875     for (j = 0; j <= glength; ++j) {
876       if (j < i - lband) {
877 	printf("     ");
878       } else if (j > i + uband) {
879 	printf("     ");
880       } else {
881 	if (directions_Egap[j][i] == DIAG) {
882 	  printf("D");
883 	} else {
884 	  /* Must be HORIZ */
885 	  printf("H");
886 	}
887 	printf("|");
888 	if (directions_nogap[j][i] == DIAG) {
889 	  printf("D");
890 	} else if (directions_nogap[j][i] == HORIZ) {
891 	  printf("H");
892 	} else {
893 	  /* Must be VERT */
894 	  printf("V");
895 	}
896 	printf("|");
897 	if (directions_Fgap[j][i] == DIAG) {
898 	  printf("D");
899 	} else {
900 	  /* Must be VERT */
901 	  printf("V");
902 	}
903       }
904       printf(" ");
905     }
906     printf("\n");
907   }
908   printf("\n");
909 
910   return;
911 }
912 
913 static void
Directions16_print_ud(Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,int rlength,int glength,char * rsequence,char * gsequence,char * gsequencealt,bool revp,int band,bool upperp)914 Directions16_print_ud (Direction16_T **directions_nogap, Direction16_T **directions_Egap,
915 		       int rlength, int glength, char *rsequence, char *gsequence, char *gsequencealt,
916 		       bool revp, int band, bool upperp) {
917   int i, j;
918   char g, g_alt;
919 
920 #ifdef HAVE_SSE2
921   _mm_lfence();
922 #endif
923 
924   /* j */
925   printf("   ");		/* For i */
926   printf("  ");
927   for (j = 0; j <= glength; ++j) {
928     printf(" %2d   ",j);
929   }
930   printf("\n");
931 
932   if (gsequence) {
933     printf("   ");		/* For i */
934     printf("  ");
935     for (j = 0; j <= glength; ++j) {
936       if (j == 0) {
937 	printf("      ");
938       } else {
939 	printf("  %c   ",revp ? gsequence[-j+1] : gsequence[j-1]);
940       }
941     }
942     printf("\n");
943   }
944 
945   if (gsequencealt != gsequence) {
946     printf("   ");		/* For i */
947     printf("  ");
948     for (j = 0; j <= glength; ++j) {
949       if (j == 0) {
950 	printf("      ");
951       } else {
952 	g = revp ? gsequence[-j+1] : gsequence[j-1];
953 	g_alt = revp ? gsequencealt[-j+1] : gsequencealt[j-1];
954 	if (g == g_alt) {
955 	  printf("  %c   ",' ');
956 	} else {
957 	  printf("  %c   ",g_alt);
958 	}
959       }
960     }
961     printf("\n");
962   }
963 
964   for (i = 0; i <= rlength; ++i) {
965     printf("%2d ",i);
966     if (i == 0) {
967       printf("  ");
968     } else {
969       printf("%c ",revp ? rsequence[-i+1] : rsequence[i-1]);
970     }
971     if (upperp == true) {
972       for (j = 0; j <= glength; ++j) {
973 	if (j < i) {
974 	  printf("   ");
975 	} else if (j > i + band) {
976 	  printf("   ");
977 	} else {
978 	  if (directions_Egap[j][i] == DIAG) {
979 	    printf("D");
980 	  } else {
981 	    printf("-");
982 	  }
983 	  printf("|");
984 	  if (directions_nogap[j][i] == DIAG) {
985 	    printf("D");
986 	  } else {
987 	    printf("-");
988 	  }
989 	}
990 	printf("  ");		/* For Fgap */
991 	printf(" ");
992       }
993     } else {
994       for (j = 0; j <= glength; ++j) {
995 	printf("  ");		/* For Fgap */
996 	if (i < j) {
997 	  printf("   ");
998 	} else if (i > j + band) {
999 	  printf("   ");
1000 	} else {
1001 	  if (directions_nogap[i][j] == DIAG) {
1002 	    printf("D");
1003 	  } else {
1004 	    printf("-");
1005 	  }
1006 	  printf("|");
1007 	  if (directions_Egap[i][j] == DIAG) {
1008 	    printf("D");
1009 	  } else {
1010 	    printf("-");
1011 	  }
1012 	}
1013 	printf(" ");
1014       }
1015     }
1016     printf("\n");
1017   }
1018   printf("\n");
1019 
1020   return;
1021 }
1022 #endif
1023 
1024 
1025 
1026 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
1027 static void
banded_matrix8_compare(Score8_T ** matrix1,Score8_T ** matrix2,int rlength,int glength,int lband,int uband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1028 banded_matrix8_compare (Score8_T **matrix1,
1029 #ifdef DEBUG_AVX2
1030 			Score8_T **matrix2,
1031 #else
1032 			Score32_T **matrix2,
1033 #endif
1034 			int rlength, int glength, int lband, int uband, char *rsequence, char *gsequence, char *gsequence_alt,
1035 			int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1036 			bool revp) {
1037   int r, c, rlo, rhigh;
1038 
1039   for (c = 1; c <= glength; c++) {
1040     if ((rlo = c - uband) < 1) {
1041       rlo = 1;
1042     };
1043 
1044     if ((rhigh = c + lband) > rlength) {
1045       rhigh = rlength;
1046     }
1047 
1048     for (r = rlo; r <= rhigh; r++) {
1049       if (matrix1[c][r] <= NEG_INFINITY_8 + 30 && matrix2[c][r] <= NEG_INFINITY_8 + 30) {
1050 	/* Okay: both essentially negative infinity */
1051       } else if (matrix1[c][r] != matrix2[c][r]) {
1052 	printf("At %d,%d, value %d != value %d\n",r,c,matrix1[c][r],matrix2[c][r]);
1053 
1054 	Matrix8_print(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1055 		      revp,lband,uband);
1056 #ifdef DEBUG_AVX2
1057 	Matrix8_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1058 		      revp,lband,uband);
1059 #elif defined(DEBUG_SIMD)
1060 	Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1061 			       goffset,chroffset,chrhigh,watsonp,revp,lband,uband);
1062 #endif
1063 	abort();
1064       }
1065     }
1066   }
1067 
1068   return;
1069 }
1070 
1071 static void
banded_matrix8_compare_upper(Score8_T ** matrix1,Score8_T ** matrix2,int rlength,int glength,int uband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1072 banded_matrix8_compare_upper (Score8_T **matrix1,
1073 #ifdef DEBUG_AVX2
1074 			      Score8_T **matrix2,
1075 #else
1076 			      Score32_T **matrix2,
1077 #endif
1078 			      int rlength, int glength,
1079 			      int uband, char *rsequence, char *gsequence, char *gsequence_alt,
1080 			      int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1081 			      bool revp) {
1082   int r, c, rlo, rhigh;
1083 
1084   for (c = 1; c <= glength; c++) {
1085     if ((rlo = c - uband) < 1) {
1086       rlo = 1;
1087     };
1088 
1089     if ((rhigh = c) > rlength) {
1090       rhigh = rlength;
1091     }
1092 
1093     for (r = rlo; r <= rhigh; r++) {
1094       if (matrix1[c][r] <= NEG_INFINITY_8 + 30 && matrix2[c][r] <= NEG_INFINITY_8 + 30) {
1095 	/* Okay */
1096       } else if (matrix1[c][r] != matrix2[c][r]) {
1097 	printf("At %d,%d, value %d != value %d\n",r,c,matrix1[c][r],matrix2[c][r]);
1098 
1099 	Matrix8_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1100 			 revp,uband,/*upperp*/true);
1101 #ifdef DEBUG_AVX2
1102 	Matrix8_print_ud(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1103 			 revp,uband,/*upperp*/true);
1104 #else
1105 	Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1106 			       goffset,chroffset,chrhigh,watsonp,revp,/*lband*/0,uband);
1107 #endif
1108 	abort();
1109       }
1110     }
1111   }
1112 
1113   return;
1114 }
1115 
1116 static void
banded_matrix8_compare_lower(Score8_T ** matrix1,Score8_T ** matrix2,int rlength,int glength,int lband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1117 banded_matrix8_compare_lower (Score8_T **matrix1,
1118 #ifdef DEBUG_AVX2
1119 			      Score8_T **matrix2,
1120 #else
1121 			      Score32_T **matrix2,
1122 #endif
1123 			      int rlength, int glength,
1124 			      int lband, char *rsequence, char *gsequence, char *gsequence_alt,
1125 			      int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1126 			      bool revp) {
1127   int r, c, rlo, rhigh;
1128 
1129   for (c = 1; c <= glength; c++) {
1130     if ((rlo = c) < 1) {
1131       rlo = 1;
1132     };
1133 
1134     if ((rhigh = c + lband) > rlength) {
1135       rhigh = rlength;
1136     }
1137 
1138     for (r = rlo; r <= rhigh; r++) {
1139 #ifdef DEBUG_AVX2
1140       if (matrix1[r][c] <= NEG_INFINITY_8 + 30 && matrix2[r][c] <= NEG_INFINITY_8 + 30) {
1141 	/* Okay */
1142       } else if (matrix1[r][c] != matrix2[r][c]) {
1143 	printf("At %d,%d, value %d != value %d\n",r,c,matrix1[r][c],matrix2[r][c]);
1144 
1145 	Matrix8_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1146 			 revp,lband,/*upperp*/false);
1147 	Matrix8_print_ud(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1148 			 revp,lband,/*upperp*/false);
1149 	abort();
1150       }
1151 #else
1152       if (matrix1[r][c] <= NEG_INFINITY_8 + 30 && matrix2[c][r] <= NEG_INFINITY_8 + 30) {
1153 	/* Okay */
1154       } else if (matrix1[r][c] != matrix2[c][r]) {
1155 	printf("At %d,%d, value %d != value %d\n",r,c,matrix1[r][c],matrix2[c][r]);
1156 
1157 	Matrix8_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1158 			 revp,lband,/*upperp*/false);
1159 	Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1160 			       goffset,chroffset,chrhigh,watsonp,revp,lband,/*uband*/0);
1161 	abort();
1162       }
1163 #endif
1164     }
1165   }
1166 
1167   return;
1168 }
1169 
1170 
1171 static void
banded_matrix16_compare(Score16_T ** matrix1,Score16_T ** matrix2,int rlength,int glength,int lband,int uband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1172 banded_matrix16_compare (Score16_T **matrix1,
1173 #ifdef DEBUG_AVX2
1174 			 Score16_T **matrix2,
1175 #elif defined(DEBUG_SIMD)
1176 			 Score32_T **matrix2,
1177 #endif
1178 			 int rlength, int glength, int lband, int uband, char *rsequence, char *gsequence, char *gsequence_alt,
1179 			 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1180 			 bool revp) {
1181   int r, c, rlo, rhigh;
1182 
1183   for (c = 1; c <= glength; c++) {
1184     if ((rlo = c - uband) < 1) {
1185       rlo = 1;
1186     };
1187 
1188     if ((rhigh = c + lband) > rlength) {
1189       rhigh = rlength;
1190     }
1191 
1192     for (r = rlo; r <= rhigh; r++) {
1193       if (matrix1[c][r] <= NEG_INFINITY_16 + 30 && matrix2[c][r] <= NEG_INFINITY_16 + 30) {
1194 	/* Okay: both essentially negative infinity */
1195       } else if (matrix1[c][r] != matrix2[c][r]) {
1196 	printf("At %d,%d, value %d != value %d\n",r,c,matrix1[c][r],matrix2[c][r]);
1197 
1198 	Matrix16_print(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1199 		       revp,lband,uband);
1200 #ifdef DEBUG_AVX2
1201 	Matrix16_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1202 		       revp,lband,uband);
1203 #elif defined(DEBUG_SIMD)
1204 	Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1205 			       goffset,chroffset,chrhigh,watsonp,revp,lband,uband);
1206 #endif
1207 	abort();
1208       }
1209     }
1210   }
1211 
1212   return;
1213 }
1214 
1215 static void
banded_matrix16_compare_upper(Score16_T ** matrix1,Score16_T ** matrix2,int rlength,int glength,int uband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1216 banded_matrix16_compare_upper (Score16_T **matrix1,
1217 #ifdef DEBUG_AVX2
1218 			       Score16_T **matrix2,
1219 #else
1220 			       Score32_T **matrix2,
1221 #endif
1222 			       int rlength, int glength,
1223 			       int uband, char *rsequence, char *gsequence, char *gsequence_alt,
1224 			       int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1225 			       bool revp) {
1226   int r, c, rlo, rhigh;
1227 
1228   for (c = 1; c <= glength; c++) {
1229     if ((rlo = c - uband) < 1) {
1230       rlo = 1;
1231     };
1232 
1233     if ((rhigh = c) > rlength) {
1234       rhigh = rlength;
1235     }
1236 
1237     for (r = rlo; r <= rhigh; r++) {
1238       if (matrix1[c][r] <= NEG_INFINITY_16 + 30 && matrix2[c][r] <= NEG_INFINITY_16 + 30) {
1239 	/* Okay */
1240       } else if (matrix1[c][r] != matrix2[c][r]) {
1241 	printf("At %d,%d, value %d != value %d\n",r,c,matrix1[c][r],matrix2[c][r]);
1242 
1243 	Matrix16_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1244 			    revp,uband,/*upperp*/true);
1245 #ifdef DEBUG_AVX2
1246 	Matrix16_print_ud(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1247 			    revp,uband,/*upperp*/true);
1248 #else
1249 	Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1250 				 goffset,chroffset,chrhigh,watsonp,revp,/*lband*/0,uband);
1251 #endif
1252 	abort();
1253       }
1254     }
1255   }
1256 
1257   return;
1258 }
1259 
1260 static void
banded_matrix16_compare_lower(Score16_T ** matrix1,Score16_T ** matrix2,int rlength,int glength,int lband,char * rsequence,char * gsequence,char * gsequence_alt,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool revp)1261 banded_matrix16_compare_lower (Score16_T **matrix1,
1262 #ifdef DEBUG_AVX2
1263 			       Score16_T **matrix2,
1264 #else
1265 			       Score32_T **matrix2,
1266 #endif
1267 			       int rlength, int glength,
1268 			       int lband, char *rsequence, char *gsequence, char *gsequence_alt,
1269 			       int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1270 			       bool revp) {
1271   int r, c, rlo, rhigh;
1272 
1273   for (c = 1; c <= glength; c++) {
1274     if ((rlo = c) < 1) {
1275       rlo = 1;
1276     };
1277 
1278     if ((rhigh = c + lband) > rlength) {
1279       rhigh = rlength;
1280     }
1281 
1282     for (r = rlo; r <= rhigh; r++) {
1283 #ifdef DEBUG_AVX2
1284       if (matrix1[r][c] <= NEG_INFINITY_16 + 30 && matrix2[r][c] <= NEG_INFINITY_16 + 30) {
1285 	/* Okay */
1286       } else if (matrix1[r][c] != matrix2[r][c]) {
1287 	printf("At %d,%d, value %d != value %d\n",r,c,matrix1[r][c],matrix2[r][c]);
1288 
1289 	Matrix16_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1290 			  revp,lband,/*upperp*/false);
1291 	Matrix16_print_ud(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1292 			  revp,lband,/*upperp*/false);
1293 	abort();
1294       }
1295 #else
1296       if (matrix1[r][c] <= NEG_INFINITY_16 + 30 && matrix2[c][r] <= NEG_INFINITY_16 + 30) {
1297 	/* Okay */
1298       } else if (matrix1[r][c] != matrix2[c][r]) {
1299 	printf("At %d,%d, value %d != value %d\n",r,c,matrix1[r][c],matrix2[c][r]);
1300 
1301 	Matrix16_print_ud(matrix1,rlength,glength,rsequence,gsequence,gsequence_alt,
1302 			  revp,lband,/*upperp*/false);
1303 	Dynprog_Matrix32_print(matrix2,rlength,glength,rsequence,gsequence,gsequence_alt,
1304 			       goffset,chroffset,chrhigh,watsonp,revp,lband,/*uband*/0);
1305 	abort();
1306       }
1307 #endif
1308     }
1309   }
1310 
1311   return;
1312 }
1313 
1314 #endif
1315 
1316 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
1317 static void
banded_directions8_compare_nogap(Score8_T ** matrix,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband,int uband)1318 banded_directions8_compare_nogap (Score8_T **matrix, Direction8_T **directions1,
1319 #ifdef DEBUG_AVX2
1320 				  Direction8_T **directions2,
1321 #elif defined(DEBUG_SIMD)
1322 				  Direction32_T **directions2,
1323 #endif
1324 				  int rlength, int glength, int lband, int uband) {
1325   int r, c, rlo, rhigh;
1326 
1327   for (c = 1; c <= glength; c++) {
1328     if ((rlo = c - uband) < 1) {
1329       rlo = 1;
1330     };
1331 
1332     if ((rhigh = c + lband) > rlength) {
1333       rhigh = rlength;
1334     }
1335 
1336     for (r = rlo; r <= rhigh; r++) {
1337       if (matrix[c][r] < NEG_INFINITY_8 + 30) {
1338 	/* Don't check */
1339 
1340       } else if (directions1[c][r] == 0) {
1341 	if (directions2[c][r] == 0) {
1342 	} else {
1343 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1344 	  abort();
1345 	}
1346 
1347       } else if (directions1[c][r] == 1) {
1348 	if (directions2[c][r] == 1) {
1349 	} else {
1350 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1351 	  abort();
1352 	}
1353 
1354       } else {
1355 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1356 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1357 	  abort();
1358 	}
1359       }
1360     }
1361   }
1362 
1363   return;
1364 }
1365 
1366 static void
banded_directions8_compare_nogap_upper(Score8_T ** matrix,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int uband)1367 banded_directions8_compare_nogap_upper (Score8_T **matrix, Direction8_T **directions1,
1368 #ifdef DEBUG_AVX2
1369 					Direction8_T **directions2,
1370 #elif defined(DEBUG_SIMD)
1371 					Direction32_T **directions2,
1372 #endif
1373 					int rlength, int glength, int uband) {
1374   int r, c, rlo, rhigh;
1375 
1376   for (c = 1; c <= glength; c++) {
1377     if ((rlo = c - uband) < 1) {
1378       rlo = 1;
1379     };
1380 
1381     if ((rhigh = c) > rlength) {
1382       rhigh = rlength;
1383     }
1384 
1385     for (r = rlo; r <= rhigh; r++) {
1386       if (matrix[c][r] < NEG_INFINITY_8 + 30) {
1387 	/* Don't check */
1388 
1389       } else if (directions1[c][r] == 0) {
1390 	if (directions2[c][r] == 0) {
1391 	} else {
1392 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1393 	  abort();
1394 	}
1395 
1396       } else if (directions1[c][r] == 1) {
1397 	if (directions2[c][r] == 1) {
1398 	} else {
1399 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1400 	  abort();
1401 	}
1402 
1403       } else {
1404 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1405 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1406 	  abort();
1407 	}
1408       }
1409     }
1410   }
1411 
1412   return;
1413 }
1414 
1415 static void
banded_directions8_compare_nogap_lower(Score8_T ** matrix,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband)1416 banded_directions8_compare_nogap_lower (Score8_T **matrix, Direction8_T **directions1,
1417 #ifdef DEBUG_AVX2
1418 					Direction8_T **directions2,
1419 #elif defined(DEBUG_SIMD)
1420 					Direction32_T **directions2,
1421 #endif
1422 					int rlength, int glength, int lband) {
1423   int r, c, rlo, rhigh;
1424 
1425   for (c = 1; c <= glength; c++) {
1426     if ((rlo = c) < 1) {
1427       rlo = 1;
1428     };
1429 
1430     if ((rhigh = c + lband) > rlength) {
1431       rhigh = rlength;
1432     }
1433 
1434 #ifdef DEBUG_AVX2
1435     for (r = rlo; r <= rhigh; r++) {
1436       if (matrix[c][r] < NEG_INFINITY_8 + 30) {
1437 	/* Don't check */
1438 
1439       } else if (directions1[r][c] == 0) {
1440 	if (directions2[r][c] == 0) {
1441 	} else {
1442 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1443 	  abort();
1444 	}
1445 
1446       } else if (directions1[r][c] == 1) {
1447 	if (directions2[r][c] == 1) {
1448 	} else {
1449 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1450 	  abort();
1451 	}
1452 
1453       } else {
1454 	if (directions2[r][c] == 0 || directions2[r][c] == 0) {
1455 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1456 	  abort();
1457 	}
1458       }
1459     }
1460 
1461 #else
1462     for (r = rlo; r <= rhigh; r++) {
1463       if (matrix[c][r] < NEG_INFINITY_8 + 30) {
1464 	/* Don't check */
1465 
1466       } else if (directions1[r][c] == 0) {
1467 	if (directions2[c][r] == 0) {
1468 	} else {
1469 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1470 	  abort();
1471 	}
1472 
1473       } else if (directions1[r][c] == 1) {
1474 	if (directions2[c][r] == 1) {
1475 	} else {
1476 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1477 	  abort();
1478 	}
1479 
1480       } else {
1481 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1482 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1483 	  abort();
1484 	}
1485       }
1486     }
1487 #endif
1488 
1489   }
1490 
1491   return;
1492 }
1493 
1494 
1495 static void
banded_directions16_compare_nogap(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband,int uband)1496 banded_directions16_compare_nogap (Direction16_T **directions1,
1497 #ifdef DEBUG_AVX2
1498 				   Direction16_T **directions2,
1499 #elif defined(DEBUG_SIMD)
1500 				   Direction32_T **directions2,
1501 #endif
1502 				   int rlength, int glength, int lband, int uband) {
1503   int r, c, rlo, rhigh;
1504 
1505   for (c = 1; c <= glength; c++) {
1506     if ((rlo = c - uband) < 1) {
1507       rlo = 1;
1508     };
1509 
1510     if ((rhigh = c + lband) > rlength) {
1511       rhigh = rlength;
1512     }
1513 
1514     for (r = rlo; r <= rhigh; r++) {
1515       if (directions1[c][r] == 0) {
1516 	if (directions2[c][r] == 0) {
1517 	} else {
1518 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1519 	  abort();
1520 	}
1521 
1522       } else if (directions1[c][r] == 1) {
1523 	if (directions2[c][r] == 1) {
1524 	} else {
1525 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1526 	  abort();
1527 	}
1528 
1529       } else {
1530 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1531 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1532 	  abort();
1533 	}
1534       }
1535     }
1536   }
1537 
1538   return;
1539 }
1540 
1541 static void
banded_directions16_compare_nogap_upper(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int uband)1542 banded_directions16_compare_nogap_upper (Direction16_T **directions1,
1543 #ifdef DEBUG_AVX2
1544 					 Direction16_T **directions2,
1545 #else
1546 					 Direction32_T **directions2,
1547 #endif
1548 					 int rlength, int glength, int uband) {
1549   int r, c, rlo, rhigh;
1550 
1551   for (c = 1; c <= glength; c++) {
1552     if ((rlo = c - uband) < 1) {
1553       rlo = 1;
1554     };
1555 
1556     if ((rhigh = c) > rlength) {
1557       rhigh = rlength;
1558     }
1559 
1560     for (r = rlo; r <= rhigh; r++) {
1561       if (directions1[c][r] == 0) {
1562 	if (directions2[c][r] == 0) {
1563 	} else {
1564 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1565 	  abort();
1566 	}
1567 
1568       } else if (directions1[c][r] == 1) {
1569 	if (directions2[c][r] == 1) {
1570 	} else {
1571 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1572 	  abort();
1573 	}
1574 
1575       } else {
1576 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1577 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1578 	  abort();
1579 	}
1580       }
1581     }
1582   }
1583 
1584   return;
1585 }
1586 
1587 static void
banded_directions16_compare_nogap_lower(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband)1588 banded_directions16_compare_nogap_lower (Direction16_T **directions1,
1589 #ifdef DEBUG_AVX2
1590 					 Direction16_T **directions2,
1591 #else
1592 					 Direction32_T **directions2,
1593 #endif
1594 					 int rlength, int glength, int lband) {
1595   int r, c, rlo, rhigh;
1596 
1597   for (c = 1; c <= glength; c++) {
1598     if ((rlo = c) < 1) {
1599       rlo = 1;
1600     };
1601 
1602     if ((rhigh = c + lband) > rlength) {
1603       rhigh = rlength;
1604     }
1605 
1606     for (r = rlo; r <= rhigh; r++) {
1607 #ifdef DEBUG_AVX2
1608       if (directions1[r][c] == 0) {
1609 	if (directions2[r][c] == 0) {
1610 	} else {
1611 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1612 	  abort();
1613 	}
1614 
1615       } else if (directions1[r][c] == 1) {
1616 	if (directions2[r][c] == 1) {
1617 	} else {
1618 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1619 	  abort();
1620 	}
1621 
1622       } else {
1623 	if (directions2[r][c] == 0 || directions2[r][c] == 0) {
1624 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1625 	  abort();
1626 	}
1627       }
1628 #else
1629       if (directions1[r][c] == 0) {
1630 	if (directions2[c][r] == 0) {
1631 	} else {
1632 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1633 	  abort();
1634 	}
1635 
1636       } else if (directions1[r][c] == 1) {
1637 	if (directions2[c][r] == 1) {
1638 	} else {
1639 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1640 	  abort();
1641 	}
1642 
1643       } else {
1644 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1645 	  printf("At %d,%d, nogap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1646 	  abort();
1647 	}
1648       }
1649 #endif
1650     }
1651   }
1652 
1653   return;
1654 }
1655 #endif
1656 
1657 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
1658 static void
banded_directions8_compare_Egap(Score8_T ** matrix1,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband,int uband)1659 banded_directions8_compare_Egap (Score8_T **matrix1, Direction8_T **directions1,
1660 #ifdef DEBUG_AVX2
1661 				 Direction8_T **directions2,
1662 #else
1663 				 Direction32_T **directions2,
1664 #endif
1665 				 int rlength, int glength, int lband, int uband) {
1666   int r, c, rlo, rhigh, last_check;
1667 
1668   for (c = 1; c <= glength; c++) {
1669     if ((rlo = c - uband) < 1) {
1670       rlo = 1;
1671     };
1672 
1673     if ((rhigh = c + lband) <= rlength) {
1674       /* Don't check rhigh.  Egap direction derives from a comparison
1675 	 of NEG_INFINITY values, and we should never reach here from
1676 	 directions_nogap anyway. */
1677       last_check = rhigh - 1;
1678 
1679     } else {
1680       /* Do check rhigh, which contains instructions for the bottom row */
1681       rhigh = rlength;
1682       last_check = rhigh;
1683     }
1684 
1685     for (r = rlo; r <= last_check; r++) {
1686       if (matrix1[c][r] < NEG_INFINITY_8 + 30) {
1687 	/* Don't check */
1688 
1689       } else if (directions1[c][r] == 0) {
1690 	if (directions2[c][r] == 0) {
1691 	} else {
1692 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1693 	  abort();
1694 	}
1695 
1696       } else if (directions1[c][r] == 1) {
1697 	if (directions2[c][r] == 1) {
1698 	} else {
1699 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1700 	  abort();
1701 	}
1702 
1703       } else {
1704 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1705 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1706 	  abort();
1707 	}
1708       }
1709     }
1710   }
1711 
1712   return;
1713 }
1714 
1715 static void
banded_directions8_compare_Egap_upper(Score8_T ** matrix1,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int uband)1716 banded_directions8_compare_Egap_upper (Score8_T **matrix1, Direction8_T **directions1,
1717 #ifdef DEBUG_AVX2
1718 				       Direction8_T **directions2,
1719 #else
1720 				       Direction32_T **directions2,
1721 #endif
1722 				       int rlength, int glength, int uband) {
1723   int r, c, rlo, rhigh, last_check;
1724 
1725   return;
1726   for (c = 1; c <= glength; c++) {
1727     if ((rlo = c - uband) < 1) {
1728       rlo = 1;
1729     };
1730 
1731     if ((rhigh = c) <= rlength) {
1732       /* Don't check rhigh.  Egap direction derives from a comparison
1733 	 of NEG_INFINITY values, and we should never reach here from
1734 	 directions_nogap anyway. */
1735       last_check = rhigh - 1;
1736 
1737     } else {
1738       /* Do check rhigh, which contains instructions for the bottom row */
1739       rhigh = rlength;
1740       last_check = rhigh;
1741     }
1742 
1743     for (r = rlo; r <= last_check; r++) {
1744       if (matrix1[c][r] < NEG_INFINITY_8 + 30) {
1745 	/* Don't check */
1746 
1747       } else if (directions1[c][r] == 0) {
1748 	if (directions2[c][r] == 0) {
1749 	} else {
1750 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1751 	  abort();
1752 	}
1753 
1754       } else if (directions1[c][r] == 1) {
1755 	if (directions2[c][r] == 1) {
1756 	} else {
1757 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1758 	  abort();
1759 	}
1760 
1761       } else {
1762 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1763 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1764 	  abort();
1765 	}
1766       }
1767     }
1768   }
1769 
1770   return;
1771 }
1772 
1773 static void
banded_directions8_compare_Egap_lower(Score8_T ** matrix1,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband)1774 banded_directions8_compare_Egap_lower (Score8_T **matrix1, Direction8_T **directions1,
1775 #ifdef DEBUG_AVX2
1776 				       Direction8_T **directions2,
1777 #else
1778 				       Direction32_T **directions2,
1779 #endif
1780 				       int rlength, int glength, int lband) {
1781   int r, c, rlo, rhigh, last_check;
1782 
1783   return;
1784   for (c = 1; c <= glength; c++) {
1785     if ((rlo = c) < 1) {
1786       rlo = 1;
1787     };
1788 
1789     if ((rhigh = c + lband) <= rlength) {
1790       /* Don't check rhigh.  Egap direction derives from a comparison
1791 	 of NEG_INFINITY values, and we should never reach here from
1792 	 directions_nogap anyway. */
1793       last_check = rhigh - 1;
1794 
1795     } else {
1796       /* Do check rhigh, which contains instructions for the bottom row */
1797       rhigh = rlength;
1798       last_check = rhigh;
1799     }
1800 
1801     for (r = rlo; r <= last_check; r++) {
1802 #ifdef DEBUG_AVX2
1803       if (matrix1[r][c] < NEG_INFINITY_8 + 30) {
1804 	/* Don't check */
1805 
1806       } else if (directions1[r][c] == 0) {
1807 	if (directions2[r][c] == 0) {
1808 	} else {
1809 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1810 	  abort();
1811 	}
1812 
1813       } else if (directions1[r][c] == 1) {
1814 	if (directions2[r][c] == 1) {
1815 	} else {
1816 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1817 	  abort();
1818 	}
1819 
1820       } else {
1821 	if (directions2[r][c] == 0 || directions2[r][c] == 0) {
1822 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1823 	  abort();
1824 	}
1825       }
1826 #else
1827       if (matrix1[r][c] < NEG_INFINITY_8 + 30) {
1828 	/* Don't check */
1829 
1830       } else if (directions1[r][c] == 0) {
1831 	if (directions2[c][r] == 0) {
1832 	} else {
1833 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1834 	  abort();
1835 	}
1836 
1837       } else if (directions1[r][c] == 1) {
1838 	if (directions2[c][r] == 1) {
1839 	} else {
1840 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1841 	  abort();
1842 	}
1843 
1844       } else {
1845 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1846 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
1847 	  abort();
1848 	}
1849       }
1850 #endif
1851     }
1852   }
1853 
1854   return;
1855 }
1856 
1857 
1858 static void
banded_directions16_compare_Egap(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband,int uband)1859 banded_directions16_compare_Egap (Direction16_T **directions1,
1860 #ifdef DEBUG_AVX2
1861 				  Direction16_T **directions2,
1862 #else
1863 				  Direction32_T **directions2,
1864 #endif
1865 				  int rlength, int glength, int lband, int uband) {
1866   int r, c, rlo, rhigh, last_check;
1867 
1868   for (c = 1; c <= glength; c++) {
1869     if ((rlo = c - uband) < 1) {
1870       rlo = 1;
1871     };
1872 
1873     if ((rhigh = c + lband) <= rlength) {
1874       /* Don't check rhigh.  Egap direction derives from a comparison
1875 	 of NEG_INFINITY values, and we should never reach here from
1876 	 directions_nogap anyway. */
1877       last_check = rhigh - 1;
1878 
1879     } else {
1880       /* Do check rhigh, which contains instructions for the bottom row */
1881       rhigh = rlength;
1882       last_check = rhigh;
1883     }
1884 
1885     for (r = rlo; r <= last_check; r++) {
1886       if (directions1[c][r] == 0) {
1887 	if (directions2[c][r] == 0) {
1888 	} else {
1889 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1890 	  abort();
1891 	}
1892       } else if (directions1[c][r] == 1) {
1893 	if (directions2[c][r] == 1) {
1894 	} else {
1895 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1896 	  abort();
1897 	}
1898 
1899       } else {
1900 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1901 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1902 	  abort();
1903 	}
1904       }
1905     }
1906   }
1907 
1908   return;
1909 }
1910 
1911 static void
banded_directions16_compare_Egap_upper(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int uband)1912 banded_directions16_compare_Egap_upper (Direction16_T **directions1,
1913 #ifdef DEBUG_AVX2
1914 					Direction16_T **directions2,
1915 #else
1916 					Direction32_T **directions2,
1917 #endif
1918 					int rlength, int glength, int uband) {
1919   int r, c, rlo, rhigh, last_check;
1920 
1921   return;
1922   for (c = 1; c <= glength; c++) {
1923     if ((rlo = c - uband) < 1) {
1924       rlo = 1;
1925     };
1926 
1927     if ((rhigh = c) <= rlength) {
1928       /* Don't check rhigh.  Egap direction derives from a comparison
1929 	 of NEG_INFINITY values, and we should never reach here from
1930 	 directions_nogap anyway. */
1931       last_check = rhigh - 1;
1932 
1933     } else {
1934       /* Do check rhigh, which contains instructions for the bottom row */
1935       rhigh = rlength;
1936       last_check = rhigh;
1937     }
1938 
1939     for (r = rlo; r <= last_check; r++) {
1940       if (directions1[c][r] == 0) {
1941 	if (directions2[c][r] == 0) {
1942 	} else {
1943 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1944 	  abort();
1945 	}
1946       } else if (directions1[c][r] == 1) {
1947 	if (directions2[c][r] == 1) {
1948 	} else {
1949 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1950 	  abort();
1951 	}
1952 
1953       } else {
1954 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
1955 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
1956 	  abort();
1957 	}
1958       }
1959     }
1960   }
1961 
1962   return;
1963 }
1964 
1965 static void
banded_directions16_compare_Egap_lower(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband)1966 banded_directions16_compare_Egap_lower (Direction16_T **directions1,
1967 #ifdef DEBUG_AVX2
1968 					Direction16_T **directions2,
1969 #else
1970 					Direction32_T **directions2,
1971 #endif
1972 					int rlength, int glength, int lband) {
1973   int r, c, rlo, rhigh, last_check;
1974 
1975   return;
1976   for (c = 1; c <= glength; c++) {
1977     if ((rlo = c) < 1) {
1978       rlo = 1;
1979     };
1980 
1981     if ((rhigh = c + lband) <= rlength) {
1982       /* Don't check rhigh.  Egap direction derives from a comparison
1983 	 of NEG_INFINITY values, and we should never reach here from
1984 	 directions_nogap anyway. */
1985       last_check = rhigh - 1;
1986 
1987     } else {
1988       /* Do check rhigh, which contains instructions for the bottom row */
1989       rhigh = rlength;
1990       last_check = rhigh;
1991     }
1992 
1993     for (r = rlo; r <= last_check; r++) {
1994 #ifdef DEBUG_AVX2
1995       if (directions1[r][c] == 0) {
1996 	if (directions2[r][c] == 0) {
1997 	} else {
1998 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
1999 	  abort();
2000 	}
2001       } else if (directions1[r][c] == 1) {
2002 	if (directions2[r][c] == 1) {
2003 	} else {
2004 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
2005 	  abort();
2006 	}
2007 
2008       } else {
2009 	if (directions2[r][c] == 0 || directions2[r][c] == 0) {
2010 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[r][c]);
2011 	  abort();
2012 	}
2013       }
2014 #else
2015       if (directions1[r][c] == 0) {
2016 	if (directions2[c][r] == 0) {
2017 	} else {
2018 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
2019 	  abort();
2020 	}
2021       } else if (directions1[r][c] == 1) {
2022 	if (directions2[c][r] == 1) {
2023 	} else {
2024 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
2025 	  abort();
2026 	}
2027 
2028       } else {
2029 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
2030 	  printf("At %d,%d, Egap dir %d != dir %d\n",r,c,directions1[r][c],directions2[c][r]);
2031 	  abort();
2032 	}
2033       }
2034 #endif
2035     }
2036   }
2037 
2038   return;
2039 }
2040 #endif
2041 
2042 
2043 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
2044 static void
banded_directions8_compare_Fgap(Score8_T ** matrix1,Direction8_T ** directions1,Direction8_T ** directions2,int rlength,int glength,int lband,int uband)2045 banded_directions8_compare_Fgap (Score8_T **matrix1, Direction8_T **directions1,
2046 #ifdef DEBUG_AVX2
2047 				 Direction8_T **directions2,
2048 #else
2049 				 Direction32_T **directions2,
2050 #endif
2051 				 int rlength, int glength, int lband, int uband) {
2052   int r, c, rlo, rhigh, first_check;
2053 
2054   for (c = 1; c <= glength; c++) {
2055     if ((rlo = c - uband) < 1) {
2056       first_check = rlo = 1;
2057     } else {
2058       first_check = rlo + 1;
2059     }
2060 
2061     if ((rhigh = c + lband) > rlength) {
2062       rhigh = rlength;
2063     }
2064 
2065     for (r = first_check; r <= rhigh; r++) {
2066       if (matrix1[c][r] < NEG_INFINITY_8 + 30) {
2067 	/* Don't check */
2068 
2069       } else if (directions1[c][r] == 0) {
2070 	if (directions2[c][r] == 0) {
2071 	} else {
2072 	  printf("At %d,%d, Fgap dir %d != dir %d.  Score is %d\n",
2073 		 r,c,directions1[c][r],directions2[c][r],matrix1[c][r]);
2074 	  abort();
2075 	}
2076 
2077       } else if (directions1[c][r] == 1) {
2078 	if (directions2[c][r] == 1) {
2079 	} else {
2080 	  printf("At %d,%d, Fgap dir %d != dir %d.  Score is %d\n",
2081 		 r,c,directions1[c][r],directions2[c][r],matrix1[c][r]);
2082 	  abort();
2083 	}
2084 
2085       } else {
2086 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
2087 	  printf("At %d,%d, Fgap dir %d != dir %d.  Score is %d\n",
2088 		 r,c,directions1[c][r],directions2[c][r],matrix1[c][r]);
2089 	  abort();
2090 	}
2091       }
2092     }
2093   }
2094 
2095   return;
2096 }
2097 
2098 static void
banded_directions16_compare_Fgap(Direction16_T ** directions1,Direction16_T ** directions2,int rlength,int glength,int lband,int uband)2099 banded_directions16_compare_Fgap (Direction16_T **directions1,
2100 #ifdef DEBUG_AVX2
2101 				  Direction16_T **directions2,
2102 #else
2103 				  Direction32_T **directions2,
2104 #endif
2105 				  int rlength, int glength, int lband, int uband) {
2106   int r, c, rlo, rhigh, first_check;
2107 
2108   for (c = 1; c <= glength; c++) {
2109     if ((rlo = c - uband) < 1) {
2110       first_check = rlo = 1;
2111     } else {
2112       first_check = rlo + 1;
2113     }
2114 
2115     if ((rhigh = c + lband) > rlength) {
2116       rhigh = rlength;
2117     }
2118 
2119     for (r = first_check; r <= rhigh; r++) {
2120       if (directions1[c][r] == 0) {
2121 	if (directions2[c][r] == 0) {
2122 	} else {
2123 	  printf("At %d,%d, Fgap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
2124 	  abort();
2125 	}
2126       } else if (directions1[c][r] == 1) {
2127 	if (directions2[c][r] == 1) {
2128 	} else {
2129 	  printf("At %d,%d, Fgap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
2130 	  abort();
2131 	}
2132 
2133       } else {
2134 	if (directions2[c][r] == 0 || directions2[c][r] == 0) {
2135 	  printf("At %d,%d, Fgap dir %d != dir %d\n",r,c,directions1[c][r],directions2[c][r]);
2136 	  abort();
2137 	}
2138       }
2139     }
2140   }
2141 
2142   return;
2143 }
2144 #endif
2145 
2146 
2147 /************************************************************************
2148  *   End of debugging procedures
2149  ************************************************************************/
2150 
2151 
2152 
2153 #if defined(HAVE_SSE4_1) || defined(HAVE_SSE2)
2154 /* Makes a matrix of dimensions 0..rlength x 0..glength inclusive */
2155 static Score8_T **
aligned_score8_alloc(int rlength,int glength,void ** ptrs,void * space)2156 aligned_score8_alloc (int rlength, int glength, void **ptrs, void *space) {
2157   Score8_T **matrix, *ptr;
2158   int c;
2159 
2160   matrix = (Score8_T **) ptrs;
2161 
2162   ptr = (Score8_T *) space;
2163   matrix[0] = ptr;	   /* Want aligned row to be r = 0, 16, ... */
2164   for (c = 1; c <= glength; c++) {
2165     ptr += rlength;
2166     matrix[c] = ptr;	   /* Want aligned row to be r = 0, 16, ... */
2167   }
2168 #if defined(DEBUG2) && (defined(DEBUG_AVX2) || defined(DEBUG_SIMD))
2169   memset((void *) matrix[0],0,(glength+1)*rlength*sizeof(Score8_T));
2170 #endif
2171 
2172   return matrix;
2173 }
2174 
2175 /* No initialization to DIAG (0), for directions_Egap and directions_nogap */
2176 static Score8_T **
aligned_directions8_alloc(int rlength,int glength,void ** ptrs,void * space)2177 aligned_directions8_alloc (int rlength, int glength, void **ptrs, void *space) {
2178   Score8_T **matrix, *ptr;
2179   int c;
2180 
2181   matrix = (Score8_T **) ptrs;
2182 
2183   ptr = (Score8_T *) space;
2184   matrix[0] = ptr;	   /* Want aligned row to be r = 0, 16, ... */
2185   for (c = 1; c <= glength; c++) {
2186     ptr += rlength;
2187     matrix[c] = ptr;	   /* Want aligned row to be r = 0, 16, ... */
2188   }
2189 #if defined(DEBUG2) && (defined(DEBUG_AVX2) || defined(DEBUG_SIMD))
2190   memset((void *) matrix[0],/*DIAG*/0,(glength+1)*rlength*sizeof(Score8_T));
2191 #endif
2192 
2193   return matrix;
2194 }
2195 
2196 /* Initialization to DIAG (0), for directions_Fgap */
2197 static Score8_T **
aligned_directions8_calloc(int rlength,int glength,void ** ptrs,void * space)2198 aligned_directions8_calloc (int rlength, int glength, void **ptrs, void *space) {
2199   Score8_T **matrix, *ptr;
2200   int c;
2201 
2202   matrix = (Score8_T **) ptrs;
2203 
2204   ptr = (Score8_T *) space;
2205   matrix[0] = ptr;	/* Want aligned row to be r = 0, 16, ... */
2206   for (c = 1; c <= glength; c++) {
2207     ptr += rlength;
2208     matrix[c] = ptr;	/* Want aligned row to be r = 0, 16, ... */
2209   }
2210   memset((void *) matrix[0],/*DIAG*/0,(glength+1)*rlength*sizeof(Score8_T));
2211 
2212   return matrix;
2213 }
2214 
2215 
2216 
2217 /* Makes a matrix of dimensions 0..rlength x 0..glength inclusive */
2218 static Score16_T **
aligned_score16_alloc(int rlength,int glength,void ** ptrs,void * space)2219 aligned_score16_alloc (int rlength, int glength, void **ptrs, void *space) {
2220   Score16_T **matrix, *ptr;
2221   int c;
2222 
2223   matrix = (Score16_T **) ptrs;
2224 
2225   ptr = (Score16_T *) space;
2226   matrix[0] = ptr;	/* Want aligned row to be r = 0, 8, 16, ... */
2227   for (c = 1; c <= glength; c++) {
2228     ptr += rlength;
2229     matrix[c] = ptr;	/* Want aligned row to be r = 0, 8, 16, ... */
2230   }
2231 #ifdef DEBUG2
2232   memset((void *) matrix[0],0,(glength+1)*rlength*sizeof(Score16_T));
2233 #endif
2234 
2235   return matrix;
2236 }
2237 
2238 /* No initialization to DIAG (0), for directions_Egap and directions_nogap */
2239 static Score16_T **
aligned_directions16_alloc(int rlength,int glength,void ** ptrs,void * space)2240 aligned_directions16_alloc (int rlength, int glength, void **ptrs, void *space) {
2241   Score16_T **matrix, *ptr;
2242   int c;
2243 
2244   matrix = (Score16_T **) ptrs;
2245 
2246   ptr = (Score16_T *) space;
2247   matrix[0] = ptr;	/* Want aligned row to be r = 0, 8, 16, ... */
2248   for (c = 1; c <= glength; c++) {
2249     ptr += rlength;
2250     matrix[c] = ptr;	/* Want aligned row to be r = 0, 8, 16, ... */
2251   }
2252 #ifdef DEBUG2
2253   memset((void *) matrix[0],/*DIAG*/0,(glength+1)*rlength*sizeof(Score16_T));
2254 #endif
2255 
2256   return matrix;
2257 }
2258 
2259 /* Initialization to DIAG (0), for directions_Fgap */
2260 static Score16_T **
aligned_directions16_calloc(int rlength,int glength,void ** ptrs,void * space)2261 aligned_directions16_calloc (int rlength, int glength, void **ptrs, void *space) {
2262   Score16_T **matrix, *ptr;
2263   int c;
2264 
2265   matrix = (Score16_T **) ptrs;
2266 
2267   ptr = (Score16_T *) space;
2268   matrix[0] = ptr;	/* Want aligned row to be r = 0, 8, 16, ... */
2269   for (c = 1; c <= glength; c++) {
2270     ptr += rlength;
2271     matrix[c] = ptr;	/* Want aligned row to be r = 0, 8, 16, ... */
2272   }
2273   memset((void *) matrix[0],/*DIAG*/0,(glength+1)*rlength*sizeof(Score16_T));
2274 
2275   return matrix;
2276 }
2277 #endif
2278 
2279 
2280 #define T Dynprog_T
2281 
2282 
2283 #ifdef DEBUG_AVX2
2284 Score8_T **
Dynprog_simd_8_nonavx2(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,Direction8_T *** directions_Fgap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,int uband,bool jump_late_p,bool revp)2285 Dynprog_simd_8_nonavx2 (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
2286 			Direction8_T ***directions_Fgap,
2287 			T this, char *rsequence, char *gsequence, char *gsequence_alt,
2288 			int rlength, int glength,
2289 			int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
2290 			Mismatchtype_T mismatchtype, int open, int extend,
2291 			int lband, int uband, bool jump_late_p, bool revp) {
2292   int c_gap, last_nogap, score, *FF;	/* Need to have the ability to go past NEG_INFINITY */
2293   Score8_T **matrix, *score_column;
2294   __m128i pairscores_std, pairscores_alt;
2295 #ifndef HAVE_SSE4_1
2296   __m128i pairscores_best, all_128;
2297 #endif
2298   __m128i H_nogap_r, X_prev_nogap, E_r_gap, T1;
2299   __m128i gap_open, gap_extend, extend_ladder, extend_chunk, complement_dummy;
2300   __m128i dir_horiz;
2301   __m128i ramp, ramp_chunk, lband_vector, filter, ones;
2302   int rlength_ceil, lband_ceil, r, c;
2303   int rlo, rhigh, rlo_calc, rhigh_calc;
2304   int na1, na2, na2_alt;
2305   Score8_T *pairscores_col0;
2306   Score8_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr, pairscore, pairscore0;
2307   Pairdistance_T **pairdistance_array_type;
2308 
2309 
2310   debug2(printf("Dynprog_simd_8.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
2311   debug15(printf("Dynprog_simd_8.  jump_late_p %d, open %d, extend %d, lband %d, uband %d\n",
2312 		 jump_late_p,open,extend,lband,uband));
2313 
2314   rlength_ceil = (int) ((rlength + SIMD_NCHARS_NONAVX2)/SIMD_NCHARS_NONAVX2) * SIMD_NCHARS_NONAVX2;
2315 
2316 #ifdef HAVE_SSE4_1
2317   pairdistance_array_type = pairdistance_array[mismatchtype];
2318 #else
2319   /* Need to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
2320   pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
2321   all_128 = _mm_set1_epi8(128);
2322 #endif
2323 
2324   debug(printf("Dynprog_simd_8: "));
2325   debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
2326   debug(printf("Query length rounded up to %d\n",rlength_ceil));
2327 
2328   matrix = aligned_score8_alloc(rlength_ceil,glength,
2329 				this->aligned_std.one.matrix_ptrs,this->aligned_std.one.matrix_space);
2330   *directions_nogap = aligned_directions8_alloc(rlength_ceil,glength,
2331 						this->aligned_std.one.directions_ptrs_0,this->aligned_std.one.directions_space_0);
2332   *directions_Egap = aligned_directions8_alloc(rlength_ceil,glength,
2333 					       this->aligned_std.one.directions_ptrs_1,this->aligned_std.one.directions_space_1);
2334   /* Need to calloc to save time in F loop */
2335   *directions_Fgap = aligned_directions8_calloc(rlength_ceil,glength,
2336 						this->aligned_std.one.directions_ptrs_2,this->aligned_std.one.directions_space_2);
2337 
2338 #if 0
2339   /* Row 0 initialization */
2340   /* penalty = open; */
2341   for (c = 1; c <= uband && c <= glength; c++) {
2342     /* penalty += extend; */
2343     (*directions_Egap)[c][0] = HORIZ;
2344     (*directions_nogap)[c][0] = HORIZ;
2345   }
2346 #endif
2347 #if 0
2348   /* Already initialized to DIAG.  Actually no longer initializing directions_Egap */
2349   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
2350   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
2351 #endif
2352 
2353 #if 0
2354   /* Column 0 initialization */
2355   /* penalty = open; */
2356   for (r = 1; r <= SIMD_NCHARS_NONAVX2 && r <= rlength; r++) {
2357     /* penalty += extend; */
2358     (*directions_nogap)[0][r] = VERT;
2359   }
2360 #endif
2361 
2362 
2363   /* Load pairscores.  Store match - mismatch */
2364   pairscores[0] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2365   pairscores[1] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2366   pairscores[2] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2367   pairscores[3] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2368   pairscores[4] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
2369 
2370   lband_ceil = (int) ((lband + SIMD_NCHARS_NONAVX2)/SIMD_NCHARS_NONAVX2) * SIMD_NCHARS_NONAVX2;
2371   pairscores_col0 = (Score8_T *) _mm_malloc(lband_ceil * sizeof(Score8_T),16);
2372 
2373 
2374 #if 0
2375   /* Should not be necessary */
2376   memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score8_T));
2377   memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score8_T));
2378   memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score8_T));
2379   memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score8_T));
2380   memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score8_T));
2381 #endif
2382 
2383   /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
2384 #ifdef HAVE_SSE4_1
2385   pairscores_col0[0] = (Score8_T) 0;
2386   /* Initialization just to lband causes errors in dir_horiz for Egap */
2387 #ifdef ZERO_INITIAL_GAP_PENALTY
2388   for (r = 1; r < lband_ceil; r++) {
2389     pairscores_col0[r] = (Score8_T) 0;
2390   }
2391 #else
2392   for (r = 1; r < lband_ceil; r++) {
2393     pairscores_col0[r] = (Score8_T) NEG_INFINITY_8;
2394   }
2395 #endif
2396 #else
2397   pairscores_col0[0] = (Score8_T) 0+128;
2398   /* Initialization just to lband causes errors in dir_horiz for Egap */
2399 #ifdef ZERO_INITIAL_GAP_PENALTY
2400   for (r = 1; r < lband_ceil; r++) {
2401     pairscores_col0[r] = (Score8_T) 0+128;
2402   }
2403 #else
2404   for (r = 1; r < lband_ceil; r++) {
2405     pairscores_col0[r] = (Score8_T) NEG_INFINITY_8+128;
2406   }
2407 #endif
2408 #endif
2409 
2410 
2411   /* Row 0 */
2412   r = 0; na1 = 'N';
2413   pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
2414   pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
2415   pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
2416   pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
2417   pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
2418 
2419   if (revp == false) {
2420     for (r = 1; r <= rlength; r++) {
2421       na1 = (int) rsequence[r-1];
2422       pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
2423       pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
2424       pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
2425       pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
2426       pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
2427     }
2428   } else {
2429     for (r = 1; r <= rlength; r++) {
2430       na1 = (int) rsequence[1-r];
2431       pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
2432       pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
2433       pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
2434       pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
2435       pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
2436     }
2437   }
2438 
2439 #if 0
2440   /* Should not be necessary */
2441   memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2442   memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2443   memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2444   memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2445   memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
2446 #endif
2447 
2448   complement_dummy = _mm_set1_epi8(-1);
2449 
2450   FF = (int *) MALLOCA((glength + 1) * sizeof(int));
2451 
2452   gap_open = _mm_set1_epi8((Score8_T) open);
2453   gap_extend = _mm_set1_epi8((Score8_T) extend);
2454 
2455 #ifndef INFINITE_INITIAL_GAP_PENALTY
2456   lband_vector = _mm_set1_epi8(lband);
2457   ramp = _mm_setr_epi8(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
2458   extend_ladder = _mm_setr_epi8(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend,
2459 				9*extend,10*extend,11*extend,12*extend,13*extend,14*extend,15*extend,16*extend);
2460   ramp_chunk = _mm_set1_epi8(SIMD_NCHARS_NONAVX2);
2461   extend_chunk = _mm_set1_epi8(SIMD_NCHARS_NONAVX2*extend);
2462 #endif
2463 
2464   if (jump_late_p) {
2465     for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS_NONAVX2) {
2466       if ((rhigh = rlo + SIMD_NCHARS_NONAVX2 - 1) > rlength) {
2467 	rhigh = rlength;
2468       }
2469 
2470       if ((c = rlo - lband) < 0) {
2471 	c = 0;
2472 
2473 #if defined(ZERO_INITIAL_GAP_PENALTY)
2474 	/* Initial H in column 0 determined by zeroed out H.  E needs to equal gap_open for column 1. */
2475 	E_r_gap = _mm_set1_epi8(NEG_INFINITY_8-open);
2476 	filter = _mm_cmpgt_epi8(ramp,lband_vector);
2477 	H_nogap_r = _mm_and_si128(filter,E_r_gap); /* Use zeros for score */
2478 
2479 	E_r_gap = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
2480 	E_r_gap = _mm_adds_epi8(E_r_gap,gap_open);
2481 
2482 	ramp = _mm_adds_epi8(ramp,ramp_chunk); /* Prepare for next block */
2483 	extend_ladder = _mm_adds_epi8(extend_ladder,extend_chunk); /* Prepare for next block */
2484 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
2485 	/* dir_horiz tests if E >= H.  To fill in first column of each
2486 	   row block with non-diags, make E == H. */
2487 	E_r_gap = _mm_set1_epi8(NEG_INFINITY_8);
2488 	H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
2489 #else
2490 	/* Initial H in column 0 determined by E, which needs to equal
2491 	   gap_open + extend_ladder for column 1.  H is free to be set
2492 	   equal to E. */
2493 	H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* To compensate for T1 = H + open */
2494 	filter = _mm_cmpgt_epi8(ramp,lband_vector);
2495 	H_nogap_r = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),_mm_and_si128(filter,H_nogap_r));
2496 	E_r_gap = _mm_adds_epi8(H_nogap_r,gap_open);
2497 	ramp = _mm_adds_epi8(ramp,ramp_chunk); /* Prepare for next block */
2498 	extend_ladder = _mm_adds_epi8(extend_ladder,extend_chunk); /* Prepare for next block */
2499 #endif
2500       } else {
2501 	E_r_gap = _mm_set1_epi8(NEG_INFINITY_8);
2502 	H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
2503       }
2504 
2505       for ( ; c <= rhigh + uband && c <= glength; c++) {
2506 	score_column = matrix[c];
2507 
2508 	if (c == 0) {
2509 	  pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
2510 
2511 #ifdef ZERO_INITIAL_GAP_PENALTY
2512 	  X_prev_nogap = _mm_set1_epi8(0);
2513 #else
2514 	  if (rlo == 0) {
2515 	    X_prev_nogap = _mm_set1_epi8(0);
2516 	  } else {
2517 	    X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
2518 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2519 	  }
2520 #endif
2521 
2522 	} else {
2523 	  na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
2524 	  na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
2525 	  pairscores_std_ptr = pairscores[na2];
2526 	  pairscores_alt_ptr = pairscores[na2_alt];
2527 
2528 	  if (rlo == 0) {
2529 #ifdef ZERO_INITIAL_GAP_PENALTY
2530 	    X_prev_nogap = _mm_set1_epi8(0);
2531 #else
2532 	    X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
2533 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2534 #endif
2535 	  } else {
2536 	    /* second or greater block of 8 */
2537 	    X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
2538 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2539 	  }
2540 	}
2541 
2542 	debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
2543 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
2544 
2545 	/* EGAP */
2546 	T1 = _mm_adds_epi8(H_nogap_r, gap_open);
2547 	dir_horiz = _mm_cmplt_epi8(E_r_gap,T1); /* E < H */
2548 	dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy);	/* E >= H, for jump late */
2549 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
2550 	debug15(print_vector_8(T1,rlo,c,"T1"));
2551 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
2552 
2553 #ifdef HAVE_SSE4_1
2554 	E_r_gap = _mm_max_epi8(E_r_gap, T1); /* Compare H + open with vert */
2555 #else
2556 	E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
2557 #endif
2558 	E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
2559 	debug15(print_vector_8(E_r_gap,rlo,c,"E"));
2560 
2561 
2562 	/* NOGAP */
2563 	T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_NONAVX2);
2564 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
2565 	H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
2566 	X_prev_nogap = T1;
2567 
2568 	/* Add pairscores, allowing for alternate genomic nt */
2569 #ifdef HAVE_SSE4_1
2570 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
2571 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
2572 	H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
2573 #else
2574 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
2575 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
2576 	pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
2577 	H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
2578 #endif
2579 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
2580 	debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
2581 
2582 	dir_horiz = _mm_cmplt_epi8(E_r_gap,H_nogap_r); /* E < H */
2583 	dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy);	/* E >= H, for jump late */
2584 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
2585 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
2586 
2587 
2588 #ifdef HAVE_SSE4_1
2589 	H_nogap_r = _mm_max_epi8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
2590 #else
2591 	/* Compare H + pairscores with horiz + extend */
2592 	H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
2593 #endif
2594 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
2595 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
2596 
2597 
2598 	/* F loop */
2599 	if ((rlo_calc = rlo) < c - uband) {
2600 	  rlo_calc = c - uband;
2601 	}
2602 	if ((rhigh_calc = rhigh) >= c + lband) {
2603 	  rhigh_calc = c + lband;
2604 	  if (c > 0) {
2605 	    /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
2606 	    pairscore = pairscores[na2][rhigh_calc];
2607 	    if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
2608 	      pairscore = pairscore0;
2609 	    }
2610 #ifndef HAVE_SSE4_1
2611 	    pairscore -= 128;
2612 #endif
2613 	    if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_8) {
2614 	      score_column[rhigh_calc] = NEG_INFINITY_8; /* Saturation */
2615 	    } else if (score > POS_INFINITY_8) {
2616 	      /* Should never get here, because we limit size of matrix using 8-bit quantities */
2617 	      score_column[rhigh_calc] = POS_INFINITY_8; /* Saturation */
2618 	    } else {
2619 	      score_column[rhigh_calc] = (Score8_T) score;
2620 	    }
2621 	    (*directions_Egap)[c][rhigh_calc] = DIAG;
2622 	    (*directions_nogap)[c][rhigh_calc] = DIAG;
2623 	  }
2624 	}
2625 
2626 	debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
2627 		      rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
2628 
2629 	if (rlo == 0) {
2630 	  c_gap = NEG_INFINITY_INT;
2631 	  last_nogap = NEG_INFINITY_INT;
2632 	} else if (c >= rlo + uband) {
2633 	  c_gap = NEG_INFINITY_INT;
2634 	  last_nogap = NEG_INFINITY_INT;
2635 	} else {
2636 	  debug3(printf("At c %d, uband %d, reading c_gap %d\n",c,uband,FF[c]));
2637 	  c_gap = FF[c];
2638 	  last_nogap = (int) score_column[rlo_calc-1];
2639 	}
2640 
2641 	if ((r = rlo_calc) == c - uband) {
2642 	  /* Handle top value as a special case to prevent going outside of uband */
2643 	  /* FGAP */
2644 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
2645 			r,c,c_gap + extend,last_nogap + open + extend));
2646 	  score = last_nogap + open /* + extend */;
2647 	  c_gap = score + extend;
2648 	  /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
2649 
2650 	  /* NOGAP */
2651 	  last_nogap = (int) score_column[r];
2652 	  r++;
2653 	}
2654 
2655 	/* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
2656 	for ( ; r <= rhigh_calc; r++) {
2657 	  /* FGAP */
2658 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
2659 			r,c,c_gap + extend,last_nogap + open + extend));
2660 	  if (c_gap /* + extend */ >= (score = last_nogap + open /* + extend */)) {  /* Use >= for jump late */
2661 	    c_gap += extend;
2662 	    (*directions_Fgap)[c][r] = VERT;
2663 	  } else {
2664 	    c_gap = score + extend;
2665 	    /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
2666 	  }
2667 
2668 	  /* NOGAP */
2669 	  last_nogap = (int) score_column[r];
2670 	  debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
2671 	  if (c_gap >= last_nogap) {  /* Use >= for jump late */
2672 	    last_nogap = c_gap;
2673 	    score_column[r] = (c_gap < NEG_INFINITY_8) ? NEG_INFINITY_8 : (Score8_T) c_gap; /* Saturation */
2674 	    (*directions_nogap)[c][r] = VERT;
2675 	  }
2676 	}
2677 
2678 	FF[c] = c_gap;
2679 	debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
2680 	H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
2681       }
2682     }
2683 
2684   } else {
2685     /* jump early */
2686 #if defined(ZERO_INITIAL_GAP_PENALTY) || defined(INFINITE_INITIAL_GAP_PENALTY)
2687     /* No need for ones */
2688 #else
2689     ones = _mm_set1_epi8(1);
2690 #endif
2691     for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS_NONAVX2) {
2692       if ((rhigh = rlo + SIMD_NCHARS_NONAVX2 - 1) > rlength) {
2693 	rhigh = rlength;
2694       }
2695 
2696       if ((c = rlo - lband) < 0) {
2697 	c = 0;
2698 
2699 #if defined(ZERO_INITIAL_GAP_PENALTY)
2700 	/* Initial H in column 0 determined by zeroed out H.  E needs to equal gap_open for column 1. */
2701 	E_r_gap = _mm_set1_epi8(NEG_INFINITY_8-open);
2702 	filter = _mm_cmpgt_epi8(ramp,lband_vector);
2703 	H_nogap_r = _mm_and_si128(filter,E_r_gap); /* Use zeros for score */
2704 
2705 	E_r_gap = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
2706 	E_r_gap = _mm_adds_epi8(E_r_gap,gap_open);
2707 
2708 	ramp = _mm_adds_epi8(ramp,ramp_chunk); /* Prepare for next block */
2709 	extend_ladder = _mm_adds_epi8(extend_ladder,extend_chunk); /* Prepare for next block */
2710 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
2711 	/* dir_horiz tests if E > H.  To fill in first column of each
2712 	   row block with non-diags, make E > H. */
2713 	E_r_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
2714 	H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
2715 #else
2716 	/* Initial H in column 0 determined by E, which needs to equal
2717 	   gap_open + extend_ladder for column 1.  H is free to be set
2718 	   less than E. */
2719 	H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open+1); /* To compensate for T1 = H + open */
2720 	filter = _mm_cmpgt_epi8(ramp,lband_vector);
2721 	H_nogap_r = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),_mm_and_si128(filter,H_nogap_r));
2722 	E_r_gap = _mm_adds_epi8(H_nogap_r,gap_open);
2723 	H_nogap_r = _mm_subs_epi8(H_nogap_r,ones); /* To ensure H < E */
2724 	ramp = _mm_adds_epi8(ramp,ramp_chunk); /* Prepare for next block */
2725 	extend_ladder = _mm_adds_epi8(extend_ladder,extend_chunk); /* Prepare for next block */
2726 #endif
2727       } else {
2728 	E_r_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
2729 	H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
2730       }
2731 
2732       for ( ; c <= rhigh + uband && c <= glength; c++) {
2733 	score_column = matrix[c];
2734 
2735 	if (c == 0) {
2736 	  pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
2737 
2738 #ifdef ZERO_INITIAL_GAP_PENALTY
2739 	  X_prev_nogap = _mm_set1_epi8(0);
2740 #else
2741 	  if (rlo == 0) {
2742 	    X_prev_nogap = _mm_set1_epi8(0);
2743 	  } else {
2744 	    X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
2745 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2746 	  }
2747 #endif
2748 
2749 	} else {
2750 	  na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
2751 	  na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
2752 	  pairscores_std_ptr = pairscores[na2];
2753 	  pairscores_alt_ptr = pairscores[na2_alt];
2754 
2755 	  if (rlo == 0) {
2756 #ifdef ZERO_INITIAL_GAP_PENALTY
2757 	    X_prev_nogap = _mm_set1_epi8(0);
2758 #else
2759 	    X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
2760 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2761 #endif
2762 	  } else {
2763 	    /* second or greater block of 8 */
2764 	    X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
2765 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
2766 	  }
2767 	}
2768 
2769 	debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
2770 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
2771 
2772 	/* EGAP */
2773 	T1 = _mm_adds_epi8(H_nogap_r, gap_open);
2774 	dir_horiz = _mm_cmpgt_epi8(E_r_gap,T1); /* E > H, for jump early */
2775 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
2776 	debug15(print_vector_8(T1,rlo,c,"T1"));
2777 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
2778 
2779 #ifdef HAVE_SSE4_1
2780 	E_r_gap = _mm_max_epi8(E_r_gap, T1); /* Compare H + open with vert */
2781 #else
2782 	/* Compare H + open with vert */
2783 	E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
2784 #endif
2785 	E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
2786 	debug15(print_vector_8(E_r_gap,rlo,c,"E"));
2787 
2788 
2789 	/* NOGAP */
2790 	T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_NONAVX2);
2791 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
2792 	H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
2793 	X_prev_nogap = T1;
2794 
2795 	/* Add pairscores, allowing for alternate genomic nt */
2796 #ifdef HAVE_SSE4_1
2797 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
2798 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
2799 	H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
2800 #else
2801 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
2802 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
2803 	pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
2804 	H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
2805 #endif
2806 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
2807 	debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
2808 
2809 	dir_horiz = _mm_cmpgt_epi8(E_r_gap,H_nogap_r); /* E > H, for jump early */
2810 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
2811 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
2812 
2813 
2814 #ifdef HAVE_SSE4_1
2815 	H_nogap_r = _mm_max_epi8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
2816 #else
2817 	/* Compare H + pairscores with horiz + extend */
2818 	H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
2819 #endif
2820 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
2821 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
2822 
2823 
2824 	/* F loop */
2825 	if ((rlo_calc = rlo) < c - uband) {
2826 	  rlo_calc = c - uband;
2827 	}
2828 	if ((rhigh_calc = rhigh) >= c + lband) {
2829 	  rhigh_calc = c + lband;
2830 	  if (c > 0) {
2831 	    /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
2832 	    pairscore = pairscores[na2][rhigh_calc];
2833 	    if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
2834 	      pairscore = pairscore0;
2835 	    }
2836 #ifndef HAVE_SSE4_1
2837 	    pairscore -= 128;
2838 #endif
2839 	    if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_8) {
2840 	      score_column[rhigh_calc] = NEG_INFINITY_8; /* Saturation */
2841 	    } else if (score > POS_INFINITY_8) {
2842 	      /* Should never get here, because we limit size of matrix using 8-bit quantities */
2843 	      score_column[rhigh_calc] = POS_INFINITY_8; /* Saturation */
2844 	    } else {
2845 	      score_column[rhigh_calc] = (Score8_T) score;
2846 	    }
2847 	    (*directions_Egap)[c][rhigh_calc] = DIAG;
2848 	    (*directions_nogap)[c][rhigh_calc] = DIAG;
2849 	  }
2850 	}
2851 
2852 	debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
2853 		      rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
2854 
2855 	if (rlo == 0) {
2856 	  c_gap = NEG_INFINITY_INT;
2857 	  last_nogap = NEG_INFINITY_INT;
2858 	} else if (c >= rlo + uband) {
2859 	  c_gap = NEG_INFINITY_INT;
2860 	  last_nogap = NEG_INFINITY_INT;
2861 	} else {
2862 	  c_gap = FF[c];
2863 	  last_nogap = (int) score_column[rlo_calc-1];
2864 	  debug3(printf("LAST_NOGAP gets score_column[%d-1], or %d\n",rlo_calc,last_nogap));
2865 	}
2866 
2867 	if ((r = rlo_calc) == c - uband) {
2868 	  /* Handle top value as a special case to prevent going outside of uband */
2869 	  /* FGAP */
2870 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
2871 			r,c,c_gap + extend,last_nogap + open + extend));
2872 	  score = last_nogap + open /* + extend */;
2873 	  c_gap = score + extend;
2874 	  /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
2875 
2876 	  /* NOGAP */
2877 	  last_nogap = (int) score_column[r];
2878 	  r++;
2879 	}
2880 
2881 	/* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
2882 	for ( ; r <= rhigh_calc; r++) {
2883 	  /* FGAP */
2884 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
2885 			r,c,c_gap + extend,last_nogap + open + extend));
2886 	  if (c_gap /* + extend */ > (score = last_nogap + open /* + extend */)) {  /* Use > for jump early */
2887 	    c_gap += extend;
2888 	    (*directions_Fgap)[c][r] = VERT;
2889 	  } else {
2890 	    c_gap = score + extend;
2891 	    /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
2892 	  }
2893 
2894 	  /* NOGAP */
2895 	  last_nogap = (int) score_column[r];
2896 	  debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
2897 	  if (c_gap > last_nogap) {  /* Use > for jump early */
2898 	    last_nogap = c_gap;
2899 	    score_column[r] = (c_gap < NEG_INFINITY_8) ? NEG_INFINITY_8 : (Score8_T) c_gap; /* Saturation */
2900 	    debug3(printf("Stored at score_column[%d]: %d\n",r,(Score8_T) score_column[r]));
2901 	    (*directions_nogap)[c][r] = VERT;
2902 	  }
2903 	}
2904 
2905 	FF[c] = c_gap;
2906 	debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
2907 	H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
2908       }
2909     }
2910   }
2911 
2912 
2913 #ifdef CHECK1
2914   /* Row 0 and column 0 directions fail anyway due to saturation */
2915   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
2916   (*directions_Egap)[1][0] = HORIZ;
2917   (*directions_Fgap)[0][1] = VERT;
2918 #endif
2919 
2920 #ifdef DEBUG2
2921   printf("SIMD: Dynprog_simd_8\n");
2922   Matrix8_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
2923 		revp,lband,uband);
2924   Directions8_print(*directions_nogap,*directions_Egap,*directions_Fgap,
2925 			    rlength,glength,rsequence,gsequence,gsequence_alt,
2926 			    revp,lband,uband);
2927 #endif
2928 
2929 #ifdef CHECK1
2930   /* Check for row 0 directions */
2931   for (c = 1; c <= uband && c <= glength; c++) {
2932     assert((*directions_Egap)[c][0] != DIAG);
2933     assert((*directions_nogap)[c][0] != DIAG);
2934   }
2935   /* Check for column 0 directions */
2936   for (r = 1; r <= lband && r <= rlength; r++) {
2937     assert((*directions_Fgap)[0][r] != DIAG);
2938     assert((*directions_nogap)[0][r] != DIAG);
2939   }
2940 #endif
2941 
2942   FREEA(FF);
2943   _mm_free(pairscores_col0);
2944   _mm_free(pairscores[4]);
2945   _mm_free(pairscores[3]);
2946   _mm_free(pairscores[2]);
2947   _mm_free(pairscores[1]);
2948   _mm_free(pairscores[0]);
2949 
2950   return matrix;
2951 }
2952 #endif
2953 
2954 
2955 
2956 #if defined(HAVE_SSE2)
2957 /* Modified from Dynprog_simd_8_upper.  Operates by columns. */
2958 Score8_T **
Dynprog_simd_8(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,Direction8_T *** directions_Fgap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,int uband,bool jump_late_p,bool revp)2959 Dynprog_simd_8 (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
2960 		Direction8_T ***directions_Fgap,
2961 		T this, char *rsequence, char *gsequence, char *gsequence_alt,
2962 		int rlength, int glength,
2963 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
2964 		int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
2965 #endif
2966 		Mismatchtype_T mismatchtype, int open, int extend,
2967 		int lband, int uband, bool jump_late_p, bool revp) {
2968   int c_gap, last_nogap, score, *FF;	/* Need to have the ability to go past NEG_INFINITY */
2969   Score8_T **matrix, *score_column;
2970 #ifdef HAVE_AVX2
2971   __m256i pairscores_std, pairscores_alt;
2972   __m256i H_nogap_r, X_prev_nogap, E_r_gap, T1;
2973   __m256i gap_open, gap_extend, complement_dummy;
2974   __m256i dir_horiz;
2975 #if defined(ZERO_INITIAL_GAP_PENALTY)
2976   __m256i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter;
2977 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
2978 #else
2979   __m256i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter, ones;
2980 #endif
2981 
2982 #else
2983   __m128i pairscores_std, pairscores_alt;
2984   __m128i H_nogap_r, X_prev_nogap, E_r_gap, T1;
2985   __m128i gap_open, gap_extend, complement_dummy;
2986   __m128i dir_horiz;
2987 #if defined(ZERO_INITIAL_GAP_PENALTY)
2988   __m128i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter;
2989 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
2990 #else
2991   __m128i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter, ones;
2992 #endif
2993 #endif
2994 #ifndef HAVE_SSE4_1
2995   __m128i pairscores_best, all_128;
2996 #endif
2997   int rlength_ceil, lband_ceil, r, c;
2998   int rlo, rhigh, rlo_calc, rhigh_calc;
2999   int na1, na2, na2_alt;
3000   Score8_T *pairscores_col0;
3001   Score8_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr, pairscore, pairscore0;
3002   Pairdistance_T **pairdistance_array_type;
3003 
3004 #ifdef DEBUG_AVX2
3005   Score8_T **matrix_std;
3006   Direction8_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
3007 #elif defined(DEBUG_SIMD)
3008   Score32_T **matrix_std;
3009   Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
3010 #endif
3011 
3012 
3013   debug2(printf("Dynprog_simd_8.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
3014   debug15(printf("Dynprog_simd_8.  jump_late_p %d, open %d, extend %d, lband %d, uband %d\n",
3015 		 jump_late_p,open,extend,lband,uband));
3016 
3017   rlength_ceil = (int) ((rlength + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
3018 
3019 #ifdef HAVE_SSE4_1
3020   pairdistance_array_type = pairdistance_array[mismatchtype];
3021 #else
3022   /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
3023   pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
3024   all_128 = _mm_set1_epi8(128);
3025 #endif
3026 
3027   debug(printf("Dynprog_simd_8: "));
3028   debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
3029   debug(printf("Query length rounded up to %d\n",rlength_ceil));
3030 
3031   matrix = aligned_score8_alloc(rlength_ceil,glength,
3032 				this->aligned.one.matrix_ptrs,this->aligned.one.matrix_space);
3033   *directions_nogap = aligned_directions8_alloc(rlength_ceil,glength,
3034 						this->aligned.one.directions_ptrs_0,this->aligned.one.directions_space_0);
3035   *directions_Egap = aligned_directions8_alloc(rlength_ceil,glength,
3036 					       this->aligned.one.directions_ptrs_1,this->aligned.one.directions_space_1);
3037   /* Need to calloc to save time in F loop */
3038   *directions_Fgap = aligned_directions8_calloc(rlength_ceil,glength,
3039 						this->aligned.one.directions_ptrs_2,this->aligned.one.directions_space_2);
3040 
3041 #if 0
3042   /* Row 0 initialization */
3043   /* penalty = open; */
3044   for (c = 1; c <= uband && c <= glength; c++) {
3045     /* penalty += extend; */
3046     (*directions_Egap)[c][0] = HORIZ;
3047     (*directions_nogap)[c][0] = HORIZ;
3048   }
3049 #endif
3050 #if 0
3051   /* Already initialized to DIAG.  Actually no longer initializing directions_Egap */
3052   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
3053   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
3054 #endif
3055 
3056 #if 0
3057   /* Column 0 initialization */
3058   /* penalty = open; */
3059   for (r = 1; r <= SIMD_NCHARS && r <= rlength; r++) {
3060     /* penalty += extend; */
3061     (*directions_nogap)[0][r] = VERT;
3062   }
3063 #endif
3064 
3065 
3066   /* Load pairscores.  Store match - mismatch */
3067   pairscores[0] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3068   pairscores[1] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3069   pairscores[2] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3070   pairscores[3] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3071   pairscores[4] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
3072 
3073   lband_ceil = (int) ((lband + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
3074   pairscores_col0 = (Score8_T *) _mm_malloc(lband_ceil * sizeof(Score8_T),ALIGN_SIZE);
3075 
3076 
3077 #if 0
3078   /* Should not be necessary */
3079   memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score8_T));
3080   memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score8_T));
3081   memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score8_T));
3082   memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score8_T));
3083   memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score8_T));
3084 #endif
3085 
3086   /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
3087 #ifdef HAVE_SSE4_1
3088   pairscores_col0[0] = (Score8_T) 0;
3089   /* Initialization just to lband causes errors in dir_horiz for Egap */
3090 #ifdef ZERO_INITIAL_GAP_PENALTY
3091   for (r = 1; r < lband_ceil; r++) {
3092     pairscores_col0[r] = (Score8_T) 0;
3093   }
3094 #else
3095   for (r = 1; r < lband_ceil; r++) {
3096     pairscores_col0[r] = (Score8_T) NEG_INFINITY_8;
3097   }
3098 #endif
3099 #else
3100   pairscores_col0[0] = (Score8_T) 0+128;
3101   /* Initialization just to lband causes errors in dir_horiz for Egap */
3102 #ifdef ZERO_INITIAL_GAP_PENALTY
3103   for (r = 1; r < lband_ceil; r++) {
3104     pairscores_col0[r] = (Score8_T) 0+128;
3105   }
3106 #else
3107   for (r = 1; r < lband_ceil; r++) {
3108     pairscores_col0[r] = (Score8_T) NEG_INFINITY_8+128;
3109   }
3110 #endif
3111 #endif
3112 
3113 
3114   /* Row 0 */
3115   r = 0; na1 = 'N';
3116   pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3117   pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3118   pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3119   pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3120   pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3121 
3122   if (revp == false) {
3123     for (r = 1; r <= rlength; r++) {
3124       na1 = (int) rsequence[r-1];
3125       pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3126       pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3127       pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3128       pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3129       pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3130     }
3131   } else {
3132     for (r = 1; r <= rlength; r++) {
3133       na1 = (int) rsequence[1-r];
3134       pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3135       pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3136       pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3137       pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3138       pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3139     }
3140   }
3141 
3142 #if 0
3143   /* Should not be necessary */
3144   memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3145   memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3146   memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3147   memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3148   memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3149 #endif
3150 
3151   complement_dummy = _MM_SET1_EPI8(-1);
3152 
3153   FF = (int *) MALLOCA((glength + 1) * sizeof(int));
3154 
3155   gap_open = _MM_SET1_EPI8((Score8_T) open);
3156   gap_extend = _MM_SET1_EPI8((Score8_T) extend);
3157 
3158 
3159 #ifndef INFINITE_INITIAL_GAP_PENALTY
3160 #ifdef HAVE_AVX2
3161   ramp = _mm256_setr_epi8(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
3162   extend_ladder = _mm256_setr_epi8(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend,
3163 				   9*extend,10*extend,11*extend,12*extend,13*extend,14*extend,15*extend,16*extend);
3164 #else
3165   ramp = _mm_setr_epi8(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
3166   extend_ladder = _mm_setr_epi8(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend,
3167 				9*extend,10*extend,11*extend,12*extend,13*extend,14*extend,15*extend,16*extend);
3168 #endif
3169   lband_vector = _MM_SET1_EPI8(lband);
3170   ramp_chunk = _MM_SET1_EPI8(SIMD_NCHARS);
3171   extend_chunk = _MM_SET1_EPI8(SIMD_NCHARS*extend);
3172 #endif
3173 
3174   if (jump_late_p) {
3175     for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS) {
3176       if ((rhigh = rlo + SIMD_NCHARS - 1) > rlength) {
3177 	rhigh = rlength;
3178       }
3179 
3180       if ((c = rlo - lband) < 0) {
3181 	c = 0;
3182 
3183 #if defined(ZERO_INITIAL_GAP_PENALTY)
3184 	/* Initial H in column 0 determined by zeroed out H.  E needs to equal gap_open for column 1. */
3185 	E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8-open);
3186 	filter = _MM_CMPGT_EPI8(ramp,lband_vector);
3187 	H_nogap_r = _MM_AND_SI(filter,E_r_gap); /* Use zeros for score */
3188 
3189 	E_r_gap = _MM_OR_SI(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
3190 	E_r_gap = _MM_ADDS_EPI8(E_r_gap,gap_open);
3191 
3192 	ramp = _MM_ADDS_EPI8(ramp,ramp_chunk); /* Prepare for next block */
3193 	extend_ladder = _MM_ADDS_EPI8(extend_ladder,extend_chunk); /* Prepare for next block */
3194 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
3195 	/* dir_horiz tests if E >= H.  To fill in first column of each
3196 	   row block with non-diags, make E == H. */
3197 	E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8);
3198 	H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3199 #else
3200 	/* Initial H in column 0 determined by E, which needs to equal
3201 	   gap_open + extend_ladder for column 1.  H is free to be set
3202 	   equal to E. */
3203 	H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* To compensate for T1 = H + open */
3204 	filter = _MM_CMPGT_EPI8(ramp,lband_vector);
3205 	H_nogap_r = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),_MM_AND_SI(filter,H_nogap_r));
3206 	E_r_gap = _MM_ADDS_EPI8(H_nogap_r,gap_open);
3207 	ramp = _MM_ADDS_EPI8(ramp,ramp_chunk); /* Prepare for next block */
3208 	extend_ladder = _MM_ADDS_EPI8(extend_ladder,extend_chunk); /* Prepare for next block */
3209 #endif
3210       } else {
3211 	E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8);
3212 	H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3213       }
3214 
3215       for ( ; c <= rhigh + uband && c <= glength; c++) {
3216 	score_column = matrix[c];
3217 
3218 	if (c == 0) {
3219 	  pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
3220 
3221 #ifdef ZERO_INITIAL_GAP_PENALTY
3222 	  X_prev_nogap = _MM_SETZERO_SI();
3223 #elif defined(HAVE_AVX2)
3224 	  if (rlo == 0) {
3225 	    X_prev_nogap = _mm256_setzero_si256();
3226 	  } else {
3227 	    X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
3228 	  }
3229 #else
3230 	  if (rlo == 0) {
3231 	    X_prev_nogap = _mm_setzero_si128();
3232 	  } else {
3233 	    X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
3234 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3235 	  }
3236 #endif
3237 
3238 	} else {
3239 	  na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
3240 	  na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
3241 	  pairscores_std_ptr = pairscores[na2];
3242 	  pairscores_alt_ptr = pairscores[na2_alt];
3243 
3244 #ifdef HAVE_AVX2
3245 	  if (rlo == 0) {
3246 #ifdef ZERO_INITIAL_GAP_PENALTY
3247 	    X_prev_nogap = _MM_SETZERO_SI();
3248 #else
3249 	    X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
3250 #endif
3251 	  } else {
3252 	    /* second or greater block of 8 */
3253 	    X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_CHAR_INSERT);
3254 	  }
3255 
3256 #else
3257 	  if (rlo == 0) {
3258 #ifdef ZERO_INITIAL_GAP_PENALTY
3259 	    X_prev_nogap = _MM_SETZERO_SI();
3260 #else
3261 	    X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
3262 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3263 #endif
3264 	  } else {
3265 	    /* second or greater block of 8 */
3266 	    X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
3267 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3268 	  }
3269 #endif
3270 	}
3271 
3272 	debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
3273 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
3274 
3275 	/* EGAP */
3276 	T1 = _MM_ADDS_EPI8(H_nogap_r, gap_open);
3277 	dir_horiz = _MM_CMPLT_EPI8(E_r_gap,T1); /* E < H */
3278 	dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy);	/* E >= H, for jump late */
3279 #ifdef HAVE_AVX2
3280 	_mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3281 #else
3282 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3283 #endif
3284 	debug15(print_vector_8(T1,rlo,c,"T1"));
3285 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
3286 
3287 #ifdef HAVE_SSE4_1
3288 	E_r_gap = _MM_MAX_EPI8(E_r_gap, T1); /* Compare H + open with vert */
3289 #else
3290 	E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
3291 #endif
3292 	E_r_gap = _MM_ADDS_EPI8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3293 	debug15(print_vector_8(E_r_gap,rlo,c,"E"));
3294 
3295 
3296 	/* NOGAP */
3297 #ifdef HAVE_AVX2
3298 	T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_r,SIMD_NCHARS-1),LAST_CHAR_INSERT);
3299 	X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_r,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
3300 	H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_CHAR);
3301 #else
3302 	T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_SHIFT);
3303 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
3304 #endif
3305 	H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
3306 	X_prev_nogap = T1;
3307 
3308 	/* Add pairscores, allowing for alternate genomic nt */
3309 #ifdef HAVE_AVX2
3310 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
3311 	pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
3312 	H_nogap_r = _mm256_adds_epi8(H_nogap_r, _mm256_max_epi8(pairscores_std,pairscores_alt));
3313 #elif defined(HAVE_SSE4_1)
3314 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
3315 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
3316 	H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
3317 #else
3318 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
3319 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
3320 	pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
3321 	H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
3322 #endif
3323 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
3324 	debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
3325 
3326 	dir_horiz = _MM_CMPLT_EPI8(E_r_gap,H_nogap_r); /* E < H */
3327 	dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy);	/* E >= H, for jump late */
3328 #ifdef HAVE_AVX2
3329 	_mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
3330 #else
3331 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
3332 #endif
3333 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
3334 
3335 
3336 #ifdef HAVE_SSE4_1
3337 	H_nogap_r = _MM_MAX_EPI8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
3338 #else
3339 	/* Compare H + pairscores with horiz + extend */
3340 	H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
3341 #endif
3342 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
3343 #ifdef HAVE_AVX2
3344 	_mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
3345 #else
3346 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
3347 #endif
3348 
3349 	/* F loop */
3350 	if ((rlo_calc = rlo) < c - uband) {
3351 	  rlo_calc = c - uband;
3352 	}
3353 	if ((rhigh_calc = rhigh) >= c + lband) {
3354 	  rhigh_calc = c + lband;
3355 	  if (c > 0) {
3356 	    /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
3357 	    pairscore = pairscores[na2][rhigh_calc];
3358 	    if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
3359 	      pairscore = pairscore0;
3360 	    }
3361 #ifndef HAVE_SSE4_1
3362 	    pairscore -= 128;
3363 #endif
3364 	    if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_8) {
3365 	      score_column[rhigh_calc] = NEG_INFINITY_8; /* Saturation */
3366 	    } else if (score > POS_INFINITY_8) {
3367 	      /* Should never get here, because we limit size of matrix using 8-bit quantities */
3368 	      score_column[rhigh_calc] = POS_INFINITY_8; /* Saturation */
3369 	    } else {
3370 	      score_column[rhigh_calc] = (Score8_T) score;
3371 	    }
3372 	    (*directions_Egap)[c][rhigh_calc] = DIAG;
3373 	    (*directions_nogap)[c][rhigh_calc] = DIAG;
3374 	  }
3375 	}
3376 
3377 	debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
3378 		      rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
3379 
3380 	if (rlo == 0) {
3381 	  c_gap = NEG_INFINITY_INT;
3382 	  last_nogap = NEG_INFINITY_INT;
3383 	} else if (c >= rlo + uband) {
3384 	  c_gap = NEG_INFINITY_INT;
3385 	  last_nogap = NEG_INFINITY_INT;
3386 	} else {
3387 	  debug3(printf("At c %d, uband %d, reading c_gap %d\n",c,uband,FF[c]));
3388 	  c_gap = FF[c];
3389 	  last_nogap = (int) score_column[rlo_calc-1];
3390 	}
3391 
3392 	if ((r = rlo_calc) == c - uband) {
3393 	  /* Handle top value as a special case to prevent going outside of uband */
3394 	  /* FGAP */
3395 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
3396 			r,c,c_gap + extend,last_nogap + open + extend));
3397 	  score = last_nogap + open /* + extend */;
3398 	  c_gap = score + extend;
3399 	  /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
3400 
3401 	  /* NOGAP */
3402 	  last_nogap = (int) score_column[r];
3403 	  r++;
3404 	}
3405 
3406 	/* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
3407 	for ( ; r <= rhigh_calc; r++) {
3408 	  /* FGAP */
3409 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
3410 			r,c,c_gap + extend,last_nogap + open + extend));
3411 	  if (c_gap /* + extend */ >= (score = last_nogap + open /* + extend */)) {  /* Use >= for jump late */
3412 	    c_gap += extend;
3413 	    (*directions_Fgap)[c][r] = VERT;
3414 	  } else {
3415 	    c_gap = score + extend;
3416 	    /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
3417 	  }
3418 
3419 	  /* NOGAP */
3420 	  last_nogap = (int) score_column[r];
3421 	  debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
3422 	  if (c_gap >= last_nogap) {  /* Use >= for jump late */
3423 	    last_nogap = c_gap;
3424 	    score_column[r] = (c_gap < NEG_INFINITY_8) ? NEG_INFINITY_8 : (Score8_T) c_gap; /* Saturation */
3425 	    (*directions_nogap)[c][r] = VERT;
3426 	  }
3427 	}
3428 
3429 	FF[c] = c_gap;
3430 	debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
3431 #ifdef HAVE_AVX2
3432 	H_nogap_r = _mm256_load_si256((__m256i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
3433 #else
3434 	H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
3435 #endif
3436       }
3437     }
3438 
3439   } else {
3440     /* jump early */
3441 #if defined(ZERO_INITIAL_GAP_PENALTY) || defined(INFINITE_INITIAL_GAP_PENALTY)
3442     /* No need for ones */
3443 #else
3444     ones = _MM_SET1_EPI8(1);
3445 #endif
3446     for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS) {
3447       if ((rhigh = rlo + SIMD_NCHARS - 1) > rlength) {
3448 	rhigh = rlength;
3449       }
3450 
3451       if ((c = rlo - lband) < 0) {
3452 	c = 0;
3453 
3454 #if defined(ZERO_INITIAL_GAP_PENALTY)
3455 	/* Initial H in column 0 determined by zeroed out H.  E needs to equal gap_open for column 1. */
3456 	E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8-open);
3457 	filter = _MM_CMPGT_EPI8(ramp,lband_vector);
3458 	H_nogap_r = _MM_AND_SI(filter,E_r_gap); /* Use zeros for score */
3459 
3460 	E_r_gap = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),H_nogap_r);
3461 	E_r_gap = _MM_ADDS_EPI8(E_r_gap,gap_open);
3462 
3463 	ramp = _MM_ADDS_EPI8(ramp,ramp_chunk); /* Prepare for next block */
3464 	extend_ladder = _MM_ADDS_EPI8(extend_ladder,extend_chunk); /* Prepare for next block */
3465 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
3466 	/* dir_horiz tests if E > H.  To fill in first column of each
3467 	   row block with non-diags, make E > H. */
3468 	E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8+1);
3469 	H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3470 #else
3471 	/* Initial H in column 0 determined by E, which needs to equal
3472 	   gap_open + extend_ladder for column 1.  H is free to be set
3473 	   less than E. */
3474 	H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open+1); /* To compensate for T1 = H + open */
3475 	filter = _MM_CMPGT_EPI8(ramp,lband_vector);
3476 	H_nogap_r = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),_MM_AND_SI(filter,H_nogap_r));
3477 	E_r_gap = _MM_ADDS_EPI8(H_nogap_r,gap_open);
3478 	H_nogap_r = _MM_SUBS_EPI8(H_nogap_r,ones); /* To ensure H < E */
3479 	ramp = _MM_ADDS_EPI8(ramp,ramp_chunk); /* Prepare for next block */
3480 	extend_ladder = _MM_ADDS_EPI8(extend_ladder,extend_chunk); /* Prepare for next block */
3481 #endif
3482       } else {
3483 	E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8+1);
3484 	H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3485       }
3486 
3487       for ( ; c <= rhigh + uband && c <= glength; c++) {
3488 	score_column = matrix[c];
3489 
3490 	if (c == 0) {
3491 	  pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
3492 
3493 #ifdef ZERO_INITIAL_GAP_PENALTY
3494 	  X_prev_nogap = _MM_SETZERO_SI();
3495 #elif defined(HAVE_AVX2)
3496 	  if (rlo == 0) {
3497 	    X_prev_nogap = _mm256_setzero_si256();
3498 	  } else {
3499 	    X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
3500 	  }
3501 #else
3502 	  if (rlo == 0) {
3503 	    X_prev_nogap = _mm_setzero_si128();
3504 	  } else {
3505 	    X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
3506 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3507 	  }
3508 #endif
3509 
3510 	} else {
3511 	  na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
3512 	  na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
3513 	  pairscores_std_ptr = pairscores[na2];
3514 	  pairscores_alt_ptr = pairscores[na2_alt];
3515 
3516 #ifdef HAVE_AVX2
3517 	  if (rlo == 0) {
3518 #ifdef ZERO_INITIAL_GAP_PENALTY
3519 	    X_prev_nogap = _MM_SETZERO_SI();
3520 #else
3521 	    X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
3522 #endif
3523 	  } else {
3524 	    /* second or greater block of 8 */
3525 	    X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_CHAR_INSERT);
3526 	  }
3527 
3528 #else
3529 	  if (rlo == 0) {
3530 #ifdef ZERO_INITIAL_GAP_PENALTY
3531 	    X_prev_nogap = _MM_SETZERO_SI();
3532 #else
3533 	    X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8);
3534 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3535 #endif
3536 	  } else {
3537 	    /* second or greater block of 8 */
3538 	    X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
3539 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
3540 	  }
3541 #endif
3542 
3543 	}
3544 
3545 	debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
3546 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
3547 
3548 	/* EGAP */
3549 	T1 = _MM_ADDS_EPI8(H_nogap_r, gap_open);
3550 	dir_horiz = _MM_CMPGT_EPI8(E_r_gap,T1); /* E > H, for jump early */
3551 #ifdef HAVE_AVX2
3552 	_mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3553 #else
3554 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3555 #endif
3556 	debug15(print_vector_8(T1,rlo,c,"T1"));
3557 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
3558 
3559 #ifdef HAVE_SSE4_1
3560 	E_r_gap = _MM_MAX_EPI8(E_r_gap, T1); /* Compare H + open with vert */
3561 #else
3562 	/* Compare H + open with vert */
3563 	E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
3564 #endif
3565 	E_r_gap = _MM_ADDS_EPI8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3566 	debug15(print_vector_8(E_r_gap,rlo,c,"E"));
3567 
3568 
3569 	/* NOGAP */
3570 #ifdef HAVE_AVX2
3571 	T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_r,SIMD_NCHARS-1),LAST_CHAR_INSERT);
3572 	X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_r,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
3573 	H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_CHAR);
3574 #else
3575 	T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_SHIFT);
3576 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
3577 #endif
3578 	H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
3579 	X_prev_nogap = T1;
3580 
3581 	/* Add pairscores, allowing for alternate genomic nt */
3582 #ifdef HAVE_AVX2
3583 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
3584 	pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
3585 	H_nogap_r = _mm256_adds_epi8(H_nogap_r, _mm256_max_epi8(pairscores_std,pairscores_alt));
3586 #elif defined(HAVE_SSE4_1)
3587 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
3588 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
3589 	H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
3590 #else
3591 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
3592 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
3593 	pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
3594 	H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
3595 #endif
3596 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
3597 	debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
3598 
3599 	dir_horiz = _MM_CMPGT_EPI8(E_r_gap,H_nogap_r); /* E > H, for jump early */
3600 #ifdef HAVE_AVX2
3601 	_mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
3602 #else
3603 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
3604 #endif
3605 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
3606 
3607 
3608 #ifdef HAVE_SSE4_1
3609 	H_nogap_r = _MM_MAX_EPI8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
3610 #else
3611 	/* Compare H + pairscores with horiz + extend */
3612 	H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
3613 #endif
3614 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
3615 #ifdef HAVE_AVX2
3616 	_mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
3617 #else
3618 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
3619 #endif
3620 
3621 
3622 	/* F loop */
3623 	if ((rlo_calc = rlo) < c - uband) {
3624 	  rlo_calc = c - uband;
3625 	}
3626 	if ((rhigh_calc = rhigh) >= c + lband) {
3627 	  rhigh_calc = c + lband;
3628 	  if (c > 0) {
3629 	    /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
3630 	    pairscore = pairscores[na2][rhigh_calc];
3631 	    if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
3632 	      pairscore = pairscore0;
3633 	    }
3634 #ifndef HAVE_SSE4_1
3635 	    pairscore -= 128;
3636 #endif
3637 	    if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_8) {
3638 	      score_column[rhigh_calc] = NEG_INFINITY_8; /* Saturation */
3639 	    } else if (score > POS_INFINITY_8) {
3640 	      /* Should never get here, because we limit size of matrix using 8-bit quantities */
3641 	      score_column[rhigh_calc] = POS_INFINITY_8; /* Saturation */
3642 	    } else {
3643 	      score_column[rhigh_calc] = (Score8_T) score;
3644 	    }
3645 	    (*directions_Egap)[c][rhigh_calc] = DIAG;
3646 	    (*directions_nogap)[c][rhigh_calc] = DIAG;
3647 	  }
3648 	}
3649 
3650 	debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
3651 		      rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
3652 
3653 	if (rlo == 0) {
3654 	  c_gap = NEG_INFINITY_INT;
3655 	  last_nogap = NEG_INFINITY_INT;
3656 	} else if (c >= rlo + uband) {
3657 	  c_gap = NEG_INFINITY_INT;
3658 	  last_nogap = NEG_INFINITY_INT;
3659 	} else {
3660 	  c_gap = FF[c];
3661 	  last_nogap = (int) score_column[rlo_calc-1];
3662 	  debug3(printf("LAST_NOGAP gets score_column[%d-1], or %d\n",rlo_calc,last_nogap));
3663 	}
3664 
3665 	if ((r = rlo_calc) == c - uband) {
3666 	  /* Handle top value as a special case to prevent going outside of uband */
3667 	  /* FGAP */
3668 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
3669 			r,c,c_gap + extend,last_nogap + open + extend));
3670 	  score = last_nogap + open /* + extend */;
3671 	  c_gap = score + extend;
3672 	  /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
3673 
3674 	  /* NOGAP */
3675 	  last_nogap = (int) score_column[r];
3676 	  r++;
3677 	}
3678 
3679 	/* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
3680 	for ( ; r <= rhigh_calc; r++) {
3681 	  /* FGAP */
3682 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
3683 			r,c,c_gap + extend,last_nogap + open + extend));
3684 	  if (c_gap /* + extend */ > (score = last_nogap + open /* + extend */)) {  /* Use > for jump early */
3685 	    c_gap += extend;
3686 	    (*directions_Fgap)[c][r] = VERT;
3687 	  } else {
3688 	    c_gap = score + extend;
3689 	    /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
3690 	  }
3691 
3692 	  /* NOGAP */
3693 	  last_nogap = (int) score_column[r];
3694 	  debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
3695 	  if (c_gap > last_nogap) {  /* Use > for jump early */
3696 	    last_nogap = c_gap;
3697 	    score_column[r] = (c_gap < NEG_INFINITY_8) ? NEG_INFINITY_8 : (Score8_T) c_gap; /* Saturation */
3698 	    debug3(printf("Stored at score_column[%d]: %d\n",r,(Score8_T) score_column[r]));
3699 	    (*directions_nogap)[c][r] = VERT;
3700 	  }
3701 	}
3702 
3703 	FF[c] = c_gap;
3704 	debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
3705 #ifdef HAVE_AVX2
3706 	H_nogap_r = _mm256_load_si256((__m256i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
3707 #else
3708 	H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
3709 #endif
3710       }
3711     }
3712   }
3713 
3714 
3715 #ifdef CHECK1
3716   /* Row 0 and column 0 directions fail anyway due to saturation */
3717   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
3718   (*directions_Egap)[1][0] = HORIZ;
3719   (*directions_Fgap)[0][1] = VERT;
3720 #endif
3721 
3722 #ifdef DEBUG2
3723   printf("SIMD: Dynprog_simd_8\n");
3724   Matrix8_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
3725 		revp,lband,uband);
3726   Directions8_print(*directions_nogap,*directions_Egap,*directions_Fgap,
3727 			    rlength,glength,rsequence,gsequence,gsequence_alt,
3728 			    revp,lband,uband);
3729 #endif
3730 
3731 #ifdef CHECK1
3732   /* Check for row 0 directions */
3733   for (c = 1; c <= uband && c <= glength; c++) {
3734     assert((*directions_Egap)[c][0] != DIAG);
3735     assert((*directions_nogap)[c][0] != DIAG);
3736   }
3737   /* Check for column 0 directions */
3738   for (r = 1; r <= lband && r <= rlength; r++) {
3739     assert((*directions_Fgap)[0][r] != DIAG);
3740     assert((*directions_nogap)[0][r] != DIAG);
3741   }
3742 #endif
3743 
3744 #ifdef DEBUG_AVX2
3745   matrix_std = Dynprog_simd_8_nonavx2(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
3746 				      this,rsequence,gsequence,gsequence_alt,
3747 				      rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
3748 				      open,extend,lband,uband,jump_late_p,revp);
3749 #elif defined(DEBUG_SIMD)
3750   matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
3751 				this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
3752 				rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
3753 				open,extend,lband,uband,jump_late_p,revp,/*saturation*/NEG_INFINITY_8,
3754 				/*upperp*/true,/*lowerp*/true);
3755 #endif
3756 
3757 #ifdef DEBUG2
3758   printf("Banded %s\n",revp ? "rev" : "fwd");
3759   Matrix8_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,revp,lband,uband);
3760   Directions8_print(*directions_nogap,*directions_Egap,*directions_Fgap,
3761 		    rlength,glength,rsequence,gsequence,gsequence_alt,revp,lband,uband);
3762 #endif
3763 
3764 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
3765   banded_matrix8_compare(matrix,matrix_std,rlength,glength,lband,uband,
3766 			 rsequence,gsequence,gsequence_alt,
3767 			 goffset,chroffset,chrhigh,watsonp,revp);
3768 
3769   banded_directions8_compare_nogap(matrix,*directions_nogap,directions_nogap_std,rlength,glength,lband,uband);
3770   banded_directions8_compare_Egap(matrix,*directions_Egap,directions_Egap_std,rlength,glength,lband,uband);
3771   banded_directions8_compare_Fgap(matrix,*directions_Fgap,directions_Fgap_std,rlength,glength,lband,uband);
3772 #endif
3773 
3774   FREEA(FF);
3775   _mm_free(pairscores_col0);
3776   _mm_free(pairscores[4]);
3777   _mm_free(pairscores[3]);
3778   _mm_free(pairscores[2]);
3779   _mm_free(pairscores[1]);
3780   _mm_free(pairscores[0]);
3781 
3782   return matrix;
3783 }
3784 #endif
3785 
3786 
3787 #ifdef DEBUG_AVX2
3788 /* Designed for computation above the main diagonal, so no F loop or bottom masking needed */
3789 /* Operates by columns */
3790 Score8_T **
Dynprog_simd_8_upper_nonavx2(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int uband,bool jump_late_p,bool revp)3791 Dynprog_simd_8_upper_nonavx2 (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
3792 			      T this, char *rsequence, char *gsequence, char *gsequence_alt,
3793 			      int rlength, int glength,
3794 			      int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
3795 			      Mismatchtype_T mismatchtype, int open, int extend,
3796 			      int uband, bool jump_late_p, bool revp) {
3797   Score8_T **matrix, *score_column;
3798   __m128i pairscores_std, pairscores_alt;
3799 #ifdef HAVE_SSE4_1
3800   __m128i E_infinity;
3801 #else
3802   __m128i E_infinity_plus_128;
3803   __m128i pairscores_best, all_128;
3804 #endif
3805   __m128i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, T1;
3806   __m128i gap_open, gap_extend, complement_dummy;
3807   __m128i dir_horiz;
3808   int rlength_ceil, r, c;
3809   int rlo, rhigh;
3810   int na1, na2, na2_alt;
3811   Score8_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr;
3812   Pairdistance_T **pairdistance_array_type;
3813 
3814 
3815   debug2(printf("Dynprog_simd_8_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
3816   debug15(printf("Dynprog_simd_8_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
3817 
3818   rlength_ceil = (int) ((rlength + SIMD_NCHARS_NONAVX2)/SIMD_NCHARS_NONAVX2) * SIMD_NCHARS_NONAVX2;
3819 
3820 #ifdef HAVE_SSE4_1
3821   pairdistance_array_type = pairdistance_array[mismatchtype];
3822 #else
3823   /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
3824   pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
3825   all_128 = _mm_set1_epi8(128);
3826 #endif
3827 
3828   debug(printf("compute_scores_simd_8_bycols (upper): "));
3829   debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
3830   debug(printf("Query length rounded up to %d\n",rlength_ceil));
3831 
3832   matrix = aligned_score8_alloc(rlength_ceil,glength,
3833 				this->aligned_std.two.upper_matrix_ptrs,this->aligned_std.two.upper_matrix_space);
3834   *directions_nogap = aligned_directions8_alloc(rlength_ceil,glength,
3835 						this->aligned_std.two.upper_directions_ptrs_0,this->aligned_std.two.upper_directions_space_0);
3836   *directions_Egap = aligned_directions8_alloc(rlength_ceil,glength,
3837 					       this->aligned_std.two.upper_directions_ptrs_1,this->aligned_std.two.upper_directions_space_1);
3838 
3839 #if 0
3840   /* Row 0 initialization */
3841   /* penalty = open; */
3842   for (c = 1; c <= uband && c <= glength; c++) {
3843     /* penalty += extend; */
3844     (*directions_Egap)[c][0] = HORIZ;
3845     (*directions_nogap)[c][0] = HORIZ;
3846   }
3847 #endif
3848 #if 0
3849   /* Already initialized to DIAG.  Actually no longer initializing directions_Egap */
3850   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
3851   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
3852 #endif
3853 #if 0
3854   /* Column 0 initialization */
3855   /* penalty = open; */
3856   for (r = 1; r <= SIMD_NCHARS_NONAVX2 && r <= rlength; r++) {
3857     /* penalty += extend; */
3858     (*directions_nogap)[0][r] = VERT;
3859   }
3860 #endif
3861 
3862 
3863   /* Load pairscores.  Store match - mismatch */
3864   pairscores[0] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3865   pairscores[1] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3866   pairscores[2] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3867   pairscores[3] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3868   pairscores[4] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),16);
3869 
3870 #if 0
3871   /* Should not be necessary */
3872   memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score8_T));
3873   memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score8_T));
3874   memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score8_T));
3875   memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score8_T));
3876   memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score8_T));
3877 #endif
3878 
3879   /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
3880   r = 0; na1 = 'N';
3881   pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3882   pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3883   pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3884   pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3885   pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3886 
3887   if (revp == false) {
3888     for (r = 1; r <= rlength; r++) {
3889       na1 = (int) rsequence[r-1];
3890       pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3891       pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3892       pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3893       pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3894       pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3895     }
3896   } else {
3897     for (r = 1; r <= rlength; r++) {
3898       na1 = (int) rsequence[1-r];
3899       pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
3900       pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
3901       pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
3902       pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
3903       pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
3904     }
3905   }
3906 
3907 #if 0
3908   /* Should not be necessary */
3909   memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3910   memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3911   memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3912   memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3913   memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
3914 #endif
3915 
3916   complement_dummy = _mm_set1_epi8(-1);
3917 
3918   gap_open = _mm_set1_epi8((Score8_T) open);
3919   gap_extend = _mm_set1_epi8((Score8_T) extend);
3920 
3921 
3922 #ifdef HAVE_SSE4_1
3923   E_infinity = _mm_set1_epi8(POS_INFINITY_8);
3924 #else
3925   E_infinity_plus_128 = _mm_set1_epi8(POS_INFINITY_8+128);
3926 #endif
3927   if (jump_late_p) {
3928     for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS_NONAVX2) {
3929       if ((rhigh = rlo + SIMD_NCHARS_NONAVX2 - 1) > rlength) {
3930 	rhigh = rlength;
3931       }
3932 
3933       /* dir_horiz tests if E >= H .  To fill in first column of each
3934 	 row block with non-diags, could make E == H.  But irrelevant,
3935 	 because these are below the diagonal. */
3936       E_mask = _mm_set1_epi8(1);
3937 
3938       E_r_gap = _mm_set1_epi8(NEG_INFINITY_8);
3939       H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
3940 
3941       for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
3942 	score_column = matrix[c];
3943 
3944 	if (c == 0) {
3945 	  na2 = na2_alt = 4; /* 'N' */
3946 	} else {
3947 	  na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
3948 	  na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
3949 	}
3950 	pairscores_std_ptr = pairscores[na2];
3951 	pairscores_alt_ptr = pairscores[na2_alt];
3952 
3953 	if (c == 0) {
3954 	  X_prev_nogap = _mm_set1_epi8(0);
3955 	} else if (rlo == 0) {
3956 #ifdef ZERO_INITIAL_GAP_PENALTY
3957 	  X_prev_nogap = _mm_set1_epi8(0);
3958 #else
3959 	  X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
3960 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
3961 #endif
3962 	} else {
3963 	  /* second or greater block of 8 */
3964 	  X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
3965 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
3966 	}
3967 
3968 	debug15(print_vector_8(E_mask,rlo,c,"E_mask"));
3969 #ifdef HAVE_SSE4_1
3970 	E_r_gap = _mm_min_epi8(E_r_gap,_mm_add_epi8(E_mask,E_infinity));
3971 #else
3972 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
3973 #endif
3974 	debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
3975 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
3976 
3977 	/* EGAP */
3978 	T1 = _mm_adds_epi8(H_nogap_r, gap_open);
3979 	dir_horiz = _mm_cmplt_epi8(E_r_gap,T1); /* E < H */
3980 	dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy);	/* E >= H, for jump late */
3981 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
3982 	debug15(print_vector_8(T1,rlo,c,"T1"));
3983 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
3984 
3985 #ifdef HAVE_SSE4_1
3986 	E_r_gap = _mm_max_epi8(E_r_gap, T1); /* Compare H + open with vert */
3987 	E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3988 	E_r_gap = _mm_min_epi8(E_r_gap,_mm_add_epi8(E_mask,E_infinity));
3989 #elif 1
3990 	E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
3991 	E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3992 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
3993 #else
3994 	/* Try to avoid unnecessary shifts by 128, but overflows */
3995 	E_r_gap = _mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128));
3996 	E_r_gap = _mm_add_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
3997 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(E_r_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
3998 #endif
3999 	debug15(print_vector_8(E_r_gap,rlo,c,"E"));
4000 
4001 
4002 	/* NOGAP */
4003 	T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_NONAVX2);
4004 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
4005 	H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
4006 	X_prev_nogap = T1;
4007 
4008 	/* Add pairscores, allowing for alternate genomic nt */
4009 #ifdef HAVE_SSE4_1
4010 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
4011 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
4012 	debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4013 	H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
4014 #else
4015 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
4016 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
4017 	pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
4018 	debug15(print_vector_8(pairscores_best,rlo,c,"pairscores_std"));
4019 	H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
4020 #endif
4021 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
4022 	debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
4023 
4024 	dir_horiz = _mm_cmplt_epi8(E_r_gap,H_nogap_r); /* E < H */
4025 	dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy);	/* E >= H, for jump late */
4026 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4027 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
4028 
4029 #ifdef HAVE_SSE4_1
4030 	H_nogap_r = _mm_max_epi8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
4031 #else
4032 	/* Compare H + pairscores with horiz + extend */
4033 	H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
4034 #endif
4035 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
4036 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
4037 
4038 
4039 	/* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
4040 	if (rhigh >= c) {
4041 	  (*directions_Egap)[c][c] = DIAG;
4042 	  (*directions_nogap)[c][c] = DIAG;
4043 	}
4044 
4045 	/* No need for F loop here */
4046 	E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
4047       }
4048     }
4049 
4050   } else {
4051     /* jump early */
4052     for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS_NONAVX2) {
4053       if ((rhigh = rlo + SIMD_NCHARS_NONAVX2 - 1) > rlength) {
4054 	rhigh = rlength;
4055       }
4056 
4057       /* dir_horiz tests if E > H.  To fill in first column of each
4058 	 row block with non-diags, could make E > H.  But irrelevant,
4059 	 because these are below the diagonal. */
4060       E_mask = _mm_set1_epi8(1);
4061 
4062       E_r_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
4063       H_nogap_r = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
4064 
4065       for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
4066 	score_column = matrix[c];
4067 
4068 	if (c == 0) {
4069 	  na2 = na2_alt = 4; /* 'N' */;
4070 	} else {
4071 	  na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
4072 	  na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
4073 	}
4074 	pairscores_std_ptr = pairscores[na2];
4075 	pairscores_alt_ptr = pairscores[na2_alt];
4076 
4077 	if (c == 0) {
4078 	  X_prev_nogap = _mm_set1_epi8(0);
4079 	} else if (rlo == 0) {
4080 #ifdef ZERO_INITIAL_GAP_PENALTY
4081 	  X_prev_nogap = _mm_set1_epi8(0);
4082 #else
4083 	  X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
4084 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
4085 #endif
4086 	} else {
4087 	  /* second or greater block of 8 */
4088 	  X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
4089 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
4090 	}
4091 
4092 	debug15(print_vector_8(E_mask,rlo,c,"E_mask"));
4093 #ifdef HAVE_SSE4_1
4094 	E_r_gap = _mm_min_epi8(E_r_gap,_mm_add_epi8(E_mask,E_infinity));
4095 #else
4096 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4097 #endif
4098 	debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
4099 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
4100 
4101 	/* EGAP */
4102 	T1 = _mm_adds_epi8(H_nogap_r, gap_open);
4103 	dir_horiz = _mm_cmpgt_epi8(E_r_gap,T1); /* E > H, for jump early */
4104 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4105 	debug15(print_vector_8(T1,rlo,c,"T1"));
4106 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
4107 
4108 	/* Compare H + open with vert */
4109 #ifdef HAVE_SSE4_1
4110 	E_r_gap = _mm_max_epi8(E_r_gap, T1); /* Compare H + open with vert */
4111 	E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4112 	E_r_gap = _mm_min_epi8(E_r_gap,_mm_add_epi8(E_mask,E_infinity));
4113 #elif 1
4114 	E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
4115 	E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4116 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4117 #else
4118 	/* Try to avoid unnecessary shifts by 128, but overflows */
4119 	E_r_gap = _mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128));
4120 	E_r_gap = _mm_add_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4121 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(E_r_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4122 #endif
4123 	debug15(print_vector_8(E_r_gap,rlo,c,"E"));
4124 
4125 
4126 	/* NOGAP */
4127 	T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_NONAVX2);
4128 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
4129 	H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
4130 	X_prev_nogap = T1;
4131 
4132 	/* Add pairscores, allowing for alternate genomic nt */
4133 #ifdef HAVE_SSE4_1
4134 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
4135 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
4136 	debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4137 	H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
4138 #else
4139 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
4140 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
4141 	pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
4142 	debug15(print_vector_8(pairscores_best,rlo,c,"pairscores_std"));
4143 	H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
4144 #endif
4145 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
4146 	debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
4147 
4148 	dir_horiz = _mm_cmpgt_epi8(E_r_gap,H_nogap_r); /* E > H, for jump early */
4149 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4150 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
4151 
4152 
4153 #ifdef HAVE_SSE4_1
4154 	H_nogap_r = _mm_max_epi8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
4155 #else
4156 	/* Compare H + pairscores with horiz + extend */
4157 	H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
4158 #endif
4159 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
4160 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
4161 
4162 	/* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
4163 	if (rhigh >= c) {
4164 	  (*directions_Egap)[c][c] = DIAG;
4165 	  (*directions_nogap)[c][c] = DIAG;
4166 	}
4167 
4168 	/* No need for F loop here */
4169 	E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
4170       }
4171     }
4172   }
4173 
4174 #ifdef CHECK1
4175   /* Row 0 and column 0 directions fail anyway due to saturation */
4176   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
4177   (*directions_Egap)[1][0] = HORIZ;
4178 #endif
4179 
4180 #ifdef DEBUG2
4181   printf("SIMD: Dynprog_simd_8_upper\n");
4182   Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
4183 		   revp,uband,/*upperp*/true);
4184   Directions8_print_ud(*directions_nogap,*directions_Egap,
4185 		       rlength,glength,rsequence,gsequence,gsequence_alt,
4186 		       revp,uband,/*upperp*/true);
4187 #endif
4188 
4189 #ifdef CHECK1
4190   /* Check for row 0 directions */
4191   for (c = 1; c <= uband && c <= glength; c++) {
4192     assert((*directions_Egap)[c][0] != DIAG);
4193     assert((*directions_nogap)[c][0] != DIAG);
4194   }
4195 #endif
4196 
4197   _mm_free(pairscores[4]);
4198   _mm_free(pairscores[3]);
4199   _mm_free(pairscores[2]);
4200   _mm_free(pairscores[1]);
4201   _mm_free(pairscores[0]);
4202 
4203   return matrix;
4204 }
4205 #endif
4206 
4207 
/* E_mask works at the wraparound from POS_INFINITY to NEG_INFINITY.
   It is designed to prevent a horizontal/vertical jump into the empty
   triangle, by setting horizontal/vertical scores to be as small as
   possible, e.g., -128.  However, it is possible that H is also -128,
   so we still need to fix the directions along the main diagonal.

   E_mask shifted:    0    0    0    0    1    1    1    1
   add E_infinity:  127  127  127  127  127  127  127  127
   resulting mask:  127  127  127  127 -128 -128 -128 -128

   To deal with non-SSE4.1 systems, which lack _mm_min_epi8, we need
   to add 128 to both E and the mask, take _mm_min_epu8, and then
   subtract 128 again, as follows:

   E_mask shifted:    0    0    0    0    1    1    1    1
   add E_inf+128:   255  255  255  255  255  255  255  255
   resulting mask:  255  255  255  255    0    0    0    0
   (compare w/E+128)

*/
4228 
4229 
4230 #ifdef HAVE_SSE2
4231 /* Designed for computation above the main diagonal, so no F loop or bottom masking needed */
4232 /* Operates by columns */
4233 Score8_T **
Dynprog_simd_8_upper(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int uband,bool jump_late_p,bool revp)4234 Dynprog_simd_8_upper (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
4235 		      T this, char *rsequence, char *gsequence, char *gsequence_alt,
4236 		      int rlength, int glength,
4237 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
4238 		      int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
4239 #endif
4240 		      Mismatchtype_T mismatchtype, int open, int extend,
4241 		      int uband, bool jump_late_p, bool revp) {
4242   Score8_T **matrix, *score_column;
4243 #ifdef HAVE_AVX2
4244   __m256i E_infinity;
4245 #elif defined(HAVE_SSE4_1)
4246   __m128i E_infinity;
4247 #else
4248   __m128i E_infinity_plus_128;
4249   __m128i pairscores_best, all_128;
4250 #endif
4251 #ifdef HAVE_AVX2
4252   __m256i pairscores_std, pairscores_alt;
4253   __m256i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, T1;
4254   __m256i gap_open, gap_extend, complement_dummy;
4255   __m256i dir_horiz;
4256   Score8_T save;
4257 #else
4258   __m128i pairscores_std, pairscores_alt;
4259   __m128i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, T1;
4260   __m128i gap_open, gap_extend, complement_dummy;
4261   __m128i dir_horiz;
4262 #endif
4263   int rlength_ceil, r, c;
4264   int rlo, rhigh;
4265   int na1, na2, na2_alt;
4266   Score8_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr;
4267   Pairdistance_T **pairdistance_array_type;
4268 
4269 #ifdef DEBUG_AVX2
4270   Score8_T **matrix_std;
4271   Direction8_T **directions_nogap_std, **directions_Egap_std;
4272   char na2_single;
4273 #elif defined(DEBUG_SIMD)
4274   Score32_T **matrix_std;
4275   Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
4276   char na2_single;
4277 #endif
4278 
4279 
4280   debug2(printf("Dynprog_simd_8_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
4281   debug15(printf("Dynprog_simd_8_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
4282 
4283   rlength_ceil = (int) ((rlength + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
4284 
4285 #ifdef HAVE_SSE4_1
4286   pairdistance_array_type = pairdistance_array[mismatchtype];
4287 #else
4288   /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
4289   pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
4290   all_128 = _mm_set1_epi8(128);
4291 #endif
4292 
4293   debug(printf("compute_scores_simd_8_bycols (upper): "));
4294   debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
4295   debug(printf("Query length rounded up to %d\n",rlength_ceil));
4296 
4297   matrix = aligned_score8_alloc(rlength_ceil,glength,
4298 				this->aligned.two.upper_matrix_ptrs,this->aligned.two.upper_matrix_space);
4299   *directions_nogap = aligned_directions8_alloc(rlength_ceil,glength,
4300 						this->aligned.two.upper_directions_ptrs_0,this->aligned.two.upper_directions_space_0);
4301   *directions_Egap = aligned_directions8_alloc(rlength_ceil,glength,
4302 					       this->aligned.two.upper_directions_ptrs_1,this->aligned.two.upper_directions_space_1);
4303 
4304 #if 0
4305   /* Row 0 initialization */
4306   /* penalty = open; */
4307   for (c = 1; c <= uband && c <= glength; c++) {
4308     /* penalty += extend; */
4309     (*directions_Egap)[c][0] = HORIZ;
4310     (*directions_nogap)[c][0] = HORIZ;
4311   }
4312 #endif
4313 #if 0
4314   /* Already initialized to DIAG.  Actually no longer initializing directions_Egap */
4315   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
4316   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
4317 #endif
4318 #if 0
4319   /* Column 0 initialization */
4320   /* penalty = open; */
4321   for (r = 1; r <= SIMD_NCHARS && r <= rlength; r++) {
4322     /* penalty += extend; */
4323     (*directions_nogap)[0][r] = VERT;
4324   }
4325 #endif
4326 
4327 
4328   /* Load pairscores.  Store match - mismatch */
4329   pairscores[0] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4330   pairscores[1] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4331   pairscores[2] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4332   pairscores[3] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4333   pairscores[4] = (Score8_T *) _mm_malloc(rlength_ceil * sizeof(Score8_T),ALIGN_SIZE);
4334 
4335 #if 0
4336   /* Should not be necessary */
4337   memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score8_T));
4338   memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score8_T));
4339   memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score8_T));
4340   memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score8_T));
4341   memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score8_T));
4342 #endif
4343 
4344   /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
4345   r = 0; na1 = 'N';
4346   pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
4347   pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
4348   pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
4349   pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
4350   pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
4351 
4352   if (revp == false) {
4353     for (r = 1; r <= rlength; r++) {
4354       na1 = (int) rsequence[r-1];
4355       pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
4356       pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
4357       pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
4358       pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
4359       pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
4360     }
4361   } else {
4362     for (r = 1; r <= rlength; r++) {
4363       na1 = (int) rsequence[1-r];
4364       pairscores[0][r] = (Score8_T) pairdistance_array_type[na1][(int) 'A'];
4365       pairscores[1][r] = (Score8_T) pairdistance_array_type[na1][(int) 'C'];
4366       pairscores[2][r] = (Score8_T) pairdistance_array_type[na1][(int) 'G'];
4367       pairscores[3][r] = (Score8_T) pairdistance_array_type[na1][(int) 'T'];
4368       pairscores[4][r] = (Score8_T) pairdistance_array_type[na1][(int) 'N'];
4369     }
4370   }
4371 
4372 #if 0
4373   /* Should not be necessary */
4374   memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4375   memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4376   memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4377   memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4378   memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score8_T));
4379 #endif
4380 
4381   complement_dummy = _MM_SET1_EPI8(-1);
4382 
4383   gap_open = _MM_SET1_EPI8((Score8_T) open);
4384   gap_extend = _MM_SET1_EPI8((Score8_T) extend);
4385 
4386 #ifdef HAVE_SSE4_1
4387   E_infinity = _MM_SET1_EPI8(POS_INFINITY_8);
4388 #else
4389   E_infinity_plus_128 = _mm_set1_epi8(POS_INFINITY_8+128);
4390 #endif
4391   if (jump_late_p) {
4392     for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS) {
4393       if ((rhigh = rlo + SIMD_NCHARS - 1) > rlength) {
4394 	rhigh = rlength;
4395       }
4396 
4397       /* dir_horiz tests if E >= H .  To fill in first column of each
4398 	 row block with non-diags, could make E == H.  But irrelevant,
4399 	 because these are below the diagonal. */
4400       E_mask = _MM_SET1_EPI8(1);
4401 
4402       /* Holds for all INITIAL_GAP_PENALTY */
4403       E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8);
4404       H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
4405 
4406       for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
4407 	score_column = matrix[c];
4408 
4409 	if (c == 0) {
4410 	  na2 = na2_alt = 4; /* 'N' */
4411 	} else {
4412 	  na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
4413 	  na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
4414 	}
4415 	pairscores_std_ptr = pairscores[na2];
4416 	pairscores_alt_ptr = pairscores[na2_alt];
4417 
4418 	if (c == 0) {
4419 	  X_prev_nogap = _MM_SETZERO_SI();
4420 	} else if (rlo == 0) {
4421 #ifdef ZERO_INITIAL_GAP_PENALTY
4422 	  X_prev_nogap = _MM_SETZERO_SI();
4423 #elif defined(HAVE_AVX2)
4424 	  X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
4425 #else
4426 	  X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
4427 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
4428 #endif
4429 	} else {
4430 	  /* second or greater block of 8 */
4431 #ifdef ZERO_INITIAL_GAP_PENALTY
4432 	  X_prev_nogap = _MM_SETZERO_SI();
4433 #elif defined(HAVE_AVX2)
4434 	  X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_CHAR_INSERT);
4435 #else
4436 	  X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
4437 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
4438 #endif
4439 	}
4440 
4441 	debug15(print_vector_8(E_mask,rlo,c,"E_mask"));
4442 #ifdef HAVE_SSE4_1
4443 	E_r_gap = _MM_MIN_EPI8(E_r_gap,_MM_ADD_EPI8(E_mask,E_infinity));
4444 #else
4445 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4446 #endif
4447 	debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
4448 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
4449 
4450 	/* EGAP */
4451 	T1 = _MM_ADDS_EPI8(H_nogap_r, gap_open);
4452 	dir_horiz = _MM_CMPLT_EPI8(E_r_gap,T1); /* E < H */
4453 	dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy);	/* E >= H, for jump late */
4454 #ifdef HAVE_AVX2
4455 	_mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4456 #else
4457 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4458 #endif
4459 	debug15(print_vector_8(T1,rlo,c,"T1"));
4460 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
4461 
4462 #ifdef HAVE_SSE4_1
4463 	E_r_gap = _MM_MAX_EPI8(E_r_gap, T1); /* Compare H + open with vert */
4464 	E_r_gap = _MM_ADDS_EPI8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4465 	E_r_gap = _MM_MIN_EPI8(E_r_gap,_MM_ADD_EPI8(E_mask,E_infinity));
4466 #elif 1
4467 	E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
4468 	E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4469 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4470 #else
4471 	/* Try to avoid unnecessary shifts by 128, but overflows */
4472 	E_r_gap = _mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128));
4473 	E_r_gap = _mm_add_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4474 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(E_r_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4475 #endif
4476 	debug15(print_vector_8(E_r_gap,rlo,c,"E"));
4477 
4478 
4479 	/* NOGAP */
4480 #ifdef HAVE_AVX2
4481 	T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_r,SIMD_NCHARS-1),LAST_CHAR_INSERT);
4482 	X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_r,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
4483 	H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_CHAR);
4484 #else
4485 	T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_SHIFT);
4486 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
4487 #endif
4488 	H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
4489 	X_prev_nogap = T1;
4490 
4491 	/* Add pairscores, allowing for alternate genomic nt */
4492 #ifdef HAVE_AVX2
4493 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
4494 	pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
4495 	debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4496 	H_nogap_r = _MM_ADDS_EPI8(H_nogap_r, _MM_MAX_EPI8(pairscores_std,pairscores_alt));
4497 #elif defined(HAVE_SSE4_1)
4498 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
4499 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
4500 	debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4501 	H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
4502 #else
4503 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
4504 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
4505 	pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
4506 	debug15(print_vector_8(pairscores_best,rlo,c,"pairscores_std"));
4507 	H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
4508 #endif
4509 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
4510 	debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
4511 
4512 	dir_horiz = _MM_CMPLT_EPI8(E_r_gap,H_nogap_r); /* E < H */
4513 	dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy);	/* E >= H, for jump late */
4514 #ifdef HAVE_AVX2
4515 	_mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4516 #else
4517 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4518 #endif
4519 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
4520 
4521 #ifdef HAVE_SSE4_1
4522 	H_nogap_r = _MM_MAX_EPI8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
4523 #else
4524 	/* Compare H + pairscores with horiz + extend */
4525 	H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
4526 #endif
4527 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
4528 #ifdef HAVE_AVX2
4529 	_mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
4530 #else
4531 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
4532 #endif
4533 
4534 
4535 	/* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
4536 	if (rhigh >= c) {
4537 	  (*directions_Egap)[c][c] = DIAG;
4538 	  (*directions_nogap)[c][c] = DIAG;
4539 	}
4540 
4541 	/* No need for F loop here */
4542 #ifdef HAVE_AVX2
4543 	save = _mm256_extract_epi8(E_mask,15);
4544 	E_mask = _mm256_slli_si256(E_mask,ONE_CHAR);
4545 	E_mask = _mm256_insert_epi8(E_mask,save,16);
4546 #else
4547 	E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
4548 #endif
4549       }
4550     }
4551 
4552   } else {
4553     /* jump early */
4554     for (rlo = 0; rlo <= rlength; rlo += SIMD_NCHARS) {
4555       if ((rhigh = rlo + SIMD_NCHARS - 1) > rlength) {
4556 	rhigh = rlength;
4557       }
4558 
4559       /* dir_horiz tests if E > H.  To fill in first column of each
4560 	 row block with non-diags, could make E > H.  But irrelevant,
4561 	 because these are below the diagonal. */
4562       E_mask = _MM_SET1_EPI8(1);
4563 
4564       /* Holds for all INITIAL_GAP_PENALTY */
4565       E_r_gap = _MM_SET1_EPI8(NEG_INFINITY_8+1);
4566       H_nogap_r = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
4567 
4568       for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
4569 	score_column = matrix[c];
4570 
4571 	if (c == 0) {
4572 	  na2 = na2_alt = 4; /* 'N' */;
4573 	} else {
4574 	  na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
4575 	  na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
4576 	}
4577 	pairscores_std_ptr = pairscores[na2];
4578 	pairscores_alt_ptr = pairscores[na2_alt];
4579 
4580 	if (c == 0) {
4581 	  X_prev_nogap = _MM_SETZERO_SI();
4582 	} else if (rlo == 0) {
4583 #ifdef ZERO_INITIAL_GAP_PENALTY
4584 	  X_prev_nogap = _MM_SETZERO_SI();
4585 #elif defined(HAVE_AVX2)
4586 	  X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
4587 #else
4588 	  X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the rlo bounds */
4589 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
4590 #endif
4591 	} else {
4592 	  /* second or greater block of 8 */
4593 #ifdef ZERO_INITIAL_GAP_PENALTY
4594 	  X_prev_nogap = _MM_SETZERO_SI();
4595 #elif defined(HAVE_AVX2)
4596 	  X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_CHAR_INSERT);
4597 #else
4598 	  X_prev_nogap = _mm_set1_epi8(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
4599 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
4600 #endif
4601 	}
4602 
4603 	debug15(print_vector_8(E_mask,rlo,c,"E_mask"));
4604 #ifdef HAVE_SSE4_1
4605 	E_r_gap = _MM_MIN_EPI8(E_r_gap,_MM_ADD_EPI8(E_mask,E_infinity));
4606 #else
4607 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4608 #endif
4609 	debug15(print_vector_8(E_r_gap,rlo,c,"E_r_gap"));
4610 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r load"));
4611 
4612 	/* EGAP */
4613 	T1 = _MM_ADDS_EPI8(H_nogap_r, gap_open);
4614 	dir_horiz = _MM_CMPGT_EPI8(E_r_gap,T1); /* E > H, for jump early */
4615 #ifdef HAVE_AVX2
4616 	_mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4617 #else
4618 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
4619 #endif
4620 	debug15(print_vector_8(T1,rlo,c,"T1"));
4621 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_Egap"));
4622 
4623 	/* Compare H + open with vert */
4624 #ifdef HAVE_SSE4_1
4625 	E_r_gap = _MM_MAX_EPI8(E_r_gap, T1); /* Compare H + open with vert */
4626 	E_r_gap = _MM_ADDS_EPI8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4627 	E_r_gap = _MM_MIN_EPI8(E_r_gap,_MM_ADD_EPI8(E_mask,E_infinity));
4628 #elif 1
4629 	E_r_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
4630 	E_r_gap = _mm_adds_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4631 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_r_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4632 #else
4633 	/* Try to avoid unnecessary shifts by 128, but overflows */
4634 	E_r_gap = _mm_max_epu8(_mm_add_epi8(E_r_gap, all_128), _mm_add_epi8(T1, all_128));
4635 	E_r_gap = _mm_add_epi8(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
4636 	E_r_gap = _mm_sub_epi8(_mm_min_epu8(E_r_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
4637 #endif
4638 	debug15(print_vector_8(E_r_gap,rlo,c,"E"));
4639 
4640 
4641 	/* NOGAP */
4642 #ifdef HAVE_AVX2
4643 	T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_r,SIMD_NCHARS-1),LAST_CHAR_INSERT);
4644 	X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_r,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
4645 	H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_CHAR);
4646 #else
4647 	T1 = _mm_srli_si128(H_nogap_r,LAST_CHAR_SHIFT);
4648 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_CHAR);
4649 #endif
4650 	H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
4651 	X_prev_nogap = T1;
4652 
4653 	/* Add pairscores, allowing for alternate genomic nt */
4654 #ifdef HAVE_AVX2
4655 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
4656 	pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
4657 	debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4658 	H_nogap_r = _MM_ADDS_EPI8(H_nogap_r, _MM_MAX_EPI8(pairscores_std,pairscores_alt));
4659 #elif defined(HAVE_SSE4_1)
4660 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
4661 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
4662 	debug15(print_vector_8(pairscores_std,rlo,c,"pairscores_std"));
4663 	H_nogap_r = _mm_adds_epi8(H_nogap_r, _mm_max_epi8(pairscores_std,pairscores_alt));
4664 #else
4665 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo])); /* Has 128 added already */
4666 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo])); /* Has 128 added already */
4667 	pairscores_best = _mm_sub_epi8(_mm_max_epu8(pairscores_std, pairscores_alt), all_128);
4668 	debug15(print_vector_8(pairscores_best,rlo,c,"pairscores_std"));
4669 	H_nogap_r = _mm_adds_epi8(H_nogap_r, pairscores_best);
4670 #endif
4671 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
4672 	debug15(print_vector_8(H_nogap_r,rlo,c,"H"));
4673 
4674 	dir_horiz = _MM_CMPGT_EPI8(E_r_gap,H_nogap_r); /* E > H, for jump early */
4675 #ifdef HAVE_AVX2
4676 	_mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4677 #else
4678 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
4679 #endif
4680 	debug15(print_vector_8(dir_horiz,rlo,c,"dir_nogap"));
4681 
4682 
4683 #ifdef HAVE_SSE4_1
4684 	H_nogap_r = _MM_MAX_EPI8(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
4685 #else
4686 	/* Compare H + pairscores with horiz + extend */
4687 	H_nogap_r = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_r, all_128), _mm_add_epi8(E_r_gap, all_128)), all_128);
4688 #endif
4689 	debug15(print_vector_8(H_nogap_r,rlo,c,"H_nogap_r store"));
4690 #ifdef HAVE_AVX2
4691 	_mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
4692 #else
4693 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
4694 #endif
4695 
4696 	/* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
4697 	if (rhigh >= c) {
4698 	  (*directions_Egap)[c][c] = DIAG;
4699 	  (*directions_nogap)[c][c] = DIAG;
4700 	}
4701 
4702 	/* No need for F loop here */
4703 #ifdef HAVE_AVX2
4704 	save = _mm256_extract_epi8(E_mask,15);
4705 	E_mask = _mm256_slli_si256(E_mask,ONE_CHAR);
4706 	E_mask = _mm256_insert_epi8(E_mask,save,16);
4707 #else
4708 	E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
4709 #endif
4710       }
4711     }
4712   }
4713 
4714 
4715 #ifdef CHECK1
4716   /* Row 0 and column 0 directions fail anyway due to saturation */
4717   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
4718   (*directions_Egap)[1][0] = HORIZ;
4719 #endif
4720 
4721 #ifdef DEBUG2
4722   printf("SIMD: Dynprog_simd_8_upper\n");
4723   Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
4724 		   revp,uband,/*upperp*/true);
4725   Directions8_print_ud(*directions_nogap,*directions_Egap,
4726 		       rlength,glength,rsequence,gsequence,gsequence_alt,
4727 		       revp,uband,/*upperp*/true);
4728 #endif
4729 
4730 #ifdef CHECK1
4731   /* Check for row 0 directions */
4732   for (c = 1; c <= uband && c <= glength; c++) {
4733     assert((*directions_Egap)[c][0] != DIAG);
4734     assert((*directions_nogap)[c][0] != DIAG);
4735   }
4736 #endif
4737 
4738 #ifdef DEBUG_AVX2
4739   matrix_std = Dynprog_simd_8_upper_nonavx2(&directions_nogap_std,&directions_Egap_std,
4740 					    this,rsequence,gsequence,gsequence_alt,
4741 					    rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
4742 					    open,extend,uband,jump_late_p,revp);
4743 
4744 #elif defined(DEBUG_SIMD)
4745   matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
4746 				this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
4747 				rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
4748 				open,extend,/*lband*/0,uband,jump_late_p,revp,/*saturation*/NEG_INFINITY_8,
4749 				/*upperp*/true,/*lowerp*/false);
4750 #endif
4751 
4752 #ifdef DEBUG2
4753   printf("Banded %s\n",revp ? "rev" : "fwd");
4754   Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,revp,uband,/*upperp*/true);
4755   Directions8_print_ud(*directions_nogap,*directions_Egap,
4756 		       rlength,glength,rsequence,gsequence,gsequence_alt,revp,uband,/*upperp*/true);
4757 #endif
4758 
4759 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
4760   banded_matrix8_compare_upper(matrix,matrix_std,rlength,glength,uband,
4761 			       rsequence,gsequence,gsequence_alt,
4762 			       goffset,chroffset,chrhigh,watsonp,revp);
4763 
4764   banded_directions8_compare_nogap_upper(matrix,*directions_nogap,directions_nogap_std,rlength,glength,uband);
4765 
4766   banded_directions8_compare_Egap_upper(matrix,*directions_Egap,directions_Egap_std,rlength,glength,uband);
4767 #endif
4768 
4769   _mm_free(pairscores[4]);
4770   _mm_free(pairscores[3]);
4771   _mm_free(pairscores[2]);
4772   _mm_free(pairscores[1]);
4773   _mm_free(pairscores[0]);
4774 
4775   return matrix;
4776 }
4777 #endif
4778 
4779 
4780 #ifdef DEBUG_AVX2
4781 /* Designed for computation below the main diagonal, so no F loop or bottom masking needed */
4782 /* Operates by rows */
4783 Score8_T **
Dynprog_simd_8_lower_nonavx2(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,bool jump_late_p,bool revp)4784 Dynprog_simd_8_lower_nonavx2 (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
4785 			      T this, char *rsequence, char *gsequence, char *gsequence_alt,
4786 			      int rlength, int glength,
4787 			      int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
4788 			      Mismatchtype_T mismatchtype, int open, int extend,
4789 			      int lband, bool jump_late_p, bool revp) {
4790   Score8_T **matrix, *score_column;
4791   __m128i pairscores_std;
4792 #ifdef HAVE_SSE4_1
4793   __m128i E_infinity;
4794 #else
4795   __m128i pairscores_best, all_128, E_infinity_plus_128;
4796 #endif
4797   __m128i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, T1;
4798   __m128i gap_open, gap_extend, complement_dummy;
4799   __m128i dir_vert;
4800   int glength_ceil, r, c;
4801   int clo, chigh;
4802   int na1, na2, na2_alt;
4803   Score8_T *pairscores[5], *pairscores_ptr;
4804   Pairdistance_T **pairdistance_array_type, score1, score2;
4805 
4806 
4807   debug2(printf("Dynprog_simd_8_lower.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
4808   debug15(printf("Dynprog_simd_8_lower.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
4809 
4810   glength_ceil = (int) ((glength + SIMD_NCHARS_NONAVX2)/SIMD_NCHARS_NONAVX2) * SIMD_NCHARS_NONAVX2;
4811 
4812 #ifdef HAVE_SSE4_1
4813   pairdistance_array_type = pairdistance_array[mismatchtype];
4814 #else
4815   /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
4816   pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
4817   all_128 = _mm_set1_epi8(128);
4818 #endif
4819 
4820   debug(printf("compute_scores_simd_8_byrows (lower): "));
4821   debug(printf("Lengths are %d and %d, so band is %d on left\n",rlength,glength,lband));
4822   debug(printf("Genome length rounded up to %d\n",glength_ceil));
4823 
4824   matrix = aligned_score8_alloc(glength_ceil,rlength,
4825 				this->aligned_std.two.lower_matrix_ptrs,this->aligned_std.two.lower_matrix_space);
4826   *directions_nogap = aligned_directions8_alloc(glength_ceil,rlength,
4827 						this->aligned_std.two.lower_directions_ptrs_0,this->aligned_std.two.lower_directions_space_0);
4828   *directions_Egap = aligned_directions8_alloc(glength_ceil,rlength,
4829 					       this->aligned_std.two.lower_directions_ptrs_1,this->aligned_std.two.lower_directions_space_1);
4830 
4831 #if 0
4832   /* Column 0 initialization */
4833   /* penalty = open; */
4834   for (r = 1; r <= lband && r <= rlength; r++) {
4835     /* penalty += extend; */
4836     (*directions_Egap)[r][0] = VERT;
4837     (*directions_nogap)[r][0] = VERT;
4838   }
4839 #endif
4840 #if 0
4841   /* Already initialized to DIAG.  Actually no longer initializing directions_Egap */
4842   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
4843   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
4844 #endif
4845 #if 0
4846   /* Row 0 initialization */
4847   /* penalty = open; */
4848   for (c = 1; c <= SIMD_NCHARS_NONAVX2 && c <= glength; c++) {
4849     /* penalty += extend; */
4850     (*directions_nogap)[0][c] = HORIZ;
4851   }
4852 #endif
4853 
4854 
4855   /* Load pairscores.  Store match - mismatch */
4856   pairscores[0] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4857   pairscores[1] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4858   pairscores[2] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4859   pairscores[3] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4860   pairscores[4] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),16);
4861 
4862 #if 0
4863   /* Should not be necessary */
4864   memset((void *) pairscores[0],0,glength_ceil*sizeof(Score8_T));
4865   memset((void *) pairscores[1],0,glength_ceil*sizeof(Score8_T));
4866   memset((void *) pairscores[2],0,glength_ceil*sizeof(Score8_T));
4867   memset((void *) pairscores[3],0,glength_ceil*sizeof(Score8_T));
4868   memset((void *) pairscores[4],0,glength_ceil*sizeof(Score8_T));
4869 #endif
4870 
4871   /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
4872   c = 0; na2 = na2_alt = 4; /* 'N' */
4873 #ifdef HAVE_SSE4_1
4874   pairscores[0][c] = (Score8_T) pairdistance_array_type[(int) 'A'][na2];
4875   pairscores[1][c] = (Score8_T) pairdistance_array_type[(int) 'C'][na2];
4876   pairscores[2][c] = (Score8_T) pairdistance_array_type[(int) 'G'][na2];
4877   pairscores[3][c] = (Score8_T) pairdistance_array_type[(int) 'T'][na2];
4878   pairscores[4][c] = (Score8_T) pairdistance_array_type[(int) 'N'][na2];
4879 #else
4880   pairscores[0][c] = (Score8_T) pairdistance_array_type[(int) 'A'][na2] - 128;
4881   pairscores[1][c] = (Score8_T) pairdistance_array_type[(int) 'C'][na2] - 128;
4882   pairscores[2][c] = (Score8_T) pairdistance_array_type[(int) 'G'][na2] - 128;
4883   pairscores[3][c] = (Score8_T) pairdistance_array_type[(int) 'T'][na2] - 128;
4884   pairscores[4][c] = (Score8_T) pairdistance_array_type[(int) 'N'][na2] - 128;
4885 #endif
4886 
4887   if (revp == false) {
4888     for (c = 1; c <= glength; c++) {
4889       na2 = gsequence[c-1];
4890       na2_alt = gsequence_alt[c-1];
4891       /* Take max here */
4892       score1 = pairdistance_array_type[(int) 'A'][na2];
4893       score2 = pairdistance_array_type[(int) 'A'][na2_alt];
4894       pairscores[0][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4895 
4896       score1 = pairdistance_array_type[(int) 'C'][na2];
4897       score2 = pairdistance_array_type[(int) 'C'][na2_alt];
4898       pairscores[1][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4899 
4900       score1 = pairdistance_array_type[(int) 'G'][na2];
4901       score2 = pairdistance_array_type[(int) 'G'][na2_alt];
4902       pairscores[2][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4903 
4904       score1 = pairdistance_array_type[(int) 'T'][na2];
4905       score2 = pairdistance_array_type[(int) 'T'][na2_alt];
4906       pairscores[3][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4907 
4908       score1 = pairdistance_array_type[(int) 'N'][na2];
4909       score2 = pairdistance_array_type[(int) 'N'][na2_alt];
4910       pairscores[4][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4911     }
4912   } else {
4913     for (c = 1; c <= glength; c++) {
4914       na2 = gsequence[1-c];
4915       na2_alt = gsequence_alt[1-c];
4916       /* Take max here */
4917       score1 = pairdistance_array_type[(int) 'A'][na2];
4918       score2 = pairdistance_array_type[(int) 'A'][na2_alt];
4919       pairscores[0][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4920 
4921       score1 = pairdistance_array_type[(int) 'C'][na2];
4922       score2 = pairdistance_array_type[(int) 'C'][na2_alt];
4923       pairscores[1][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4924 
4925       score1 = pairdistance_array_type[(int) 'G'][na2];
4926       score2 = pairdistance_array_type[(int) 'G'][na2_alt];
4927       pairscores[2][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4928 
4929       score1 = pairdistance_array_type[(int) 'T'][na2];
4930       score2 = pairdistance_array_type[(int) 'T'][na2_alt];
4931       pairscores[3][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4932 
4933       score1 = pairdistance_array_type[(int) 'N'][na2];
4934       score2 = pairdistance_array_type[(int) 'N'][na2_alt];
4935       pairscores[4][c] = (Score8_T) (score1 > score2) ? score1 : score2;
4936     }
4937   }
4938 
4939 #if 0
4940   /* Should not be necessary */
4941   memset((void *) &(pairscores[0][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4942   memset((void *) &(pairscores[1][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4943   memset((void *) &(pairscores[2][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4944   memset((void *) &(pairscores[3][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4945   memset((void *) &(pairscores[4][c]),0,(glength_ceil-c)*sizeof(Score8_T));
4946 #endif
4947 
4948   complement_dummy = _mm_set1_epi8(-1);
4949 
4950   gap_open = _mm_set1_epi8((Score8_T) open);
4951   gap_extend = _mm_set1_epi8((Score8_T) extend);
4952 
4953 #ifdef HAVE_SSE4_1
4954   E_infinity = _mm_set1_epi8(POS_INFINITY_8);
4955 #else
4956   E_infinity_plus_128 = _mm_set1_epi8(POS_INFINITY_8+128);
4957 #endif
4958   if (jump_late_p) {
4959     for (clo = 0; clo <= glength; clo += SIMD_NCHARS_NONAVX2) {
4960       if ((chigh = clo + SIMD_NCHARS_NONAVX2 - 1) > glength) {
4961 	chigh = glength;
4962       }
4963 
4964       /* dir_vert tests if E >= H.  To fill in first row of each
4965 	 column block with non-diags, make E == H. */
4966       E_mask = _mm_set1_epi8(1);
4967 
4968       E_c_gap = _mm_set1_epi8(NEG_INFINITY_8);
4969       H_nogap_c = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
4970 
4971       for (r = clo; r <= chigh + lband && r <= rlength; r++) {
4972 	score_column = matrix[r];
4973 
4974 	if (r == 0) {
4975 	  na1 = 4; /* 'N' */
4976 	} else {
4977 	  na1 = revp ? nt_to_int_array[rsequence[1-r]] : nt_to_int_array[rsequence[r-1]];
4978 	}
4979 	pairscores_ptr = pairscores[na1];
4980 
4981 	if (r == 0) {
4982 	  X_prev_nogap = _mm_set1_epi8(0);
4983 	} else if (clo == 0) {
4984 #ifdef ZERO_INITIAL_GAP_PENALTY
4985 	  X_prev_nogap = _mm_set1_epi8(0);
4986 #else
4987 	  X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
4988 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
4989 #endif
4990 	} else {
4991 	  /* second or greater block of 8 */
4992 	  X_prev_nogap = _mm_set1_epi8(matrix[r-1][clo-1]); /* get H from previous block and previous column */
4993 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
4994 	}
4995 
4996 	debug15(print_vector_8(E_mask,clo,r,"E_mask"));
4997 #ifdef HAVE_SSE4_1
4998 	E_c_gap = _mm_min_epi8(E_c_gap,_mm_add_epi8(E_mask,E_infinity));
4999 #else
5000 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5001 #endif
5002 	debug15(print_vector_8(E_c_gap,clo,r,"E_c_gap"));
5003 	debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c load"));
5004 
5005 	/* EGAP */
5006 	T1 = _mm_adds_epi8(H_nogap_c, gap_open);
5007 	dir_vert = _mm_cmplt_epi8(E_c_gap,T1); /* E < H */
5008 	dir_vert = _mm_andnot_si128(dir_vert,complement_dummy);	/* E >= H, for jump late */
5009 	_mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
5010 	debug15(print_vector_8(T1,clo,r,"T1"));
5011 	debug15(print_vector_8(dir_vert,clo,r,"dir_Egap"));
5012 
5013 #ifdef HAVE_SSE4_1
5014 	E_c_gap = _mm_max_epi8(E_c_gap, T1); /* Compare H + open with horiz */
5015 	E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (horiz + open) */
5016 	E_c_gap = _mm_min_epi8(E_c_gap,_mm_add_epi8(E_mask,E_infinity));
5017 #elif 1
5018 	E_c_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
5019 	E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (horiz + open) */
5020 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5021 #else
5022 	/* Try to avoid unnecessary shifts by 128, but overflows */
5023 	E_c_gap = _mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128));
5024 	E_c_gap = _mm_add_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (horiz + open) */
5025 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(E_c_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5026 #endif
5027 	debug15(print_vector_8(E_c_gap,clo,r,"E"));
5028 
5029 
5030 	/* NOGAP */
5031 	T1 = _mm_srli_si128(H_nogap_c,LAST_CHAR_NONAVX2);
5032 	H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_CHAR);
5033 	H_nogap_c = _mm_or_si128(H_nogap_c, X_prev_nogap);
5034 	X_prev_nogap = T1;
5035 
5036 	/* Add pairscores.  No alternate chars for query sequence */
5037 #ifdef HAVE_SSE4_1
5038 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
5039 	debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5040 	H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_std);
5041 #else
5042 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo])); /* Has 128 added already */
5043 	pairscores_best = _mm_sub_epi8(pairscores_std, all_128);
5044 	debug15(print_vector_8(pairscores_best,clo,r,"pairscores_std"));
5045 	H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_best);
5046 #endif
5047 	_mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
5048 	debug15(print_vector_8(H_nogap_c,clo,r,"H"));
5049 
5050 	dir_vert = _mm_cmplt_epi8(E_c_gap,H_nogap_c); /* E < H */
5051 	dir_vert = _mm_andnot_si128(dir_vert,complement_dummy);	/* E >= H, for jump late */
5052 	_mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
5053 	debug15(print_vector_8(dir_vert,clo,r,"dir_nogap"));
5054 
5055 
5056 #ifdef HAVE_SSE4_1
5057 	H_nogap_c = _mm_max_epi8(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
5058 #else
5059 	/* Compare H + pairscores with horiz + extend */
5060 	H_nogap_c = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_c, all_128), _mm_add_epi8(E_c_gap, all_128)), all_128);
5061 #endif
5062 	debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c store"));
5063 	_mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
5064 
5065 
5066 	/* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
5067 	if (chigh >= r) {
5068 	  (*directions_Egap)[r][r] = DIAG;
5069 	  (*directions_nogap)[r][r] = DIAG;
5070 	}
5071 
5072 	/* No need for F loop here */
5073 	E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
5074       }
5075     }
5076 
5077   } else {
5078     /* jump early */
5079     for (clo = 0; clo <= glength; clo += SIMD_NCHARS_NONAVX2) {
5080       if ((chigh = clo + SIMD_NCHARS_NONAVX2 - 1) > glength) {
5081 	chigh = glength;
5082       }
5083 
5084       /* dir_vert tests if E > H.  To fill in first row of each
5085 	 column block with non-diags, make E > H. */
5086       E_mask = _mm_set1_epi8(1);
5087 
5088       E_c_gap = _mm_set1_epi8(NEG_INFINITY_8+1);
5089       H_nogap_c = _mm_set1_epi8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
5090 
5091       for (r = clo; r <= chigh + lband && r <= rlength; r++) {
5092 	score_column = matrix[r];
5093 
5094 	if (r == 0) {
5095 	  na1 = 4; /* 'N' */
5096 	} else {
5097 	  na1 = revp ? nt_to_int_array[rsequence[1-r]] : nt_to_int_array[rsequence[r-1]];
5098 	}
5099 	pairscores_ptr = pairscores[na1];
5100 
5101 	if (r == 0) {
5102 	  X_prev_nogap = _mm_set1_epi8(0);
5103 	} else if (clo == 0) {
5104 #ifdef ZERO_INITIAL_GAP_PENALTY
5105 	  X_prev_nogap = _mm_set1_epi8(0);
5106 #else
5107 	  X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
5108 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
5109 #endif
5110 	} else {
5111 	  /* second or greater block of 8 */
5112 	  X_prev_nogap = _mm_set1_epi8(matrix[r-1][clo-1]); /* get H from previous block and previous column */
5113 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_NONAVX2);
5114 	}
5115 
5116 	debug15(print_vector_8(E_mask,clo,r,"E_mask"));
5117 #ifdef HAVE_SSE4_1
5118 	E_c_gap = _mm_min_epi8(E_c_gap,_mm_add_epi8(E_mask,E_infinity));
5119 #else
5120 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5121 #endif
5122 	debug15(print_vector_8(E_c_gap,clo,r,"E_c_gap"));
5123 	debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c load"));
5124 
5125 	/* EGAP */
5126 	T1 = _mm_adds_epi8(H_nogap_c, gap_open);
5127 	dir_vert = _mm_cmpgt_epi8(E_c_gap,T1); /* E > H, for jump early */
5128 	_mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
5129 	debug15(print_vector_8(T1,clo,r,"T1"));
5130 	debug15(print_vector_8(dir_vert,clo,r,"dir_Egap"));
5131 
5132 	/* Compare H + open with vert */
5133 #ifdef HAVE_SSE4_1
5134 	E_c_gap = _mm_max_epi8(E_c_gap, T1); /* Compare H + open with vert */
5135 	E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
5136 	E_c_gap = _mm_min_epi8(E_c_gap,_mm_add_epi8(E_mask,E_infinity));
5137 #elif 1
5138 	E_c_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
5139 	E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
5140 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5141 #else
5142 	/* Try to avoid unnecessary shifts by 128, but overflows */
5143 	E_c_gap = _mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128));
5144 	E_c_gap = _mm_add_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
5145 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(E_c_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5146 #endif
5147 	debug15(print_vector_8(E_c_gap,clo,r,"E"));
5148 
5149 
5150 	/* NOGAP */
5151 	T1 = _mm_srli_si128(H_nogap_c,LAST_CHAR_NONAVX2);
5152 	H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_CHAR);
5153 	H_nogap_c = _mm_or_si128(H_nogap_c, X_prev_nogap);
5154 	X_prev_nogap = T1;
5155 
5156 	/* Add pairscores.  No alternate chars for query sequence */
5157 #ifdef HAVE_SSE4_1
5158 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
5159 	debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5160 	H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_std);
5161 #else
5162 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo])); /* Has 128 added already */
5163 	pairscores_best = _mm_sub_epi8(pairscores_std, all_128);
5164 	debug15(print_vector_8(pairscores_best,clo,r,"pairscores_std"));
5165 	H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_best);
5166 #endif
5167 	_mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
5168 	debug15(print_vector_8(H_nogap_c,clo,r,"H"));
5169 
5170 	dir_vert = _mm_cmpgt_epi8(E_c_gap,H_nogap_c); /* E > H, for jump early */
5171 	_mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
5172 	debug15(print_vector_8(dir_vert,clo,r,"dir_nogap"));
5173 
5174 
5175 #ifdef HAVE_SSE4_1
5176 	H_nogap_c = _mm_max_epi8(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
5177 #else
5178 	/* Compare H + pairscores with horiz + extend */
5179 	H_nogap_c = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_c, all_128), _mm_add_epi8(E_c_gap, all_128)), all_128);
5180 #endif
5181 	debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c store"));
5182 	_mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
5183 
5184 	/* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
5185 	if (chigh >= r) {
5186 	  (*directions_Egap)[r][r] = DIAG;
5187 	  (*directions_nogap)[r][r] = DIAG;
5188 	}
5189 
5190 	/* No need for F loop here */
5191 	E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
5192       }
5193     }
5194   }
5195 
5196 #ifdef CHECK1
5197   /* Row 0 and column 0 directions fail anyway due to saturation */
5198   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
5199   (*directions_Egap)[1][0] = VERT;
5200 #endif
5201 
5202 #ifdef DEBUG2
5203   printf("SIMD: Dynprog_simd_8_lower\n");
5204   Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
5205 		   revp,lband,/*upperp*/false);
5206   Directions8_print_ud(*directions_nogap,*directions_Egap,
5207 		       rlength,glength,rsequence,gsequence,gsequence_alt,
5208 		       revp,lband,/*upperp*/false);
5209 #endif
5210 
5211 #ifdef CHECK1
5212   /* Check for column 0 directions */
5213   for (r = 1; r <= lband && r <= rlength; r++) {
5214     assert((*directions_Egap)[r][0] != DIAG);
5215     assert((*directions_nogap)[r][0] != DIAG);
5216   }
5217 #endif
5218 
5219   _mm_free(pairscores[4]);
5220   _mm_free(pairscores[3]);
5221   _mm_free(pairscores[2]);
5222   _mm_free(pairscores[1]);
5223   _mm_free(pairscores[0]);
5224 
5225   return matrix;
5226 }
5227 #endif
5228 
5229 
5230 #ifdef HAVE_SSE2
5231 /* Designed for computation below the main diagonal, so no F loop or bottom masking needed */
5232 /* Operates by rows */
5233 Score8_T **
Dynprog_simd_8_lower(Direction8_T *** directions_nogap,Direction8_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,bool jump_late_p,bool revp)5234 Dynprog_simd_8_lower (Direction8_T ***directions_nogap, Direction8_T ***directions_Egap,
5235 		      T this, char *rsequence, char *gsequence, char *gsequence_alt,
5236 		      int rlength, int glength,
5237 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
5238 		      int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
5239 #endif
5240 		      Mismatchtype_T mismatchtype, int open, int extend,
5241 		      int lband, bool jump_late_p, bool revp) {
5242   Score8_T **matrix, *score_column;
5243 #ifdef HAVE_AVX2
5244   __m256i E_infinity;
5245 #elif defined(HAVE_SSE4_1)
5246   __m128i E_infinity;
5247 #else
5248   __m128i pairscores_best, all_128, E_infinity_plus_128;
5249 #endif
5250 #ifdef HAVE_AVX2
5251   __m256i pairscores_std;
5252   __m256i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, T1;
5253   __m256i gap_open, gap_extend, complement_dummy;
5254   __m256i dir_vert;
5255   Score8_T save;
5256 #else
5257   __m128i pairscores_std;
5258   __m128i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, T1;
5259   __m128i gap_open, gap_extend, complement_dummy;
5260   __m128i dir_vert;
5261 #endif
5262   int glength_ceil, r, c;
5263   int clo, chigh;
5264   int na1, na2, na2_alt;
5265   Score8_T *pairscores[5], *pairscores_ptr;
5266   Pairdistance_T **pairdistance_array_type, score1, score2;
5267 
5268 #ifdef DEBUG_AVX2
5269   Score8_T **matrix_std;
5270   Direction8_T **directions_nogap_std, **directions_Egap_std;
5271   char na2_single;
5272 #elif defined(DEBUG_SIMD)
5273   Score32_T **matrix_std;
5274   Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
5275   char na2_single;
5276 #endif
5277 
5278 
5279   debug2(printf("Dynprog_simd_8_lower.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
5280   debug15(printf("Dynprog_simd_8_lower.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
5281 
5282   glength_ceil = (int) ((glength + SIMD_NCHARS)/SIMD_NCHARS) * SIMD_NCHARS;
5283 
5284 #ifdef HAVE_SSE4_1
5285   pairdistance_array_type = pairdistance_array[mismatchtype];
5286 #else
5287   /* Needed to use _mm_max_epu8 and _mm_min_epu8, instead of signed versions */
5288   pairdistance_array_type = pairdistance_array_plus_128[mismatchtype];
5289   all_128 = _mm_set1_epi8(128);
5290 #endif
5291 
5292   debug(printf("compute_scores_simd_8_byrows (lower): "));
5293   debug(printf("Lengths are %d and %d, so band is %d on left\n",rlength,glength,lband));
5294   debug(printf("Genome length rounded up to %d\n",glength_ceil));
5295 
5296   matrix = aligned_score8_alloc(glength_ceil,rlength,
5297 				this->aligned.two.lower_matrix_ptrs,this->aligned.two.lower_matrix_space);
5298   *directions_nogap = aligned_directions8_alloc(glength_ceil,rlength,
5299 						this->aligned.two.lower_directions_ptrs_0,this->aligned.two.lower_directions_space_0);
5300   *directions_Egap = aligned_directions8_alloc(glength_ceil,rlength,
5301 					       this->aligned.two.lower_directions_ptrs_1,this->aligned.two.lower_directions_space_1);
5302 
5303 #if 0
5304   /* Column 0 initialization */
5305   /* penalty = open; */
5306   for (r = 1; r <= lband && r <= rlength; r++) {
5307     /* penalty += extend; */
5308     (*directions_Egap)[r][0] = VERT;
5309     (*directions_nogap)[r][0] = VERT;
5310   }
5311 #endif
5312 #if 0
5313   /* Already initialized to DIAG.  Actually no longer initializing directions_Egap */
5314   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
5315   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
5316 #endif
5317 #if 0
5318   /* Row 0 initialization */
5319   /* penalty = open; */
5320   for (c = 1; c <= SIMD_NCHARS && c <= glength; c++) {
5321     /* penalty += extend; */
5322     (*directions_nogap)[0][c] = HORIZ;
5323   }
5324 #endif
5325 
5326 
5327   /* Load pairscores.  Store match - mismatch */
5328   pairscores[0] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5329   pairscores[1] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5330   pairscores[2] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5331   pairscores[3] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5332   pairscores[4] = (Score8_T *) _mm_malloc(glength_ceil * sizeof(Score8_T),ALIGN_SIZE);
5333 
5334 #if 0
5335   /* Should not be necessary */
5336   memset((void *) pairscores[0],0,glength_ceil*sizeof(Score8_T));
5337   memset((void *) pairscores[1],0,glength_ceil*sizeof(Score8_T));
5338   memset((void *) pairscores[2],0,glength_ceil*sizeof(Score8_T));
5339   memset((void *) pairscores[3],0,glength_ceil*sizeof(Score8_T));
5340   memset((void *) pairscores[4],0,glength_ceil*sizeof(Score8_T));
5341 #endif
5342 
5343   /* For non-SSE4.1, addition of 128 taken care of by using pairdistance_array_plus_128 above */
5344   c = 0; na2 = na2_alt = 4; /* 'N' */
5345 #ifdef HAVE_SSE4_1
5346   pairscores[0][c] = (Score8_T) pairdistance_array_type[(int) 'A'][na2];
5347   pairscores[1][c] = (Score8_T) pairdistance_array_type[(int) 'C'][na2];
5348   pairscores[2][c] = (Score8_T) pairdistance_array_type[(int) 'G'][na2];
5349   pairscores[3][c] = (Score8_T) pairdistance_array_type[(int) 'T'][na2];
5350   pairscores[4][c] = (Score8_T) pairdistance_array_type[(int) 'N'][na2];
5351 #else
5352   pairscores[0][c] = (Score8_T) pairdistance_array_type[(int) 'A'][na2] - 128;
5353   pairscores[1][c] = (Score8_T) pairdistance_array_type[(int) 'C'][na2] - 128;
5354   pairscores[2][c] = (Score8_T) pairdistance_array_type[(int) 'G'][na2] - 128;
5355   pairscores[3][c] = (Score8_T) pairdistance_array_type[(int) 'T'][na2] - 128;
5356   pairscores[4][c] = (Score8_T) pairdistance_array_type[(int) 'N'][na2] - 128;
5357 #endif
5358 
5359   if (revp == false) {
5360     for (c = 1; c <= glength; c++) {
5361       na2 = gsequence[c-1];
5362       na2_alt = gsequence_alt[c-1];
5363       /* Take max here */
5364       score1 = pairdistance_array_type[(int) 'A'][na2];
5365       score2 = pairdistance_array_type[(int) 'A'][na2_alt];
5366       pairscores[0][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5367 
5368       score1 = pairdistance_array_type[(int) 'C'][na2];
5369       score2 = pairdistance_array_type[(int) 'C'][na2_alt];
5370       pairscores[1][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5371 
5372       score1 = pairdistance_array_type[(int) 'G'][na2];
5373       score2 = pairdistance_array_type[(int) 'G'][na2_alt];
5374       pairscores[2][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5375 
5376       score1 = pairdistance_array_type[(int) 'T'][na2];
5377       score2 = pairdistance_array_type[(int) 'T'][na2_alt];
5378       pairscores[3][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5379 
5380       score1 = pairdistance_array_type[(int) 'N'][na2];
5381       score2 = pairdistance_array_type[(int) 'N'][na2_alt];
5382       pairscores[4][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5383     }
5384   } else {
5385     for (c = 1; c <= glength; c++) {
5386       na2 = gsequence[1-c];
5387       na2_alt = gsequence_alt[1-c];
5388       /* Take max here */
5389       score1 = pairdistance_array_type[(int) 'A'][na2];
5390       score2 = pairdistance_array_type[(int) 'A'][na2_alt];
5391       pairscores[0][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5392 
5393       score1 = pairdistance_array_type[(int) 'C'][na2];
5394       score2 = pairdistance_array_type[(int) 'C'][na2_alt];
5395       pairscores[1][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5396 
5397       score1 = pairdistance_array_type[(int) 'G'][na2];
5398       score2 = pairdistance_array_type[(int) 'G'][na2_alt];
5399       pairscores[2][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5400 
5401       score1 = pairdistance_array_type[(int) 'T'][na2];
5402       score2 = pairdistance_array_type[(int) 'T'][na2_alt];
5403       pairscores[3][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5404 
5405       score1 = pairdistance_array_type[(int) 'N'][na2];
5406       score2 = pairdistance_array_type[(int) 'N'][na2_alt];
5407       pairscores[4][c] = (Score8_T) (score1 > score2) ? score1 : score2;
5408     }
5409   }
5410 
5411 #if 0
5412   /* Should not be necessary */
5413   memset((void *) &(pairscores[0][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5414   memset((void *) &(pairscores[1][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5415   memset((void *) &(pairscores[2][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5416   memset((void *) &(pairscores[3][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5417   memset((void *) &(pairscores[4][c]),0,(glength_ceil-c)*sizeof(Score8_T));
5418 #endif
5419 
5420   complement_dummy = _MM_SET1_EPI8(-1);
5421 
5422   gap_open = _MM_SET1_EPI8((Score8_T) open);
5423   gap_extend = _MM_SET1_EPI8((Score8_T) extend);
5424 
5425 #ifdef HAVE_SSE4_1
5426   E_infinity = _MM_SET1_EPI8(POS_INFINITY_8);
5427 #else
5428   E_infinity_plus_128 = _mm_set1_epi8(POS_INFINITY_8+128);
5429 #endif
5430   if (jump_late_p) {
5431     for (clo = 0; clo <= glength; clo += SIMD_NCHARS) {
5432       if ((chigh = clo + SIMD_NCHARS - 1) > glength) {
5433 	chigh = glength;
5434       }
5435 
5436       /* dir_vert tests if E >= H.  To fill in first row of each
5437 	 column block with non-diags, make E == H. */
5438       E_mask = _MM_SET1_EPI8(1);
5439 
5440       /* Holds for all INITIAL_GAP_PENALTY */
5441       E_c_gap = _MM_SET1_EPI8(NEG_INFINITY_8);
5442       H_nogap_c = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
5443 
5444       for (r = clo; r <= chigh + lband && r <= rlength; r++) {
5445 	score_column = matrix[r];
5446 
5447 	if (r == 0) {
5448 	  na1 = 4; /* 'N' */
5449 	} else {
5450 	  na1 = revp ? nt_to_int_array[(int) rsequence[1-r]] : nt_to_int_array[(int) rsequence[r-1]];
5451 	}
5452 	pairscores_ptr = pairscores[na1];
5453 
5454 	if (r == 0) {
5455 	  X_prev_nogap = _MM_SETZERO_SI();
5456 	} else if (clo == 0) {
5457 #ifdef ZERO_INITIAL_GAP_PENALTY
5458 	  X_prev_nogap = _MM_SETZERO_SI();
5459 #elif defined(HAVE_AVX2)
5460 	  X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
5461 #else
5462 	  X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
5463 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
5464 #endif
5465 	} else {
5466 	  /* second or greater block of 8 */
5467 #ifdef ZERO_INITIAL_GAP_PENALTY
5468 	  X_prev_nogap = _MM_SETZERO_SI();
5469 #elif defined(HAVE_AVX2)
5470 	  X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[r-1][clo-1],LAST_CHAR_INSERT);
5471 #else
5472 	  X_prev_nogap = _mm_set1_epi8(matrix[r-1][clo-1]); /* get H from previous block and previous column */
5473 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
5474 #endif
5475 	}
5476 
5477 	debug15(print_vector_8(E_mask,clo,r,"E_mask"));
5478 #ifdef HAVE_SSE4_1
5479 	E_c_gap = _MM_MIN_EPI8(E_c_gap,_MM_ADD_EPI8(E_mask,E_infinity));
5480 #else
5481 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5482 #endif
5483 	debug15(print_vector_8(E_c_gap,clo,r,"E_c_gap"));
5484 	debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c load"));
5485 
5486 	/* EGAP */
5487 	T1 = _MM_ADDS_EPI8(H_nogap_c, gap_open);
5488 	dir_vert = _MM_CMPLT_EPI8(E_c_gap,T1); /* E < H */
5489 	dir_vert = _MM_ANDNOT_SI(dir_vert,complement_dummy);	/* E >= H, for jump late */
5490 #ifdef HAVE_AVX2
5491 	_mm256_store_si256((__m256i *) &((*directions_Egap)[r][clo]),dir_vert);
5492 #else
5493 	_mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
5494 #endif
5495 	debug15(print_vector_8(T1,clo,r,"T1"));
5496 	debug15(print_vector_8(dir_vert,clo,r,"dir_Egap"));
5497 
5498 #ifdef HAVE_SSE4_1
5499 	E_c_gap = _MM_MAX_EPI8(E_c_gap, T1); /* Compare H + open with horiz */
5500 	E_c_gap = _MM_ADDS_EPI8(E_c_gap, gap_extend); /* Compute scores for Egap (horiz + open) */
5501 	E_c_gap = _MM_MIN_EPI8(E_c_gap,_MM_ADD_EPI8(E_mask,E_infinity));
5502 #elif 1
5503 	E_c_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
5504 	E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (horiz + open) */
5505 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5506 #else
5507 	/* Try to avoid unnecessary shifts by 128, but overflows */
5508 	E_c_gap = _mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128));
5509 	E_c_gap = _mm_add_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (horiz + open) */
5510 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(E_c_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5511 #endif
5512 	debug15(print_vector_8(E_c_gap,clo,r,"E"));
5513 
5514 
5515 	/* NOGAP */
5516 #ifdef HAVE_AVX2
5517 	T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_c,SIMD_NCHARS-1),LAST_CHAR_INSERT);
5518 	X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_c,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
5519 	H_nogap_c = _mm256_slli_si256(H_nogap_c,ONE_CHAR);
5520 #else
5521 	T1 = _mm_srli_si128(H_nogap_c,LAST_CHAR_SHIFT);
5522 	H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_CHAR);
5523 #endif
5524 	H_nogap_c = _MM_OR_SI(H_nogap_c, X_prev_nogap);
5525 	X_prev_nogap = T1;
5526 
5527 	/* Add pairscores.  No alternate chars for query sequence */
5528 #ifdef HAVE_AVX2
5529 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_ptr[clo]));
5530 	debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5531 	H_nogap_c = _MM_ADDS_EPI8(H_nogap_c, pairscores_std);
5532 #elif defined(HAVE_SSE4_1)
5533 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
5534 	debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5535 	H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_std);
5536 #else
5537 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo])); /* Has 128 added already */
5538 	pairscores_best = _mm_sub_epi8(pairscores_std, all_128);
5539 	debug15(print_vector_8(pairscores_best,clo,r,"pairscores_std"));
5540 	H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_best);
5541 #endif
5542 	_mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
5543 	debug15(print_vector_8(H_nogap_c,clo,r,"H"));
5544 
5545 	dir_vert = _MM_CMPLT_EPI8(E_c_gap,H_nogap_c); /* E < H */
5546 	dir_vert = _MM_ANDNOT_SI(dir_vert,complement_dummy);	/* E >= H, for jump late */
5547 #ifdef HAVE_AVX2
5548 	_mm256_store_si256((__m256i *) &((*directions_nogap)[r][clo]),dir_vert);
5549 #else
5550 	_mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
5551 #endif
5552 	debug15(print_vector_8(dir_vert,clo,r,"dir_nogap"));
5553 
5554 
5555 #ifdef HAVE_SSE4_1
5556 	H_nogap_c = _MM_MAX_EPI8(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
5557 #else
5558 	/* Compare H + pairscores with horiz + extend */
5559 	H_nogap_c = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_c, all_128), _mm_add_epi8(E_c_gap, all_128)), all_128);
5560 #endif
5561 	debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c store"));
5562 #ifdef HAVE_AVX2
5563 	_mm256_store_si256((__m256i *) &(score_column[clo]), H_nogap_c);
5564 #else
5565 	_mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
5566 #endif
5567 
5568 
5569 	/* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
5570 	if (chigh >= r) {
5571 	  (*directions_Egap)[r][r] = DIAG;
5572 	  (*directions_nogap)[r][r] = DIAG;
5573 	}
5574 
5575 	/* No need for F loop here */
5576 #ifdef HAVE_AVX2
5577 	save = _mm256_extract_epi8(E_mask,15);
5578 	E_mask = _mm256_slli_si256(E_mask,ONE_CHAR);
5579 	E_mask = _mm256_insert_epi8(E_mask,save,16);
5580 #else
5581 	E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
5582 #endif
5583       }
5584     }
5585 
5586   } else {
5587     /* jump early */
5588     for (clo = 0; clo <= glength; clo += SIMD_NCHARS) {
5589       if ((chigh = clo + SIMD_NCHARS - 1) > glength) {
5590 	chigh = glength;
5591       }
5592 
5593       /* dir_vert tests if E > H.  To fill in first row of each
5594 	 column block with non-diags, make E > H. */
5595       E_mask = _MM_SET1_EPI8(1);
5596 
5597       /* Holds for all INITIAL_GAP_PENALTY */
5598       E_c_gap = _MM_SET1_EPI8(NEG_INFINITY_8+1);
5599       H_nogap_c = _MM_SET1_EPI8(NEG_INFINITY_8-open); /* Compensate for T1 = H + open */
5600 
5601       for (r = clo; r <= chigh + lband && r <= rlength; r++) {
5602 	score_column = matrix[r];
5603 
5604 	if (r == 0) {
5605 	  na1 = 4; /* 'N' */
5606 	} else {
5607 	  na1 = revp ? nt_to_int_array[(int) rsequence[1-r]] : nt_to_int_array[(int) rsequence[r-1]];
5608 	}
5609 	pairscores_ptr = pairscores[na1];
5610 
5611 	if (r == 0) {
5612 	  X_prev_nogap = _MM_SETZERO_SI();
5613 	} else if (clo == 0) {
5614 #ifdef ZERO_INITIAL_GAP_PENALTY
5615 	  X_prev_nogap = _MM_SETZERO_SI();
5616 #elif defined(HAVE_AVX2)
5617 	  X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),NEG_INFINITY_8,LAST_CHAR_INSERT);
5618 #else
5619 	  X_prev_nogap = _mm_set1_epi8(NEG_INFINITY_8); /* works if we start outside the clo bounds */
5620 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
5621 #endif
5622 	} else {
5623 	  /* second or greater block of 8 */
5624 #ifdef ZERO_INITIAL_GAP_PENALTY
5625 	  X_prev_nogap = _MM_SETZERO_SI();
5626 #elif defined(HAVE_AVX2)
5627 	  X_prev_nogap = _mm256_insert_epi8(_mm256_setzero_si256(),matrix[r-1][clo-1],LAST_CHAR_INSERT);
5628 #else
5629 	  X_prev_nogap = _mm_set1_epi8(matrix[r-1][clo-1]); /* get H from previous block and previous column */
5630 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_CHAR_SHIFT);
5631 #endif
5632 	}
5633 
5634 	debug15(print_vector_8(E_mask,clo,r,"E_mask"));
5635 #ifdef HAVE_SSE4_1
5636 	E_c_gap = _MM_MIN_EPI8(E_c_gap,_MM_ADD_EPI8(E_mask,E_infinity));
5637 #else
5638 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5639 #endif
5640 	debug15(print_vector_8(E_c_gap,clo,r,"E_c_gap"));
5641 	debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c load"));
5642 
5643 	/* EGAP */
5644 	T1 = _MM_ADDS_EPI8(H_nogap_c, gap_open);
5645 	dir_vert = _MM_CMPGT_EPI8(E_c_gap,T1); /* E > H, for jump early */
5646 #ifdef HAVE_AVX2
5647 	_mm256_store_si256((__m256i *) &((*directions_Egap)[r][clo]),dir_vert);
5648 #else
5649 	_mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
5650 #endif
5651 	debug15(print_vector_8(T1,clo,r,"T1"));
5652 	debug15(print_vector_8(dir_vert,clo,r,"dir_Egap"));
5653 
5654 	/* Compare H + open with vert */
5655 #ifdef HAVE_SSE4_1
5656 	E_c_gap = _MM_MAX_EPI8(E_c_gap, T1); /* Compare H + open with vert */
5657 	E_c_gap = _MM_ADDS_EPI8(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
5658 	E_c_gap = _MM_MIN_EPI8(E_c_gap,_MM_ADD_EPI8(E_mask,E_infinity));
5659 #elif 1
5660 	E_c_gap = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128)), all_128);
5661 	E_c_gap = _mm_adds_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
5662 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(_mm_add_epi8(E_c_gap, all_128),_mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5663 #else
5664 	/* Try to avoid unnecessary shifts by 128, but overflows */
5665 	E_c_gap = _mm_max_epu8(_mm_add_epi8(E_c_gap, all_128), _mm_add_epi8(T1, all_128));
5666 	E_c_gap = _mm_add_epi8(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
5667 	E_c_gap = _mm_sub_epi8(_mm_min_epu8(E_c_gap, _mm_add_epi8(E_mask,E_infinity_plus_128)), all_128);
5668 #endif
5669 	debug15(print_vector_8(E_c_gap,clo,r,"E"));
5670 
5671 
5672 	/* NOGAP */
5673 #ifdef HAVE_AVX2
5674 	T1 = _mm256_insert_epi8(_mm256_setzero_si256(),_mm256_extract_epi8(H_nogap_c,SIMD_NCHARS-1),LAST_CHAR_INSERT);
5675 	X_prev_nogap = _mm256_insert_epi8(X_prev_nogap,_mm256_extract_epi8(H_nogap_c,MID_CHAR_INSERT-1),MID_CHAR_INSERT);
5676 	H_nogap_c = _mm256_slli_si256(H_nogap_c,ONE_CHAR);
5677 #else
5678 	T1 = _mm_srli_si128(H_nogap_c,LAST_CHAR_SHIFT);
5679 	H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_CHAR);
5680 #endif
5681 	H_nogap_c = _MM_OR_SI(H_nogap_c, X_prev_nogap);
5682 	X_prev_nogap = T1;
5683 
5684 	/* Add pairscores.  No alternate chars for query sequence */
5685 #ifdef HAVE_AVX2
5686 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_ptr[clo]));
5687 	debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5688 	H_nogap_c = _MM_ADDS_EPI8(H_nogap_c, pairscores_std);
5689 #elif defined(HAVE_SSE4_1)
5690 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
5691 	debug15(print_vector_8(pairscores_std,clo,r,"pairscores_std"));
5692 	H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_std);
5693 #else
5694 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo])); /* Has 128 added already */
5695 	pairscores_best = _mm_sub_epi8(pairscores_std, all_128);
5696 	debug15(print_vector_8(pairscores_best,clo,r,"pairscores_std"));
5697 	H_nogap_c = _mm_adds_epi8(H_nogap_c, pairscores_best);
5698 #endif
5699 	_mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
5700 	debug15(print_vector_8(H_nogap_c,clo,r,"H"));
5701 
5702 	dir_vert = _MM_CMPGT_EPI8(E_c_gap,H_nogap_c); /* E > H, for jump early */
5703 #ifdef HAVE_AVX2
5704 	_mm256_store_si256((__m256i *) &((*directions_nogap)[r][clo]),dir_vert);
5705 #else
5706 	_mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
5707 #endif
5708 	debug15(print_vector_8(dir_vert,clo,r,"dir_nogap"));
5709 
5710 
5711 #ifdef HAVE_SSE4_1
5712 	H_nogap_c = _MM_MAX_EPI8(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
5713 #else
5714 	/* Compare H + pairscores with horiz + extend */
5715 	H_nogap_c = _mm_sub_epi8(_mm_max_epu8(_mm_add_epi8(H_nogap_c, all_128), _mm_add_epi8(E_c_gap, all_128)), all_128);
5716 #endif
5717 	debug15(print_vector_8(H_nogap_c,clo,r,"H_nogap_c store"));
5718 #ifdef HAVE_AVX2
5719 	_mm256_store_si256((__m256i *) &(score_column[clo]), H_nogap_c);
5720 #else
5721 	_mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
5722 #endif
5723 
5724 	/* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
5725 	if (chigh >= r) {
5726 	  (*directions_Egap)[r][r] = DIAG;
5727 	  (*directions_nogap)[r][r] = DIAG;
5728 	}
5729 
5730 	/* No need for F loop here */
5731 #ifdef HAVE_AVX2
5732 	save = _mm256_extract_epi8(E_mask,15);
5733 	E_mask = _mm256_slli_si256(E_mask,ONE_CHAR);
5734 	E_mask = _mm256_insert_epi8(E_mask,save,16);
5735 #else
5736 	E_mask = _mm_slli_si128(E_mask,ONE_CHAR);
5737 #endif
5738       }
5739     }
5740   }
5741 
5742 #ifdef CHECK1
5743   /* Row 0 and column 0 directions fail anyway due to saturation */
5744   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
5745   (*directions_Egap)[1][0] = VERT;
5746 #endif
5747 
5748 #ifdef DEBUG2
5749   printf("SIMD: Dynprog_simd_8_lower\n");
5750   Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
5751 		   revp,lband,/*upperp*/false);
5752   Directions8_print_ud(*directions_nogap,*directions_Egap,
5753 		       rlength,glength,rsequence,gsequence,gsequence_alt,
5754 		       revp,lband,/*upperp*/false);
5755 #endif
5756 
5757 #ifdef CHECK1
5758   /* Check for column 0 directions */
5759   for (r = 1; r <= lband && r <= rlength; r++) {
5760     assert((*directions_Egap)[r][0] != DIAG);
5761     assert((*directions_nogap)[r][0] != DIAG);
5762   }
5763 #endif
5764 
5765 #ifdef DEBUG_AVX2
5766   matrix_std = Dynprog_simd_8_lower_nonavx2(&directions_nogap_std,&directions_Egap_std,
5767 					    this,rsequence,gsequence,gsequence_alt,
5768 					    rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
5769 					    open,extend,lband,jump_late_p,revp);
5770 #elif defined(DEBUG_SIMD)
5771   matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
5772 				this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
5773 				rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
5774 				open,extend,lband,/*uband*/0,jump_late_p,revp,/*saturation*/NEG_INFINITY_8,
5775 				/*upperp*/false,/*lowerp*/true);
5776 #endif
5777 
5778 #ifdef DEBUG2
5779   printf("Banded %s\n",revp ? "rev" : "fwd");
5780   Matrix8_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
5781 		   revp,lband,/*upperp*/false);
5782   Directions8_print_ud(*directions_nogap,*directions_Egap,
5783 		       rlength,glength,rsequence,gsequence,gsequence_alt,
5784 		       revp,lband,/*upperp*/false);
5785 #endif
5786 
5787 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
5788   banded_matrix8_compare_lower(matrix,matrix_std,rlength,glength,lband,
5789 			       rsequence,gsequence,gsequence_alt,
5790 			       goffset,chroffset,chrhigh,watsonp,revp);
5791 
5792   banded_directions8_compare_nogap_lower(matrix,*directions_nogap,directions_nogap_std,rlength,glength,lband);
5793 
5794   banded_directions8_compare_Egap_lower(matrix,*directions_Egap,directions_Egap_std,rlength,glength,lband);
5795 #endif
5796 
5797   _mm_free(pairscores[4]);
5798   _mm_free(pairscores[3]);
5799   _mm_free(pairscores[2]);
5800   _mm_free(pairscores[1]);
5801   _mm_free(pairscores[0]);
5802 
5803   return matrix;
5804 }
5805 #endif
5806 
5807 
5808 #ifdef DEBUG_AVX2
5809 Score16_T **
Dynprog_simd_16_nonavx2(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,Direction16_T *** directions_Fgap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,int uband,bool jump_late_p,bool revp)5810 Dynprog_simd_16_nonavx2 (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
5811 			 Direction16_T ***directions_Fgap,
5812 			 T this, char *rsequence, char *gsequence, char *gsequence_alt,
5813 			 int rlength, int glength,
5814 			 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
5815 			 Mismatchtype_T mismatchtype, int open, int extend,
5816 			 int lband, int uband, bool jump_late_p, bool revp) {
5817   int c_gap, last_nogap, score, *FF; /* Need to have the ability to go past NEG_INFINITY */
5818   Score16_T **matrix, *score_column;
5819   __m128i pairscores_std, pairscores_alt;
5820   __m128i H_nogap_r, X_prev_nogap, E_r_gap, T1;
5821   __m128i gap_open, gap_extend, extend_ladder, extend_chunk, complement_dummy;
5822   __m128i dir_horiz;
5823   __m128i ramp, ramp_chunk, lband_vector, filter, ones;
5824   int rlength_ceil, lband_ceil, r, c;
5825   int rlo, rhigh, rlo_calc, rhigh_calc;
5826   int na1, na2, na2_alt;
5827   Score16_T *pairscores_col0;
5828   Score16_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr, pairscore, pairscore0;
5829   Pairdistance_T **pairdistance_array_type;
5830 
5831 
5832   debug2(printf("Dynprog_simd_16.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
5833   debug15(printf("Dynprog_simd_16.  jump_late_p %d, open %d, extend %d, lband %d, uband %d\n",
5834 		 jump_late_p,open,extend,lband,uband));
5835 
5836   rlength_ceil = (int) ((rlength + SIMD_NSHORTS_NONAVX2)/SIMD_NSHORTS_NONAVX2) * SIMD_NSHORTS_NONAVX2;
5837   pairdistance_array_type = pairdistance_array[mismatchtype];
5838 
5839   debug(printf("compute_scores_simd_16_bycols (upper): "));
5840   debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
5841   debug(printf("Query length rounded up to %d\n",rlength_ceil));
5842 
5843   matrix = aligned_score16_alloc(rlength_ceil,glength,
5844 				 this->aligned_std.one.matrix_ptrs,this->aligned_std.one.matrix_space);
5845   *directions_nogap = aligned_directions16_alloc(rlength_ceil,glength,
5846 						 this->aligned_std.one.directions_ptrs_0,this->aligned_std.one.directions_space_0);
5847   *directions_Egap = aligned_directions16_alloc(rlength_ceil,glength,
5848 						this->aligned_std.one.directions_ptrs_1,this->aligned_std.one.directions_space_1);
5849   /* Need to calloc to save time in F loop */
5850   *directions_Fgap = aligned_directions16_calloc(rlength_ceil,glength,
5851 						 this->aligned_std.one.directions_ptrs_2,this->aligned_std.one.directions_space_2);
5852 
5853 #if 0
5854   /* Row 0 initialization */
5855   /* penalty = open; */
5856   for (c = 1; c <= uband && c <= glength; c++) {
5857     /* penalty += extend; */
5858     (*directions_Egap)[c][0] = HORIZ;
5859     (*directions_nogap)[c][0] = HORIZ;
5860   }
5861 #endif
5862 #if 0
5863   /* Already initialized to DIAG.  Actually, no longer initializing directions_Egap */
5864   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
5865   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
5866 #endif
5867 #if 0
5868   /* Column 0 initialization */
5869   /* penalty = open; */
5870   for (r = 1; r <= SIMD_NSHORTS_NONAVX2 && r <= rlength; r++) {
5871     /* penalty += extend; */
5872     (*directions_nogap)[0][r] = VERT;
5873   }
5874 #endif
5875 
5876 
5877   /* Load pairscores.  Store match - mismatch */
5878   pairscores[0] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5879   pairscores[1] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5880   pairscores[2] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5881   pairscores[3] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5882   pairscores[4] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
5883 
5884   lband_ceil = (int) ((lband + SIMD_NSHORTS_NONAVX2)/SIMD_NSHORTS_NONAVX2) * SIMD_NSHORTS_NONAVX2;
5885   pairscores_col0 = (Score16_T *) _mm_malloc(lband_ceil * sizeof(Score16_T),16);
5886 
5887 #if 0
5888   /* Should not be necessary */
5889   memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score16_T));
5890   memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score16_T));
5891   memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score16_T));
5892   memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score16_T));
5893   memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score16_T));
5894 #endif
5895 
5896 
5897   pairscores_col0[0] = (Score16_T) 0;
5898   /* Initialization just to lband causes errors in dir_horiz for Egap */
5899 #ifdef ZERO_INITIAL_GAP_PENALTY
5900   for (r = 1; r < lband_ceil; r++) {
5901     pairscores_col0[r] = (Score16_T) 0;
5902   }
5903 #else
5904   for (r = 1; r < lband_ceil; r++) {
5905     pairscores_col0[r] = (Score16_T) NEG_INFINITY_16;
5906   }
5907 #endif
5908 
5909   r = 0; na1 = 'N';
5910   pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
5911   pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
5912   pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
5913   pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
5914   pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
5915 
5916   if (revp == false) {
5917     for (r = 1; r <= rlength; r++) {
5918       na1 = (int) rsequence[r-1];
5919       pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
5920       pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
5921       pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
5922       pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
5923       pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
5924     }
5925   } else {
5926     for (r = 1; r <= rlength; r++) {
5927       na1 = (int) rsequence[1-r];
5928       pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
5929       pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
5930       pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
5931       pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
5932       pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
5933     }
5934   }
5935 
5936 #if 0
5937   /* Should not be necessary */
5938   memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5939   memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5940   memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5941   memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5942   memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
5943 #endif
5944 
5945   complement_dummy = _mm_set1_epi16(-1);
5946 
5947   FF = (int *) MALLOCA((glength + 1) * sizeof(int));
5948 
5949   gap_open = _mm_set1_epi16((Score16_T) open);
5950   gap_extend = _mm_set1_epi16((Score16_T) extend);
5951 
5952 
5953 #ifndef INFINITE_INITIAL_GAP_PENALTY
5954   lband_vector = _mm_set1_epi16(lband);
5955   ramp = _mm_setr_epi16(1,2,3,4,5,6,7,8);
5956   extend_ladder = _mm_setr_epi16(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend);
5957   ramp_chunk = _mm_set1_epi16(SIMD_NSHORTS_NONAVX2);
5958   extend_chunk = _mm_set1_epi16(SIMD_NSHORTS_NONAVX2*extend);
5959 #endif
5960 
5961   if (jump_late_p) {
5962     for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS_NONAVX2) {
5963       if ((rhigh = rlo + SIMD_NSHORTS_NONAVX2 - 1) > rlength) {
5964 	rhigh = rlength;
5965       }
5966 
5967       if ((c = rlo - lband) < 0) {
5968 	c = 0;
5969 
5970 #if defined(ZERO_INITIAL_GAP_PENALTY)
5971 	/* Initial H in column 0 determined by zeroed out H.  E needs to equal gap_open for column 1. */
5972 	E_r_gap = _mm_set1_epi16(NEG_INFINITY_16-open);
5973 	filter = _mm_cmpgt_epi16(ramp,lband_vector);
5974 	H_nogap_r = _mm_and_si128(filter,E_r_gap); /* Use zeros for score */
5975 
5976 	E_r_gap = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
5977 	E_r_gap = _mm_adds_epi16(E_r_gap,gap_open);
5978 
5979 	ramp = _mm_adds_epi16(ramp,ramp_chunk); /* Prepare for next block */
5980 	extend_ladder = _mm_adds_epi16(extend_ladder,extend_chunk); /* Prepare for next block */
5981 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
5982 	/* dir_horiz tests if E >= H.  To fill in first column of each
5983 	   row block with non-diags, make E == H. */
5984 	E_r_gap = _mm_set1_epi16(NEG_INFINITY_16);
5985 	H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
5986 #else
5987 	/* Initial H in column 0 determined by E, which needs to equal
5988 	   gap_open + extend_ladder for column 1.  H is free to be set
5989 	   equal to E. */
5990 	H_nogap_r = _mm_set1_epi16(NEG_INFINITY_8-open); /* To compensate for T1 = H + open */
5991 	filter = _mm_cmpgt_epi16(ramp,lband_vector);
5992 	H_nogap_r = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),_mm_and_si128(filter,H_nogap_r));
5993 	E_r_gap = _mm_adds_epi16(H_nogap_r,gap_open);
5994 	ramp = _mm_adds_epi16(ramp,ramp_chunk); /* Prepare for next block */
5995 	extend_ladder = _mm_adds_epi16(extend_ladder,extend_chunk); /* Prepare for next block */
5996 #endif
5997 
5998       } else {
5999 	E_r_gap = _mm_set1_epi16(NEG_INFINITY_16);
6000 	H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6001       }
6002 
6003       for ( ; c <= rhigh + uband && c <= glength; c++) {
6004 	score_column = matrix[c];
6005 
6006 	if (c == 0) {
6007 	  pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
6008 
6009 #ifdef ZERO_INITIAL_GAP_PENALTY
6010 	  X_prev_nogap = _mm_set1_epi16(0);
6011 #else
6012 	  if (rlo == 0) {
6013 	    X_prev_nogap = _mm_set1_epi16(0);
6014 	  } else {
6015 	    X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6016 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6017 	  }
6018 #endif
6019 
6020 	} else {
6021 	  na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
6022 	  na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
6023 	  pairscores_std_ptr = pairscores[na2];
6024 	  pairscores_alt_ptr = pairscores[na2_alt];
6025 
6026 	  if (rlo == 0) {
6027 #ifdef ZERO_INITIAL_GAP_PENALTY
6028 	    X_prev_nogap = _mm_set1_epi16(0);
6029 #else
6030 	    X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6031 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6032 #endif
6033 	  } else {
6034 	    /* second or greater block of 16 */
6035 	    X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
6036 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6037 	  }
6038 	}
6039 
6040 	debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
6041 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
6042 
6043 	/* EGAP */
6044 	T1 = _mm_adds_epi16(H_nogap_r, gap_open);
6045 	dir_horiz = _mm_cmplt_epi16(E_r_gap,T1); /* E < H */
6046 	dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy);	/* E >= H, for jump late */
6047 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
6048 	debug15(print_vector_16(T1,rlo,c,"T1"));
6049 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
6050 
6051 	E_r_gap = _mm_max_epi16(E_r_gap, T1); /* Compare H + open with vert */
6052 	E_r_gap = _mm_adds_epi16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
6053 	debug15(print_vector_16(E_r_gap,rlo,c,"E"));
6054 
6055 
6056 	/* NOGAP */
6057 	T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_NONAVX2);
6058 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
6059 	H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
6060 	X_prev_nogap = T1;
6061 
6062 	/* Add pairscores, allowing for alternate genomic nt */
6063 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
6064 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
6065 	H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
6066 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
6067 	debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
6068 
6069 	dir_horiz = _mm_cmplt_epi16(E_r_gap,H_nogap_r); /* E < H */
6070 	dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy);	/* E >= H, for jump late */
6071 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
6072 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
6073 
6074 	H_nogap_r = _mm_max_epi16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
6075 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
6076 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
6077 
6078 
6079 	/* F loop */
6080 	if ((rlo_calc = rlo) <= c - uband) {
6081 	  rlo_calc = c - uband;
6082 	}
6083 	if ((rhigh_calc = rhigh) >= c + lband) {
6084 	  rhigh_calc = c + lband;
6085 	  if (c > 0) {
6086 	    /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
6087 	    pairscore = pairscores[na2][rhigh_calc];
6088 	    if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
6089 	      pairscore = pairscore0;
6090 	    }
6091 	    /* No need to fix for non-SSE4.1: pairscore -= 128; */
6092 	    if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_16) {
6093 	      score_column[rhigh_calc] = NEG_INFINITY_16; /* Saturation */
6094 	    } else if (score > POS_INFINITY_16) {
6095 	      score_column[rhigh_calc] = POS_INFINITY_16; /* Saturation */
6096 	    } else {
6097 	      score_column[rhigh_calc] = (Score16_T) score;
6098 	    }
6099 	    (*directions_Egap)[c][rhigh_calc] = DIAG;
6100 	    (*directions_nogap)[c][rhigh_calc] = DIAG;
6101 	  }
6102 	}
6103 
6104 	debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
6105 		      rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
6106 
6107 	if (rlo == 0) {
6108 	  c_gap = NEG_INFINITY_INT;
6109 	  last_nogap = NEG_INFINITY_INT;
6110 	} else if (c >= rlo + uband) {
6111 	  c_gap = NEG_INFINITY_INT;
6112 	  last_nogap = NEG_INFINITY_INT;
6113 	} else {
6114 	  debug3(printf("At c %d, uband %d, reading c_gap %d\n",c,uband,FF[c]));
6115 	  c_gap = FF[c];
6116 	  last_nogap = (int) score_column[rlo_calc-1];
6117 	}
6118 
6119 	if ((r = rlo_calc) == c - uband) {
6120 	  /* Handle top value as a special case to prevent going outside of uband */
6121 	  /* FGAP */
6122 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6123 			r,c,c_gap + extend,last_nogap + open + extend));
6124 	  score = last_nogap + open /* + extend */;
6125 	  c_gap = score + extend;
6126 	  /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6127 
6128 	  /* NOGAP */
6129 	  last_nogap = (int) score_column[r];
6130 	  r++;
6131 	}
6132 
6133 	/* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
6134 	for ( ; r <= rhigh_calc; r++) {
6135 	  /* FGAP */
6136 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6137 			r,c,c_gap + extend,last_nogap + open + extend));
6138 	  if (c_gap /* + extend */ >= (score = last_nogap + open /* + extend */)) {  /* Use >= for jump late */
6139 	    c_gap += extend;
6140 	    (*directions_Fgap)[c][r] = VERT;
6141 	  } else {
6142 	    c_gap = score + extend;
6143 	    /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6144 	  }
6145 
6146 	  /* NOGAP */
6147 	  last_nogap = (int) score_column[r];
6148 	  debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
6149 	  if (c_gap >= last_nogap) {  /* Use >= for jump late */
6150 	    last_nogap = c_gap;
6151 	    score_column[r] = (c_gap < NEG_INFINITY_16) ? NEG_INFINITY_16 : (Score16_T) c_gap; /* Saturation */
6152 	    (*directions_nogap)[c][r] = VERT;
6153 	  }
6154 	}
6155 
6156 	FF[c] = c_gap;
6157 	debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
6158 	H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
6159       }
6160     }
6161 
6162   } else {
6163     /* jump early */
6164 #if defined(ZERO_INITIAL_GAP_PENALTY) || defined(INFINITE_INITIAL_GAP_PENALTY)
6165     /* No need for ones */
6166 #else
6167     ones = _mm_set1_epi16(1);
6168 #endif
6169     for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS_NONAVX2) {
6170       if ((rhigh = rlo + SIMD_NSHORTS_NONAVX2 - 1) > rlength) {
6171 	rhigh = rlength;
6172       }
6173 
6174       if ((c = rlo - lband) < 0) {
6175 	c = 0;
6176 
6177 #if defined(ZERO_INITIAL_GAP_PENALTY)
6178 	/* Initial H in column 0 determined by zeroed out H.  E needs to equal gap_open for column 1. */
6179 	E_r_gap = _mm_set1_epi16(NEG_INFINITY_16-open);
6180 	filter = _mm_cmpgt_epi16(ramp,lband_vector);
6181 	H_nogap_r = _mm_and_si128(filter,E_r_gap); /* Use zeros for score */
6182 
6183 	E_r_gap = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),H_nogap_r);
6184 	E_r_gap = _mm_adds_epi16(E_r_gap,gap_open);
6185 
6186 	ramp = _mm_adds_epi16(ramp,ramp_chunk); /* Prepare for next block */
6187 	extend_ladder = _mm_adds_epi16(extend_ladder,extend_chunk); /* Prepare for next block */
6188 #elif defined(INFINITE_INITIAL_GAP_PENALTY)
6189 	/* dir_horiz tests if E > H.  To fill in first column of each
6190 	   row block with non-diags, make E > H. */
6191 	E_r_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
6192 	H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6193 #else
6194 	/* Initial H in column 0 determined by E, which needs to equal
6195 	   gap_open + extend_ladder for column 1.  H is free to be set
6196 	   less than E. */
6197 	H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open+1); /* To compensate for T1 = H + open */
6198 	filter = _mm_cmpgt_epi16(ramp,lband_vector);
6199 	H_nogap_r = _mm_or_si128(_mm_andnot_si128(filter,extend_ladder),_mm_and_si128(filter,H_nogap_r));
6200 	E_r_gap = _mm_adds_epi16(H_nogap_r,gap_open);
6201 	H_nogap_r = _mm_subs_epi16(H_nogap_r,ones);    /* To ensure H < E */
6202 	ramp = _mm_adds_epi16(ramp,ramp_chunk); /* Prepare for next block */
6203 	extend_ladder = _mm_adds_epi16(extend_ladder,extend_chunk); /* Prepare for next block */
6204 #endif
6205 
6206       } else {
6207 	E_r_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
6208 	H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
6209       }
6210 
6211       for ( ; c <= rhigh + uband && c <= glength; c++) {
6212 	score_column = matrix[c];
6213 
6214 	if (c == 0) {
6215 	  pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;
6216 
6217 #ifdef ZERO_INITIAL_GAP_PENALTY
6218 	  X_prev_nogap = _mm_set1_epi16(0);
6219 #else
6220 	  if (rlo == 0) {
6221 	    X_prev_nogap = _mm_set1_epi16(0);
6222 	  } else {
6223 	    X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6224 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6225 	  }
6226 #endif
6227 
6228 	} else {
6229 	  na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
6230 	  na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
6231 	  pairscores_std_ptr = pairscores[na2];
6232 	  pairscores_alt_ptr = pairscores[na2_alt];
6233 
6234 	  if (rlo == 0) {
6235 #ifdef ZERO_INITIAL_GAP_PENALTY
6236 	    X_prev_nogap = _mm_set1_epi16(0);
6237 #else
6238 	    X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
6239 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6240 #endif
6241 	  } else {
6242 	    /* second or greater block of 16 */
6243 	    X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
6244 	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
6245 	  }
6246 	}
6247 
6248 	debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
6249 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
6250 
6251 	/* EGAP */
6252 	T1 = _mm_adds_epi16(H_nogap_r, gap_open);
6253 	dir_horiz = _mm_cmpgt_epi16(E_r_gap,T1); /* E > H, for jump early */
6254 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
6255 	debug15(print_vector_16(T1,rlo,c,"T1"));
6256 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
6257 
6258 	E_r_gap = _mm_max_epi16(E_r_gap, T1); /* Compare H + open with vert */
6259 	E_r_gap = _mm_adds_epi16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
6260 	debug15(print_vector_16(E_r_gap,rlo,c,"E"));
6261 
6262 
6263 	/* NOGAP */
6264 	T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_NONAVX2);
6265 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
6266 	H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
6267 	X_prev_nogap = T1;
6268 
6269 	/* Add pairscores, allowing for alternate genomic nt */
6270 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
6271 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
6272 	H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
6273 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
6274 	debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
6275 
6276 	dir_horiz = _mm_cmpgt_epi16(E_r_gap,H_nogap_r); /* E > H, for jump early */
6277 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
6278 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
6279 
6280 	H_nogap_r = _mm_max_epi16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
6281 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
6282 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
6283 
6284 
6285 	/* F loop */
6286 	if ((rlo_calc = rlo) < c - uband) {
6287 	  rlo_calc = c - uband;
6288 	}
6289 	if ((rhigh_calc = rhigh) >= c + lband) {
6290 	  rhigh_calc = c + lband;
6291 	  if (c > 0) {
6292 	    /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
6293 	    pairscore = pairscores[na2][rhigh_calc];
6294 	    if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
6295 	      pairscore = pairscore0;
6296 	    }
6297 	    /* No need to fix for non-SSE4.1: pairscore -= 128; */
6298 	    if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_16) {
6299 	      score_column[rhigh_calc] = NEG_INFINITY_16; /* Saturation */
6300 	    } else if (score > POS_INFINITY_16) {
6301 	      score_column[rhigh_calc] = POS_INFINITY_16; /* Saturation */
6302 	    } else {
6303 	      score_column[rhigh_calc] = (Score16_T) score;
6304 	    }
6305 	    (*directions_Egap)[c][rhigh_calc] = DIAG;
6306 	    (*directions_nogap)[c][rhigh_calc] = DIAG;
6307 	  }
6308 	}
6309 
6310 	debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
6311 		      rlo,rhigh,rlo_calc,c,lband,uband,rhigh_calc));
6312 
6313 	if (rlo == 0) {
6314 	  c_gap = NEG_INFINITY_INT;
6315 	  last_nogap = NEG_INFINITY_INT;
6316 	} else if (c >= rlo + uband) {
6317 	  c_gap = NEG_INFINITY_INT;
6318 	  last_nogap = NEG_INFINITY_INT;
6319 	} else {
6320 	  c_gap = FF[c];
6321 	  last_nogap = (int) score_column[rlo_calc-1];
6322 	}
6323 
6324 	if ((r = rlo_calc) == c - uband) {
6325 	  /* Handle top value as a special case to prevent going outside of uband */
6326 	  /* FGAP */
6327 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6328 			r,c,c_gap + extend,last_nogap + open + extend));
6329 	  score = last_nogap + open /* + extend */;
6330 	  c_gap = score + extend;
6331 	  /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6332 
6333 	  /* NOGAP */
6334 	  last_nogap = (int) score_column[r];
6335 	  r++;
6336 	}
6337 
6338 	/* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
6339 	for ( ; r <= rhigh_calc; r++) {
6340 	  /* FGAP */
6341 	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
6342 			r,c,c_gap + extend,last_nogap + open + extend));
6343 	  if (c_gap /* + extend */ > (score = last_nogap + open /* + extend */)) {  /* Use > for jump early */
6344 	    c_gap += extend;
6345 	    (*directions_Fgap)[c][r] = VERT;
6346 	  } else {
6347 	    c_gap = score + extend;
6348 	    /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
6349 	  }
6350 
6351 	  /* NOGAP */
6352 	  last_nogap = (int) score_column[r];
6353 	  debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
6354 	  if (c_gap > last_nogap) {  /* Use > for jump early */
6355 	    last_nogap = c_gap;
6356 	    score_column[r] = (c_gap < NEG_INFINITY_16) ? NEG_INFINITY_16 : (Score16_T) c_gap; /* Saturation */
6357 	    (*directions_nogap)[c][r] = VERT;
6358 	  }
6359 	}
6360 
6361 	FF[c] = c_gap;
6362 	debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
6363 	H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
6364       }
6365     }
6366   }
6367 
6368 
6369 #ifdef CHECK1
6370   /* Row 0 and column 0 directions fail anyway due to saturation */
6371   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
6372   (*directions_Egap)[1][0] = HORIZ;
6373   (*directions_Fgap)[0][1] = VERT;
6374 #endif
6375 
6376 
6377 #ifdef DEBUG2
6378   printf("SIMD: Dynprog_simd_16\n");
6379   Matrix16_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
6380 		 revp,lband,uband);
6381   Directions16_print(*directions_nogap,*directions_Egap,*directions_Fgap,
6382 		     rlength,glength,rsequence,gsequence,gsequence_alt,revp,lband,uband);
6383 #endif
6384 
6385 #ifdef CHECK1
6386   /* Check for row 0 directions */
6387   for (c = 1; c <= uband && c <= glength; c++) {
6388     assert((*directions_Egap)[c][0] != DIAG);
6389     assert((*directions_nogap)[c][0] != DIAG);
6390   }
6391   /* Check for column 0 directions */
6392   for (r = 1; r <= lband && r <= rlength; r++) {
6393     assert((*directions_Fgap)[0][r] != DIAG);
6394     assert((*directions_nogap)[0][r] != DIAG);
6395   }
6396 #endif
6397 
6398   FREEA(FF);
6399   _mm_free(pairscores_col0);
6400   _mm_free(pairscores[4]);
6401   _mm_free(pairscores[3]);
6402   _mm_free(pairscores[2]);
6403   _mm_free(pairscores[1]);
6404   _mm_free(pairscores[0]);
6405 
6406   return matrix;
6407   }
6408 #endif
6409 
6410 
6411 
6412 
#if defined(HAVE_SSE2)
/* Modified from Dynprog_simd_16_upper.  Operates by columns.

   Fills a banded score matrix using 16-bit saturating SIMD arithmetic,
   together with three direction matrices (nogap, Egap, Fgap).

   Rows (r = 0..rlength, query positions) are processed in blocks of
   SIMD_NSHORTS.  For each row block [rlo,rhigh], the columns (c, genomic
   positions) from max(0, rlo - lband) through min(rhigh + uband, glength)
   are swept in increasing order.  Within one column of a block:
     - the E (horizontal gap) and H (no gap) recurrences are vectorized;
     - the F (vertical gap) recurrence is a scalar loop over rows
       max(rlo, c - uband)..min(rhigh, c + lband), whose running value is
       carried from one row block to the next in FF[c].

   Parameters:
     directions_nogap, directions_Egap, directions_Fgap
                 out parameters; receive the direction matrices, which are
                 indexed [c][r] like the score matrix
     this        supplies the storage handed to the aligned_*_alloc helpers
     rsequence   query sequence; row r reads rsequence[r-1] when revp is
                 false and rsequence[1-r] when revp is true
     gsequence, gsequence_alt
                 genomic sequence and its alternate; column c reads [c-1]
                 or [1-c] in the same way.  The larger of the two pair
                 scores is used for each cell.
     rlength, glength
                 number of query rows and genomic columns
     goffset, chroffset, chrhigh, watsonp
                 present only in DEBUG_AVX2/DEBUG_SIMD builds, where they
                 are forwarded to the reference implementation and compare
                 routines
     mismatchtype
                 selects the table of pairdistance_array used for scoring
     open, extend
                 gap-open and gap-extend scores, which are added to the
                 running score (presumably non-positive penalties --
                 confirm against callers)
     lband, uband
                 band limits: column c only computes rows c-uband..c+lband
     jump_late_p if true, ties are resolved in favor of the gap direction
                 (>= comparisons); otherwise in favor of DIAG (> comparisons)
     revp        selects reverse indexing of the sequences (see above)

   Returns the score matrix, indexed [c][r].  The temporary pairscores
   buffers and FF are released before returning. */
Score16_T **
Dynprog_simd_16 (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
		 Direction16_T ***directions_Fgap,
		 T this, char *rsequence, char *gsequence, char *gsequence_alt,
		 int rlength, int glength,
#if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
		 int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
#endif
		 Mismatchtype_T mismatchtype, int open, int extend,
		 int lband, int uband, bool jump_late_p, bool revp) {
  int c_gap, last_nogap, score, *FF; /* Need to have the ability to go past NEG_INFINITY */
  Score16_T **matrix, *score_column;
#ifdef HAVE_AVX2
  __m256i pairscores_std, pairscores_alt;
  __m256i H_nogap_r, X_prev_nogap, E_r_gap, T1;
  __m256i gap_open, gap_extend, complement_dummy;
  __m256i dir_horiz;
#if defined(ZERO_INITIAL_GAP_PENALTY)
  __m256i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter;
#elif defined(INFINITE_INITIAL_GAP_PENALTY)
#else
  __m256i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter, ones;
#endif

#else
  __m128i pairscores_std, pairscores_alt;
  __m128i H_nogap_r, X_prev_nogap, E_r_gap, T1;
  __m128i gap_open, gap_extend, complement_dummy;
  __m128i dir_horiz;
#if defined(ZERO_INITIAL_GAP_PENALTY)
  __m128i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter;
#elif defined(INFINITE_INITIAL_GAP_PENALTY)
#else
  __m128i ramp, ramp_chunk, extend_ladder, extend_chunk, lband_vector, filter, ones;
#endif

#endif

  int rlength_ceil, lband_ceil, r, c;
  int rlo, rhigh, rlo_calc, rhigh_calc;
  int na1, na2, na2_alt;
  Score16_T *pairscores_col0;
  Score16_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr, pairscore, pairscore0;
  Pairdistance_T **pairdistance_array_type;

#if defined(DEBUG_AVX2)
  Score16_T **matrix_std;
  Direction16_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
#elif defined(DEBUG_SIMD)
  Score32_T **matrix_std;
  Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
#endif


  debug2(printf("Dynprog_simd_16.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
  debug15(printf("Dynprog_simd_16.  jump_late_p %d, open %d, extend %d, lband %d, uband %d\n",
		 jump_late_p,open,extend,lband,uband));

  /* Rows 0..rlength (rlength + 1 entries), rounded up to a whole number of SIMD blocks */
  rlength_ceil = (int) ((rlength + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
  pairdistance_array_type = pairdistance_array[mismatchtype];

  debug(printf("compute_scores_simd_16_bycols (upper): "));
  debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
  debug(printf("Query length rounded up to %d\n",rlength_ceil));

  matrix = aligned_score16_alloc(rlength_ceil,glength,
				 this->aligned.one.matrix_ptrs,this->aligned.one.matrix_space);
  *directions_nogap = aligned_directions16_alloc(rlength_ceil,glength,
						 this->aligned.one.directions_ptrs_0,this->aligned.one.directions_space_0);
  *directions_Egap = aligned_directions16_alloc(rlength_ceil,glength,
						this->aligned.one.directions_ptrs_1,this->aligned.one.directions_space_1);
  /* Need to calloc to save time in F loop */
  *directions_Fgap = aligned_directions16_calloc(rlength_ceil,glength,
						 this->aligned.one.directions_ptrs_2,this->aligned.one.directions_space_2);

#if 0
  /* Row 0 initialization */
  /* penalty = open; */
  for (c = 1; c <= uband && c <= glength; c++) {
    /* penalty += extend; */
    (*directions_Egap)[c][0] = HORIZ;
    (*directions_nogap)[c][0] = HORIZ;
  }
#endif
#if 0
  /* Already initialized to DIAG.  Actually, no longer initializing directions_Egap */
  (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
  (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
#endif
#if 0
  /* Column 0 initialization */
  /* penalty = open; */
  for (r = 1; r <= SIMD_NSHORTS && r <= rlength; r++) {
    /* penalty += extend; */
    (*directions_nogap)[0][r] = VERT;
  }
#endif


  /* Load pairscores.  Store match - mismatch */
  /* One row-indexed score profile per genomic nucleotide code (A, C, G, T, N) */
  pairscores[0] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
  pairscores[1] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
  pairscores[2] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
  pairscores[3] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
  pairscores[4] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);

  lband_ceil = (int) ((lband + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
  pairscores_col0 = (Score16_T *) _mm_malloc(lband_ceil * sizeof(Score16_T),ALIGN_SIZE);

#if 0
  /* Should not be necessary */
  memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score16_T));
  memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score16_T));
  memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score16_T));
  memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score16_T));
  memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score16_T));
#endif


  /* Score profile used in place of pairscores[] when computing column 0 */
  pairscores_col0[0] = (Score16_T) 0;
  /* Initialization just to lband causes errors in dir_horiz for Egap */
#ifdef ZERO_INITIAL_GAP_PENALTY
  for (r = 1; r < lband_ceil; r++) {
    pairscores_col0[r] = (Score16_T) 0;
  }
#else
  for (r = 1; r < lband_ceil; r++) {
    pairscores_col0[r] = (Score16_T) NEG_INFINITY_16;
  }
#endif

  /* Row 0 has no query nucleotide, so it is scored as 'N' */
  r = 0; na1 = 'N';
  pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
  pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
  pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
  pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
  pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];

  if (revp == false) {
    for (r = 1; r <= rlength; r++) {
      na1 = (int) rsequence[r-1];
      pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
      pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
      pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
      pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
      pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
    }
  } else {
    /* Reverse direction: the sequence is read backward from rsequence[0] */
    for (r = 1; r <= rlength; r++) {
      na1 = (int) rsequence[1-r];
      pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
      pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
      pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
      pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
      pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
    }
  }

#if 0
  /* Should not be necessary */
  memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
  memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
  memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
  memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
  memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
#endif

  /* All bits set; andnot against this inverts a comparison mask */
  complement_dummy = _MM_SET1_EPI16(-1);

  /* FF[c] carries the F (vertical gap) score of column c from one row block to the next */
  FF = (int *) MALLOCA((glength + 1) * sizeof(int));

  gap_open = _MM_SET1_EPI16((Score16_T) open);
  gap_extend = _MM_SET1_EPI16((Score16_T) extend);

#ifndef INFINITE_INITIAL_GAP_PENALTY
  /* ramp holds the row number of each lane and extend_ladder the matching
     multiple of extend.  Both need one entry per 16-bit lane. */
#ifdef HAVE_AVX2
  /* _mm256_setr_epi16 takes 16 arguments, one for each of the 16 lanes */
  ramp = _mm256_setr_epi16(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
  extend_ladder = _mm256_setr_epi16(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend,
				    9*extend,10*extend,11*extend,12*extend,13*extend,14*extend,15*extend,16*extend);
#else
  ramp = _mm_setr_epi16(1,2,3,4,5,6,7,8);
  extend_ladder = _mm_setr_epi16(extend,2*extend,3*extend,4*extend,5*extend,6*extend,7*extend,8*extend);
#endif
  lband_vector = _MM_SET1_EPI16(lband);
  ramp_chunk = _MM_SET1_EPI16(SIMD_NSHORTS);
  extend_chunk = _MM_SET1_EPI16(SIMD_NSHORTS*extend);
#endif

  if (jump_late_p) {
    for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS) {
      if ((rhigh = rlo + SIMD_NSHORTS - 1) > rlength) {
	rhigh = rlength;
      }

      if ((c = rlo - lband) < 0) {
	/* This row block reaches column 0, so it needs the initial gap setup */
	c = 0;

#if defined(ZERO_INITIAL_GAP_PENALTY)
	/* Initial H in column 0 determined by zeroed out H.  E needs to equal gap_open for column 1. */
	E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16-open);
	filter = _MM_CMPGT_EPI16(ramp,lband_vector);
	H_nogap_r = _MM_AND_SI(filter,E_r_gap); /* Use zeros for score */

	/* Width-agnostic macro, so that this also compiles for AVX2 */
	E_r_gap = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),H_nogap_r);
	E_r_gap = _MM_ADDS_EPI16(E_r_gap,gap_open);

	ramp = _MM_ADDS_EPI16(ramp,ramp_chunk); /* Prepare for next block */
	extend_ladder = _MM_ADDS_EPI16(extend_ladder,extend_chunk); /* Prepare for next block */
#elif defined(INFINITE_INITIAL_GAP_PENALTY)
	/* dir_horiz tests if E >= H.  To fill in first column of each
	   row block with non-diags, make E == H. */
	E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16);
	H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
#else
	/* Initial H in column 0 determined by E, which needs to equal
	   gap_open + extend_ladder for column 1.  H is free to be set
	   equal to E. */
	/* Rows beyond lband get the 16-bit sentinel, as in the jump early branch */
	H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* To compensate for T1 = H + open */
	filter = _MM_CMPGT_EPI16(ramp,lband_vector);
	H_nogap_r = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),_MM_AND_SI(filter,H_nogap_r));
	E_r_gap = _MM_ADDS_EPI16(H_nogap_r,gap_open);
	ramp = _MM_ADDS_EPI16(ramp,ramp_chunk); /* Prepare for next block */
	extend_ladder = _MM_ADDS_EPI16(extend_ladder,extend_chunk); /* Prepare for next block */
#endif

      } else {
	E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16);
	H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
      }

      for ( ; c <= rhigh + uband && c <= glength; c++) {
	score_column = matrix[c];

	if (c == 0) {
	  pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;

#ifdef ZERO_INITIAL_GAP_PENALTY
	  X_prev_nogap = _MM_SETZERO_SI();
#elif defined(HAVE_AVX2)
	  if (rlo == 0) {
	    X_prev_nogap = _mm256_setzero_si256();
	  } else {
	    X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
	  }
#else
	  if (rlo == 0) {
	    X_prev_nogap = _mm_setzero_si128();
	  } else {
	    X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
	  }
#endif

	} else {
	  na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
	  na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
	  pairscores_std_ptr = pairscores[na2];
	  pairscores_alt_ptr = pairscores[na2_alt];

#ifdef HAVE_AVX2
	  if (rlo == 0) {
#ifdef ZERO_INITIAL_GAP_PENALTY
	    X_prev_nogap = _MM_SETZERO_SI();
#else
	    X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
#endif
	  } else {
	    /* second or greater block of 16 */
	    X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_SHORT_INSERT);
	  }

#else
	  if (rlo == 0) {
#ifdef ZERO_INITIAL_GAP_PENALTY
	    X_prev_nogap = _MM_SETZERO_SI();
#else
	    X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
#endif
	  } else {
	    /* second or greater block of 16 */
	    X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
	  }
#endif
	}

	debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));

	/* EGAP */
	T1 = _MM_ADDS_EPI16(H_nogap_r, gap_open);
	dir_horiz = _MM_CMPLT_EPI16(E_r_gap,T1); /* E < H */
	dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy);	/* E >= H, for jump late */
#ifdef HAVE_AVX2
	_mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
#else
	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
#endif
	debug15(print_vector_16(T1,rlo,c,"T1"));
	debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));

	E_r_gap = _MM_MAX_EPI16(E_r_gap, T1); /* Compare H + open with vert */
	E_r_gap = _MM_ADDS_EPI16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
	debug15(print_vector_16(E_r_gap,rlo,c,"E"));


	/* NOGAP */
	/* Shift H down by one row so that each lane sees its diagonal
	   predecessor; X_prev_nogap supplies the value entering at the top */
#ifdef HAVE_AVX2
	T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_r,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
	X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_r,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
	H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_SHORT);
#else
	T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_SHIFT);
	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
#endif
	H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
	X_prev_nogap = T1;

	/* Add pairscores, allowing for alternate genomic nt */
#ifdef HAVE_AVX2
	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
	pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
	H_nogap_r = _mm256_adds_epi16(H_nogap_r, _mm256_max_epi16(pairscores_std,pairscores_alt));
#else
	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
	H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
#endif
	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
	debug15(print_vector_16(H_nogap_r,rlo,c,"H"));

	dir_horiz = _MM_CMPLT_EPI16(E_r_gap,H_nogap_r); /* E < H */
	dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy);	/* E >= H, for jump late */
#ifdef HAVE_AVX2
	_mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
#else
	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
#endif
	debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));

	H_nogap_r = _MM_MAX_EPI16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
#ifdef HAVE_AVX2
	_mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
#else
	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
#endif


	/* F loop */
	/* Clip the rows of this block to the band around column c */
	if ((rlo_calc = rlo) < c - uband) {
	  rlo_calc = c - uband;
	}
	if ((rhigh_calc = rhigh) >= c + lband) {
	  rhigh_calc = c + lband;
	  if (c > 0) {
	    /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
	    pairscore = pairscores[na2][rhigh_calc];
	    if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
	      pairscore = pairscore0;
	    }
	    /* No need to fix for non-SSE4.1: pairscore -= 128; */
	    if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_16) {
	      score_column[rhigh_calc] = NEG_INFINITY_16; /* Saturation */
	    } else if (score > POS_INFINITY_16) {
	      score_column[rhigh_calc] = POS_INFINITY_16; /* Saturation */
	    } else {
	      score_column[rhigh_calc] = (Score16_T) score;
	    }
	    (*directions_Egap)[c][rhigh_calc] = DIAG;
	    (*directions_nogap)[c][rhigh_calc] = DIAG;
	  }
	}

	debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
		      rlo,rhigh,c,lband,uband,rlo_calc,rhigh_calc));

	if (rlo == 0) {
	  c_gap = NEG_INFINITY_INT;
	  last_nogap = NEG_INFINITY_INT;
	} else if (c >= rlo + uband) {
	  c_gap = NEG_INFINITY_INT;
	  last_nogap = NEG_INFINITY_INT;
	} else {
	  /* Continue the vertical gap from the row block above */
	  debug3(printf("At c %d, uband %d, reading c_gap %d\n",c,uband,FF[c]));
	  c_gap = FF[c];
	  last_nogap = (int) score_column[rlo_calc-1];
	}

	if ((r = rlo_calc) == c - uband) {
	  /* Handle top value as a special case to prevent going outside of uband */
	  /* FGAP */
	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
			r,c,c_gap + extend,last_nogap + open + extend));
	  score = last_nogap + open /* + extend */;
	  c_gap = score + extend;
	  /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */

	  /* NOGAP */
	  last_nogap = (int) score_column[r];
	  r++;
	}

	/* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
	for ( ; r <= rhigh_calc; r++) {
	  /* FGAP */
	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
			r,c,c_gap + extend,last_nogap + open + extend));
	  if (c_gap /* + extend */ >= (score = last_nogap + open /* + extend */)) {  /* Use >= for jump late */
	    c_gap += extend;
	    (*directions_Fgap)[c][r] = VERT;
	  } else {
	    c_gap = score + extend;
	    /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
	  }

	  /* NOGAP */
	  last_nogap = (int) score_column[r];
	  debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
	  if (c_gap >= last_nogap) {  /* Use >= for jump late */
	    last_nogap = c_gap;
	    score_column[r] = (c_gap < NEG_INFINITY_16) ? NEG_INFINITY_16 : (Score16_T) c_gap; /* Saturation */
	    (*directions_nogap)[c][r] = VERT;
	  }
	}

	FF[c] = c_gap;
	debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
#ifdef HAVE_AVX2
	H_nogap_r = _mm256_load_si256((__m256i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
#else
	H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
#endif
      }
    }

  } else {
    /* jump early */
#if defined(ZERO_INITIAL_GAP_PENALTY) || defined(INFINITE_INITIAL_GAP_PENALTY)
    /* No need for ones */
#else
    ones = _MM_SET1_EPI16(1);
#endif
    for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS) {
      if ((rhigh = rlo + SIMD_NSHORTS - 1) > rlength) {
	rhigh = rlength;
      }

      if ((c = rlo - lband) < 0) {
	/* This row block reaches column 0, so it needs the initial gap setup */
	c = 0;

#if defined(ZERO_INITIAL_GAP_PENALTY)
	/* Initial H in column 0 determined by zeroed out H.  E needs to equal gap_open for column 1. */
	E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16-open);
	filter = _MM_CMPGT_EPI16(ramp,lband_vector);
	H_nogap_r = _MM_AND_SI(filter,E_r_gap); /* Use zeros for score */

	E_r_gap = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),H_nogap_r);
	E_r_gap = _MM_ADDS_EPI16(E_r_gap,gap_open);

	ramp = _MM_ADDS_EPI16(ramp,ramp_chunk); /* Prepare for next block */
	extend_ladder = _MM_ADDS_EPI16(extend_ladder,extend_chunk); /* Prepare for next block */
#elif defined(INFINITE_INITIAL_GAP_PENALTY)
	/* dir_horiz tests if E > H.  To fill in first column of each
	   row block with non-diags, make E > H. */
	E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16+1);
	H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
#else
	/* Initial H in column 0 determined by E, which needs to equal
	   gap_open + extend_ladder for column 1.  H is free to be set
	   less than E. */
	H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open+1); /* To compensate for T1 = H + open */
	filter = _MM_CMPGT_EPI16(ramp,lband_vector);
	H_nogap_r = _MM_OR_SI(_MM_ANDNOT_SI(filter,extend_ladder),_MM_AND_SI(filter,H_nogap_r));
	E_r_gap = _MM_ADDS_EPI16(H_nogap_r,gap_open);
	H_nogap_r = _MM_SUBS_EPI16(H_nogap_r,ones);    /* To ensure H < E */
	ramp = _MM_ADDS_EPI16(ramp,ramp_chunk); /* Prepare for next block */
	extend_ladder = _MM_ADDS_EPI16(extend_ladder,extend_chunk); /* Prepare for next block */
#endif

      } else {
	E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16+1);
	H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
      }

      for ( ; c <= rhigh + uband && c <= glength; c++) {
	score_column = matrix[c];

	if (c == 0) {
	  pairscores_std_ptr = pairscores_alt_ptr = pairscores_col0;

#ifdef ZERO_INITIAL_GAP_PENALTY
	  X_prev_nogap = _MM_SETZERO_SI();
#elif defined(HAVE_AVX2)
	  if (rlo == 0) {
	    X_prev_nogap = _mm256_setzero_si256();
	  } else {
	    X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
	  }
#else
	  if (rlo == 0) {
	    X_prev_nogap = _mm_setzero_si128();
	  } else {
	    X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
	  }
#endif

	} else {
	  na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
	  na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
	  pairscores_std_ptr = pairscores[na2];
	  pairscores_alt_ptr = pairscores[na2_alt];

#ifdef HAVE_AVX2
	  if (rlo == 0) {
#ifdef ZERO_INITIAL_GAP_PENALTY
	    X_prev_nogap = _MM_SETZERO_SI();
#else
	    X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
#endif
	  } else {
	    /* second or greater block of 16 */
	    X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_SHORT_INSERT);
	  }

#else
	  if (rlo == 0) {
#ifdef ZERO_INITIAL_GAP_PENALTY
	    X_prev_nogap = _MM_SETZERO_SI();
#else
	    X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16);
	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
#endif
	  } else {
	    /* second or greater block of 16 */
	    X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
	    X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
	  }
#endif
	}

	debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));

	/* EGAP */
	T1 = _MM_ADDS_EPI16(H_nogap_r, gap_open);
	dir_horiz = _MM_CMPGT_EPI16(E_r_gap,T1); /* E > H, for jump early */
#ifdef HAVE_AVX2
	_mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
#else
	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
#endif
	debug15(print_vector_16(T1,rlo,c,"T1"));
	debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));

	E_r_gap = _MM_MAX_EPI16(E_r_gap, T1); /* Compare H + open with vert */
	E_r_gap = _MM_ADDS_EPI16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
	debug15(print_vector_16(E_r_gap,rlo,c,"E"));


	/* NOGAP */
	/* Shift H down by one row so that each lane sees its diagonal
	   predecessor; X_prev_nogap supplies the value entering at the top */
#ifdef HAVE_AVX2
	T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_r,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
	X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_r,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
	H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_SHORT);
#else
	T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_SHIFT);
	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
#endif
	H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
	X_prev_nogap = T1;

	/* Add pairscores, allowing for alternate genomic nt */
#ifdef HAVE_AVX2
	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
	pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
	H_nogap_r = _mm256_adds_epi16(H_nogap_r, _mm256_max_epi16(pairscores_std,pairscores_alt));
#else
	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
	H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
#endif
	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
	debug15(print_vector_16(H_nogap_r,rlo,c,"H"));

	dir_horiz = _MM_CMPGT_EPI16(E_r_gap,H_nogap_r); /* E > H, for jump early */
#ifdef HAVE_AVX2
	_mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
#else
	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
#endif
	debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));

	H_nogap_r = _MM_MAX_EPI16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
#ifdef HAVE_AVX2
	_mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
#else
	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
#endif


	/* F loop */
	/* Clip the rows of this block to the band around column c */
	if ((rlo_calc = rlo) < c - uband) {
	  rlo_calc = c - uband;
	}
	if ((rhigh_calc = rhigh) >= c + lband) {
	  rhigh_calc = c + lband;
	  if (c > 0) {
	    /* Set bottom values to DIAG (not HORIZ) to prevent going outside of lband */
	    pairscore = pairscores[na2][rhigh_calc];
	    if ((pairscore0 = pairscores[(int) na2_alt][rhigh_calc]) > pairscore) {
	      pairscore = pairscore0;
	    }
	    /* No need to fix for non-SSE4.1: pairscore -= 128; */
	    if ((score = (int) matrix[c-1][rhigh_calc-1] + (int) pairscore) < NEG_INFINITY_16) {
	      score_column[rhigh_calc] = NEG_INFINITY_16; /* Saturation */
	    } else if (score > POS_INFINITY_16) {
	      score_column[rhigh_calc] = POS_INFINITY_16; /* Saturation */
	    } else {
	      score_column[rhigh_calc] = (Score16_T) score;
	    }
	    (*directions_Egap)[c][rhigh_calc] = DIAG;
	    (*directions_nogap)[c][rhigh_calc] = DIAG;
	  }
	}

	debug3(printf("F loop: rlo %d, rhigh %d, c %d, lband %d, uband %d => rlo_calc %d, rhigh_calc %d\n",
		      rlo,rhigh,c,lband,uband,rlo_calc,rhigh_calc));

	if (rlo == 0) {
	  c_gap = NEG_INFINITY_INT;
	  last_nogap = NEG_INFINITY_INT;
	} else if (c >= rlo + uband) {
	  c_gap = NEG_INFINITY_INT;
	  last_nogap = NEG_INFINITY_INT;
	} else {
	  /* Continue the vertical gap from the row block above */
	  c_gap = FF[c];
	  last_nogap = (int) score_column[rlo_calc-1];
	}

	if ((r = rlo_calc) == c - uband) {
	  /* Handle top value as a special case to prevent going outside of uband */
	  /* FGAP */
	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
			r,c,c_gap + extend,last_nogap + open + extend));
	  score = last_nogap + open /* + extend */;
	  c_gap = score + extend;
	  /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */

	  /* NOGAP */
	  last_nogap = (int) score_column[r];
	  r++;
	}

	/* score_ptr = &(score_column[rlo_calc]); -- Also possible, but less transparent */
	for ( ; r <= rhigh_calc; r++) {
	  /* FGAP */
	  debug3(printf("Fgap at r %d, c %d: c_gap + extend %d vs last_nogap + open + extend %d\n",
			r,c,c_gap + extend,last_nogap + open + extend));
	  if (c_gap /* + extend */ > (score = last_nogap + open /* + extend */)) {  /* Use > for jump early */
	    c_gap += extend;
	    (*directions_Fgap)[c][r] = VERT;
	  } else {
	    c_gap = score + extend;
	    /* (*directions_Fgap)[c][r] = DIAG: -- Already initialized to DIAG */
	  }

	  /* NOGAP */
	  last_nogap = (int) score_column[r];
	  debug3(printf("assign nogap at r %d, c %d: H/E %d vs vert + extend %d\n",r,c,last_nogap,c_gap));
	  if (c_gap > last_nogap) {  /* Use > for jump early */
	    last_nogap = c_gap;
	    score_column[r] = (c_gap < NEG_INFINITY_16) ? NEG_INFINITY_16 : (Score16_T) c_gap; /* Saturation */
	    (*directions_nogap)[c][r] = VERT;
	  }
	}

	FF[c] = c_gap;
	debug3(printf("At c %d, storing c_gap %d\n",c,FF[c]));
#ifdef HAVE_AVX2
	H_nogap_r = _mm256_load_si256((__m256i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
#else
	H_nogap_r = _mm_load_si128((__m128i *) &(score_column[rlo])); /* Need to reload because of changes by F loop */
#endif
      }
    }
  }


#ifdef CHECK1
  /* Row 0 and column 0 directions fail anyway due to saturation */
  /* Handle (0,1) and (1,0) directions, otherwise DIAG */
  (*directions_Egap)[1][0] = HORIZ;
  (*directions_Fgap)[0][1] = VERT;
#endif


#ifdef DEBUG2
  printf("SIMD: Dynprog_simd_16\n");
  Matrix16_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
		 revp,lband,uband);
  Directions16_print(*directions_nogap,*directions_Egap,*directions_Fgap,
			     rlength,glength,rsequence,gsequence,gsequence_alt,
			     revp,lband,uband);
#endif

#ifdef CHECK1
  /* Check for row 0 directions */
  for (c = 1; c <= uband && c <= glength; c++) {
    assert((*directions_Egap)[c][0] != DIAG);
    assert((*directions_nogap)[c][0] != DIAG);
  }
  /* Check for column 0 directions */
  for (r = 1; r <= lband && r <= rlength; r++) {
    assert((*directions_Fgap)[0][r] != DIAG);
    assert((*directions_nogap)[0][r] != DIAG);
  }
#endif

  /* Debug builds recompute the matrices with a reference implementation for comparison below */
#ifdef DEBUG_AVX2
  matrix_std = Dynprog_simd_16_nonavx2(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
				       this,rsequence,gsequence,gsequence_alt,
				       rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
				       open,extend,lband,uband,jump_late_p,revp);
#elif defined(DEBUG_SIMD)
  matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
				this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
				rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
				open,extend,lband,uband,jump_late_p,revp,/*saturation*/NEG_INFINITY_16,
				/*upperp*/true,/*lowerp*/true);
#endif

#ifdef DEBUG2
  printf("Banded\n");
  Matrix16_print(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
		 revp,lband,uband);
  Directions16_print(*directions_nogap,*directions_Egap,*directions_Fgap,
		     rlength,glength,rsequence,gsequence,gsequence_alt,revp,lband,uband);
#endif

#if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
  banded_matrix16_compare(matrix,matrix_std,rlength,glength,lband,uband,
			  rsequence,gsequence,gsequence_alt,
			  goffset,chroffset,chrhigh,watsonp,revp);

  banded_directions16_compare_nogap(*directions_nogap,directions_nogap_std,rlength,glength,lband,uband);
  banded_directions16_compare_Egap(*directions_Egap,directions_Egap_std,rlength,glength,lband,uband);
  banded_directions16_compare_Fgap(*directions_Fgap,directions_Fgap_std,rlength,glength,lband,uband);
#endif

  /* Release the temporaries */
  FREEA(FF);
  _mm_free(pairscores_col0);
  _mm_free(pairscores[4]);
  _mm_free(pairscores[3]);
  _mm_free(pairscores[2]);
  _mm_free(pairscores[1]);
  _mm_free(pairscores[0]);

  return matrix;
}
#endif
7178 
7179 
7180 #ifdef DEBUG_AVX2
7181 /* Designed for computation above the diagonal, so no F loop or bottom masking needed */
7182 /* Operates by columns */
7183 Score16_T **
Dynprog_simd_16_upper_nonavx2(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int uband,bool jump_late_p,bool revp)7184 Dynprog_simd_16_upper_nonavx2 (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
7185 			       T this, char *rsequence, char *gsequence, char *gsequence_alt,
7186 			       int rlength, int glength,
7187 			       int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
7188 			       Mismatchtype_T mismatchtype, int open, int extend,
7189 			       int uband, bool jump_late_p, bool revp) {
7190   Score16_T **matrix, *score_column;
7191   __m128i pairscores_std, pairscores_alt;
7192   __m128i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, E_infinity, T1;
7193   __m128i gap_open, gap_extend, complement_dummy;
7194   __m128i dir_horiz;
7195   int rlength_ceil, r, c;
7196   int rlo, rhigh;
7197   int na1, na2, na2_alt;
7198   Score16_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr;
7199   Pairdistance_T **pairdistance_array_type;
7200 
7201 
7202   debug2(printf("Dynprog_simd_16_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
7203   debug15(printf("Dynprog_simd_16_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
7204 
7205   rlength_ceil = (int) ((rlength + SIMD_NSHORTS_NONAVX2)/SIMD_NSHORTS_NONAVX2) * SIMD_NSHORTS_NONAVX2;
7206   pairdistance_array_type = pairdistance_array[mismatchtype];
7207 
7208   debug(printf("compute_scores_simd_16_bycols (upper): "));
7209   debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
7210   debug(printf("Query length rounded up to %d\n",rlength_ceil));
7211 
7212   matrix = aligned_score16_alloc(rlength_ceil,glength,
7213 				 this->aligned_std.two.upper_matrix_ptrs,this->aligned_std.two.upper_matrix_space);
7214   *directions_nogap = aligned_directions16_alloc(rlength_ceil,glength,
7215 						 this->aligned_std.two.upper_directions_ptrs_0,this->aligned_std.two.upper_directions_space_0);
7216   *directions_Egap = aligned_directions16_alloc(rlength_ceil,glength,
7217 						this->aligned_std.two.upper_directions_ptrs_1,this->aligned_std.two.upper_directions_space_1);
7218 
7219 #if 0
7220   /* Row 0 initialization */
7221   /* penalty = open; */
7222   for (c = 1; c <= uband && c <= glength; c++) {
7223     /* penalty += extend; */
7224     (*directions_Egap)[c][0] = HORIZ;
7225     (*directions_nogap)[c][0] = HORIZ;
7226   }
7227 #endif
7228 #if 0
7229   /* Already initialized to DIAG.  Actually, no longer initializing directions_Egap */
7230   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
7231   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
7232 #endif
7233 #if 0
7234   /* Column 0 initialization */
7235   /* penalty = open; */
7236   for (r = 1; r <= SIMD_NSHORTS_NONAVX2 && r <= rlength; r++) {
7237     /* penalty += extend; */
7238     (*directions_nogap)[0][r] = VERT;
7239   }
7240 #endif
7241 
7242 
7243   /* Load pairscores.  Store match - mismatch */
7244   pairscores[0] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
7245   pairscores[1] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
7246   pairscores[2] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
7247   pairscores[3] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
7248   pairscores[4] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),16);
7249 
7250 #if 0
7251   /* Should not be necessary */
7252   memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score16_T));
7253   memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score16_T));
7254   memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score16_T));
7255   memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score16_T));
7256   memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score16_T));
7257 #endif
7258 
7259   r = 0; na1 = 'N';
7260   pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
7261   pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
7262   pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
7263   pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
7264   pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
7265 
7266   if (revp == false) {
7267     for (r = 1; r <= rlength; r++) {
7268       na1 = (int) rsequence[r-1];
7269       pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
7270       pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
7271       pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
7272       pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
7273       pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
7274     }
7275   } else {
7276     for (r = 1; r <= rlength; r++) {
7277       na1 = (int) rsequence[1-r];
7278       pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
7279       pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
7280       pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
7281       pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
7282       pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
7283     }
7284   }
7285 
7286 #if 0
7287   /* Should not be necessary */
7288   memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7289   memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7290   memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7291   memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7292   memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7293 #endif
7294 
7295   complement_dummy = _mm_set1_epi16(-1);
7296 
7297   gap_open = _mm_set1_epi16((Score16_T) open);
7298   gap_extend = _mm_set1_epi16((Score16_T) extend);
7299 
7300   E_infinity = _mm_set1_epi16(POS_INFINITY_16);
7301   if (jump_late_p) {
7302     for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS_NONAVX2) {
7303       if ((rhigh = rlo + SIMD_NSHORTS_NONAVX2 - 1) > rlength) {
7304 	rhigh = rlength;
7305       }
7306 
7307       /* dir_horiz tests if E >= H.  To fill in first column of each
7308 	 row block with non-diags, could make E == H.  But irrelevant,
7309 	 because these are above the diagonal. */
7310       E_mask = _mm_set1_epi16(1);
7311 
7312       E_r_gap = _mm_set1_epi16(NEG_INFINITY_16);
7313       H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
7314 
7315       for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
7316 	score_column = matrix[c];
7317 
7318 	if (c == 0) {
7319 	  na2 = na2_alt = 4; /* 'N' */
7320 	} else {
7321 	  na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
7322 	  na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
7323 	}
7324 	pairscores_std_ptr = pairscores[na2];
7325 	pairscores_alt_ptr = pairscores[na2_alt];
7326 
7327 	if (c == 0) {
7328 	  X_prev_nogap = _mm_set1_epi16(0);
7329 	} else if (rlo == 0) {
7330 #ifdef ZERO_INITIAL_GAP_PENALTY
7331 	  X_prev_nogap = _mm_set1_epi16(0);
7332 #else
7333 	  X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
7334 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
7335 #endif
7336 	} else {
7337 	  /* second or greater block of 16 */
7338 	  X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
7339 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
7340 	}
7341 
7342 	debug15(print_vector_16(E_mask,rlo,c,"E_mask"));
7343 	E_r_gap = _mm_min_epi16(E_r_gap,_mm_add_epi16(E_mask,E_infinity));
7344 	debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
7345 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
7346 
7347 	/* EGAP */
7348 	T1 = _mm_adds_epi16(H_nogap_r, gap_open);
7349 	dir_horiz = _mm_cmplt_epi16(E_r_gap,T1); /* E < H */
7350 	dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy);	/* E >= H, for jump late */
7351 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7352 	debug15(print_vector_16(T1,rlo,c,"T1"));
7353 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
7354 
7355 	E_r_gap = _mm_max_epi16(E_r_gap, T1); /* Compare H + open with vert */
7356 	E_r_gap = _mm_adds_epi16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
7357 	E_r_gap = _mm_min_epi16(E_r_gap,_mm_add_epi16(E_mask,E_infinity));
7358 	debug15(print_vector_16(E_r_gap,rlo,c,"E"));
7359 
7360 
7361 	/* NOGAP */
7362 	T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_NONAVX2);
7363 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
7364 	H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
7365 	X_prev_nogap = T1;
7366 
7367 	/* Add pairscores, allowing for alternate genomic nt */
7368 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
7369 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
7370 	H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
7371 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
7372 	debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
7373 
7374 	dir_horiz = _mm_cmplt_epi16(E_r_gap,H_nogap_r); /* E < H */
7375 	dir_horiz = _mm_andnot_si128(dir_horiz,complement_dummy);	/* E >= H, for jump late */
7376 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7377 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
7378 
7379 	H_nogap_r = _mm_max_epi16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
7380 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
7381 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
7382 
7383 
7384 	/* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
7385 	if (rhigh >= c) {
7386 	  (*directions_Egap)[c][c] = DIAG;
7387 	  (*directions_nogap)[c][c] = DIAG;
7388 	}
7389 
7390 	/* No need for F loop here */
7391 	E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
7392       }
7393     }
7394 
7395   } else {
7396     /* jump early */
7397     for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS_NONAVX2) {
7398       if ((rhigh = rlo + SIMD_NSHORTS_NONAVX2 - 1) > rlength) {
7399 	rhigh = rlength;
7400       }
7401 
7402       /* dir_horiz tests if E > H.  To fill in first column of each
7403 	 row block with non-diags, could make E > H.  But irrelevant,
7404 	 because these are above the diagonal. */
7405       E_mask = _mm_set1_epi16(1);
7406 
7407       E_r_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
7408       H_nogap_r = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
7409 
7410       for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
7411 	score_column = matrix[c];
7412 
7413 	if (c == 0) {
7414 	  na2 = na2_alt = 4; /* 'N' */
7415 	} else {
7416 	  na2 = revp ? nt_to_int_array[gsequence[1-c]] : nt_to_int_array[gsequence[c-1]];
7417 	  na2_alt = revp ? nt_to_int_array[gsequence_alt[1-c]] : nt_to_int_array[gsequence_alt[c-1]];
7418 	}
7419 	pairscores_std_ptr = pairscores[na2];
7420 	pairscores_alt_ptr = pairscores[na2_alt];
7421 
7422 	if (c == 0) {
7423 	  X_prev_nogap = _mm_set1_epi16(0);
7424 	} else if (rlo == 0) {
7425 #ifdef ZERO_INITIAL_GAP_PENALTY
7426 	  X_prev_nogap = _mm_set1_epi16(0);
7427 #else
7428 	  X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
7429 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
7430 #endif
7431 	} else {
7432 	  /* second or greater block of 16 */
7433 	  X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
7434 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
7435 	}
7436 
7437 	debug15(print_vector_16(E_mask,rlo,c,"E_mask"));
7438 	E_r_gap = _mm_min_epi16(E_r_gap,_mm_add_epi16(E_mask,E_infinity));
7439 	debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
7440 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
7441 
7442 	/* EGAP */
7443 	T1 = _mm_adds_epi16(H_nogap_r, gap_open);
7444 	dir_horiz = _mm_cmpgt_epi16(E_r_gap,T1); /* E > H, for jump early */
7445 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7446 	debug15(print_vector_16(T1,rlo,c,"T1"));
7447 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
7448 
7449 	E_r_gap = _mm_max_epi16(E_r_gap, T1); /* Compare H + open with vert */
7450 	E_r_gap = _mm_adds_epi16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
7451 	E_r_gap = _mm_min_epi16(E_r_gap,_mm_add_epi16(E_mask,E_infinity));
7452 	debug15(print_vector_16(E_r_gap,rlo,c,"E"));
7453 
7454 
7455 	/* NOGAP */
7456 	T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_NONAVX2);
7457 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
7458 	H_nogap_r = _mm_or_si128(H_nogap_r, X_prev_nogap);
7459 	X_prev_nogap = T1;
7460 
7461 	/* Add pairscores, allowing for alternate genomic nt */
7462 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
7463 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
7464 	H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
7465 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
7466 	debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
7467 
7468 	dir_horiz = _mm_cmpgt_epi16(E_r_gap,H_nogap_r); /* E > H, for jump early */
7469 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7470 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
7471 
7472 	H_nogap_r = _mm_max_epi16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
7473 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
7474 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
7475 
7476 
7477 	/* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
7478 	if (rhigh >= c) {
7479 	  (*directions_Egap)[c][c] = DIAG;
7480 	  (*directions_nogap)[c][c] = DIAG;
7481 	}
7482 
7483 	/* No need for F loop here */
7484 	E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
7485       }
7486     }
7487   }
7488 
7489 #ifdef CHECK1
7490   /* Row 0 and column 0 directions fail anyway due to saturation */
7491   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
7492   (*directions_Egap)[1][0] = HORIZ;
7493 #endif
7494 
7495 #ifdef DEBUG2
7496   printf("SIMD: Dynprog_simd_16_upper\n");
7497   Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
7498 		    revp,uband,/*upperp*/true);
7499   Directions16_print_ud(*directions_nogap,*directions_Egap,
7500 			rlength,glength,rsequence,gsequence,gsequence_alt,
7501 			revp,uband,/*upperp*/true);
7502 #endif
7503 
7504 #ifdef CHECK1
7505   /* Check for row 0 directions */
7506   for (c = 1; c <= uband && c <= glength; c++) {
7507     assert((*directions_Egap)[c][0] != DIAG);
7508     assert((*directions_nogap)[c][0] != DIAG);
7509   }
7510 #endif
7511 
7512   _mm_free(pairscores[4]);
7513   _mm_free(pairscores[3]);
7514   _mm_free(pairscores[2]);
7515   _mm_free(pairscores[1]);
7516   _mm_free(pairscores[0]);
7517 
7518   return matrix;
7519 }
7520 #endif
7521 
7522 
7523 #ifdef HAVE_SSE2
7524 /* Designed for computation above the diagonal, so no F loop or bottom masking needed */
7525 /* Operates by columns */
7526 Score16_T **
Dynprog_simd_16_upper(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int uband,bool jump_late_p,bool revp)7527 Dynprog_simd_16_upper (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
7528 		       T this, char *rsequence, char *gsequence, char *gsequence_alt,
7529 		       int rlength, int glength,
7530 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
7531 		       int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
7532 #endif
7533 		       Mismatchtype_T mismatchtype, int open, int extend,
7534 		       int uband, bool jump_late_p, bool revp) {
7535   Score16_T **matrix, *score_column;
7536 #ifdef HAVE_AVX2
7537   __m256i pairscores_std, pairscores_alt;
7538   __m256i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, E_infinity, T1;
7539   __m256i gap_open, gap_extend, complement_dummy;
7540   __m256i dir_horiz;
7541   Score16_T save;
7542 #else
7543   __m128i pairscores_std, pairscores_alt;
7544   __m128i H_nogap_r, X_prev_nogap, E_r_gap, E_mask, E_infinity, T1;
7545   __m128i gap_open, gap_extend, complement_dummy;
7546   __m128i dir_horiz;
7547 #endif
7548   int rlength_ceil, r, c;
7549   int rlo, rhigh;
7550   int na1, na2, na2_alt;
7551   Score16_T *pairscores[5], *pairscores_std_ptr, *pairscores_alt_ptr;
7552   Pairdistance_T **pairdistance_array_type;
7553 
7554 #ifdef DEBUG_AVX2
7555   Score16_T **matrix_std;
7556   Direction16_T **directions_nogap_std, **directions_Egap_std;
7557   char na2_single;
7558 #elif defined(DEBUG_SIMD)
7559   Score32_T **matrix_std;
7560   Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
7561   char na2_single;
7562 #endif
7563 
7564 
7565   debug2(printf("Dynprog_simd_16_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
7566   debug15(printf("Dynprog_simd_16_upper.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
7567 
7568   rlength_ceil = (int) ((rlength + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
7569   pairdistance_array_type = pairdistance_array[mismatchtype];
7570 
7571   debug(printf("compute_scores_simd_16_bycols (upper): "));
7572   debug(printf("Lengths are %d and %d, so band is %d on right\n",rlength,glength,uband));
7573   debug(printf("Query length rounded up to %d\n",rlength_ceil));
7574 
7575   matrix = aligned_score16_alloc(rlength_ceil,glength,
7576 				 this->aligned.two.upper_matrix_ptrs,this->aligned.two.upper_matrix_space);
7577   *directions_nogap = aligned_directions16_alloc(rlength_ceil,glength,
7578 						 this->aligned.two.upper_directions_ptrs_0,this->aligned.two.upper_directions_space_0);
7579   *directions_Egap = aligned_directions16_alloc(rlength_ceil,glength,
7580 						this->aligned.two.upper_directions_ptrs_1,this->aligned.two.upper_directions_space_1);
7581 
7582 #if 0
7583   /* Row 0 initialization */
7584   /* penalty = open; */
7585   for (c = 1; c <= uband && c <= glength; c++) {
7586     /* penalty += extend; */
7587     (*directions_Egap)[c][0] = HORIZ;
7588     (*directions_nogap)[c][0] = HORIZ;
7589   }
7590 #endif
7591 #if 0
7592   /* Already initialized to DIAG.  Actually, no longer initializing directions_Egap */
7593   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
7594   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
7595 #endif
7596 #if 0
7597   /* Column 0 initialization */
7598   /* penalty = open; */
7599   for (r = 1; r <= SIMD_NSHORTS && r <= rlength; r++) {
7600     /* penalty += extend; */
7601     (*directions_nogap)[0][r] = VERT;
7602   }
7603 #endif
7604 
7605 
7606   /* Load pairscores.  Store match - mismatch */
7607   pairscores[0] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7608   pairscores[1] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7609   pairscores[2] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7610   pairscores[3] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7611   pairscores[4] = (Score16_T *) _mm_malloc(rlength_ceil * sizeof(Score16_T),ALIGN_SIZE);
7612 
7613 #if 0
7614   /* Should not be necessary */
7615   memset((void *) pairscores[0],0,rlength_ceil*sizeof(Score16_T));
7616   memset((void *) pairscores[1],0,rlength_ceil*sizeof(Score16_T));
7617   memset((void *) pairscores[2],0,rlength_ceil*sizeof(Score16_T));
7618   memset((void *) pairscores[3],0,rlength_ceil*sizeof(Score16_T));
7619   memset((void *) pairscores[4],0,rlength_ceil*sizeof(Score16_T));
7620 #endif
7621 
7622   r = 0; na1 = 'N';
7623   pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
7624   pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
7625   pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
7626   pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
7627   pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
7628 
7629   if (revp == false) {
7630     for (r = 1; r <= rlength; r++) {
7631       na1 = (int) rsequence[r-1];
7632       pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
7633       pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
7634       pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
7635       pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
7636       pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
7637     }
7638   } else {
7639     for (r = 1; r <= rlength; r++) {
7640       na1 = (int) rsequence[1-r];
7641       pairscores[0][r] = (Score16_T) pairdistance_array_type[na1][(int) 'A'];
7642       pairscores[1][r] = (Score16_T) pairdistance_array_type[na1][(int) 'C'];
7643       pairscores[2][r] = (Score16_T) pairdistance_array_type[na1][(int) 'G'];
7644       pairscores[3][r] = (Score16_T) pairdistance_array_type[na1][(int) 'T'];
7645       pairscores[4][r] = (Score16_T) pairdistance_array_type[na1][(int) 'N'];
7646     }
7647   }
7648 
7649 #if 0
7650   /* Should not be necessary */
7651   memset((void *) &(pairscores[0][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7652   memset((void *) &(pairscores[1][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7653   memset((void *) &(pairscores[2][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7654   memset((void *) &(pairscores[3][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7655   memset((void *) &(pairscores[4][r]),0,(rlength_ceil-r)*sizeof(Score16_T));
7656 #endif
7657 
7658   complement_dummy = _MM_SET1_EPI16(-1);
7659 
7660   gap_open = _MM_SET1_EPI16((Score16_T) open);
7661   gap_extend = _MM_SET1_EPI16((Score16_T) extend);
7662 
7663   E_infinity = _MM_SET1_EPI16(POS_INFINITY_16);
7664   if (jump_late_p) {
7665     for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS) {
7666       if ((rhigh = rlo + SIMD_NSHORTS - 1) > rlength) {
7667 	rhigh = rlength;
7668       }
7669 
7670       /* dir_horiz tests if E >= H.  To fill in first column of each
7671 	 row block with non-diags, could make E == H.  But irrelevant,
7672 	 because these are above the diagonal. */
7673       E_mask = _MM_SET1_EPI16(1);
7674 
7675       /* Holds for all INITIAL_GAP_PENALTY */
7676       E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16);
7677       H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
7678 
7679       for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
7680 	score_column = matrix[c];
7681 
7682 	if (c == 0) {
7683 	  na2 = na2_alt = 4; /* 'N' */
7684 	} else {
7685 	  na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
7686 	  na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
7687 	}
7688 	pairscores_std_ptr = pairscores[na2];
7689 	pairscores_alt_ptr = pairscores[na2_alt];
7690 
7691 	if (c == 0) {
7692 	  X_prev_nogap = _MM_SETZERO_SI();
7693 	} else if (rlo == 0) {
7694 #ifdef ZERO_INITIAL_GAP_PENALTY
7695 	  X_prev_nogap = _MM_SETZERO_SI();
7696 #elif defined(HAVE_AVX2)
7697 	  X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
7698 #else
7699 	  X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
7700 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
7701 #endif
7702 	} else {
7703 	  /* second or greater block of 16 */
7704 #ifdef ZERO_INITIAL_GAP_PENALTY
7705 	  X_prev_nogap = _MM_SETZERO_SI();
7706 #elif defined(HAVE_AVX2)
7707 	  X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_SHORT_INSERT);
7708 #else
7709 	  X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
7710 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
7711 #endif
7712 	}
7713 
7714 	debug15(print_vector_16(E_mask,rlo,c,"E_mask"));
7715 	E_r_gap = _MM_MIN_EPI16(E_r_gap,_MM_ADD_EPI16(E_mask,E_infinity));
7716 	debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
7717 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
7718 
7719 	/* EGAP */
7720 	T1 = _MM_ADDS_EPI16(H_nogap_r, gap_open);
7721 	dir_horiz = _MM_CMPLT_EPI16(E_r_gap,T1); /* E < H */
7722 	dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy);	/* E >= H, for jump late */
7723 #ifdef HAVE_AVX2
7724 	_mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7725 #else
7726 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7727 #endif
7728 	debug15(print_vector_16(T1,rlo,c,"T1"));
7729 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
7730 
7731 	E_r_gap = _MM_MAX_EPI16(E_r_gap, T1); /* Compare H + open with vert */
7732 	E_r_gap = _MM_ADDS_EPI16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
7733 	E_r_gap = _MM_MIN_EPI16(E_r_gap,_MM_ADD_EPI16(E_mask,E_infinity));
7734 	debug15(print_vector_16(E_r_gap,rlo,c,"E"));
7735 
7736 
7737 	/* NOGAP */
7738 #ifdef HAVE_AVX2
7739 	T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_r,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
7740 	X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_r,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
7741 	H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_SHORT);
7742 #else
7743 	T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_SHIFT);
7744 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
7745 #endif
7746 	H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
7747 	X_prev_nogap = T1;
7748 
7749 	/* Add pairscores, allowing for alternate genomic nt */
7750 #ifdef HAVE_AVX2
7751 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
7752 	pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
7753 	H_nogap_r = _mm256_adds_epi16(H_nogap_r, _mm256_max_epi16(pairscores_std,pairscores_alt));
7754 #else
7755 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
7756 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
7757 	H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
7758 #endif
7759 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
7760 	debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
7761 
7762 	dir_horiz = _MM_CMPLT_EPI16(E_r_gap,H_nogap_r); /* E < H */
7763 	dir_horiz = _MM_ANDNOT_SI(dir_horiz,complement_dummy);	/* E >= H, for jump late */
7764 #ifdef HAVE_AVX2
7765 	_mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7766 #else
7767 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7768 #endif
7769 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
7770 
7771 	H_nogap_r = _MM_MAX_EPI16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
7772 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
7773 #ifdef HAVE_AVX2
7774 	_mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
7775 #else
7776 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
7777 #endif
7778 
7779 	/* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
7780 	if (rhigh >= c) {
7781 	  (*directions_Egap)[c][c] = DIAG;
7782 	  (*directions_nogap)[c][c] = DIAG;
7783 	}
7784 
7785 	/* No need for F loop here */
7786 #ifdef HAVE_AVX2
7787 	save = _mm256_extract_epi16(E_mask,7);
7788 	E_mask = _mm256_slli_si256(E_mask,ONE_SHORT);
7789 	E_mask = _mm256_insert_epi16(E_mask,save,8);
7790 #else
7791 	E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
7792 #endif
7793       }
7794     }
7795 
7796   } else {
7797     /* jump early */
7798     for (rlo = 0; rlo <= rlength; rlo += SIMD_NSHORTS) {
7799       if ((rhigh = rlo + SIMD_NSHORTS - 1) > rlength) {
7800 	rhigh = rlength;
7801       }
7802 
7803       /* dir_horiz tests if E > H.  To fill in first column of each
7804 	 row block with non-diags, could make E > H.  But irrelevant,
7805 	 because these are above the diagonal. */
7806       E_mask = _MM_SET1_EPI16(1);
7807 
7808       /* Holds for all INITIAL_GAP_PENALTY */
7809       E_r_gap = _MM_SET1_EPI16(NEG_INFINITY_16+1);
7810       H_nogap_r = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
7811 
7812       for (c = rlo; c <= rhigh + uband && c <= glength; c++) {
7813 	score_column = matrix[c];
7814 
7815 	if (c == 0) {
7816 	  na2 = na2_alt = 4; /* 'N' */
7817 	} else {
7818 	  na2 = revp ? nt_to_int_array[(int) gsequence[1-c]] : nt_to_int_array[(int) gsequence[c-1]];
7819 	  na2_alt = revp ? nt_to_int_array[(int) gsequence_alt[1-c]] : nt_to_int_array[(int) gsequence_alt[c-1]];
7820 	}
7821 	pairscores_std_ptr = pairscores[na2];
7822 	pairscores_alt_ptr = pairscores[na2_alt];
7823 
7824 	if (c == 0) {
7825 	  X_prev_nogap = _MM_SETZERO_SI();
7826 	} else if (rlo == 0) {
7827 #ifdef ZERO_INITIAL_GAP_PENALTY
7828 	  X_prev_nogap = _MM_SETZERO_SI();
7829 #elif defined(HAVE_AVX2)
7830 	  X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
7831 #else
7832 	  X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
7833 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
7834 #endif
7835 	} else {
7836 	  /* second or greater block of 16 */
7837 #ifdef HAVE_AVX2
7838 	  X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[c-1][rlo-1],LAST_SHORT_INSERT);
7839 #else
7840 	  X_prev_nogap = _mm_set1_epi16(matrix[c-1][rlo-1]); /* get H from previous block and previous column */
7841 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
7842 #endif
7843 	}
7844 
7845 	debug15(print_vector_16(E_mask,rlo,c,"E_mask"));
7846 	E_r_gap = _MM_MIN_EPI16(E_r_gap,_MM_ADD_EPI16(E_mask,E_infinity));
7847 	debug15(print_vector_16(E_r_gap,rlo,c,"E_r_gap"));
7848 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r load"));
7849 
7850 	/* EGAP */
7851 	T1 = _MM_ADDS_EPI16(H_nogap_r, gap_open);
7852 	dir_horiz = _MM_CMPGT_EPI16(E_r_gap,T1); /* E > H, for jump early */
7853 #ifdef HAVE_AVX2
7854 	_mm256_store_si256((__m256i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7855 #else
7856 	_mm_store_si128((__m128i *) &((*directions_Egap)[c][rlo]),dir_horiz);
7857 #endif
7858 	debug15(print_vector_16(T1,rlo,c,"T1"));
7859 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_Egap"));
7860 
7861 	E_r_gap = _MM_MAX_EPI16(E_r_gap, T1); /* Compare H + open with vert */
7862 	E_r_gap = _MM_ADDS_EPI16(E_r_gap, gap_extend); /* Compute scores for Egap (vert + open) */
7863 	E_r_gap = _MM_MIN_EPI16(E_r_gap,_MM_ADD_EPI16(E_mask,E_infinity));
7864 	debug15(print_vector_16(E_r_gap,rlo,c,"E"));
7865 
7866 
7867 	/* NOGAP */
7868 #ifdef HAVE_AVX2
7869 	T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_r,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
7870 	X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_r,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
7871 	H_nogap_r = _mm256_slli_si256(H_nogap_r,ONE_SHORT);
7872 #else
7873 	T1 = _mm_srli_si128(H_nogap_r,LAST_SHORT_SHIFT);
7874 	H_nogap_r = _mm_slli_si128(H_nogap_r,ONE_SHORT);
7875 #endif
7876 	H_nogap_r = _MM_OR_SI(H_nogap_r, X_prev_nogap);
7877 	X_prev_nogap = T1;
7878 
7879 	/* Add pairscores, allowing for alternate genomic nt */
7880 #ifdef HAVE_AVX2
7881 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_std_ptr[rlo]));
7882 	pairscores_alt = _mm256_load_si256((__m256i *) &(pairscores_alt_ptr[rlo]));
7883 	H_nogap_r = _mm256_adds_epi16(H_nogap_r, _mm256_max_epi16(pairscores_std,pairscores_alt));
7884 #else
7885 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_std_ptr[rlo]));
7886 	pairscores_alt = _mm_load_si128((__m128i *) &(pairscores_alt_ptr[rlo]));
7887 	H_nogap_r = _mm_adds_epi16(H_nogap_r, _mm_max_epi16(pairscores_std,pairscores_alt));
7888 #endif
7889 	_mm_clflush(&H_nogap_r); /* Needed for opencc -O3 on AMD */
7890 	debug15(print_vector_16(H_nogap_r,rlo,c,"H"));
7891 
7892 	dir_horiz = _MM_CMPGT_EPI16(E_r_gap,H_nogap_r); /* E > H, for jump early */
7893 #ifdef HAVE_AVX2
7894 	_mm256_store_si256((__m256i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7895 #else
7896 	_mm_store_si128((__m128i *) &((*directions_nogap)[c][rlo]),dir_horiz);
7897 #endif
7898 	debug15(print_vector_16(dir_horiz,rlo,c,"dir_nogap"));
7899 
7900 	H_nogap_r = _MM_MAX_EPI16(H_nogap_r, E_r_gap); /* Compare H + pairscores with horiz + extend */
7901 	debug15(print_vector_16(H_nogap_r,rlo,c,"H_nogap_r store"));
7902 #ifdef HAVE_AVX2
7903 	_mm256_store_si256((__m256i *) &(score_column[rlo]), H_nogap_r);
7904 #else
7905 	_mm_store_si128((__m128i *) &(score_column[rlo]), H_nogap_r);
7906 #endif
7907 
7908 	/* Fix gaps along diagonal to prevent going into lower triangle, which can happen with ties between E and H */
7909 	if (rhigh >= c) {
7910 	  (*directions_Egap)[c][c] = DIAG;
7911 	  (*directions_nogap)[c][c] = DIAG;
7912 	}
7913 
7914 	/* No need for F loop here */
7915 #ifdef HAVE_AVX2
7916 	save = _mm256_extract_epi16(E_mask,7);
7917 	E_mask = _mm256_slli_si256(E_mask,ONE_SHORT);
7918 	E_mask = _mm256_insert_epi16(E_mask,save,8);
7919 #else
7920 	E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
7921 #endif
7922       }
7923     }
7924   }
7925 
7926 #ifdef CHECK1
7927   /* Row 0 and column 0 directions fail anyway due to saturation */
7928   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
7929   (*directions_Egap)[1][0] = HORIZ;
7930 #endif
7931 
7932 #ifdef DEBUG2
7933   printf("SIMD: Dynprog_simd_16_upper\n");
7934   Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
7935 		    revp,uband,/*upperp*/true);
7936   Directions16_print_ud(*directions_nogap,*directions_Egap,
7937 			rlength,glength,rsequence,gsequence,gsequence_alt,
7938 			revp,uband,/*upperp*/true);
7939 #endif
7940 
7941 #ifdef CHECK1
7942   /* Check for row 0 directions */
7943   for (c = 1; c <= uband && c <= glength; c++) {
7944     assert((*directions_Egap)[c][0] != DIAG);
7945     assert((*directions_nogap)[c][0] != DIAG);
7946   }
7947 #endif
7948 
7949 #ifdef DEBUG_AVX2
7950   matrix_std = Dynprog_simd_16_upper_nonavx2(&directions_nogap_std,&directions_Egap_std,
7951 					     this,rsequence,gsequence,gsequence_alt,
7952 					     rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
7953 					     open,extend,uband,jump_late_p,revp);
7954 #elif defined(DEBUG_SIMD)
7955   matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
7956 				this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
7957 				rlength,glength,
7958 				goffset,chroffset,chrhigh,watsonp,mismatchtype,
7959 				open,extend,/*lband*/0,uband,jump_late_p,revp,/*saturation*/NEG_INFINITY_16,
7960 				/*upperp*/true,/*lowerp*/false);
7961 #endif
7962 
7963 #ifdef DEBUG2
7964   printf("Banded\n");
7965   Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
7966                     revp,uband,/*upperp*/true);
7967   Directions16_print_ud(*directions_nogap,*directions_Egap,
7968                          rlength,glength,rsequence,gsequence,gsequence_alt,revp,uband,/*upperp*/true);
7969 #endif
7970 
7971 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
7972   banded_matrix16_compare_upper(matrix,matrix_std,rlength,glength,uband,
7973 				rsequence,gsequence,gsequence_alt,
7974 				goffset,chroffset,chrhigh,watsonp,revp);
7975 
7976   banded_directions16_compare_nogap_upper(*directions_nogap,directions_nogap_std,rlength,glength,uband);
7977 
7978   banded_directions16_compare_Egap_upper(*directions_Egap,directions_Egap_std,rlength,glength,uband);
7979 #endif
7980 
7981   _mm_free(pairscores[4]);
7982   _mm_free(pairscores[3]);
7983   _mm_free(pairscores[2]);
7984   _mm_free(pairscores[1]);
7985   _mm_free(pairscores[0]);
7986 
7987   return matrix;
7988 }
7989 #endif
7990 
7991 
7992 #ifdef DEBUG_AVX2
7993 /* Designed for computation below the diagonal, so no F loop or bottom masking needed */
7994 /* Operates by rows */
7995 Score16_T **
Dynprog_simd_16_lower_nonavx2(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,bool jump_late_p,bool revp)7996 Dynprog_simd_16_lower_nonavx2 (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
7997 			       T this, char *rsequence, char *gsequence, char *gsequence_alt,
7998 			       int rlength, int glength,
7999 			       int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
8000 			       Mismatchtype_T mismatchtype, int open, int extend,
8001 			       int lband, bool jump_late_p, bool revp) {
8002   Score16_T **matrix, *score_column;
8003   __m128i pairscores_std;
8004   __m128i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, E_infinity, T1;
8005   __m128i gap_open, gap_extend, complement_dummy;
8006   __m128i dir_vert;
8007   int glength_ceil, r, c;
8008   int clo, chigh;
8009   int na1, na2, na2_alt;
8010   Score16_T *pairscores[5], *pairscores_ptr;
8011   Pairdistance_T **pairdistance_array_type, score1, score2;
8012 
8013   debug2(printf("Dynprog_simd_16_lower.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
8014   debug15(printf("Dynprog_simd_16_lower.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
8015 
8016   glength_ceil = (int) ((glength + SIMD_NSHORTS_NONAVX2)/SIMD_NSHORTS_NONAVX2) * SIMD_NSHORTS_NONAVX2;
8017   pairdistance_array_type = pairdistance_array[mismatchtype];
8018 
8019   debug(printf("compute_scores_simd_16_byrows (lower): "));
8020   debug(printf("Lengths are %d and %d, so band is %d on left\n",rlength,glength,lband));
8021   debug(printf("Genome length rounded up to %d\n",glength_ceil));
8022 
8023   matrix = aligned_score16_alloc(glength_ceil,rlength,
8024 				 this->aligned_std.two.lower_matrix_ptrs,this->aligned_std.two.lower_matrix_space);
8025   *directions_nogap = aligned_directions16_alloc(glength_ceil,rlength,
8026 						 this->aligned_std.two.lower_directions_ptrs_0,this->aligned_std.two.lower_directions_space_0);
8027   *directions_Egap = aligned_directions16_alloc(glength_ceil,rlength,
8028 						this->aligned_std.two.lower_directions_ptrs_1,this->aligned_std.two.lower_directions_space_1);
8029 
8030 #if 0
8031   /* Column 0 initialization */
8032   /* penalty = open; */
8033   for (r = 1; r <= lband && r <= rlength; r++) {
8034     /* penalty += extend; */
8035     (*directions_Egap)[r][0] = VERT;
8036     (*directions_nogap)[r][0] = VERT;
8037   }
8038 #endif
8039 #if 0
8040   /* Already initialized to DIAG.  Actually, no longer initializing directions_Egap */
8041   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
8042   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
8043 #endif
8044 #if 0
8045   /* Row 0 initialization */
8046   /* penalty = open; */
8047   for (c = 1; c <= SIMD_NSHORTS_NONAVX2 && c <= glength; c++) {
8048     /* penalty += extend; */
8049     (*directions_nogap)[0][c] = HORIZ;
8050   }
8051 #endif
8052 
8053 
8054   /* Load pairscores.  Store match - mismatch */
8055   pairscores[0] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8056   pairscores[1] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8057   pairscores[2] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8058   pairscores[3] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8059   pairscores[4] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),16);
8060 
8061 #if 0
8062   /* Should not be necessary */
8063   memset((void *) pairscores[0],0,glength_ceil*sizeof(Score16_T));
8064   memset((void *) pairscores[1],0,glength_ceil*sizeof(Score16_T));
8065   memset((void *) pairscores[2],0,glength_ceil*sizeof(Score16_T));
8066   memset((void *) pairscores[3],0,glength_ceil*sizeof(Score16_T));
8067   memset((void *) pairscores[4],0,glength_ceil*sizeof(Score16_T));
8068 #endif
8069 
8070   c = 0; na2 = na2_alt = 'N';
8071   pairscores[0][c] = (Score16_T) pairdistance_array_type[(int) 'A'][na2];
8072   pairscores[1][c] = (Score16_T) pairdistance_array_type[(int) 'C'][na2];
8073   pairscores[2][c] = (Score16_T) pairdistance_array_type[(int) 'G'][na2];
8074   pairscores[3][c] = (Score16_T) pairdistance_array_type[(int) 'T'][na2];
8075   pairscores[4][c] = (Score16_T) pairdistance_array_type[(int) 'N'][na2];
8076 
8077   if (revp == false) {
8078     for (c = 1; c <= glength; c++) {
8079       na2 = gsequence[c-1];
8080       na2_alt = gsequence_alt[c-1];
8081       /* Take max here */
8082       score1 = pairdistance_array_type[(int) 'A'][na2];
8083       score2 = pairdistance_array_type[(int) 'A'][na2_alt];
8084       pairscores[0][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8085 
8086       score1 = pairdistance_array_type[(int) 'C'][na2];
8087       score2 = pairdistance_array_type[(int) 'C'][na2_alt];
8088       pairscores[1][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8089 
8090       score1 = pairdistance_array_type[(int) 'G'][na2];
8091       score2 = pairdistance_array_type[(int) 'G'][na2_alt];
8092       pairscores[2][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8093 
8094       score1 = pairdistance_array_type[(int) 'T'][na2];
8095       score2 = pairdistance_array_type[(int) 'T'][na2_alt];
8096       pairscores[3][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8097 
8098       score1 = pairdistance_array_type[(int) 'N'][na2];
8099       score2 = pairdistance_array_type[(int) 'N'][na2_alt];
8100       pairscores[4][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8101     }
8102   } else {
8103     for (c = 1; c <= glength; c++) {
8104       na2 = gsequence[1-c];
8105       na2_alt = gsequence_alt[1-c];
8106       /* Take max here */
8107       score1 = pairdistance_array_type[(int) 'A'][na2];
8108       score2 = pairdistance_array_type[(int) 'A'][na2_alt];
8109       pairscores[0][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8110 
8111       score1 = pairdistance_array_type[(int) 'C'][na2];
8112       score2 = pairdistance_array_type[(int) 'C'][na2_alt];
8113       pairscores[1][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8114 
8115       score1 = pairdistance_array_type[(int) 'G'][na2];
8116       score2 = pairdistance_array_type[(int) 'G'][na2_alt];
8117       pairscores[2][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8118 
8119       score1 = pairdistance_array_type[(int) 'T'][na2];
8120       score2 = pairdistance_array_type[(int) 'T'][na2_alt];
8121       pairscores[3][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8122 
8123       score1 = pairdistance_array_type[(int) 'N'][na2];
8124       score2 = pairdistance_array_type[(int) 'N'][na2_alt];
8125       pairscores[4][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8126     }
8127   }
8128 
8129 #if 0
8130   /* Should not be necessary */
8131   memset((void *) &(pairscores[0][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8132   memset((void *) &(pairscores[1][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8133   memset((void *) &(pairscores[2][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8134   memset((void *) &(pairscores[3][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8135   memset((void *) &(pairscores[4][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8136 #endif
8137 
8138   complement_dummy = _mm_set1_epi16(-1);
8139 
8140   gap_open = _mm_set1_epi16((Score16_T) open);
8141   gap_extend = _mm_set1_epi16((Score16_T) extend);
8142 
8143   E_infinity = _mm_set1_epi16(POS_INFINITY_16);
8144   if (jump_late_p) {
8145     for (clo = 0; clo <= glength; clo += SIMD_NSHORTS_NONAVX2) {
8146       if ((chigh = clo + SIMD_NSHORTS_NONAVX2 - 1) > glength) {
8147 	chigh = glength;
8148       }
8149 
8150       /* dir_vert tests if E >= H.  To fill in first row of each
8151 	 column block with non-diags, make E == H. */
8152       E_mask = _mm_set1_epi16(1);
8153 
8154       E_c_gap = _mm_set1_epi16(NEG_INFINITY_16);
8155       H_nogap_c = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
8156 
8157       for (r = clo; r <= chigh + lband && r <= rlength; r++) {
8158 	score_column = matrix[r];
8159 
8160 	if (r == 0) {
8161 	  na1 = 4; /* 'N' */
8162 	} else {
8163 	  na1 = revp ? nt_to_int_array[rsequence[1-r]] : nt_to_int_array[rsequence[r-1]];
8164 	}
8165 	pairscores_ptr = pairscores[na1];
8166 
8167 	if (r == 0) {
8168 	  X_prev_nogap = _mm_set1_epi16(0);
8169 	} else if (clo == 0) {
8170 #ifdef ZERO_INITIAL_GAP_PENALTY
8171 	  X_prev_nogap = _mm_set1_epi16(0);
8172 #else
8173 	  X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
8174 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
8175 #endif
8176 	} else {
8177 	  /* second or greater block of 16 */
8178 	  X_prev_nogap = _mm_set1_epi16(matrix[r-1][clo-1]); /* get H from previous block and previous column */
8179 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
8180 	}
8181 
8182 	debug15(print_vector_16(E_mask,clo,r,"E_mask"));
8183 	E_c_gap = _mm_min_epi16(E_c_gap,_mm_add_epi16(E_mask,E_infinity));
8184 	debug15(print_vector_16(E_c_gap,clo,r,"E_c_gap"));
8185 	debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c load"));
8186 
8187 	/* EGAP */
8188 	T1 = _mm_adds_epi16(H_nogap_c, gap_open);
8189 	dir_vert = _mm_cmplt_epi16(E_c_gap,T1); /* E < H */
8190 	dir_vert = _mm_andnot_si128(dir_vert,complement_dummy);	/* E >= H, for jump late */
8191 	_mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
8192 	debug15(print_vector_16(T1,clo,r,"T1"));
8193 	debug15(print_vector_16(dir_vert,clo,r,"dir_Egap"));
8194 
8195 	E_c_gap = _mm_max_epi16(E_c_gap, T1); /* Compare H + open with vert */
8196 	E_c_gap = _mm_adds_epi16(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
8197 	E_c_gap = _mm_min_epi16(E_c_gap,_mm_add_epi16(E_mask,E_infinity));
8198 	debug15(print_vector_16(E_c_gap,clo,r,"E"));
8199 
8200 
8201 	/* NOGAP */
8202 	T1 = _mm_srli_si128(H_nogap_c,LAST_SHORT_NONAVX2);
8203 	H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_SHORT);
8204 	H_nogap_c = _mm_or_si128(H_nogap_c, X_prev_nogap);
8205 	X_prev_nogap = T1;
8206 
8207 	/* Add pairscores.  No alternate chars for query sequence. */
8208 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
8209 	H_nogap_c = _mm_adds_epi16(H_nogap_c, pairscores_std);
8210 	_mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
8211 	debug15(print_vector_16(H_nogap_c,clo,r,"H"));
8212 
8213 	dir_vert = _mm_cmplt_epi16(E_c_gap,H_nogap_c); /* E < H */
8214 	dir_vert = _mm_andnot_si128(dir_vert,complement_dummy);	/* E >= H, for jump late */
8215 	_mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
8216 	debug15(print_vector_16(dir_vert,clo,r,"dir_nogap"));
8217 
8218 	H_nogap_c = _mm_max_epi16(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
8219 	debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c store"));
8220 	_mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
8221 
8222 
8223 	/* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
8224 	if (chigh >= r) {
8225 	  (*directions_Egap)[r][r] = DIAG;
8226 	  (*directions_nogap)[r][r] = DIAG;
8227 	}
8228 
8229 	/* No need for F loop here */
8230 	E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
8231       }
8232     }
8233 
8234   } else {
8235     /* jump early */
8236     for (clo = 0; clo <= glength; clo += SIMD_NSHORTS_NONAVX2) {
8237       if ((chigh = clo + SIMD_NSHORTS_NONAVX2 - 1) > glength) {
8238 	chigh = glength;
8239       }
8240 
8241       /* dir_vert tests if E > H.  To fill in first row of each
8242 	 column block with non-diags, make E > H. */
8243       E_mask = _mm_set1_epi16(1);
8244 
8245       E_c_gap = _mm_set1_epi16(NEG_INFINITY_16+1);
8246       H_nogap_c = _mm_set1_epi16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
8247 
8248       for (r = clo; r <= chigh + lband && r <= rlength; r++) {
8249 	score_column = matrix[r];
8250 
8251 	if (r == 0) {
8252 	  na1 = 4; /* 'N' */
8253 	} else {
8254 	  na1 = revp ? nt_to_int_array[rsequence[1-r]] : nt_to_int_array[rsequence[r-1]];
8255 	}
8256 	pairscores_ptr = pairscores[na1];
8257 
8258 	if (r == 0) {
8259 	  X_prev_nogap = _mm_set1_epi16(0);
8260 	} else if (clo == 0) {
8261 #ifdef ZERO_INITIAL_GAP_PENALTY
8262 	  X_prev_nogap = _mm_set1_epi16(0);
8263 #else
8264 	  X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
8265 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
8266 #endif
8267 	} else {
8268 	  /* second or greater block of 16 */
8269 	  X_prev_nogap = _mm_set1_epi16(matrix[r-1][clo-1]); /* get H from previous block and previous column */
8270 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_NONAVX2);
8271 	}
8272 
8273 	debug15(print_vector_16(E_mask,clo,r,"E_mask"));
8274 	E_c_gap = _mm_min_epi16(E_c_gap,_mm_add_epi16(E_mask,E_infinity));
8275 	debug15(print_vector_16(E_c_gap,clo,r,"E_c_gap"));
8276 	debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c load"));
8277 
8278 	/* EGAP */
8279 	T1 = _mm_adds_epi16(H_nogap_c, gap_open);
8280 	dir_vert = _mm_cmpgt_epi16(E_c_gap,T1); /* E > H, for jump early */
8281 	_mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
8282 	debug15(print_vector_16(T1,clo,r,"T1"));
8283 	debug15(print_vector_16(dir_vert,clo,r,"dir_Egap"));
8284 
8285 	E_c_gap = _mm_max_epi16(E_c_gap, T1); /* Compare H + open with vert */
8286 	E_c_gap = _mm_adds_epi16(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
8287 	E_c_gap = _mm_min_epi16(E_c_gap,_mm_add_epi16(E_mask,E_infinity));
8288 	debug15(print_vector_16(E_c_gap,clo,r,"E"));
8289 
8290 
8291 	/* NOGAP */
8292 	T1 = _mm_srli_si128(H_nogap_c,LAST_SHORT_NONAVX2);
8293 	H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_SHORT);
8294 	H_nogap_c = _mm_or_si128(H_nogap_c, X_prev_nogap);
8295 	X_prev_nogap = T1;
8296 
8297 	/* Add pairscores.  No alternate chars for query sequence */
8298 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
8299 	H_nogap_c = _mm_adds_epi16(H_nogap_c, pairscores_std);
8300 	_mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
8301 	debug15(print_vector_16(H_nogap_c,clo,r,"H"));
8302 
8303 	dir_vert = _mm_cmpgt_epi16(E_c_gap,H_nogap_c); /* E > H, for jump early */
8304 	_mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
8305 	debug15(print_vector_16(dir_vert,clo,r,"dir_nogap"));
8306 
8307 	H_nogap_c = _mm_max_epi16(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
8308 	debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c store"));
8309 	_mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
8310 
8311 
8312 	/* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
8313 	if (chigh >= r) {
8314 	  (*directions_Egap)[r][r] = DIAG;
8315 	  (*directions_nogap)[r][r] = DIAG;
8316 	}
8317 
8318 	/* No need for F loop here */
8319 	E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
8320       }
8321     }
8322   }
8323 
8324 
8325 #ifdef CHECK1
8326   /* Row 0 and column 0 directions fail anyway due to saturation */
8327   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
8328   (*directions_Egap)[1][0] = VERT;
8329 #endif
8330 
8331 #ifdef DEBUG2
8332   printf("SIMD: Dynprog_simd_16_lower\n");
8333   Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
8334 		    revp,lband,/*upperp*/false);
8335   Directions16_print_ud(*directions_nogap,*directions_Egap,
8336 			rlength,glength,rsequence,gsequence,gsequence_alt,
8337 			revp,lband,/*upperp*/false);
8338 #endif
8339 
8340 #ifdef CHECK1
8341   /* Check for column 0 directions */
8342   for (r = 1; r <= lband && r <= rlength; r++) {
8343     assert((*directions_Egap)[r][0] != DIAG);
8344     assert((*directions_nogap)[r][0] != DIAG);
8345   }
8346 #endif
8347 
8348   _mm_free(pairscores[4]);
8349   _mm_free(pairscores[3]);
8350   _mm_free(pairscores[2]);
8351   _mm_free(pairscores[1]);
8352   _mm_free(pairscores[0]);
8353 
8354   return matrix;
8355 }
8356 #endif
8357 
8358 
8359 #ifdef HAVE_SSE2
8360 /* Designed for computation below the diagonal, so no F loop or bottom masking needed */
8361 /* Operates by rows */
8362 Score16_T **
Dynprog_simd_16_lower(Direction16_T *** directions_nogap,Direction16_T *** directions_Egap,T this,char * rsequence,char * gsequence,char * gsequence_alt,int rlength,int glength,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,Mismatchtype_T mismatchtype,int open,int extend,int lband,bool jump_late_p,bool revp)8363 Dynprog_simd_16_lower (Direction16_T ***directions_nogap, Direction16_T ***directions_Egap,
8364 		       T this, char *rsequence, char *gsequence, char *gsequence_alt,
8365 		       int rlength, int glength,
8366 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
8367 		       int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
8368 #endif
8369 		       Mismatchtype_T mismatchtype, int open, int extend,
8370 		       int lband, bool jump_late_p, bool revp) {
8371   Score16_T **matrix, *score_column;
8372 #ifdef HAVE_AVX2
8373   __m256i pairscores_std;
8374   __m256i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, E_infinity, T1;
8375   __m256i gap_open, gap_extend, complement_dummy;
8376   __m256i dir_vert;
8377   Score16_T save;
8378 #else
8379   __m128i pairscores_std;
8380   __m128i H_nogap_c, X_prev_nogap, E_c_gap, E_mask, E_infinity, T1;
8381   __m128i gap_open, gap_extend, complement_dummy;
8382   __m128i dir_vert;
8383 #endif
8384   int glength_ceil, r, c;
8385   int clo, chigh;
8386   int na1, na2, na2_alt;
8387   Score16_T *pairscores[5], *pairscores_ptr;
8388   Pairdistance_T **pairdistance_array_type, score1, score2;
8389 
8390 #ifdef DEBUG_AVX2
8391   Score16_T **matrix_std;
8392   Direction16_T **directions_nogap_std, **directions_Egap_std;
8393   char na2_single;
8394 #elif defined(DEBUG_SIMD)
8395   Score32_T **matrix_std;
8396   Direction32_T **directions_nogap_std, **directions_Egap_std, **directions_Fgap_std;
8397   char na2_single;
8398 #endif
8399 
8400 
8401   debug2(printf("Dynprog_simd_16_lower.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
8402   debug15(printf("Dynprog_simd_16_lower.  jump_late_p %d, open %d, extend %d\n",jump_late_p,open,extend));
8403 
8404   glength_ceil = (int) ((glength + SIMD_NSHORTS)/SIMD_NSHORTS) * SIMD_NSHORTS;
8405   pairdistance_array_type = pairdistance_array[mismatchtype];
8406 
8407   debug(printf("compute_scores_simd_16_byrows (lower): "));
8408   debug(printf("Lengths are %d and %d, so band is %d on left\n",rlength,glength,lband));
8409   debug(printf("Genome length rounded up to %d\n",glength_ceil));
8410 
8411   matrix = aligned_score16_alloc(glength_ceil,rlength,
8412 				 this->aligned.two.lower_matrix_ptrs,this->aligned.two.lower_matrix_space);
8413   *directions_nogap = aligned_directions16_alloc(glength_ceil,rlength,
8414 						 this->aligned.two.lower_directions_ptrs_0,this->aligned.two.lower_directions_space_0);
8415   *directions_Egap = aligned_directions16_alloc(glength_ceil,rlength,
8416 						this->aligned.two.lower_directions_ptrs_1,this->aligned.two.lower_directions_space_1);
8417 
8418 #if 0
8419   /* Column 0 initialization */
8420   /* penalty = open; */
8421   for (r = 1; r <= lband && r <= rlength; r++) {
8422     /* penalty += extend; */
8423     (*directions_Egap)[r][0] = VERT;
8424     (*directions_nogap)[r][0] = VERT;
8425   }
8426 #endif
8427 #if 0
8428   /* Already initialized to DIAG.  Actually, no longer initializing directions_Egap */
8429   (*directions_Egap)[1][0] = DIAG; /* previously used STOP */
8430   (*directions_nogap)[0][0] = DIAG; /* previously used STOP */
8431 #endif
8432 #if 0
8433   /* Row 0 initialization */
8434   /* penalty = open; */
8435   for (c = 1; c <= SIMD_NSHORTS && c <= glength; c++) {
8436     /* penalty += extend; */
8437     (*directions_nogap)[0][c] = HORIZ;
8438   }
8439 #endif
8440 
8441 
8442   /* Load pairscores.  Store match - mismatch */
8443   pairscores[0] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8444   pairscores[1] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8445   pairscores[2] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8446   pairscores[3] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8447   pairscores[4] = (Score16_T *) _mm_malloc(glength_ceil * sizeof(Score16_T),ALIGN_SIZE);
8448 
8449 #if 0
8450   /* Should not be necessary */
8451   memset((void *) pairscores[0],0,glength_ceil*sizeof(Score16_T));
8452   memset((void *) pairscores[1],0,glength_ceil*sizeof(Score16_T));
8453   memset((void *) pairscores[2],0,glength_ceil*sizeof(Score16_T));
8454   memset((void *) pairscores[3],0,glength_ceil*sizeof(Score16_T));
8455   memset((void *) pairscores[4],0,glength_ceil*sizeof(Score16_T));
8456 #endif
8457 
8458   c = 0; na2 = na2_alt = 'N';
8459   pairscores[0][c] = (Score16_T) pairdistance_array_type[(int) 'A'][na2];
8460   pairscores[1][c] = (Score16_T) pairdistance_array_type[(int) 'C'][na2];
8461   pairscores[2][c] = (Score16_T) pairdistance_array_type[(int) 'G'][na2];
8462   pairscores[3][c] = (Score16_T) pairdistance_array_type[(int) 'T'][na2];
8463   pairscores[4][c] = (Score16_T) pairdistance_array_type[(int) 'N'][na2];
8464 
8465   if (revp == false) {
8466     for (c = 1; c <= glength; c++) {
8467       na2 = gsequence[c-1];
8468       na2_alt = gsequence_alt[c-1];
8469       /* Take max here */
8470       score1 = pairdistance_array_type[(int) 'A'][na2];
8471       score2 = pairdistance_array_type[(int) 'A'][na2_alt];
8472       pairscores[0][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8473 
8474       score1 = pairdistance_array_type[(int) 'C'][na2];
8475       score2 = pairdistance_array_type[(int) 'C'][na2_alt];
8476       pairscores[1][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8477 
8478       score1 = pairdistance_array_type[(int) 'G'][na2];
8479       score2 = pairdistance_array_type[(int) 'G'][na2_alt];
8480       pairscores[2][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8481 
8482       score1 = pairdistance_array_type[(int) 'T'][na2];
8483       score2 = pairdistance_array_type[(int) 'T'][na2_alt];
8484       pairscores[3][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8485 
8486       score1 = pairdistance_array_type[(int) 'N'][na2];
8487       score2 = pairdistance_array_type[(int) 'N'][na2_alt];
8488       pairscores[4][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8489     }
8490   } else {
8491     for (c = 1; c <= glength; c++) {
8492       na2 = gsequence[1-c];
8493       na2_alt = gsequence_alt[1-c];
8494       /* Take max here */
8495       score1 = pairdistance_array_type[(int) 'A'][na2];
8496       score2 = pairdistance_array_type[(int) 'A'][na2_alt];
8497       pairscores[0][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8498 
8499       score1 = pairdistance_array_type[(int) 'C'][na2];
8500       score2 = pairdistance_array_type[(int) 'C'][na2_alt];
8501       pairscores[1][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8502 
8503       score1 = pairdistance_array_type[(int) 'G'][na2];
8504       score2 = pairdistance_array_type[(int) 'G'][na2_alt];
8505       pairscores[2][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8506 
8507       score1 = pairdistance_array_type[(int) 'T'][na2];
8508       score2 = pairdistance_array_type[(int) 'T'][na2_alt];
8509       pairscores[3][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8510 
8511       score1 = pairdistance_array_type[(int) 'N'][na2];
8512       score2 = pairdistance_array_type[(int) 'N'][na2_alt];
8513       pairscores[4][c] = (Score16_T) (score1 > score2) ? score1 : score2;
8514     }
8515   }
8516 
8517 #if 0
8518   /* Should not be necessary */
8519   memset((void *) &(pairscores[0][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8520   memset((void *) &(pairscores[1][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8521   memset((void *) &(pairscores[2][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8522   memset((void *) &(pairscores[3][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8523   memset((void *) &(pairscores[4][c]),0,(glength_ceil-c)*sizeof(Score16_T));
8524 #endif
8525 
8526   complement_dummy = _MM_SET1_EPI16(-1);
8527 
8528   gap_open = _MM_SET1_EPI16((Score16_T) open);
8529   gap_extend = _MM_SET1_EPI16((Score16_T) extend);
8530 
8531   E_infinity = _MM_SET1_EPI16(POS_INFINITY_16);
8532   if (jump_late_p) {
8533     for (clo = 0; clo <= glength; clo += SIMD_NSHORTS) {
8534       if ((chigh = clo + SIMD_NSHORTS - 1) > glength) {
8535 	chigh = glength;
8536       }
8537 
8538       /* dir_vert tests if E >= H.  To fill in first row of each
8539 	 column block with non-diags, make E == H. */
8540       E_mask = _MM_SET1_EPI16(1);
8541 
8542       /* Holds for all INITIAL_GAP_PENALTY */
8543       E_c_gap = _MM_SET1_EPI16(NEG_INFINITY_16);
8544       H_nogap_c = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
8545 
8546       for (r = clo; r <= chigh + lband && r <= rlength; r++) {
8547 	score_column = matrix[r];
8548 
8549 	if (r == 0) {
8550 	  na1 = 4; /* 'N' */
8551 	} else {
8552 	  na1 = revp ? nt_to_int_array[(int) rsequence[1-r]] : nt_to_int_array[(int) rsequence[r-1]];
8553 	}
8554 	pairscores_ptr = pairscores[na1];
8555 
8556 	if (r == 0) {
8557 	  X_prev_nogap = _MM_SETZERO_SI();
8558 	} else if (clo == 0) {
8559 #ifdef ZERO_INITIAL_GAP_PENALTY
8560 	  X_prev_nogap = _MM_SETZERO_SI();
8561 #elif defined(HAVE_AVX2)
8562 	  X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
8563 #else
8564 	  X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
8565 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
8566 #endif
8567 	} else {
8568 	  /* second or greater block of 16 */
8569 #ifdef ZERO_INITIAL_GAP_PENALTY
8570 	  X_prev_nogap = _MM_SETZERO_SI();
8571 #elif defined(HAVE_AVX2)
8572 	  X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[r-1][clo-1],LAST_SHORT_INSERT);
8573 #else
8574 	  X_prev_nogap = _mm_set1_epi16(matrix[r-1][clo-1]); /* get H from previous block and previous column */
8575 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
8576 #endif
8577 	}
8578 
8579 	debug15(print_vector_16(E_mask,clo,r,"E_mask"));
8580 	E_c_gap = _MM_MIN_EPI16(E_c_gap,_MM_ADD_EPI16(E_mask,E_infinity));
8581 	debug15(print_vector_16(E_c_gap,clo,r,"E_c_gap"));
8582 	debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c load"));
8583 
8584 	/* EGAP */
8585 	T1 = _MM_ADDS_EPI16(H_nogap_c, gap_open);
8586 	dir_vert = _MM_CMPLT_EPI16(E_c_gap,T1); /* E < H */
8587 	dir_vert = _MM_ANDNOT_SI(dir_vert,complement_dummy);	/* E >= H, for jump late */
8588 #ifdef HAVE_AVX2
8589 	_mm256_store_si256((__m256i *) &((*directions_Egap)[r][clo]),dir_vert);
8590 #else
8591 	_mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
8592 #endif
8593 	debug15(print_vector_16(T1,clo,r,"T1"));
8594 	debug15(print_vector_16(dir_vert,clo,r,"dir_Egap"));
8595 
8596 	E_c_gap = _MM_MAX_EPI16(E_c_gap, T1); /* Compare H + open with vert */
8597 	E_c_gap = _MM_ADDS_EPI16(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
8598 	E_c_gap = _MM_MIN_EPI16(E_c_gap,_MM_ADD_EPI16(E_mask,E_infinity));
8599 	debug15(print_vector_16(E_c_gap,clo,r,"E"));
8600 
8601 
8602 	/* NOGAP */
8603 #ifdef HAVE_AVX2
8604 	T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_c,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
8605 	X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_c,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
8606 	H_nogap_c = _mm256_slli_si256(H_nogap_c,ONE_SHORT);
8607 #else
8608 	T1 = _mm_srli_si128(H_nogap_c,LAST_SHORT_SHIFT);
8609 	H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_SHORT);
8610 #endif
8611 	H_nogap_c = _MM_OR_SI(H_nogap_c, X_prev_nogap);
8612 	X_prev_nogap = T1;
8613 
8614 	/* Add pairscores.  No alternate chars for query sequence. */
8615 #ifdef HAVE_AVX2
8616 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_ptr[clo]));
8617 	H_nogap_c = _mm256_adds_epi16(H_nogap_c, pairscores_std);
8618 #else
8619 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
8620 	H_nogap_c = _mm_adds_epi16(H_nogap_c, pairscores_std);
8621 #endif
8622 	_mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
8623 	debug15(print_vector_16(H_nogap_c,clo,r,"H"));
8624 
8625 	dir_vert = _MM_CMPLT_EPI16(E_c_gap,H_nogap_c); /* E < H */
8626 	dir_vert = _MM_ANDNOT_SI(dir_vert,complement_dummy);	/* E >= H, for jump late */
8627 #ifdef HAVE_AVX2
8628 	_mm256_store_si256((__m256i *) &((*directions_nogap)[r][clo]),dir_vert);
8629 #else
8630 	_mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
8631 #endif
8632 	debug15(print_vector_16(dir_vert,clo,r,"dir_nogap"));
8633 
8634 	H_nogap_c = _MM_MAX_EPI16(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
8635 	debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c store"));
8636 #ifdef HAVE_AVX2
8637 	_mm256_store_si256((__m256i *) &(score_column[clo]), H_nogap_c);
8638 #else
8639 	_mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
8640 #endif
8641 
8642 
8643 	/* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
8644 	if (chigh >= r) {
8645 	  (*directions_Egap)[r][r] = DIAG;
8646 	  (*directions_nogap)[r][r] = DIAG;
8647 	}
8648 
8649 	/* No need for F loop here */
8650 #ifdef HAVE_AVX2
8651 	save = _mm256_extract_epi16(E_mask,7);
8652 	E_mask = _mm256_slli_si256(E_mask,ONE_SHORT);
8653 	E_mask = _mm256_insert_epi16(E_mask,save,8);
8654 #else
8655 	E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
8656 #endif
8657       }
8658     }
8659 
8660   } else {
8661     /* jump early */
8662     for (clo = 0; clo <= glength; clo += SIMD_NSHORTS) {
8663       if ((chigh = clo + SIMD_NSHORTS - 1) > glength) {
8664 	chigh = glength;
8665       }
8666 
8667       /* dir_vert tests if E > H.  To fill in first row of each
8668 	 column block with non-diags, make E > H. */
8669       E_mask = _MM_SET1_EPI16(1);
8670 
8671       /* Holds for all INITIAL_GAP_PENALTY */
8672       E_c_gap = _MM_SET1_EPI16(NEG_INFINITY_16+1);
8673       H_nogap_c = _MM_SET1_EPI16(NEG_INFINITY_16-open); /* Compensate for T1 = H + open */
8674 
8675       for (r = clo; r <= chigh + lband && r <= rlength; r++) {
8676 	score_column = matrix[r];
8677 
8678 	if (r == 0) {
8679 	  na1 = 4; /* 'N' */
8680 	} else {
8681 	  na1 = revp ? nt_to_int_array[(int) rsequence[1-r]] : nt_to_int_array[(int) rsequence[r-1]];
8682 	}
8683 	pairscores_ptr = pairscores[na1];
8684 
8685 	if (r == 0) {
8686 	  X_prev_nogap = _MM_SETZERO_SI();
8687 	} else if (clo == 0) {
8688 #ifdef ZERO_INITIAL_GAP_PENALTY
8689 	  X_prev_nogap = _MM_SETZERO_SI();
8690 #elif defined(HAVE_AVX2)
8691 	  X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),NEG_INFINITY_16,LAST_SHORT_INSERT);
8692 #else
8693 	  X_prev_nogap = _mm_set1_epi16(NEG_INFINITY_16); /* works if we start outside the rlo bounds */
8694 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
8695 #endif
8696 	} else {
8697 	  /* second or greater block of 16 */
8698 #ifdef ZERO_INITIAL_GAP_PENALTY
8699 	  X_prev_nogap = _MM_SETZERO_SI();
8700 #elif defined(HAVE_AVX2)
8701 	  X_prev_nogap = _mm256_insert_epi16(_mm256_setzero_si256(),matrix[r-1][clo-1],LAST_SHORT_INSERT);
8702 #else
8703 	  X_prev_nogap = _mm_set1_epi16(matrix[r-1][clo-1]); /* get H from previous block and previous column */
8704 	  X_prev_nogap = _mm_srli_si128(X_prev_nogap,LAST_SHORT_SHIFT);
8705 #endif
8706 	}
8707 
8708 	debug15(print_vector_16(E_mask,clo,r,"E_mask"));
8709 	E_c_gap = _MM_MIN_EPI16(E_c_gap,_MM_ADD_EPI16(E_mask,E_infinity));
8710 	debug15(print_vector_16(E_c_gap,clo,r,"E_c_gap"));
8711 	debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c load"));
8712 
8713 	/* EGAP */
8714 	T1 = _MM_ADDS_EPI16(H_nogap_c, gap_open);
8715 	dir_vert = _MM_CMPGT_EPI16(E_c_gap,T1); /* E > H, for jump early */
8716 #ifdef HAVE_AVX2
8717 	_mm256_store_si256((__m256i *) &((*directions_Egap)[r][clo]),dir_vert);
8718 #else
8719 	_mm_store_si128((__m128i *) &((*directions_Egap)[r][clo]),dir_vert);
8720 #endif
8721 	debug15(print_vector_16(T1,clo,r,"T1"));
8722 	debug15(print_vector_16(dir_vert,clo,r,"dir_Egap"));
8723 
8724 	E_c_gap = _MM_MAX_EPI16(E_c_gap, T1); /* Compare H + open with vert */
8725 	E_c_gap = _MM_ADDS_EPI16(E_c_gap, gap_extend); /* Compute scores for Egap (vert + open) */
8726 	E_c_gap = _MM_MIN_EPI16(E_c_gap,_MM_ADD_EPI16(E_mask,E_infinity));
8727 	debug15(print_vector_16(E_c_gap,clo,r,"E"));
8728 
8729 
8730 	/* NOGAP */
8731 #ifdef HAVE_AVX2
8732 	T1 = _mm256_insert_epi16(_mm256_setzero_si256(),_mm256_extract_epi16(H_nogap_c,SIMD_NSHORTS-1),LAST_SHORT_INSERT);
8733 	X_prev_nogap = _mm256_insert_epi16(X_prev_nogap,_mm256_extract_epi16(H_nogap_c,MID_SHORT_INSERT-1),MID_SHORT_INSERT);
8734 	H_nogap_c = _mm256_slli_si256(H_nogap_c,ONE_SHORT);
8735 #else
8736 	T1 = _mm_srli_si128(H_nogap_c,LAST_SHORT_SHIFT);
8737 	H_nogap_c = _mm_slli_si128(H_nogap_c,ONE_SHORT);
8738 #endif
8739 	H_nogap_c = _MM_OR_SI(H_nogap_c, X_prev_nogap);
8740 	X_prev_nogap = T1;
8741 
8742 	/* Add pairscores.  No alternate chars for query sequence */
8743 #ifdef HAVE_AVX2
8744 	pairscores_std = _mm256_load_si256((__m256i *) &(pairscores_ptr[clo]));
8745 	H_nogap_c = _mm256_adds_epi16(H_nogap_c, pairscores_std);
8746 #else
8747 	pairscores_std = _mm_load_si128((__m128i *) &(pairscores_ptr[clo]));
8748 	H_nogap_c = _mm_adds_epi16(H_nogap_c, pairscores_std);
8749 #endif
8750 	_mm_clflush(&H_nogap_c); /* Needed for opencc -O3 on AMD */
8751 	debug15(print_vector_16(H_nogap_c,clo,r,"H"));
8752 
8753 	dir_vert = _MM_CMPGT_EPI16(E_c_gap,H_nogap_c); /* E > H, for jump early */
8754 #ifdef HAVE_AVX2
8755 	_mm256_store_si256((__m256i *) &((*directions_nogap)[r][clo]),dir_vert);
8756 #else
8757 	_mm_store_si128((__m128i *) &((*directions_nogap)[r][clo]),dir_vert);
8758 #endif
8759 	debug15(print_vector_16(dir_vert,clo,r,"dir_nogap"));
8760 
8761 	H_nogap_c = _MM_MAX_EPI16(H_nogap_c, E_c_gap); /* Compare H + pairscores with horiz + extend */
8762 	debug15(print_vector_16(H_nogap_c,clo,r,"H_nogap_c store"));
8763 #ifdef HAVE_AVX2
8764 	_mm256_store_si256((__m256i *) &(score_column[clo]), H_nogap_c);
8765 #else
8766 	_mm_store_si128((__m128i *) &(score_column[clo]), H_nogap_c);
8767 #endif
8768 
8769 
8770 	/* Fix gaps along diagonal to prevent going into upper triangle, which can happen with ties between E and H */
8771 	if (chigh >= r) {
8772 	  (*directions_Egap)[r][r] = DIAG;
8773 	  (*directions_nogap)[r][r] = DIAG;
8774 	}
8775 
8776 	/* No need for F loop here */
8777 #ifdef HAVE_AVX2
8778 	save = _mm256_extract_epi16(E_mask,7);
8779 	E_mask = _mm256_slli_si256(E_mask,ONE_SHORT);
8780 	E_mask = _mm256_insert_epi16(E_mask,save,8);
8781 #else
8782 	E_mask = _mm_slli_si128(E_mask,ONE_SHORT);
8783 #endif
8784       }
8785     }
8786   }
8787 
8788 
8789 #ifdef CHECK1
8790   /* Row 0 and column 0 directions fail anyway due to saturation */
8791   /* Handle (0,1) and (1,0) directions, otherwise DIAG */
8792   (*directions_Egap)[1][0] = VERT;
8793 #endif
8794 
8795 #ifdef DEBUG2
8796   printf("SIMD: Dynprog_simd_16_lower\n");
8797   Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
8798 		    revp,lband,/*upperp*/false);
8799   Directions16_print_ud(*directions_nogap,*directions_Egap,
8800 			rlength,glength,rsequence,gsequence,gsequence_alt,
8801 			revp,lband,/*upperp*/false);
8802 #endif
8803 
8804 #ifdef CHECK1
8805   /* Check for column 0 directions */
8806   for (r = 1; r <= lband && r <= rlength; r++) {
8807     assert((*directions_Egap)[r][0] != DIAG);
8808     assert((*directions_nogap)[r][0] != DIAG);
8809   }
8810 #endif
8811 
8812 #ifdef DEBUG_AVX2
8813   matrix_std = Dynprog_simd_16_lower_nonavx2(&directions_nogap_std,&directions_Egap_std,
8814 					     this,rsequence,gsequence,gsequence_alt,
8815 					     rlength,glength,goffset,chroffset,chrhigh,watsonp,mismatchtype,
8816 					     open,extend,lband,jump_late_p,revp);
8817 #elif defined(DEBUG_SIMD)
8818   matrix_std = Dynprog_standard(&directions_nogap_std,&directions_Egap_std,&directions_Fgap_std,
8819 				this,rsequence,/*gsequence (NULL for debugging)*/NULL,/*gsequence_alt*/NULL,
8820 				rlength,glength,
8821 				goffset,chroffset,chrhigh,watsonp,mismatchtype,
8822 				open,extend,lband,/*uband*/0,jump_late_p,revp,/*saturation*/NEG_INFINITY_16,
8823 				/*upperp*/false,/*lowerp*/true);
8824 #endif
8825 
8826 #ifdef DEBUG2
8827   printf("Banded\n");
8828   Matrix16_print_ud(matrix,rlength,glength,rsequence,gsequence,gsequence_alt,
8829 		    revp,lband,/*upperp*/false);
8830   Directions16_print_ud(*directions_nogap,*directions_Egap,
8831 			rlength,glength,rsequence,gsequence,gsequence_alt,
8832 			revp,lband,/*upperp*/false);
8833 #endif
8834 
8835 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
8836   banded_matrix16_compare_lower(matrix,matrix_std,rlength,glength,lband,
8837 				rsequence,gsequence,gsequence_alt,
8838 				goffset,chroffset,chrhigh,watsonp,revp);
8839 
8840   banded_directions16_compare_nogap_lower(*directions_nogap,directions_nogap_std,rlength,glength,lband);
8841 
8842   banded_directions16_compare_Egap_lower(*directions_Egap,directions_Egap_std,rlength,glength,lband);
8843 #endif
8844 
8845   _mm_free(pairscores[4]);
8846   _mm_free(pairscores[3]);
8847   _mm_free(pairscores[2]);
8848   _mm_free(pairscores[1]);
8849   _mm_free(pairscores[0]);
8850 
8851   return matrix;
8852 }
8853 #endif
8854 
8855 
8856 #ifdef DEBUG17
8857 static char complCode[128] = COMPLEMENT_LC;
8858 
8859 static char
get_genomic_nt(char * g_alt,int genomicpos,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)8860 get_genomic_nt (char *g_alt, int genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
8861 		bool watsonp) {
8862   char c2, c2_alt;
8863   Univcoord_T pos;
8864 
8865 #if 0
8866   /* If the read has a deletion, then we will extend beyond 0 or genomiclength, so do not restrict. */
8867   if (genomicpos < 0) {
8868     return '*';
8869 
8870   } else if (genomicpos >= genomiclength) {
8871     return '*';
8872 
8873   }
8874 #endif
8875 
8876   if (watsonp) {
8877     if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
8878       *g_alt = '*';
8879       return '*';
8880 
8881     } else if (pos >= chrhigh) {
8882       *g_alt = '*';
8883       return '*';
8884 
8885 #if 0
8886     } else if (genome) {
8887       /* Not necessary, because Genome_get_char_blocks should work */
8888       debug8(printf("At %u, genomicnt is %c\n",
8889 		    genomicpos,Genome_get_char(genome,pos)));
8890       return Genome_get_char(genome,pos);
8891 #endif
8892 
8893     } else {
8894       /* GMAP with user-supplied genomic segment */
8895       debug8(printf("At %u, genomicnt is %c\n",
8896 		    genomicpos,Genome_get_char_blocks(pos)));
8897       return Genome_get_char_blocks(&(*g_alt),pos);
8898     }
8899 
8900   } else {
8901     if ((pos = chrhigh - genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
8902       *g_alt = '*';
8903       return '*';
8904 
8905     } else if (pos >= chrhigh) {
8906       *g_alt = '*';
8907       return '*';
8908 
8909 #if 0
8910     } else if (genome) {
8911       /* Not necessary, because Genome_get_char_blocks should work */
8912       c2 = Genome_get_char(genome,pos);
8913 #endif
8914 
8915     } else {
8916       /* GMAP with user-supplied genomic segment */
8917       c2 = Genome_get_char_blocks(&c2_alt,pos);
8918     }
8919     debug8(printf("At %u, genomicnt is %c\n",genomicpos,complCode[(int) c2]));
8920     *g_alt = complCode[(int) c2_alt];
8921     return complCode[(int) c2];
8922   }
8923 }
8924 #endif
8925 
8926 
8927 #if defined(HAVE_SSE4_1) || defined(HAVE_SSE2)
8928 List_T
Dynprog_traceback_8(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,Direction8_T ** directions_Fgap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,int dynprogindex)8929 Dynprog_traceback_8 (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
8930 		     Direction8_T **directions_nogap, Direction8_T **directions_Egap, Direction8_T **directions_Fgap,
8931 		     int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
8932 		     int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
8933 		     Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
8934 		     int dynprogindex) {
8935   char c1, c1_uc, c2, c2_alt;
8936   int dist;
8937   bool add_dashes_p;
8938   int querycoord, genomecoord;
8939   Direction8_T dir;
8940 #ifdef DEBUG17
8941   char c2_single;
8942 #endif
8943 
8944   debug(printf("Starting traceback_8 at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
8945 
8946   while (r > 0 && c > 0) {  /* dir != STOP */
8947     if ((dir = directions_nogap[c][r]) == HORIZ) {
8948       dist = 1;
8949       while (c > 0 && directions_Egap[c--][r] != DIAG) {
8950 	dist++;
8951       }
8952 #if 0
8953       if (c == 0) {
8954 	/* Directions in column 0 can sometimes be DIAG */
8955 	dir = VERT;
8956       } else {
8957 	printf("| ");		/* For Fgap */
8958 	dir = directions_nogap[c][r];
8959       }
8960 #endif
8961 
8962       debug(printf("H%d: ",dist));
8963       pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,r,c+dist,dist,/*genomesequence*/NULL,
8964 				      queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
8965 				      watsonp,dynprogindex);
8966       if (add_dashes_p == true) {
8967 	*nopens += 1;
8968 	*nindels += dist;
8969       }
8970       debug(printf("\n"));
8971 
8972     } else if (dir == VERT) {
8973       dist = 1;
8974       while (r > 0 && directions_Fgap[c][r--] != DIAG) {
8975 	dist++;
8976       }
8977 #if 0
8978       if (r == 0) {
8979 	/* Directions in row 0 can sometimes be DIAG */
8980 	dir = HORIZ;
8981       } else {
8982 	dir = directions_nogap[c][r];
8983       }
8984 #endif
8985 
8986       debug(printf("V%d: ",dist));
8987       pairs = Pairpool_add_queryskip(pairs,r+dist,c,dist,rsequence,
8988 				     queryoffset,genomeoffset,pairpool,revp,
8989 				     dynprogindex);
8990       *nopens += 1;
8991       *nindels += dist;
8992       debug(printf("\n"));
8993 
8994     } else if (dir == DIAG) {
8995       querycoord = r-1;
8996       genomecoord = c-1;
8997       if (revp == true) {
8998 	querycoord = -querycoord;
8999 	genomecoord = -genomecoord;
9000       }
9001 
9002       c1 = rsequence[querycoord];
9003       c1_uc = rsequenceuc[querycoord];
9004       c2 = gsequence[genomecoord];
9005       c2_alt = gsequence_alt[genomecoord];
9006 #ifdef DEBUG17
9007       c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9008       if (c2 != c2_single) {
9009 	abort();
9010       }
9011 #endif
9012 
9013 #ifdef EXTRACT_GENOMICSEG
9014       assert(c2 == genomesequence[genomecoord]);
9015 #endif
9016 
9017       if (c2 == '*') {
9018 	/* Don't push pairs past end of chromosome */
9019 	debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u, chroffset %u, chrhigh %u, watsonp %d\n",
9020 		     genomeoffset,genomecoord,chroffset,chrhigh,watsonp));
9021 
9022       } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9023 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9024 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9025 	*nmatches += 1;
9026 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9027 			      c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9028 
9029       } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9030 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9031 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9032 	*nmatches += 1;
9033 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9034 			      c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9035 
9036       } else {
9037 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9038 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9039 	*nmismatches += 1;
9040 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9041 			      c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9042       }
9043 
9044       r--; c--;
9045 
9046     } else {
9047       fprintf(stderr,"Bad dir at r %d, c %d\n",r,c);
9048       abort();
9049     }
9050   }
9051 
9052   if (r == 0 && c == 0) {
9053     /* Finished with a diagonal step */
9054 
9055   } else if (c == 0) {
9056     dist = r;
9057     debug(printf("V%d: ",dist));
9058     pairs = Pairpool_add_queryskip(pairs,r,/*c*/0+LAZY_INDEL,dist,rsequence,
9059 				   queryoffset,genomeoffset,pairpool,revp,
9060 				   dynprogindex);
9061     *nopens += 1;
9062     *nindels += dist;
9063     debug(printf("\n"));
9064 
9065   } else {
9066     assert(r == 0);
9067     dist = c;
9068     debug(printf("H%d: ",dist));
9069     pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,/*r*/0+LAZY_INDEL,c,dist,/*genomesequence*/NULL,
9070 				    queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9071 				    watsonp,dynprogindex);
9072     if (add_dashes_p == true) {
9073       *nopens += 1;
9074       *nindels += dist;
9075     }
9076     debug(printf("\n"));
9077   }
9078 
9079   return pairs;
9080 }
9081 #endif
9082 
9083 
9084 #if defined(HAVE_SSE4_1) || defined(HAVE_SSE2)
9085 List_T
Dynprog_traceback_8_upper(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,int dynprogindex)9086 Dynprog_traceback_8_upper (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9087 			   Direction8_T **directions_nogap, Direction8_T **directions_Egap,
9088 			   int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9089 			   int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
9090 			   Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
9091 			   int dynprogindex) {
9092   char c1, c1_uc, c2, c2_alt;
9093   int dist;
9094   bool add_dashes_p;
9095   int querycoord, genomecoord;
9096   Direction8_T dir;
9097 #ifdef DEBUG17
9098   char c2_single;
9099 #endif
9100 
9101   debug(printf("Starting traceback_8_upper at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9102 
9103   while (r > 0 && c > 0) {  /* dir != STOP */
9104     if ((dir = directions_nogap[c][r]) != DIAG) {
9105       /* Must be HORIZ */
9106       dist = 1;
9107       /* Should not need to check for c > r if the Egap diagonal above the main is populated with DIAG */
9108       while (/* c > r && */ directions_Egap[c--][r] != DIAG) {
9109 	dist++;
9110       }
9111       assert(c >= r);
9112 
9113       debug(printf("H%d: ",dist));
9114       pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,r,c+dist,dist,/*genomesequence*/NULL,
9115 				      queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9116 				      watsonp,dynprogindex);
9117       if (add_dashes_p == true) {
9118 	*nopens += 1;
9119 	*nindels += dist;
9120       }
9121       debug(printf("\n"));
9122 
9123     } else {
9124       querycoord = r-1;
9125       genomecoord = c-1;
9126       if (revp == true) {
9127 	querycoord = -querycoord;
9128 	genomecoord = -genomecoord;
9129       }
9130 
9131       c1 = rsequence[querycoord];
9132       c1_uc = rsequenceuc[querycoord];
9133       c2 = gsequence[genomecoord];
9134       c2_alt = gsequence_alt[genomecoord];
9135 #ifdef DEBUG17
9136       c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9137       if (c2 != c2_single) {
9138 	abort();
9139       }
9140 #endif
9141 
9142 #ifdef EXTRACT_GENOMICSEG
9143       assert(c2 == genomesequence[genomecoord]);
9144 #endif
9145 
9146       if (c2 == '*') {
9147 	/* Don't push pairs past end of chromosome */
9148 	debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u, chroffset %u, chrhigh %u, watsonp %d\n",
9149 		     genomeoffset,genomecoord,chroffset,chrhigh,watsonp));
9150 
9151       } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9152 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9153 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9154 	*nmatches += 1;
9155 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9156 			      c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9157 
9158       } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9159 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9160 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9161 	*nmatches += 1;
9162 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9163 			      c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9164 
9165       } else {
9166 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9167 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9168 	*nmismatches += 1;
9169 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9170 			      c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9171       }
9172 
9173       r--; c--;
9174     }
9175   }
9176 
9177   assert(r == 0);
9178   if (/* r == 0 && */ c == 0) {
9179     /* Finished with a diagonal step */
9180 
9181   } else {
9182     assert(c != 0);
9183     assert(r == 0);
9184     dist = c;
9185     debug(printf("H%d: ",dist));
9186     pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,/*r*/0+LAZY_INDEL,c,dist,/*genomesequence*/NULL,
9187 				    queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9188 				    watsonp,dynprogindex);
9189     if (add_dashes_p == true) {
9190       *nopens += 1;
9191       *nindels += dist;
9192     }
9193     debug(printf("\n"));
9194   }
9195 
9196   return pairs;
9197 }
9198 
9199 List_T
Dynprog_traceback_8_lower(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction8_T ** directions_nogap,Direction8_T ** directions_Egap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,int genestrand,bool revp,int dynprogindex)9200 Dynprog_traceback_8_lower (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9201 			   Direction8_T **directions_nogap, Direction8_T **directions_Egap,
9202 			   int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9203 			   int queryoffset, int genomeoffset, Pairpool_T pairpool,
9204 			   int genestrand, bool revp, int dynprogindex) {
9205   char c1, c1_uc, c2, c2_alt;
9206   int dist;
9207   int querycoord, genomecoord;
9208   Direction8_T dir;
9209 #ifdef DEBUG17
9210   char c2_single;
9211 #endif
9212 
9213   debug(printf("Starting traceback_8_lower at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9214 
9215   while (r > 0 && c > 0) {  /* dir != STOP */
9216     if ((dir = directions_nogap[r][c]) != DIAG) {
9217       /* Must be VERT */
9218       dist = 1;
9219       /* Should not need to check for r > c if the Egap diagonal below the main is populated with DIAG */
9220       while (/* r > c && */ directions_Egap[r--][c] != DIAG) {
9221 	dist++;
9222       }
9223       assert(r >= c);
9224 
9225       debug(printf("V%d: ",dist));
9226       pairs = Pairpool_add_queryskip(pairs,r+dist,c,dist,rsequence,
9227 				     queryoffset,genomeoffset,pairpool,revp,
9228 				     dynprogindex);
9229       *nopens += 1;
9230       *nindels += dist;
9231       debug(printf("\n"));
9232 
9233     } else {
9234       querycoord = r-1;
9235       genomecoord = c-1;
9236       if (revp == true) {
9237 	querycoord = -querycoord;
9238 	genomecoord = -genomecoord;
9239       }
9240 
9241       c1 = rsequence[querycoord];
9242       c1_uc = rsequenceuc[querycoord];
9243       c2 = gsequence[genomecoord];
9244       c2_alt = gsequence_alt[genomecoord];
9245 #ifdef DEBUG17
9246       c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9247       if (c2 != c2_single) {
9248 	abort();
9249       }
9250 #endif
9251 
9252 #ifdef EXTRACT_GENOMICSEG
9253       assert(c2 == genomesequence[genomecoord]);
9254 #endif
9255 
9256       if (c2 == '*') {
9257 	/* Don't push pairs past end of chromosome */
9258 	debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u\n",
9259 		     genomeoffset,genomecoord));
9260 
9261       } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9262 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9263 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9264 	*nmatches += 1;
9265 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9266 			      c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9267 
9268       } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9269 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9270 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9271 	*nmatches += 1;
9272 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9273 			      c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9274 
9275       } else {
9276 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9277 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9278 	*nmismatches += 1;
9279 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9280 			      c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9281       }
9282 
9283       r--; c--;
9284     }
9285   }
9286 
9287   assert(c == 0);
9288   if (r == 0 /* && c == 0 */) {
9289     /* Finished with a diagonal step */
9290 
9291   } else {
9292     assert(r != 0);
9293     assert(c == 0);
9294     dist = r;
9295     debug(printf("V%d: ",dist));
9296     pairs = Pairpool_add_queryskip(pairs,r,/*c*/0+LAZY_INDEL,dist,rsequence,
9297 				   queryoffset,genomeoffset,pairpool,revp,
9298 				   dynprogindex);
9299     *nopens += 1;
9300     *nindels += dist;
9301     debug(printf("\n"));
9302   }
9303 
9304   return pairs;
9305 }
9306 
9307 
9308 List_T
Dynprog_traceback_16(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,Direction16_T ** directions_Fgap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,int dynprogindex)9309 Dynprog_traceback_16 (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9310 		      Direction16_T **directions_nogap, Direction16_T **directions_Egap, Direction16_T **directions_Fgap,
9311 		      int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9312 		      int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
9313 		      Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
9314 		      int dynprogindex) {
9315   char c1, c1_uc, c2, c2_alt;
9316   int dist;
9317   bool add_dashes_p;
9318   int querycoord, genomecoord;
9319   Direction16_T dir;
9320 #ifdef DEBUG17
9321   char c2_single;
9322 #endif
9323 
9324   debug(printf("Starting traceback_16 at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9325 
9326   while (r > 0 && c > 0) {  /* dir != STOP */
9327     if ((dir = directions_nogap[c][r]) == HORIZ) {
9328       dist = 1;
9329       while (c > 0 && directions_Egap[c--][r] != DIAG) {
9330 	dist++;
9331       }
9332 #if 0
9333       if (c == 0) {
9334 	/* Directions in column 0 can sometimes be DIAG */
9335 	dir = VERT;
9336       } else {
9337 	dir = directions_nogap[c][r];
9338       }
9339 #endif
9340 
9341       debug(printf("H%d: ",dist));
9342       pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,r,c+dist,dist,/*genomesequence*/NULL,
9343 				      queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9344 				      watsonp,dynprogindex);
9345       if (add_dashes_p == true) {
9346 	*nopens += 1;
9347 	*nindels += dist;
9348       }
9349       debug(printf("\n"));
9350 
9351     } else if (dir == VERT) {
9352       dist = 1;
9353       while (r > 0 && directions_Fgap[c][r--] != DIAG) {
9354 	dist++;
9355       }
9356 #if 0
9357       if (r == 0) {
9358 	/* Directions in row 0 can sometimes be DIAG */
9359 	dir = HORIZ;
9360       } else {
9361 	dir = directions_nogap[c][r];
9362       }
9363 #endif
9364 
9365       debug(printf("V%d: ",dist));
9366       debug(printf("New dir at %d,%d is %d\n",c,r,dir));
9367       pairs = Pairpool_add_queryskip(pairs,r+dist,c,dist,rsequence,
9368 				     queryoffset,genomeoffset,pairpool,revp,
9369 				     dynprogindex);
9370       *nopens += 1;
9371       *nindels += dist;
9372       debug(printf("\n"));
9373 
9374     } else if (dir == DIAG) {
9375       querycoord = r-1;
9376       genomecoord = c-1;
9377       if (revp == true) {
9378 	querycoord = -querycoord;
9379 	genomecoord = -genomecoord;
9380       }
9381 
9382       c1 = rsequence[querycoord];
9383       c1_uc = rsequenceuc[querycoord];
9384       c2 = gsequence[genomecoord];
9385       c2_alt = gsequence_alt[genomecoord];
9386 #ifdef DEBUG17
9387       c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9388       if (c2 != c2_single) {
9389 	abort();
9390       }
9391 #endif
9392 
9393 #ifdef EXTRACT_GENOMICSEG
9394       assert(c2 == genomesequence[genomecoord]);
9395 #endif
9396 
9397       if (c2 == '*') {
9398 	/* Don't push pairs past end of chromosome */
9399 	debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u, chroffset %u, chrhigh %u, watsonp %d\n",
9400 		     genomeoffset,genomecoord,chroffset,chrhigh,watsonp));
9401 
9402       } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9403 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9404 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9405 	*nmatches += 1;
9406 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9407 			      c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9408 
9409       } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9410 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9411 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9412 	*nmatches += 1;
9413 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9414 			      c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9415 
9416       } else {
9417 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9418 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9419 	*nmismatches += 1;
9420 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9421 			      c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9422       }
9423 
9424       r--; c--;
9425 
9426     } else {
9427       fprintf(stderr,"Bad dir at r %d, c %d\n",r,c);
9428       abort();
9429     }
9430   }
9431 
9432   if (r == 0 && c == 0) {
9433     /* Finished with a diagonal step */
9434 
9435   } else if (c == 0) {
9436     dist = r;
9437     debug(printf("V%d: ",dist));
9438     pairs = Pairpool_add_queryskip(pairs,r,/*c*/0+LAZY_INDEL,dist,rsequence,
9439 				   queryoffset,genomeoffset,pairpool,revp,
9440 				   dynprogindex);
9441     *nopens += 1;
9442     *nindels += dist;
9443     debug(printf("\n"));
9444 
9445   } else {
9446     assert(r == 0);
9447     dist = c;
9448     debug(printf("H%d: ",dist));
9449     pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,/*r*/0+LAZY_INDEL,c,dist,/*genomesequence*/NULL,
9450 				    queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9451 				    watsonp,dynprogindex);
9452     if (add_dashes_p == true) {
9453       *nopens += 1;
9454       *nindels += dist;
9455     }
9456     debug(printf("\n"));
9457   }
9458 
9459   return pairs;
9460 }
9461 
9462 
9463 List_T
Dynprog_traceback_16_upper(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,int dynprogindex)9464 Dynprog_traceback_16_upper (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9465 			    Direction16_T **directions_nogap, Direction16_T **directions_Egap,
9466 			    int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9467 			    int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
9468 			    Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
9469 			    int dynprogindex) {
9470   char c1, c1_uc, c2, c2_alt;
9471   int dist;
9472   bool add_dashes_p;
9473   int querycoord, genomecoord;
9474   Direction16_T dir;
9475 #ifdef DEBUG17
9476   char c2_single;
9477 #endif
9478 
9479   debug(printf("Starting traceback_16_upper at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9480 
9481   while (r > 0 && c > 0) {  /* dir != STOP */
9482     if ((dir = directions_nogap[c][r]) != DIAG) {
9483       /* Must be HORIZ */
9484       dist = 1;
9485       /* Should not need to check for c > r if the Egap diagonal above the main is populated with DIAG */
9486       while (/* c > r && */ directions_Egap[c--][r] != DIAG) {
9487 	dist++;
9488       }
9489       assert(c >= r);
9490 
9491       debug(printf("H%d: ",dist));
9492       pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,r,c+dist,dist,/*genomesequence*/NULL,
9493 				      queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9494 				      watsonp,dynprogindex);
9495       if (add_dashes_p == true) {
9496 	*nopens += 1;
9497 	*nindels += dist;
9498       }
9499       debug(printf("\n"));
9500 
9501     } else {
9502       querycoord = r-1;
9503       genomecoord = c-1;
9504       if (revp == true) {
9505 	querycoord = -querycoord;
9506 	genomecoord = -genomecoord;
9507       }
9508 
9509       c1 = rsequence[querycoord];
9510       c1_uc = rsequenceuc[querycoord];
9511       c2 = gsequence[genomecoord];
9512       c2_alt = gsequence_alt[genomecoord];
9513 #ifdef DEBUG17
9514       c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9515       if (c2 != c2_single) {
9516 	abort();
9517       }
9518 #endif
9519 
9520 #ifdef EXTRACT_GENOMICSEG
9521       assert(c2 == genomesequence[genomecoord]);
9522 #endif
9523 
9524       if (c2 == '*') {
9525 	/* Don't push pairs past end of chromosome */
9526 	debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u, chroffset %u, chrhigh %u, watsonp %d\n",
9527 		     genomeoffset,genomecoord,chroffset,chrhigh,watsonp));
9528 
9529       } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9530 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9531 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9532 	*nmatches += 1;
9533 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9534 			      c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9535 
9536       } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9537 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9538 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9539 	*nmatches += 1;
9540 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9541 			      c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9542 
9543       } else {
9544 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9545 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9546 	*nmismatches += 1;
9547 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9548 			      c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9549       }
9550 
9551       r--; c--;
9552     }
9553   }
9554 
9555   assert(r == 0);
9556   if (/* r == 0 && */ c == 0) {
9557     /* Finished with a diagonal step */
9558 
9559   } else {
9560     assert(c != 0);
9561     assert(r == 0);
9562     dist = c;
9563     debug(printf("H%d: ",dist));
9564     pairs = Pairpool_add_genomeskip(&add_dashes_p,pairs,/*r*/0+LAZY_INDEL,c,dist,/*genomesequence*/NULL,
9565 				    queryoffset,genomeoffset,pairpool,revp,chroffset,chrhigh,
9566 				    watsonp,dynprogindex);
9567     if (add_dashes_p == true) {
9568       *nopens += 1;
9569       *nindels += dist;
9570     }
9571     debug(printf("\n"));
9572   }
9573 
9574   return pairs;
9575 }
9576 
9577 List_T
Dynprog_traceback_16_lower(List_T pairs,int * nmatches,int * nmismatches,int * nopens,int * nindels,Direction16_T ** directions_nogap,Direction16_T ** directions_Egap,int r,int c,char * rsequence,char * rsequenceuc,char * gsequence,char * gsequence_alt,int queryoffset,int genomeoffset,Pairpool_T pairpool,int genestrand,bool revp,int dynprogindex)9578 Dynprog_traceback_16_lower (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
9579 			    Direction16_T **directions_nogap, Direction16_T **directions_Egap,
9580 			    int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
9581 			    int queryoffset, int genomeoffset, Pairpool_T pairpool,
9582 			    int genestrand, bool revp, int dynprogindex) {
9583   char c1, c1_uc, c2, c2_alt;
9584   int dist;
9585   int querycoord, genomecoord;
9586   Direction16_T dir;
9587 #ifdef DEBUG17
9588   char c2_single;
9589 #endif
9590 
9591   debug(printf("Starting traceback_16_lower at r=%d,c=%d (roffset=%d, goffset=%d)\n",r,c,queryoffset,genomeoffset));
9592 
9593   while (r > 0 && c > 0) {  /* dir != STOP */
9594     if ((dir = directions_nogap[r][c]) != DIAG) {
9595       /* Must be VERT */
9596       dist = 1;
9597       /* Should not need to check for r > c if the Egap diagonal below the main is populated with DIAG */
9598       while (/* r > c && */ directions_Egap[r--][c] != DIAG) {
9599 	dist++;
9600       }
9601       assert(r >= c);
9602 
9603       debug(printf("V%d: ",dist));
9604       pairs = Pairpool_add_queryskip(pairs,r+dist,c,dist,rsequence,
9605 				     queryoffset,genomeoffset,pairpool,revp,
9606 				     dynprogindex);
9607       *nopens += 1;
9608       *nindels += dist;
9609       debug(printf("\n"));
9610 
9611     } else {
9612       querycoord = r-1;
9613       genomecoord = c-1;
9614       if (revp == true) {
9615 	querycoord = -querycoord;
9616 	genomecoord = -genomecoord;
9617       }
9618 
9619       c1 = rsequence[querycoord];
9620       c1_uc = rsequenceuc[querycoord];
9621       c2 = gsequence[genomecoord];
9622       c2_alt = gsequence_alt[genomecoord];
9623 #ifdef DEBUG17
9624       c2_single = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
9625       if (c2 != c2_single) {
9626 	abort();
9627       }
9628 #endif
9629 
9630 #ifdef EXTRACT_GENOMICSEG
9631       assert(c2 == genomesequence[genomecoord]);
9632 #endif
9633 
9634       if (c2 == '*') {
9635 	/* Don't push pairs past end of chromosome */
9636 	debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u\n",
9637 		     genomeoffset,genomecoord));
9638 
9639       } else if (/*querysequenceuc[querycoord]*/c1_uc == c2 || c1_uc == c2_alt) {
9640 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - match\n",
9641 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9642 	*nmatches += 1;
9643 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9644 			      c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
9645 
9646       } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
9647 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - ambiguous\n",
9648 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9649 	*nmatches += 1;
9650 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9651 			      c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
9652 
9653       } else {
9654 	debug(printf("Pushing %d,%d [%d,%d] (%c,%c) - mismatch\n",
9655 		     r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1_uc,c2));
9656 	*nmismatches += 1;
9657 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
9658 			      c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
9659       }
9660 
9661       r--; c--;
9662     }
9663   }
9664 
9665   assert(c == 0);
9666   if (r == 0 /* && c == 0 */) {
9667     /* Finished with a diagonal step */
9668 
9669   } else {
9670     assert(r != 0);
9671     assert(c == 0);
9672     dist = r;
9673     debug(printf("V%d: ",dist));
9674     pairs = Pairpool_add_queryskip(pairs,r,/*c*/0+LAZY_INDEL,dist,rsequence,
9675 				   queryoffset,genomeoffset,pairpool,revp,
9676 				   dynprogindex);
9677     *nopens += 1;
9678     *nindels += dist;
9679     debug(printf("\n"));
9680   }
9681 
9682   return pairs;
9683 }
9684 #endif
9685 
9686 
9687