1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 /**
21  * aligner_sw_sse.cpp
22  *
23  * Versions of key alignment functions that use vector instructions to
24  * accelerate dynamic programming.  Based chiefly on the striped Smith-Waterman
25  * paper and implementation by Michael Farrar.  See:
26  *
27  * Farrar M. Striped Smith-Waterman speeds database searches six times over
28  * other SIMD implementations. Bioinformatics. 2007 Jan 15;23(2):156-61.
29  * http://sites.google.com/site/farrarmichael/smith-waterman
30  *
31  * While the paper describes an implementation of Smith-Waterman, we extend it
32  * do end-to-end read alignment as well as local alignment.  The change
33  * required for this is minor: we simply let vmax be the maximum element in the
34  * score domain rather than the minimum.
35  *
36  * The vectorized dynamic programming implementation lacks some features that
37  * make it hard to adapt to solving the entire dynamic-programming alignment
38  * problem.  For instance:
39  *
40  * - It doesn't respect gap barriers on either end of the read
41  * - It just gives a maximum; not enough information to backtrace without
42  *   redoing some alignment
43  * - It's a little difficult to handle st_ and en_, especially st_.
44  * - The query profile mechanism makes handling of ambiguous reference bases a
45  *   little tricky (16 cols in query profile lookup table instead of 5)
46  *
47  * Given the drawbacks, it is tempting to use SSE dynamic programming as a
48  * filter rather than as an aligner per se.  Here are a few ideas for how it
49  * can be extended to handle more of the alignment problem:
50  *
51  * - Save calculated scores to a big array as we go.  We return to this array
52  *   to find and backtrace from good solutions.
53  */
54 
55 #include <limits>
56 #include "aligner_sw.h"
57 
58 static const size_t NBYTES_PER_REG  = 16;
59 static const size_t NWORDS_PER_REG  = 8;
60 static const size_t NBITS_PER_WORD  = 16;
61 static const size_t NBYTES_PER_WORD = 2;
62 
63 // In 16-bit end-to-end mode, we have the option of using signed saturated
64 // arithmetic.  Because we have signed arithmetic, there's no need to add/subtract
65 // bias when building an applying the query profile.  The lowest value we can
66 // use is 0x8000, and the greatest is 0x7fff.
67 
68 typedef int16_t TCScore;
69 
70 /**
71  * Build query profile look up tables for the read.  The query profile look
72  * up table is organized as a 1D array indexed by [i][j] where i is the
73  * reference character in the current DP column (0=A, 1=C, etc), and j is
74  * the segment of the query we're currently working on.
75  */
buildQueryProfileEnd2EndSseI16(bool fw)76 void SwAligner::buildQueryProfileEnd2EndSseI16(bool fw) {
77 	bool& done = fw ? sseI16fwBuilt_ : sseI16rcBuilt_;
78 	if(done) {
79 		return;
80 	}
81 	done = true;
82 	const BTDnaString* rd = fw ? rdfw_ : rdrc_;
83 	const BTString* qu = fw ? qufw_ : qurc_;
84     // daehwan - allows to align a portion of a read, not the whole
85 	// const size_t len = rd->length();
86     const size_t len = dpRows();
87 	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
88 	// How many __m128i's are needed
89 	size_t n128s =
90 		64 +                    // slack bytes, for alignment?
91 		(seglen * ALPHA_SIZE)   // query profile data
92 		* 2;                    // & gap barrier data
93 	assert_gt(n128s, 0);
94 	SSEData& d = fw ? sseI16fw_ : sseI16rc_;
95 	d.profbuf_.resizeNoCopy(n128s);
96 	assert(!d.profbuf_.empty());
97 	d.maxPen_      = d.maxBonus_ = 0;
98 	d.lastIter_    = d.lastWord_ = 0;
99 	d.qprofStride_ = d.gbarStride_ = 2;
100 	d.bias_ = 0; // no bias when words are signed
101 	// For each reference character A, C, G, T, N ...
102 	for(size_t refc = 0; refc < ALPHA_SIZE; refc++) {
103 		// For each segment ...
104 		for(size_t i = 0; i < seglen; i++) {
105 			size_t j = i;
106 			int16_t *qprofWords =
107 				reinterpret_cast<int16_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2));
108 			int16_t *gbarWords =
109 				reinterpret_cast<int16_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2) + 1);
110 			// For each sub-word (byte) ...
111 			for(size_t k = 0; k < NWORDS_PER_REG; k++) {
112 				int sc = 0;
113 				*gbarWords = 0;
114 				if(j < len) {
115 					int readc = (*rd)[j];
116 					int readq = (*qu)[j];
117 					sc = sc_->score(readc, (int)(1 << refc), readq - 33);
118 					size_t j_from_end = len - j - 1;
119 					if(j < (size_t)sc_->gapbar ||
120 					   j_from_end < (size_t)sc_->gapbar)
121 					{
122 						// Inside the gap barrier
123 						*gbarWords = 0x8000; // add this twice
124 					}
125 				}
126 				if(refc == 0 && j == len-1) {
127 					// Remember which 128-bit word and which smaller word has
128 					// the final row
129 					d.lastIter_ = i;
130 					d.lastWord_ = k;
131 				}
132 				if(sc < 0) {
133 					if((size_t)(-sc) > d.maxPen_) {
134 						d.maxPen_ = (size_t)(-sc);
135 					}
136 				} else {
137 					if((size_t)sc > d.maxBonus_) {
138 						d.maxBonus_ = (size_t)sc;
139 					}
140 				}
141 				*qprofWords = (int16_t)sc;
142 				gbarWords++;
143 				qprofWords++;
144 				j += seglen; // update offset into query
145 			}
146 		}
147 	}
148 }
149 
150 #ifndef NDEBUG
151 /**
152  * Return true iff the cell has sane E/F/H values w/r/t its predecessors.
153  */
cellOkEnd2EndI16(SSEData & d,size_t row,size_t col,int refc,int readc,int readq,const Scoring & sc)154 static bool cellOkEnd2EndI16(
155 	SSEData& d,
156 	size_t row,
157 	size_t col,
158 	int refc,
159 	int readc,
160 	int readq,
161 	const Scoring& sc)     // scoring scheme
162 {
163 	TCScore floorsc = 0x8000;
164 	TCScore ceilsc = MAX_I64;
165 	TAlScore offsetsc = -0x7fff;
166 	TAlScore sc_h_cur = (TAlScore)d.mat_.helt(row, col);
167 	TAlScore sc_e_cur = (TAlScore)d.mat_.eelt(row, col);
168 	TAlScore sc_f_cur = (TAlScore)d.mat_.felt(row, col);
169 	if(sc_h_cur > floorsc) {
170 		sc_h_cur += offsetsc;
171 	}
172 	if(sc_e_cur > floorsc) {
173 		sc_e_cur += offsetsc;
174 	}
175 	if(sc_f_cur > floorsc) {
176 		sc_f_cur += offsetsc;
177 	}
178 	bool gapsAllowed = true;
179 	size_t rowFromEnd = d.mat_.nrow() - row - 1;
180 	if(row < (size_t)sc.gapbar || rowFromEnd < (size_t)sc.gapbar) {
181 		gapsAllowed = false;
182 	}
183 	bool e_left_trans = false, h_left_trans = false;
184 	bool f_up_trans   = false, h_up_trans = false;
185 	bool h_diag_trans = false;
186 	if(gapsAllowed) {
187 		TAlScore sc_h_left = floorsc;
188 		TAlScore sc_e_left = floorsc;
189 		TAlScore sc_h_up   = floorsc;
190 		TAlScore sc_f_up   = floorsc;
191 		if(col > 0 && sc_e_cur > floorsc && sc_e_cur <= ceilsc) {
192 			sc_h_left = d.mat_.helt(row, col-1) + offsetsc;
193 			sc_e_left = d.mat_.eelt(row, col-1) + offsetsc;
194 			e_left_trans = (sc_e_left > floorsc && sc_e_cur == sc_e_left - sc.readGapExtend());
195 			h_left_trans = (sc_h_left > floorsc && sc_e_cur == sc_h_left - sc.readGapOpen());
196 			assert(e_left_trans || h_left_trans);
197 		}
198 		if(row > 0 && sc_f_cur > floorsc && sc_f_cur <= ceilsc) {
199 			sc_h_up = d.mat_.helt(row-1, col) + offsetsc;
200 			sc_f_up = d.mat_.felt(row-1, col) + offsetsc;
201 			f_up_trans = (sc_f_up > floorsc && sc_f_cur == sc_f_up - sc.refGapExtend());
202 			h_up_trans = (sc_h_up > floorsc && sc_f_cur == sc_h_up - sc.refGapOpen());
203 			assert(f_up_trans || h_up_trans);
204 		}
205 	} else {
206 		assert_geq(floorsc, sc_e_cur);
207 		assert_geq(floorsc, sc_f_cur);
208 	}
209 	if(col > 0 && row > 0 && sc_h_cur > floorsc && sc_h_cur <= ceilsc) {
210 		TAlScore sc_h_upleft = d.mat_.helt(row-1, col-1) + offsetsc;
211 		TAlScore sc_diag = sc.score(readc, (int)refc, readq - 33);
212 		h_diag_trans = sc_h_cur == sc_h_upleft + sc_diag;
213 	}
214 	assert(
215 		sc_h_cur <= floorsc ||
216 		e_left_trans ||
217 		h_left_trans ||
218 		f_up_trans   ||
219 		h_up_trans   ||
220 		h_diag_trans ||
221 		sc_h_cur > ceilsc ||
222 		row == 0 ||
223 		col == 0);
224 	return true;
225 }
226 #endif /*ndef NDEBUG*/
227 
228 #ifdef NDEBUG
229 
230 #define assert_all_eq0(x)
231 #define assert_all_gt(x, y)
232 #define assert_all_gt_lo(x)
233 #define assert_all_lt(x, y)
234 #define assert_all_lt_hi(x)
235 
236 #else
237 
238 #define assert_all_eq0(x) { \
239 	__m128i z = _mm_setzero_si128(); \
240 	__m128i tmp = _mm_setzero_si128(); \
241 	z = _mm_xor_si128(z, z); \
242 	tmp = _mm_cmpeq_epi16(x, z); \
243 	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
244 }
245 
246 #define assert_all_gt(x, y) { \
247 	__m128i tmp = _mm_cmpgt_epi16(x, y); \
248 	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
249 }
250 
251 #define assert_all_gt_lo(x) { \
252 	__m128i z = _mm_setzero_si128(); \
253 	__m128i tmp = _mm_setzero_si128(); \
254 	z = _mm_xor_si128(z, z); \
255 	tmp = _mm_cmpgt_epi16(x, z); \
256 	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
257 }
258 
259 #define assert_all_lt(x, y) { \
260 	__m128i tmp = _mm_cmplt_epi16(x, y); \
261 	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
262 }
263 
264 #define assert_all_leq(x, y) { \
265 	__m128i tmp = _mm_cmpgt_epi16(x, y); \
266 	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
267 }
268 
269 #define assert_all_lt_hi(x) { \
270 	__m128i z = _mm_setzero_si128(); \
271 	__m128i tmp = _mm_setzero_si128(); \
272 	z = _mm_cmpeq_epi16(z, z); \
273 	z = _mm_srli_epi16(z, 1); \
274 	tmp = _mm_cmplt_epi16(x, z); \
275 	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
276 }
277 #endif
278 
279 /**
280  * Aligns by filling a dynamic programming matrix with the SSE-accelerated,
281  * banded DP approach of Farrar.  As it goes, it determines which cells we
282  * might backtrace from and tallies the best (highest-scoring) N backtrace
283  * candidate cells per diagonal.  Also returns the alignment score of the best
284  * alignment in the matrix.
285  *
286  * This routine does *not* maintain a matrix holding the entire matrix worth of
287  * scores, nor does it maintain any other dense O(mn) data structure, as this
288  * would quickly exhaust memory for queries longer than about 10,000 kb.
289  * Instead, in the fill stage it maintains two columns worth of scores at a
290  * time (current/previous, or right/left) - these take O(m) space.  When
291  * finished with the current column, it determines which cells from the
292  * previous column, if any, are candidates we might backtrace from to find a
293  * full alignment.  A candidate cell has a score that rises above the threshold
294  * and isn't improved upon by a match in the next column.  The best N
295  * candidates per diagonal are stored in a O(m + n) data structure.
296  */
alignGatherEE16(int & flag,bool debug)297 TAlScore SwAligner::alignGatherEE16(int& flag, bool debug) {
298 	assert_leq(rdf_, rd_->length());
299 	assert_leq(rdf_, qu_->length());
300 	assert_lt(rfi_, rff_);
301 	assert_lt(rdi_, rdf_);
302 	assert_eq(rd_->length(), qu_->length());
303 	assert_geq(sc_->gapbar, 1);
304 	assert(repOk());
305 #ifndef NDEBUG
306 	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
307 		assert_range(0, 16, (int)rf_[i]);
308 	}
309 #endif
310 
311 	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
312 	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
313 	if(!debug) met.dp++;
314 	buildQueryProfileEnd2EndSseI16(fw_);
315 	assert(!d.profbuf_.empty());
316 
317 	assert_eq(0, d.maxBonus_);
318 	size_t iter =
319 		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
320 
321 	// Now set up the score vectors.  We just need two columns worth, which
322 	// we'll call "left" and "right".
323 	d.vecbuf_.resize(4 * 2 * iter);
324 	d.vecbuf_.zero();
325 	__m128i *vbuf_l = d.vecbuf_.ptr();
326 	__m128i *vbuf_r = d.vecbuf_.ptr() + (4 * iter);
327 
328 	// This is the data structure that holds candidate cells per diagonal.
329 	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
330 	if(!debug) {
331 		btdiag_.init(ndiags, 2);
332 	}
333 
334 	// Data structure that holds checkpointed anti-diagonals
335 	TAlScore perfectScore = sc_->perfectScore(dpRows());
336 	bool checkpoint = true;
337 	bool cpdebug = false;
338 #ifndef NDEBUG
339 	cpdebug = dpRows() < 1000;
340 #endif
341 	cper_.init(
342 		dpRows(),      // # rows
343 		rff_ - rfi_,   // # columns
344 		cperPerPow2_,  // checkpoint every 1 << perpow2 diags (& next)
345 		perfectScore,  // perfect score (for sanity checks)
346 		false,         // matrix cells have 8-bit scores?
347 		cperTri_,      // triangular mini-fills?
348 		false,         // alignment is local?
349 		cpdebug);      // save all cells for debugging?
350 
351 	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
352 	// implementation:
353 	//
354 	//  http://sites.google.com/site/farrarmichael/smith-waterman
355 	//
356 	// Much of the implmentation below is adapted from Michael's code.
357 
358 	// Set all elts to reference gap open penalty
359 	__m128i rfgapo   = _mm_setzero_si128();
360 	__m128i rfgape   = _mm_setzero_si128();
361 	__m128i rdgapo   = _mm_setzero_si128();
362 	__m128i rdgape   = _mm_setzero_si128();
363 	__m128i vlo      = _mm_setzero_si128();
364 	__m128i vhi      = _mm_setzero_si128();
365 	__m128i vhilsw   = _mm_setzero_si128();
366 	__m128i vlolsw   = _mm_setzero_si128();
367 	__m128i ve       = _mm_setzero_si128();
368 	__m128i vf       = _mm_setzero_si128();
369 	__m128i vh       = _mm_setzero_si128();
370 	__m128i vhd      = _mm_setzero_si128();
371 	__m128i vhdtmp   = _mm_setzero_si128();
372 	__m128i vtmp     = _mm_setzero_si128();
373 
374 	assert_gt(sc_->refGapOpen(), 0);
375 	assert_leq(sc_->refGapOpen(), MAX_I16);
376 	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
377 	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
378 	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
379 
380 	// Set all elts to reference gap extension penalty
381 	assert_gt(sc_->refGapExtend(), 0);
382 	assert_leq(sc_->refGapExtend(), MAX_I16);
383 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
384 	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
385 	rfgape = _mm_shufflelo_epi16(rfgape, 0);
386 	rfgape = _mm_shuffle_epi32(rfgape, 0);
387 
388 	// Set all elts to read gap open penalty
389 	assert_gt(sc_->readGapOpen(), 0);
390 	assert_leq(sc_->readGapOpen(), MAX_I16);
391 	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
392 	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
393 	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
394 
395 	// Set all elts to read gap extension penalty
396 	assert_gt(sc_->readGapExtend(), 0);
397 	assert_leq(sc_->readGapExtend(), MAX_I16);
398 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
399 	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
400 	rdgape = _mm_shufflelo_epi16(rdgape, 0);
401 	rdgape = _mm_shuffle_epi32(rdgape, 0);
402 
403 	// Set all elts to 0x8000 (min value for signed 16-bit)
404 	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
405 	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
406 
407 	// Set all elts to 0x7fff (max value for signed 16-bit)
408 	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
409 	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
410 
411 	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
412 	vlolsw = _mm_shuffle_epi32(vlo, 0);
413 	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
414 
415 	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
416 	vhilsw = _mm_shuffle_epi32(vhi, 0);
417 	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
418 
419 	// Points to a long vector of __m128i where each element is a block of
420 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
421 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
422 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
423 	// Blocks of cells are organized in the same interleaved manner as they are
424 	// calculated by the Farrar algorithm.
425 	const __m128i *pvScore; // points into the query profile
426 
427 	const size_t colstride = ROWSTRIDE_2COL * iter;
428 
429 	// Initialize the H and E vectors in the first matrix column
430 	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
431 	/* __m128i *pvFLeft = vbuf_l + 1; */ __m128i *pvFRight = vbuf_r + 1;
432 	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
433 
434 	// Maximum score in final row
435 	bool found = false;
436 	TCScore lrmax = MIN_I16;
437 
438 	for(size_t i = 0; i < iter; i++) {
439 		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
440 		// Could initialize Hs to high or low.  If high, cells in the lower
441 		// triangle will have somewhat more legitiate scores, but still won't
442 		// be exhaustively scored.
443 		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
444 	}
445 
446 	assert_gt(sc_->gapbar, 0);
447 	size_t nfixup = 0;
448 
449 	// Fill in the table as usual but instead of using the same gap-penalty
450 	// vector for each iteration of the inner loop, load words out of a
451 	// pre-calculated gap vector parallel to the query profile.  The pre-
452 	// calculated gap vectors enforce the gap barrier constraint by making it
453 	// infinitely costly to introduce a gap in barrier rows.
454 	//
455 	// AND use a separate loop to fill in the first row of the table, enforcing
456 	// the st_ constraints in the process.  This is awkward because it
457 	// separates the processing of the first row from the others and might make
458 	// it difficult to use the first-row results in the next row, but it might
459 	// be the simplest and least disruptive way to deal with the st_ constraint.
460 
461 	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
462 		// Swap left and right; vbuf_l is the vector on the left, which we
463 		// generally load from, and vbuf_r is the vector on the right, which we
464 		// generally store to.
465 		swap(vbuf_l, vbuf_r);
466 		pvELeft = vbuf_l + 0; pvERight = vbuf_r + 0;
467 		/* pvFLeft = vbuf_l + 1; */ pvFRight = vbuf_r + 1;
468 		pvHLeft = vbuf_l + 2; pvHRight = vbuf_r + 2;
469 
470 		// Fetch the appropriate query profile.  Note that elements of rf_ must
471 		// be numbers, not masks.
472 		const int refc = (int)rf_[i];
473 
474 		// Fetch the appropriate query profile
475 		size_t off = (size_t)firsts5[refc] * iter * 2;
476 		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
477 
478 		// Set all cells to low value
479 		vf = _mm_cmpeq_epi16(vf, vf);
480 		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
481 		vf = _mm_or_si128(vf, vlolsw);
482 
483 		// Load H vector from the final row of the previous column
484 		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
485 		// Shift 2 bytes down so that topmost (least sig) cell gets 0
486 		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
487 		// Fill topmost (least sig) cell with high value
488 		vh = _mm_or_si128(vh, vhilsw);
489 
490 		// For each character in the reference text:
491 		size_t j;
492 		for(j = 0; j < iter; j++) {
493 			// Load cells from E, calculated previously
494 			ve = _mm_load_si128(pvELeft);
495 			vhd = _mm_load_si128(pvHLeft);
496 			assert_all_lt(ve, vhi);
497 			pvELeft += ROWSTRIDE_2COL;
498 
499 			// Store cells in F, calculated previously
500 			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
501 			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
502 			_mm_store_si128(pvFRight, vf);
503 			pvFRight += ROWSTRIDE_2COL;
504 
505 			// Factor in query profile (matches and mismatches)
506 			vh = _mm_adds_epi16(vh, pvScore[0]);
507 
508 			// Update H, factoring in E and F
509 			vh = _mm_max_epi16(vh, vf);
510 
511 			// Update vE value
512 			vhdtmp = vhd;
513 			vhd = _mm_subs_epi16(vhd, rdgapo);
514 			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
515 			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
516 			ve = _mm_subs_epi16(ve, rdgape);
517 			ve = _mm_max_epi16(ve, vhd);
518 			vh = _mm_max_epi16(vh, ve);
519 
520 			// Save the new vH values
521 			_mm_store_si128(pvHRight, vh);
522 			pvHRight += ROWSTRIDE_2COL;
523 			vtmp = vh;
524 			assert_all_lt(ve, vhi);
525 
526 			// Load the next h value
527 			vh = vhdtmp;
528 			pvHLeft += ROWSTRIDE_2COL;
529 
530 			// Save E values
531 			_mm_store_si128(pvERight, ve);
532 			pvERight += ROWSTRIDE_2COL;
533 
534 			// Update vf value
535 			vtmp = _mm_subs_epi16(vtmp, rfgapo);
536 			vf = _mm_subs_epi16(vf, rfgape);
537 			assert_all_lt(vf, vhi);
538 			vf = _mm_max_epi16(vf, vtmp);
539 
540 			pvScore += 2; // move on to next query profile / gap veto
541 		}
542 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
543 		pvFRight -= colstride; // reset to start of column
544 		vtmp = _mm_load_si128(pvFRight);
545 
546 		pvHRight -= colstride; // reset to start of column
547 		vh = _mm_load_si128(pvHRight);
548 
549 		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
550 
551 		// vf from last row gets shifted down by one to overlay the first row
552 		// rfgape has already been subtracted from it.
553 		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
554 		vf = _mm_or_si128(vf, vlolsw);
555 
556 		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
557 		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
558 		vf = _mm_max_epi16(vtmp, vf);
559 		vtmp = _mm_cmpgt_epi16(vf, vtmp);
560 		int cmp = _mm_movemask_epi8(vtmp);
561 
562 		// If any element of vtmp is greater than H - gap-open...
563 		j = 0;
564 		while(cmp != 0x0000) {
565 			// Store this vf
566 			_mm_store_si128(pvFRight, vf);
567 			pvFRight += ROWSTRIDE_2COL;
568 
569 			// Update vh w/r/t new vf
570 			vh = _mm_max_epi16(vh, vf);
571 
572 			// Save vH values
573 			_mm_store_si128(pvHRight, vh);
574 			pvHRight += ROWSTRIDE_2COL;
575 
576 			pvScore += 2;
577 
578 			assert_lt(j, iter);
579 			if(++j == iter) {
580 				pvFRight -= colstride;
581 				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
582 				pvHRight -= colstride;
583 				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
584 				pvScore = d.profbuf_.ptr() + off + 1;
585 				j = 0;
586 				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
587 				vf = _mm_or_si128(vf, vlolsw);
588 			} else {
589 				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
590 				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
591 			}
592 
593 			// Update F with another gap extension
594 			vf = _mm_subs_epi16(vf, rfgape);
595 			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
596 			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
597 			vf = _mm_max_epi16(vtmp, vf);
598 			vtmp = _mm_cmpgt_epi16(vf, vtmp);
599 			cmp = _mm_movemask_epi8(vtmp);
600 			nfixup++;
601 		}
602 
603 
604 		// Check in the last row for the maximum so far
605 		__m128i *vtmp = vbuf_r + 2 /* H */ + (d.lastIter_ * ROWSTRIDE_2COL);
606 		// Note: we may not want to extract from the final row
607 		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
608 		found = true;
609 		if(lr > lrmax) {
610 			lrmax = lr;
611 		}
612 
613 		// Now we'd like to know whether the bottommost element of the right
614 		// column is a candidate we might backtrace from.  First question is:
615 		// did it exceed the minimum score threshold?
616 		TAlScore score = (TAlScore)(lr - 0x7fff);
617 		if(lr == MIN_I16) {
618 			score = MIN_I64;
619 		}
620 		if(!debug && score >= minsc_) {
621 			DpBtCandidate cand(dpRows() - 1, i - rfi_, score);
622 			btdiag_.add(i - rfi_, cand);
623 		}
624 
625 		// Save some elements to checkpoints
626 		if(checkpoint) {
627 
628 			__m128i *pvE = vbuf_r + 0;
629 			__m128i *pvF = vbuf_r + 1;
630 			__m128i *pvH = vbuf_r + 2;
631 			size_t coli = i - rfi_;
632 			if(coli < cper_.locol_) cper_.locol_ = coli;
633 			if(coli > cper_.hicol_) cper_.hicol_ = coli;
634 
635 			if(cperTri_) {
636 				size_t rc_mod = coli & cper_.lomask_;
637 				assert_lt(rc_mod, cper_.per_);
638 				int64_t row = -(int64_t)rc_mod-1;
639 				int64_t row_mod = row;
640 				int64_t row_div = 0;
641 				size_t idx = coli >> cper_.perpow2_;
642 				size_t idxrow = idx * cper_.nrow_;
643 				assert_eq(4, ROWSTRIDE_2COL);
644 				bool done = false;
645 				while(true) {
646 					row += (cper_.per_ - 2);
647 					row_mod += (cper_.per_ - 2);
648 					for(size_t j = 0; j < 2; j++) {
649 						row++;
650 						row_mod++;
651 						if(row >= 0 && (size_t)row < cper_.nrow_) {
652 							// Update row divided by iter_ and mod iter_
653 							while(row_mod >= (int64_t)iter) {
654 								row_mod -= (int64_t)iter;
655 								row_div++;
656 							}
657 							size_t delt = idxrow + row;
658 							size_t vecoff = (row_mod << 5) + row_div;
659 							assert_lt(row_div, 8);
660 							int16_t h_sc = ((int16_t*)pvH)[vecoff];
661 							int16_t e_sc = ((int16_t*)pvE)[vecoff];
662 							int16_t f_sc = ((int16_t*)pvF)[vecoff];
663 							if(h_sc != MIN_I16) h_sc -= 0x7fff;
664 							if(e_sc != MIN_I16) e_sc -= 0x7fff;
665 							if(f_sc != MIN_I16) f_sc -= 0x7fff;
666 							assert_leq(h_sc, cper_.perf_);
667 							assert_leq(e_sc, cper_.perf_);
668 							assert_leq(f_sc, cper_.perf_);
669 							CpQuad *qdiags = ((j == 0) ? cper_.qdiag1s_.ptr() : cper_.qdiag2s_.ptr());
670 							qdiags[delt].sc[0] = h_sc;
671 							qdiags[delt].sc[1] = e_sc;
672 							qdiags[delt].sc[2] = f_sc;
673 						} // if(row >= 0 && row < nrow_)
674 						else if(row >= 0 && (size_t)row >= cper_.nrow_) {
675 							done = true;
676 							break;
677 						}
678 					} // end of loop over anti-diags
679 					if(done) {
680 						break;
681 					}
682 					idx++;
683 					idxrow += cper_.nrow_;
684 				}
685 			} else {
686 				// If this is the first column, take this opportunity to
687 				// pre-calculate the coordinates of the elements we're going to
688 				// checkpoint.
689 				if(coli == 0) {
690 					size_t cpi    = cper_.per_-1;
691 					size_t cpimod = cper_.per_-1;
692 					size_t cpidiv = 0;
693 					cper_.commitMap_.clear();
694 					while(cpi < cper_.nrow_) {
695 						while(cpimod >= iter) {
696 							cpimod -= iter;
697 							cpidiv++;
698 						}
699 						size_t vecoff = (cpimod << 5) + cpidiv;
700 						cper_.commitMap_.push_back(vecoff);
701 						cpi += cper_.per_;
702 						cpimod += cper_.per_;
703 					}
704 				}
705 				// Save all the rows
706 				size_t rowoff = 0;
707 				size_t sz = cper_.commitMap_.size();
708 				for(size_t i = 0; i < sz; i++, rowoff += cper_.ncol_) {
709 					size_t vecoff = cper_.commitMap_[i];
710 					int16_t h_sc = ((int16_t*)pvH)[vecoff];
711 					int16_t e_sc = ((int16_t*)pvE)[vecoff];
712 					int16_t f_sc = ((int16_t*)pvF)[vecoff];
713 					if(h_sc != MIN_I16) h_sc -= 0x7fff;
714 					if(e_sc != MIN_I16) e_sc -= 0x7fff;
715 					if(f_sc != MIN_I16) f_sc -= 0x7fff;
716 					assert_leq(h_sc, cper_.perf_);
717 					assert_leq(e_sc, cper_.perf_);
718 					assert_leq(f_sc, cper_.perf_);
719 					CpQuad& dst = cper_.qrows_[rowoff + coli];
720 					dst.sc[0] = h_sc;
721 					dst.sc[1] = e_sc;
722 					dst.sc[2] = f_sc;
723 				}
724 				// Is this a column we'd like to checkpoint?
725 				if((coli & cper_.lomask_) == cper_.lomask_) {
726 					// Save the column using memcpys
727 					assert_gt(coli, 0);
728 					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
729 					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
730 					__m128i *dst = cper_.qcols_.ptr() + coloff;
731 					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
732 				}
733 			}
734 			if(cper_.debug_) {
735 				// Save the column using memcpys
736 				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
737 				size_t coloff = coli * wordspercol;
738 				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
739 				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
740 			}
741 		}
742 	}
743 
744 	// Update metrics
745 	if(!debug) {
746 		size_t ninner = (rff_ - rfi_) * iter;
747 		met.col   += (rff_ - rfi_);             // DP columns
748 		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
749 		met.inner += ninner;                    // DP inner loop iters
750 		met.fixup += nfixup;                    // DP fixup loop iters
751 	}
752 
753 	flag = 0;
754 
755 	// Did we find a solution?
756 	TAlScore score = MIN_I64;
757 	if(!found) {
758 		flag = -1; // no
759 		if(!debug) met.dpfail++;
760 		return MIN_I64;
761 	} else {
762 		score = (TAlScore)(lrmax - 0x7fff);
763 		if(score < minsc_) {
764 			flag = -1; // no
765 			if(!debug) met.dpfail++;
766 			return score;
767 		}
768 	}
769 
770 	// Could we have saturated?
771 	if(lrmax == MIN_I16) {
772 		flag = -2; // yes
773 		if(!debug) met.dpsat++;
774 		return MIN_I64;
775 	}
776 
777 	// Now take all the backtrace candidates in the btdaig_ structure and
778 	// dump them into the btncand_ array.  They'll be sorted later.
779 	if(!debug) {
780 		btdiag_.dump(btncand_);
781 		assert(!btncand_.empty());
782 	}
783 
784 	// Return largest score
785 	if(!debug) met.dpsucc++;
786 	return score;
787 }
788 
789 /**
790  * Solve the current alignment problem using SSE instructions that operate on 8
791  * signed 16-bit values packed into a single 128-bit register.
792  */
alignNucleotidesEnd2EndSseI16(int & flag,bool debug)793 TAlScore SwAligner::alignNucleotidesEnd2EndSseI16(int& flag, bool debug) {
794 	assert_leq(rdf_, rd_->length());
795 	assert_leq(rdf_, qu_->length());
796 	assert_lt(rfi_, rff_);
797 	assert_lt(rdi_, rdf_);
798 	assert_eq(rd_->length(), qu_->length());
799 	assert_geq(sc_->gapbar, 1);
800 	assert(repOk());
801 #ifndef NDEBUG
802 	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
803 		assert_range(0, 16, (int)rf_[i]);
804 	}
805 #endif
806 
807 	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
808 	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
809 	if(!debug) met.dp++;
810 	buildQueryProfileEnd2EndSseI16(fw_);
811 	assert(!d.profbuf_.empty());
812 
813 	assert_eq(0, d.maxBonus_);
814 	size_t iter =
815 		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
816 
817 	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
818 	// implementation:
819 	//
820 	//  http://sites.google.com/site/farrarmichael/smith-waterman
821 	//
822 	// Much of the implmentation below is adapted from Michael's code.
823 
824 	// Set all elts to reference gap open penalty
825 	__m128i rfgapo   = _mm_setzero_si128();
826 	__m128i rfgape   = _mm_setzero_si128();
827 	__m128i rdgapo   = _mm_setzero_si128();
828 	__m128i rdgape   = _mm_setzero_si128();
829 	__m128i vlo      = _mm_setzero_si128();
830 	__m128i vhi      = _mm_setzero_si128();
831 	__m128i vhilsw   = _mm_setzero_si128();
832 	__m128i vlolsw   = _mm_setzero_si128();
833 	__m128i ve       = _mm_setzero_si128();
834 	__m128i vf       = _mm_setzero_si128();
835 	__m128i vh       = _mm_setzero_si128();
836 #if 0
837 	__m128i vhd      = _mm_setzero_si128();
838 	__m128i vhdtmp   = _mm_setzero_si128();
839 #endif
840 	__m128i vtmp     = _mm_setzero_si128();
841 
842 	assert_gt(sc_->refGapOpen(), 0);
843 	assert_leq(sc_->refGapOpen(), MAX_I16);
844 	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
845 	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
846 	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
847 
848 	// Set all elts to reference gap extension penalty
849 	assert_gt(sc_->refGapExtend(), 0);
850 	assert_leq(sc_->refGapExtend(), MAX_I16);
851 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
852 	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
853 	rfgape = _mm_shufflelo_epi16(rfgape, 0);
854 	rfgape = _mm_shuffle_epi32(rfgape, 0);
855 
856 	// Set all elts to read gap open penalty
857 	assert_gt(sc_->readGapOpen(), 0);
858 	assert_leq(sc_->readGapOpen(), MAX_I16);
859 	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
860 	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
861 	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
862 
863 	// Set all elts to read gap extension penalty
864 	assert_gt(sc_->readGapExtend(), 0);
865 	assert_leq(sc_->readGapExtend(), MAX_I16);
866 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
867 	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
868 	rdgape = _mm_shufflelo_epi16(rdgape, 0);
869 	rdgape = _mm_shuffle_epi32(rdgape, 0);
870 
871 	// Set all elts to 0x8000 (min value for signed 16-bit)
872 	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
873 	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
874 
875 	// Set all elts to 0x7fff (max value for signed 16-bit)
876 	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
877 	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
878 
879 	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
880 	vlolsw = _mm_shuffle_epi32(vlo, 0);
881 	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
882 
883 	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
884 	vhilsw = _mm_shuffle_epi32(vhi, 0);
885 	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
886 
887 	// Points to a long vector of __m128i where each element is a block of
888 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
889 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
890 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
891 	// Blocks of cells are organized in the same interleaved manner as they are
892 	// calculated by the Farrar algorithm.
893 	const __m128i *pvScore; // points into the query profile
894 
895 	d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
896 	const size_t colstride = d.mat_.colstride();
897 	assert_eq(ROWSTRIDE, colstride / iter);
898 
899 	// Initialize the H and E vectors in the first matrix column
900 	__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
901 	__m128i *pvETmp = d.mat_.evec(0, 0);
902 
903 	// Maximum score in final row
904 	bool found = false;
905 	TCScore lrmax = MIN_I16;
906 
907 	for(size_t i = 0; i < iter; i++) {
908 		_mm_store_si128(pvETmp, vlo);
909 		// Could initialize Hs to high or low.  If high, cells in the lower
910 		// triangle will have somewhat more legitiate scores, but still won't
911 		// be exhaustively scored.
912 		_mm_store_si128(pvHTmp, vlo);
913 		pvETmp += ROWSTRIDE;
914 		pvHTmp += ROWSTRIDE;
915 	}
916 	// These are swapped just before the innermost loop
917 	__m128i *pvHStore = d.mat_.hvec(0, 0);
918 	__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
919 	__m128i *pvELoad  = d.mat_.evec(0, 0);
920 	__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
921 	__m128i *pvFStore = d.mat_.fvec(0, 0);
922 	__m128i *pvFTmp   = NULL;
923 
924 	assert_gt(sc_->gapbar, 0);
925 	size_t nfixup = 0;
926 
927 	// Fill in the table as usual but instead of using the same gap-penalty
928 	// vector for each iteration of the inner loop, load words out of a
929 	// pre-calculated gap vector parallel to the query profile.  The pre-
930 	// calculated gap vectors enforce the gap barrier constraint by making it
931 	// infinitely costly to introduce a gap in barrier rows.
932 	//
933 	// AND use a separate loop to fill in the first row of the table, enforcing
934 	// the st_ constraints in the process.  This is awkward because it
935 	// separates the processing of the first row from the others and might make
936 	// it difficult to use the first-row results in the next row, but it might
937 	// be the simplest and least disruptive way to deal with the st_ constraint.
938 
939 	colstop_ = rff_ - 1;
940 	lastsolcol_ = 0;
941 
942 	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
943 		assert(pvFStore == d.mat_.fvec(0, i - rfi_));
944 		assert(pvHStore == d.mat_.hvec(0, i - rfi_));
945 
946 		// Fetch the appropriate query profile.  Note that elements of rf_ must
947 		// be numbers, not masks.
948 		const int refc = (int)rf_[i];
949 		size_t off = (size_t)firsts5[refc] * iter * 2;
950 		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
951 
952 		// Set all cells to low value
953 		vf = _mm_cmpeq_epi16(vf, vf);
954 		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
955 		vf = _mm_or_si128(vf, vlolsw);
956 
957 		// Load H vector from the final row of the previous column
958 		vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
959 		// Shift 2 bytes down so that topmost (least sig) cell gets 0
960 		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
961 		// Fill topmost (least sig) cell with high value
962 		vh = _mm_or_si128(vh, vhilsw);
963 
964 		// For each character in the reference text:
965 		size_t j;
966 		for(j = 0; j < iter; j++) {
967 			// Load cells from E, calculated previously
968 			ve = _mm_load_si128(pvELoad);
969 #if 0
970 			vhd = _mm_load_si128(pvHLoad);
971 #endif
972 			assert_all_lt(ve, vhi);
973 			pvELoad += ROWSTRIDE;
974 
975 			// Store cells in F, calculated previously
976 			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
977 			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
978 			_mm_store_si128(pvFStore, vf);
979 			pvFStore += ROWSTRIDE;
980 
981 			// Factor in query profile (matches and mismatches)
982 			vh = _mm_adds_epi16(vh, pvScore[0]);
983 
984 			// Update H, factoring in E and F
985 			vh = _mm_max_epi16(vh, ve);
986 			vh = _mm_max_epi16(vh, vf);
987 
988 			// Save the new vH values
989 			_mm_store_si128(pvHStore, vh);
990 			pvHStore += ROWSTRIDE;
991 
992 			// Update vE value
993 			vtmp = vh;
994 #if 0
995 			vhdtmp = vhd;
996 			vhd = _mm_subs_epi16(vhd, rdgapo);
997 			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
998 			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
999 			ve = _mm_subs_epi16(ve, rdgape);
1000 			ve = _mm_max_epi16(ve, vhd);
1001 #else
1002 			vh = _mm_subs_epi16(vh, rdgapo);
1003 			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
1004 			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
1005 			ve = _mm_subs_epi16(ve, rdgape);
1006 			ve = _mm_max_epi16(ve, vh);
1007 #endif
1008 			assert_all_lt(ve, vhi);
1009 
1010 			// Load the next h value
1011 #if 0
1012 			vh = vhdtmp;
1013 #else
1014 			vh = _mm_load_si128(pvHLoad);
1015 #endif
1016 			pvHLoad += ROWSTRIDE;
1017 
1018 			// Save E values
1019 			_mm_store_si128(pvEStore, ve);
1020 			pvEStore += ROWSTRIDE;
1021 
1022 			// Update vf value
1023 			vtmp = _mm_subs_epi16(vtmp, rfgapo);
1024 			vf = _mm_subs_epi16(vf, rfgape);
1025 			assert_all_lt(vf, vhi);
1026 			vf = _mm_max_epi16(vf, vtmp);
1027 
1028 			pvScore += 2; // move on to next query profile / gap veto
1029 		}
1030 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
1031 		pvFTmp = pvFStore;
1032 		pvFStore -= colstride; // reset to start of column
1033 		vtmp = _mm_load_si128(pvFStore);
1034 
1035 		pvHStore -= colstride; // reset to start of column
1036 		vh = _mm_load_si128(pvHStore);
1037 
1038 #if 0
1039 #else
1040 		pvEStore -= colstride; // reset to start of column
1041 		ve = _mm_load_si128(pvEStore);
1042 #endif
1043 
1044 		pvHLoad = pvHStore;    // new pvHLoad = pvHStore
1045 		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
1046 
1047 		// vf from last row gets shifted down by one to overlay the first row
1048 		// rfgape has already been subtracted from it.
1049 		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
1050 		vf = _mm_or_si128(vf, vlolsw);
1051 
1052 		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
1053 		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
1054 		vf = _mm_max_epi16(vtmp, vf);
1055 		vtmp = _mm_cmpgt_epi16(vf, vtmp);
1056 		int cmp = _mm_movemask_epi8(vtmp);
1057 
1058 		// If any element of vtmp is greater than H - gap-open...
1059 		j = 0;
1060 		while(cmp != 0x0000) {
1061 			// Store this vf
1062 			_mm_store_si128(pvFStore, vf);
1063 			pvFStore += ROWSTRIDE;
1064 
1065 			// Update vh w/r/t new vf
1066 			vh = _mm_max_epi16(vh, vf);
1067 
1068 			// Save vH values
1069 			_mm_store_si128(pvHStore, vh);
1070 			pvHStore += ROWSTRIDE;
1071 
1072 			// Update E in case it can be improved using our new vh
1073 #if 0
1074 #else
1075 			vh = _mm_subs_epi16(vh, rdgapo);
1076 			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
1077 			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
1078 			ve = _mm_max_epi16(ve, vh);
1079 			_mm_store_si128(pvEStore, ve);
1080 			pvEStore += ROWSTRIDE;
1081 #endif
1082 			pvScore += 2;
1083 
1084 			assert_lt(j, iter);
1085 			if(++j == iter) {
1086 				pvFStore -= colstride;
1087 				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
1088 				pvHStore -= colstride;
1089 				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
1090 #if 0
1091 #else
1092 				pvEStore -= colstride;
1093 				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
1094 #endif
1095 				pvScore = d.profbuf_.ptr() + off + 1;
1096 				j = 0;
1097 				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
1098 				vf = _mm_or_si128(vf, vlolsw);
1099 			} else {
1100 				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
1101 				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
1102 #if 0
1103 #else
1104 				ve = _mm_load_si128(pvEStore);     // load next vh ASAP
1105 #endif
1106 			}
1107 
1108 			// Update F with another gap extension
1109 			vf = _mm_subs_epi16(vf, rfgape);
1110 			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
1111 			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
1112 			vf = _mm_max_epi16(vtmp, vf);
1113 			vtmp = _mm_cmpgt_epi16(vf, vtmp);
1114 			cmp = _mm_movemask_epi8(vtmp);
1115 			nfixup++;
1116 		}
1117 
1118 #ifndef NDEBUG
1119 		if((rand() & 15) == 0) {
1120 			// This is a work-intensive sanity check; each time we finish filling
1121 			// a column, we check that each H, E, and F is sensible.
1122 			for(size_t k = 0; k < dpRows(); k++) {
1123 				assert(cellOkEnd2EndI16(
1124 					d,
1125 					k,                   // row
1126 					i - rfi_,            // col
1127 					refc,                // reference mask
1128 					(int)(*rd_)[rdi_+k], // read char
1129 					(int)(*qu_)[rdi_+k], // read quality
1130 					*sc_));              // scoring scheme
1131 			}
1132 		}
1133 #endif
1134 
1135 		__m128i *vtmp = d.mat_.hvec(d.lastIter_, i-rfi_);
1136 		// Note: we may not want to extract from the final row
1137 		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
1138 		found = true;
1139 		if(lr > lrmax) {
1140 			lrmax = lr;
1141 		}
1142 
1143 		// pvELoad and pvHLoad are already where they need to be
1144 
1145 		// Adjust the load and store vectors here.
1146 		pvHStore = pvHLoad + colstride;
1147 		pvEStore = pvELoad + colstride;
1148 		pvFStore = pvFTmp;
1149 	}
1150 
1151 	// Update metrics
1152 	if(!debug) {
1153 		size_t ninner = (rff_ - rfi_) * iter;
1154 		met.col   += (rff_ - rfi_);             // DP columns
1155 		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
1156 		met.inner += ninner;                    // DP inner loop iters
1157 		met.fixup += nfixup;                    // DP fixup loop iters
1158 	}
1159 
1160 	flag = 0;
1161 
1162 	// Did we find a solution?
1163 	TAlScore score = MIN_I64;
1164 	if(!found) {
1165 		flag = -1; // no
1166 		if(!debug) met.dpfail++;
1167 		return MIN_I64;
1168 	} else {
1169 		score = (TAlScore)(lrmax - 0x7fff);
1170 		if(score < minsc_) {
1171 			flag = -1; // no
1172 			if(!debug) met.dpfail++;
1173 			return score;
1174 		}
1175 	}
1176 
1177 	// Could we have saturated?
1178 	if(lrmax == MIN_I16) {
1179 		flag = -2; // yes
1180 		if(!debug) met.dpsat++;
1181 		return MIN_I64;
1182 	}
1183 
1184 	// Return largest score
1185 	if(!debug) met.dpsucc++;
1186 	return score;
1187 }
1188 
1189 /**
1190  * Given a filled-in DP table, populate the btncand_ list with candidate cells
1191  * that might be at the ends of valid alignments.  No need to do this unless
1192  * the maximum score returned by the align*() func is >= the minimum.
1193  *
1194  * Only cells that are exhaustively scored are candidates.  Those are the
1195  * cells inside the shape made of o's in this:
1196  *
1197  *  |-maxgaps-|
1198  *  *********************************    -
1199  *   ********************************    |
1200  *    *******************************    |
1201  *     ******************************    |
1202  *      *****************************    |
1203  *       **************************** read len
1204  *        ***************************    |
1205  *         **************************    |
1206  *          *************************    |
1207  *           ************************    |
1208  *            ***********oooooooooooo    -
1209  *            |-maxgaps-|
1210  *  |-readlen-|
1211  *  |-------skip--------|
1212  *
1213  * And it's possible for the shape to be truncated on the left and right sides.
1214  *
1215  *
1216  */
gatherCellsNucleotidesEnd2EndSseI16(TAlScore best)1217 bool SwAligner::gatherCellsNucleotidesEnd2EndSseI16(TAlScore best) {
1218 	// What's the minimum number of rows that can possibly be spanned by an
1219 	// alignment that meets the minimum score requirement?
1220 	assert(sse16succ_);
1221 	const size_t ncol = rff_ - rfi_;
1222 	const size_t nrow = dpRows();
1223 	assert_gt(nrow, 0);
1224 	btncand_.clear();
1225 	btncanddone_.clear();
1226 	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
1227 	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
1228 	assert(!d.profbuf_.empty());
1229 	const size_t colstride = d.mat_.colstride();
1230 	ASSERT_ONLY(bool sawbest = false);
1231 	__m128i *pvH = d.mat_.hvec(d.lastIter_, 0);
1232 	for(size_t j = 0; j < ncol; j++) {
1233 		TAlScore sc = (TAlScore)(((TCScore*)pvH)[d.lastWord_] - 0x7fff);
1234 		assert_leq(sc, best);
1235 		ASSERT_ONLY(sawbest = (sawbest || sc == best));
1236 		if(sc >= minsc_) {
1237 			// Yes, this is legit
1238 			met.gathsol++;
1239 			btncand_.expand();
1240 			btncand_.back().init(nrow-1, j, sc);
1241 		}
1242 		pvH += colstride;
1243 	}
1244 	assert(sawbest);
1245 	if(!btncand_.empty()) {
1246 		d.mat_.initMasks();
1247 	}
1248 	return !btncand_.empty();
1249 }
1250 
1251 #define MOVE_VEC_PTR_UP(vec, rowvec, rowelt) { \
1252 	if(rowvec == 0) { \
1253 		rowvec += d.mat_.nvecrow_; \
1254 		vec += d.mat_.colstride_; \
1255 		rowelt--; \
1256 	} \
1257 	rowvec--; \
1258 	vec -= ROWSTRIDE; \
1259 }
1260 
1261 #define MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt) { vec -= d.mat_.colstride_; }
1262 
1263 #define MOVE_VEC_PTR_UPLEFT(vec, rowvec, rowelt) { \
1264  	MOVE_VEC_PTR_UP(vec, rowvec, rowelt); \
1265  	MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt); \
1266 }
1267 
1268 #define MOVE_ALL_LEFT() { \
1269 	MOVE_VEC_PTR_LEFT(cur_vec, rowvec, rowelt); \
1270 	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
1271 	MOVE_VEC_PTR_LEFT(up_vec, up_rowvec, up_rowelt); \
1272 	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
1273 }
1274 
1275 #define MOVE_ALL_UP() { \
1276 	MOVE_VEC_PTR_UP(cur_vec, rowvec, rowelt); \
1277 	MOVE_VEC_PTR_UP(left_vec, left_rowvec, left_rowelt); \
1278 	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
1279 	MOVE_VEC_PTR_UP(upleft_vec, upleft_rowvec, upleft_rowelt); \
1280 }
1281 
1282 #define MOVE_ALL_UPLEFT() { \
1283 	MOVE_VEC_PTR_UPLEFT(cur_vec, rowvec, rowelt); \
1284 	MOVE_VEC_PTR_UPLEFT(left_vec, left_rowvec, left_rowelt); \
1285 	MOVE_VEC_PTR_UPLEFT(up_vec, up_rowvec, up_rowelt); \
1286 	MOVE_VEC_PTR_UPLEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
1287 }
1288 
1289 #define NEW_ROW_COL(row, col) { \
1290 	rowelt = row / d.mat_.nvecrow_; \
1291 	rowvec = row % d.mat_.nvecrow_; \
1292 	eltvec = (col * d.mat_.colstride_) + (rowvec * ROWSTRIDE); \
1293 	cur_vec = d.mat_.matbuf_.ptr() + eltvec; \
1294 	left_vec = cur_vec; \
1295 	left_rowelt = rowelt; \
1296 	left_rowvec = rowvec; \
1297 	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
1298 	up_vec = cur_vec; \
1299 	up_rowelt = rowelt; \
1300 	up_rowvec = rowvec; \
1301 	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
1302 	upleft_vec = up_vec; \
1303 	upleft_rowelt = up_rowelt; \
1304 	upleft_rowvec = up_rowvec; \
1305 	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
1306 }
1307 
1308 /**
1309  * Given the dynamic programming table and a cell, trace backwards from the
1310  * cell and install the edits and score/penalty in the appropriate fields
1311  * of res.  The RandomSource is used to break ties among equally good ways
1312  * of tracing back.
1313  *
1314  * Whenever we enter a cell, we check whether the read/ref coordinates of
1315  * that cell correspond to a cell we traversed constructing a previous
1316  * alignment.  If so, we backtrack to the last decision point, mask out the
1317  * path that led to the previously observed cell, and continue along a
1318  * different path; or, if there are no more paths to try, we give up.
1319  *
1320  * If an alignment is found, 'off' is set to the alignment's upstream-most
1321  * reference character's offset into the chromosome and true is returned.
1322  * Otherwise, false is returned.
1323  */
backtraceNucleotidesEnd2EndSseI16(TAlScore escore,SwResult & res,size_t & off,size_t & nbts,size_t row,size_t col,RandomSource & rnd)1324 bool SwAligner::backtraceNucleotidesEnd2EndSseI16(
1325 	TAlScore       escore, // in: expected score
1326 	SwResult&      res,    // out: store results (edits and scores) here
1327 	size_t&        off,    // out: store diagonal projection of origin
1328 	size_t&        nbts,   // out: # backtracks
1329 	size_t         row,    // start in this row
1330 	size_t         col,    // start in this column
1331 	RandomSource&  rnd)    // random gen, to choose among equal paths
1332 {
1333 	assert_lt(row, dpRows());
1334 	assert_lt(col, (size_t)(rff_ - rfi_));
1335 	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
1336 	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
1337 	met.bt++;
1338 	assert(!d.profbuf_.empty());
1339 	assert_lt(row, rd_->length());
1340 	btnstack_.clear(); // empty the backtrack stack
1341 	btcells_.clear();  // empty the cells-so-far list
1342 	AlnScore score; score.score_ = 0;
1343 	score.gaps_ = score.ns_ = 0;
1344 	size_t origCol = col;
1345 	size_t gaps = 0, readGaps = 0, refGaps = 0;
1346 	res.alres.reset();
1347 	EList<Edit>& ned = res.alres.ned();
1348 	assert(ned.empty());
1349 	assert_gt(dpRows(), row);
1350 	size_t trimEnd = dpRows() - row - 1;
1351 	size_t trimBeg = 0;
1352 	size_t ct = SSEMatrix::H; // cell type
1353 	// Row and col in terms of where they fall in the SSE vector matrix
1354 	size_t rowelt, rowvec, eltvec;
1355 	size_t left_rowelt, up_rowelt, upleft_rowelt;
1356 	size_t left_rowvec, up_rowvec, upleft_rowvec;
1357 	__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
1358 	NEW_ROW_COL(row, col);
1359 	while((int)row >= 0) {
1360 		met.btcell++;
1361 		nbts++;
1362 		int readc = (*rd_)[rdi_ + row];
1363 		int refm  = (int)rf_[rfi_ + col];
1364 		int readq = (*qu_)[row];
1365 		assert_leq(col, origCol);
1366 		// Get score in this cell
1367 		bool empty = false, reportedThru, canMoveThru, branch = false;
1368 		int cur = SSEMatrix::H;
1369 		if(!d.mat_.reset_[row]) {
1370 			d.mat_.resetRow(row);
1371 		}
1372 		reportedThru = d.mat_.reportedThrough(row, col);
1373 		canMoveThru = true;
1374 		if(reportedThru) {
1375 			canMoveThru = false;
1376 		} else {
1377 			empty = false;
1378 			if(row > 0) {
1379 				assert_gt(row, 0);
1380 				size_t rowFromEnd = d.mat_.nrow() - row - 1;
1381 				bool gapsAllowed = true;
1382 				if(row < (size_t)sc_->gapbar ||
1383 				   rowFromEnd < (size_t)sc_->gapbar)
1384 				{
1385 					gapsAllowed = false;
1386 				}
1387 				const TAlScore floorsc = MIN_I64;
1388 				const int offsetsc = -0x7fff;
1389 				// Move to beginning of column/row
1390 				if(ct == SSEMatrix::E) { // AKA rdgap
1391 					assert_gt(col, 0);
1392 					TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::E))[rowelt] + offsetsc;
1393 					assert(gapsAllowed);
1394 					// Currently in the E matrix; incoming transition must come from the
1395 					// left.  It's either a gap open from the H matrix or a gap extend from
1396 					// the E matrix.
1397 					// TODO: save and restore origMask as well as mask
1398 					int origMask = 0, mask = 0;
1399 					// Get H score of cell to the left
1400 					TAlScore sc_h_left = ((TCScore*)(left_vec + SSEMatrix::H))[left_rowelt] + offsetsc;
1401 					if(sc_h_left > floorsc && sc_h_left - sc_->readGapOpen() == sc_cur) {
1402 						mask |= (1 << 0);
1403 					}
1404 					// Get E score of cell to the left
1405 					TAlScore sc_e_left = ((TCScore*)(left_vec + SSEMatrix::E))[left_rowelt] + offsetsc;
1406 					if(sc_e_left > floorsc && sc_e_left - sc_->readGapExtend() == sc_cur) {
1407 						mask |= (1 << 1);
1408 					}
1409 					origMask = mask;
1410 					assert(origMask > 0 || sc_cur <= sc_->match());
1411 					if(d.mat_.isEMaskSet(row, col)) {
1412 						mask = (d.mat_.masks_[row][col] >> 8) & 3;
1413 					}
1414 					if(mask == 3) {
1415 #if 1
1416 						// Pick H -> E cell
1417 						cur = SW_BT_OALL_READ_OPEN;
1418 						d.mat_.eMaskSet(row, col, 2); // might choose E later
1419 #else
1420 						if(rnd.nextU2()) {
1421 							// Pick H -> E cell
1422 							cur = SW_BT_OALL_READ_OPEN;
1423 							d.mat_.eMaskSet(row, col, 2); // might choose E later
1424 						} else {
1425 							// Pick E -> E cell
1426 							cur = SW_BT_RDGAP_EXTEND;
1427 							d.mat_.eMaskSet(row, col, 1); // might choose H later
1428 						}
1429 #endif
1430 						branch = true;
1431 					} else if(mask == 2) {
1432 						// I chose the E cell
1433 						cur = SW_BT_RDGAP_EXTEND;
1434 						d.mat_.eMaskSet(row, col, 0); // done
1435 					} else if(mask == 1) {
1436 						// I chose the H cell
1437 						cur = SW_BT_OALL_READ_OPEN;
1438 						d.mat_.eMaskSet(row, col, 0); // done
1439 					} else {
1440 						empty = true;
1441 						// It's empty, so the only question left is whether we should be
1442 						// allowed in terimnate in this cell.  If it's got a valid score
1443 						// then we *shouldn't* be allowed to terminate here because that
1444 						// means it's part of a larger alignment that was already reported.
1445 						canMoveThru = (origMask == 0);
1446 					}
1447 					assert(!empty || !canMoveThru);
1448 				} else if(ct == SSEMatrix::F) { // AKA rfgap
1449 					assert_gt(row, 0);
1450 					assert(gapsAllowed);
1451 					TAlScore sc_h_up = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
1452 					TAlScore sc_f_up = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
1453 					TAlScore sc_cur  = ((TCScore*)(cur_vec + SSEMatrix::F))[rowelt] + offsetsc;
1454 					// Currently in the F matrix; incoming transition must come from above.
1455 					// It's either a gap open from the H matrix or a gap extend from the F
1456 					// matrix.
1457 					// TODO: save and restore origMask as well as mask
1458 					int origMask = 0, mask = 0;
1459 					// Get H score of cell above
1460 					if(sc_h_up > floorsc && sc_h_up - sc_->refGapOpen() == sc_cur) {
1461 						mask |= (1 << 0);
1462 					}
1463 					// Get F score of cell above
1464 					if(sc_f_up > floorsc && sc_f_up - sc_->refGapExtend() == sc_cur) {
1465 						mask |= (1 << 1);
1466 					}
1467 					origMask = mask;
1468 					assert(origMask > 0 || sc_cur <= sc_->match());
1469 					if(d.mat_.isFMaskSet(row, col)) {
1470 						mask = (d.mat_.masks_[row][col] >> 11) & 3;
1471 					}
1472 					if(mask == 3) {
1473 #if 1
1474 						// I chose the H cell
1475 						cur = SW_BT_OALL_REF_OPEN;
1476 						d.mat_.fMaskSet(row, col, 2); // might choose E later
1477 #else
1478 						if(rnd.nextU2()) {
1479 							// I chose the H cell
1480 							cur = SW_BT_OALL_REF_OPEN;
1481 							d.mat_.fMaskSet(row, col, 2); // might choose E later
1482 						} else {
1483 							// I chose the F cell
1484 							cur = SW_BT_RFGAP_EXTEND;
1485 							d.mat_.fMaskSet(row, col, 1); // might choose E later
1486 						}
1487 #endif
1488 						branch = true;
1489 					} else if(mask == 2) {
1490 						// I chose the F cell
1491 						cur = SW_BT_RFGAP_EXTEND;
1492 						d.mat_.fMaskSet(row, col, 0); // done
1493 					} else if(mask == 1) {
1494 						// I chose the H cell
1495 						cur = SW_BT_OALL_REF_OPEN;
1496 						d.mat_.fMaskSet(row, col, 0); // done
1497 					} else {
1498 						empty = true;
1499 						// It's empty, so the only question left is whether we should be
1500 						// allowed in terimnate in this cell.  If it's got a valid score
1501 						// then we *shouldn't* be allowed to terminate here because that
1502 						// means it's part of a larger alignment that was already reported.
1503 						canMoveThru = (origMask == 0);
1504 					}
1505 					assert(!empty || !canMoveThru);
1506 				} else {
1507 					assert_eq(SSEMatrix::H, ct);
1508 					TAlScore sc_cur      = ((TCScore*)(cur_vec + SSEMatrix::H))[rowelt]    + offsetsc;
1509 					TAlScore sc_f_up     = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
1510 					TAlScore sc_h_up     = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
1511 					TAlScore sc_h_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::H))[left_rowelt]   + offsetsc) : floorsc;
1512 					TAlScore sc_e_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::E))[left_rowelt]   + offsetsc) : floorsc;
1513 					TAlScore sc_h_upleft = col > 0 ? (((TCScore*)(upleft_vec + SSEMatrix::H))[upleft_rowelt] + offsetsc) : floorsc;
1514 					TAlScore sc_diag     = sc_->score(readc, refm, readq - 33);
1515 					// TODO: save and restore origMask as well as mask
1516 					int origMask = 0, mask = 0;
1517 					if(gapsAllowed) {
1518 						if(sc_h_up     > floorsc && sc_cur == sc_h_up   - sc_->refGapOpen()) {
1519 							mask |= (1 << 0);
1520 						}
1521 						if(sc_h_left   > floorsc && sc_cur == sc_h_left - sc_->readGapOpen()) {
1522 							mask |= (1 << 1);
1523 						}
1524 						if(sc_f_up     > floorsc && sc_cur == sc_f_up   - sc_->refGapExtend()) {
1525 							mask |= (1 << 2);
1526 						}
1527 						if(sc_e_left   > floorsc && sc_cur == sc_e_left - sc_->readGapExtend()) {
1528 							mask |= (1 << 3);
1529 						}
1530 					}
1531 					if(sc_h_upleft > floorsc && sc_cur == sc_h_upleft + sc_diag) {
1532 						mask |= (1 << 4);
1533 					}
1534 					origMask = mask;
1535 					assert(origMask > 0 || sc_cur <= sc_->match());
1536 					if(d.mat_.isHMaskSet(row, col)) {
1537 						mask = (d.mat_.masks_[row][col] >> 2) & 31;
1538 					}
1539 					assert(gapsAllowed || mask == (1 << 4) || mask == 0);
1540 					int opts = alts5[mask];
1541 					int select = -1;
1542 					if(opts == 1) {
1543 						select = firsts5[mask];
1544 						assert_geq(mask, 0);
1545 						d.mat_.hMaskSet(row, col, 0);
1546 					} else if(opts > 1) {
1547 #if 1
1548 						if(       (mask & 16) != 0) {
1549 							select = 4; // H diag
1550 						} else if((mask & 1) != 0) {
1551 							select = 0; // H up
1552 						} else if((mask & 4) != 0) {
1553 							select = 2; // F up
1554 						} else if((mask & 2) != 0) {
1555 							select = 1; // H left
1556 						} else if((mask & 8) != 0) {
1557 							select = 3; // E left
1558 						}
1559 #else
1560 						select = randFromMask(rnd, mask);
1561 #endif
1562 						assert_geq(mask, 0);
1563 						mask &= ~(1 << select);
1564 						assert(gapsAllowed || mask == (1 << 4) || mask == 0);
1565 						d.mat_.hMaskSet(row, col, mask);
1566 						branch = true;
1567 					} else { /* No way to backtrack! */ }
1568 					if(select != -1) {
1569 						if(select == 4) {
1570 							cur = SW_BT_OALL_DIAG;
1571 						} else if(select == 0) {
1572 							cur = SW_BT_OALL_REF_OPEN;
1573 						} else if(select == 1) {
1574 							cur = SW_BT_OALL_READ_OPEN;
1575 						} else if(select == 2) {
1576 							cur = SW_BT_RFGAP_EXTEND;
1577 						} else {
1578 							assert_eq(3, select)
1579 							cur = SW_BT_RDGAP_EXTEND;
1580 						}
1581 					} else {
1582 						empty = true;
1583 						// It's empty, so the only question left is whether we should be
1584 						// allowed in terimnate in this cell.  If it's got a valid score
1585 						// then we *shouldn't* be allowed to terminate here because that
1586 						// means it's part of a larger alignment that was already reported.
1587 						canMoveThru = (origMask == 0);
1588 					}
1589 				}
1590 				assert(!empty || !canMoveThru || ct == SSEMatrix::H);
1591 			}
1592 		}
1593 		d.mat_.setReportedThrough(row, col);
1594 		assert_eq(gaps, Edit::numGaps(ned));
1595 		assert_leq(gaps, rdgap_ + rfgap_);
1596 		// Cell was involved in a previously-reported alignment?
1597 		if(!canMoveThru) {
1598 			if(!btnstack_.empty()) {
1599 				// Remove all the cells from list back to and including the
1600 				// cell where the branch occurred
1601 				btcells_.resize(btnstack_.back().celsz);
1602 				// Pop record off the top of the stack
1603 				ned.resize(btnstack_.back().nedsz);
1604 				//aed.resize(btnstack_.back().aedsz);
1605 				row      = btnstack_.back().row;
1606 				col      = btnstack_.back().col;
1607 				gaps     = btnstack_.back().gaps;
1608 				readGaps = btnstack_.back().readGaps;
1609 				refGaps  = btnstack_.back().refGaps;
1610 				score    = btnstack_.back().score;
1611 				ct       = btnstack_.back().ct;
1612 				btnstack_.pop_back();
1613 				assert(!sc_->monotone || score.score() >= escore);
1614 				NEW_ROW_COL(row, col);
1615 				continue;
1616 			} else {
1617 				// No branch points to revisit; just give up
1618 				res.reset();
1619 				met.btfail++; // DP backtraces failed
1620 				return false;
1621 			}
1622 		}
1623 		assert(!reportedThru);
1624 		assert(!sc_->monotone || score.score() >= minsc_);
1625 		if(empty || row == 0) {
1626 			assert_eq(SSEMatrix::H, ct);
1627 			btcells_.expand();
1628 			btcells_.back().first = row;
1629 			btcells_.back().second = col;
1630 			// This cell is at the end of a legitimate alignment
1631 			trimBeg = row;
1632 			assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
1633 			break;
1634 		}
1635 		if(branch) {
1636 			// Add a frame to the backtrack stack
1637 			btnstack_.expand();
1638 			btnstack_.back().init(
1639 				ned.size(),
1640 				0,               // aed.size()
1641 				btcells_.size(),
1642 				row,
1643 				col,
1644 				gaps,
1645 				readGaps,
1646 				refGaps,
1647 				score,
1648 				(int)ct);
1649 		}
1650 		btcells_.expand();
1651 		btcells_.back().first = row;
1652 		btcells_.back().second = col;
1653 		switch(cur) {
1654 			// Move up and to the left.  If the reference nucleotide in the
1655 			// source row mismatches the read nucleotide, penalize
1656 			// it and add a nucleotide mismatch.
1657 			case SW_BT_OALL_DIAG: {
1658 				assert_gt(row, 0); assert_gt(col, 0);
1659 				// Check for color mismatch
1660 				int readC = (*rd_)[row];
1661 				int refNmask = (int)rf_[rfi_+col];
1662 				assert_gt(refNmask, 0);
1663 				int m = matchesEx(readC, refNmask);
1664 				ct = SSEMatrix::H;
1665 				if(m != 1) {
1666 					Edit e(
1667 						(int)row,
1668 						mask2dna[refNmask],
1669 						"ACGTN"[readC],
1670 						EDIT_TYPE_MM);
1671 					assert(e.repOk());
1672 					assert(ned.empty() || ned.back().pos >= row);
1673 					ned.push_back(e);
1674 					int pen = QUAL2(row, col);
1675 					score.score_ -= pen;
1676 					assert(!sc_->monotone || score.score() >= escore);
1677 				} else {
1678 					// Reward a match
1679 					int64_t bonus = sc_->match(30);
1680 					score.score_ += bonus;
1681 					assert(!sc_->monotone || score.score() >= escore);
1682 				}
1683 				if(m == -1) {
1684 					score.ns_++;
1685 				}
1686 				row--; col--;
1687 				MOVE_ALL_UPLEFT();
1688 				assert(VALID_AL_SCORE(score));
1689 				break;
1690 			}
1691 			// Move up.  Add an edit encoding the ref gap.
1692 			case SW_BT_OALL_REF_OPEN:
1693 			{
1694 				assert_gt(row, 0);
1695 				Edit e(
1696 					(int)row,
1697 					'-',
1698 					"ACGTN"[(int)(*rd_)[row]],
1699 					EDIT_TYPE_REF_GAP);
1700 				assert(e.repOk());
1701 				assert(ned.empty() || ned.back().pos >= row);
1702 				ned.push_back(e);
1703 				assert_geq(row, (size_t)sc_->gapbar);
1704 				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
1705 				row--;
1706 				ct = SSEMatrix::H;
1707 				int pen = sc_->refGapOpen();
1708 				score.score_ -= pen;
1709 				assert(!sc_->monotone || score.score() >= minsc_);
1710 				gaps++; refGaps++;
1711 				assert_eq(gaps, Edit::numGaps(ned));
1712 				assert_leq(gaps, rdgap_ + rfgap_);
1713 				MOVE_ALL_UP();
1714 				break;
1715 			}
1716 			// Move up.  Add an edit encoding the ref gap.
1717 			case SW_BT_RFGAP_EXTEND:
1718 			{
1719 				assert_gt(row, 1);
1720 				Edit e(
1721 					(int)row,
1722 					'-',
1723 					"ACGTN"[(int)(*rd_)[row]],
1724 					EDIT_TYPE_REF_GAP);
1725 				assert(e.repOk());
1726 				assert(ned.empty() || ned.back().pos >= row);
1727 				ned.push_back(e);
1728 				assert_geq(row, (size_t)sc_->gapbar);
1729 				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
1730 				row--;
1731 				ct = SSEMatrix::F;
1732 				int pen = sc_->refGapExtend();
1733 				score.score_ -= pen;
1734 				assert(!sc_->monotone || score.score() >= minsc_);
1735 				gaps++; refGaps++;
1736 				assert_eq(gaps, Edit::numGaps(ned));
1737 				assert_leq(gaps, rdgap_ + rfgap_);
1738 				MOVE_ALL_UP();
1739 				break;
1740 			}
1741 			case SW_BT_OALL_READ_OPEN:
1742 			{
1743 				assert_gt(col, 0);
1744 				Edit e(
1745 					(int)row+1,
1746 					mask2dna[(int)rf_[rfi_+col]],
1747 					'-',
1748 					EDIT_TYPE_READ_GAP);
1749 				assert(e.repOk());
1750 				assert(ned.empty() || ned.back().pos >= row);
1751 				ned.push_back(e);
1752 				assert_geq(row, (size_t)sc_->gapbar);
1753 				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
1754 				col--;
1755 				ct = SSEMatrix::H;
1756 				int pen = sc_->readGapOpen();
1757 				score.score_ -= pen;
1758 				assert(!sc_->monotone || score.score() >= minsc_);
1759 				gaps++; readGaps++;
1760 				assert_eq(gaps, Edit::numGaps(ned));
1761 				assert_leq(gaps, rdgap_ + rfgap_);
1762 				MOVE_ALL_LEFT();
1763 				break;
1764 			}
1765 			case SW_BT_RDGAP_EXTEND:
1766 			{
1767 				assert_gt(col, 1);
1768 				Edit e(
1769 					(int)row+1,
1770 					mask2dna[(int)rf_[rfi_+col]],
1771 					'-',
1772 					EDIT_TYPE_READ_GAP);
1773 				assert(e.repOk());
1774 				assert(ned.empty() || ned.back().pos >= row);
1775 				ned.push_back(e);
1776 				assert_geq(row, (size_t)sc_->gapbar);
1777 				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
1778 				col--;
1779 				ct = SSEMatrix::E;
1780 				int pen = sc_->readGapExtend();
1781 				score.score_ -= pen;
1782 				assert(!sc_->monotone || score.score() >= minsc_);
1783 				gaps++; readGaps++;
1784 				assert_eq(gaps, Edit::numGaps(ned));
1785 				assert_leq(gaps, rdgap_ + rfgap_);
1786 				MOVE_ALL_LEFT();
1787 				break;
1788 			}
1789 			default: throw 1;
1790 		}
1791 	} // while((int)row > 0)
1792 	assert_eq(0, trimBeg);
1793 	assert_eq(0, trimEnd);
1794 	assert_geq(col, 0);
1795 	assert_eq(SSEMatrix::H, ct);
1796 	// The number of cells in the backtracs should equal the number of read
1797 	// bases after trimming plus the number of gaps
1798 	assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
1799 	// Check whether we went through a core diagonal and set 'reported' flag on
1800 	// each cell
1801 	bool overlappedCoreDiag = false;
1802 	for(size_t i = 0; i < btcells_.size(); i++) {
1803 		size_t rw = btcells_[i].first;
1804 		size_t cl = btcells_[i].second;
1805 		// Calculate the diagonal within the *trimmed* rectangle, i.e. the
1806 		// rectangle we dealt with in align, gather and backtrack.
1807 		int64_t diagi = cl - rw;
1808 		// Now adjust to the diagonal within the *untrimmed* rectangle by
1809 		// adding on the amount trimmed from the left.
1810 		diagi += rect_->triml;
1811 		if(diagi >= 0) {
1812 			size_t diag = (size_t)diagi;
1813 			if(diag >= rect_->corel && diag <= rect_->corer) {
1814 				overlappedCoreDiag = true;
1815 				break;
1816 			}
1817 		}
1818 		assert(d.mat_.reportedThrough(rw, cl));
1819 	}
1820 	if(!overlappedCoreDiag) {
1821 		// Must overlap a core diagonal.  Otherwise, we run the risk of
1822 		// reporting an alignment that overlaps (and trumps) a higher-scoring
1823 		// alignment that lies partially outside the dynamic programming
1824 		// rectangle.
1825 		res.reset();
1826 		met.corerej++;
1827 		return false;
1828 	}
1829 	int readC = (*rd_)[rdi_+row];      // get last char in read
1830 	int refNmask = (int)rf_[rfi_+col]; // get last ref char ref involved in aln
1831 	assert_gt(refNmask, 0);
1832 	int m = matchesEx(readC, refNmask);
1833 	if(m != 1) {
1834 		Edit e((int)row, mask2dna[refNmask], "ACGTN"[readC], EDIT_TYPE_MM);
1835 		assert(e.repOk());
1836 		assert(ned.empty() || ned.back().pos >= row);
1837 		ned.push_back(e);
1838 		score.score_ -= QUAL2(row, col);
1839 		assert_geq(score.score(), minsc_);
1840 	} else {
1841 		score.score_ += sc_->match(30);
1842 	}
1843 	if(m == -1) {
1844 		score.ns_++;
1845 	}
1846 	if(score.ns_ > nceil_) {
1847 		// Alignment has too many Ns in it!
1848 		res.reset();
1849 		met.nrej++;
1850 		return false;
1851 	}
1852 	res.reverse();
1853 	assert(Edit::repOk(ned, (*rd_)));
1854 	assert_eq(score.score(), escore);
1855 	assert_leq(gaps, rdgap_ + rfgap_);
1856 	off = col;
1857 	assert_lt(col + (size_t)rfi_, (size_t)rff_);
1858 	score.gaps_ = gaps;
1859 	res.alres.setScore(score);
1860 	res.alres.setShape(
1861 		refidx_,                  // ref id
1862 		off + rfi_ + rect_->refl, // 0-based ref offset
1863 		reflen_,                  // reference length
1864 		fw_,                      // aligned to Watson?
1865 		rdf_ - rdi_,              // read length
1866         0,                        // read ID
1867 		true,                     // pretrim soft?
1868 		0,                        // pretrim 5' end
1869 		0,                        // pretrim 3' end
1870 		true,                     // alignment trim soft?
1871 		fw_ ? trimBeg : trimEnd,  // alignment trim 5' end
1872 		fw_ ? trimEnd : trimBeg); // alignment trim 3' end
1873 	size_t refns = 0;
1874 	for(size_t i = col; i <= origCol; i++) {
1875 		if((int)rf_[rfi_+i] > 15) {
1876 			refns++;
1877 		}
1878 	}
1879 	res.alres.setRefNs(refns);
1880 	assert(Edit::repOk(ned, (*rd_), true, trimBeg, trimEnd));
1881 	assert(res.repOk());
1882 #ifndef NDEBUG
1883 	size_t gapsCheck = 0;
1884 	for(size_t i = 0; i < ned.size(); i++) {
1885 		if(ned[i].isGap()) gapsCheck++;
1886 	}
1887 	assert_eq(gaps, gapsCheck);
1888 	BTDnaString refstr;
1889 	for(size_t i = col; i <= origCol; i++) {
1890 		refstr.append(firsts5[(int)rf_[rfi_+i]]);
1891 	}
1892 	BTDnaString editstr;
1893     // daehwan
1894 	// Edit::toRef((*rd_), ned, editstr, true, trimBeg, trimEnd);
1895     Edit::toRef((*rd_), ned, editstr, true, trimBeg + rdi_, trimEnd + (rd_->length() - rdf_));
1896 	if(refstr != editstr) {
1897 		cerr << "Decoded nucleotides and edits don't match reference:" << endl;
1898 		cerr << "           score: " << score.score()
1899 		     << " (" << gaps << " gaps)" << endl;
1900 		cerr << "           edits: ";
1901 		Edit::print(cerr, ned);
1902 		cerr << endl;
1903 		cerr << "    decoded nucs: " << (*rd_) << endl;
1904 		cerr << "     edited nucs: " << editstr << endl;
1905 		cerr << "  reference nucs: " << refstr << endl;
1906 		assert(0);
1907 	}
1908 #endif
1909 	met.btsucc++; // DP backtraces succeeded
1910 	return true;
1911 }
1912