1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 /**
21 * aligner_sw_sse.cpp
22 *
23 * Versions of key alignment functions that use vector instructions to
24 * accelerate dynamic programming. Based chiefly on the striped Smith-Waterman
25 * paper and implementation by Michael Farrar. See:
26 *
27 * Farrar M. Striped Smith-Waterman speeds database searches six times over
28 * other SIMD implementations. Bioinformatics. 2007 Jan 15;23(2):156-61.
29 * http://sites.google.com/site/farrarmichael/smith-waterman
30 *
31 * While the paper describes an implementation of Smith-Waterman, we extend it
 * to do end-to-end read alignment as well as local alignment. The change
33 * required for this is minor: we simply let vmax be the maximum element in the
34 * score domain rather than the minimum.
35 *
36 * The vectorized dynamic programming implementation lacks some features that
37 * make it hard to adapt to solving the entire dynamic-programming alignment
38 * problem. For instance:
39 *
40 * - It doesn't respect gap barriers on either end of the read
41 * - It just gives a maximum; not enough information to backtrace without
42 * redoing some alignment
43 * - It's a little difficult to handle st_ and en_, especially st_.
44 * - The query profile mechanism makes handling of ambiguous reference bases a
45 * little tricky (16 cols in query profile lookup table instead of 5)
46 *
47 * Given the drawbacks, it is tempting to use SSE dynamic programming as a
48 * filter rather than as an aligner per se. Here are a few ideas for how it
49 * can be extended to handle more of the alignment problem:
50 *
51 * - Save calculated scores to a big array as we go. We return to this array
52 * to find and backtrace from good solutions.
53 */
54
55 #include <limits>
56 #include "aligner_sw.h"
57
// Geometry of a 128-bit SSE register when treated as 8 packed 16-bit words.
static const size_t NBYTES_PER_REG = 16;
static const size_t NWORDS_PER_REG = 8;
static const size_t NBITS_PER_WORD = 16;
static const size_t NBYTES_PER_WORD = 2;

// In 16-bit end-to-end mode, we have the option of using signed saturated
// arithmetic. Because we have signed arithmetic, there's no need to add/subtract
// bias when building and applying the query profile. The lowest value we can
// use is 0x8000, and the greatest is 0x7fff.

// Score type of a single 16-bit cell in the striped DP matrix.
typedef int16_t TCScore;
69
70 /**
71 * Build query profile look up tables for the read. The query profile look
72 * up table is organized as a 1D array indexed by [i][j] where i is the
73 * reference character in the current DP column (0=A, 1=C, etc), and j is
74 * the segment of the query we're currently working on.
75 */
/**
 * Build query profile look up tables for the read.  The query profile look
 * up table is organized as a 1D array indexed by [i][j] where i is the
 * reference character in the current DP column (0=A, 1=C, etc), and j is
 * the segment of the query we're currently working on.
 *
 * Query-profile vectors and gap-barrier vectors are interleaved: for a given
 * reference character and segment, the profile __m128i is immediately
 * followed by the matching gap-barrier __m128i (hence the strides of 2).
 */
void SwAligner::buildQueryProfileEnd2EndSseI16(bool fw) {
	// Build at most once per read orientation; result is cached thereafter
	bool& done = fw ? sseI16fwBuilt_ : sseI16rcBuilt_;
	if(done) {
		return;
	}
	done = true;
	const BTDnaString* rd = fw ? rdfw_ : rdrc_;
	const BTString* qu = fw ? qufw_ : qurc_;
	// daehwan - allows to align a portion of a read, not the whole
	// const size_t len = rd->length();
	const size_t len = dpRows();
	// # of striped segments per column (Farrar's segLen), rounded up
	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
	// How many __m128i's are needed
	size_t n128s =
		64 +                  // slack bytes, for alignment?
		(seglen * ALPHA_SIZE) // query profile data
		* 2;                  // & gap barrier data
	assert_gt(n128s, 0);
	SSEData& d = fw ? sseI16fw_ : sseI16rc_;
	d.profbuf_.resizeNoCopy(n128s);
	assert(!d.profbuf_.empty());
	d.maxPen_ = d.maxBonus_ = 0;
	d.lastIter_ = d.lastWord_ = 0;
	// Profile and gap-barrier vectors alternate, so both strides are 2
	d.qprofStride_ = d.gbarStride_ = 2;
	d.bias_ = 0; // no bias when words are signed
	// For each reference character A, C, G, T, N ...
	for(size_t refc = 0; refc < ALPHA_SIZE; refc++) {
		// For each segment ...
		for(size_t i = 0; i < seglen; i++) {
			size_t j = i;
			// qprofWords walks the 8 words of the profile vector; gbarWords
			// walks the 8 words of the adjacent (interleaved) barrier vector
			int16_t *qprofWords =
				reinterpret_cast<int16_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2));
			int16_t *gbarWords =
				reinterpret_cast<int16_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2) + 1);
			// For each sub-word (16-bit word) ...
			for(size_t k = 0; k < NWORDS_PER_REG; k++) {
				int sc = 0;
				*gbarWords = 0;
				if(j < len) {
					int readc = (*rd)[j];
					int readq = (*qu)[j];
					// Phred quality chars carry a +33 offset
					sc = sc_->score(readc, (int)(1 << refc), readq - 33);
					size_t j_from_end = len - j - 1;
					if(j < (size_t)sc_->gapbar ||
					   j_from_end < (size_t)sc_->gapbar)
					{
						// Inside the gap barrier
						*gbarWords = 0x8000; // add this twice
					}
				}
				if(refc == 0 && j == len-1) {
					// Remember which 128-bit word and which smaller word has
					// the final row
					d.lastIter_ = i;
					d.lastWord_ = k;
				}
				// Track the largest penalty / bonus seen in the profile
				if(sc < 0) {
					if((size_t)(-sc) > d.maxPen_) {
						d.maxPen_ = (size_t)(-sc);
					}
				} else {
					if((size_t)sc > d.maxBonus_) {
						d.maxBonus_ = (size_t)sc;
					}
				}
				*qprofWords = (int16_t)sc;
				gbarWords++;
				qprofWords++;
				j += seglen; // update offset into query
			}
		}
	}
}
149
150 #ifndef NDEBUG
151 /**
152 * Return true iff the cell has sane E/F/H values w/r/t its predecessors.
153 */
/**
 * Return true iff the cell has sane E/F/H values w/r/t its predecessors.
 * Debug-only: called under !NDEBUG to validate that each H/E/F value in the
 * filled matrix is explainable by at least one legal transition from a
 * neighboring cell (left for E, up for F, up-left diagonal for H).
 */
static bool cellOkEnd2EndI16(
	SSEData& d,        // filled DP matrix (d.mat_)
	size_t row,        // current row
	size_t col,        // current column
	int refc,          // reference character mask for this column
	int readc,         // read character for this row
	int readq,         // read quality char (Phred+33) for this row
	const Scoring& sc) // scoring scheme
{
	// NOTE(review): 0x8000 (= 32768) does not fit in int16_t; the implicit
	// conversion is relied on to produce MIN_I16 (-32768) — confirm on
	// target platforms.
	TCScore floorsc = 0x8000;
	// NOTE(review): MAX_I64 truncated to 16-bit TCScore yields -1, not the
	// 16-bit maximum; looks suspicious — verify the intended ceiling here.
	TCScore ceilsc = MAX_I64;
	// Matrix cells are stored with a +0x7fff offset; this undoes it
	TAlScore offsetsc = -0x7fff;
	TAlScore sc_h_cur = (TAlScore)d.mat_.helt(row, col);
	TAlScore sc_e_cur = (TAlScore)d.mat_.eelt(row, col);
	TAlScore sc_f_cur = (TAlScore)d.mat_.felt(row, col);
	if(sc_h_cur > floorsc) {
		sc_h_cur += offsetsc;
	}
	if(sc_e_cur > floorsc) {
		sc_e_cur += offsetsc;
	}
	if(sc_f_cur > floorsc) {
		sc_f_cur += offsetsc;
	}
	// Gaps are disallowed within gapbar rows of either end of the read
	bool gapsAllowed = true;
	size_t rowFromEnd = d.mat_.nrow() - row - 1;
	if(row < (size_t)sc.gapbar || rowFromEnd < (size_t)sc.gapbar) {
		gapsAllowed = false;
	}
	// Which incoming transitions can explain the current cell's values?
	bool e_left_trans = false, h_left_trans = false;
	bool f_up_trans = false, h_up_trans = false;
	bool h_diag_trans = false;
	if(gapsAllowed) {
		TAlScore sc_h_left = floorsc;
		TAlScore sc_e_left = floorsc;
		TAlScore sc_h_up = floorsc;
		TAlScore sc_f_up = floorsc;
		// E (read gap) must come from the left neighbor's E or H
		if(col > 0 && sc_e_cur > floorsc && sc_e_cur <= ceilsc) {
			sc_h_left = d.mat_.helt(row, col-1) + offsetsc;
			sc_e_left = d.mat_.eelt(row, col-1) + offsetsc;
			e_left_trans = (sc_e_left > floorsc && sc_e_cur == sc_e_left - sc.readGapExtend());
			h_left_trans = (sc_h_left > floorsc && sc_e_cur == sc_h_left - sc.readGapOpen());
			assert(e_left_trans || h_left_trans);
		}
		// F (ref gap) must come from the upper neighbor's F or H
		if(row > 0 && sc_f_cur > floorsc && sc_f_cur <= ceilsc) {
			sc_h_up = d.mat_.helt(row-1, col) + offsetsc;
			sc_f_up = d.mat_.felt(row-1, col) + offsetsc;
			f_up_trans = (sc_f_up > floorsc && sc_f_cur == sc_f_up - sc.refGapExtend());
			h_up_trans = (sc_h_up > floorsc && sc_f_cur == sc_h_up - sc.refGapOpen());
			assert(f_up_trans || h_up_trans);
		}
	} else {
		// No gaps allowed here, so E/F must be at/below the floor
		assert_geq(floorsc, sc_e_cur);
		assert_geq(floorsc, sc_f_cur);
	}
	// H may also come diagonally via a match/mismatch
	if(col > 0 && row > 0 && sc_h_cur > floorsc && sc_h_cur <= ceilsc) {
		TAlScore sc_h_upleft = d.mat_.helt(row-1, col-1) + offsetsc;
		TAlScore sc_diag = sc.score(readc, (int)refc, readq - 33);
		h_diag_trans = sc_h_cur == sc_h_upleft + sc_diag;
	}
	// At least one explanation must hold (or the cell is at a boundary /
	// outside the plausible score range)
	assert(
		sc_h_cur <= floorsc ||
		e_left_trans ||
		h_left_trans ||
		f_up_trans ||
		h_up_trans ||
		h_diag_trans ||
		sc_h_cur > ceilsc ||
		row == 0 ||
		col == 0);
	return true;
}
226 #endif /*ndef NDEBUG*/
227
#ifdef NDEBUG

// In release builds, the vector sanity-check macros compile away to nothing.
// NOTE: assert_all_leq previously had no release-mode stub, so any use of it
// outside #ifndef NDEBUG would fail to compile with NDEBUG defined; the stub
// is added here for consistency with the other assert_all_* macros.
#define assert_all_eq0(x)
#define assert_all_gt(x, y)
#define assert_all_gt_lo(x)
#define assert_all_leq(x, y)
#define assert_all_lt(x, y)
#define assert_all_lt_hi(x)

#else

// Assert that all eight signed 16-bit elements of x equal 0.
#define assert_all_eq0(x) { \
	__m128i z = _mm_setzero_si128(); \
	__m128i tmp = _mm_setzero_si128(); \
	z = _mm_xor_si128(z, z); \
	tmp = _mm_cmpeq_epi16(x, z); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}

// Assert that every element of x is strictly greater than its y counterpart.
#define assert_all_gt(x, y) { \
	__m128i tmp = _mm_cmpgt_epi16(x, y); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}

// Assert that every element of x is strictly greater than 0.
#define assert_all_gt_lo(x) { \
	__m128i z = _mm_setzero_si128(); \
	__m128i tmp = _mm_setzero_si128(); \
	z = _mm_xor_si128(z, z); \
	tmp = _mm_cmpgt_epi16(x, z); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}

// Assert that every element of x is strictly less than its y counterpart.
#define assert_all_lt(x, y) { \
	__m128i tmp = _mm_cmplt_epi16(x, y); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}

// Assert that no element of x exceeds its y counterpart (i.e. all x <= y).
#define assert_all_leq(x, y) { \
	__m128i tmp = _mm_cmpgt_epi16(x, y); \
	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
}

// Assert that every element of x is strictly less than 0x7fff (max int16).
#define assert_all_lt_hi(x) { \
	__m128i z = _mm_setzero_si128(); \
	__m128i tmp = _mm_setzero_si128(); \
	z = _mm_cmpeq_epi16(z, z); \
	z = _mm_srli_epi16(z, 1); \
	tmp = _mm_cmplt_epi16(x, z); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}
#endif
278
279 /**
280 * Aligns by filling a dynamic programming matrix with the SSE-accelerated,
281 * banded DP approach of Farrar. As it goes, it determines which cells we
282 * might backtrace from and tallies the best (highest-scoring) N backtrace
283 * candidate cells per diagonal. Also returns the alignment score of the best
284 * alignment in the matrix.
285 *
286 * This routine does *not* maintain a matrix holding the entire matrix worth of
287 * scores, nor does it maintain any other dense O(mn) data structure, as this
288 * would quickly exhaust memory for queries longer than about 10,000 kb.
289 * Instead, in the fill stage it maintains two columns worth of scores at a
290 * time (current/previous, or right/left) - these take O(m) space. When
291 * finished with the current column, it determines which cells from the
292 * previous column, if any, are candidates we might backtrace from to find a
293 * full alignment. A candidate cell has a score that rises above the threshold
294 * and isn't improved upon by a match in the next column. The best N
295 * candidates per diagonal are stored in a O(m + n) data structure.
296 */
TAlScore SwAligner::alignGatherEE16(int& flag, bool debug) {
	// Sanity-check problem bounds and invariants before filling
	assert_leq(rdf_, rd_->length());
	assert_leq(rdf_, qu_->length());
	assert_lt(rfi_, rff_);
	assert_lt(rdi_, rdf_);
	assert_eq(rd_->length(), qu_->length());
	assert_geq(sc_->gapbar, 1);
	assert(repOk());
#ifndef NDEBUG
	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
		assert_range(0, 16, (int)rf_[i]);
	}
#endif

	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
	if(!debug) met.dp++;
	buildQueryProfileEnd2EndSseI16(fw_);
	assert(!d.profbuf_.empty());

	assert_eq(0, d.maxBonus_); // end-to-end mode: no bonuses, only penalties
	size_t iter =
		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen

	// Now set up the score vectors.  We just need two columns worth, which
	// we'll call "left" and "right".  Each column holds iter blocks of 4
	// __m128i's (E, F, H plus one spare — ROWSTRIDE_2COL is asserted == 4
	// below).
	d.vecbuf_.resize(4 * 2 * iter);
	d.vecbuf_.zero();
	__m128i *vbuf_l = d.vecbuf_.ptr();
	__m128i *vbuf_r = d.vecbuf_.ptr() + (4 * iter);

	// This is the data structure that holds candidate cells per diagonal.
	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
	if(!debug) {
		btdiag_.init(ndiags, 2);
	}

	// Data structure that holds checkpointed anti-diagonals
	TAlScore perfectScore = sc_->perfectScore(dpRows());
	bool checkpoint = true;
	bool cpdebug = false;
#ifndef NDEBUG
	cpdebug = dpRows() < 1000; // save all cells only for small problems
#endif
	cper_.init(
		dpRows(),      // # rows
		rff_ - rfi_,   // # columns
		cperPerPow2_,  // checkpoint every 1 << perpow2 diags (& next)
		perfectScore,  // perfect score (for sanity checks)
		false,         // matrix cells have 8-bit scores?
		cperTri_,      // triangular mini-fills?
		false,         // alignment is local?
		cpdebug);      // save all cells for debugging?

	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
	// implementation:
	//
	// http://sites.google.com/site/farrarmichael/smith-waterman
	//
	// Much of the implementation below is adapted from Michael's code.

	// Set all elts to reference gap open penalty
	__m128i rfgapo   = _mm_setzero_si128();
	__m128i rfgape   = _mm_setzero_si128();
	__m128i rdgapo   = _mm_setzero_si128();
	__m128i rdgape   = _mm_setzero_si128();
	__m128i vlo      = _mm_setzero_si128();
	__m128i vhi      = _mm_setzero_si128();
	__m128i vhilsw   = _mm_setzero_si128();
	__m128i vlolsw   = _mm_setzero_si128();
	__m128i ve       = _mm_setzero_si128();
	__m128i vf       = _mm_setzero_si128();
	__m128i vh       = _mm_setzero_si128();
	__m128i vhd      = _mm_setzero_si128();
	__m128i vhdtmp   = _mm_setzero_si128();
	__m128i vtmp     = _mm_setzero_si128();

	assert_gt(sc_->refGapOpen(), 0);
	assert_leq(sc_->refGapOpen(), MAX_I16);
	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
	rfgapo = _mm_shuffle_epi32(rfgapo, 0);

	// Set all elts to reference gap extension penalty
	assert_gt(sc_->refGapExtend(), 0);
	assert_leq(sc_->refGapExtend(), MAX_I16);
	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
	rfgape = _mm_shufflelo_epi16(rfgape, 0);
	rfgape = _mm_shuffle_epi32(rfgape, 0);

	// Set all elts to read gap open penalty
	assert_gt(sc_->readGapOpen(), 0);
	assert_leq(sc_->readGapOpen(), MAX_I16);
	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
	rdgapo = _mm_shuffle_epi32(rdgapo, 0);

	// Set all elts to read gap extension penalty
	assert_gt(sc_->readGapExtend(), 0);
	assert_leq(sc_->readGapExtend(), MAX_I16);
	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
	rdgape = _mm_shufflelo_epi16(rdgape, 0);
	rdgape = _mm_shuffle_epi32(rdgape, 0);

	// Set all elts to 0x8000 (min value for signed 16-bit)
	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000

	// Set all elts to 0x7fff (max value for signed 16-bit)
	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff

	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
	vlolsw = _mm_shuffle_epi32(vlo, 0);
	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);

	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
	vhilsw = _mm_shuffle_epi32(vhi, 0);
	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);

	// Points to a long vector of __m128i where each element is a block of
	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
	// the block of cells is from the E matrix.  If index % 3 == 1, they're
	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
	// Blocks of cells are organized in the same interleaved manner as they are
	// calculated by the Farrar algorithm.
	const __m128i *pvScore; // points into the query profile

	const size_t colstride = ROWSTRIDE_2COL * iter;

	// Initialize the H and E vectors in the first matrix column
	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
	/* __m128i *pvFLeft = vbuf_l + 1; */ __m128i *pvFRight = vbuf_r + 1;
	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;

	// Maximum score in final row
	bool found = false;
	TCScore lrmax = MIN_I16;

	for(size_t i = 0; i < iter; i++) {
		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
		// Could initialize Hs to high or low.  If high, cells in the lower
		// triangle will have somewhat more legitimate scores, but still won't
		// be exhaustively scored.
		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
	}

	assert_gt(sc_->gapbar, 0);
	size_t nfixup = 0;

	// Fill in the table as usual but instead of using the same gap-penalty
	// vector for each iteration of the inner loop, load words out of a
	// pre-calculated gap vector parallel to the query profile.  The pre-
	// calculated gap vectors enforce the gap barrier constraint by making it
	// infinitely costly to introduce a gap in barrier rows.
	//
	// AND use a separate loop to fill in the first row of the table, enforcing
	// the st_ constraints in the process.  This is awkward because it
	// separates the processing of the first row from the others and might make
	// it difficult to use the first-row results in the next row, but it might
	// be the simplest and least disruptive way to deal with the st_ constraint.

	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
		// Swap left and right; vbuf_l is the vector on the left, which we
		// generally load from, and vbuf_r is the vector on the right, which we
		// generally store to.
		swap(vbuf_l, vbuf_r);
		pvELeft = vbuf_l + 0; pvERight = vbuf_r + 0;
		/* pvFLeft = vbuf_l + 1; */ pvFRight = vbuf_r + 1;
		pvHLeft = vbuf_l + 2; pvHRight = vbuf_r + 2;

		// Fetch the appropriate query profile.  Note that elements of rf_ must
		// be numbers, not masks.
		const int refc = (int)rf_[i];

		// Fetch the appropriate query profile
		size_t off = (size_t)firsts5[refc] * iter * 2;
		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier

		// Set all cells to low value
		vf = _mm_cmpeq_epi16(vf, vf);
		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
		vf = _mm_or_si128(vf, vlolsw);

		// Load H vector from the final row of the previous column
		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
		// Shift 2 bytes down so that topmost (least sig) cell gets 0
		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
		// Fill topmost (least sig) cell with high value
		vh = _mm_or_si128(vh, vhilsw);

		// For each segment of the striped column:
		size_t j;
		for(j = 0; j < iter; j++) {
			// Load cells from E, calculated previously
			ve = _mm_load_si128(pvELeft);
			vhd = _mm_load_si128(pvHLeft);
			assert_all_lt(ve, vhi);
			pvELeft += ROWSTRIDE_2COL;

			// Store cells in F, calculated previously.  Gap-barrier words are
			// 0x8000 (others 0), so adding twice with saturation pins vetoed
			// lanes at the minimum while leaving the rest unchanged.
			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
			_mm_store_si128(pvFRight, vf);
			pvFRight += ROWSTRIDE_2COL;

			// Factor in query profile (matches and mismatches)
			vh = _mm_adds_epi16(vh, pvScore[0]);

			// Update H, factoring in E and F
			vh = _mm_max_epi16(vh, vf);

			// Update vE value
			vhdtmp = vhd;
			vhd = _mm_subs_epi16(vhd, rdgapo);
			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
			ve = _mm_subs_epi16(ve, rdgape);
			ve = _mm_max_epi16(ve, vhd);
			vh = _mm_max_epi16(vh, ve);

			// Save the new vH values
			_mm_store_si128(pvHRight, vh);
			pvHRight += ROWSTRIDE_2COL;
			vtmp = vh;
			assert_all_lt(ve, vhi);

			// Load the next h value
			vh = vhdtmp;
			pvHLeft += ROWSTRIDE_2COL;

			// Save E values
			_mm_store_si128(pvERight, ve);
			pvERight += ROWSTRIDE_2COL;

			// Update vf value
			vtmp = _mm_subs_epi16(vtmp, rfgapo);
			vf = _mm_subs_epi16(vf, rfgape);
			assert_all_lt(vf, vhi);
			vf = _mm_max_epi16(vf, vtmp);

			pvScore += 2; // move on to next query profile / gap veto
		}
		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
		pvFRight -= colstride; // reset to start of column
		vtmp = _mm_load_si128(pvFRight);

		pvHRight -= colstride; // reset to start of column
		vh = _mm_load_si128(pvHRight);

		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector

		// vf from last row gets shifted down by one to overlay the first row
		// rfgape has already been subtracted from it.
		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
		vf = _mm_or_si128(vf, vlolsw);

		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
		vf = _mm_max_epi16(vtmp, vf);
		vtmp = _mm_cmpgt_epi16(vf, vtmp);
		int cmp = _mm_movemask_epi8(vtmp);

		// If any element of vtmp is greater than H - gap-open...
		// Farrar's "lazy F" fixup loop: keep propagating F down the column
		// until no stored F/H value can be improved.
		j = 0;
		while(cmp != 0x0000) {
			// Store this vf
			_mm_store_si128(pvFRight, vf);
			pvFRight += ROWSTRIDE_2COL;

			// Update vh w/r/t new vf
			vh = _mm_max_epi16(vh, vf);

			// Save vH values
			_mm_store_si128(pvHRight, vh);
			pvHRight += ROWSTRIDE_2COL;

			pvScore += 2;

			assert_lt(j, iter);
			if(++j == iter) {
				// Wrapped around the bottom of the column; shift F down one
				// word and continue from the top
				pvFRight -= colstride;
				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
				pvHRight -= colstride;
				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
				pvScore = d.profbuf_.ptr() + off + 1;
				j = 0;
				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
				vf = _mm_or_si128(vf, vlolsw);
			} else {
				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
			}

			// Update F with another gap extension
			vf = _mm_subs_epi16(vf, rfgape);
			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
			vf = _mm_max_epi16(vtmp, vf);
			vtmp = _mm_cmpgt_epi16(vf, vtmp);
			cmp = _mm_movemask_epi8(vtmp);
			nfixup++;
		}


		// Check in the last row for the maximum so far
		// NOTE: this pointer shadows the __m128i vtmp register above;
		// intentional but easy to misread.
		__m128i *vtmp = vbuf_r + 2 /* H */ + (d.lastIter_ * ROWSTRIDE_2COL);
		// Note: we may not want to extract from the final row
		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
		found = true;
		if(lr > lrmax) {
			lrmax = lr;
		}

		// Now we'd like to know whether the bottommost element of the right
		// column is a candidate we might backtrace from.  First question is:
		// did it exceed the minimum score threshold?
		TAlScore score = (TAlScore)(lr - 0x7fff); // undo the +0x7fff offset
		if(lr == MIN_I16) {
			score = MIN_I64; // saturated cell: treat as -infinity
		}
		if(!debug && score >= minsc_) {
			DpBtCandidate cand(dpRows() - 1, i - rfi_, score);
			btdiag_.add(i - rfi_, cand);
		}

		// Save some elements to checkpoints
		if(checkpoint) {

			__m128i *pvE = vbuf_r + 0;
			__m128i *pvF = vbuf_r + 1;
			__m128i *pvH = vbuf_r + 2;
			size_t coli = i - rfi_;
			if(coli < cper_.locol_) cper_.locol_ = coli;
			if(coli > cper_.hicol_) cper_.hicol_ = coli;

			if(cperTri_) {
				// Triangular checkpointing: save cells along two adjacent
				// anti-diagonals per checkpoint period
				size_t rc_mod = coli & cper_.lomask_;
				assert_lt(rc_mod, cper_.per_);
				int64_t row = -(int64_t)rc_mod-1;
				int64_t row_mod = row;
				int64_t row_div = 0;
				size_t idx = coli >> cper_.perpow2_;
				size_t idxrow = idx * cper_.nrow_;
				assert_eq(4, ROWSTRIDE_2COL);
				bool done = false;
				while(true) {
					row += (cper_.per_ - 2);
					row_mod += (cper_.per_ - 2);
					for(size_t j = 0; j < 2; j++) {
						row++;
						row_mod++;
						if(row >= 0 && (size_t)row < cper_.nrow_) {
							// Update row divided by iter_ and mod iter_
							while(row_mod >= (int64_t)iter) {
								row_mod -= (int64_t)iter;
								row_div++;
							}
							size_t delt = idxrow + row;
							// << 5 converts (segment, lane) to an int16 index
							// within the 4-vector row stride
							size_t vecoff = (row_mod << 5) + row_div;
							assert_lt(row_div, 8);
							int16_t h_sc = ((int16_t*)pvH)[vecoff];
							int16_t e_sc = ((int16_t*)pvE)[vecoff];
							int16_t f_sc = ((int16_t*)pvF)[vecoff];
							// Undo the +0x7fff offset except for saturated cells
							if(h_sc != MIN_I16) h_sc -= 0x7fff;
							if(e_sc != MIN_I16) e_sc -= 0x7fff;
							if(f_sc != MIN_I16) f_sc -= 0x7fff;
							assert_leq(h_sc, cper_.perf_);
							assert_leq(e_sc, cper_.perf_);
							assert_leq(f_sc, cper_.perf_);
							CpQuad *qdiags = ((j == 0) ? cper_.qdiag1s_.ptr() : cper_.qdiag2s_.ptr());
							qdiags[delt].sc[0] = h_sc;
							qdiags[delt].sc[1] = e_sc;
							qdiags[delt].sc[2] = f_sc;
						} // if(row >= 0 && row < nrow_)
						else if(row >= 0 && (size_t)row >= cper_.nrow_) {
							done = true;
							break;
						}
					} // end of loop over anti-diags
					if(done) {
						break;
					}
					idx++;
					idxrow += cper_.nrow_;
				}
			} else {
				// If this is the first column, take this opportunity to
				// pre-calculate the coordinates of the elements we're going to
				// checkpoint.
				if(coli == 0) {
					size_t cpi    = cper_.per_-1;
					size_t cpimod = cper_.per_-1;
					size_t cpidiv = 0;
					cper_.commitMap_.clear();
					while(cpi < cper_.nrow_) {
						while(cpimod >= iter) {
							cpimod -= iter;
							cpidiv++;
						}
						size_t vecoff = (cpimod << 5) + cpidiv;
						cper_.commitMap_.push_back(vecoff);
						cpi += cper_.per_;
						cpimod += cper_.per_;
					}
				}
				// Save all the rows
				size_t rowoff = 0;
				size_t sz = cper_.commitMap_.size();
				for(size_t i = 0; i < sz; i++, rowoff += cper_.ncol_) {
					size_t vecoff = cper_.commitMap_[i];
					int16_t h_sc = ((int16_t*)pvH)[vecoff];
					int16_t e_sc = ((int16_t*)pvE)[vecoff];
					int16_t f_sc = ((int16_t*)pvF)[vecoff];
					// Undo the +0x7fff offset except for saturated cells
					if(h_sc != MIN_I16) h_sc -= 0x7fff;
					if(e_sc != MIN_I16) e_sc -= 0x7fff;
					if(f_sc != MIN_I16) f_sc -= 0x7fff;
					assert_leq(h_sc, cper_.perf_);
					assert_leq(e_sc, cper_.perf_);
					assert_leq(f_sc, cper_.perf_);
					CpQuad& dst = cper_.qrows_[rowoff + coli];
					dst.sc[0] = h_sc;
					dst.sc[1] = e_sc;
					dst.sc[2] = f_sc;
				}
				// Is this a column we'd like to checkpoint?
				if((coli & cper_.lomask_) == cper_.lomask_) {
					// Save the column using memcpys
					assert_gt(coli, 0);
					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
					__m128i *dst = cper_.qcols_.ptr() + coloff;
					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
				}
			}
			if(cper_.debug_) {
				// Save the column using memcpys
				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
				size_t coloff = coli * wordspercol;
				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
			}
		}
	}

	// Update metrics
	if(!debug) {
		size_t ninner = (rff_ - rfi_) * iter;
		met.col   += (rff_ - rfi_);            // DP columns
		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
		met.inner += ninner;                   // DP inner loop iters
		met.fixup += nfixup;                   // DP fixup loop iters
	}

	flag = 0;

	// Did we find a solution?
	TAlScore score = MIN_I64;
	if(!found) {
		flag = -1; // no
		if(!debug) met.dpfail++;
		return MIN_I64;
	} else {
		score = (TAlScore)(lrmax - 0x7fff); // undo the +0x7fff offset
		if(score < minsc_) {
			flag = -1; // no
			if(!debug) met.dpfail++;
			return score;
		}
	}

	// Could we have saturated?
	if(lrmax == MIN_I16) {
		flag = -2; // yes
		if(!debug) met.dpsat++;
		return MIN_I64;
	}

	// Now take all the backtrace candidates in the btdiag_ structure and
	// dump them into the btncand_ array.  They'll be sorted later.
	if(!debug) {
		btdiag_.dump(btncand_);
		assert(!btncand_.empty());
	}

	// Return largest score
	if(!debug) met.dpsucc++;
	return score;
}
788
789 /**
790 * Solve the current alignment problem using SSE instructions that operate on 8
791 * signed 16-bit values packed into a single 128-bit register.
792 */
alignNucleotidesEnd2EndSseI16(int & flag,bool debug)793 TAlScore SwAligner::alignNucleotidesEnd2EndSseI16(int& flag, bool debug) {
794 assert_leq(rdf_, rd_->length());
795 assert_leq(rdf_, qu_->length());
796 assert_lt(rfi_, rff_);
797 assert_lt(rdi_, rdf_);
798 assert_eq(rd_->length(), qu_->length());
799 assert_geq(sc_->gapbar, 1);
800 assert(repOk());
801 #ifndef NDEBUG
802 for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
803 assert_range(0, 16, (int)rf_[i]);
804 }
805 #endif
806
807 SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
808 SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
809 if(!debug) met.dp++;
810 buildQueryProfileEnd2EndSseI16(fw_);
811 assert(!d.profbuf_.empty());
812
813 assert_eq(0, d.maxBonus_);
814 size_t iter =
815 (dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
816
817 // Many thanks to Michael Farrar for releasing his striped Smith-Waterman
818 // implementation:
819 //
820 // http://sites.google.com/site/farrarmichael/smith-waterman
821 //
822 // Much of the implmentation below is adapted from Michael's code.
823
824 // Set all elts to reference gap open penalty
825 __m128i rfgapo = _mm_setzero_si128();
826 __m128i rfgape = _mm_setzero_si128();
827 __m128i rdgapo = _mm_setzero_si128();
828 __m128i rdgape = _mm_setzero_si128();
829 __m128i vlo = _mm_setzero_si128();
830 __m128i vhi = _mm_setzero_si128();
831 __m128i vhilsw = _mm_setzero_si128();
832 __m128i vlolsw = _mm_setzero_si128();
833 __m128i ve = _mm_setzero_si128();
834 __m128i vf = _mm_setzero_si128();
835 __m128i vh = _mm_setzero_si128();
836 #if 0
837 __m128i vhd = _mm_setzero_si128();
838 __m128i vhdtmp = _mm_setzero_si128();
839 #endif
840 __m128i vtmp = _mm_setzero_si128();
841
842 assert_gt(sc_->refGapOpen(), 0);
843 assert_leq(sc_->refGapOpen(), MAX_I16);
844 rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
845 rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
846 rfgapo = _mm_shuffle_epi32(rfgapo, 0);
847
848 // Set all elts to reference gap extension penalty
849 assert_gt(sc_->refGapExtend(), 0);
850 assert_leq(sc_->refGapExtend(), MAX_I16);
851 assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
852 rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
853 rfgape = _mm_shufflelo_epi16(rfgape, 0);
854 rfgape = _mm_shuffle_epi32(rfgape, 0);
855
856 // Set all elts to read gap open penalty
857 assert_gt(sc_->readGapOpen(), 0);
858 assert_leq(sc_->readGapOpen(), MAX_I16);
859 rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
860 rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
861 rdgapo = _mm_shuffle_epi32(rdgapo, 0);
862
863 // Set all elts to read gap extension penalty
864 assert_gt(sc_->readGapExtend(), 0);
865 assert_leq(sc_->readGapExtend(), MAX_I16);
866 assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
867 rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
868 rdgape = _mm_shufflelo_epi16(rdgape, 0);
869 rdgape = _mm_shuffle_epi32(rdgape, 0);
870
871 // Set all elts to 0x8000 (min value for signed 16-bit)
872 vlo = _mm_cmpeq_epi16(vlo, vlo); // all elts = 0xffff
873 vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
874
875 // Set all elts to 0x7fff (max value for signed 16-bit)
876 vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
877 vhi = _mm_srli_epi16(vhi, 1); // all elts = 0x7fff
878
879 // vlolsw: topmost (least sig) word set to 0x8000, all other words=0
880 vlolsw = _mm_shuffle_epi32(vlo, 0);
881 vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
882
883 // vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
884 vhilsw = _mm_shuffle_epi32(vhi, 0);
885 vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
886
887 // Points to a long vector of __m128i where each element is a block of
888 // contiguous cells in the E, F or H matrix. If the index % 3 == 0, then
889 // the block of cells is from the E matrix. If index % 3 == 1, they're
890 // from the F matrix. If index % 3 == 2, then they're from the H matrix.
891 // Blocks of cells are organized in the same interleaved manner as they are
892 // calculated by the Farrar algorithm.
893 const __m128i *pvScore; // points into the query profile
894
895 d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
896 const size_t colstride = d.mat_.colstride();
897 assert_eq(ROWSTRIDE, colstride / iter);
898
899 // Initialize the H and E vectors in the first matrix column
900 __m128i *pvHTmp = d.mat_.tmpvec(0, 0);
901 __m128i *pvETmp = d.mat_.evec(0, 0);
902
903 // Maximum score in final row
904 bool found = false;
905 TCScore lrmax = MIN_I16;
906
907 for(size_t i = 0; i < iter; i++) {
908 _mm_store_si128(pvETmp, vlo);
909 // Could initialize Hs to high or low. If high, cells in the lower
		// triangle will have somewhat more legitimate scores, but still won't
911 // be exhaustively scored.
912 _mm_store_si128(pvHTmp, vlo);
913 pvETmp += ROWSTRIDE;
914 pvHTmp += ROWSTRIDE;
915 }
916 // These are swapped just before the innermost loop
917 __m128i *pvHStore = d.mat_.hvec(0, 0);
918 __m128i *pvHLoad = d.mat_.tmpvec(0, 0);
919 __m128i *pvELoad = d.mat_.evec(0, 0);
920 __m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
921 __m128i *pvFStore = d.mat_.fvec(0, 0);
922 __m128i *pvFTmp = NULL;
923
924 assert_gt(sc_->gapbar, 0);
925 size_t nfixup = 0;
926
927 // Fill in the table as usual but instead of using the same gap-penalty
928 // vector for each iteration of the inner loop, load words out of a
929 // pre-calculated gap vector parallel to the query profile. The pre-
930 // calculated gap vectors enforce the gap barrier constraint by making it
931 // infinitely costly to introduce a gap in barrier rows.
932 //
933 // AND use a separate loop to fill in the first row of the table, enforcing
934 // the st_ constraints in the process. This is awkward because it
935 // separates the processing of the first row from the others and might make
936 // it difficult to use the first-row results in the next row, but it might
937 // be the simplest and least disruptive way to deal with the st_ constraint.
938
939 colstop_ = rff_ - 1;
940 lastsolcol_ = 0;
941
942 for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
943 assert(pvFStore == d.mat_.fvec(0, i - rfi_));
944 assert(pvHStore == d.mat_.hvec(0, i - rfi_));
945
946 // Fetch the appropriate query profile. Note that elements of rf_ must
947 // be numbers, not masks.
948 const int refc = (int)rf_[i];
949 size_t off = (size_t)firsts5[refc] * iter * 2;
950 pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
951
952 // Set all cells to low value
953 vf = _mm_cmpeq_epi16(vf, vf);
954 vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
955 vf = _mm_or_si128(vf, vlolsw);
956
957 // Load H vector from the final row of the previous column
958 vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
959 // Shift 2 bytes down so that topmost (least sig) cell gets 0
960 vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
961 // Fill topmost (least sig) cell with high value
962 vh = _mm_or_si128(vh, vhilsw);
963
964 // For each character in the reference text:
965 size_t j;
966 for(j = 0; j < iter; j++) {
967 // Load cells from E, calculated previously
968 ve = _mm_load_si128(pvELoad);
969 #if 0
970 vhd = _mm_load_si128(pvHLoad);
971 #endif
972 assert_all_lt(ve, vhi);
973 pvELoad += ROWSTRIDE;
974
975 // Store cells in F, calculated previously
976 vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
977 vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
978 _mm_store_si128(pvFStore, vf);
979 pvFStore += ROWSTRIDE;
980
981 // Factor in query profile (matches and mismatches)
982 vh = _mm_adds_epi16(vh, pvScore[0]);
983
984 // Update H, factoring in E and F
985 vh = _mm_max_epi16(vh, ve);
986 vh = _mm_max_epi16(vh, vf);
987
988 // Save the new vH values
989 _mm_store_si128(pvHStore, vh);
990 pvHStore += ROWSTRIDE;
991
992 // Update vE value
993 vtmp = vh;
994 #if 0
995 vhdtmp = vhd;
996 vhd = _mm_subs_epi16(vhd, rdgapo);
997 vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
998 vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
999 ve = _mm_subs_epi16(ve, rdgape);
1000 ve = _mm_max_epi16(ve, vhd);
1001 #else
1002 vh = _mm_subs_epi16(vh, rdgapo);
1003 vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
1004 vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
1005 ve = _mm_subs_epi16(ve, rdgape);
1006 ve = _mm_max_epi16(ve, vh);
1007 #endif
1008 assert_all_lt(ve, vhi);
1009
1010 // Load the next h value
1011 #if 0
1012 vh = vhdtmp;
1013 #else
1014 vh = _mm_load_si128(pvHLoad);
1015 #endif
1016 pvHLoad += ROWSTRIDE;
1017
1018 // Save E values
1019 _mm_store_si128(pvEStore, ve);
1020 pvEStore += ROWSTRIDE;
1021
1022 // Update vf value
1023 vtmp = _mm_subs_epi16(vtmp, rfgapo);
1024 vf = _mm_subs_epi16(vf, rfgape);
1025 assert_all_lt(vf, vhi);
1026 vf = _mm_max_epi16(vf, vtmp);
1027
1028 pvScore += 2; // move on to next query profile / gap veto
1029 }
1030 // pvHStore, pvELoad, pvEStore have all rolled over to the next column
1031 pvFTmp = pvFStore;
1032 pvFStore -= colstride; // reset to start of column
1033 vtmp = _mm_load_si128(pvFStore);
1034
1035 pvHStore -= colstride; // reset to start of column
1036 vh = _mm_load_si128(pvHStore);
1037
1038 #if 0
1039 #else
1040 pvEStore -= colstride; // reset to start of column
1041 ve = _mm_load_si128(pvEStore);
1042 #endif
1043
1044 pvHLoad = pvHStore; // new pvHLoad = pvHStore
1045 pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
1046
1047 // vf from last row gets shifted down by one to overlay the first row
1048 // rfgape has already been subtracted from it.
1049 vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
1050 vf = _mm_or_si128(vf, vlolsw);
1051
1052 vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
1053 vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
1054 vf = _mm_max_epi16(vtmp, vf);
1055 vtmp = _mm_cmpgt_epi16(vf, vtmp);
1056 int cmp = _mm_movemask_epi8(vtmp);
1057
1058 // If any element of vtmp is greater than H - gap-open...
1059 j = 0;
1060 while(cmp != 0x0000) {
1061 // Store this vf
1062 _mm_store_si128(pvFStore, vf);
1063 pvFStore += ROWSTRIDE;
1064
1065 // Update vh w/r/t new vf
1066 vh = _mm_max_epi16(vh, vf);
1067
1068 // Save vH values
1069 _mm_store_si128(pvHStore, vh);
1070 pvHStore += ROWSTRIDE;
1071
1072 // Update E in case it can be improved using our new vh
1073 #if 0
1074 #else
1075 vh = _mm_subs_epi16(vh, rdgapo);
1076 vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
1077 vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
1078 ve = _mm_max_epi16(ve, vh);
1079 _mm_store_si128(pvEStore, ve);
1080 pvEStore += ROWSTRIDE;
1081 #endif
1082 pvScore += 2;
1083
1084 assert_lt(j, iter);
1085 if(++j == iter) {
1086 pvFStore -= colstride;
1087 vtmp = _mm_load_si128(pvFStore); // load next vf ASAP
1088 pvHStore -= colstride;
1089 vh = _mm_load_si128(pvHStore); // load next vh ASAP
1090 #if 0
1091 #else
1092 pvEStore -= colstride;
1093 ve = _mm_load_si128(pvEStore); // load next ve ASAP
1094 #endif
1095 pvScore = d.profbuf_.ptr() + off + 1;
1096 j = 0;
1097 vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
1098 vf = _mm_or_si128(vf, vlolsw);
1099 } else {
1100 vtmp = _mm_load_si128(pvFStore); // load next vf ASAP
1101 vh = _mm_load_si128(pvHStore); // load next vh ASAP
1102 #if 0
1103 #else
1104 ve = _mm_load_si128(pvEStore); // load next vh ASAP
1105 #endif
1106 }
1107
1108 // Update F with another gap extension
1109 vf = _mm_subs_epi16(vf, rfgape);
1110 vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
1111 vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
1112 vf = _mm_max_epi16(vtmp, vf);
1113 vtmp = _mm_cmpgt_epi16(vf, vtmp);
1114 cmp = _mm_movemask_epi8(vtmp);
1115 nfixup++;
1116 }
1117
1118 #ifndef NDEBUG
1119 if((rand() & 15) == 0) {
1120 // This is a work-intensive sanity check; each time we finish filling
1121 // a column, we check that each H, E, and F is sensible.
1122 for(size_t k = 0; k < dpRows(); k++) {
1123 assert(cellOkEnd2EndI16(
1124 d,
1125 k, // row
1126 i - rfi_, // col
1127 refc, // reference mask
1128 (int)(*rd_)[rdi_+k], // read char
1129 (int)(*qu_)[rdi_+k], // read quality
1130 *sc_)); // scoring scheme
1131 }
1132 }
1133 #endif
1134
1135 __m128i *vtmp = d.mat_.hvec(d.lastIter_, i-rfi_);
1136 // Note: we may not want to extract from the final row
1137 TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
1138 found = true;
1139 if(lr > lrmax) {
1140 lrmax = lr;
1141 }
1142
1143 // pvELoad and pvHLoad are already where they need to be
1144
1145 // Adjust the load and store vectors here.
1146 pvHStore = pvHLoad + colstride;
1147 pvEStore = pvELoad + colstride;
1148 pvFStore = pvFTmp;
1149 }
1150
1151 // Update metrics
1152 if(!debug) {
1153 size_t ninner = (rff_ - rfi_) * iter;
1154 met.col += (rff_ - rfi_); // DP columns
1155 met.cell += (ninner * NWORDS_PER_REG); // DP cells
1156 met.inner += ninner; // DP inner loop iters
1157 met.fixup += nfixup; // DP fixup loop iters
1158 }
1159
1160 flag = 0;
1161
1162 // Did we find a solution?
1163 TAlScore score = MIN_I64;
1164 if(!found) {
1165 flag = -1; // no
1166 if(!debug) met.dpfail++;
1167 return MIN_I64;
1168 } else {
1169 score = (TAlScore)(lrmax - 0x7fff);
1170 if(score < minsc_) {
1171 flag = -1; // no
1172 if(!debug) met.dpfail++;
1173 return score;
1174 }
1175 }
1176
1177 // Could we have saturated?
1178 if(lrmax == MIN_I16) {
1179 flag = -2; // yes
1180 if(!debug) met.dpsat++;
1181 return MIN_I64;
1182 }
1183
1184 // Return largest score
1185 if(!debug) met.dpsucc++;
1186 return score;
1187 }
1188
1189 /**
1190 * Given a filled-in DP table, populate the btncand_ list with candidate cells
1191 * that might be at the ends of valid alignments. No need to do this unless
1192 * the maximum score returned by the align*() func is >= the minimum.
1193 *
1194 * Only cells that are exhaustively scored are candidates. Those are the
1195 * cells inside the shape made of o's in this:
1196 *
1197 * |-maxgaps-|
1198 * ********************************* -
1199 * ******************************** |
1200 * ******************************* |
1201 * ****************************** |
1202 * ***************************** |
1203 * **************************** read len
1204 * *************************** |
1205 * ************************** |
1206 * ************************* |
1207 * ************************ |
1208 * ***********oooooooooooo -
1209 * |-maxgaps-|
1210 * |-readlen-|
1211 * |-------skip--------|
1212 *
1213 * And it's possible for the shape to be truncated on the left and right sides.
1214 *
1215 *
1216 */
gatherCellsNucleotidesEnd2EndSseI16(TAlScore best)1217 bool SwAligner::gatherCellsNucleotidesEnd2EndSseI16(TAlScore best) {
1218 // What's the minimum number of rows that can possibly be spanned by an
1219 // alignment that meets the minimum score requirement?
1220 assert(sse16succ_);
1221 const size_t ncol = rff_ - rfi_;
1222 const size_t nrow = dpRows();
1223 assert_gt(nrow, 0);
1224 btncand_.clear();
1225 btncanddone_.clear();
1226 SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
1227 SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
1228 assert(!d.profbuf_.empty());
1229 const size_t colstride = d.mat_.colstride();
1230 ASSERT_ONLY(bool sawbest = false);
1231 __m128i *pvH = d.mat_.hvec(d.lastIter_, 0);
1232 for(size_t j = 0; j < ncol; j++) {
1233 TAlScore sc = (TAlScore)(((TCScore*)pvH)[d.lastWord_] - 0x7fff);
1234 assert_leq(sc, best);
1235 ASSERT_ONLY(sawbest = (sawbest || sc == best));
1236 if(sc >= minsc_) {
1237 // Yes, this is legit
1238 met.gathsol++;
1239 btncand_.expand();
1240 btncand_.back().init(nrow-1, j, sc);
1241 }
1242 pvH += colstride;
1243 }
1244 assert(sawbest);
1245 if(!btncand_.empty()) {
1246 d.mat_.initMasks();
1247 }
1248 return !btncand_.empty();
1249 }
1250
// Move a striped-matrix cursor up by one read row.  'vec' points at the
// current __m128i, 'rowvec' is the vector's index within the column block,
// and 'rowelt' selects the 16-bit word inside the vector.  Normally moving
// up means stepping back one vector (vec -= ROWSTRIDE, rowvec--).  When
// already at the first vector of the block (rowvec == 0), the row above
// lives in the LAST vector at the previous word position, so we wrap
// rowvec to nvecrow_, decrement rowelt, and adjust vec by
// (colstride_ - ROWSTRIDE) net.  Expects 'd' (SSEData) and ROWSTRIDE to be
// in scope at the expansion site.
#define MOVE_VEC_PTR_UP(vec, rowvec, rowelt) { \
	if(rowvec == 0) { \
		rowvec += d.mat_.nvecrow_; \
		vec += d.mat_.colstride_; \
		rowelt--; \
	} \
	rowvec--; \
	vec -= ROWSTRIDE; \
}
1260
// Move a striped-matrix cursor one reference column to the left.  Columns
// are colstride_ __m128i's apart; the within-column coordinates
// (rowvec/rowelt) are unchanged.  Expects 'd' (SSEData) in scope.
#define MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt) { vec -= d.mat_.colstride_; }
1262
// Move a striped-matrix cursor diagonally: up one read row, then left one
// reference column (composition of the two macros above).
#define MOVE_VEC_PTR_UPLEFT(vec, rowvec, rowelt) { \
	MOVE_VEC_PTR_UP(vec, rowvec, rowelt); \
	MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt); \
}
1267
// Shift all four backtrace cursors (current, left, up, up-left) one
// reference column to the left.  Expects the cur/left/up/upleft pointer
// and coordinate locals (as set up by NEW_ROW_COL) to be in scope.
#define MOVE_ALL_LEFT() { \
	MOVE_VEC_PTR_LEFT(cur_vec, rowvec, rowelt); \
	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
	MOVE_VEC_PTR_LEFT(up_vec, up_rowvec, up_rowelt); \
	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
}
1274
// Shift all four backtrace cursors (current, left, up, up-left) up one
// read row.  Expects the cur/left/up/upleft pointer and coordinate locals
// (as set up by NEW_ROW_COL) to be in scope.
#define MOVE_ALL_UP() { \
	MOVE_VEC_PTR_UP(cur_vec, rowvec, rowelt); \
	MOVE_VEC_PTR_UP(left_vec, left_rowvec, left_rowelt); \
	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
	MOVE_VEC_PTR_UP(upleft_vec, upleft_rowvec, upleft_rowelt); \
}
1281
// Shift all four backtrace cursors (current, left, up, up-left)
// diagonally: up one read row and left one reference column.  Expects the
// cur/left/up/upleft pointer and coordinate locals (as set up by
// NEW_ROW_COL) to be in scope.
#define MOVE_ALL_UPLEFT() { \
	MOVE_VEC_PTR_UPLEFT(cur_vec, rowvec, rowelt); \
	MOVE_VEC_PTR_UPLEFT(left_vec, left_rowvec, left_rowelt); \
	MOVE_VEC_PTR_UPLEFT(up_vec, up_rowvec, up_rowelt); \
	MOVE_VEC_PTR_UPLEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
}
1288
// (Re)initialize the four backtrace cursors for matrix cell (row, col).
// Translates the plain row index into striped-layout coordinates (rowelt =
// word within the vector, rowvec = vector within the column block), points
// cur_vec at the corresponding __m128i in the matrix buffer, then derives
// the left, up and up-left cursors by applying the corresponding move
// macros.  Expects 'd' (SSEData), ROWSTRIDE, and all of the
// *_vec / *_rowvec / *_rowelt / eltvec locals to be declared at the
// expansion site.
#define NEW_ROW_COL(row, col) { \
	rowelt = row / d.mat_.nvecrow_; \
	rowvec = row % d.mat_.nvecrow_; \
	eltvec = (col * d.mat_.colstride_) + (rowvec * ROWSTRIDE); \
	cur_vec = d.mat_.matbuf_.ptr() + eltvec; \
	left_vec = cur_vec; \
	left_rowelt = rowelt; \
	left_rowvec = rowvec; \
	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
	up_vec = cur_vec; \
	up_rowelt = rowelt; \
	up_rowvec = rowvec; \
	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
	upleft_vec = up_vec; \
	upleft_rowelt = up_rowelt; \
	upleft_rowvec = up_rowvec; \
	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
}
1307
1308 /**
1309 * Given the dynamic programming table and a cell, trace backwards from the
1310 * cell and install the edits and score/penalty in the appropriate fields
1311 * of res. The RandomSource is used to break ties among equally good ways
1312 * of tracing back.
1313 *
1314 * Whenever we enter a cell, we check whether the read/ref coordinates of
1315 * that cell correspond to a cell we traversed constructing a previous
1316 * alignment. If so, we backtrack to the last decision point, mask out the
1317 * path that led to the previously observed cell, and continue along a
1318 * different path; or, if there are no more paths to try, we give up.
1319 *
1320 * If an alignment is found, 'off' is set to the alignment's upstream-most
1321 * reference character's offset into the chromosome and true is returned.
1322 * Otherwise, false is returned.
1323 */
backtraceNucleotidesEnd2EndSseI16(TAlScore escore,SwResult & res,size_t & off,size_t & nbts,size_t row,size_t col,RandomSource & rnd)1324 bool SwAligner::backtraceNucleotidesEnd2EndSseI16(
1325 TAlScore escore, // in: expected score
1326 SwResult& res, // out: store results (edits and scores) here
1327 size_t& off, // out: store diagonal projection of origin
1328 size_t& nbts, // out: # backtracks
1329 size_t row, // start in this row
1330 size_t col, // start in this column
1331 RandomSource& rnd) // random gen, to choose among equal paths
1332 {
1333 assert_lt(row, dpRows());
1334 assert_lt(col, (size_t)(rff_ - rfi_));
1335 SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
1336 SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
1337 met.bt++;
1338 assert(!d.profbuf_.empty());
1339 assert_lt(row, rd_->length());
1340 btnstack_.clear(); // empty the backtrack stack
1341 btcells_.clear(); // empty the cells-so-far list
1342 AlnScore score; score.score_ = 0;
1343 score.gaps_ = score.ns_ = 0;
1344 size_t origCol = col;
1345 size_t gaps = 0, readGaps = 0, refGaps = 0;
1346 res.alres.reset();
1347 EList<Edit>& ned = res.alres.ned();
1348 assert(ned.empty());
1349 assert_gt(dpRows(), row);
1350 size_t trimEnd = dpRows() - row - 1;
1351 size_t trimBeg = 0;
1352 size_t ct = SSEMatrix::H; // cell type
1353 // Row and col in terms of where they fall in the SSE vector matrix
1354 size_t rowelt, rowvec, eltvec;
1355 size_t left_rowelt, up_rowelt, upleft_rowelt;
1356 size_t left_rowvec, up_rowvec, upleft_rowvec;
1357 __m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
1358 NEW_ROW_COL(row, col);
1359 while((int)row >= 0) {
1360 met.btcell++;
1361 nbts++;
1362 int readc = (*rd_)[rdi_ + row];
1363 int refm = (int)rf_[rfi_ + col];
1364 int readq = (*qu_)[row];
1365 assert_leq(col, origCol);
1366 // Get score in this cell
1367 bool empty = false, reportedThru, canMoveThru, branch = false;
1368 int cur = SSEMatrix::H;
1369 if(!d.mat_.reset_[row]) {
1370 d.mat_.resetRow(row);
1371 }
1372 reportedThru = d.mat_.reportedThrough(row, col);
1373 canMoveThru = true;
1374 if(reportedThru) {
1375 canMoveThru = false;
1376 } else {
1377 empty = false;
1378 if(row > 0) {
1379 assert_gt(row, 0);
1380 size_t rowFromEnd = d.mat_.nrow() - row - 1;
1381 bool gapsAllowed = true;
1382 if(row < (size_t)sc_->gapbar ||
1383 rowFromEnd < (size_t)sc_->gapbar)
1384 {
1385 gapsAllowed = false;
1386 }
1387 const TAlScore floorsc = MIN_I64;
1388 const int offsetsc = -0x7fff;
1389 // Move to beginning of column/row
1390 if(ct == SSEMatrix::E) { // AKA rdgap
1391 assert_gt(col, 0);
1392 TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::E))[rowelt] + offsetsc;
1393 assert(gapsAllowed);
1394 // Currently in the E matrix; incoming transition must come from the
1395 // left. It's either a gap open from the H matrix or a gap extend from
1396 // the E matrix.
1397 // TODO: save and restore origMask as well as mask
1398 int origMask = 0, mask = 0;
1399 // Get H score of cell to the left
1400 TAlScore sc_h_left = ((TCScore*)(left_vec + SSEMatrix::H))[left_rowelt] + offsetsc;
1401 if(sc_h_left > floorsc && sc_h_left - sc_->readGapOpen() == sc_cur) {
1402 mask |= (1 << 0);
1403 }
1404 // Get E score of cell to the left
1405 TAlScore sc_e_left = ((TCScore*)(left_vec + SSEMatrix::E))[left_rowelt] + offsetsc;
1406 if(sc_e_left > floorsc && sc_e_left - sc_->readGapExtend() == sc_cur) {
1407 mask |= (1 << 1);
1408 }
1409 origMask = mask;
1410 assert(origMask > 0 || sc_cur <= sc_->match());
1411 if(d.mat_.isEMaskSet(row, col)) {
1412 mask = (d.mat_.masks_[row][col] >> 8) & 3;
1413 }
1414 if(mask == 3) {
1415 #if 1
1416 // Pick H -> E cell
1417 cur = SW_BT_OALL_READ_OPEN;
1418 d.mat_.eMaskSet(row, col, 2); // might choose E later
1419 #else
1420 if(rnd.nextU2()) {
1421 // Pick H -> E cell
1422 cur = SW_BT_OALL_READ_OPEN;
1423 d.mat_.eMaskSet(row, col, 2); // might choose E later
1424 } else {
1425 // Pick E -> E cell
1426 cur = SW_BT_RDGAP_EXTEND;
1427 d.mat_.eMaskSet(row, col, 1); // might choose H later
1428 }
1429 #endif
1430 branch = true;
1431 } else if(mask == 2) {
1432 // I chose the E cell
1433 cur = SW_BT_RDGAP_EXTEND;
1434 d.mat_.eMaskSet(row, col, 0); // done
1435 } else if(mask == 1) {
1436 // I chose the H cell
1437 cur = SW_BT_OALL_READ_OPEN;
1438 d.mat_.eMaskSet(row, col, 0); // done
1439 } else {
1440 empty = true;
1441 // It's empty, so the only question left is whether we should be
						// allowed to terminate in this cell. If it's got a valid score
1443 // then we *shouldn't* be allowed to terminate here because that
1444 // means it's part of a larger alignment that was already reported.
1445 canMoveThru = (origMask == 0);
1446 }
1447 assert(!empty || !canMoveThru);
1448 } else if(ct == SSEMatrix::F) { // AKA rfgap
1449 assert_gt(row, 0);
1450 assert(gapsAllowed);
1451 TAlScore sc_h_up = ((TCScore*)(up_vec + SSEMatrix::H))[up_rowelt] + offsetsc;
1452 TAlScore sc_f_up = ((TCScore*)(up_vec + SSEMatrix::F))[up_rowelt] + offsetsc;
1453 TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::F))[rowelt] + offsetsc;
1454 // Currently in the F matrix; incoming transition must come from above.
1455 // It's either a gap open from the H matrix or a gap extend from the F
1456 // matrix.
1457 // TODO: save and restore origMask as well as mask
1458 int origMask = 0, mask = 0;
1459 // Get H score of cell above
1460 if(sc_h_up > floorsc && sc_h_up - sc_->refGapOpen() == sc_cur) {
1461 mask |= (1 << 0);
1462 }
1463 // Get F score of cell above
1464 if(sc_f_up > floorsc && sc_f_up - sc_->refGapExtend() == sc_cur) {
1465 mask |= (1 << 1);
1466 }
1467 origMask = mask;
1468 assert(origMask > 0 || sc_cur <= sc_->match());
1469 if(d.mat_.isFMaskSet(row, col)) {
1470 mask = (d.mat_.masks_[row][col] >> 11) & 3;
1471 }
1472 if(mask == 3) {
1473 #if 1
1474 // I chose the H cell
1475 cur = SW_BT_OALL_REF_OPEN;
1476 d.mat_.fMaskSet(row, col, 2); // might choose E later
1477 #else
1478 if(rnd.nextU2()) {
1479 // I chose the H cell
1480 cur = SW_BT_OALL_REF_OPEN;
1481 d.mat_.fMaskSet(row, col, 2); // might choose E later
1482 } else {
1483 // I chose the F cell
1484 cur = SW_BT_RFGAP_EXTEND;
1485 d.mat_.fMaskSet(row, col, 1); // might choose E later
1486 }
1487 #endif
1488 branch = true;
1489 } else if(mask == 2) {
1490 // I chose the F cell
1491 cur = SW_BT_RFGAP_EXTEND;
1492 d.mat_.fMaskSet(row, col, 0); // done
1493 } else if(mask == 1) {
1494 // I chose the H cell
1495 cur = SW_BT_OALL_REF_OPEN;
1496 d.mat_.fMaskSet(row, col, 0); // done
1497 } else {
1498 empty = true;
1499 // It's empty, so the only question left is whether we should be
						// allowed to terminate in this cell. If it's got a valid score
1501 // then we *shouldn't* be allowed to terminate here because that
1502 // means it's part of a larger alignment that was already reported.
1503 canMoveThru = (origMask == 0);
1504 }
1505 assert(!empty || !canMoveThru);
1506 } else {
1507 assert_eq(SSEMatrix::H, ct);
1508 TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::H))[rowelt] + offsetsc;
1509 TAlScore sc_f_up = ((TCScore*)(up_vec + SSEMatrix::F))[up_rowelt] + offsetsc;
1510 TAlScore sc_h_up = ((TCScore*)(up_vec + SSEMatrix::H))[up_rowelt] + offsetsc;
1511 TAlScore sc_h_left = col > 0 ? (((TCScore*)(left_vec + SSEMatrix::H))[left_rowelt] + offsetsc) : floorsc;
1512 TAlScore sc_e_left = col > 0 ? (((TCScore*)(left_vec + SSEMatrix::E))[left_rowelt] + offsetsc) : floorsc;
1513 TAlScore sc_h_upleft = col > 0 ? (((TCScore*)(upleft_vec + SSEMatrix::H))[upleft_rowelt] + offsetsc) : floorsc;
1514 TAlScore sc_diag = sc_->score(readc, refm, readq - 33);
1515 // TODO: save and restore origMask as well as mask
1516 int origMask = 0, mask = 0;
1517 if(gapsAllowed) {
1518 if(sc_h_up > floorsc && sc_cur == sc_h_up - sc_->refGapOpen()) {
1519 mask |= (1 << 0);
1520 }
1521 if(sc_h_left > floorsc && sc_cur == sc_h_left - sc_->readGapOpen()) {
1522 mask |= (1 << 1);
1523 }
1524 if(sc_f_up > floorsc && sc_cur == sc_f_up - sc_->refGapExtend()) {
1525 mask |= (1 << 2);
1526 }
1527 if(sc_e_left > floorsc && sc_cur == sc_e_left - sc_->readGapExtend()) {
1528 mask |= (1 << 3);
1529 }
1530 }
1531 if(sc_h_upleft > floorsc && sc_cur == sc_h_upleft + sc_diag) {
1532 mask |= (1 << 4);
1533 }
1534 origMask = mask;
1535 assert(origMask > 0 || sc_cur <= sc_->match());
1536 if(d.mat_.isHMaskSet(row, col)) {
1537 mask = (d.mat_.masks_[row][col] >> 2) & 31;
1538 }
1539 assert(gapsAllowed || mask == (1 << 4) || mask == 0);
1540 int opts = alts5[mask];
1541 int select = -1;
1542 if(opts == 1) {
1543 select = firsts5[mask];
1544 assert_geq(mask, 0);
1545 d.mat_.hMaskSet(row, col, 0);
1546 } else if(opts > 1) {
1547 #if 1
1548 if( (mask & 16) != 0) {
1549 select = 4; // H diag
1550 } else if((mask & 1) != 0) {
1551 select = 0; // H up
1552 } else if((mask & 4) != 0) {
1553 select = 2; // F up
1554 } else if((mask & 2) != 0) {
1555 select = 1; // H left
1556 } else if((mask & 8) != 0) {
1557 select = 3; // E left
1558 }
1559 #else
1560 select = randFromMask(rnd, mask);
1561 #endif
1562 assert_geq(mask, 0);
1563 mask &= ~(1 << select);
1564 assert(gapsAllowed || mask == (1 << 4) || mask == 0);
1565 d.mat_.hMaskSet(row, col, mask);
1566 branch = true;
1567 } else { /* No way to backtrack! */ }
1568 if(select != -1) {
1569 if(select == 4) {
1570 cur = SW_BT_OALL_DIAG;
1571 } else if(select == 0) {
1572 cur = SW_BT_OALL_REF_OPEN;
1573 } else if(select == 1) {
1574 cur = SW_BT_OALL_READ_OPEN;
1575 } else if(select == 2) {
1576 cur = SW_BT_RFGAP_EXTEND;
1577 } else {
1578 assert_eq(3, select)
1579 cur = SW_BT_RDGAP_EXTEND;
1580 }
1581 } else {
1582 empty = true;
1583 // It's empty, so the only question left is whether we should be
						// allowed to terminate in this cell. If it's got a valid score
1585 // then we *shouldn't* be allowed to terminate here because that
1586 // means it's part of a larger alignment that was already reported.
1587 canMoveThru = (origMask == 0);
1588 }
1589 }
1590 assert(!empty || !canMoveThru || ct == SSEMatrix::H);
1591 }
1592 }
1593 d.mat_.setReportedThrough(row, col);
1594 assert_eq(gaps, Edit::numGaps(ned));
1595 assert_leq(gaps, rdgap_ + rfgap_);
1596 // Cell was involved in a previously-reported alignment?
1597 if(!canMoveThru) {
1598 if(!btnstack_.empty()) {
1599 // Remove all the cells from list back to and including the
1600 // cell where the branch occurred
1601 btcells_.resize(btnstack_.back().celsz);
1602 // Pop record off the top of the stack
1603 ned.resize(btnstack_.back().nedsz);
1604 //aed.resize(btnstack_.back().aedsz);
1605 row = btnstack_.back().row;
1606 col = btnstack_.back().col;
1607 gaps = btnstack_.back().gaps;
1608 readGaps = btnstack_.back().readGaps;
1609 refGaps = btnstack_.back().refGaps;
1610 score = btnstack_.back().score;
1611 ct = btnstack_.back().ct;
1612 btnstack_.pop_back();
1613 assert(!sc_->monotone || score.score() >= escore);
1614 NEW_ROW_COL(row, col);
1615 continue;
1616 } else {
1617 // No branch points to revisit; just give up
1618 res.reset();
1619 met.btfail++; // DP backtraces failed
1620 return false;
1621 }
1622 }
1623 assert(!reportedThru);
1624 assert(!sc_->monotone || score.score() >= minsc_);
1625 if(empty || row == 0) {
1626 assert_eq(SSEMatrix::H, ct);
1627 btcells_.expand();
1628 btcells_.back().first = row;
1629 btcells_.back().second = col;
1630 // This cell is at the end of a legitimate alignment
1631 trimBeg = row;
1632 assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
1633 break;
1634 }
1635 if(branch) {
1636 // Add a frame to the backtrack stack
1637 btnstack_.expand();
1638 btnstack_.back().init(
1639 ned.size(),
1640 0, // aed.size()
1641 btcells_.size(),
1642 row,
1643 col,
1644 gaps,
1645 readGaps,
1646 refGaps,
1647 score,
1648 (int)ct);
1649 }
1650 btcells_.expand();
1651 btcells_.back().first = row;
1652 btcells_.back().second = col;
1653 switch(cur) {
1654 // Move up and to the left. If the reference nucleotide in the
1655 // source row mismatches the read nucleotide, penalize
1656 // it and add a nucleotide mismatch.
1657 case SW_BT_OALL_DIAG: {
1658 assert_gt(row, 0); assert_gt(col, 0);
1659 // Check for color mismatch
1660 int readC = (*rd_)[row];
1661 int refNmask = (int)rf_[rfi_+col];
1662 assert_gt(refNmask, 0);
1663 int m = matchesEx(readC, refNmask);
1664 ct = SSEMatrix::H;
1665 if(m != 1) {
1666 Edit e(
1667 (int)row,
1668 mask2dna[refNmask],
1669 "ACGTN"[readC],
1670 EDIT_TYPE_MM);
1671 assert(e.repOk());
1672 assert(ned.empty() || ned.back().pos >= row);
1673 ned.push_back(e);
1674 int pen = QUAL2(row, col);
1675 score.score_ -= pen;
1676 assert(!sc_->monotone || score.score() >= escore);
1677 } else {
1678 // Reward a match
1679 int64_t bonus = sc_->match(30);
1680 score.score_ += bonus;
1681 assert(!sc_->monotone || score.score() >= escore);
1682 }
1683 if(m == -1) {
1684 score.ns_++;
1685 }
1686 row--; col--;
1687 MOVE_ALL_UPLEFT();
1688 assert(VALID_AL_SCORE(score));
1689 break;
1690 }
1691 // Move up. Add an edit encoding the ref gap.
1692 case SW_BT_OALL_REF_OPEN:
1693 {
1694 assert_gt(row, 0);
1695 Edit e(
1696 (int)row,
1697 '-',
1698 "ACGTN"[(int)(*rd_)[row]],
1699 EDIT_TYPE_REF_GAP);
1700 assert(e.repOk());
1701 assert(ned.empty() || ned.back().pos >= row);
1702 ned.push_back(e);
1703 assert_geq(row, (size_t)sc_->gapbar);
1704 assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
1705 row--;
1706 ct = SSEMatrix::H;
1707 int pen = sc_->refGapOpen();
1708 score.score_ -= pen;
1709 assert(!sc_->monotone || score.score() >= minsc_);
1710 gaps++; refGaps++;
1711 assert_eq(gaps, Edit::numGaps(ned));
1712 assert_leq(gaps, rdgap_ + rfgap_);
1713 MOVE_ALL_UP();
1714 break;
1715 }
1716 // Move up. Add an edit encoding the ref gap.
1717 case SW_BT_RFGAP_EXTEND:
1718 {
1719 assert_gt(row, 1);
1720 Edit e(
1721 (int)row,
1722 '-',
1723 "ACGTN"[(int)(*rd_)[row]],
1724 EDIT_TYPE_REF_GAP);
1725 assert(e.repOk());
1726 assert(ned.empty() || ned.back().pos >= row);
1727 ned.push_back(e);
1728 assert_geq(row, (size_t)sc_->gapbar);
1729 assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
1730 row--;
1731 ct = SSEMatrix::F;
1732 int pen = sc_->refGapExtend();
1733 score.score_ -= pen;
1734 assert(!sc_->monotone || score.score() >= minsc_);
1735 gaps++; refGaps++;
1736 assert_eq(gaps, Edit::numGaps(ned));
1737 assert_leq(gaps, rdgap_ + rfgap_);
1738 MOVE_ALL_UP();
1739 break;
1740 }
1741 case SW_BT_OALL_READ_OPEN:
1742 {
1743 assert_gt(col, 0);
1744 Edit e(
1745 (int)row+1,
1746 mask2dna[(int)rf_[rfi_+col]],
1747 '-',
1748 EDIT_TYPE_READ_GAP);
1749 assert(e.repOk());
1750 assert(ned.empty() || ned.back().pos >= row);
1751 ned.push_back(e);
1752 assert_geq(row, (size_t)sc_->gapbar);
1753 assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
1754 col--;
1755 ct = SSEMatrix::H;
1756 int pen = sc_->readGapOpen();
1757 score.score_ -= pen;
1758 assert(!sc_->monotone || score.score() >= minsc_);
1759 gaps++; readGaps++;
1760 assert_eq(gaps, Edit::numGaps(ned));
1761 assert_leq(gaps, rdgap_ + rfgap_);
1762 MOVE_ALL_LEFT();
1763 break;
1764 }
1765 case SW_BT_RDGAP_EXTEND:
1766 {
1767 assert_gt(col, 1);
1768 Edit e(
1769 (int)row+1,
1770 mask2dna[(int)rf_[rfi_+col]],
1771 '-',
1772 EDIT_TYPE_READ_GAP);
1773 assert(e.repOk());
1774 assert(ned.empty() || ned.back().pos >= row);
1775 ned.push_back(e);
1776 assert_geq(row, (size_t)sc_->gapbar);
1777 assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
1778 col--;
1779 ct = SSEMatrix::E;
1780 int pen = sc_->readGapExtend();
1781 score.score_ -= pen;
1782 assert(!sc_->monotone || score.score() >= minsc_);
1783 gaps++; readGaps++;
1784 assert_eq(gaps, Edit::numGaps(ned));
1785 assert_leq(gaps, rdgap_ + rfgap_);
1786 MOVE_ALL_LEFT();
1787 break;
1788 }
1789 default: throw 1;
1790 }
1791 } // while((int)row > 0)
1792 assert_eq(0, trimBeg);
1793 assert_eq(0, trimEnd);
1794 assert_geq(col, 0);
1795 assert_eq(SSEMatrix::H, ct);
1796 // The number of cells in the backtracs should equal the number of read
1797 // bases after trimming plus the number of gaps
1798 assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
1799 // Check whether we went through a core diagonal and set 'reported' flag on
1800 // each cell
1801 bool overlappedCoreDiag = false;
1802 for(size_t i = 0; i < btcells_.size(); i++) {
1803 size_t rw = btcells_[i].first;
1804 size_t cl = btcells_[i].second;
1805 // Calculate the diagonal within the *trimmed* rectangle, i.e. the
1806 // rectangle we dealt with in align, gather and backtrack.
1807 int64_t diagi = cl - rw;
1808 // Now adjust to the diagonal within the *untrimmed* rectangle by
1809 // adding on the amount trimmed from the left.
1810 diagi += rect_->triml;
1811 if(diagi >= 0) {
1812 size_t diag = (size_t)diagi;
1813 if(diag >= rect_->corel && diag <= rect_->corer) {
1814 overlappedCoreDiag = true;
1815 break;
1816 }
1817 }
1818 assert(d.mat_.reportedThrough(rw, cl));
1819 }
1820 if(!overlappedCoreDiag) {
1821 // Must overlap a core diagonal. Otherwise, we run the risk of
1822 // reporting an alignment that overlaps (and trumps) a higher-scoring
1823 // alignment that lies partially outside the dynamic programming
1824 // rectangle.
1825 res.reset();
1826 met.corerej++;
1827 return false;
1828 }
1829 int readC = (*rd_)[rdi_+row]; // get last char in read
1830 int refNmask = (int)rf_[rfi_+col]; // get last ref char ref involved in aln
1831 assert_gt(refNmask, 0);
1832 int m = matchesEx(readC, refNmask);
1833 if(m != 1) {
1834 Edit e((int)row, mask2dna[refNmask], "ACGTN"[readC], EDIT_TYPE_MM);
1835 assert(e.repOk());
1836 assert(ned.empty() || ned.back().pos >= row);
1837 ned.push_back(e);
1838 score.score_ -= QUAL2(row, col);
1839 assert_geq(score.score(), minsc_);
1840 } else {
1841 score.score_ += sc_->match(30);
1842 }
1843 if(m == -1) {
1844 score.ns_++;
1845 }
1846 if(score.ns_ > nceil_) {
1847 // Alignment has too many Ns in it!
1848 res.reset();
1849 met.nrej++;
1850 return false;
1851 }
1852 res.reverse();
1853 assert(Edit::repOk(ned, (*rd_)));
1854 assert_eq(score.score(), escore);
1855 assert_leq(gaps, rdgap_ + rfgap_);
1856 off = col;
1857 assert_lt(col + (size_t)rfi_, (size_t)rff_);
1858 score.gaps_ = gaps;
1859 res.alres.setScore(score);
1860 res.alres.setShape(
1861 refidx_, // ref id
1862 off + rfi_ + rect_->refl, // 0-based ref offset
1863 reflen_, // reference length
1864 fw_, // aligned to Watson?
1865 rdf_ - rdi_, // read length
1866 0, // read ID
1867 true, // pretrim soft?
1868 0, // pretrim 5' end
1869 0, // pretrim 3' end
1870 true, // alignment trim soft?
1871 fw_ ? trimBeg : trimEnd, // alignment trim 5' end
1872 fw_ ? trimEnd : trimBeg); // alignment trim 3' end
1873 size_t refns = 0;
1874 for(size_t i = col; i <= origCol; i++) {
1875 if((int)rf_[rfi_+i] > 15) {
1876 refns++;
1877 }
1878 }
1879 res.alres.setRefNs(refns);
1880 assert(Edit::repOk(ned, (*rd_), true, trimBeg, trimEnd));
1881 assert(res.repOk());
1882 #ifndef NDEBUG
1883 size_t gapsCheck = 0;
1884 for(size_t i = 0; i < ned.size(); i++) {
1885 if(ned[i].isGap()) gapsCheck++;
1886 }
1887 assert_eq(gaps, gapsCheck);
1888 BTDnaString refstr;
1889 for(size_t i = col; i <= origCol; i++) {
1890 refstr.append(firsts5[(int)rf_[rfi_+i]]);
1891 }
1892 BTDnaString editstr;
1893 // daehwan
1894 // Edit::toRef((*rd_), ned, editstr, true, trimBeg, trimEnd);
1895 Edit::toRef((*rd_), ned, editstr, true, trimBeg + rdi_, trimEnd + (rd_->length() - rdf_));
1896 if(refstr != editstr) {
1897 cerr << "Decoded nucleotides and edits don't match reference:" << endl;
1898 cerr << " score: " << score.score()
1899 << " (" << gaps << " gaps)" << endl;
1900 cerr << " edits: ";
1901 Edit::print(cerr, ned);
1902 cerr << endl;
1903 cerr << " decoded nucs: " << (*rd_) << endl;
1904 cerr << " edited nucs: " << editstr << endl;
1905 cerr << " reference nucs: " << refstr << endl;
1906 assert(0);
1907 }
1908 #endif
1909 met.btsucc++; // DP backtraces succeeded
1910 return true;
1911 }
1912