1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 /*
21  *  aligner_sw_driver.h
22  *
23  * REDUNDANT SEED HITS
24  *
25  * We say that two seed hits are redundant if they trigger identical
26  * seed-extend dynamic programming problems.  Put another way, they both lie on
27  * the same diagonal of the overall read/reference dynamic programming matrix.
28  * Detecting redundant seed hits is simple when the seed hits are ungapped.  We
29  * do this after offset resolution but before the offset is converted to genome
30  * coordinates (see uses of the seenDiags1_/seenDiags2_ fields for examples).
31  *
32  * REDUNDANT ALIGNMENTS
33  *
34  * In an unpaired context, we say that two alignments are redundant if they
35  * share any cells in the global DP table.  Roughly speaking, this is like
36  * saying that two alignments are redundant if any read character aligns to the
37  * same reference character (same reference sequence, same strand, same offset)
38  * in both alignments.
39  *
40  * In a paired-end context, we say that two paired-end alignments are redundant
41  * if the mate #1s are redundant and the mate #2s are redundant.
42  *
43  * How do we enforce this?  In the unpaired context, this is relatively simple:
44  * the cells from each alignment are checked against a set containing all cells
45  * from all previous alignments.  Given a new alignment, for each cell in the
46  * new alignment we check whether it is in the set.  If there is any overlap,
47  * the new alignment is rejected as redundant.  Otherwise, the new alignment is
48  * accepted and its cells are added to the set.
49  *
50  * Enforcement in a paired context is a little trickier.  Consider the
51  * following approaches:
52  *
53  * 1. Skip anchors that are redundant with any previous anchor or opposite
54  *    alignment.  This is sufficient to ensure no two concordant alignments
55  *    found are redundant.
56  *
57  * 2. Same as scheme 1, but with a "transitive closure" scheme for finding all
58  *    concordant pairs in the vicinity of an anchor.  Consider the AB/AC
59  *    scenario from the previous paragraph.  If B is the anchor alignment, we
60  *    will find AB but not AC.  But under this scheme, once we find AB we then
61  *    let B be a new anchor and immediately look for its opposites.  Likewise,
62  *    if we find any opposite, we make them anchors and continue searching.  We
63  *    don't stop searching until every opposite is used as an anchor.
64  *
65  * 3. Skip anchors that are redundant with any previous anchor alignment (but
66  *    allow anchors that are redundant with previous opposite alignments).
67  *    This isn't sufficient to avoid redundant concordant alignments.  To avoid
68  *    redundant concordants, we need an additional procedure that checks each
69  *    new concordant alignment one-by-one against a list of previous concordant
70  *    alignments to see if it is redundant.
71  *
72  * We take approach 1.
73  */
74 
75 #ifndef ALIGNER_SW_DRIVER_H_
76 #define ALIGNER_SW_DRIVER_H_
77 
78 #include <iostream>
79 // -- BTL remove --
80 #include <stdlib.h>
81 #include <sys/time.h>
82 // -- --
83 #include <utility>
84 #include "ds.h"
85 #include "aligner_seed.h"
86 #include "aligner_sw.h"
87 #include "aligner_cache.h"
88 #include "reference.h"
89 #include "group_walk.h"
90 #include "gfm.h"
91 #include "mem_ids.h"
92 #include "aln_sink.h"
93 #include "pe.h"
94 #include "ival_list.h"
95 #include "simple_func.h"
96 #include "random_util.h"
97 #include "dp_framer.h"
98 
99 using namespace std;
100 
101 template <typename index_t>
102 struct SeedPos {
103 
SeedPosSeedPos104 	SeedPos() : fw(false), offidx(0), rdoff(0), seedlen(0) { }
105 
SeedPosSeedPos106 	SeedPos(
107 		bool fw_,
108 		index_t offidx_,
109 		index_t rdoff_,
110 		index_t seedlen_)
111 	{
112 		init(fw_, offidx_, rdoff_, seedlen_);
113 	}
114 
initSeedPos115 	void init(
116 		bool fw_,
117 		index_t offidx_,
118 		index_t rdoff_,
119 		index_t seedlen_)
120 	{
121 		fw      = fw_;
122 		offidx  = offidx_;
123 		rdoff   = rdoff_;
124 		seedlen = seedlen_;
125 	}
126 
127 	bool operator<(const SeedPos& o) const {
128 		if(offidx < o.offidx)   return true;
129 		if(offidx > o.offidx)   return false;
130 		if(rdoff < o.rdoff)     return true;
131 		if(rdoff > o.rdoff)     return false;
132 		if(seedlen < o.seedlen) return true;
133 		if(seedlen > o.seedlen) return false;
134 		if(fw && !o.fw)         return true;
135 		if(!fw && o.fw)         return false;
136 		return false;
137 	}
138 
139 	bool operator>(const SeedPos& o) const {
140 		if(offidx < o.offidx)   return false;
141 		if(offidx > o.offidx)   return true;
142 		if(rdoff < o.rdoff)     return false;
143 		if(rdoff > o.rdoff)     return true;
144 		if(seedlen < o.seedlen) return false;
145 		if(seedlen > o.seedlen) return true;
146 		if(fw && !o.fw)         return false;
147 		if(!fw && o.fw)         return true;
148 		return false;
149 	}
150 
151 	bool operator==(const SeedPos& o) const {
152 		return fw == o.fw && offidx == o.offidx &&
153 		       rdoff == o.rdoff && seedlen == o.seedlen;
154 	}
155 
156 	bool fw;
157 	index_t offidx;
158 	index_t rdoff;
159 	index_t seedlen;
160 };
161 
162 /**
163  * An SATuple along with the associated seed position.
164  */
165 template <typename index_t>
166 struct SATupleAndPos {
167 
168 	SATuple<index_t> sat;    // result for this seed hit
169 	SeedPos<index_t> pos;    // seed position that yielded the range this was taken from
170 	index_t          origSz; // size of range this was taken from
171 	index_t          nlex;   // # position we can extend seed hit to left w/o edit
172 	index_t          nrex;   // # position we can extend seed hit to right w/o edit
173 
174 	bool operator<(const SATupleAndPos& o) const {
175 		if(sat < o.sat) return true;
176 		if(sat > o.sat) return false;
177 		return pos < o.pos;
178 	}
179 
180 	bool operator==(const SATupleAndPos& o) const {
181 		return sat == o.sat && pos == o.pos;
182 	}
183 };
184 
185 /**
186  * Encapsulates the weighted random sampling scheme we want to use to pick
187  * which seed hit range to sample a row from.
188  */
189 template <typename index_t>
190 class RowSampler {
191 
192 public:
193 
elim_(cat)194 	RowSampler(int cat = 0) : elim_(cat), masses_(cat) {
195 		mass_ = 0.0f;
196 	}
197 
198 	/**
199 	 * Initialze sampler with respect to a range of elements in a list of
200 	 * SATupleAndPos's.
201 	 */
init(const EList<SATupleAndPos<index_t>,16> & salist,index_t sai,index_t saf,bool lensq,bool szsq)202 	void init(
203 		const EList<SATupleAndPos<index_t>, 16>& salist,
204 		index_t sai,
205 		index_t saf,
206 		bool lensq, // whether to square the numerator, which = extended length
207 		bool szsq)  // whether to square denominator, which =
208 	{
209 		assert_gt(saf, sai);
210 		elim_.resize(saf - sai);
211 		elim_.fill(false);
212 		// Initialize mass
213 		mass_ = 0.0f;
214 		masses_.resize(saf - sai);
215 		for(index_t i = sai; i < saf; i++) {
216 			index_t len = salist[i].nlex + salist[i].nrex + 1; // + salist[i].sat.key.len;
217 			double num = (double)len;
218 			if(lensq) {
219 				num *= num;
220 			}
221 			double denom = (double)salist[i].sat.size();
222 			if(szsq) {
223 				denom *= denom;
224 			}
225 			masses_[i - sai] = num / denom;
226 			mass_ += masses_[i - sai];
227 		}
228 	}
229 
230 	/**
231 	 * Caller is indicating that the bin at index i is exhausted and we should
232 	 * exclude it from our sampling from now on.
233 	 */
finishedRange(index_t i)234 	void finishedRange(index_t i) {
235 		assert_lt(i, masses_.size());
236 		elim_[i] = true;
237 		mass_ -= masses_[i];
238 	}
239 
240 	/**
241 	 * Sample randomly from the mass.
242 	 */
next(RandomSource & rnd)243 	size_t next(RandomSource& rnd) {
244 		// Throw the dart
245 		double rd = rnd.nextFloat() * mass_;
246 		double mass_sofar = 0.0f;
247 		size_t sz = masses_.size();
248 		size_t last_unelim = std::numeric_limits<size_t>::max();
249 		for(size_t i = 0; i < sz; i++) {
250 			if(!elim_[i]) {
251 				last_unelim = i;
252 				mass_sofar += masses_[i];
253 				if(rd < mass_sofar) {
254 					// This is the one we hit
255 					return i;
256 				}
257 			}
258 		}
259 		assert_neq(std::numeric_limits<size_t>::max(), last_unelim);
260 		return last_unelim;
261 	}
262 
263 protected:
264 	double        mass_;    // total probability mass to throw darts at
265 	EList<bool>   elim_;    // whether the range is eliminated
266 	EList<double> masses_;  // mass of each range
267 };
268 
269 /**
270  * Return values from extendSeeds and extendSeedsPaired.
271  */
272 enum {
273 	// All end-to-end and seed hits were examined
274 	// The policy does not need us to look any further
275 	EXTEND_EXHAUSTED_CANDIDATES = 1,
276 	EXTEND_POLICY_FULFILLED,
277 	// We stopped because we reached a point where the only remaining
278 	// alignments of interest have perfect scores, but we already investigated
279 	// perfect alignments
280 	EXTEND_PERFECT_SCORE,
281 	// We stopped because we ran up against a limit on how much work we should
282 	// do for one set of seed ranges, e.g. the limit on number of consecutive
283 	// unproductive DP extensions
284 	EXTEND_EXCEEDED_SOFT_LIMIT,
285 	// We stopped because we ran up against a limit on how much work we should
286 	// do for overall before giving up on a mate
287 	EXTEND_EXCEEDED_HARD_LIMIT
288 };
289 
290 /**
291  * Data structure encapsulating a range that's been extended out in two
292  * directions.
293  */
294 struct ExtendRange {
295 
initExtendRange296 	void init(size_t off_, size_t len_, size_t sz_) {
297 		off = off_; len = len_; sz = sz_;
298 	}
299 
300 	size_t off; // offset of extended region
301 	size_t len; // length between extremes of extended region
302 	size_t sz;  // # of elements in SA range
303 };
304 
305 template <typename index_t>
306 class SwDriver {
307 
308 	typedef PList<index_t, CACHE_PAGE_SZ> TSAList;
309 
310 public:
311 
SwDriver(size_t bytes)312 	SwDriver(size_t bytes) :
313 		satups_(DP_CAT),
314 		gws_(DP_CAT),
315 		seenDiags1_(DP_CAT),
316 		seenDiags2_(DP_CAT),
317 		redAnchor_(DP_CAT),
318 		redMate1_(DP_CAT),
319 		redMate2_(DP_CAT),
320 		pool_(bytes, CACHE_PAGE_SZ, DP_CAT),
321 		salistEe_(DP_CAT),
322 		gwstate_(GW_CAT) { }
323 
324 	/**
325 	 * Given a collection of SeedHits for a single read, extend seed alignments
326 	 * into full alignments.  Where possible, try to avoid redundant offset
327 	 * lookups and dynamic programming problems.  Optionally report alignments
328 	 * to a AlnSinkWrap object as they are discovered.
329 	 *
330 	 * If 'reportImmediately' is true, returns true iff a call to
331 	 * mhs->report() returned true (indicating that the reporting
332 	 * policy is satisfied and we can stop).  Otherwise, returns false.
333 	 */
334 	int extendSeeds(
335 		Read& rd,                    // read to align
336 		bool mate1,                  // true iff rd is mate #1
337 		SeedResults<index_t>& sh,    // seed hits to extend into full alignments
338 		const GFM<index_t>& gfmFw,   // BWT
339 		const GFM<index_t>* gfmBw,   // BWT'
340 		const BitPairReference& ref, // Reference strings
341 		SwAligner& swa,              // dynamic programming aligner
342 		const Scoring& sc,           // scoring scheme
343 		int seedmms,                 // # mismatches allowed in seed
344 		int seedlen,                 // length of seed
345 		int seedival,                // interval between seeds
346 		TAlScore& minsc,             // minimum score for anchor
347 		int nceil,                   // maximum # Ns permitted in ref portion
348 		size_t maxhalf,              // maximum width on one side of DP table
349 		bool doUngapped,             // do ungapped alignment
350 		size_t maxIters,             // stop after this many seed-extend loop iters
351 		size_t maxUg,                // max # ungapped extends
352 		size_t maxDp,                // max # DPs
353 		size_t maxUgStreak,          // stop after streak of this many ungap fails
354 		size_t maxDpStreak,          // stop after streak of this many dp fails
355 		bool doExtend,               // do seed extension
356 		bool enable8,                // use 8-bit SSE where possible
357 		size_t cminlen,              // use checkpointer if read longer than this
358 		size_t cpow2,                // interval between diagonals to checkpoint
359 		bool doTri,                  // triangular mini-fills
360 		int tighten,                 // -M score tightening mode
361 		AlignmentCacheIface<index_t>& ca,     // alignment cache for seed hits
362 		RandomSource& rnd,           // pseudo-random source
363 		WalkMetrics& wlm,            // group walk left metrics
364 		SwMetrics& swmSeed,          // DP metrics for seed-extend
365 		PerReadMetrics& prm,         // per-read metrics
366 		AlnSinkWrap<index_t>* mhs,   // HitSink for multiseed-style aligner
367 		bool reportImmediately,      // whether to report hits immediately to mhs
368 		bool& exhaustive);
369 
370 	/**
371 	 * Given a collection of SeedHits for a read pair, extend seed
372 	 * alignments into full alignments and then look for the opposite
373 	 * mate using dynamic programming.  Where possible, try to avoid
374 	 * redundant offset lookups.  Optionally report alignments to a
375 	 * AlnSinkWrap object as they are discovered.
376 	 *
377 	 * If 'reportImmediately' is true, returns true iff a call to
378 	 * mhs->report() returned true (indicating that the reporting
379 	 * policy is satisfied and we can stop).  Otherwise, returns false.
380 	 */
381 	int extendSeedsPaired(
382 		Read& rd,                    // mate to align as anchor
383 		Read& ord,                   // mate to align as opposite
384 		bool anchor1,                // true iff anchor mate is mate1
385 		bool oppFilt,                // true iff opposite mate was filtered out
386 		SeedResults<index_t>& sh,    // seed hits for anchor
387 		const GFM<index_t>& gfmFw,   // BWT
388 		const GFM<index_t>* gfmBw,   // BWT'
389 		const BitPairReference& ref, // Reference strings
390 		SwAligner& swa,              // dyn programming aligner for anchor
391 		SwAligner& swao,             // dyn programming aligner for opposite
392 		const Scoring& sc,           // scoring scheme
393 		const PairedEndPolicy& pepol,// paired-end policy
394 		int seedmms,                 // # mismatches allowed in seed
395 		int seedlen,                 // length of seed
396 		int seedival,                // interval between seeds
397 		TAlScore& minsc,             // minimum score for anchor
398 		TAlScore& ominsc,            // minimum score for opposite
399 		int nceil,                   // max # Ns permitted in ref for anchor
400 		int onceil,                  // max # Ns permitted in ref for opposite
401 		bool nofw,                   // don't align forward read
402 		bool norc,                   // don't align revcomp read
403 		size_t maxhalf,              // maximum width on one side of DP table
404 		bool doUngapped,             // do ungapped alignment
405 		size_t maxIters,             // stop after this many seed-extend loop iters
406 		size_t maxUg,                // max # ungapped extends
407 		size_t maxDp,                // max # DPs
408 		size_t maxEeStreak,          // stop after streak of this many end-to-end fails
409 		size_t maxUgStreak,          // stop after streak of this many ungap fails
410 		size_t maxDpStreak,          // stop after streak of this many dp fails
411 		size_t maxMateStreak,        // stop seed range after N mate-find fails
412 		bool doExtend,               // do seed extension
413 		bool enable8,                // use 8-bit SSE where possible
414 		size_t cminlen,              // use checkpointer if read longer than this
415 		size_t cpow2,                // interval between diagonals to checkpoint
416 		bool doTri,                  // triangular mini-fills
417 		int tighten,                 // -M score tightening mode
418 		AlignmentCacheIface<index_t>& cs,     // alignment cache for seed hits
419 		RandomSource& rnd,           // pseudo-random source
420 		WalkMetrics& wlm,            // group walk left metrics
421 		SwMetrics& swmSeed,          // DP metrics for seed-extend
422 		SwMetrics& swmMate,          // DP metrics for mate finidng
423 		PerReadMetrics& prm,         // per-read metrics for anchor
424 		AlnSinkWrap<index_t>* msink, // AlnSink wrapper for multiseed-style aligner
425 		bool swMateImmediately,      // whether to look for mate immediately
426 		bool reportImmediately,      // whether to report hits immediately to msink
427 		bool discord,                // look for discordant alignments?
428 		bool mixed,                  // look for unpaired as well as paired alns?
429 		bool& exhaustive);
430 
431 	/**
432 	 * Prepare for a new read.
433 	 */
nextRead(bool paired,size_t mate1len,size_t mate2len)434 	void nextRead(bool paired, size_t mate1len, size_t mate2len) {
435 		redAnchor_.reset();
436 		seenDiags1_.reset();
437 		seenDiags2_.reset();
438 		seedExRangeFw_[0].clear(); // mate 1 fw
439 		seedExRangeFw_[1].clear(); // mate 2 fw
440 		seedExRangeRc_[0].clear(); // mate 1 rc
441 		seedExRangeRc_[1].clear(); // mate 2 rc
442 		size_t maxlen = mate1len;
443 		if(paired) {
444 			redMate1_.reset();
445 			redMate1_.init(mate1len);
446 			redMate2_.reset();
447 			redMate2_.init(mate2len);
448 			if(mate2len > maxlen) {
449 				maxlen = mate2len;
450 			}
451 		}
452 		redAnchor_.init(maxlen);
453 	}
454 
455 protected:
456 
457 	bool eeSaTups(
458 		const Read& rd,              // read
459 		SeedResults<index_t>& sh,    // seed hits to extend into full alignments
460 		const GFM<index_t>& gfm,     // BWT
461 		const BitPairReference& ref, // Reference strings
462 		RandomSource& rnd,           // pseudo-random generator
463 		WalkMetrics& wlm,            // group walk left metrics
464 		SwMetrics& swmSeed,          // metrics for seed extensions
465 		index_t& nelt_out,           // out: # elements total
466         index_t maxelts,             // max # elts to report
467 		bool all);                   // report all hits?
468 
469     void extend(
470 		const Read& rd,       // read
471 		const GFM<index_t>& gfmFw,   // Forward Bowtie index
472 		const GFM<index_t>* gfmBw,   // Backward Bowtie index
473 		index_t topf,        // top in fw index
474 		index_t botf,        // bot in fw index
475 		index_t topb,        // top in bw index
476 		index_t botb,        // bot in bw index
477 		bool fw,             // seed orientation
478 		index_t off,         // seed offset from 5' end
479 		index_t len,         // seed length
480 		PerReadMetrics& prm, // per-read metrics
481 		index_t& nlex,       // # positions we can extend to left w/o edit
482 		index_t& nrex);      // # positions we can extend to right w/o edit
483 
484 	void prioritizeSATups(
485 		const Read& rd,              // read
486 		SeedResults<index_t>& sh,    // seed hits to extend into full alignments
487 		const GFM<index_t>& gfmFw, // BWT
488 		const GFM<index_t>* gfmBw, // BWT'
489 		const BitPairReference& ref, // Reference strings
490 		int seedmms,                 // # seed mismatches allowed
491 		index_t maxelt,              // max elts we'll consider
492 		bool doExtend,               // extend out seeds
493 		bool lensq,                  // square extended length
494 		bool szsq,                   // square SA range size
495 		index_t nsm,                 // if range as <= nsm elts, it's "small"
496 		AlignmentCacheIface<index_t>& ca,     // alignment cache for seed hits
497 		RandomSource& rnd,           // pseudo-random generator
498 		WalkMetrics& wlm,            // group walk left metrics
499 		PerReadMetrics& prm,         // per-read metrics
500 		index_t& nelt_out,           // out: # elements total
501 		bool all);                   // report all hits?
502 
503 	Random1toN               rand_;    // random number generators
504 	EList<Random1toN, 16>    rands_;   // random number generators
505 	EList<Random1toN, 16>    rands2_;  // random number generators
506 	EList<EEHit<index_t>, 16>         eehits_;  // holds end-to-end hits
507 	EList<SATupleAndPos<index_t>, 16> satpos_;  // holds SATuple, SeedPos pairs
508 	EList<SATupleAndPos<index_t>, 16> satpos2_; // holds SATuple, SeedPos pairs
509 	EList<SATuple<index_t>, 16>       satups_;  // holds SATuples to explore elements from
510 	EList<GroupWalk2S<index_t, TSlice, 16> > gws_;   // list of GroupWalks; no particular order
511 	EList<size_t>            mateStreaks_; // mate-find fail streaks
512 	RowSampler<index_t>      rowsamp_;     // row sampler
513 
514 	// Ranges that we've extended through when extending seed hits
515 	EList<ExtendRange> seedExRangeFw_[2];
516 	EList<ExtendRange> seedExRangeRc_[2];
517 
518 	// Data structures encapsulating the diagonals that have already been used
519 	// to seed alignment for mate 1 and mate 2.
520 	EIvalMergeListBinned seenDiags1_;
521 	EIvalMergeListBinned seenDiags2_;
522 
523 	// For weeding out redundant alignments
524 	RedundantAlns  redAnchor_;  // database of cells used for anchor alignments
525 	RedundantAlns  redMate1_;   // database of cells used for mate 1 alignments
526 	RedundantAlns  redMate2_;   // database of cells used for mate 2 alignments
527 
528 	// For holding results for anchor (res_) and opposite (ores_) mates
529 	SwResult       resGap_;    // temp holder for alignment result
530 	SwResult       oresGap_;   // temp holder for alignment result, opp mate
531 	SwResult       resUngap_;  // temp holder for ungapped alignment result
532 	SwResult       oresUngap_; // temp holder for ungap. aln. opp mate
533 	SwResult       resEe_;     // temp holder for ungapped alignment result
534 	SwResult       oresEe_;    // temp holder for ungap. aln. opp mate
535 
536 	Pool           pool_;      // memory pages for salistExact_
537 	TSAList        salistEe_;  // PList for offsets for end-to-end hits
538 	GroupWalkState<index_t> gwstate_;   // some per-thread state shared by all GroupWalks
539 
540 	// For AlnRes::matchesRef:
541 	ASSERT_ONLY(SStringExpandable<char>     raw_refbuf_);
542 	ASSERT_ONLY(SStringExpandable<uint32_t> raw_destU32_);
543 	ASSERT_ONLY(EList<bool>                 raw_matches_);
544 	ASSERT_ONLY(BTDnaString                 tmp_rf_);
545 	ASSERT_ONLY(BTDnaString                 tmp_rdseq_);
546 	ASSERT_ONLY(BTString                    tmp_qseq_);
547 	ASSERT_ONLY(EList<index_t>              tmp_reflens_);
548 	ASSERT_ONLY(EList<index_t>              tmp_refoffs_);
549 };
550 
551 #define TIMER_START() \
552 struct timeval tv_i, tv_f; \
553 struct timezone tz_i, tz_f; \
554 size_t total_usecs; \
555 gettimeofday(&tv_i, &tz_i)
556 
557 #define IF_TIMER_END() \
558 gettimeofday(&tv_f, &tz_f); \
559 total_usecs = \
560 (tv_f.tv_sec - tv_i.tv_sec) * 1000000 + (tv_f.tv_usec - tv_i.tv_usec); \
561 if(total_usecs > 300000)
562 
563 /*
564  * aligner_sw_driver.cpp
565  *
566  * Routines that drive the alignment process given a collection of seed hits.
567  * This is generally done in a few stages: extendSeeds visits the set of
568  * seed-hit BW elements in some order; for each element visited it resolves its
569  * reference offset; once the reference offset is known, bounds for a dynamic
570  * programming subproblem are established; if these bounds are distinct from
571  * the bounds we've already tried, we solve the dynamic programming subproblem
572  * and report the hit; if the AlnSinkWrap indicates that we can stop, we
573  * return, otherwise we continue on to the next BW element.
574  */
575 
576 
577 /**
578  * Given end-to-end alignment results stored in the SeedResults structure, set
579  * up all of our state for resolving and keeping track of reference offsets for
580  * hits.  Order the list of ranges to examine such that all exact end-to-end
581  * alignments are examined before any 1mm end-to-end alignments.
582  *
583  * Note: there might be a lot of hits and a lot of wide ranges to look for
584  * here.  We use 'maxelt'.
585  */
586 template <typename index_t>
eeSaTups(const Read & rd,SeedResults<index_t> & sh,const GFM<index_t> & gfm,const BitPairReference & ref,RandomSource & rnd,WalkMetrics & wlm,SwMetrics & swmSeed,index_t & nelt_out,index_t maxelt,bool all)587 bool SwDriver<index_t>::eeSaTups(
588 								 const Read& rd,              // read
589 								 SeedResults<index_t>& sh,    // seed hits to extend into full alignments
590                                  const GFM<index_t>& gfm,     // BWT
591 								 const BitPairReference& ref, // Reference strings
592 								 RandomSource& rnd,           // pseudo-random generator
593 								 WalkMetrics& wlm,            // group walk left metrics
594 								 SwMetrics& swmSeed,          // metrics for seed extensions
595 								 index_t& nelt_out,           // out: # elements total
596 								 index_t maxelt,              // max elts we'll consider
597 								 bool all)                    // report all hits?
598 {
599     assert_eq(0, nelt_out);
600 	gws_.clear();
601 	rands_.clear();
602 	satpos_.clear();
603 	eehits_.clear();
604 	// First, count up the total number of satpos_, rands_, eehits_, and gws_
605 	// we're going to tuse
606 	index_t nobj = 0;
607 	if(!sh.exactFwEEHit().empty()) nobj++;
608 	if(!sh.exactRcEEHit().empty()) nobj++;
609 	nobj += sh.mm1EEHits().size();
610     nobj = min(nobj, maxelt);
611 	gws_.ensure(nobj);
612 	rands_.ensure(nobj);
613 	satpos_.ensure(nobj);
614 	eehits_.ensure(nobj);
615 	index_t tot = sh.exactFwEEHit().size() + sh.exactRcEEHit().size();
616 	bool succ = false;
617 	bool firstEe = true;
618     bool done = false;
619 	if(tot > 0) {
620 		bool fwFirst = true;
621         // Pick fw / rc to go first in a weighted random fashion
622 #ifdef BOWTIE_64BIT_INDEX
623 		index_t rn64 = rnd.nextU64();
624 		index_t rn = rn64 % (uint64_t)tot;
625 #else
626 		index_t rn32 = rnd.nextU32();
627 		index_t rn = rn32 % (uint32_t)tot;
628 #endif
629 		if(rn >= sh.exactFwEEHit().size()) {
630 			fwFirst = false;
631 		}
632 		for(int fwi = 0; fwi < 2 && !done; fwi++) {
633 			bool fw = ((fwi == 0) == fwFirst);
634 			EEHit<index_t> hit = fw ? sh.exactFwEEHit() : sh.exactRcEEHit();
635 			if(hit.empty()) {
636 				continue;
637 			}
638 			assert(hit.fw == fw);
639 			if(hit.bot > hit.top) {
640                 // Possibly adjust bot and width if we would have exceeded maxelt
641                 index_t tops[2] = { hit.top, 0 };
642                 index_t bots[2] = { hit.bot, 0 };
643                 index_t width = hit.bot - hit.top;
644                 if(nelt_out + width > maxelt) {
645                     index_t trim = (index_t)((nelt_out + width) - maxelt);
646 #ifdef BOWTIE_64BIT_INDEX
647                     index_t rn = rnd.nextU64() % width;
648 #else
649                     index_t rn = rnd.nextU32() % width;
650 #endif
651                     index_t newwidth = width - trim;
652                     if(hit.top + rn + newwidth > hit.bot) {
653                         // Two pieces
654                         tops[0] = hit.top + rn;
655                         bots[0] = hit.bot;
656                         tops[1] = hit.top;
657                         bots[1] = hit.top + newwidth - (bots[0] - tops[0]);
658                     } else {
659                         // One piece
660                         tops[0] = hit.top + rn;
661                         bots[0] = tops[0] + newwidth;
662                     }
663                     assert_leq(bots[0], hit.bot);
664                     assert_leq(bots[1], hit.bot);
665                     assert_geq(bots[0], tops[0]);
666                     assert_geq(bots[1], tops[1]);
667                     assert_eq(newwidth, (bots[0] - tops[0]) + (bots[1] - tops[1]));
668                 }
669                 for(int i = 0; i < 2 && !done; i++) {
670                     if(bots[i] <= tops[i]) break;
671                     index_t width = bots[i] - tops[i];
672                     index_t top = tops[i];
673                     // Clear list where resolved offsets are stored
674                     swmSeed.exranges++;
675                     swmSeed.exrows += width;
676                     if(!succ) {
677                         swmSeed.exsucc++;
678                         succ = true;
679                     }
680                     if(firstEe) {
681                         salistEe_.clear();
682                         pool_.clear();
683                         firstEe = false;
684                     }
685                     // We have to be careful not to allocate excessive amounts of memory here
686                     TSlice o(salistEe_, (index_t)salistEe_.size(), width);
687                     for(index_t i = 0; i < width; i++) {
688                         if(!salistEe_.add(pool_, (index_t)OFF_MASK)) {
689                             swmSeed.exooms++;
690                             return false;
691                         }
692                     }
693                     assert(!done);
694                     eehits_.push_back(hit);
695                     satpos_.expand();
696                     satpos_.back().sat.init(SAKey(), top, (index_t)OFF_MASK, o);
697                     satpos_.back().sat.key.seq = MAX_U64;
698                     satpos_.back().sat.key.len = (index_t)rd.length();
699                     satpos_.back().pos.init(fw, 0, 0, (index_t)rd.length());
700                     satpos_.back().origSz = width;
701                     rands_.expand();
702                     rands_.back().init(width, all);
703                     gws_.expand();
704 					SARangeWithOffs<TSlice, index_t> sa;
705 					sa.topf = satpos_.back().sat.topf;
706 					sa.len = satpos_.back().sat.key.len;
707 					sa.offs = satpos_.back().sat.offs;
708                     gws_.back().init(
709 									 gfm,                // forward Bowtie index
710 									 ref,                // reference sequences
711 									 sa,                 // SATuple
712 									 rnd,                // pseudo-random generator
713 									 wlm);               // metrics
714                     assert(gws_.back().repOk(sa));
715                     nelt_out += width;
716                     if(nelt_out >= maxelt) {
717                         done = true;
718                     }
719                 }
720 			}
721 		}
722 	}
723 	succ = false;
724 	if(!done && !sh.mm1EEHits().empty()) {
725 		sh.sort1mmEe(rnd);
726 		index_t sz = sh.mm1EEHits().size();
727 		for(index_t i = 0; i < sz && !done; i++) {
728 			EEHit<index_t> hit = sh.mm1EEHits()[i];
729 			assert(hit.repOk(rd));
730 			assert(!hit.empty());
731             // Possibly adjust bot and width if we would have exceeded maxelt
732             index_t tops[2] = { hit.top, 0 };
733             index_t bots[2] = { hit.bot, 0 };
734             index_t width = hit.bot - hit.top;
735             if(nelt_out + width > maxelt) {
736                 index_t trim = (index_t)((nelt_out + width) - maxelt);
737 #ifdef BOWTIE_64BIT_INDEX
738                 index_t rn = rnd.nextU64() % width;
739 #else
740                 index_t rn = rnd.nextU32() % width;
741 #endif
742                 index_t newwidth = width - trim;
743                 if(hit.top + rn + newwidth > hit.bot) {
744                     // Two pieces
745                     tops[0] = hit.top + rn;
746                     bots[0] = hit.bot;
747                     tops[1] = hit.top;
748                     bots[1] = hit.top + newwidth - (bots[0] - tops[0]);
749                 } else {
750                     // One piece
751                     tops[0] = hit.top + rn;
752                     bots[0] = tops[0] + newwidth;
753                 }
754                 assert_leq(bots[0], hit.bot);
755                 assert_leq(bots[1], hit.bot);
756                 assert_geq(bots[0], tops[0]);
757                 assert_geq(bots[1], tops[1]);
758                 assert_eq(newwidth, (bots[0] - tops[0]) + (bots[1] - tops[1]));
759             }
760             for(int i = 0; i < 2 && !done; i++) {
761                 if(bots[i] <= tops[i]) break;
762                 index_t width = bots[i] - tops[i];
763                 index_t top = tops[i];
764                 // Clear list where resolved offsets are stored
765                 swmSeed.mm1ranges++;
766                 swmSeed.mm1rows += width;
767                 if(!succ) {
768                     swmSeed.mm1succ++;
769                     succ = true;
770                 }
771                 if(firstEe) {
772                     salistEe_.clear();
773                     pool_.clear();
774                     firstEe = false;
775                 }
776                 TSlice o(salistEe_, (index_t)salistEe_.size(), width);
777                 for(size_t i = 0; i < width; i++) {
778                     if(!salistEe_.add(pool_, (index_t)OFF_MASK)) {
779                         swmSeed.mm1ooms++;
780                         return false;
781                     }
782                 }
783                 eehits_.push_back(hit);
784                 satpos_.expand();
785                 satpos_.back().sat.init(SAKey(), top, (index_t)OFF_MASK, o);
786                 satpos_.back().sat.key.seq = MAX_U64;
787                 satpos_.back().sat.key.len = (index_t)rd.length();
788                 satpos_.back().pos.init(hit.fw, 0, 0, (index_t)rd.length());
789                 satpos_.back().origSz = width;
790                 rands_.expand();
791                 rands_.back().init(width, all);
792                 gws_.expand();
793 				SARangeWithOffs<TSlice, index_t> sa;
794 				sa.topf = satpos_.back().sat.topf;
795 				sa.len = satpos_.back().sat.key.len;
796 				sa.offs = satpos_.back().sat.offs;
797                 gws_.back().init(
798 								 gfm,  // forward Bowtie index
799 								 ref,  // reference sequences
800 								 sa,   // SATuple
801 								 rnd,  // pseudo-random generator
802 								 wlm); // metrics
803                 assert(gws_.back().repOk(sa));
804                 nelt_out += width;
805                 if(nelt_out >= maxelt) {
806                     done = true;
807                 }
808             }
809 		}
810 	}
811 	return true;
812 }
813 
814 /**
815  * Extend a seed hit out on either side.  Requires that we know the seed hit's
816  * offset into the read and orientation.  Also requires that we know top/bot
817  * for the seed hit in both the forward and (if we want to extend to the right)
818  * reverse index.
819  */
820 template <typename index_t>
extend(const Read & rd,const GFM<index_t> & gfmFw,const GFM<index_t> * gfmBw,index_t topf,index_t botf,index_t topb,index_t botb,bool fw,index_t off,index_t len,PerReadMetrics & prm,index_t & nlex,index_t & nrex)821 void SwDriver<index_t>::extend(
822 							   const Read& rd,       // read
823 							   const GFM<index_t>& gfmFw,   // Forward Bowtie index
824 							   const GFM<index_t>* gfmBw,   // Backward Bowtie index
825 							   index_t topf,         // top in fw index
826 							   index_t botf,         // bot in fw index
827 							   index_t topb,         // top in bw index
828 							   index_t botb,         // bot in bw index
829 							   bool fw,              // seed orientation
830 							   index_t off,          // seed offset from 5' end
831 							   index_t len,          // seed length
832 							   PerReadMetrics& prm,  // per-read metrics
833 							   index_t& nlex,        // # positions we can extend to left w/o edit
834 							   index_t& nrex)        // # positions we can extend to right w/o edit
835 {
836 	index_t t[4], b[4];
837 	index_t tp[4], bp[4];
838 	SideLocus<index_t> tloc, bloc;
839 	index_t rdlen = (index_t)rd.length();
840 	index_t lim = fw ? off : rdlen - len - off;
841 	// We're about to add onto the beginning, so reverse it
842 #ifndef NDEBUG
843 	if(false) {
844 		// TODO: This will sometimes fail even when the extension is legitimate
845 		// This is because contains() comes in from one extreme or the other,
846 		// whereas we started from the inside and worked outwards.  This
847 		// affects which Ns are OK and which are not OK.
848 
849 		// Have to do both because whether we can get through an N depends on
850 		// which direction we're coming in
851 		bool fwContains = gfmFw.contains(tmp_rdseq_);
852 		tmp_rdseq_.reverse();
853 		bool bwContains = gfmBw != NULL && gfmBw->contains(tmp_rdseq_);
854 		tmp_rdseq_.reverse();
855 		assert(fwContains || bwContains);
856 	}
857 #endif
858 	ASSERT_ONLY(tmp_rdseq_.reverse());
859 	if(lim > 0) {
860 		const GFM<index_t> *gfm = &gfmFw;
861 		assert(gfm != NULL);
862 		// Extend left using forward index
863 		const BTDnaString& seq = fw ? rd.patFw : rd.patRc;
864 		// See what we get by extending
865 		index_t top = topf, bot = botf;
866 		t[0] = t[1] = t[2] = t[3] = 0;
867 		b[0] = b[1] = b[2] = b[3] = 0;
868 		tp[0] = tp[1] = tp[2] = tp[3] = topb;
869 		bp[0] = bp[1] = bp[2] = bp[3] = botb;
870 		SideLocus<index_t> tloc, bloc;
871 		INIT_LOCS(top, bot, tloc, bloc, *gfm);
872 		for(index_t ii = 0; ii < lim; ii++) {
873 			// Starting to left of seed (<off) and moving left
874 			index_t i = 0;
875 			if(fw) {
876 				i = off - ii - 1;
877 			} else {
878 				i = rdlen - off - len - 1 - ii;
879 			}
880 			// Get char from read
881 			int rdc = seq.get(i);
882 			// See what we get by extending
883 			if(bloc.valid()) {
884 				prm.nSdFmops++;
885 				t[0] = t[1] = t[2] = t[3] =
886 				b[0] = b[1] = b[2] = b[3] = 0;
887 				gfm->mapBiLFEx(tloc, bloc, t, b, tp, bp);
888 				SANITY_CHECK_4TUP(t, b, tp, bp);
889 				int nonz = -1;
890 				bool abort = false;
891 				size_t origSz = bot - top;
892 				for(int j = 0; j < 4; j++) {
893 					if(b[j] > t[j]) {
894 						if(nonz >= 0) {
895 							abort = true;
896 							break;
897 						}
898 						nonz = j;
899 						top = t[j]; bot = b[j];
900 					}
901 				}
902 				assert_leq(bot - top, origSz);
903 				if(abort || (nonz != rdc && rdc <= 3) || bot - top < origSz) {
904 					break;
905 				}
906 			} else {
907 				assert_eq(bot, top+1);
908 				prm.nSdFmops++;
909 				int c = gfm->mapLF1(top, tloc);
910 				if(c != rdc && rdc <= 3) {
911 					break;
912 				}
913 				bot = top + 1;
914 			}
915 			ASSERT_ONLY(tmp_rdseq_.append(rdc));
916 			if(++nlex == 255) {
917 				break;
918 			}
919 			INIT_LOCS(top, bot, tloc, bloc, *gfm);
920 		}
921 	}
922 	// We're about to add onto the end, so re-reverse
923 	ASSERT_ONLY(tmp_rdseq_.reverse());
924 	lim = fw ? rdlen - len - off : off;
925 	if(lim > 0 && gfmBw != NULL) {
926 		const GFM<index_t> *gfm = gfmBw;
927 		assert(gfm != NULL);
928 		// Extend right using backward index
929 		const BTDnaString& seq = fw ? rd.patFw : rd.patRc;
930 		// See what we get by extending
931 		index_t top = topb, bot = botb;
932 		t[0] = t[1] = t[2] = t[3] = 0;
933 		b[0] = b[1] = b[2] = b[3] = 0;
934 		tp[0] = tp[1] = tp[2] = tp[3] = topf;
935 		bp[0] = bp[1] = bp[2] = bp[3] = botf;
936 		INIT_LOCS(top, bot, tloc, bloc, *gfm);
937 		for(index_t ii = 0; ii < lim; ii++) {
938 			// Starting to right of seed (<off) and moving right
939 			index_t i;
940 			if(fw) {
941 				i = ii + len + off;
942 			} else {
943 				i = rdlen - off + ii;
944 			}
945 			// Get char from read
946 			int rdc = seq.get(i);
947 			// See what we get by extending
948 			if(bloc.valid()) {
949 				prm.nSdFmops++;
950 				t[0] = t[1] = t[2] = t[3] =
951 				b[0] = b[1] = b[2] = b[3] = 0;
952 				gfm->mapBiLFEx(tloc, bloc, t, b, tp, bp);
953 				SANITY_CHECK_4TUP(t, b, tp, bp);
954 				int nonz = -1;
955 				bool abort = false;
956 				size_t origSz = bot - top;
957 				for(int j = 0; j < 4; j++) {
958 					if(b[j] > t[j]) {
959 						if(nonz >= 0) {
960 							abort = true;
961 							break;
962 						}
963 						nonz = j;
964 						top = t[j]; bot = b[j];
965 					}
966 				}
967 				assert_leq(bot - top, origSz);
968 				if(abort || (nonz != rdc && rdc <= 3) || bot - top < origSz) {
969 					break;
970 				}
971 			} else {
972 				assert_eq(bot, top+1);
973 				prm.nSdFmops++;
974 				int c = gfm->mapLF1(top, tloc);
975 				if(c != rdc && rdc <= 3) {
976 					break;
977 				}
978 				bot = top + 1;
979 			}
980 			ASSERT_ONLY(tmp_rdseq_.append(rdc));
981 			if(++nrex == 255) {
982 				break;
983 			}
984 			INIT_LOCS(top, bot, tloc, bloc, *gfm);
985 		}
986 	}
987 #ifndef NDEBUG
988 	if(false) {
989 		// TODO: This will sometimes fail even when the extension is legitimate
990 		// This is because contains() comes in from one extreme or the other,
991 		// whereas we started from the inside and worked outwards.  This
992 		// affects which Ns are OK and which are not OK.
993 
994 		// Have to do both because whether we can get through an N depends on
995 		// which direction we're coming in
996 		bool fwContains = gfmFw.contains(tmp_rdseq_);
997 		tmp_rdseq_.reverse();
998 		bool bwContains = gfmBw != NULL && gfmBw->contains(tmp_rdseq_);
999 		tmp_rdseq_.reverse();
1000 		assert(fwContains || bwContains);
1001 	}
1002 #endif
1003 	assert_lt(nlex, rdlen);
1004 	assert_lt(nrex, rdlen);
1005 	return;
1006 }
1007 
1008 /**
1009  * Given seed results, set up all of our state for resolving and keeping
1010  * track of reference offsets for hits.
1011  */
1012 template <typename index_t>
prioritizeSATups(const Read & read,SeedResults<index_t> & sh,const GFM<index_t> & gfmFw,const GFM<index_t> * gfmBw,const BitPairReference & ref,int seedmms,index_t maxelt,bool doExtend,bool lensq,bool szsq,index_t nsm,AlignmentCacheIface<index_t> & ca,RandomSource & rnd,WalkMetrics & wlm,PerReadMetrics & prm,index_t & nelt_out,bool all)1013 void SwDriver<index_t>::prioritizeSATups(
1014 										 const Read& read,            // read
1015 										 SeedResults<index_t>& sh,    // seed hits to extend into full alignments
1016 										 const GFM<index_t>& gfmFw,   // BWT
1017 										 const GFM<index_t>* gfmBw,   // BWT
1018 										 const BitPairReference& ref, // Reference strings
1019 										 int seedmms,                 // # mismatches allowed in seed
1020 										 index_t maxelt,              // max elts we'll consider
1021 										 bool doExtend,               // do extension of seed hits?
1022 										 bool lensq,                  // square length in weight calculation
1023 										 bool szsq,                   // square range size in weight calculation
1024 										 index_t nsm,                 // if range as <= nsm elts, it's "small"
1025 										 AlignmentCacheIface<index_t>& ca,     // alignment cache for seed hits
1026 										 RandomSource& rnd,           // pseudo-random generator
1027 										 WalkMetrics& wlm,            // group walk left metrics
1028 										 PerReadMetrics& prm,         // per-read metrics
1029 										 index_t& nelt_out,           // out: # elements total
1030 										 bool all)                    // report all hits?
1031 {
1032 	const index_t nonz = sh.nonzeroOffsets(); // non-zero positions
1033 	const int matei = (read.mate <= 1 ? 0 : 1);
1034 	satups_.clear();
1035 	gws_.clear();
1036 	rands_.clear();
1037 	rands2_.clear();
1038 	satpos_.clear();
1039 	satpos2_.clear();
1040 	index_t nrange = 0, nelt = 0, nsmall = 0, nsmall_elts = 0;
1041 	bool keepWhole = false;
1042 	EList<SATupleAndPos<index_t>, 16>& satpos = keepWhole ? satpos_ : satpos2_;
1043 	for(index_t i = 0; i < nonz; i++) {
1044 		bool fw = true;
1045 		index_t offidx = 0, rdoff = 0, seedlen = 0;
1046 		QVal<index_t> qv = sh.hitsByRank(i, offidx, rdoff, fw, seedlen);
1047 		assert(qv.valid());
1048 		assert(!qv.empty());
1049 		assert(qv.repOk(ca.current()));
1050 		ca.queryQval(qv, satups_, nrange, nelt);
1051 		for(size_t j = 0; j < satups_.size(); j++) {
1052 			const index_t sz = satups_[j].size();
1053 			// Check whether this hit occurs inside the extended boundaries of
1054 			// another hit we already processed for this read.
1055 			if(seedmms == 0) {
1056 				// See if we're covered by a previous extended seed hit
1057 				EList<ExtendRange>& range =
1058 				fw ? seedExRangeFw_[matei] : seedExRangeRc_[matei];
1059 				bool skip = false;
1060 				for(index_t k = 0; k < range.size(); k++) {
1061 					index_t p5 = range[k].off;
1062 					index_t len = range[k].len;
1063 					if(p5 <= rdoff && p5 + len >= (rdoff + seedlen)) {
1064 						if(sz <= range[k].sz) {
1065 							skip = true;
1066 							break;
1067 						}
1068 					}
1069 				}
1070 				if(skip) {
1071 					assert_gt(nrange, 0);
1072 					nrange--;
1073 					assert_geq(nelt, sz);
1074 					nelt -= sz;
1075 					continue; // Skip this seed
1076 				}
1077 			}
1078 			satpos.expand();
1079 			satpos.back().sat = satups_[j];
1080 			satpos.back().origSz = sz;
1081 			satpos.back().pos.init(fw, offidx, rdoff, seedlen);
1082 			if(sz <= nsm) {
1083 				nsmall++;
1084 				nsmall_elts += sz;
1085 			}
1086 			satpos.back().nlex = satpos.back().nrex = 0;
1087 #ifndef NDEBUG
1088 			tmp_rdseq_.clear();
1089 			uint64_t key = satpos.back().sat.key.seq;
1090 			for(size_t k = 0; k < seedlen; k++) {
1091 				int c = (int)(key & 3);
1092 				tmp_rdseq_.append(c);
1093 				key >>= 2;
1094 			}
1095 			tmp_rdseq_.reverse();
1096 #endif
1097 			index_t nlex = 0, nrex = 0;
1098 			if(doExtend) {
1099 				extend(
1100 					   read,
1101 					   gfmFw,
1102 					   gfmBw,
1103 					   satpos.back().sat.topf,
1104 					   (index_t)(satpos.back().sat.topf + sz),
1105 					   satpos.back().sat.topb,
1106 					   (index_t)(satpos.back().sat.topb + sz),
1107 					   fw,
1108 					   rdoff,
1109 					   seedlen,
1110 					   prm,
1111 					   nlex,
1112 					   nrex);
1113 			}
1114 			satpos.back().nlex = nlex;
1115 			satpos.back().nrex = nrex;
1116 			if(seedmms == 0 && (nlex > 0 || nrex > 0)) {
1117 				assert_geq(rdoff, (fw ? nlex : nrex));
1118 				index_t p5 = rdoff - (fw ? nlex : nrex);
1119 				EList<ExtendRange>& range =
1120 				fw ? seedExRangeFw_[matei] : seedExRangeRc_[matei];
1121 				range.expand();
1122 				range.back().off = p5;
1123 				range.back().len = seedlen + nlex + nrex;
1124 				range.back().sz = sz;
1125 			}
1126 		}
1127 		satups_.clear();
1128 	}
1129 	assert_leq(nsmall, nrange);
1130 	nelt_out = nelt; // return the total number of elements
1131 	assert_eq(nrange, satpos.size());
1132 	satpos.sort();
1133 	if(keepWhole) {
1134 		gws_.ensure(nrange);
1135 		rands_.ensure(nrange);
1136 		for(index_t i = 0; i < nrange; i++) {
1137 			gws_.expand();
1138 			SARangeWithOffs<TSlice, index_t> sa;
1139 			sa.topf = satpos_.back().sat.topf;
1140 			sa.len = satpos_.back().sat.key.len;
1141 			sa.offs = satpos_.back().sat.offs;
1142 			gws_.back().init(
1143 							 gfmFw,  // forward Bowtie index
1144 							 ref,    // reference sequences
1145 							 sa,     // SA tuples: ref hit, salist range
1146 							 rnd,    // pseudo-random generator
1147 							 wlm);   // metrics
1148 			assert(gws_.back().initialized());
1149 			rands_.expand();
1150 			rands_.back().init(satpos_[i].sat.size(), all);
1151 		}
1152 		return;
1153 	}
1154 	// Resize satups_ list so that ranges having elements that we might
1155 	// possibly explore are present
1156 	satpos_.ensure(min(maxelt, nelt));
1157 	gws_.ensure(min(maxelt, nelt));
1158 	rands_.ensure(min(maxelt, nelt));
1159 	rands2_.ensure(min(maxelt, nelt));
1160 	size_t nlarge_elts = nelt - nsmall_elts;
1161 	if(maxelt < nelt) {
1162 		size_t diff = nelt - maxelt;
1163 		if(diff >= nlarge_elts) {
1164 			nlarge_elts = 0;
1165 		} else {
1166 			nlarge_elts -= diff;
1167 		}
1168 	}
1169 	index_t nelt_added = 0;
1170 	// Now we have a collection of ranges in satpos2_.  Now we want to decide
1171 	// how we explore elements from them.  The basic idea is that: for very
1172 	// small guys, where "very small" means that the size of the range is less
1173 	// than or equal to the parameter 'nsz', we explore them in their entirety
1174 	// right away.  For the rest, we want to select in a way that is (a)
1175 	// random, and (b) weighted toward examining elements from the smaller
1176 	// ranges more frequently (and first).
1177 	//
1178 	// 1. do the smalls
1179 	for(index_t j = 0; j < nsmall && nelt_added < maxelt; j++) {
1180 		satpos_.expand();
1181 		satpos_.back() = satpos2_[j];
1182 		gws_.expand();
1183 		SARangeWithOffs<TSlice, index_t> sa;
1184 		sa.topf = satpos_.back().sat.topf;
1185 		sa.len = satpos_.back().sat.key.len;
1186 		sa.offs = satpos_.back().sat.offs;
1187 		gws_.back().init(
1188 						 gfmFw,  // forward Bowtie index
1189 						 ref,    // reference sequences
1190 						 sa,     // SA tuples: ref hit, salist range
1191 						 rnd,    // pseudo-random generator
1192 						 wlm);   // metrics
1193 		assert(gws_.back().initialized());
1194 		rands_.expand();
1195 		rands_.back().init(satpos_.back().sat.size(), all);
1196 		nelt_added += satpos_.back().sat.size();
1197 #ifndef NDEBUG
1198 		for(size_t k = 0; k < satpos_.size()-1; k++) {
1199 			assert(!(satpos_[k] == satpos_.back()));
1200 		}
1201 #endif
1202 	}
1203 	if(nelt_added >= maxelt || nsmall == satpos2_.size()) {
1204 		nelt_out = nelt_added;
1205 		return;
1206 	}
1207 	// 2. do the non-smalls
1208 	// Initialize the row sampler
1209 	rowsamp_.init(satpos2_, nsmall, satpos2_.size(), lensq, szsq);
1210 	// Initialize the random choosers
1211 	rands2_.resize(satpos2_.size());
1212 	for(index_t j = 0; j < satpos2_.size(); j++) {
1213 		rands2_[j].reset();
1214 	}
1215 	while(nelt_added < maxelt && nelt_added < nelt) {
1216 		// Pick a non-small range to sample from
1217 		index_t ri = rowsamp_.next(rnd) + nsmall;
1218 		assert_geq(ri, nsmall);
1219 		assert_lt(ri, satpos2_.size());
1220 		// Initialize random element chooser for that range
1221 		if(!rands2_[ri].inited()) {
1222 			rands2_[ri].init(satpos2_[ri].sat.size(), all);
1223 			assert(!rands2_[ri].done());
1224 		}
1225 		assert(!rands2_[ri].done());
1226 		// Choose an element from the range
1227 		uint32_t r = rands2_[ri].next(rnd);
1228 		if(rands2_[ri].done()) {
1229 			// Tell the row sampler this range is done
1230 			rowsamp_.finishedRange(ri - nsmall);
1231 		}
1232 		// Add the element to the satpos_ list
1233 		SATuple<index_t> sat;
1234 		TSlice o;
1235 		o.init(satpos2_[ri].sat.offs, r, r+1);
1236 		sat.init(satpos2_[ri].sat.key, (index_t)(satpos2_[ri].sat.topf + r), (index_t)OFF_MASK, o);
1237 		satpos_.expand();
1238 		satpos_.back().sat = sat;
1239 		satpos_.back().origSz = satpos2_[ri].origSz;
1240 		satpos_.back().pos = satpos2_[ri].pos;
1241 		// Initialize GroupWalk object
1242 		gws_.expand();
1243 		SARangeWithOffs<TSlice, index_t> sa;
1244 		sa.topf = sat.topf;
1245 		sa.len = sat.key.len;
1246 		sa.offs = sat.offs;
1247 		gws_.back().init(
1248 						 gfmFw,  // forward Bowtie index
1249 						 ref,    // reference sequences
1250 						 sa,     // SA tuples: ref hit, salist range
1251 						 rnd,    // pseudo-random generator
1252 						 wlm);   // metrics
1253 		assert(gws_.back().initialized());
1254 		// Initialize random selector
1255 		rands_.expand();
1256 		rands_.back().init(1, all);
1257 		nelt_added++;
1258 	}
1259 	nelt_out = nelt_added;
1260 	return;
1261 }
1262 
1263 enum {
1264 	FOUND_NONE = 0,
1265 	FOUND_EE,
1266 	FOUND_UNGAPPED,
1267 };
1268 
1269 /**
1270  * Given a collection of SeedHits for a single read, extend seed alignments
1271  * into full alignments.  Where possible, try to avoid redundant offset lookups
1272  * and dynamic programming wherever possible.  Optionally report alignments to
1273  * a AlnSinkWrap object as they are discovered.
1274  *
1275  * If 'reportImmediately' is true, returns true iff a call to msink->report()
1276  * returned true (indicating that the reporting policy is satisfied and we can
1277  * stop).  Otherwise, returns false.
1278  */
1279 template <typename index_t>
extendSeeds(Read & rd,bool mate1,SeedResults<index_t> & sh,const GFM<index_t> & gfmFw,const GFM<index_t> * gfmBw,const BitPairReference & ref,SwAligner & swa,const Scoring & sc,int seedmms,int seedlen,int seedival,TAlScore & minsc,int nceil,size_t maxhalf,bool doUngapped,size_t maxIters,size_t maxUg,size_t maxDp,size_t maxUgStreak,size_t maxDpStreak,bool doExtend,bool enable8,size_t cminlen,size_t cpow2,bool doTri,int tighten,AlignmentCacheIface<index_t> & ca,RandomSource & rnd,WalkMetrics & wlm,SwMetrics & swmSeed,PerReadMetrics & prm,AlnSinkWrap<index_t> * msink,bool reportImmediately,bool & exhaustive)1280 int SwDriver<index_t>::extendSeeds(
1281 								   Read& rd,                    // read to align
1282 								   bool mate1,                  // true iff rd is mate #1
1283 								   SeedResults<index_t>& sh,    // seed hits to extend into full alignments
1284 								   const GFM<index_t>& gfmFw,   // BWT
1285 								   const GFM<index_t>* gfmBw,   // BWT'
1286 								   const BitPairReference& ref, // Reference strings
1287 								   SwAligner& swa,              // dynamic programming aligner
1288 								   const Scoring& sc,           // scoring scheme
1289 								   int seedmms,                 // # mismatches allowed in seed
1290 								   int seedlen,                 // length of seed
1291 								   int seedival,                // interval between seeds
1292 								   TAlScore& minsc,             // minimum score for anchor
1293 								   int nceil,                   // maximum # Ns permitted in reference portion
1294 								   size_t maxhalf,  	         // max width in either direction for DP tables
1295 								   bool doUngapped,             // do ungapped alignment
1296 								   size_t maxIters,             // stop after this many seed-extend loop iters
1297 								   size_t maxUg,                // stop after this many ungaps
1298 								   size_t maxDp,                // stop after this many dps
1299 								   size_t maxUgStreak,          // stop after streak of this many ungap fails
1300 								   size_t maxDpStreak,          // stop after streak of this many dp fails
1301 								   bool doExtend,               // do seed extension
1302 								   bool enable8,                // use 8-bit SSE where possible
1303 								   size_t cminlen,              // use checkpointer if read longer than this
1304 								   size_t cpow2,                // interval between diagonals to checkpoint
1305 								   bool doTri,                  // triangular mini-fills?
1306 								   int tighten,                 // -M score tightening mode
1307 								   AlignmentCacheIface<index_t>& ca,     // alignment cache for seed hits
1308 								   RandomSource& rnd,           // pseudo-random source
1309 								   WalkMetrics& wlm,            // group walk left metrics
1310 								   SwMetrics& swmSeed,          // DP metrics for seed-extend
1311 								   PerReadMetrics& prm,         // per-read metrics
1312 								   AlnSinkWrap<index_t>* msink, // AlnSink wrapper for multiseed-style aligner
1313 								   bool reportImmediately,      // whether to report hits immediately to msink
1314 								   bool& exhaustive)            // set to true iff we searched all seeds exhaustively
1315 {
1316 	bool all = msink->allHits();
1317 	// typedef std::pair<index_t, index_t> UPair;
1318 
1319 	assert(!reportImmediately || msink != NULL);
1320 	assert(!reportImmediately || !msink->maxed());
1321 
1322 	assert_geq(nceil, 0);
1323 	assert_leq((size_t)nceil, rd.length());
1324 
1325 	// Calculate the largest possible number of read and reference gaps
1326 	const index_t rdlen = (index_t)rd.length();
1327 	TAlScore perfectScore = sc.perfectScore(rdlen);
1328 
1329 	DynProgFramer dpframe(!gReportOverhangs);
1330 	swa.reset();
1331 
1332 	// Initialize a set of GroupWalks, one for each seed.  Also, intialize the
1333 	// accompanying lists of reference seed hits (satups*)
1334 	const index_t nsm = 5;
1335 	const index_t nonz = sh.nonzeroOffsets(); // non-zero positions
1336 	index_t eeHits = sh.numE2eHits();
1337 	bool eeMode = eeHits > 0;
1338 	bool firstEe = true;
1339 	bool firstExtend = true;
1340 
1341 	// Reset all the counters related to streaks
1342 	prm.nEeFail = 0;
1343 	prm.nUgFail = 0;
1344 	prm.nDpFail = 0;
1345 
1346 	index_t nelt = 0, neltLeft = 0;
1347 	index_t rows = rdlen;
1348 	index_t eltsDone = 0;
1349 	// cerr << "===" << endl;
1350 	while(true) {
1351 		if(eeMode) {
1352 			if(firstEe) {
1353 				firstEe = false;
1354 				eeMode = eeSaTups(
1355 								  rd,           // read
1356 								  sh,           // seed hits to extend into full alignments
1357 								  gfmFw,        // BWT
1358 								  ref,          // Reference strings
1359 								  rnd,          // pseudo-random generator
1360 								  wlm,          // group walk left metrics
1361 								  swmSeed,      // seed-extend metrics
1362 								  nelt,         // out: # elements total
1363 								  maxIters,     // max # to report
1364 								  all);         // report all hits?
1365 				assert_eq(gws_.size(), rands_.size());
1366 				assert_eq(gws_.size(), satpos_.size());
1367 			} else {
1368 				eeMode = false;
1369 			}
1370 		}
1371 		if(!eeMode) {
1372 			if(nonz == 0) {
1373 				return EXTEND_EXHAUSTED_CANDIDATES; // No seed hits!  Bail.
1374 			}
1375 			if(minsc == perfectScore) {
1376 				return EXTEND_PERFECT_SCORE; // Already found all perfect hits!
1377 			}
1378 			if(firstExtend) {
1379 				nelt = 0;
1380 				prioritizeSATups(
1381 								 rd,            // read
1382 								 sh,            // seed hits to extend into full alignments
1383 								 gfmFw,         // BWT
1384 								 gfmBw,         // BWT'
1385 								 ref,           // Reference strings
1386 								 seedmms,       // # seed mismatches allowed
1387 								 maxIters,      // max rows to consider per position
1388 								 doExtend,      // extend out seeds
1389 								 true,          // square extended length
1390 								 true,          // square SA range size
1391 								 nsm,           // smallness threshold
1392 								 ca,            // alignment cache for seed hits
1393 								 rnd,           // pseudo-random generator
1394 								 wlm,           // group walk left metrics
1395 								 prm,           // per-read metrics
1396 								 nelt,          // out: # elements total
1397 								 all);          // report all hits?
1398 				assert_eq(gws_.size(), rands_.size());
1399 				assert_eq(gws_.size(), satpos_.size());
1400 				neltLeft = nelt;
1401 				firstExtend = false;
1402 			}
1403 			if(neltLeft == 0) {
1404 				// Finished examining gapped candidates
1405 				break;
1406 			}
1407 		}
1408 		for(size_t i = 0; i < gws_.size(); i++) {
1409 			if(eeMode && eehits_[i].score < minsc) {
1410 				return EXTEND_PERFECT_SCORE;
1411 			}
1412 			bool is_small      = satpos_[i].sat.size() < nsm;
1413 			bool fw            = satpos_[i].pos.fw;
1414 			index_t rdoff      = satpos_[i].pos.rdoff;
1415 			index_t seedhitlen = satpos_[i].pos.seedlen;
1416 			if(!fw) {
1417 				// 'rdoff' and 'offidx' are with respect to the 5' end of
1418 				// the read.  Here we convert rdoff to be with respect to
1419 				// the upstream (3') end of ther read.
1420 				rdoff = (index_t)(rdlen - rdoff - seedhitlen);
1421 			}
1422 			bool first = true;
1423 			// If the range is small, investigate all elements now.  If the
1424 			// range is large, just investigate one and move on - we might come
1425 			// back to this range later.
1426 			index_t riter = 0;
1427 			while(!rands_[i].done() && (first || is_small || eeMode)) {
1428 				assert(!gws_[i].done());
1429 				riter++;
1430 				if(minsc == perfectScore) {
1431 					if(!eeMode || eehits_[i].score < perfectScore) {
1432 						return EXTEND_PERFECT_SCORE;
1433 					}
1434 				} else if(eeMode && eehits_[i].score < minsc) {
1435 					break;
1436 				}
1437 				if(prm.nExDps >= maxDp || prm.nMateDps >= maxDp) {
1438 					return EXTEND_EXCEEDED_HARD_LIMIT;
1439 				}
1440 				if(prm.nExUgs >= maxUg || prm.nMateUgs >= maxUg) {
1441 					return EXTEND_EXCEEDED_HARD_LIMIT;
1442 				}
1443 				if(prm.nExIters >= maxIters) {
1444 					return EXTEND_EXCEEDED_HARD_LIMIT;
1445 				}
1446 				prm.nExIters++;
1447 				first = false;
1448 				// Resolve next element offset
1449 				WalkResult<index_t> wr;
1450 				uint32_t elt = rands_[i].next(rnd);
1451 				//cerr << "elt=" << elt << endl;
1452 				SARangeWithOffs<TSlice, index_t> sa;
1453 				sa.topf = satpos_[i].sat.topf;
1454 				sa.len = satpos_[i].sat.key.len;
1455 				sa.offs = satpos_[i].sat.offs;
1456 				gws_[i].advanceElement((index_t)elt, gfmFw, ref, sa, gwstate_, wr, wlm, prm);
1457 				eltsDone++;
1458 				if(!eeMode) {
1459 					assert_gt(neltLeft, 0);
1460 					neltLeft--;
1461 				}
1462 				assert_neq((index_t)OFF_MASK, wr.toff);
1463 				index_t tidx = 0, toff = 0, tlen = 0;
1464 				bool straddled = false;
1465 				gfmFw.joinedToTextOff(
1466                                       wr.elt.len,
1467                                       wr.toff,
1468                                       tidx,
1469                                       toff,
1470                                       tlen,
1471                                       eeMode,     // reject straddlers?
1472                                       straddled); // did it straddle?
1473 				if(tidx == (index_t)OFF_MASK) {
1474 					// The seed hit straddled a reference boundary so the seed hit
1475 					// isn't valid
1476 					continue;
1477 				}
1478 #ifndef NDEBUG
1479 				if(!eeMode && !straddled) { // Check that seed hit matches reference
1480 					uint64_t key = satpos_[i].sat.key.seq;
1481 					for(index_t k = 0; k < wr.elt.len; k++) {
1482 						int c = ref.getBase(tidx, toff + wr.elt.len - k - 1);
1483 						assert_leq(c, 3);
1484 						int ck = (int)(key & 3);
1485 						key >>= 2;
1486 						assert_eq(c, ck);
1487 					}
1488 				}
1489 #endif
1490 				// Find offset of alignment's upstream base assuming net gaps=0
1491 				// between beginning of read and beginning of seed hit
1492 				int64_t refoff = (int64_t)toff - rdoff;
1493 				// Coordinate of the seed hit w/r/t the pasted reference string
1494 				Coord refcoord(tidx, refoff, fw);
1495 				if(seenDiags1_.locusPresent(refcoord)) {
1496 					// Already handled alignments seeded on this diagonal
1497 					prm.nRedundants++;
1498 					swmSeed.rshit++;
1499 					continue;
1500 				}
1501 				// Now that we have a seed hit, there are many issues to solve
1502 				// before we have a completely framed dynamic programming problem.
1503 				// They include:
1504 				//
1505 				// 1. Setting reference offsets on either side of the seed hit,
1506 				//    accounting for where the seed occurs in the read
1507 				// 2. Adjusting the width of the banded dynamic programming problem
1508 				//    and adjusting reference bounds to allow for gaps in the
1509 				//    alignment
1510 				// 3. Accounting for the edges of the reference, which can impact
1511 				//    the width of the DP problem and reference bounds.
1512 				// 4. Perhaps filtering the problem down to a smaller problem based
1513 				//    on what DPs we've already solved for this read
1514 				//
1515 				// We do #1 here, since it is simple and we have all the seed-hit
1516 				// information here.  #2 and #3 are handled in the DynProgFramer.
1517 				int readGaps = 0, refGaps = 0;
1518 				bool ungapped = false;
1519 				if(!eeMode) {
1520 					readGaps = sc.maxReadGaps(minsc, rdlen);
1521 					refGaps  = sc.maxRefGaps(minsc, rdlen);
1522 					ungapped = (readGaps == 0 && refGaps == 0);
1523 				}
1524 				int state = FOUND_NONE;
1525 				bool found = false;
1526 				if(eeMode) {
1527 					resEe_.reset();
1528 					resEe_.alres.reset();
1529 					const EEHit<index_t>& h = eehits_[i];
1530 					assert_leq(h.score, perfectScore);
1531 					resEe_.alres.setScore(AlnScore(h.score, h.ns(), 0));
1532 					resEe_.alres.setShape(
1533 										  refcoord.ref(),  // ref id
1534 										  refcoord.off(),  // 0-based ref offset
1535 										  tlen,            // length of reference
1536 										  fw,              // aligned to Watson?
1537 										  rdlen,           // read length
1538 										  true,            // pretrim soft?
1539 										  0,               // pretrim 5' end
1540 										  0,               // pretrim 3' end
1541 										  true,            // alignment trim soft?
1542 										  0,               // alignment trim 5' end
1543 										  0);              // alignment trim 3' end
1544 					resEe_.alres.setRefNs(h.refns());
1545 					if(h.mms() > 0) {
1546 						assert_eq(1, h.mms());
1547 						assert_lt(h.e1.pos, rd.length());
1548 						resEe_.alres.ned().push_back(h.e1);
1549 					}
1550 					assert(resEe_.repOk(rd));
1551 					state = FOUND_EE;
1552 					found = true;
1553 					Interval refival(refcoord, 1);
1554 					seenDiags1_.add(refival);
1555 				} else if(doUngapped && ungapped) {
1556 					resUngap_.reset();
1557 					int al = swa.ungappedAlign(
1558 											   fw ? rd.patFw : rd.patRc,
1559 											   fw ? rd.qual  : rd.qualRev,
1560 											   refcoord,
1561 											   ref,
1562 											   tlen,
1563 											   sc,
1564 											   gReportOverhangs,
1565 											   minsc,
1566 											   resUngap_);
1567 					Interval refival(refcoord, 1);
1568 					seenDiags1_.add(refival);
1569 					prm.nExUgs++;
1570 					if(al == 0) {
1571 						prm.nExUgFails++;
1572 						prm.nUgFail++;
1573 						if(prm.nUgFail >= maxUgStreak) {
1574 							return EXTEND_EXCEEDED_SOFT_LIMIT;
1575 						}
1576 						swmSeed.ungapfail++;
1577 						continue;
1578 					} else if(al == -1) {
1579 						prm.nExUgFails++;
1580 						prm.nUgFail++; // count this as failure
1581 						if(prm.nUgFail >= maxUgStreak) {
1582 							return EXTEND_EXCEEDED_SOFT_LIMIT;
1583 						}
1584 						swmSeed.ungapnodec++;
1585 					} else {
1586 						prm.nExUgSuccs++;
1587 						prm.nUgLastSucc = prm.nExUgs-1;
1588 						if(prm.nUgFail > prm.nUgFailStreak) {
1589 							prm.nUgFailStreak = prm.nUgFail;
1590 						}
1591 						prm.nUgFail = 0;
1592 						found = true;
1593 						state = FOUND_UNGAPPED;
1594 						swmSeed.ungapsucc++;
1595 					}
1596 				}
1597 				int64_t pastedRefoff = (int64_t)wr.toff - rdoff;
1598 				DPRect rect;
1599 				if(state == FOUND_NONE) {
1600 					found = dpframe.frameSeedExtensionRect(
1601 														   refoff,   // ref offset implied by seed hit assuming no gaps
1602 														   rows,     // length of read sequence used in DP table
1603 														   tlen,     // length of reference
1604 														   readGaps, // max # of read gaps permitted in opp mate alignment
1605 														   refGaps,  // max # of ref gaps permitted in opp mate alignment
1606 														   (size_t)nceil, // # Ns permitted
1607 														   maxhalf,  // max width in either direction
1608 														   rect);    // DP rectangle
1609 					assert(rect.repOk());
1610 					// Add the seed diagonal at least
1611 					seenDiags1_.add(Interval(refcoord, 1));
1612 					if(!found) {
1613 						continue;
1614 					}
1615 				}
1616 				int64_t leftShift = refoff - rect.refl;
1617 				size_t nwindow = 0;
1618 				if(toff >= rect.refl) {
1619 					nwindow = (size_t)(toff - rect.refl);
1620 				}
1621 				// NOTE: We might be taking off more than we should because the
1622 				// pasted string omits non-A/C/G/T characters, but we included them
1623 				// when calculating leftShift.  We'll account for this later.
1624 				pastedRefoff -= leftShift;
1625 				size_t nsInLeftShift = 0;
1626 				if(state == FOUND_NONE) {
1627 					if(!swa.initedRead()) {
1628 						// Initialize the aligner with a new read
1629 						swa.initRead(
1630 									 rd.patFw,  // fw version of query
1631 									 rd.patRc,  // rc version of query
1632 									 rd.qual,   // fw version of qualities
1633 									 rd.qualRev,// rc version of qualities
1634 									 0,         // off of first char in 'rd' to consider
1635 									 rdlen,     // off of last char (excl) in 'rd' to consider
1636 									 sc);       // scoring scheme
1637 					}
1638 					swa.initRef(
1639 								fw,        // whether to align forward or revcomp read
1640 								tidx,      // reference aligned against
1641 								rect,      // DP rectangle
1642 								ref,       // Reference strings
1643 								tlen,      // length of reference sequence
1644 								sc,        // scoring scheme
1645 								minsc,     // minimum score permitted
1646 								enable8,   // use 8-bit SSE if possible?
1647 								cminlen,   // minimum length for using checkpointing scheme
1648 								cpow2,     // interval b/t checkpointed diags; 1 << this
1649 								doTri,     // triangular mini-fills?
1650 								true,      // this is a seed extension - not finding a mate
1651 								nwindow,
1652 								nsInLeftShift);
1653 					// Because of how we framed the problem, we can say that we've
1654 					// exhaustively scored the seed diagonal as well as maxgaps
1655 					// diagonals on either side
1656 					Interval refival(tidx, 0, fw, 0);
1657 					rect.initIval(refival);
1658 					seenDiags1_.add(refival);
1659 					// Now fill the dynamic programming matrix and return true iff
1660 					// there is at least one valid alignment
1661 					TAlScore bestCell = std::numeric_limits<TAlScore>::min();
1662 					found = swa.align(rnd, bestCell);
1663 					swmSeed.tallyGappedDp(readGaps, refGaps);
1664 					prm.nExDps++;
1665 					if(!found) {
1666 						prm.nExDpFails++;
1667 						prm.nDpFail++;
1668 						if(prm.nDpFail >= maxDpStreak) {
1669 							return EXTEND_EXCEEDED_SOFT_LIMIT;
1670 						}
1671 						if(bestCell > std::numeric_limits<TAlScore>::min() && bestCell > prm.bestLtMinscMate1) {
1672 							prm.bestLtMinscMate1 = bestCell;
1673 						}
1674 						continue; // Look for more anchor alignments
1675 					} else {
1676 						prm.nExDpSuccs++;
1677 						prm.nDpLastSucc = prm.nExDps-1;
1678 						if(prm.nDpFail > prm.nDpFailStreak) {
1679 							prm.nDpFailStreak = prm.nDpFail;
1680 						}
1681 						prm.nDpFail = 0;
1682 					}
1683 				}
1684 				bool firstInner = true;
1685 				while(true) {
1686 					assert(found);
1687 					SwResult *res = NULL;
1688 					if(state == FOUND_EE) {
1689 						if(!firstInner) {
1690 							break;
1691 						}
1692 						res = &resEe_;
1693 					} else if(state == FOUND_UNGAPPED) {
1694 						if(!firstInner) {
1695 							break;
1696 						}
1697 						res = &resUngap_;
1698 					} else {
1699 						resGap_.reset();
1700 						assert(resGap_.empty());
1701 						if(swa.done()) {
1702 							break;
1703 						}
1704 						swa.nextAlignment(resGap_, minsc, rnd);
1705 						found = !resGap_.empty();
1706 						if(!found) {
1707 							break;
1708 						}
1709 						res = &resGap_;
1710 					}
1711 					assert(res != NULL);
1712 					firstInner = false;
1713 					assert(res->alres.matchesRef(
1714 												 rd,
1715 												 ref,
1716 												 tmp_rf_,
1717 												 tmp_rdseq_,
1718 												 tmp_qseq_,
1719 												 raw_refbuf_,
1720 												 raw_destU32_,
1721 												 raw_matches_,
1722 												 tmp_reflens_,
1723 												 tmp_refoffs_));
1724 					Interval refival(tidx, 0, fw, tlen);
1725 					assert_gt(res->alres.refExtent(), 0);
1726 					if(gReportOverhangs &&
1727 					   !refival.containsIgnoreOrient(res->alres.refival()))
1728 					{
1729 						res->alres.clipOutside(true, 0, tlen);
1730 						if(res->alres.refExtent() == 0) {
1731 							continue;
1732 						}
1733 					}
1734 					assert(gReportOverhangs ||
1735 					       refival.containsIgnoreOrient(res->alres.refival()));
1736 					// Did the alignment fall entirely outside the reference?
1737 					if(!refival.overlapsIgnoreOrient(res->alres.refival())) {
1738 						continue;
1739 					}
1740 					// Is this alignment redundant with one we've seen previously?
1741 					if(redAnchor_.overlap(res->alres)) {
1742 						// Redundant with an alignment we found already
1743 						continue;
1744 					}
1745 					redAnchor_.add(res->alres);
1746 					// Annotate the AlnRes object with some key parameters
1747 					// that were used to obtain the alignment.
1748 					res->alres.setParams(
1749 										 seedmms,   // # mismatches allowed in seed
1750 										 seedlen,   // length of seed
1751 										 seedival,  // interval between seeds
1752 										 minsc);    // minimum score for valid alignment
1753 
1754 					if(reportImmediately) {
1755 						assert(msink != NULL);
1756 						assert(res->repOk());
1757 						// Check that alignment accurately reflects the
1758 						// reference characters aligned to
1759 						assert(res->alres.matchesRef(
1760 													 rd,
1761 													 ref,
1762 													 tmp_rf_,
1763 													 tmp_rdseq_,
1764 													 tmp_qseq_,
1765 													 raw_refbuf_,
1766 													 raw_destU32_,
1767 													 raw_matches_,
1768 													 tmp_reflens_,
1769 													 tmp_refoffs_));
1770 						// Report an unpaired alignment
1771 						assert(!msink->maxed());
1772 						if(msink->report(
1773 										 0,
1774 										 mate1 ? &res->alres : NULL,
1775 										 mate1 ? NULL : &res->alres))
1776 						{
1777 							// Short-circuited because a limit, e.g. -k, -m or
1778 							// -M, was exceeded
1779 							return EXTEND_POLICY_FULFILLED;
1780 						}
1781 						if(tighten > 0 &&
1782 						   msink->Mmode() &&
1783 						   msink->hasSecondBestUnp1())
1784 						{
1785 							if(tighten == 1) {
1786 								if(msink->bestUnp1() >= minsc) {
1787 									minsc = msink->bestUnp1();
1788 									if(minsc < perfectScore &&
1789 									   msink->bestUnp1() == msink->secondBestUnp1())
1790 									{
1791 										minsc++;
1792 									}
1793 								}
1794 							} else if(tighten == 2) {
1795 								if(msink->secondBestUnp1() >= minsc) {
1796 									minsc = msink->secondBestUnp1();
1797 									if(minsc < perfectScore) {
1798 										minsc++;
1799 									}
1800 								}
1801 							} else {
1802 								TAlScore diff = msink->bestUnp1() - msink->secondBestUnp1();
1803 								TAlScore bot = msink->secondBestUnp1() + ((diff*3)/4);
1804 								if(bot >= minsc) {
1805 									minsc = bot;
1806 									if(minsc < perfectScore) {
1807 										minsc++;
1808 									}
1809 								}
1810 							}
1811 							assert_leq(minsc, perfectScore);
1812 						}
1813 					}
1814 				}
1815 
1816 				// At this point we know that we aren't bailing, and will
1817 				// continue to resolve seed hits.
1818 
1819 			} // while(!gws_[i].done())
1820 		}
1821 	}
1822 	// Short-circuited because a limit, e.g. -k, -m or -M, was exceeded
1823 	return EXTEND_EXHAUSTED_CANDIDATES;
1824 }
1825 
1826 /**
1827  * Given a collection of SeedHits for both mates in a read pair, extend seed
1828  * alignments into full alignments and then look for the opposite mate using
1829  * dynamic programming.  Where possible, try to avoid redundant offset lookups.
1830  * Optionally report alignments to a AlnSinkWrap object as they are discovered.
1831  *
1832  * If 'reportImmediately' is true, returns true iff a call to
1833  * msink->report() returned true (indicating that the reporting
1834  * policy is satisfied and we can stop).  Otherwise, returns false.
1835  *
1836  * REDUNDANT SEED HITS
1837  *
1838  * See notes at top of aligner_sw_driver.h.
1839  *
1840  * REDUNDANT ALIGNMENTS
1841  *
1842  * See notes at top of aligner_sw_driver.h.
1843  *
1844  * MIXING PAIRED AND UNPAIRED ALIGNMENTS
1845  *
1846  * There are distinct paired-end alignment modes for the cases where (a) the
1847  * user does or does not want to see unpaired alignments for individual mates
1848  * when there are no reportable paired-end alignments involving both mates, and
1849  * (b) the user does or does not want to see discordant paired-end alignments.
1850  * The modes have implications for this function and for the AlnSinkWrap, since
1851  * it affects when we're "done."  Also, whether the user has asked us to report
1852  * discordant alignments affects whether and how much searching for unpaired
1853  * alignments we must do (i.e. if there are no paired-end alignments, we must
1854  * at least do -m 1 for both mates).
1855  *
1856  * Mode 1: Just concordant paired-end.  Print only concordant paired-end
1857  * alignments.  As soon as any limits (-k/-m/-M) are reached, stop.
1858  *
1859  * Mode 2: Concordant and discordant paired-end.  If -k/-m/-M limits are
1860  * reached for paired-end alignments, stop.  Otherwise, if no paired-end
1861  * alignments are found, align both mates in an unpaired -m 1 fashion.  If
1862  * there is exactly one unpaired alignment for each mate, report the
1863  * combination as a discordant alignment.
1864  *
1865  * Mode 3: Concordant paired-end if possible, otherwise unpaired.  If -k/-M
1866  * limit is reached for paired-end alignmnts, stop.  If -m limit is reached for
1867  * paired-end alignments or no paired-end alignments are found, align both
1868  * mates in an unpaired fashion.  All the same settings governing validity and
1869  * reportability in paired-end mode apply here too (-k/-m/-M/etc).
1870  *
1871  * Mode 4: Concordant or discordant paired-end if possible, otherwise unpaired.
1872  * If -k/-M limit is reached for paired-end alignmnts, stop.  If -m limit is
1873  * reached for paired-end alignments or no paired-end alignments are found,
1874  * align both mates in an unpaired fashion.  If the -m limit was reached, there
1875  * is no need to search for a discordant alignment, and unapired alignment can
1876  * proceed as in Mode 3.  If no paired-end alignments were found, then unpaired
1877  * alignment proceeds as in Mode 3 but with this caveat: alignment must be at
1878  * least as thorough as dictated by -m 1 up until the point where
1879  *
1880  * Print paired-end alignments when there are reportable paired-end
1881  * alignments, otherwise report reportable unpaired alignments.  If -k limit is
1882  * reached for paired-end alignments, stop.  If -m/-M limit is reached for
1883  * paired-end alignments, stop searching for paired-end alignments and look
1884  * only for unpaired alignments.  If searching only for unpaired alignments,
1885  * respect -k/-m/-M limits separately for both mates.
1886  *
1887  * The return value from the AlnSinkWrap's report member function must be
1888  * specific enough to distinguish between:
1889  *
1890  * 1. Stop searching for paired-end alignments
1891  * 2. Stop searching for alignments for unpaired alignments for mate #1
1892  * 3. Stop searching for alignments for unpaired alignments for mate #2
1893  * 4. Stop searching for any alignments
1894  *
1895  * Note that in Mode 2, options affecting validity and reportability of
1896  * alignments apply .  E.g. if -m 1 is specified
1897  *
1898  * WORKFLOW
1899  *
1900  * Our general approach to finding paired and unpaired alignments here
1901  * is as follows:
1902  *
1903  * - For mate in mate1, mate2:
1904  *   - For each seed hit in mate:
1905  *     - Try to extend it into a full alignment; if we can't, continue
1906  *       to the next seed hit
1907  *     - Look for alignment for opposite mate; if we can't find one,
1908  *     -
1909  *     -
1910  *
1911  */
1912 template <typename index_t>
extendSeedsPaired(Read & rd,Read & ord,bool anchor1,bool oppFilt,SeedResults<index_t> & sh,const GFM<index_t> & gfmFw,const GFM<index_t> * gfmBw,const BitPairReference & ref,SwAligner & swa,SwAligner & oswa,const Scoring & sc,const PairedEndPolicy & pepol,int seedmms,int seedlen,int seedival,TAlScore & minsc,TAlScore & ominsc,int nceil,int onceil,bool nofw,bool norc,size_t maxhalf,bool doUngapped,size_t maxIters,size_t maxUg,size_t maxDp,size_t maxEeStreak,size_t maxUgStreak,size_t maxDpStreak,size_t maxMateStreak,bool doExtend,bool enable8,size_t cminlen,size_t cpow2,bool doTri,int tighten,AlignmentCacheIface<index_t> & ca,RandomSource & rnd,WalkMetrics & wlm,SwMetrics & swmSeed,SwMetrics & swmMate,PerReadMetrics & prm,AlnSinkWrap<index_t> * msink,bool swMateImmediately,bool reportImmediately,bool discord,bool mixed,bool & exhaustive)1913 int SwDriver<index_t>::extendSeedsPaired(
1914 										 Read& rd,                    // mate to align as anchor
1915 										 Read& ord,                   // mate to align as opposite
1916 										 bool anchor1,                // true iff anchor mate is mate1
1917 										 bool oppFilt,                // true iff opposite mate was filtered out
1918 										 SeedResults<index_t>& sh,    // seed hits for anchor
1919 										 const GFM<index_t>& gfmFw,   // BWT
1920 										 const GFM<index_t>* gfmBw,   // BWT'
1921 										 const BitPairReference& ref, // Reference strings
1922 										 SwAligner& swa,              // dynamic programming aligner for anchor
1923 										 SwAligner& oswa,             // dynamic programming aligner for opposite
1924 										 const Scoring& sc,           // scoring scheme
1925 										 const PairedEndPolicy& pepol,// paired-end policy
1926 										 int seedmms,                 // # mismatches allowed in seed
1927 										 int seedlen,                 // length of seed
1928 										 int seedival,                // interval between seeds
1929 										 TAlScore& minsc,             // minimum score for valid anchor aln
1930 										 TAlScore& ominsc,            // minimum score for valid opposite aln
1931 										 int nceil,                   // max # Ns permitted in ref for anchor
1932 										 int onceil,                  // max # Ns permitted in ref for opposite
1933 										 bool nofw,                   // don't align forward read
1934 										 bool norc,                   // don't align revcomp read
1935 										 size_t maxhalf,              // max width in either direction for DP tables
1936 										 bool doUngapped,             // do ungapped alignment
1937 										 size_t maxIters,             // stop after this many seed-extend loop iters
1938 										 size_t maxUg,                // stop after this many ungaps
1939 										 size_t maxDp,                // stop after this many dps
1940 										 size_t maxEeStreak,          // stop after streak of this many end-to-end fails
1941 										 size_t maxUgStreak,          // stop after streak of this many ungap fails
1942 										 size_t maxDpStreak,          // stop after streak of this many dp fails
1943 										 size_t maxMateStreak,        // stop seed range after N mate-find fails
1944 										 bool doExtend,               // do seed extension
1945 										 bool enable8,                // use 8-bit SSE where possible
1946 										 size_t cminlen,              // use checkpointer if read longer than this
1947 										 size_t cpow2,                // interval between diagonals to checkpoint
1948 										 bool doTri,                  // triangular mini-fills?
1949 										 int tighten,                 // -M score tightening mode
1950 										 AlignmentCacheIface<index_t>& ca,     // alignment cache for seed hits
1951 										 RandomSource& rnd,           // pseudo-random source
1952 										 WalkMetrics& wlm,            // group walk left metrics
1953 										 SwMetrics& swmSeed,          // DP metrics for seed-extend
1954 										 SwMetrics& swmMate,          // DP metrics for mate finidng
1955 										 PerReadMetrics& prm,         // per-read metrics
1956 										 AlnSinkWrap<index_t>* msink, // AlnSink wrapper for multiseed-style aligner
1957 										 bool swMateImmediately,      // whether to look for mate immediately
1958 										 bool reportImmediately,      // whether to report hits immediately to msink
1959 										 bool discord,                // look for discordant alignments?
1960 										 bool mixed,                  // look for unpaired as well as paired alns?
1961 										 bool& exhaustive)
1962 {
1963 	bool all = msink->allHits();
1964 	// typedef std::pair<uint32_t, uint32_t> U32Pair;
1965 
1966 	assert(!reportImmediately || msink != NULL);
1967 	assert(!reportImmediately || !msink->maxed());
1968 	assert(!msink->state().doneWithMate(anchor1));
1969 
1970 	assert_geq(nceil, 0);
1971 	assert_geq(onceil, 0);
1972 	assert_leq((size_t)nceil,  rd.length());
1973 	assert_leq((size_t)onceil, ord.length());
1974 
1975 	const index_t rdlen  = rd.length();
1976 	const index_t ordlen = ord.length();
1977 	const TAlScore perfectScore = sc.perfectScore(rdlen);
1978 	const TAlScore operfectScore = sc.perfectScore(ordlen);
1979 
1980 	assert_leq(minsc, perfectScore);
1981 	assert(oppFilt || ominsc <= operfectScore);
1982 
1983 	TAlScore bestPairScore = perfectScore + operfectScore;
1984 	if(tighten > 0 && msink->Mmode() && msink->hasSecondBestPair()) {
1985 		// Paired-end alignments should have at least this score from now
1986 		TAlScore ps;
1987 		if(tighten == 1) {
1988 			ps = msink->bestPair();
1989 		} else if(tighten == 2) {
1990 			ps = msink->secondBestPair();
1991 		} else {
1992 			TAlScore diff = msink->bestPair() - msink->secondBestPair();
1993 			ps = msink->secondBestPair() + (diff * 3)/4;
1994 		}
1995 		if(tighten == 1 && ps < bestPairScore &&
1996 		   msink->bestPair() == msink->secondBestPair())
1997 		{
1998 			ps++;
1999 		}
2000 		if(tighten >= 2 && ps < bestPairScore) {
2001 			ps++;
2002 		}
2003 		// Anchor mate must have score at least 'ps' minus the best possible
2004 		// score for the opposite mate.
2005 		TAlScore nc = ps - operfectScore;
2006 		if(nc > minsc) {
2007 			minsc = nc;
2008 		}
2009 		assert_leq(minsc, perfectScore);
2010 	}
2011 
2012 	DynProgFramer dpframe(!gReportOverhangs);
2013 	swa.reset();
2014 	oswa.reset();
2015 
2016 	// Initialize a set of GroupWalks, one for each seed.  Also, intialize the
2017 	// accompanying lists of reference seed hits (satups*)
2018 	const index_t nsm = 5;
2019 	const index_t nonz = sh.nonzeroOffsets(); // non-zero positions
2020 	index_t eeHits = sh.numE2eHits();
2021 	bool eeMode = eeHits > 0;
2022 	bool firstEe = true;
2023 	bool firstExtend = true;
2024 
2025 	// Reset all the counters related to streaks
2026 	prm.nEeFail = 0;
2027 	prm.nUgFail = 0;
2028 	prm.nDpFail = 0;
2029 
2030 	index_t nelt = 0, neltLeft = 0;
2031 	const index_t rows = rdlen;
2032 	const index_t orows  = ordlen;
2033 	index_t eltsDone = 0;
2034 	while(true) {
2035 		if(eeMode) {
2036 			if(firstEe) {
2037 				firstEe = false;
2038 				eeMode = eeSaTups(
2039 								  rd,           // read
2040 								  sh,           // seed hits to extend into full alignments
2041 								  gfmFw,        // BWT
2042 								  ref,          // Reference strings
2043 								  rnd,          // pseudo-random generator
2044 								  wlm,          // group walk left metrics
2045 								  swmSeed,      // seed-extend metrics
2046 								  nelt,         // out: # elements total
2047 								  maxIters,     // max elts to report
2048 								  all);         // report all hits
2049 				assert_eq(gws_.size(), rands_.size());
2050 				assert_eq(gws_.size(), satpos_.size());
2051 				neltLeft = nelt;
2052 				// Initialize list that contains the mate-finding failure
2053 				// streak for each range
2054 				mateStreaks_.resize(gws_.size());
2055 				mateStreaks_.fill(0);
2056 			} else {
2057 				eeMode = false;
2058 			}
2059 		}
2060 		if(!eeMode) {
2061 			if(nonz == 0) {
2062 				// No seed hits!  Bail.
2063 				return EXTEND_EXHAUSTED_CANDIDATES;
2064 			}
2065 			if(msink->Mmode() && minsc == perfectScore) {
2066 				// Already found all perfect hits!
2067 				return EXTEND_PERFECT_SCORE;
2068 			}
2069 			if(firstExtend) {
2070 				nelt = 0;
2071 				prioritizeSATups(
2072 								 rd,            // read
2073 								 sh,            // seed hits to extend into full alignments
2074 								 gfmFw,         // BWT
2075 								 gfmBw,         // BWT'
2076 								 ref,           // Reference strings
2077 								 seedmms,       // # seed mismatches allowed
2078 								 maxIters,      // max rows to consider per position
2079 								 doExtend,      // extend out seeds
2080 								 true,          // square extended length
2081 								 true,          // square SA range size
2082 								 nsm,           // smallness threshold
2083 								 ca,            // alignment cache for seed hits
2084 								 rnd,           // pseudo-random generator
2085 								 wlm,           // group walk left metrics
2086 								 prm,           // per-read metrics
2087 								 nelt,          // out: # elements total
2088 								 all);          // report all hits?
2089 				assert_eq(gws_.size(), rands_.size());
2090 				assert_eq(gws_.size(), satpos_.size());
2091 				neltLeft = nelt;
2092 				firstExtend = false;
2093 				mateStreaks_.resize(gws_.size());
2094 				mateStreaks_.fill(0);
2095 			}
2096 			if(neltLeft == 0) {
2097 				// Finished examining gapped candidates
2098 				break;
2099 			}
2100 		}
2101 		for(index_t i = 0; i < gws_.size(); i++) {
2102 			if(eeMode && eehits_[i].score < minsc) {
2103 				return EXTEND_PERFECT_SCORE;
2104 			}
2105 			bool is_small      = satpos_[i].sat.size() < nsm;
2106 			bool fw            = satpos_[i].pos.fw;
2107 			index_t rdoff      = satpos_[i].pos.rdoff;
2108 			index_t seedhitlen = satpos_[i].pos.seedlen;
2109 			if(!fw) {
2110 				// 'rdoff' and 'offidx' are with respect to the 5' end of
2111 				// the read.  Here we convert rdoff to be with respect to
2112 				// the upstream (3') end of ther read.
2113 				rdoff = (index_t)(rdlen - rdoff - seedhitlen);
2114 			}
2115 			bool first = true;
2116 			// If the range is small, investigate all elements now.  If the
2117 			// range is large, just investigate one and move on - we might come
2118 			// back to this range later.
2119 			while(!rands_[i].done() && (first || is_small || eeMode)) {
2120 				if(minsc == perfectScore) {
2121 					if(!eeMode || eehits_[i].score < perfectScore) {
2122 						return EXTEND_PERFECT_SCORE;
2123 					}
2124 				} else if(eeMode && eehits_[i].score < minsc) {
2125 					break;
2126 				}
2127 				if(prm.nExDps >= maxDp || prm.nMateDps >= maxDp) {
2128 					return EXTEND_EXCEEDED_HARD_LIMIT;
2129 				}
2130 				if(prm.nExUgs >= maxUg || prm.nMateUgs >= maxUg) {
2131 					return EXTEND_EXCEEDED_HARD_LIMIT;
2132 				}
2133 				if(prm.nExIters >= maxIters) {
2134 					return EXTEND_EXCEEDED_HARD_LIMIT;
2135 				}
2136 				if(eeMode && prm.nEeFail >= maxEeStreak) {
2137 					return EXTEND_EXCEEDED_SOFT_LIMIT;
2138 				}
2139 				if(!eeMode && prm.nDpFail >= maxDpStreak) {
2140 					return EXTEND_EXCEEDED_SOFT_LIMIT;
2141 				}
2142 				if(!eeMode && prm.nUgFail >= maxUgStreak) {
2143 					return EXTEND_EXCEEDED_SOFT_LIMIT;
2144 				}
2145 				if(mateStreaks_[i] >= maxMateStreak) {
2146 					// Don't try this seed range anymore
2147 					rands_[i].setDone();
2148 					assert(rands_[i].done());
2149 					break;
2150 				}
2151 				prm.nExIters++;
2152 				first = false;
2153 				assert(!gws_[i].done());
2154 				// Resolve next element offset
2155 				WalkResult<index_t> wr;
2156 				uint32_t elt = rands_[i].next(rnd);
2157 				SARangeWithOffs<TSlice, index_t> sa;
2158 				sa.topf = satpos_[i].sat.topf;
2159 				sa.len = satpos_[i].sat.key.len;
2160 				sa.offs = satpos_[i].sat.offs;
2161 				gws_[i].advanceElement((index_t)elt, gfmFw, ref, sa, gwstate_, wr, wlm, prm);
2162 				eltsDone++;
2163 				assert_gt(neltLeft, 0);
2164 				neltLeft--;
2165 				assert_neq((index_t)OFF_MASK, wr.toff);
2166 				index_t tidx = 0, toff = 0, tlen = 0;
2167 				bool straddled = false;
2168 				gfmFw.joinedToTextOff(
2169                                       wr.elt.len,
2170                                       wr.toff,
2171                                       tidx,
2172                                       toff,
2173                                       tlen,
2174                                       eeMode,       // reject straddlers?
2175                                       straddled);   // straddled?
2176 				if(tidx == (index_t)OFF_MASK) {
2177 					// The seed hit straddled a reference boundary so the seed hit
2178 					// isn't valid
2179 					continue;
2180 				}
2181 #ifndef NDEBUG
2182 				if(!eeMode && !straddled) { // Check that seed hit matches reference
2183 					uint64_t key = satpos_[i].sat.key.seq;
2184 					for(index_t k = 0; k < wr.elt.len; k++) {
2185 						int c = ref.getBase(tidx, toff + wr.elt.len - k - 1);
2186 						assert_leq(c, 3);
2187 						int ck = (int)(key & 3);
2188 						key >>= 2;
2189 						assert_eq(c, ck);
2190 					}
2191 				}
2192 #endif
2193 				// Find offset of alignment's upstream base assuming net gaps=0
2194 				// between beginning of read and beginning of seed hit
2195 				int64_t refoff = (int64_t)toff - rdoff;
2196 				EIvalMergeListBinned& seenDiags  = anchor1 ? seenDiags1_ : seenDiags2_;
2197 				// Coordinate of the seed hit w/r/t the pasted reference string
2198 				Coord refcoord(tidx, refoff, fw);
2199 				if(seenDiags.locusPresent(refcoord)) {
2200 					// Already handled alignments seeded on this diagonal
2201 					prm.nRedundants++;
2202 					swmSeed.rshit++;
2203 					continue;
2204 				}
2205 				// Now that we have a seed hit, there are many issues to solve
2206 				// before we have a completely framed dynamic programming problem.
2207 				// They include:
2208 				//
2209 				// 1. Setting reference offsets on either side of the seed hit,
2210 				//    accounting for where the seed occurs in the read
2211 				// 2. Adjusting the width of the banded dynamic programming problem
2212 				//    and adjusting reference bounds to allow for gaps in the
2213 				//    alignment
2214 				// 3. Accounting for the edges of the reference, which can impact
2215 				//    the width of the DP problem and reference bounds.
2216 				// 4. Perhaps filtering the problem down to a smaller problem based
2217 				//    on what DPs we've already solved for this read
2218 				//
2219 				// We do #1 here, since it is simple and we have all the seed-hit
2220 				// information here.  #2 and #3 are handled in the DynProgFramer.
2221 				int readGaps = 0, refGaps = 0;
2222 				bool ungapped = false;
2223 				if(!eeMode) {
2224 					readGaps = sc.maxReadGaps(minsc, rdlen);
2225 					refGaps  = sc.maxRefGaps(minsc, rdlen);
2226 					ungapped = (readGaps == 0 && refGaps == 0);
2227 				}
2228 				int state = FOUND_NONE;
2229 				bool found = false;
2230 				// In unpaired mode, a seed extension is successful if it
2231 				// results in a full alignment that meets the minimum score
2232 				// threshold.  In paired-end mode, a seed extension is
2233 				// successful if it results in a *full paired-end* alignment
2234 				// that meets the minimum score threshold.
2235 				if(eeMode) {
2236 					resEe_.reset();
2237 					resEe_.alres.reset();
2238 					const EEHit<index_t>& h = eehits_[i];
2239 					assert_leq(h.score, perfectScore);
2240 					resEe_.alres.setScore(AlnScore(h.score, h.ns(), 0));
2241 					resEe_.alres.setShape(
2242 										  refcoord.ref(),  // ref id
2243 										  refcoord.off(),  // 0-based ref offset
2244 										  tlen,            // reference length
2245 										  fw,              // aligned to Watson?
2246 										  rdlen,           // read length
2247 										  true,            // pretrim soft?
2248 										  0,               // pretrim 5' end
2249 										  0,               // pretrim 3' end
2250 										  true,            // alignment trim soft?
2251 										  0,               // alignment trim 5' end
2252 										  0);              // alignment trim 3' end
2253 					resEe_.alres.setRefNs(h.refns());
2254 					if(h.mms() > 0) {
2255 						assert_eq(1, h.mms());
2256 						assert_lt(h.e1.pos, rd.length());
2257 						resEe_.alres.ned().push_back(h.e1);
2258 					}
2259 					assert(resEe_.repOk(rd));
2260 					state = FOUND_EE;
2261 					found = true;
2262 					Interval refival(refcoord, 1);
2263 					seenDiags.add(refival);
2264 					prm.nExEes++;
2265 					prm.nEeFail++; // say it's failed until proven successful
2266 					prm.nExEeFails++;
2267 				} else if(doUngapped && ungapped) {
2268 					resUngap_.reset();
2269 					int al = swa.ungappedAlign(
2270 											   fw ? rd.patFw : rd.patRc,
2271 											   fw ? rd.qual  : rd.qualRev,
2272 											   refcoord,
2273 											   ref,
2274 											   tlen,
2275 											   sc,
2276 											   gReportOverhangs,
2277 											   minsc, // minimum
2278 											   resUngap_);
2279 					Interval refival(refcoord, 1);
2280 					seenDiags.add(refival);
2281 					prm.nExUgs++;
2282 					prm.nUgFail++; // say it's failed until proven successful
2283 					prm.nExUgFails++;
2284 					if(al == 0) {
2285 						swmSeed.ungapfail++;
2286 						continue;
2287 					} else if(al == -1) {
2288 						swmSeed.ungapnodec++;
2289 					} else {
2290 						found = true;
2291 						state = FOUND_UNGAPPED;
2292 						swmSeed.ungapsucc++;
2293 					}
2294 				}
2295 				int64_t pastedRefoff = (int64_t)wr.toff - rdoff;
2296 				DPRect rect;
2297 				if(state == FOUND_NONE) {
2298 					found = dpframe.frameSeedExtensionRect(
2299 														   refoff,   // ref offset implied by seed hit assuming no gaps
2300 														   rows,     // length of read sequence used in DP table
2301 														   tlen,     // length of reference
2302 														   readGaps, // max # of read gaps permitted in opp mate alignment
2303 														   refGaps,  // max # of ref gaps permitted in opp mate alignment
2304 														   (size_t)nceil, // # Ns permitted
2305 														   maxhalf,  // max width in either direction
2306 														   rect);    // DP rectangle
2307 					assert(rect.repOk());
2308 					// Add the seed diagonal at least
2309 					seenDiags.add(Interval(refcoord, 1));
2310 					if(!found) {
2311 						continue;
2312 					}
2313 				}
2314 				int64_t leftShift = refoff - rect.refl;
2315 				size_t nwindow = 0;
2316 				if(toff >= rect.refl) {
2317 					nwindow = (size_t)(toff - rect.refl);
2318 				}
2319 				// NOTE: We might be taking off more than we should because the
2320 				// pasted string omits non-A/C/G/T characters, but we included them
2321 				// when calculating leftShift.  We'll account for this later.
2322 				pastedRefoff -= leftShift;
2323 				size_t nsInLeftShift = 0;
2324 				if(state == FOUND_NONE) {
2325 					if(!swa.initedRead()) {
2326 						// Initialize the aligner with a new read
2327 						swa.initRead(
2328 									 rd.patFw,  // fw version of query
2329 									 rd.patRc,  // rc version of query
2330 									 rd.qual,   // fw version of qualities
2331 									 rd.qualRev,// rc version of qualities
2332 									 0,         // off of first char in 'rd' to consider
2333 									 rdlen,     // off of last char (excl) in 'rd' to consider
2334 									 sc);       // scoring scheme
2335 					}
2336 					swa.initRef(
2337 								fw,        // whether to align forward or revcomp read
2338 								tidx,      // reference aligned against
2339 								rect,      // DP rectangle
2340 								ref,       // Reference strings
2341 								tlen,      // length of reference sequence
2342 								sc,        // scoring scheme
2343 								minsc,     // minimum score permitted
2344 								enable8,   // use 8-bit SSE if possible?
2345 								cminlen,   // minimum length for using checkpointing scheme
2346 								cpow2,     // interval b/t checkpointed diags; 1 << this
2347 								doTri,     // triangular mini-fills?
2348 								true,      // this is a seed extension - not finding a mate
2349 								nwindow,
2350 								nsInLeftShift);
2351 					// Because of how we framed the problem, we can say that we've
2352 					// exhaustively scored the seed diagonal as well as maxgaps
2353 					// diagonals on either side
2354 					Interval refival(tidx, 0, fw, 0);
2355 					rect.initIval(refival);
2356 					seenDiags.add(refival);
2357 					// Now fill the dynamic programming matrix and return true iff
2358 					// there is at least one valid alignment
2359 					TAlScore bestCell = std::numeric_limits<TAlScore>::min();
2360 					found = swa.align(rnd, bestCell);
2361 					swmSeed.tallyGappedDp(readGaps, refGaps);
2362 					prm.nExDps++;
2363 					prm.nDpFail++;    // failed until proven successful
2364 					prm.nExDpFails++; // failed until proven successful
2365 					if(!found) {
2366 						TAlScore bestLast = anchor1 ? prm.bestLtMinscMate1 : prm.bestLtMinscMate2;
2367 						if(bestCell > std::numeric_limits<TAlScore>::min() && bestCell > bestLast) {
2368 							if(anchor1) {
2369 								prm.bestLtMinscMate1 = bestCell;
2370 							} else {
2371 								prm.bestLtMinscMate2 = bestCell;
2372 							}
2373 						}
2374 						continue; // Look for more anchor alignments
2375 					}
2376 				}
2377 				bool firstInner = true;
2378 				bool foundConcordant = false;
2379 				while(true) {
2380 					assert(found);
2381 					SwResult *res = NULL;
2382 					if(state == FOUND_EE) {
2383 						if(!firstInner) {
2384 							break;
2385 						}
2386 						res = &resEe_;
2387 						assert(res->repOk(rd));
2388 					} else if(state == FOUND_UNGAPPED) {
2389 						if(!firstInner) {
2390 							break;
2391 						}
2392 						res = &resUngap_;
2393 						assert(res->repOk(rd));
2394 					} else {
2395 						resGap_.reset();
2396 						assert(resGap_.empty());
2397 						if(swa.done()) {
2398 							break;
2399 						}
2400 						swa.nextAlignment(resGap_, minsc, rnd);
2401 						found = !resGap_.empty();
2402 						if(!found) {
2403 							break;
2404 						}
2405 						res = &resGap_;
2406 						assert(res->repOk(rd));
2407 					}
2408 					// TODO: If we're just taking anchor alignments out of the
2409 					// same rectangle, aren't we getting very similar
2410 					// rectangles for the opposite mate each time?  Seems like
2411 					// we could save some work by detecting this.
2412 					assert(res != NULL);
2413 					firstInner = false;
2414 					assert(res->alres.matchesRef(
2415 												 rd,
2416 												 ref,
2417 												 tmp_rf_,
2418 												 tmp_rdseq_,
2419 												 tmp_qseq_,
2420 												 raw_refbuf_,
2421 												 raw_destU32_,
2422 												 raw_matches_,
2423 												 tmp_reflens_,
2424 												 tmp_refoffs_));
2425 					Interval refival(tidx, 0, fw, tlen);
2426 					assert_gt(res->alres.refExtent(), 0);
2427 					if(gReportOverhangs &&
2428 					   !refival.containsIgnoreOrient(res->alres.refival()))
2429 					{
2430 						res->alres.clipOutside(true, 0, tlen);
2431 						if(res->alres.refExtent() == 0) {
2432 							continue;
2433 						}
2434 					}
2435 					assert(gReportOverhangs ||
2436 					       refival.containsIgnoreOrient(res->alres.refival()));
2437 					// Did the alignment fall entirely outside the reference?
2438 					if(!refival.overlapsIgnoreOrient(res->alres.refival())) {
2439 						continue;
2440 					}
2441 					// Is this alignment redundant with one we've seen previously?
2442 					if(redAnchor_.overlap(res->alres)) {
2443 						continue;
2444 					}
2445 					redAnchor_.add(res->alres);
2446 					// Annotate the AlnRes object with some key parameters
2447 					// that were used to obtain the alignment.
2448 					res->alres.setParams(
2449 										 seedmms,   // # mismatches allowed in seed
2450 										 seedlen,   // length of seed
2451 										 seedival,  // interval between seeds
2452 										 minsc);    // minimum score for valid alignment
2453 					bool foundMate = false;
2454 					TRefOff off = res->alres.refoff();
2455 					if( msink->state().doneWithMate(!anchor1) &&
2456 					   !msink->state().doneWithMate( anchor1))
2457 					{
2458 						// We're done with the opposite mate but not with the
2459 						// anchor mate; don't try to mate up the anchor.
2460 						swMateImmediately = false;
2461 					}
2462 					if(found && swMateImmediately) {
2463 						assert(!msink->state().doneWithMate(!anchor1));
2464 						bool oleft = false, ofw = false;
2465 						int64_t oll = 0, olr = 0, orl = 0, orr = 0;
2466 						assert(!msink->state().done());
2467 						foundMate = !oppFilt;
2468 						TAlScore ominsc_cur = ominsc;
2469 						//bool oungapped = false;
2470 						int oreadGaps = 0, orefGaps = 0;
2471 						//int oungappedAlign = -1; // defer
2472 						if(foundMate) {
2473 							// Adjust ominsc given the alignment score of the
2474 							// anchor mate
2475 							ominsc_cur = ominsc;
2476 							if(tighten > 0 && msink->Mmode() && msink->hasSecondBestPair()) {
2477 								// Paired-end alignments should have at least this score from now
2478 								TAlScore ps;
2479 								if(tighten == 1) {
2480 									ps = msink->bestPair();
2481 								} else if(tighten == 2) {
2482 									ps = msink->secondBestPair();
2483 								} else {
2484 									TAlScore diff = msink->bestPair() - msink->secondBestPair();
2485 									ps = msink->secondBestPair() + (diff * 3)/4;
2486 								}
2487 								if(tighten == 1 && ps < bestPairScore &&
2488 								   msink->bestPair() == msink->secondBestPair())
2489 								{
2490 									ps++;
2491 								}
2492 								if(tighten >= 2 && ps < bestPairScore) {
2493 									ps++;
2494 								}
2495 								// Anchor mate must have score at least 'ps' minus the best possible
2496 								// score for the opposite mate.
2497 								TAlScore nc = ps - res->alres.score().score();
2498 								if(nc > ominsc_cur) {
2499 									ominsc_cur = nc;
2500 									assert_leq(ominsc_cur, operfectScore);
2501 								}
2502 							}
2503 							oreadGaps = sc.maxReadGaps(ominsc_cur, ordlen);
2504 							orefGaps  = sc.maxRefGaps (ominsc_cur, ordlen);
2505 							//oungapped = (oreadGaps == 0 && orefGaps == 0);
2506 							// TODO: Something lighter-weight than DP to scan
2507 							// for other mate??
2508 							//if(oungapped) {
2509 							//	oresUngap_.reset();
2510 							//	oungappedAlign = oswa.ungappedAlign(
2511 							//		ofw ? ord.patFw : ord.patRc,
2512 							//		ofw ? ord.qual  : ord.qualRev,
2513 							//		orefcoord,
2514 							//		ref,
2515 							//		otlen,
2516 							//		sc,
2517 							//		gReportOverhangs,
2518 							//		ominsc_cur,
2519 							//		0,
2520 							//		oresUngap_);
2521 							//}
2522 							foundMate = pepol.otherMate(
2523 														anchor1,             // anchor mate is mate #1?
2524 														fw,                  // anchor aligned to Watson?
2525 														off,                 // offset of anchor mate
2526 														orows + oreadGaps,   // max # columns spanned by alignment
2527 														tlen,                // reference length
2528 														anchor1 ? rd.length() : ord.length(), // mate 1 len
2529 														anchor1 ? ord.length() : rd.length(), // mate 2 len
2530 														oleft,               // out: look left for opposite mate?
2531 														oll,
2532 														olr,
2533 														orl,
2534 														orr,
2535 														ofw);
2536 						}
2537 						DPRect orect;
2538 						if(foundMate) {
2539 							foundMate = dpframe.frameFindMateRect(
2540 																  !oleft,      // true iff anchor alignment is to the left
2541 																  oll,         // leftmost Watson off for LHS of opp aln
2542 																  olr,         // rightmost Watson off for LHS of opp aln
2543 																  orl,         // leftmost Watson off for RHS of opp aln
2544 																  orr,         // rightmost Watson off for RHS of opp aln
2545 																  orows,       // length of opposite mate
2546 																  tlen,        // length of reference sequence aligned to
2547 																  oreadGaps,   // max # of read gaps in opp mate aln
2548 																  orefGaps,    // max # of ref gaps in opp mate aln
2549 																  (size_t)onceil, // max # Ns on opp mate
2550 																  maxhalf,     // max width in either direction
2551 																  orect);      // DP rectangle
2552 							assert(!foundMate || orect.refr >= orect.refl);
2553 						}
2554 						if(foundMate) {
2555 							oresGap_.reset();
2556 							assert(oresGap_.empty());
2557 							if(!oswa.initedRead()) {
2558 								oswa.initRead(
2559 											  ord.patFw,  // read to align
2560 											  ord.patRc,  // qualities
2561 											  ord.qual,   // read to align
2562 											  ord.qualRev,// qualities
2563 											  0,          // off of first char to consider
2564 											  ordlen,     // off of last char (ex) to consider
2565 											  sc);        // scoring scheme
2566 							}
2567 							// Given the boundaries defined by refi and reff, initilize
2568 							// the SwAligner with the dynamic programming problem that
2569 							// aligns the read to this reference stretch.
2570 							size_t onsInLeftShift = 0;
2571 							assert_geq(orect.refr, orect.refl);
2572 							oswa.initRef(
2573 										 ofw,       // align forward or revcomp read?
2574 										 tidx,      // reference aligned against
2575 										 orect,     // DP rectangle
2576 										 ref,       // Reference strings
2577 										 tlen,      // length of reference sequence
2578 										 sc,        // scoring scheme
2579 										 ominsc_cur,// min score for valid alignments
2580 										 enable8,   // use 8-bit SSE if possible?
2581 										 cminlen,   // minimum length for using checkpointing scheme
2582 										 cpow2,     // interval b/t checkpointed diags; 1 << this
2583 										 doTri,     // triangular mini-fills?
2584 										 false,     // this is finding a mate - not seed ext
2585 										 0,         // nwindow?
2586 										 onsInLeftShift);
2587 							// TODO: Can't we add some diagonals to the
2588 							// opposite mate's seenDiags when we fill in the
2589 							// opposite mate's DP?  Or can we?  We might want
2590 							// to use this again as an anchor - will that still
2591 							// happen?  Also, isn't there a problem with
2592 							// consistency of the minimum score?  Minimum score
2593 							// here depends in part on the score of the anchor
2594 							// alignment here, but it won't when the current
2595 							// opposite becomes the anchor.
2596 
2597 							// Because of how we framed the problem, we can say
2598 							// that we've exhaustively explored the "core"
2599 							// diagonals
2600 							//Interval orefival(tidx, 0, ofw, 0);
2601 							//orect.initIval(orefival);
2602 							//oseenDiags.add(orefival);
2603 
2604 							// Now fill the dynamic programming matrix, return true
2605 							// iff there is at least one valid alignment
2606 							TAlScore bestCell = std::numeric_limits<TAlScore>::min();
2607 							foundMate = oswa.align(rnd, bestCell);
2608 							prm.nMateDps++;
2609 							swmMate.tallyGappedDp(oreadGaps, orefGaps);
2610 							if(!foundMate) {
2611 								TAlScore bestLast = anchor1 ? prm.bestLtMinscMate2 : prm.bestLtMinscMate1;
2612 								if(bestCell > std::numeric_limits<TAlScore>::min() && bestCell > bestLast) {
2613 									if(anchor1) {
2614 										prm.bestLtMinscMate2 = bestCell;
2615 									} else {
2616 										prm.bestLtMinscMate1 = bestCell;
2617 									}
2618 								}
2619 							}
2620 						}
2621 						bool didAnchor = false;
2622 						do {
2623 							oresGap_.reset();
2624 							assert(oresGap_.empty());
2625 							if(foundMate && oswa.done()) {
2626 								foundMate = false;
2627 							} else if(foundMate) {
2628 								oswa.nextAlignment(oresGap_, ominsc_cur, rnd);
2629 								foundMate = !oresGap_.empty();
2630 								assert(!foundMate || oresGap_.alres.matchesRef(
2631 																			   ord,
2632 																			   ref,
2633 																			   tmp_rf_,
2634 																			   tmp_rdseq_,
2635 																			   tmp_qseq_,
2636 																			   raw_refbuf_,
2637 																			   raw_destU32_,
2638 																			   raw_matches_,
2639 																			   tmp_reflens_,
2640 																			   tmp_refoffs_));
2641 							}
2642 							if(foundMate) {
2643 								// Redundant with one we've seen previously?
2644 								if(!redAnchor_.overlap(oresGap_.alres)) {
2645 									redAnchor_.add(oresGap_.alres);
2646 								}
2647 								assert_eq(ofw, oresGap_.alres.fw());
2648 								// Annotate the AlnRes object with some key parameters
2649 								// that were used to obtain the alignment.
2650 								oresGap_.alres.setParams(
2651 														 seedmms,    // # mismatches allowed in seed
2652 														 seedlen,    // length of seed
2653 														 seedival,   // interval between seeds
2654 														 ominsc);    // minimum score for valid alignment
2655 								assert_gt(oresGap_.alres.refExtent(), 0);
2656 								if(gReportOverhangs &&
2657 								   !refival.containsIgnoreOrient(oresGap_.alres.refival()))
2658 								{
2659 									oresGap_.alres.clipOutside(true, 0, tlen);
2660 									foundMate = oresGap_.alres.refExtent() > 0;
2661 								}
2662 								if(foundMate &&
2663 								   ((!gReportOverhangs &&
2664 									 !refival.containsIgnoreOrient(oresGap_.alres.refival())) ||
2665 									!refival.overlapsIgnoreOrient(oresGap_.alres.refival())))
2666 								{
2667 									foundMate = false;
2668 								}
2669 							}
2670 							ASSERT_ONLY(TRefId refid);
2671 							TRefOff off1, off2;
2672 							size_t len1, len2;
2673 							bool fw1, fw2;
2674 							int pairCl = PE_ALS_DISCORD;
2675 							if(foundMate) {
2676 								ASSERT_ONLY(refid =) res->alres.refid();
2677 								assert_eq(refid, oresGap_.alres.refid());
2678 								off1 = anchor1 ? off : oresGap_.alres.refoff();
2679 								off2 = anchor1 ? oresGap_.alres.refoff() : off;
2680 								len1 = anchor1 ?
2681 								res->alres.refExtent() : oresGap_.alres.refExtent();
2682 								len2 = anchor1 ?
2683 								oresGap_.alres.refExtent() : res->alres.refExtent();
2684 								fw1  = anchor1 ? res->alres.fw() : oresGap_.alres.fw();
2685 								fw2  = anchor1 ? oresGap_.alres.fw() : res->alres.fw();
2686 								// Check that final mate alignments are consistent with
2687 								// paired-end fragment constraints
2688 								pairCl = pepol.peClassifyPair(
2689 															  off1,
2690 															  len1,
2691 															  fw1,
2692 															  off2,
2693 															  len2,
2694 															  fw2);
2695 								// Instead of trying
2696 								//foundMate = pairCl != PE_ALS_DISCORD;
2697 							}
2698 							if(msink->state().doneConcordant()) {
2699 								foundMate = false;
2700 							}
2701 							if(reportImmediately) {
2702 								if(foundMate) {
2703 									// Report pair to the AlnSinkWrap
2704 									assert(!msink->state().doneConcordant());
2705 									assert(msink != NULL);
2706 									assert(res->repOk());
2707 									assert(oresGap_.repOk());
2708 									// Report an unpaired alignment
2709 									assert(!msink->maxed());
2710 									assert(!msink->state().done());
2711 									bool doneUnpaired = false;
2712 									//if(mixed || discord) {
2713 									// Report alignment for mate #1 as an
2714 									// unpaired alignment.
2715 									if(!anchor1 || !didAnchor) {
2716 										if(anchor1) {
2717 											didAnchor = true;
2718 										}
2719 										const AlnRes& r1 = anchor1 ?
2720 										res->alres : oresGap_.alres;
2721 										if(!redMate1_.overlap(r1)) {
2722 											redMate1_.add(r1);
2723 											if(msink->report(0, &r1, NULL)) {
2724 												doneUnpaired = true; // Short-circuited
2725 											}
2726 										}
2727 									}
2728 									// Report alignment for mate #2 as an
2729 									// unpaired alignment.
2730 									if(anchor1 || !didAnchor) {
2731 										if(!anchor1) {
2732 											didAnchor = true;
2733 										}
2734 										const AlnRes& r2 = anchor1 ?
2735 										oresGap_.alres : res->alres;
2736 										if(!redMate2_.overlap(r2)) {
2737 											redMate2_.add(r2);
2738 											if(msink->report(0, NULL, &r2)) {
2739 												doneUnpaired = true; // Short-circuited
2740 											}
2741 										}
2742 									}
2743 									//} // if(mixed || discord)
2744 									bool donePaired = false;
2745 									if(pairCl != PE_ALS_DISCORD) {
2746 										foundConcordant = true;
2747 										if(msink->report(
2748 														 0,
2749 														 anchor1 ? &res->alres : &oresGap_.alres,
2750 														 anchor1 ? &oresGap_.alres : &res->alres))
2751 										{
2752 											// Short-circuited because a limit, e.g.
2753 											// -k, -m or -M, was exceeded
2754 											donePaired = true;
2755 										} else {
2756 											if(tighten > 0 && msink->Mmode() && msink->hasSecondBestPair()) {
2757 												// Paired-end alignments should have at least this score from now
2758 												TAlScore ps;
2759 												if(tighten == 1) {
2760 													ps = msink->bestPair();
2761 												} else if(tighten == 2) {
2762 													ps = msink->secondBestPair();
2763 												} else {
2764 													TAlScore diff = msink->bestPair() - msink->secondBestPair();
2765 													ps = msink->secondBestPair() + (diff * 3)/4;
2766 												}
2767 												if(tighten == 1 && ps < bestPairScore &&
2768 												   msink->bestPair() == msink->secondBestPair())
2769 												{
2770 													ps++;
2771 												}
2772 												if(tighten >= 2 && ps < bestPairScore) {
2773 													ps++;
2774 												}
2775 												// Anchor mate must have score at least 'ps' minus the best possible
2776 												// score for the opposite mate.
2777 												TAlScore nc = ps - operfectScore;
2778 												if(nc > minsc) {
2779 													minsc = nc;
2780 													assert_leq(minsc, perfectScore);
2781 													if(minsc > res->alres.score().score()) {
2782 														// We're done with this anchor
2783 														break;
2784 													}
2785 												}
2786 												assert_leq(minsc, perfectScore);
2787 											}
2788 										}
2789 									} // if(pairCl != PE_ALS_DISCORD)
2790 									if(donePaired || doneUnpaired) {
2791 										return EXTEND_POLICY_FULFILLED;
2792 									}
2793 									if(msink->state().doneWithMate(anchor1)) {
2794 										// We're now done with the mate that we're
2795 										// currently using as our anchor.  We're not
2796 										// with the read overall.
2797 										return EXTEND_POLICY_FULFILLED;
2798 									}
2799 								} else if((mixed || discord) && !didAnchor) {
2800 									didAnchor = true;
2801 									// Report unpaired hit for anchor
2802 									assert(msink != NULL);
2803 									assert(res->repOk());
2804 									// Check that alignment accurately reflects the
2805 									// reference characters aligned to
2806 									assert(res->alres.matchesRef(
2807 																 rd,
2808 																 ref,
2809 																 tmp_rf_,
2810 																 tmp_rdseq_,
2811 																 tmp_qseq_,
2812 																 raw_refbuf_,
2813 																 raw_destU32_,
2814 																 raw_matches_,
2815 																 tmp_reflens_,
2816 																 tmp_refoffs_));
2817 									// Report an unpaired alignment
2818 									assert(!msink->maxed());
2819 									assert(!msink->state().done());
2820 									// Report alignment for mate #1 as an
2821 									// unpaired alignment.
2822 									if(!msink->state().doneUnpaired(anchor1)) {
2823 										const AlnRes& r = res->alres;
2824 										RedundantAlns& red = anchor1 ? redMate1_ : redMate2_;
2825 										const AlnRes* r1 = anchor1 ? &res->alres : NULL;
2826 										const AlnRes* r2 = anchor1 ? NULL : &res->alres;
2827 										if(!red.overlap(r)) {
2828 											red.add(r);
2829 											if(msink->report(0, r1, r2)) {
2830 												return EXTEND_POLICY_FULFILLED; // Short-circuited
2831 											}
2832 										}
2833 									}
2834 									if(msink->state().doneWithMate(anchor1)) {
2835 										// Done with mate, but not read overall
2836 										return EXTEND_POLICY_FULFILLED;
2837 									}
2838 								}
2839 							}
2840 						} while(!oresGap_.empty());
2841 					} // if(found && swMateImmediately)
2842 					else if(found) {
2843 						assert(!msink->state().doneWithMate(anchor1));
2844 						// We found an anchor alignment but did not attempt to find
2845 						// an alignment for the opposite mate (probably because
2846 						// we're done with it)
2847 						if(reportImmediately && (mixed || discord)) {
2848 							// Report unpaired hit for anchor
2849 							assert(msink != NULL);
2850 							assert(res->repOk());
2851 							// Check that alignment accurately reflects the
2852 							// reference characters aligned to
2853 							assert(res->alres.matchesRef(
2854 														 rd,
2855 														 ref,
2856 														 tmp_rf_,
2857 														 tmp_rdseq_,
2858 														 tmp_qseq_,
2859 														 raw_refbuf_,
2860 														 raw_destU32_,
2861 														 raw_matches_,
2862 														 tmp_reflens_,
2863 														 tmp_refoffs_));
2864 							// Report an unpaired alignment
2865 							assert(!msink->maxed());
2866 							assert(!msink->state().done());
2867 							// Report alignment for mate #1 as an
2868 							// unpaired alignment.
2869 							if(!msink->state().doneUnpaired(anchor1)) {
2870 								const AlnRes& r = res->alres;
2871 								RedundantAlns& red = anchor1 ? redMate1_ : redMate2_;
2872 								const AlnRes* r1 = anchor1 ? &res->alres : NULL;
2873 								const AlnRes* r2 = anchor1 ? NULL : &res->alres;
2874 								if(!red.overlap(r)) {
2875 									red.add(r);
2876 									if(msink->report(0, r1, r2)) {
2877 										return EXTEND_POLICY_FULFILLED; // Short-circuited
2878 									}
2879 								}
2880 							}
2881 							if(msink->state().doneWithMate(anchor1)) {
2882 								// Done with mate, but not read overall
2883 								return EXTEND_POLICY_FULFILLED;
2884 							}
2885 						}
2886 					}
2887 				} // while(true)
2888 
2889 				if(foundConcordant) {
2890 					prm.nMateDpSuccs++;
2891 					mateStreaks_[i] = 0;
2892 					// Register this as a success.  Now we need to
2893 					// make the streak variables reflect the
2894 					// success.
2895 					if(state == FOUND_UNGAPPED) {
2896 						assert_gt(prm.nUgFail, 0);
2897 						assert_gt(prm.nExUgFails, 0);
2898 						prm.nExUgFails--;
2899 						prm.nExUgSuccs++;
2900 						prm.nUgLastSucc = prm.nExUgs-1;
2901 						if(prm.nUgFail > prm.nUgFailStreak) {
2902 							prm.nUgFailStreak = prm.nUgFail;
2903 						}
2904 						prm.nUgFail = 0;
2905 					} else if(state == FOUND_EE) {
2906 						assert_gt(prm.nEeFail, 0);
2907 						assert_gt(prm.nExEeFails, 0);
2908 						prm.nExEeFails--;
2909 						prm.nExEeSuccs++;
2910 						prm.nEeLastSucc = prm.nExEes-1;
2911 						if(prm.nEeFail > prm.nEeFailStreak) {
2912 							prm.nEeFailStreak = prm.nEeFail;
2913 						}
2914 						prm.nEeFail = 0;
2915 					} else {
2916 						assert_gt(prm.nDpFail, 0);
2917 						assert_gt(prm.nExDpFails, 0);
2918 						prm.nExDpFails--;
2919 						prm.nExDpSuccs++;
2920 						prm.nDpLastSucc = prm.nExDps-1;
2921 						if(prm.nDpFail > prm.nDpFailStreak) {
2922 							prm.nDpFailStreak = prm.nDpFail;
2923 						}
2924 						prm.nDpFail = 0;
2925 					}
2926 				} else {
2927 					prm.nMateDpFails++;
2928 					mateStreaks_[i]++;
2929 				}
2930 				// At this point we know that we aren't bailing, and will continue to resolve seed hits.
2931 
2932 			} // while(!gw.done())
2933 		} // for(size_t i = 0; i < gws_.size(); i++)
2934 	}
2935 	return EXTEND_EXHAUSTED_CANDIDATES;
2936 }
2937 
2938 #endif /*ALIGNER_SW_DRIVER_H_*/
2939