1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 /*
21 * aligner_sw_driver.h
22 *
23 * REDUNDANT SEED HITS
24 *
25 * We say that two seed hits are redundant if they trigger identical
26 * seed-extend dynamic programming problems. Put another way, they both lie on
27 * the same diagonal of the overall read/reference dynamic programming matrix.
28 * Detecting redundant seed hits is simple when the seed hits are ungapped. We
29 * do this after offset resolution but before the offset is converted to genome
30 * coordinates (see uses of the seenDiags1_/seenDiags2_ fields for examples).
31 *
32 * REDUNDANT ALIGNMENTS
33 *
34 * In an unpaired context, we say that two alignments are redundant if they
35 * share any cells in the global DP table. Roughly speaking, this is like
36 * saying that two alignments are redundant if any read character aligns to the
37 * same reference character (same reference sequence, same strand, same offset)
38 * in both alignments.
39 *
40 * In a paired-end context, we say that two paired-end alignments are redundant
41 * if the mate #1s are redundant and the mate #2s are redundant.
42 *
43 * How do we enforce this? In the unpaired context, this is relatively simple:
44 * the cells from each alignment are checked against a set containing all cells
45 * from all previous alignments. Given a new alignment, for each cell in the
46 * new alignment we check whether it is in the set. If there is any overlap,
47 * the new alignment is rejected as redundant. Otherwise, the new alignment is
48 * accepted and its cells are added to the set.
49 *
50 * Enforcement in a paired context is a little trickier. Consider the
51 * following approaches:
52 *
53 * 1. Skip anchors that are redundant with any previous anchor or opposite
54 * alignment. This is sufficient to ensure no two concordant alignments
55 * found are redundant.
56 *
57 * 2. Same as scheme 1, but with a "transitive closure" scheme for finding all
58 * concordant pairs in the vicinity of an anchor. Consider the AB/AC
59 * scenario from the previous paragraph. If B is the anchor alignment, we
60 * will find AB but not AC. But under this scheme, once we find AB we then
61 * let B be a new anchor and immediately look for its opposites. Likewise,
62 * if we find any opposite, we make them anchors and continue searching. We
63 * don't stop searching until every opposite is used as an anchor.
64 *
65 * 3. Skip anchors that are redundant with any previous anchor alignment (but
66 * allow anchors that are redundant with previous opposite alignments).
67 * This isn't sufficient to avoid redundant concordant alignments. To avoid
68 * redundant concordants, we need an additional procedure that checks each
69 * new concordant alignment one-by-one against a list of previous concordant
70 * alignments to see if it is redundant.
71 *
72 * We take approach 1.
73 */
74
75 #ifndef ALIGNER_SW_DRIVER_H_
76 #define ALIGNER_SW_DRIVER_H_
77
78 #include <iostream>
79 // -- BTL remove --
80 #include <stdlib.h>
81 #include <sys/time.h>
82 // -- --
83 #include <utility>
84 #include "ds.h"
85 #include "aligner_seed.h"
86 #include "aligner_sw.h"
87 #include "aligner_cache.h"
88 #include "reference.h"
89 #include "group_walk.h"
90 #include "gfm.h"
91 #include "mem_ids.h"
92 #include "aln_sink.h"
93 #include "pe.h"
94 #include "ival_list.h"
95 #include "simple_func.h"
96 #include "random_util.h"
97 #include "dp_framer.h"
98
99 using namespace std;
100
101 template <typename index_t>
102 struct SeedPos {
103
SeedPosSeedPos104 SeedPos() : fw(false), offidx(0), rdoff(0), seedlen(0) { }
105
SeedPosSeedPos106 SeedPos(
107 bool fw_,
108 index_t offidx_,
109 index_t rdoff_,
110 index_t seedlen_)
111 {
112 init(fw_, offidx_, rdoff_, seedlen_);
113 }
114
initSeedPos115 void init(
116 bool fw_,
117 index_t offidx_,
118 index_t rdoff_,
119 index_t seedlen_)
120 {
121 fw = fw_;
122 offidx = offidx_;
123 rdoff = rdoff_;
124 seedlen = seedlen_;
125 }
126
127 bool operator<(const SeedPos& o) const {
128 if(offidx < o.offidx) return true;
129 if(offidx > o.offidx) return false;
130 if(rdoff < o.rdoff) return true;
131 if(rdoff > o.rdoff) return false;
132 if(seedlen < o.seedlen) return true;
133 if(seedlen > o.seedlen) return false;
134 if(fw && !o.fw) return true;
135 if(!fw && o.fw) return false;
136 return false;
137 }
138
139 bool operator>(const SeedPos& o) const {
140 if(offidx < o.offidx) return false;
141 if(offidx > o.offidx) return true;
142 if(rdoff < o.rdoff) return false;
143 if(rdoff > o.rdoff) return true;
144 if(seedlen < o.seedlen) return false;
145 if(seedlen > o.seedlen) return true;
146 if(fw && !o.fw) return false;
147 if(!fw && o.fw) return true;
148 return false;
149 }
150
151 bool operator==(const SeedPos& o) const {
152 return fw == o.fw && offidx == o.offidx &&
153 rdoff == o.rdoff && seedlen == o.seedlen;
154 }
155
156 bool fw;
157 index_t offidx;
158 index_t rdoff;
159 index_t seedlen;
160 };
161
162 /**
163 * An SATuple along with the associated seed position.
164 */
165 template <typename index_t>
166 struct SATupleAndPos {
167
168 SATuple<index_t> sat; // result for this seed hit
169 SeedPos<index_t> pos; // seed position that yielded the range this was taken from
170 index_t origSz; // size of range this was taken from
171 index_t nlex; // # position we can extend seed hit to left w/o edit
172 index_t nrex; // # position we can extend seed hit to right w/o edit
173
174 bool operator<(const SATupleAndPos& o) const {
175 if(sat < o.sat) return true;
176 if(sat > o.sat) return false;
177 return pos < o.pos;
178 }
179
180 bool operator==(const SATupleAndPos& o) const {
181 return sat == o.sat && pos == o.pos;
182 }
183 };
184
185 /**
186 * Encapsulates the weighted random sampling scheme we want to use to pick
187 * which seed hit range to sample a row from.
188 */
189 template <typename index_t>
190 class RowSampler {
191
192 public:
193
elim_(cat)194 RowSampler(int cat = 0) : elim_(cat), masses_(cat) {
195 mass_ = 0.0f;
196 }
197
198 /**
199 * Initialze sampler with respect to a range of elements in a list of
200 * SATupleAndPos's.
201 */
init(const EList<SATupleAndPos<index_t>,16> & salist,index_t sai,index_t saf,bool lensq,bool szsq)202 void init(
203 const EList<SATupleAndPos<index_t>, 16>& salist,
204 index_t sai,
205 index_t saf,
206 bool lensq, // whether to square the numerator, which = extended length
207 bool szsq) // whether to square denominator, which =
208 {
209 assert_gt(saf, sai);
210 elim_.resize(saf - sai);
211 elim_.fill(false);
212 // Initialize mass
213 mass_ = 0.0f;
214 masses_.resize(saf - sai);
215 for(index_t i = sai; i < saf; i++) {
216 index_t len = salist[i].nlex + salist[i].nrex + 1; // + salist[i].sat.key.len;
217 double num = (double)len;
218 if(lensq) {
219 num *= num;
220 }
221 double denom = (double)salist[i].sat.size();
222 if(szsq) {
223 denom *= denom;
224 }
225 masses_[i - sai] = num / denom;
226 mass_ += masses_[i - sai];
227 }
228 }
229
230 /**
231 * Caller is indicating that the bin at index i is exhausted and we should
232 * exclude it from our sampling from now on.
233 */
finishedRange(index_t i)234 void finishedRange(index_t i) {
235 assert_lt(i, masses_.size());
236 elim_[i] = true;
237 mass_ -= masses_[i];
238 }
239
240 /**
241 * Sample randomly from the mass.
242 */
next(RandomSource & rnd)243 size_t next(RandomSource& rnd) {
244 // Throw the dart
245 double rd = rnd.nextFloat() * mass_;
246 double mass_sofar = 0.0f;
247 size_t sz = masses_.size();
248 size_t last_unelim = std::numeric_limits<size_t>::max();
249 for(size_t i = 0; i < sz; i++) {
250 if(!elim_[i]) {
251 last_unelim = i;
252 mass_sofar += masses_[i];
253 if(rd < mass_sofar) {
254 // This is the one we hit
255 return i;
256 }
257 }
258 }
259 assert_neq(std::numeric_limits<size_t>::max(), last_unelim);
260 return last_unelim;
261 }
262
263 protected:
264 double mass_; // total probability mass to throw darts at
265 EList<bool> elim_; // whether the range is eliminated
266 EList<double> masses_; // mass of each range
267 };
268
269 /**
270 * Return values from extendSeeds and extendSeedsPaired.
271 */
272 enum {
273 // All end-to-end and seed hits were examined
274 // The policy does not need us to look any further
275 EXTEND_EXHAUSTED_CANDIDATES = 1,
276 EXTEND_POLICY_FULFILLED,
277 // We stopped because we reached a point where the only remaining
278 // alignments of interest have perfect scores, but we already investigated
279 // perfect alignments
280 EXTEND_PERFECT_SCORE,
281 // We stopped because we ran up against a limit on how much work we should
282 // do for one set of seed ranges, e.g. the limit on number of consecutive
283 // unproductive DP extensions
284 EXTEND_EXCEEDED_SOFT_LIMIT,
285 // We stopped because we ran up against a limit on how much work we should
286 // do for overall before giving up on a mate
287 EXTEND_EXCEEDED_HARD_LIMIT
288 };
289
290 /**
291 * Data structure encapsulating a range that's been extended out in two
292 * directions.
293 */
294 struct ExtendRange {
295
initExtendRange296 void init(size_t off_, size_t len_, size_t sz_) {
297 off = off_; len = len_; sz = sz_;
298 }
299
300 size_t off; // offset of extended region
301 size_t len; // length between extremes of extended region
302 size_t sz; // # of elements in SA range
303 };
304
305 template <typename index_t>
306 class SwDriver {
307
308 typedef PList<index_t, CACHE_PAGE_SZ> TSAList;
309
310 public:
311
SwDriver(size_t bytes)312 SwDriver(size_t bytes) :
313 satups_(DP_CAT),
314 gws_(DP_CAT),
315 seenDiags1_(DP_CAT),
316 seenDiags2_(DP_CAT),
317 redAnchor_(DP_CAT),
318 redMate1_(DP_CAT),
319 redMate2_(DP_CAT),
320 pool_(bytes, CACHE_PAGE_SZ, DP_CAT),
321 salistEe_(DP_CAT),
322 gwstate_(GW_CAT) { }
323
324 /**
325 * Given a collection of SeedHits for a single read, extend seed alignments
326 * into full alignments. Where possible, try to avoid redundant offset
327 * lookups and dynamic programming problems. Optionally report alignments
328 * to a AlnSinkWrap object as they are discovered.
329 *
330 * If 'reportImmediately' is true, returns true iff a call to
331 * mhs->report() returned true (indicating that the reporting
332 * policy is satisfied and we can stop). Otherwise, returns false.
333 */
334 int extendSeeds(
335 Read& rd, // read to align
336 bool mate1, // true iff rd is mate #1
337 SeedResults<index_t>& sh, // seed hits to extend into full alignments
338 const GFM<index_t>& gfmFw, // BWT
339 const GFM<index_t>* gfmBw, // BWT'
340 const BitPairReference& ref, // Reference strings
341 SwAligner& swa, // dynamic programming aligner
342 const Scoring& sc, // scoring scheme
343 int seedmms, // # mismatches allowed in seed
344 int seedlen, // length of seed
345 int seedival, // interval between seeds
346 TAlScore& minsc, // minimum score for anchor
347 int nceil, // maximum # Ns permitted in ref portion
348 size_t maxhalf, // maximum width on one side of DP table
349 bool doUngapped, // do ungapped alignment
350 size_t maxIters, // stop after this many seed-extend loop iters
351 size_t maxUg, // max # ungapped extends
352 size_t maxDp, // max # DPs
353 size_t maxUgStreak, // stop after streak of this many ungap fails
354 size_t maxDpStreak, // stop after streak of this many dp fails
355 bool doExtend, // do seed extension
356 bool enable8, // use 8-bit SSE where possible
357 size_t cminlen, // use checkpointer if read longer than this
358 size_t cpow2, // interval between diagonals to checkpoint
359 bool doTri, // triangular mini-fills
360 int tighten, // -M score tightening mode
361 AlignmentCacheIface<index_t>& ca, // alignment cache for seed hits
362 RandomSource& rnd, // pseudo-random source
363 WalkMetrics& wlm, // group walk left metrics
364 SwMetrics& swmSeed, // DP metrics for seed-extend
365 PerReadMetrics& prm, // per-read metrics
366 AlnSinkWrap<index_t>* mhs, // HitSink for multiseed-style aligner
367 bool reportImmediately, // whether to report hits immediately to mhs
368 bool& exhaustive);
369
370 /**
371 * Given a collection of SeedHits for a read pair, extend seed
372 * alignments into full alignments and then look for the opposite
373 * mate using dynamic programming. Where possible, try to avoid
374 * redundant offset lookups. Optionally report alignments to a
375 * AlnSinkWrap object as they are discovered.
376 *
377 * If 'reportImmediately' is true, returns true iff a call to
378 * mhs->report() returned true (indicating that the reporting
379 * policy is satisfied and we can stop). Otherwise, returns false.
380 */
381 int extendSeedsPaired(
382 Read& rd, // mate to align as anchor
383 Read& ord, // mate to align as opposite
384 bool anchor1, // true iff anchor mate is mate1
385 bool oppFilt, // true iff opposite mate was filtered out
386 SeedResults<index_t>& sh, // seed hits for anchor
387 const GFM<index_t>& gfmFw, // BWT
388 const GFM<index_t>* gfmBw, // BWT'
389 const BitPairReference& ref, // Reference strings
390 SwAligner& swa, // dyn programming aligner for anchor
391 SwAligner& swao, // dyn programming aligner for opposite
392 const Scoring& sc, // scoring scheme
393 const PairedEndPolicy& pepol,// paired-end policy
394 int seedmms, // # mismatches allowed in seed
395 int seedlen, // length of seed
396 int seedival, // interval between seeds
397 TAlScore& minsc, // minimum score for anchor
398 TAlScore& ominsc, // minimum score for opposite
399 int nceil, // max # Ns permitted in ref for anchor
400 int onceil, // max # Ns permitted in ref for opposite
401 bool nofw, // don't align forward read
402 bool norc, // don't align revcomp read
403 size_t maxhalf, // maximum width on one side of DP table
404 bool doUngapped, // do ungapped alignment
405 size_t maxIters, // stop after this many seed-extend loop iters
406 size_t maxUg, // max # ungapped extends
407 size_t maxDp, // max # DPs
408 size_t maxEeStreak, // stop after streak of this many end-to-end fails
409 size_t maxUgStreak, // stop after streak of this many ungap fails
410 size_t maxDpStreak, // stop after streak of this many dp fails
411 size_t maxMateStreak, // stop seed range after N mate-find fails
412 bool doExtend, // do seed extension
413 bool enable8, // use 8-bit SSE where possible
414 size_t cminlen, // use checkpointer if read longer than this
415 size_t cpow2, // interval between diagonals to checkpoint
416 bool doTri, // triangular mini-fills
417 int tighten, // -M score tightening mode
418 AlignmentCacheIface<index_t>& cs, // alignment cache for seed hits
419 RandomSource& rnd, // pseudo-random source
420 WalkMetrics& wlm, // group walk left metrics
421 SwMetrics& swmSeed, // DP metrics for seed-extend
422 SwMetrics& swmMate, // DP metrics for mate finidng
423 PerReadMetrics& prm, // per-read metrics for anchor
424 AlnSinkWrap<index_t>* msink, // AlnSink wrapper for multiseed-style aligner
425 bool swMateImmediately, // whether to look for mate immediately
426 bool reportImmediately, // whether to report hits immediately to msink
427 bool discord, // look for discordant alignments?
428 bool mixed, // look for unpaired as well as paired alns?
429 bool& exhaustive);
430
431 /**
432 * Prepare for a new read.
433 */
nextRead(bool paired,size_t mate1len,size_t mate2len)434 void nextRead(bool paired, size_t mate1len, size_t mate2len) {
435 redAnchor_.reset();
436 seenDiags1_.reset();
437 seenDiags2_.reset();
438 seedExRangeFw_[0].clear(); // mate 1 fw
439 seedExRangeFw_[1].clear(); // mate 2 fw
440 seedExRangeRc_[0].clear(); // mate 1 rc
441 seedExRangeRc_[1].clear(); // mate 2 rc
442 size_t maxlen = mate1len;
443 if(paired) {
444 redMate1_.reset();
445 redMate1_.init(mate1len);
446 redMate2_.reset();
447 redMate2_.init(mate2len);
448 if(mate2len > maxlen) {
449 maxlen = mate2len;
450 }
451 }
452 redAnchor_.init(maxlen);
453 }
454
455 protected:
456
457 bool eeSaTups(
458 const Read& rd, // read
459 SeedResults<index_t>& sh, // seed hits to extend into full alignments
460 const GFM<index_t>& gfm, // BWT
461 const BitPairReference& ref, // Reference strings
462 RandomSource& rnd, // pseudo-random generator
463 WalkMetrics& wlm, // group walk left metrics
464 SwMetrics& swmSeed, // metrics for seed extensions
465 index_t& nelt_out, // out: # elements total
466 index_t maxelts, // max # elts to report
467 bool all); // report all hits?
468
469 void extend(
470 const Read& rd, // read
471 const GFM<index_t>& gfmFw, // Forward Bowtie index
472 const GFM<index_t>* gfmBw, // Backward Bowtie index
473 index_t topf, // top in fw index
474 index_t botf, // bot in fw index
475 index_t topb, // top in bw index
476 index_t botb, // bot in bw index
477 bool fw, // seed orientation
478 index_t off, // seed offset from 5' end
479 index_t len, // seed length
480 PerReadMetrics& prm, // per-read metrics
481 index_t& nlex, // # positions we can extend to left w/o edit
482 index_t& nrex); // # positions we can extend to right w/o edit
483
484 void prioritizeSATups(
485 const Read& rd, // read
486 SeedResults<index_t>& sh, // seed hits to extend into full alignments
487 const GFM<index_t>& gfmFw, // BWT
488 const GFM<index_t>* gfmBw, // BWT'
489 const BitPairReference& ref, // Reference strings
490 int seedmms, // # seed mismatches allowed
491 index_t maxelt, // max elts we'll consider
492 bool doExtend, // extend out seeds
493 bool lensq, // square extended length
494 bool szsq, // square SA range size
495 index_t nsm, // if range as <= nsm elts, it's "small"
496 AlignmentCacheIface<index_t>& ca, // alignment cache for seed hits
497 RandomSource& rnd, // pseudo-random generator
498 WalkMetrics& wlm, // group walk left metrics
499 PerReadMetrics& prm, // per-read metrics
500 index_t& nelt_out, // out: # elements total
501 bool all); // report all hits?
502
503 Random1toN rand_; // random number generators
504 EList<Random1toN, 16> rands_; // random number generators
505 EList<Random1toN, 16> rands2_; // random number generators
506 EList<EEHit<index_t>, 16> eehits_; // holds end-to-end hits
507 EList<SATupleAndPos<index_t>, 16> satpos_; // holds SATuple, SeedPos pairs
508 EList<SATupleAndPos<index_t>, 16> satpos2_; // holds SATuple, SeedPos pairs
509 EList<SATuple<index_t>, 16> satups_; // holds SATuples to explore elements from
510 EList<GroupWalk2S<index_t, TSlice, 16> > gws_; // list of GroupWalks; no particular order
511 EList<size_t> mateStreaks_; // mate-find fail streaks
512 RowSampler<index_t> rowsamp_; // row sampler
513
514 // Ranges that we've extended through when extending seed hits
515 EList<ExtendRange> seedExRangeFw_[2];
516 EList<ExtendRange> seedExRangeRc_[2];
517
518 // Data structures encapsulating the diagonals that have already been used
519 // to seed alignment for mate 1 and mate 2.
520 EIvalMergeListBinned seenDiags1_;
521 EIvalMergeListBinned seenDiags2_;
522
523 // For weeding out redundant alignments
524 RedundantAlns redAnchor_; // database of cells used for anchor alignments
525 RedundantAlns redMate1_; // database of cells used for mate 1 alignments
526 RedundantAlns redMate2_; // database of cells used for mate 2 alignments
527
528 // For holding results for anchor (res_) and opposite (ores_) mates
529 SwResult resGap_; // temp holder for alignment result
530 SwResult oresGap_; // temp holder for alignment result, opp mate
531 SwResult resUngap_; // temp holder for ungapped alignment result
532 SwResult oresUngap_; // temp holder for ungap. aln. opp mate
533 SwResult resEe_; // temp holder for ungapped alignment result
534 SwResult oresEe_; // temp holder for ungap. aln. opp mate
535
536 Pool pool_; // memory pages for salistExact_
537 TSAList salistEe_; // PList for offsets for end-to-end hits
538 GroupWalkState<index_t> gwstate_; // some per-thread state shared by all GroupWalks
539
540 // For AlnRes::matchesRef:
541 ASSERT_ONLY(SStringExpandable<char> raw_refbuf_);
542 ASSERT_ONLY(SStringExpandable<uint32_t> raw_destU32_);
543 ASSERT_ONLY(EList<bool> raw_matches_);
544 ASSERT_ONLY(BTDnaString tmp_rf_);
545 ASSERT_ONLY(BTDnaString tmp_rdseq_);
546 ASSERT_ONLY(BTString tmp_qseq_);
547 ASSERT_ONLY(EList<index_t> tmp_reflens_);
548 ASSERT_ONLY(EList<index_t> tmp_refoffs_);
549 };
550
551 #define TIMER_START() \
552 struct timeval tv_i, tv_f; \
553 struct timezone tz_i, tz_f; \
554 size_t total_usecs; \
555 gettimeofday(&tv_i, &tz_i)
556
557 #define IF_TIMER_END() \
558 gettimeofday(&tv_f, &tz_f); \
559 total_usecs = \
560 (tv_f.tv_sec - tv_i.tv_sec) * 1000000 + (tv_f.tv_usec - tv_i.tv_usec); \
561 if(total_usecs > 300000)
562
563 /*
564 * aligner_sw_driver.cpp
565 *
566 * Routines that drive the alignment process given a collection of seed hits.
567 * This is generally done in a few stages: extendSeeds visits the set of
568 * seed-hit BW elements in some order; for each element visited it resolves its
569 * reference offset; once the reference offset is known, bounds for a dynamic
570 * programming subproblem are established; if these bounds are distinct from
571 * the bounds we've already tried, we solve the dynamic programming subproblem
572 * and report the hit; if the AlnSinkWrap indicates that we can stop, we
573 * return, otherwise we continue on to the next BW element.
574 */
575
576
577 /**
578 * Given end-to-end alignment results stored in the SeedResults structure, set
579 * up all of our state for resolving and keeping track of reference offsets for
580 * hits. Order the list of ranges to examine such that all exact end-to-end
581 * alignments are examined before any 1mm end-to-end alignments.
582 *
583 * Note: there might be a lot of hits and a lot of wide ranges to look for
584 * here. We use 'maxelt'.
585 */
586 template <typename index_t>
eeSaTups(const Read & rd,SeedResults<index_t> & sh,const GFM<index_t> & gfm,const BitPairReference & ref,RandomSource & rnd,WalkMetrics & wlm,SwMetrics & swmSeed,index_t & nelt_out,index_t maxelt,bool all)587 bool SwDriver<index_t>::eeSaTups(
588 const Read& rd, // read
589 SeedResults<index_t>& sh, // seed hits to extend into full alignments
590 const GFM<index_t>& gfm, // BWT
591 const BitPairReference& ref, // Reference strings
592 RandomSource& rnd, // pseudo-random generator
593 WalkMetrics& wlm, // group walk left metrics
594 SwMetrics& swmSeed, // metrics for seed extensions
595 index_t& nelt_out, // out: # elements total
596 index_t maxelt, // max elts we'll consider
597 bool all) // report all hits?
598 {
599 assert_eq(0, nelt_out);
600 gws_.clear();
601 rands_.clear();
602 satpos_.clear();
603 eehits_.clear();
604 // First, count up the total number of satpos_, rands_, eehits_, and gws_
605 // we're going to tuse
606 index_t nobj = 0;
607 if(!sh.exactFwEEHit().empty()) nobj++;
608 if(!sh.exactRcEEHit().empty()) nobj++;
609 nobj += sh.mm1EEHits().size();
610 nobj = min(nobj, maxelt);
611 gws_.ensure(nobj);
612 rands_.ensure(nobj);
613 satpos_.ensure(nobj);
614 eehits_.ensure(nobj);
615 index_t tot = sh.exactFwEEHit().size() + sh.exactRcEEHit().size();
616 bool succ = false;
617 bool firstEe = true;
618 bool done = false;
619 if(tot > 0) {
620 bool fwFirst = true;
621 // Pick fw / rc to go first in a weighted random fashion
622 #ifdef BOWTIE_64BIT_INDEX
623 index_t rn64 = rnd.nextU64();
624 index_t rn = rn64 % (uint64_t)tot;
625 #else
626 index_t rn32 = rnd.nextU32();
627 index_t rn = rn32 % (uint32_t)tot;
628 #endif
629 if(rn >= sh.exactFwEEHit().size()) {
630 fwFirst = false;
631 }
632 for(int fwi = 0; fwi < 2 && !done; fwi++) {
633 bool fw = ((fwi == 0) == fwFirst);
634 EEHit<index_t> hit = fw ? sh.exactFwEEHit() : sh.exactRcEEHit();
635 if(hit.empty()) {
636 continue;
637 }
638 assert(hit.fw == fw);
639 if(hit.bot > hit.top) {
640 // Possibly adjust bot and width if we would have exceeded maxelt
641 index_t tops[2] = { hit.top, 0 };
642 index_t bots[2] = { hit.bot, 0 };
643 index_t width = hit.bot - hit.top;
644 if(nelt_out + width > maxelt) {
645 index_t trim = (index_t)((nelt_out + width) - maxelt);
646 #ifdef BOWTIE_64BIT_INDEX
647 index_t rn = rnd.nextU64() % width;
648 #else
649 index_t rn = rnd.nextU32() % width;
650 #endif
651 index_t newwidth = width - trim;
652 if(hit.top + rn + newwidth > hit.bot) {
653 // Two pieces
654 tops[0] = hit.top + rn;
655 bots[0] = hit.bot;
656 tops[1] = hit.top;
657 bots[1] = hit.top + newwidth - (bots[0] - tops[0]);
658 } else {
659 // One piece
660 tops[0] = hit.top + rn;
661 bots[0] = tops[0] + newwidth;
662 }
663 assert_leq(bots[0], hit.bot);
664 assert_leq(bots[1], hit.bot);
665 assert_geq(bots[0], tops[0]);
666 assert_geq(bots[1], tops[1]);
667 assert_eq(newwidth, (bots[0] - tops[0]) + (bots[1] - tops[1]));
668 }
669 for(int i = 0; i < 2 && !done; i++) {
670 if(bots[i] <= tops[i]) break;
671 index_t width = bots[i] - tops[i];
672 index_t top = tops[i];
673 // Clear list where resolved offsets are stored
674 swmSeed.exranges++;
675 swmSeed.exrows += width;
676 if(!succ) {
677 swmSeed.exsucc++;
678 succ = true;
679 }
680 if(firstEe) {
681 salistEe_.clear();
682 pool_.clear();
683 firstEe = false;
684 }
685 // We have to be careful not to allocate excessive amounts of memory here
686 TSlice o(salistEe_, (index_t)salistEe_.size(), width);
687 for(index_t i = 0; i < width; i++) {
688 if(!salistEe_.add(pool_, (index_t)OFF_MASK)) {
689 swmSeed.exooms++;
690 return false;
691 }
692 }
693 assert(!done);
694 eehits_.push_back(hit);
695 satpos_.expand();
696 satpos_.back().sat.init(SAKey(), top, (index_t)OFF_MASK, o);
697 satpos_.back().sat.key.seq = MAX_U64;
698 satpos_.back().sat.key.len = (index_t)rd.length();
699 satpos_.back().pos.init(fw, 0, 0, (index_t)rd.length());
700 satpos_.back().origSz = width;
701 rands_.expand();
702 rands_.back().init(width, all);
703 gws_.expand();
704 SARangeWithOffs<TSlice, index_t> sa;
705 sa.topf = satpos_.back().sat.topf;
706 sa.len = satpos_.back().sat.key.len;
707 sa.offs = satpos_.back().sat.offs;
708 gws_.back().init(
709 gfm, // forward Bowtie index
710 ref, // reference sequences
711 sa, // SATuple
712 rnd, // pseudo-random generator
713 wlm); // metrics
714 assert(gws_.back().repOk(sa));
715 nelt_out += width;
716 if(nelt_out >= maxelt) {
717 done = true;
718 }
719 }
720 }
721 }
722 }
723 succ = false;
724 if(!done && !sh.mm1EEHits().empty()) {
725 sh.sort1mmEe(rnd);
726 index_t sz = sh.mm1EEHits().size();
727 for(index_t i = 0; i < sz && !done; i++) {
728 EEHit<index_t> hit = sh.mm1EEHits()[i];
729 assert(hit.repOk(rd));
730 assert(!hit.empty());
731 // Possibly adjust bot and width if we would have exceeded maxelt
732 index_t tops[2] = { hit.top, 0 };
733 index_t bots[2] = { hit.bot, 0 };
734 index_t width = hit.bot - hit.top;
735 if(nelt_out + width > maxelt) {
736 index_t trim = (index_t)((nelt_out + width) - maxelt);
737 #ifdef BOWTIE_64BIT_INDEX
738 index_t rn = rnd.nextU64() % width;
739 #else
740 index_t rn = rnd.nextU32() % width;
741 #endif
742 index_t newwidth = width - trim;
743 if(hit.top + rn + newwidth > hit.bot) {
744 // Two pieces
745 tops[0] = hit.top + rn;
746 bots[0] = hit.bot;
747 tops[1] = hit.top;
748 bots[1] = hit.top + newwidth - (bots[0] - tops[0]);
749 } else {
750 // One piece
751 tops[0] = hit.top + rn;
752 bots[0] = tops[0] + newwidth;
753 }
754 assert_leq(bots[0], hit.bot);
755 assert_leq(bots[1], hit.bot);
756 assert_geq(bots[0], tops[0]);
757 assert_geq(bots[1], tops[1]);
758 assert_eq(newwidth, (bots[0] - tops[0]) + (bots[1] - tops[1]));
759 }
760 for(int i = 0; i < 2 && !done; i++) {
761 if(bots[i] <= tops[i]) break;
762 index_t width = bots[i] - tops[i];
763 index_t top = tops[i];
764 // Clear list where resolved offsets are stored
765 swmSeed.mm1ranges++;
766 swmSeed.mm1rows += width;
767 if(!succ) {
768 swmSeed.mm1succ++;
769 succ = true;
770 }
771 if(firstEe) {
772 salistEe_.clear();
773 pool_.clear();
774 firstEe = false;
775 }
776 TSlice o(salistEe_, (index_t)salistEe_.size(), width);
777 for(size_t i = 0; i < width; i++) {
778 if(!salistEe_.add(pool_, (index_t)OFF_MASK)) {
779 swmSeed.mm1ooms++;
780 return false;
781 }
782 }
783 eehits_.push_back(hit);
784 satpos_.expand();
785 satpos_.back().sat.init(SAKey(), top, (index_t)OFF_MASK, o);
786 satpos_.back().sat.key.seq = MAX_U64;
787 satpos_.back().sat.key.len = (index_t)rd.length();
788 satpos_.back().pos.init(hit.fw, 0, 0, (index_t)rd.length());
789 satpos_.back().origSz = width;
790 rands_.expand();
791 rands_.back().init(width, all);
792 gws_.expand();
793 SARangeWithOffs<TSlice, index_t> sa;
794 sa.topf = satpos_.back().sat.topf;
795 sa.len = satpos_.back().sat.key.len;
796 sa.offs = satpos_.back().sat.offs;
797 gws_.back().init(
798 gfm, // forward Bowtie index
799 ref, // reference sequences
800 sa, // SATuple
801 rnd, // pseudo-random generator
802 wlm); // metrics
803 assert(gws_.back().repOk(sa));
804 nelt_out += width;
805 if(nelt_out >= maxelt) {
806 done = true;
807 }
808 }
809 }
810 }
811 return true;
812 }
813
814 /**
815 * Extend a seed hit out on either side. Requires that we know the seed hit's
816 * offset into the read and orientation. Also requires that we know top/bot
817 * for the seed hit in both the forward and (if we want to extend to the right)
818 * reverse index.
819 */
820 template <typename index_t>
extend(const Read & rd,const GFM<index_t> & gfmFw,const GFM<index_t> * gfmBw,index_t topf,index_t botf,index_t topb,index_t botb,bool fw,index_t off,index_t len,PerReadMetrics & prm,index_t & nlex,index_t & nrex)821 void SwDriver<index_t>::extend(
822 const Read& rd, // read
823 const GFM<index_t>& gfmFw, // Forward Bowtie index
824 const GFM<index_t>* gfmBw, // Backward Bowtie index
825 index_t topf, // top in fw index
826 index_t botf, // bot in fw index
827 index_t topb, // top in bw index
828 index_t botb, // bot in bw index
829 bool fw, // seed orientation
830 index_t off, // seed offset from 5' end
831 index_t len, // seed length
832 PerReadMetrics& prm, // per-read metrics
833 index_t& nlex, // # positions we can extend to left w/o edit
834 index_t& nrex) // # positions we can extend to right w/o edit
835 {
836 index_t t[4], b[4];
837 index_t tp[4], bp[4];
838 SideLocus<index_t> tloc, bloc;
839 index_t rdlen = (index_t)rd.length();
840 index_t lim = fw ? off : rdlen - len - off;
841 // We're about to add onto the beginning, so reverse it
842 #ifndef NDEBUG
843 if(false) {
844 // TODO: This will sometimes fail even when the extension is legitimate
845 // This is because contains() comes in from one extreme or the other,
846 // whereas we started from the inside and worked outwards. This
847 // affects which Ns are OK and which are not OK.
848
849 // Have to do both because whether we can get through an N depends on
850 // which direction we're coming in
851 bool fwContains = gfmFw.contains(tmp_rdseq_);
852 tmp_rdseq_.reverse();
853 bool bwContains = gfmBw != NULL && gfmBw->contains(tmp_rdseq_);
854 tmp_rdseq_.reverse();
855 assert(fwContains || bwContains);
856 }
857 #endif
858 ASSERT_ONLY(tmp_rdseq_.reverse());
859 if(lim > 0) {
860 const GFM<index_t> *gfm = &gfmFw;
861 assert(gfm != NULL);
862 // Extend left using forward index
863 const BTDnaString& seq = fw ? rd.patFw : rd.patRc;
864 // See what we get by extending
865 index_t top = topf, bot = botf;
866 t[0] = t[1] = t[2] = t[3] = 0;
867 b[0] = b[1] = b[2] = b[3] = 0;
868 tp[0] = tp[1] = tp[2] = tp[3] = topb;
869 bp[0] = bp[1] = bp[2] = bp[3] = botb;
870 SideLocus<index_t> tloc, bloc;
871 INIT_LOCS(top, bot, tloc, bloc, *gfm);
872 for(index_t ii = 0; ii < lim; ii++) {
873 // Starting to left of seed (<off) and moving left
874 index_t i = 0;
875 if(fw) {
876 i = off - ii - 1;
877 } else {
878 i = rdlen - off - len - 1 - ii;
879 }
880 // Get char from read
881 int rdc = seq.get(i);
882 // See what we get by extending
883 if(bloc.valid()) {
884 prm.nSdFmops++;
885 t[0] = t[1] = t[2] = t[3] =
886 b[0] = b[1] = b[2] = b[3] = 0;
887 gfm->mapBiLFEx(tloc, bloc, t, b, tp, bp);
888 SANITY_CHECK_4TUP(t, b, tp, bp);
889 int nonz = -1;
890 bool abort = false;
891 size_t origSz = bot - top;
892 for(int j = 0; j < 4; j++) {
893 if(b[j] > t[j]) {
894 if(nonz >= 0) {
895 abort = true;
896 break;
897 }
898 nonz = j;
899 top = t[j]; bot = b[j];
900 }
901 }
902 assert_leq(bot - top, origSz);
903 if(abort || (nonz != rdc && rdc <= 3) || bot - top < origSz) {
904 break;
905 }
906 } else {
907 assert_eq(bot, top+1);
908 prm.nSdFmops++;
909 int c = gfm->mapLF1(top, tloc);
910 if(c != rdc && rdc <= 3) {
911 break;
912 }
913 bot = top + 1;
914 }
915 ASSERT_ONLY(tmp_rdseq_.append(rdc));
916 if(++nlex == 255) {
917 break;
918 }
919 INIT_LOCS(top, bot, tloc, bloc, *gfm);
920 }
921 }
922 // We're about to add onto the end, so re-reverse
923 ASSERT_ONLY(tmp_rdseq_.reverse());
924 lim = fw ? rdlen - len - off : off;
925 if(lim > 0 && gfmBw != NULL) {
926 const GFM<index_t> *gfm = gfmBw;
927 assert(gfm != NULL);
928 // Extend right using backward index
929 const BTDnaString& seq = fw ? rd.patFw : rd.patRc;
930 // See what we get by extending
931 index_t top = topb, bot = botb;
932 t[0] = t[1] = t[2] = t[3] = 0;
933 b[0] = b[1] = b[2] = b[3] = 0;
934 tp[0] = tp[1] = tp[2] = tp[3] = topf;
935 bp[0] = bp[1] = bp[2] = bp[3] = botf;
936 INIT_LOCS(top, bot, tloc, bloc, *gfm);
937 for(index_t ii = 0; ii < lim; ii++) {
938 // Starting to right of seed (<off) and moving right
939 index_t i;
940 if(fw) {
941 i = ii + len + off;
942 } else {
943 i = rdlen - off + ii;
944 }
945 // Get char from read
946 int rdc = seq.get(i);
947 // See what we get by extending
948 if(bloc.valid()) {
949 prm.nSdFmops++;
950 t[0] = t[1] = t[2] = t[3] =
951 b[0] = b[1] = b[2] = b[3] = 0;
952 gfm->mapBiLFEx(tloc, bloc, t, b, tp, bp);
953 SANITY_CHECK_4TUP(t, b, tp, bp);
954 int nonz = -1;
955 bool abort = false;
956 size_t origSz = bot - top;
957 for(int j = 0; j < 4; j++) {
958 if(b[j] > t[j]) {
959 if(nonz >= 0) {
960 abort = true;
961 break;
962 }
963 nonz = j;
964 top = t[j]; bot = b[j];
965 }
966 }
967 assert_leq(bot - top, origSz);
968 if(abort || (nonz != rdc && rdc <= 3) || bot - top < origSz) {
969 break;
970 }
971 } else {
972 assert_eq(bot, top+1);
973 prm.nSdFmops++;
974 int c = gfm->mapLF1(top, tloc);
975 if(c != rdc && rdc <= 3) {
976 break;
977 }
978 bot = top + 1;
979 }
980 ASSERT_ONLY(tmp_rdseq_.append(rdc));
981 if(++nrex == 255) {
982 break;
983 }
984 INIT_LOCS(top, bot, tloc, bloc, *gfm);
985 }
986 }
987 #ifndef NDEBUG
988 if(false) {
989 // TODO: This will sometimes fail even when the extension is legitimate
990 // This is because contains() comes in from one extreme or the other,
991 // whereas we started from the inside and worked outwards. This
992 // affects which Ns are OK and which are not OK.
993
994 // Have to do both because whether we can get through an N depends on
995 // which direction we're coming in
996 bool fwContains = gfmFw.contains(tmp_rdseq_);
997 tmp_rdseq_.reverse();
998 bool bwContains = gfmBw != NULL && gfmBw->contains(tmp_rdseq_);
999 tmp_rdseq_.reverse();
1000 assert(fwContains || bwContains);
1001 }
1002 #endif
1003 assert_lt(nlex, rdlen);
1004 assert_lt(nrex, rdlen);
1005 return;
1006 }
1007
1008 /**
1009 * Given seed results, set up all of our state for resolving and keeping
1010 * track of reference offsets for hits.
1011 */
1012 template <typename index_t>
prioritizeSATups(const Read & read,SeedResults<index_t> & sh,const GFM<index_t> & gfmFw,const GFM<index_t> * gfmBw,const BitPairReference & ref,int seedmms,index_t maxelt,bool doExtend,bool lensq,bool szsq,index_t nsm,AlignmentCacheIface<index_t> & ca,RandomSource & rnd,WalkMetrics & wlm,PerReadMetrics & prm,index_t & nelt_out,bool all)1013 void SwDriver<index_t>::prioritizeSATups(
1014 const Read& read, // read
1015 SeedResults<index_t>& sh, // seed hits to extend into full alignments
1016 const GFM<index_t>& gfmFw, // BWT
1017 const GFM<index_t>* gfmBw, // BWT
1018 const BitPairReference& ref, // Reference strings
1019 int seedmms, // # mismatches allowed in seed
1020 index_t maxelt, // max elts we'll consider
1021 bool doExtend, // do extension of seed hits?
1022 bool lensq, // square length in weight calculation
1023 bool szsq, // square range size in weight calculation
1024 index_t nsm, // if range as <= nsm elts, it's "small"
1025 AlignmentCacheIface<index_t>& ca, // alignment cache for seed hits
1026 RandomSource& rnd, // pseudo-random generator
1027 WalkMetrics& wlm, // group walk left metrics
1028 PerReadMetrics& prm, // per-read metrics
1029 index_t& nelt_out, // out: # elements total
1030 bool all) // report all hits?
1031 {
1032 const index_t nonz = sh.nonzeroOffsets(); // non-zero positions
1033 const int matei = (read.mate <= 1 ? 0 : 1);
1034 satups_.clear();
1035 gws_.clear();
1036 rands_.clear();
1037 rands2_.clear();
1038 satpos_.clear();
1039 satpos2_.clear();
1040 index_t nrange = 0, nelt = 0, nsmall = 0, nsmall_elts = 0;
1041 bool keepWhole = false;
1042 EList<SATupleAndPos<index_t>, 16>& satpos = keepWhole ? satpos_ : satpos2_;
1043 for(index_t i = 0; i < nonz; i++) {
1044 bool fw = true;
1045 index_t offidx = 0, rdoff = 0, seedlen = 0;
1046 QVal<index_t> qv = sh.hitsByRank(i, offidx, rdoff, fw, seedlen);
1047 assert(qv.valid());
1048 assert(!qv.empty());
1049 assert(qv.repOk(ca.current()));
1050 ca.queryQval(qv, satups_, nrange, nelt);
1051 for(size_t j = 0; j < satups_.size(); j++) {
1052 const index_t sz = satups_[j].size();
1053 // Check whether this hit occurs inside the extended boundaries of
1054 // another hit we already processed for this read.
1055 if(seedmms == 0) {
1056 // See if we're covered by a previous extended seed hit
1057 EList<ExtendRange>& range =
1058 fw ? seedExRangeFw_[matei] : seedExRangeRc_[matei];
1059 bool skip = false;
1060 for(index_t k = 0; k < range.size(); k++) {
1061 index_t p5 = range[k].off;
1062 index_t len = range[k].len;
1063 if(p5 <= rdoff && p5 + len >= (rdoff + seedlen)) {
1064 if(sz <= range[k].sz) {
1065 skip = true;
1066 break;
1067 }
1068 }
1069 }
1070 if(skip) {
1071 assert_gt(nrange, 0);
1072 nrange--;
1073 assert_geq(nelt, sz);
1074 nelt -= sz;
1075 continue; // Skip this seed
1076 }
1077 }
1078 satpos.expand();
1079 satpos.back().sat = satups_[j];
1080 satpos.back().origSz = sz;
1081 satpos.back().pos.init(fw, offidx, rdoff, seedlen);
1082 if(sz <= nsm) {
1083 nsmall++;
1084 nsmall_elts += sz;
1085 }
1086 satpos.back().nlex = satpos.back().nrex = 0;
1087 #ifndef NDEBUG
1088 tmp_rdseq_.clear();
1089 uint64_t key = satpos.back().sat.key.seq;
1090 for(size_t k = 0; k < seedlen; k++) {
1091 int c = (int)(key & 3);
1092 tmp_rdseq_.append(c);
1093 key >>= 2;
1094 }
1095 tmp_rdseq_.reverse();
1096 #endif
1097 index_t nlex = 0, nrex = 0;
1098 if(doExtend) {
1099 extend(
1100 read,
1101 gfmFw,
1102 gfmBw,
1103 satpos.back().sat.topf,
1104 (index_t)(satpos.back().sat.topf + sz),
1105 satpos.back().sat.topb,
1106 (index_t)(satpos.back().sat.topb + sz),
1107 fw,
1108 rdoff,
1109 seedlen,
1110 prm,
1111 nlex,
1112 nrex);
1113 }
1114 satpos.back().nlex = nlex;
1115 satpos.back().nrex = nrex;
1116 if(seedmms == 0 && (nlex > 0 || nrex > 0)) {
1117 assert_geq(rdoff, (fw ? nlex : nrex));
1118 index_t p5 = rdoff - (fw ? nlex : nrex);
1119 EList<ExtendRange>& range =
1120 fw ? seedExRangeFw_[matei] : seedExRangeRc_[matei];
1121 range.expand();
1122 range.back().off = p5;
1123 range.back().len = seedlen + nlex + nrex;
1124 range.back().sz = sz;
1125 }
1126 }
1127 satups_.clear();
1128 }
1129 assert_leq(nsmall, nrange);
1130 nelt_out = nelt; // return the total number of elements
1131 assert_eq(nrange, satpos.size());
1132 satpos.sort();
1133 if(keepWhole) {
1134 gws_.ensure(nrange);
1135 rands_.ensure(nrange);
1136 for(index_t i = 0; i < nrange; i++) {
1137 gws_.expand();
1138 SARangeWithOffs<TSlice, index_t> sa;
1139 sa.topf = satpos_.back().sat.topf;
1140 sa.len = satpos_.back().sat.key.len;
1141 sa.offs = satpos_.back().sat.offs;
1142 gws_.back().init(
1143 gfmFw, // forward Bowtie index
1144 ref, // reference sequences
1145 sa, // SA tuples: ref hit, salist range
1146 rnd, // pseudo-random generator
1147 wlm); // metrics
1148 assert(gws_.back().initialized());
1149 rands_.expand();
1150 rands_.back().init(satpos_[i].sat.size(), all);
1151 }
1152 return;
1153 }
1154 // Resize satups_ list so that ranges having elements that we might
1155 // possibly explore are present
1156 satpos_.ensure(min(maxelt, nelt));
1157 gws_.ensure(min(maxelt, nelt));
1158 rands_.ensure(min(maxelt, nelt));
1159 rands2_.ensure(min(maxelt, nelt));
1160 size_t nlarge_elts = nelt - nsmall_elts;
1161 if(maxelt < nelt) {
1162 size_t diff = nelt - maxelt;
1163 if(diff >= nlarge_elts) {
1164 nlarge_elts = 0;
1165 } else {
1166 nlarge_elts -= diff;
1167 }
1168 }
1169 index_t nelt_added = 0;
1170 // Now we have a collection of ranges in satpos2_. Now we want to decide
1171 // how we explore elements from them. The basic idea is that: for very
1172 // small guys, where "very small" means that the size of the range is less
1173 // than or equal to the parameter 'nsz', we explore them in their entirety
1174 // right away. For the rest, we want to select in a way that is (a)
1175 // random, and (b) weighted toward examining elements from the smaller
1176 // ranges more frequently (and first).
1177 //
1178 // 1. do the smalls
1179 for(index_t j = 0; j < nsmall && nelt_added < maxelt; j++) {
1180 satpos_.expand();
1181 satpos_.back() = satpos2_[j];
1182 gws_.expand();
1183 SARangeWithOffs<TSlice, index_t> sa;
1184 sa.topf = satpos_.back().sat.topf;
1185 sa.len = satpos_.back().sat.key.len;
1186 sa.offs = satpos_.back().sat.offs;
1187 gws_.back().init(
1188 gfmFw, // forward Bowtie index
1189 ref, // reference sequences
1190 sa, // SA tuples: ref hit, salist range
1191 rnd, // pseudo-random generator
1192 wlm); // metrics
1193 assert(gws_.back().initialized());
1194 rands_.expand();
1195 rands_.back().init(satpos_.back().sat.size(), all);
1196 nelt_added += satpos_.back().sat.size();
1197 #ifndef NDEBUG
1198 for(size_t k = 0; k < satpos_.size()-1; k++) {
1199 assert(!(satpos_[k] == satpos_.back()));
1200 }
1201 #endif
1202 }
1203 if(nelt_added >= maxelt || nsmall == satpos2_.size()) {
1204 nelt_out = nelt_added;
1205 return;
1206 }
1207 // 2. do the non-smalls
1208 // Initialize the row sampler
1209 rowsamp_.init(satpos2_, nsmall, satpos2_.size(), lensq, szsq);
1210 // Initialize the random choosers
1211 rands2_.resize(satpos2_.size());
1212 for(index_t j = 0; j < satpos2_.size(); j++) {
1213 rands2_[j].reset();
1214 }
1215 while(nelt_added < maxelt && nelt_added < nelt) {
1216 // Pick a non-small range to sample from
1217 index_t ri = rowsamp_.next(rnd) + nsmall;
1218 assert_geq(ri, nsmall);
1219 assert_lt(ri, satpos2_.size());
1220 // Initialize random element chooser for that range
1221 if(!rands2_[ri].inited()) {
1222 rands2_[ri].init(satpos2_[ri].sat.size(), all);
1223 assert(!rands2_[ri].done());
1224 }
1225 assert(!rands2_[ri].done());
1226 // Choose an element from the range
1227 uint32_t r = rands2_[ri].next(rnd);
1228 if(rands2_[ri].done()) {
1229 // Tell the row sampler this range is done
1230 rowsamp_.finishedRange(ri - nsmall);
1231 }
1232 // Add the element to the satpos_ list
1233 SATuple<index_t> sat;
1234 TSlice o;
1235 o.init(satpos2_[ri].sat.offs, r, r+1);
1236 sat.init(satpos2_[ri].sat.key, (index_t)(satpos2_[ri].sat.topf + r), (index_t)OFF_MASK, o);
1237 satpos_.expand();
1238 satpos_.back().sat = sat;
1239 satpos_.back().origSz = satpos2_[ri].origSz;
1240 satpos_.back().pos = satpos2_[ri].pos;
1241 // Initialize GroupWalk object
1242 gws_.expand();
1243 SARangeWithOffs<TSlice, index_t> sa;
1244 sa.topf = sat.topf;
1245 sa.len = sat.key.len;
1246 sa.offs = sat.offs;
1247 gws_.back().init(
1248 gfmFw, // forward Bowtie index
1249 ref, // reference sequences
1250 sa, // SA tuples: ref hit, salist range
1251 rnd, // pseudo-random generator
1252 wlm); // metrics
1253 assert(gws_.back().initialized());
1254 // Initialize random selector
1255 rands_.expand();
1256 rands_.back().init(1, all);
1257 nelt_added++;
1258 }
1259 nelt_out = nelt_added;
1260 return;
1261 }
1262
1263 enum {
1264 FOUND_NONE = 0,
1265 FOUND_EE,
1266 FOUND_UNGAPPED,
1267 };
1268
1269 /**
1270 * Given a collection of SeedHits for a single read, extend seed alignments
1271 * into full alignments. Where possible, try to avoid redundant offset lookups
1272 * and dynamic programming wherever possible. Optionally report alignments to
1273 * a AlnSinkWrap object as they are discovered.
1274 *
1275 * If 'reportImmediately' is true, returns true iff a call to msink->report()
1276 * returned true (indicating that the reporting policy is satisfied and we can
1277 * stop). Otherwise, returns false.
1278 */
1279 template <typename index_t>
extendSeeds(Read & rd,bool mate1,SeedResults<index_t> & sh,const GFM<index_t> & gfmFw,const GFM<index_t> * gfmBw,const BitPairReference & ref,SwAligner & swa,const Scoring & sc,int seedmms,int seedlen,int seedival,TAlScore & minsc,int nceil,size_t maxhalf,bool doUngapped,size_t maxIters,size_t maxUg,size_t maxDp,size_t maxUgStreak,size_t maxDpStreak,bool doExtend,bool enable8,size_t cminlen,size_t cpow2,bool doTri,int tighten,AlignmentCacheIface<index_t> & ca,RandomSource & rnd,WalkMetrics & wlm,SwMetrics & swmSeed,PerReadMetrics & prm,AlnSinkWrap<index_t> * msink,bool reportImmediately,bool & exhaustive)1280 int SwDriver<index_t>::extendSeeds(
1281 Read& rd, // read to align
1282 bool mate1, // true iff rd is mate #1
1283 SeedResults<index_t>& sh, // seed hits to extend into full alignments
1284 const GFM<index_t>& gfmFw, // BWT
1285 const GFM<index_t>* gfmBw, // BWT'
1286 const BitPairReference& ref, // Reference strings
1287 SwAligner& swa, // dynamic programming aligner
1288 const Scoring& sc, // scoring scheme
1289 int seedmms, // # mismatches allowed in seed
1290 int seedlen, // length of seed
1291 int seedival, // interval between seeds
1292 TAlScore& minsc, // minimum score for anchor
1293 int nceil, // maximum # Ns permitted in reference portion
1294 size_t maxhalf, // max width in either direction for DP tables
1295 bool doUngapped, // do ungapped alignment
1296 size_t maxIters, // stop after this many seed-extend loop iters
1297 size_t maxUg, // stop after this many ungaps
1298 size_t maxDp, // stop after this many dps
1299 size_t maxUgStreak, // stop after streak of this many ungap fails
1300 size_t maxDpStreak, // stop after streak of this many dp fails
1301 bool doExtend, // do seed extension
1302 bool enable8, // use 8-bit SSE where possible
1303 size_t cminlen, // use checkpointer if read longer than this
1304 size_t cpow2, // interval between diagonals to checkpoint
1305 bool doTri, // triangular mini-fills?
1306 int tighten, // -M score tightening mode
1307 AlignmentCacheIface<index_t>& ca, // alignment cache for seed hits
1308 RandomSource& rnd, // pseudo-random source
1309 WalkMetrics& wlm, // group walk left metrics
1310 SwMetrics& swmSeed, // DP metrics for seed-extend
1311 PerReadMetrics& prm, // per-read metrics
1312 AlnSinkWrap<index_t>* msink, // AlnSink wrapper for multiseed-style aligner
1313 bool reportImmediately, // whether to report hits immediately to msink
1314 bool& exhaustive) // set to true iff we searched all seeds exhaustively
1315 {
1316 bool all = msink->allHits();
1317 // typedef std::pair<index_t, index_t> UPair;
1318
1319 assert(!reportImmediately || msink != NULL);
1320 assert(!reportImmediately || !msink->maxed());
1321
1322 assert_geq(nceil, 0);
1323 assert_leq((size_t)nceil, rd.length());
1324
1325 // Calculate the largest possible number of read and reference gaps
1326 const index_t rdlen = (index_t)rd.length();
1327 TAlScore perfectScore = sc.perfectScore(rdlen);
1328
1329 DynProgFramer dpframe(!gReportOverhangs);
1330 swa.reset();
1331
1332 // Initialize a set of GroupWalks, one for each seed. Also, intialize the
1333 // accompanying lists of reference seed hits (satups*)
1334 const index_t nsm = 5;
1335 const index_t nonz = sh.nonzeroOffsets(); // non-zero positions
1336 index_t eeHits = sh.numE2eHits();
1337 bool eeMode = eeHits > 0;
1338 bool firstEe = true;
1339 bool firstExtend = true;
1340
1341 // Reset all the counters related to streaks
1342 prm.nEeFail = 0;
1343 prm.nUgFail = 0;
1344 prm.nDpFail = 0;
1345
1346 index_t nelt = 0, neltLeft = 0;
1347 index_t rows = rdlen;
1348 index_t eltsDone = 0;
1349 // cerr << "===" << endl;
1350 while(true) {
1351 if(eeMode) {
1352 if(firstEe) {
1353 firstEe = false;
1354 eeMode = eeSaTups(
1355 rd, // read
1356 sh, // seed hits to extend into full alignments
1357 gfmFw, // BWT
1358 ref, // Reference strings
1359 rnd, // pseudo-random generator
1360 wlm, // group walk left metrics
1361 swmSeed, // seed-extend metrics
1362 nelt, // out: # elements total
1363 maxIters, // max # to report
1364 all); // report all hits?
1365 assert_eq(gws_.size(), rands_.size());
1366 assert_eq(gws_.size(), satpos_.size());
1367 } else {
1368 eeMode = false;
1369 }
1370 }
1371 if(!eeMode) {
1372 if(nonz == 0) {
1373 return EXTEND_EXHAUSTED_CANDIDATES; // No seed hits! Bail.
1374 }
1375 if(minsc == perfectScore) {
1376 return EXTEND_PERFECT_SCORE; // Already found all perfect hits!
1377 }
1378 if(firstExtend) {
1379 nelt = 0;
1380 prioritizeSATups(
1381 rd, // read
1382 sh, // seed hits to extend into full alignments
1383 gfmFw, // BWT
1384 gfmBw, // BWT'
1385 ref, // Reference strings
1386 seedmms, // # seed mismatches allowed
1387 maxIters, // max rows to consider per position
1388 doExtend, // extend out seeds
1389 true, // square extended length
1390 true, // square SA range size
1391 nsm, // smallness threshold
1392 ca, // alignment cache for seed hits
1393 rnd, // pseudo-random generator
1394 wlm, // group walk left metrics
1395 prm, // per-read metrics
1396 nelt, // out: # elements total
1397 all); // report all hits?
1398 assert_eq(gws_.size(), rands_.size());
1399 assert_eq(gws_.size(), satpos_.size());
1400 neltLeft = nelt;
1401 firstExtend = false;
1402 }
1403 if(neltLeft == 0) {
1404 // Finished examining gapped candidates
1405 break;
1406 }
1407 }
1408 for(size_t i = 0; i < gws_.size(); i++) {
1409 if(eeMode && eehits_[i].score < minsc) {
1410 return EXTEND_PERFECT_SCORE;
1411 }
1412 bool is_small = satpos_[i].sat.size() < nsm;
1413 bool fw = satpos_[i].pos.fw;
1414 index_t rdoff = satpos_[i].pos.rdoff;
1415 index_t seedhitlen = satpos_[i].pos.seedlen;
1416 if(!fw) {
1417 // 'rdoff' and 'offidx' are with respect to the 5' end of
1418 // the read. Here we convert rdoff to be with respect to
1419 // the upstream (3') end of ther read.
1420 rdoff = (index_t)(rdlen - rdoff - seedhitlen);
1421 }
1422 bool first = true;
1423 // If the range is small, investigate all elements now. If the
1424 // range is large, just investigate one and move on - we might come
1425 // back to this range later.
1426 index_t riter = 0;
1427 while(!rands_[i].done() && (first || is_small || eeMode)) {
1428 assert(!gws_[i].done());
1429 riter++;
1430 if(minsc == perfectScore) {
1431 if(!eeMode || eehits_[i].score < perfectScore) {
1432 return EXTEND_PERFECT_SCORE;
1433 }
1434 } else if(eeMode && eehits_[i].score < minsc) {
1435 break;
1436 }
1437 if(prm.nExDps >= maxDp || prm.nMateDps >= maxDp) {
1438 return EXTEND_EXCEEDED_HARD_LIMIT;
1439 }
1440 if(prm.nExUgs >= maxUg || prm.nMateUgs >= maxUg) {
1441 return EXTEND_EXCEEDED_HARD_LIMIT;
1442 }
1443 if(prm.nExIters >= maxIters) {
1444 return EXTEND_EXCEEDED_HARD_LIMIT;
1445 }
1446 prm.nExIters++;
1447 first = false;
1448 // Resolve next element offset
1449 WalkResult<index_t> wr;
1450 uint32_t elt = rands_[i].next(rnd);
1451 //cerr << "elt=" << elt << endl;
1452 SARangeWithOffs<TSlice, index_t> sa;
1453 sa.topf = satpos_[i].sat.topf;
1454 sa.len = satpos_[i].sat.key.len;
1455 sa.offs = satpos_[i].sat.offs;
1456 gws_[i].advanceElement((index_t)elt, gfmFw, ref, sa, gwstate_, wr, wlm, prm);
1457 eltsDone++;
1458 if(!eeMode) {
1459 assert_gt(neltLeft, 0);
1460 neltLeft--;
1461 }
1462 assert_neq((index_t)OFF_MASK, wr.toff);
1463 index_t tidx = 0, toff = 0, tlen = 0;
1464 bool straddled = false;
1465 gfmFw.joinedToTextOff(
1466 wr.elt.len,
1467 wr.toff,
1468 tidx,
1469 toff,
1470 tlen,
1471 eeMode, // reject straddlers?
1472 straddled); // did it straddle?
1473 if(tidx == (index_t)OFF_MASK) {
1474 // The seed hit straddled a reference boundary so the seed hit
1475 // isn't valid
1476 continue;
1477 }
1478 #ifndef NDEBUG
1479 if(!eeMode && !straddled) { // Check that seed hit matches reference
1480 uint64_t key = satpos_[i].sat.key.seq;
1481 for(index_t k = 0; k < wr.elt.len; k++) {
1482 int c = ref.getBase(tidx, toff + wr.elt.len - k - 1);
1483 assert_leq(c, 3);
1484 int ck = (int)(key & 3);
1485 key >>= 2;
1486 assert_eq(c, ck);
1487 }
1488 }
1489 #endif
1490 // Find offset of alignment's upstream base assuming net gaps=0
1491 // between beginning of read and beginning of seed hit
1492 int64_t refoff = (int64_t)toff - rdoff;
1493 // Coordinate of the seed hit w/r/t the pasted reference string
1494 Coord refcoord(tidx, refoff, fw);
1495 if(seenDiags1_.locusPresent(refcoord)) {
1496 // Already handled alignments seeded on this diagonal
1497 prm.nRedundants++;
1498 swmSeed.rshit++;
1499 continue;
1500 }
1501 // Now that we have a seed hit, there are many issues to solve
1502 // before we have a completely framed dynamic programming problem.
1503 // They include:
1504 //
1505 // 1. Setting reference offsets on either side of the seed hit,
1506 // accounting for where the seed occurs in the read
1507 // 2. Adjusting the width of the banded dynamic programming problem
1508 // and adjusting reference bounds to allow for gaps in the
1509 // alignment
1510 // 3. Accounting for the edges of the reference, which can impact
1511 // the width of the DP problem and reference bounds.
1512 // 4. Perhaps filtering the problem down to a smaller problem based
1513 // on what DPs we've already solved for this read
1514 //
1515 // We do #1 here, since it is simple and we have all the seed-hit
1516 // information here. #2 and #3 are handled in the DynProgFramer.
1517 int readGaps = 0, refGaps = 0;
1518 bool ungapped = false;
1519 if(!eeMode) {
1520 readGaps = sc.maxReadGaps(minsc, rdlen);
1521 refGaps = sc.maxRefGaps(minsc, rdlen);
1522 ungapped = (readGaps == 0 && refGaps == 0);
1523 }
1524 int state = FOUND_NONE;
1525 bool found = false;
1526 if(eeMode) {
1527 resEe_.reset();
1528 resEe_.alres.reset();
1529 const EEHit<index_t>& h = eehits_[i];
1530 assert_leq(h.score, perfectScore);
1531 resEe_.alres.setScore(AlnScore(h.score, h.ns(), 0));
1532 resEe_.alres.setShape(
1533 refcoord.ref(), // ref id
1534 refcoord.off(), // 0-based ref offset
1535 tlen, // length of reference
1536 fw, // aligned to Watson?
1537 rdlen, // read length
1538 true, // pretrim soft?
1539 0, // pretrim 5' end
1540 0, // pretrim 3' end
1541 true, // alignment trim soft?
1542 0, // alignment trim 5' end
1543 0); // alignment trim 3' end
1544 resEe_.alres.setRefNs(h.refns());
1545 if(h.mms() > 0) {
1546 assert_eq(1, h.mms());
1547 assert_lt(h.e1.pos, rd.length());
1548 resEe_.alres.ned().push_back(h.e1);
1549 }
1550 assert(resEe_.repOk(rd));
1551 state = FOUND_EE;
1552 found = true;
1553 Interval refival(refcoord, 1);
1554 seenDiags1_.add(refival);
1555 } else if(doUngapped && ungapped) {
1556 resUngap_.reset();
1557 int al = swa.ungappedAlign(
1558 fw ? rd.patFw : rd.patRc,
1559 fw ? rd.qual : rd.qualRev,
1560 refcoord,
1561 ref,
1562 tlen,
1563 sc,
1564 gReportOverhangs,
1565 minsc,
1566 resUngap_);
1567 Interval refival(refcoord, 1);
1568 seenDiags1_.add(refival);
1569 prm.nExUgs++;
1570 if(al == 0) {
1571 prm.nExUgFails++;
1572 prm.nUgFail++;
1573 if(prm.nUgFail >= maxUgStreak) {
1574 return EXTEND_EXCEEDED_SOFT_LIMIT;
1575 }
1576 swmSeed.ungapfail++;
1577 continue;
1578 } else if(al == -1) {
1579 prm.nExUgFails++;
1580 prm.nUgFail++; // count this as failure
1581 if(prm.nUgFail >= maxUgStreak) {
1582 return EXTEND_EXCEEDED_SOFT_LIMIT;
1583 }
1584 swmSeed.ungapnodec++;
1585 } else {
1586 prm.nExUgSuccs++;
1587 prm.nUgLastSucc = prm.nExUgs-1;
1588 if(prm.nUgFail > prm.nUgFailStreak) {
1589 prm.nUgFailStreak = prm.nUgFail;
1590 }
1591 prm.nUgFail = 0;
1592 found = true;
1593 state = FOUND_UNGAPPED;
1594 swmSeed.ungapsucc++;
1595 }
1596 }
1597 int64_t pastedRefoff = (int64_t)wr.toff - rdoff;
1598 DPRect rect;
1599 if(state == FOUND_NONE) {
1600 found = dpframe.frameSeedExtensionRect(
1601 refoff, // ref offset implied by seed hit assuming no gaps
1602 rows, // length of read sequence used in DP table
1603 tlen, // length of reference
1604 readGaps, // max # of read gaps permitted in opp mate alignment
1605 refGaps, // max # of ref gaps permitted in opp mate alignment
1606 (size_t)nceil, // # Ns permitted
1607 maxhalf, // max width in either direction
1608 rect); // DP rectangle
1609 assert(rect.repOk());
1610 // Add the seed diagonal at least
1611 seenDiags1_.add(Interval(refcoord, 1));
1612 if(!found) {
1613 continue;
1614 }
1615 }
1616 int64_t leftShift = refoff - rect.refl;
1617 size_t nwindow = 0;
1618 if(toff >= rect.refl) {
1619 nwindow = (size_t)(toff - rect.refl);
1620 }
1621 // NOTE: We might be taking off more than we should because the
1622 // pasted string omits non-A/C/G/T characters, but we included them
1623 // when calculating leftShift. We'll account for this later.
1624 pastedRefoff -= leftShift;
1625 size_t nsInLeftShift = 0;
1626 if(state == FOUND_NONE) {
1627 if(!swa.initedRead()) {
1628 // Initialize the aligner with a new read
1629 swa.initRead(
1630 rd.patFw, // fw version of query
1631 rd.patRc, // rc version of query
1632 rd.qual, // fw version of qualities
1633 rd.qualRev,// rc version of qualities
1634 0, // off of first char in 'rd' to consider
1635 rdlen, // off of last char (excl) in 'rd' to consider
1636 sc); // scoring scheme
1637 }
1638 swa.initRef(
1639 fw, // whether to align forward or revcomp read
1640 tidx, // reference aligned against
1641 rect, // DP rectangle
1642 ref, // Reference strings
1643 tlen, // length of reference sequence
1644 sc, // scoring scheme
1645 minsc, // minimum score permitted
1646 enable8, // use 8-bit SSE if possible?
1647 cminlen, // minimum length for using checkpointing scheme
1648 cpow2, // interval b/t checkpointed diags; 1 << this
1649 doTri, // triangular mini-fills?
1650 true, // this is a seed extension - not finding a mate
1651 nwindow,
1652 nsInLeftShift);
1653 // Because of how we framed the problem, we can say that we've
1654 // exhaustively scored the seed diagonal as well as maxgaps
1655 // diagonals on either side
1656 Interval refival(tidx, 0, fw, 0);
1657 rect.initIval(refival);
1658 seenDiags1_.add(refival);
1659 // Now fill the dynamic programming matrix and return true iff
1660 // there is at least one valid alignment
1661 TAlScore bestCell = std::numeric_limits<TAlScore>::min();
1662 found = swa.align(rnd, bestCell);
1663 swmSeed.tallyGappedDp(readGaps, refGaps);
1664 prm.nExDps++;
1665 if(!found) {
1666 prm.nExDpFails++;
1667 prm.nDpFail++;
1668 if(prm.nDpFail >= maxDpStreak) {
1669 return EXTEND_EXCEEDED_SOFT_LIMIT;
1670 }
1671 if(bestCell > std::numeric_limits<TAlScore>::min() && bestCell > prm.bestLtMinscMate1) {
1672 prm.bestLtMinscMate1 = bestCell;
1673 }
1674 continue; // Look for more anchor alignments
1675 } else {
1676 prm.nExDpSuccs++;
1677 prm.nDpLastSucc = prm.nExDps-1;
1678 if(prm.nDpFail > prm.nDpFailStreak) {
1679 prm.nDpFailStreak = prm.nDpFail;
1680 }
1681 prm.nDpFail = 0;
1682 }
1683 }
1684 bool firstInner = true;
1685 while(true) {
1686 assert(found);
1687 SwResult *res = NULL;
1688 if(state == FOUND_EE) {
1689 if(!firstInner) {
1690 break;
1691 }
1692 res = &resEe_;
1693 } else if(state == FOUND_UNGAPPED) {
1694 if(!firstInner) {
1695 break;
1696 }
1697 res = &resUngap_;
1698 } else {
1699 resGap_.reset();
1700 assert(resGap_.empty());
1701 if(swa.done()) {
1702 break;
1703 }
1704 swa.nextAlignment(resGap_, minsc, rnd);
1705 found = !resGap_.empty();
1706 if(!found) {
1707 break;
1708 }
1709 res = &resGap_;
1710 }
1711 assert(res != NULL);
1712 firstInner = false;
1713 assert(res->alres.matchesRef(
1714 rd,
1715 ref,
1716 tmp_rf_,
1717 tmp_rdseq_,
1718 tmp_qseq_,
1719 raw_refbuf_,
1720 raw_destU32_,
1721 raw_matches_,
1722 tmp_reflens_,
1723 tmp_refoffs_));
1724 Interval refival(tidx, 0, fw, tlen);
1725 assert_gt(res->alres.refExtent(), 0);
1726 if(gReportOverhangs &&
1727 !refival.containsIgnoreOrient(res->alres.refival()))
1728 {
1729 res->alres.clipOutside(true, 0, tlen);
1730 if(res->alres.refExtent() == 0) {
1731 continue;
1732 }
1733 }
1734 assert(gReportOverhangs ||
1735 refival.containsIgnoreOrient(res->alres.refival()));
1736 // Did the alignment fall entirely outside the reference?
1737 if(!refival.overlapsIgnoreOrient(res->alres.refival())) {
1738 continue;
1739 }
1740 // Is this alignment redundant with one we've seen previously?
1741 if(redAnchor_.overlap(res->alres)) {
1742 // Redundant with an alignment we found already
1743 continue;
1744 }
1745 redAnchor_.add(res->alres);
1746 // Annotate the AlnRes object with some key parameters
1747 // that were used to obtain the alignment.
1748 res->alres.setParams(
1749 seedmms, // # mismatches allowed in seed
1750 seedlen, // length of seed
1751 seedival, // interval between seeds
1752 minsc); // minimum score for valid alignment
1753
1754 if(reportImmediately) {
1755 assert(msink != NULL);
1756 assert(res->repOk());
1757 // Check that alignment accurately reflects the
1758 // reference characters aligned to
1759 assert(res->alres.matchesRef(
1760 rd,
1761 ref,
1762 tmp_rf_,
1763 tmp_rdseq_,
1764 tmp_qseq_,
1765 raw_refbuf_,
1766 raw_destU32_,
1767 raw_matches_,
1768 tmp_reflens_,
1769 tmp_refoffs_));
1770 // Report an unpaired alignment
1771 assert(!msink->maxed());
1772 if(msink->report(
1773 0,
1774 mate1 ? &res->alres : NULL,
1775 mate1 ? NULL : &res->alres))
1776 {
1777 // Short-circuited because a limit, e.g. -k, -m or
1778 // -M, was exceeded
1779 return EXTEND_POLICY_FULFILLED;
1780 }
1781 if(tighten > 0 &&
1782 msink->Mmode() &&
1783 msink->hasSecondBestUnp1())
1784 {
1785 if(tighten == 1) {
1786 if(msink->bestUnp1() >= minsc) {
1787 minsc = msink->bestUnp1();
1788 if(minsc < perfectScore &&
1789 msink->bestUnp1() == msink->secondBestUnp1())
1790 {
1791 minsc++;
1792 }
1793 }
1794 } else if(tighten == 2) {
1795 if(msink->secondBestUnp1() >= minsc) {
1796 minsc = msink->secondBestUnp1();
1797 if(minsc < perfectScore) {
1798 minsc++;
1799 }
1800 }
1801 } else {
1802 TAlScore diff = msink->bestUnp1() - msink->secondBestUnp1();
1803 TAlScore bot = msink->secondBestUnp1() + ((diff*3)/4);
1804 if(bot >= minsc) {
1805 minsc = bot;
1806 if(minsc < perfectScore) {
1807 minsc++;
1808 }
1809 }
1810 }
1811 assert_leq(minsc, perfectScore);
1812 }
1813 }
1814 }
1815
1816 // At this point we know that we aren't bailing, and will
1817 // continue to resolve seed hits.
1818
1819 } // while(!gws_[i].done())
1820 }
1821 }
1822 // Short-circuited because a limit, e.g. -k, -m or -M, was exceeded
1823 return EXTEND_EXHAUSTED_CANDIDATES;
1824 }
1825
1826 /**
1827 * Given a collection of SeedHits for both mates in a read pair, extend seed
1828 * alignments into full alignments and then look for the opposite mate using
1829 * dynamic programming. Where possible, try to avoid redundant offset lookups.
1830 * Optionally report alignments to a AlnSinkWrap object as they are discovered.
1831 *
1832 * If 'reportImmediately' is true, returns true iff a call to
1833 * msink->report() returned true (indicating that the reporting
1834 * policy is satisfied and we can stop). Otherwise, returns false.
1835 *
1836 * REDUNDANT SEED HITS
1837 *
1838 * See notes at top of aligner_sw_driver.h.
1839 *
1840 * REDUNDANT ALIGNMENTS
1841 *
1842 * See notes at top of aligner_sw_driver.h.
1843 *
1844 * MIXING PAIRED AND UNPAIRED ALIGNMENTS
1845 *
1846 * There are distinct paired-end alignment modes for the cases where (a) the
1847 * user does or does not want to see unpaired alignments for individual mates
1848 * when there are no reportable paired-end alignments involving both mates, and
1849 * (b) the user does or does not want to see discordant paired-end alignments.
1850 * The modes have implications for this function and for the AlnSinkWrap, since
1851 * it affects when we're "done." Also, whether the user has asked us to report
1852 * discordant alignments affects whether and how much searching for unpaired
1853 * alignments we must do (i.e. if there are no paired-end alignments, we must
1854 * at least do -m 1 for both mates).
1855 *
1856 * Mode 1: Just concordant paired-end. Print only concordant paired-end
1857 * alignments. As soon as any limits (-k/-m/-M) are reached, stop.
1858 *
1859 * Mode 2: Concordant and discordant paired-end. If -k/-m/-M limits are
1860 * reached for paired-end alignments, stop. Otherwise, if no paired-end
1861 * alignments are found, align both mates in an unpaired -m 1 fashion. If
1862 * there is exactly one unpaired alignment for each mate, report the
1863 * combination as a discordant alignment.
1864 *
1865 * Mode 3: Concordant paired-end if possible, otherwise unpaired. If -k/-M
1866 * limit is reached for paired-end alignmnts, stop. If -m limit is reached for
1867 * paired-end alignments or no paired-end alignments are found, align both
1868 * mates in an unpaired fashion. All the same settings governing validity and
1869 * reportability in paired-end mode apply here too (-k/-m/-M/etc).
1870 *
1871 * Mode 4: Concordant or discordant paired-end if possible, otherwise unpaired.
1872 * If -k/-M limit is reached for paired-end alignmnts, stop. If -m limit is
1873 * reached for paired-end alignments or no paired-end alignments are found,
1874 * align both mates in an unpaired fashion. If the -m limit was reached, there
1875 * is no need to search for a discordant alignment, and unapired alignment can
1876 * proceed as in Mode 3. If no paired-end alignments were found, then unpaired
1877 * alignment proceeds as in Mode 3 but with this caveat: alignment must be at
1878 * least as thorough as dictated by -m 1 up until the point where
1879 *
1880 * Print paired-end alignments when there are reportable paired-end
1881 * alignments, otherwise report reportable unpaired alignments. If -k limit is
1882 * reached for paired-end alignments, stop. If -m/-M limit is reached for
1883 * paired-end alignments, stop searching for paired-end alignments and look
1884 * only for unpaired alignments. If searching only for unpaired alignments,
1885 * respect -k/-m/-M limits separately for both mates.
1886 *
1887 * The return value from the AlnSinkWrap's report member function must be
1888 * specific enough to distinguish between:
1889 *
1890 * 1. Stop searching for paired-end alignments
1891 * 2. Stop searching for alignments for unpaired alignments for mate #1
1892 * 3. Stop searching for alignments for unpaired alignments for mate #2
1893 * 4. Stop searching for any alignments
1894 *
1895 * Note that in Mode 2, options affecting validity and reportability of
1896 * alignments apply . E.g. if -m 1 is specified
1897 *
1898 * WORKFLOW
1899 *
1900 * Our general approach to finding paired and unpaired alignments here
1901 * is as follows:
1902 *
1903 * - For mate in mate1, mate2:
1904 * - For each seed hit in mate:
1905 * - Try to extend it into a full alignment; if we can't, continue
1906 * to the next seed hit
1907 * - Look for alignment for opposite mate; if we can't find one,
1908 * -
1909 * -
1910 *
1911 */
1912 template <typename index_t>
extendSeedsPaired(Read & rd,Read & ord,bool anchor1,bool oppFilt,SeedResults<index_t> & sh,const GFM<index_t> & gfmFw,const GFM<index_t> * gfmBw,const BitPairReference & ref,SwAligner & swa,SwAligner & oswa,const Scoring & sc,const PairedEndPolicy & pepol,int seedmms,int seedlen,int seedival,TAlScore & minsc,TAlScore & ominsc,int nceil,int onceil,bool nofw,bool norc,size_t maxhalf,bool doUngapped,size_t maxIters,size_t maxUg,size_t maxDp,size_t maxEeStreak,size_t maxUgStreak,size_t maxDpStreak,size_t maxMateStreak,bool doExtend,bool enable8,size_t cminlen,size_t cpow2,bool doTri,int tighten,AlignmentCacheIface<index_t> & ca,RandomSource & rnd,WalkMetrics & wlm,SwMetrics & swmSeed,SwMetrics & swmMate,PerReadMetrics & prm,AlnSinkWrap<index_t> * msink,bool swMateImmediately,bool reportImmediately,bool discord,bool mixed,bool & exhaustive)1913 int SwDriver<index_t>::extendSeedsPaired(
1914 Read& rd, // mate to align as anchor
1915 Read& ord, // mate to align as opposite
1916 bool anchor1, // true iff anchor mate is mate1
1917 bool oppFilt, // true iff opposite mate was filtered out
1918 SeedResults<index_t>& sh, // seed hits for anchor
1919 const GFM<index_t>& gfmFw, // BWT
1920 const GFM<index_t>* gfmBw, // BWT'
1921 const BitPairReference& ref, // Reference strings
1922 SwAligner& swa, // dynamic programming aligner for anchor
1923 SwAligner& oswa, // dynamic programming aligner for opposite
1924 const Scoring& sc, // scoring scheme
1925 const PairedEndPolicy& pepol,// paired-end policy
1926 int seedmms, // # mismatches allowed in seed
1927 int seedlen, // length of seed
1928 int seedival, // interval between seeds
1929 TAlScore& minsc, // minimum score for valid anchor aln
1930 TAlScore& ominsc, // minimum score for valid opposite aln
1931 int nceil, // max # Ns permitted in ref for anchor
1932 int onceil, // max # Ns permitted in ref for opposite
1933 bool nofw, // don't align forward read
1934 bool norc, // don't align revcomp read
1935 size_t maxhalf, // max width in either direction for DP tables
1936 bool doUngapped, // do ungapped alignment
1937 size_t maxIters, // stop after this many seed-extend loop iters
1938 size_t maxUg, // stop after this many ungaps
1939 size_t maxDp, // stop after this many dps
1940 size_t maxEeStreak, // stop after streak of this many end-to-end fails
1941 size_t maxUgStreak, // stop after streak of this many ungap fails
1942 size_t maxDpStreak, // stop after streak of this many dp fails
1943 size_t maxMateStreak, // stop seed range after N mate-find fails
1944 bool doExtend, // do seed extension
1945 bool enable8, // use 8-bit SSE where possible
1946 size_t cminlen, // use checkpointer if read longer than this
1947 size_t cpow2, // interval between diagonals to checkpoint
1948 bool doTri, // triangular mini-fills?
1949 int tighten, // -M score tightening mode
1950 AlignmentCacheIface<index_t>& ca, // alignment cache for seed hits
1951 RandomSource& rnd, // pseudo-random source
1952 WalkMetrics& wlm, // group walk left metrics
1953 SwMetrics& swmSeed, // DP metrics for seed-extend
1954 SwMetrics& swmMate, // DP metrics for mate finidng
1955 PerReadMetrics& prm, // per-read metrics
1956 AlnSinkWrap<index_t>* msink, // AlnSink wrapper for multiseed-style aligner
1957 bool swMateImmediately, // whether to look for mate immediately
1958 bool reportImmediately, // whether to report hits immediately to msink
1959 bool discord, // look for discordant alignments?
1960 bool mixed, // look for unpaired as well as paired alns?
1961 bool& exhaustive)
1962 {
1963 bool all = msink->allHits();
1964 // typedef std::pair<uint32_t, uint32_t> U32Pair;
1965
1966 assert(!reportImmediately || msink != NULL);
1967 assert(!reportImmediately || !msink->maxed());
1968 assert(!msink->state().doneWithMate(anchor1));
1969
1970 assert_geq(nceil, 0);
1971 assert_geq(onceil, 0);
1972 assert_leq((size_t)nceil, rd.length());
1973 assert_leq((size_t)onceil, ord.length());
1974
1975 const index_t rdlen = rd.length();
1976 const index_t ordlen = ord.length();
1977 const TAlScore perfectScore = sc.perfectScore(rdlen);
1978 const TAlScore operfectScore = sc.perfectScore(ordlen);
1979
1980 assert_leq(minsc, perfectScore);
1981 assert(oppFilt || ominsc <= operfectScore);
1982
1983 TAlScore bestPairScore = perfectScore + operfectScore;
1984 if(tighten > 0 && msink->Mmode() && msink->hasSecondBestPair()) {
1985 // Paired-end alignments should have at least this score from now
1986 TAlScore ps;
1987 if(tighten == 1) {
1988 ps = msink->bestPair();
1989 } else if(tighten == 2) {
1990 ps = msink->secondBestPair();
1991 } else {
1992 TAlScore diff = msink->bestPair() - msink->secondBestPair();
1993 ps = msink->secondBestPair() + (diff * 3)/4;
1994 }
1995 if(tighten == 1 && ps < bestPairScore &&
1996 msink->bestPair() == msink->secondBestPair())
1997 {
1998 ps++;
1999 }
2000 if(tighten >= 2 && ps < bestPairScore) {
2001 ps++;
2002 }
2003 // Anchor mate must have score at least 'ps' minus the best possible
2004 // score for the opposite mate.
2005 TAlScore nc = ps - operfectScore;
2006 if(nc > minsc) {
2007 minsc = nc;
2008 }
2009 assert_leq(minsc, perfectScore);
2010 }
2011
2012 DynProgFramer dpframe(!gReportOverhangs);
2013 swa.reset();
2014 oswa.reset();
2015
2016 // Initialize a set of GroupWalks, one for each seed. Also, intialize the
2017 // accompanying lists of reference seed hits (satups*)
2018 const index_t nsm = 5;
2019 const index_t nonz = sh.nonzeroOffsets(); // non-zero positions
2020 index_t eeHits = sh.numE2eHits();
2021 bool eeMode = eeHits > 0;
2022 bool firstEe = true;
2023 bool firstExtend = true;
2024
2025 // Reset all the counters related to streaks
2026 prm.nEeFail = 0;
2027 prm.nUgFail = 0;
2028 prm.nDpFail = 0;
2029
2030 index_t nelt = 0, neltLeft = 0;
2031 const index_t rows = rdlen;
2032 const index_t orows = ordlen;
2033 index_t eltsDone = 0;
2034 while(true) {
2035 if(eeMode) {
2036 if(firstEe) {
2037 firstEe = false;
2038 eeMode = eeSaTups(
2039 rd, // read
2040 sh, // seed hits to extend into full alignments
2041 gfmFw, // BWT
2042 ref, // Reference strings
2043 rnd, // pseudo-random generator
2044 wlm, // group walk left metrics
2045 swmSeed, // seed-extend metrics
2046 nelt, // out: # elements total
2047 maxIters, // max elts to report
2048 all); // report all hits
2049 assert_eq(gws_.size(), rands_.size());
2050 assert_eq(gws_.size(), satpos_.size());
2051 neltLeft = nelt;
2052 // Initialize list that contains the mate-finding failure
2053 // streak for each range
2054 mateStreaks_.resize(gws_.size());
2055 mateStreaks_.fill(0);
2056 } else {
2057 eeMode = false;
2058 }
2059 }
2060 if(!eeMode) {
2061 if(nonz == 0) {
2062 // No seed hits! Bail.
2063 return EXTEND_EXHAUSTED_CANDIDATES;
2064 }
2065 if(msink->Mmode() && minsc == perfectScore) {
2066 // Already found all perfect hits!
2067 return EXTEND_PERFECT_SCORE;
2068 }
2069 if(firstExtend) {
2070 nelt = 0;
2071 prioritizeSATups(
2072 rd, // read
2073 sh, // seed hits to extend into full alignments
2074 gfmFw, // BWT
2075 gfmBw, // BWT'
2076 ref, // Reference strings
2077 seedmms, // # seed mismatches allowed
2078 maxIters, // max rows to consider per position
2079 doExtend, // extend out seeds
2080 true, // square extended length
2081 true, // square SA range size
2082 nsm, // smallness threshold
2083 ca, // alignment cache for seed hits
2084 rnd, // pseudo-random generator
2085 wlm, // group walk left metrics
2086 prm, // per-read metrics
2087 nelt, // out: # elements total
2088 all); // report all hits?
2089 assert_eq(gws_.size(), rands_.size());
2090 assert_eq(gws_.size(), satpos_.size());
2091 neltLeft = nelt;
2092 firstExtend = false;
2093 mateStreaks_.resize(gws_.size());
2094 mateStreaks_.fill(0);
2095 }
2096 if(neltLeft == 0) {
2097 // Finished examining gapped candidates
2098 break;
2099 }
2100 }
2101 for(index_t i = 0; i < gws_.size(); i++) {
2102 if(eeMode && eehits_[i].score < minsc) {
2103 return EXTEND_PERFECT_SCORE;
2104 }
2105 bool is_small = satpos_[i].sat.size() < nsm;
2106 bool fw = satpos_[i].pos.fw;
2107 index_t rdoff = satpos_[i].pos.rdoff;
2108 index_t seedhitlen = satpos_[i].pos.seedlen;
2109 if(!fw) {
2110 // 'rdoff' and 'offidx' are with respect to the 5' end of
2111 // the read. Here we convert rdoff to be with respect to
2112 // the upstream (3') end of ther read.
2113 rdoff = (index_t)(rdlen - rdoff - seedhitlen);
2114 }
2115 bool first = true;
2116 // If the range is small, investigate all elements now. If the
2117 // range is large, just investigate one and move on - we might come
2118 // back to this range later.
2119 while(!rands_[i].done() && (first || is_small || eeMode)) {
2120 if(minsc == perfectScore) {
2121 if(!eeMode || eehits_[i].score < perfectScore) {
2122 return EXTEND_PERFECT_SCORE;
2123 }
2124 } else if(eeMode && eehits_[i].score < minsc) {
2125 break;
2126 }
2127 if(prm.nExDps >= maxDp || prm.nMateDps >= maxDp) {
2128 return EXTEND_EXCEEDED_HARD_LIMIT;
2129 }
2130 if(prm.nExUgs >= maxUg || prm.nMateUgs >= maxUg) {
2131 return EXTEND_EXCEEDED_HARD_LIMIT;
2132 }
2133 if(prm.nExIters >= maxIters) {
2134 return EXTEND_EXCEEDED_HARD_LIMIT;
2135 }
2136 if(eeMode && prm.nEeFail >= maxEeStreak) {
2137 return EXTEND_EXCEEDED_SOFT_LIMIT;
2138 }
2139 if(!eeMode && prm.nDpFail >= maxDpStreak) {
2140 return EXTEND_EXCEEDED_SOFT_LIMIT;
2141 }
2142 if(!eeMode && prm.nUgFail >= maxUgStreak) {
2143 return EXTEND_EXCEEDED_SOFT_LIMIT;
2144 }
2145 if(mateStreaks_[i] >= maxMateStreak) {
2146 // Don't try this seed range anymore
2147 rands_[i].setDone();
2148 assert(rands_[i].done());
2149 break;
2150 }
2151 prm.nExIters++;
2152 first = false;
2153 assert(!gws_[i].done());
2154 // Resolve next element offset
2155 WalkResult<index_t> wr;
2156 uint32_t elt = rands_[i].next(rnd);
2157 SARangeWithOffs<TSlice, index_t> sa;
2158 sa.topf = satpos_[i].sat.topf;
2159 sa.len = satpos_[i].sat.key.len;
2160 sa.offs = satpos_[i].sat.offs;
2161 gws_[i].advanceElement((index_t)elt, gfmFw, ref, sa, gwstate_, wr, wlm, prm);
2162 eltsDone++;
2163 assert_gt(neltLeft, 0);
2164 neltLeft--;
2165 assert_neq((index_t)OFF_MASK, wr.toff);
2166 index_t tidx = 0, toff = 0, tlen = 0;
2167 bool straddled = false;
2168 gfmFw.joinedToTextOff(
2169 wr.elt.len,
2170 wr.toff,
2171 tidx,
2172 toff,
2173 tlen,
2174 eeMode, // reject straddlers?
2175 straddled); // straddled?
2176 if(tidx == (index_t)OFF_MASK) {
2177 // The seed hit straddled a reference boundary so the seed hit
2178 // isn't valid
2179 continue;
2180 }
2181 #ifndef NDEBUG
2182 if(!eeMode && !straddled) { // Check that seed hit matches reference
2183 uint64_t key = satpos_[i].sat.key.seq;
2184 for(index_t k = 0; k < wr.elt.len; k++) {
2185 int c = ref.getBase(tidx, toff + wr.elt.len - k - 1);
2186 assert_leq(c, 3);
2187 int ck = (int)(key & 3);
2188 key >>= 2;
2189 assert_eq(c, ck);
2190 }
2191 }
2192 #endif
2193 // Find offset of alignment's upstream base assuming net gaps=0
2194 // between beginning of read and beginning of seed hit
2195 int64_t refoff = (int64_t)toff - rdoff;
2196 EIvalMergeListBinned& seenDiags = anchor1 ? seenDiags1_ : seenDiags2_;
2197 // Coordinate of the seed hit w/r/t the pasted reference string
2198 Coord refcoord(tidx, refoff, fw);
2199 if(seenDiags.locusPresent(refcoord)) {
2200 // Already handled alignments seeded on this diagonal
2201 prm.nRedundants++;
2202 swmSeed.rshit++;
2203 continue;
2204 }
2205 // Now that we have a seed hit, there are many issues to solve
2206 // before we have a completely framed dynamic programming problem.
2207 // They include:
2208 //
2209 // 1. Setting reference offsets on either side of the seed hit,
2210 // accounting for where the seed occurs in the read
2211 // 2. Adjusting the width of the banded dynamic programming problem
2212 // and adjusting reference bounds to allow for gaps in the
2213 // alignment
2214 // 3. Accounting for the edges of the reference, which can impact
2215 // the width of the DP problem and reference bounds.
2216 // 4. Perhaps filtering the problem down to a smaller problem based
2217 // on what DPs we've already solved for this read
2218 //
2219 // We do #1 here, since it is simple and we have all the seed-hit
2220 // information here. #2 and #3 are handled in the DynProgFramer.
2221 int readGaps = 0, refGaps = 0;
2222 bool ungapped = false;
2223 if(!eeMode) {
2224 readGaps = sc.maxReadGaps(minsc, rdlen);
2225 refGaps = sc.maxRefGaps(minsc, rdlen);
2226 ungapped = (readGaps == 0 && refGaps == 0);
2227 }
2228 int state = FOUND_NONE;
2229 bool found = false;
2230 // In unpaired mode, a seed extension is successful if it
2231 // results in a full alignment that meets the minimum score
2232 // threshold. In paired-end mode, a seed extension is
2233 // successful if it results in a *full paired-end* alignment
2234 // that meets the minimum score threshold.
2235 if(eeMode) {
2236 resEe_.reset();
2237 resEe_.alres.reset();
2238 const EEHit<index_t>& h = eehits_[i];
2239 assert_leq(h.score, perfectScore);
2240 resEe_.alres.setScore(AlnScore(h.score, h.ns(), 0));
2241 resEe_.alres.setShape(
2242 refcoord.ref(), // ref id
2243 refcoord.off(), // 0-based ref offset
2244 tlen, // reference length
2245 fw, // aligned to Watson?
2246 rdlen, // read length
2247 true, // pretrim soft?
2248 0, // pretrim 5' end
2249 0, // pretrim 3' end
2250 true, // alignment trim soft?
2251 0, // alignment trim 5' end
2252 0); // alignment trim 3' end
2253 resEe_.alres.setRefNs(h.refns());
2254 if(h.mms() > 0) {
2255 assert_eq(1, h.mms());
2256 assert_lt(h.e1.pos, rd.length());
2257 resEe_.alres.ned().push_back(h.e1);
2258 }
2259 assert(resEe_.repOk(rd));
2260 state = FOUND_EE;
2261 found = true;
2262 Interval refival(refcoord, 1);
2263 seenDiags.add(refival);
2264 prm.nExEes++;
2265 prm.nEeFail++; // say it's failed until proven successful
2266 prm.nExEeFails++;
2267 } else if(doUngapped && ungapped) {
2268 resUngap_.reset();
2269 int al = swa.ungappedAlign(
2270 fw ? rd.patFw : rd.patRc,
2271 fw ? rd.qual : rd.qualRev,
2272 refcoord,
2273 ref,
2274 tlen,
2275 sc,
2276 gReportOverhangs,
2277 minsc, // minimum
2278 resUngap_);
2279 Interval refival(refcoord, 1);
2280 seenDiags.add(refival);
2281 prm.nExUgs++;
2282 prm.nUgFail++; // say it's failed until proven successful
2283 prm.nExUgFails++;
2284 if(al == 0) {
2285 swmSeed.ungapfail++;
2286 continue;
2287 } else if(al == -1) {
2288 swmSeed.ungapnodec++;
2289 } else {
2290 found = true;
2291 state = FOUND_UNGAPPED;
2292 swmSeed.ungapsucc++;
2293 }
2294 }
2295 int64_t pastedRefoff = (int64_t)wr.toff - rdoff;
2296 DPRect rect;
2297 if(state == FOUND_NONE) {
2298 found = dpframe.frameSeedExtensionRect(
2299 refoff, // ref offset implied by seed hit assuming no gaps
2300 rows, // length of read sequence used in DP table
2301 tlen, // length of reference
2302 readGaps, // max # of read gaps permitted in opp mate alignment
2303 refGaps, // max # of ref gaps permitted in opp mate alignment
2304 (size_t)nceil, // # Ns permitted
2305 maxhalf, // max width in either direction
2306 rect); // DP rectangle
2307 assert(rect.repOk());
2308 // Add the seed diagonal at least
2309 seenDiags.add(Interval(refcoord, 1));
2310 if(!found) {
2311 continue;
2312 }
2313 }
2314 int64_t leftShift = refoff - rect.refl;
2315 size_t nwindow = 0;
2316 if(toff >= rect.refl) {
2317 nwindow = (size_t)(toff - rect.refl);
2318 }
2319 // NOTE: We might be taking off more than we should because the
2320 // pasted string omits non-A/C/G/T characters, but we included them
2321 // when calculating leftShift. We'll account for this later.
2322 pastedRefoff -= leftShift;
2323 size_t nsInLeftShift = 0;
2324 if(state == FOUND_NONE) {
2325 if(!swa.initedRead()) {
2326 // Initialize the aligner with a new read
2327 swa.initRead(
2328 rd.patFw, // fw version of query
2329 rd.patRc, // rc version of query
2330 rd.qual, // fw version of qualities
2331 rd.qualRev,// rc version of qualities
2332 0, // off of first char in 'rd' to consider
2333 rdlen, // off of last char (excl) in 'rd' to consider
2334 sc); // scoring scheme
2335 }
2336 swa.initRef(
2337 fw, // whether to align forward or revcomp read
2338 tidx, // reference aligned against
2339 rect, // DP rectangle
2340 ref, // Reference strings
2341 tlen, // length of reference sequence
2342 sc, // scoring scheme
2343 minsc, // minimum score permitted
2344 enable8, // use 8-bit SSE if possible?
2345 cminlen, // minimum length for using checkpointing scheme
2346 cpow2, // interval b/t checkpointed diags; 1 << this
2347 doTri, // triangular mini-fills?
2348 true, // this is a seed extension - not finding a mate
2349 nwindow,
2350 nsInLeftShift);
2351 // Because of how we framed the problem, we can say that we've
2352 // exhaustively scored the seed diagonal as well as maxgaps
2353 // diagonals on either side
2354 Interval refival(tidx, 0, fw, 0);
2355 rect.initIval(refival);
2356 seenDiags.add(refival);
2357 // Now fill the dynamic programming matrix and return true iff
2358 // there is at least one valid alignment
2359 TAlScore bestCell = std::numeric_limits<TAlScore>::min();
2360 found = swa.align(rnd, bestCell);
2361 swmSeed.tallyGappedDp(readGaps, refGaps);
2362 prm.nExDps++;
2363 prm.nDpFail++; // failed until proven successful
2364 prm.nExDpFails++; // failed until proven successful
2365 if(!found) {
2366 TAlScore bestLast = anchor1 ? prm.bestLtMinscMate1 : prm.bestLtMinscMate2;
2367 if(bestCell > std::numeric_limits<TAlScore>::min() && bestCell > bestLast) {
2368 if(anchor1) {
2369 prm.bestLtMinscMate1 = bestCell;
2370 } else {
2371 prm.bestLtMinscMate2 = bestCell;
2372 }
2373 }
2374 continue; // Look for more anchor alignments
2375 }
2376 }
2377 bool firstInner = true;
2378 bool foundConcordant = false;
2379 while(true) {
2380 assert(found);
2381 SwResult *res = NULL;
2382 if(state == FOUND_EE) {
2383 if(!firstInner) {
2384 break;
2385 }
2386 res = &resEe_;
2387 assert(res->repOk(rd));
2388 } else if(state == FOUND_UNGAPPED) {
2389 if(!firstInner) {
2390 break;
2391 }
2392 res = &resUngap_;
2393 assert(res->repOk(rd));
2394 } else {
2395 resGap_.reset();
2396 assert(resGap_.empty());
2397 if(swa.done()) {
2398 break;
2399 }
2400 swa.nextAlignment(resGap_, minsc, rnd);
2401 found = !resGap_.empty();
2402 if(!found) {
2403 break;
2404 }
2405 res = &resGap_;
2406 assert(res->repOk(rd));
2407 }
2408 // TODO: If we're just taking anchor alignments out of the
2409 // same rectangle, aren't we getting very similar
2410 // rectangles for the opposite mate each time? Seems like
2411 // we could save some work by detecting this.
2412 assert(res != NULL);
2413 firstInner = false;
2414 assert(res->alres.matchesRef(
2415 rd,
2416 ref,
2417 tmp_rf_,
2418 tmp_rdseq_,
2419 tmp_qseq_,
2420 raw_refbuf_,
2421 raw_destU32_,
2422 raw_matches_,
2423 tmp_reflens_,
2424 tmp_refoffs_));
2425 Interval refival(tidx, 0, fw, tlen);
2426 assert_gt(res->alres.refExtent(), 0);
2427 if(gReportOverhangs &&
2428 !refival.containsIgnoreOrient(res->alres.refival()))
2429 {
2430 res->alres.clipOutside(true, 0, tlen);
2431 if(res->alres.refExtent() == 0) {
2432 continue;
2433 }
2434 }
2435 assert(gReportOverhangs ||
2436 refival.containsIgnoreOrient(res->alres.refival()));
2437 // Did the alignment fall entirely outside the reference?
2438 if(!refival.overlapsIgnoreOrient(res->alres.refival())) {
2439 continue;
2440 }
2441 // Is this alignment redundant with one we've seen previously?
2442 if(redAnchor_.overlap(res->alres)) {
2443 continue;
2444 }
2445 redAnchor_.add(res->alres);
2446 // Annotate the AlnRes object with some key parameters
2447 // that were used to obtain the alignment.
2448 res->alres.setParams(
2449 seedmms, // # mismatches allowed in seed
2450 seedlen, // length of seed
2451 seedival, // interval between seeds
2452 minsc); // minimum score for valid alignment
2453 bool foundMate = false;
2454 TRefOff off = res->alres.refoff();
2455 if( msink->state().doneWithMate(!anchor1) &&
2456 !msink->state().doneWithMate( anchor1))
2457 {
2458 // We're done with the opposite mate but not with the
2459 // anchor mate; don't try to mate up the anchor.
2460 swMateImmediately = false;
2461 }
2462 if(found && swMateImmediately) {
2463 assert(!msink->state().doneWithMate(!anchor1));
2464 bool oleft = false, ofw = false;
2465 int64_t oll = 0, olr = 0, orl = 0, orr = 0;
2466 assert(!msink->state().done());
2467 foundMate = !oppFilt;
2468 TAlScore ominsc_cur = ominsc;
2469 //bool oungapped = false;
2470 int oreadGaps = 0, orefGaps = 0;
2471 //int oungappedAlign = -1; // defer
2472 if(foundMate) {
2473 // Adjust ominsc given the alignment score of the
2474 // anchor mate
2475 ominsc_cur = ominsc;
2476 if(tighten > 0 && msink->Mmode() && msink->hasSecondBestPair()) {
2477 // Paired-end alignments should have at least this score from now
2478 TAlScore ps;
2479 if(tighten == 1) {
2480 ps = msink->bestPair();
2481 } else if(tighten == 2) {
2482 ps = msink->secondBestPair();
2483 } else {
2484 TAlScore diff = msink->bestPair() - msink->secondBestPair();
2485 ps = msink->secondBestPair() + (diff * 3)/4;
2486 }
2487 if(tighten == 1 && ps < bestPairScore &&
2488 msink->bestPair() == msink->secondBestPair())
2489 {
2490 ps++;
2491 }
2492 if(tighten >= 2 && ps < bestPairScore) {
2493 ps++;
2494 }
2495 // Anchor mate must have score at least 'ps' minus the best possible
2496 // score for the opposite mate.
2497 TAlScore nc = ps - res->alres.score().score();
2498 if(nc > ominsc_cur) {
2499 ominsc_cur = nc;
2500 assert_leq(ominsc_cur, operfectScore);
2501 }
2502 }
2503 oreadGaps = sc.maxReadGaps(ominsc_cur, ordlen);
2504 orefGaps = sc.maxRefGaps (ominsc_cur, ordlen);
2505 //oungapped = (oreadGaps == 0 && orefGaps == 0);
2506 // TODO: Something lighter-weight than DP to scan
2507 // for other mate??
2508 //if(oungapped) {
2509 // oresUngap_.reset();
2510 // oungappedAlign = oswa.ungappedAlign(
2511 // ofw ? ord.patFw : ord.patRc,
2512 // ofw ? ord.qual : ord.qualRev,
2513 // orefcoord,
2514 // ref,
2515 // otlen,
2516 // sc,
2517 // gReportOverhangs,
2518 // ominsc_cur,
2519 // 0,
2520 // oresUngap_);
2521 //}
2522 foundMate = pepol.otherMate(
2523 anchor1, // anchor mate is mate #1?
2524 fw, // anchor aligned to Watson?
2525 off, // offset of anchor mate
2526 orows + oreadGaps, // max # columns spanned by alignment
2527 tlen, // reference length
2528 anchor1 ? rd.length() : ord.length(), // mate 1 len
2529 anchor1 ? ord.length() : rd.length(), // mate 2 len
2530 oleft, // out: look left for opposite mate?
2531 oll,
2532 olr,
2533 orl,
2534 orr,
2535 ofw);
2536 }
2537 DPRect orect;
2538 if(foundMate) {
2539 foundMate = dpframe.frameFindMateRect(
2540 !oleft, // true iff anchor alignment is to the left
2541 oll, // leftmost Watson off for LHS of opp aln
2542 olr, // rightmost Watson off for LHS of opp aln
2543 orl, // leftmost Watson off for RHS of opp aln
2544 orr, // rightmost Watson off for RHS of opp aln
2545 orows, // length of opposite mate
2546 tlen, // length of reference sequence aligned to
2547 oreadGaps, // max # of read gaps in opp mate aln
2548 orefGaps, // max # of ref gaps in opp mate aln
2549 (size_t)onceil, // max # Ns on opp mate
2550 maxhalf, // max width in either direction
2551 orect); // DP rectangle
2552 assert(!foundMate || orect.refr >= orect.refl);
2553 }
2554 if(foundMate) {
2555 oresGap_.reset();
2556 assert(oresGap_.empty());
2557 if(!oswa.initedRead()) {
2558 oswa.initRead(
2559 ord.patFw, // read to align
2560 ord.patRc, // qualities
2561 ord.qual, // read to align
2562 ord.qualRev,// qualities
2563 0, // off of first char to consider
2564 ordlen, // off of last char (ex) to consider
2565 sc); // scoring scheme
2566 }
2567 // Given the boundaries defined by refi and reff, initilize
2568 // the SwAligner with the dynamic programming problem that
2569 // aligns the read to this reference stretch.
2570 size_t onsInLeftShift = 0;
2571 assert_geq(orect.refr, orect.refl);
2572 oswa.initRef(
2573 ofw, // align forward or revcomp read?
2574 tidx, // reference aligned against
2575 orect, // DP rectangle
2576 ref, // Reference strings
2577 tlen, // length of reference sequence
2578 sc, // scoring scheme
2579 ominsc_cur,// min score for valid alignments
2580 enable8, // use 8-bit SSE if possible?
2581 cminlen, // minimum length for using checkpointing scheme
2582 cpow2, // interval b/t checkpointed diags; 1 << this
2583 doTri, // triangular mini-fills?
2584 false, // this is finding a mate - not seed ext
2585 0, // nwindow?
2586 onsInLeftShift);
2587 // TODO: Can't we add some diagonals to the
2588 // opposite mate's seenDiags when we fill in the
2589 // opposite mate's DP? Or can we? We might want
2590 // to use this again as an anchor - will that still
2591 // happen? Also, isn't there a problem with
2592 // consistency of the minimum score? Minimum score
2593 // here depends in part on the score of the anchor
2594 // alignment here, but it won't when the current
2595 // opposite becomes the anchor.
2596
2597 // Because of how we framed the problem, we can say
2598 // that we've exhaustively explored the "core"
2599 // diagonals
2600 //Interval orefival(tidx, 0, ofw, 0);
2601 //orect.initIval(orefival);
2602 //oseenDiags.add(orefival);
2603
2604 // Now fill the dynamic programming matrix, return true
2605 // iff there is at least one valid alignment
2606 TAlScore bestCell = std::numeric_limits<TAlScore>::min();
2607 foundMate = oswa.align(rnd, bestCell);
2608 prm.nMateDps++;
2609 swmMate.tallyGappedDp(oreadGaps, orefGaps);
2610 if(!foundMate) {
2611 TAlScore bestLast = anchor1 ? prm.bestLtMinscMate2 : prm.bestLtMinscMate1;
2612 if(bestCell > std::numeric_limits<TAlScore>::min() && bestCell > bestLast) {
2613 if(anchor1) {
2614 prm.bestLtMinscMate2 = bestCell;
2615 } else {
2616 prm.bestLtMinscMate1 = bestCell;
2617 }
2618 }
2619 }
2620 }
2621 bool didAnchor = false;
2622 do {
2623 oresGap_.reset();
2624 assert(oresGap_.empty());
2625 if(foundMate && oswa.done()) {
2626 foundMate = false;
2627 } else if(foundMate) {
2628 oswa.nextAlignment(oresGap_, ominsc_cur, rnd);
2629 foundMate = !oresGap_.empty();
2630 assert(!foundMate || oresGap_.alres.matchesRef(
2631 ord,
2632 ref,
2633 tmp_rf_,
2634 tmp_rdseq_,
2635 tmp_qseq_,
2636 raw_refbuf_,
2637 raw_destU32_,
2638 raw_matches_,
2639 tmp_reflens_,
2640 tmp_refoffs_));
2641 }
2642 if(foundMate) {
2643 // Redundant with one we've seen previously?
2644 if(!redAnchor_.overlap(oresGap_.alres)) {
2645 redAnchor_.add(oresGap_.alres);
2646 }
2647 assert_eq(ofw, oresGap_.alres.fw());
2648 // Annotate the AlnRes object with some key parameters
2649 // that were used to obtain the alignment.
2650 oresGap_.alres.setParams(
2651 seedmms, // # mismatches allowed in seed
2652 seedlen, // length of seed
2653 seedival, // interval between seeds
2654 ominsc); // minimum score for valid alignment
2655 assert_gt(oresGap_.alres.refExtent(), 0);
2656 if(gReportOverhangs &&
2657 !refival.containsIgnoreOrient(oresGap_.alres.refival()))
2658 {
2659 oresGap_.alres.clipOutside(true, 0, tlen);
2660 foundMate = oresGap_.alres.refExtent() > 0;
2661 }
2662 if(foundMate &&
2663 ((!gReportOverhangs &&
2664 !refival.containsIgnoreOrient(oresGap_.alres.refival())) ||
2665 !refival.overlapsIgnoreOrient(oresGap_.alres.refival())))
2666 {
2667 foundMate = false;
2668 }
2669 }
2670 ASSERT_ONLY(TRefId refid);
2671 TRefOff off1, off2;
2672 size_t len1, len2;
2673 bool fw1, fw2;
2674 int pairCl = PE_ALS_DISCORD;
2675 if(foundMate) {
2676 ASSERT_ONLY(refid =) res->alres.refid();
2677 assert_eq(refid, oresGap_.alres.refid());
2678 off1 = anchor1 ? off : oresGap_.alres.refoff();
2679 off2 = anchor1 ? oresGap_.alres.refoff() : off;
2680 len1 = anchor1 ?
2681 res->alres.refExtent() : oresGap_.alres.refExtent();
2682 len2 = anchor1 ?
2683 oresGap_.alres.refExtent() : res->alres.refExtent();
2684 fw1 = anchor1 ? res->alres.fw() : oresGap_.alres.fw();
2685 fw2 = anchor1 ? oresGap_.alres.fw() : res->alres.fw();
2686 // Check that final mate alignments are consistent with
2687 // paired-end fragment constraints
2688 pairCl = pepol.peClassifyPair(
2689 off1,
2690 len1,
2691 fw1,
2692 off2,
2693 len2,
2694 fw2);
2695 // Instead of trying
2696 //foundMate = pairCl != PE_ALS_DISCORD;
2697 }
2698 if(msink->state().doneConcordant()) {
2699 foundMate = false;
2700 }
2701 if(reportImmediately) {
2702 if(foundMate) {
2703 // Report pair to the AlnSinkWrap
2704 assert(!msink->state().doneConcordant());
2705 assert(msink != NULL);
2706 assert(res->repOk());
2707 assert(oresGap_.repOk());
2708 // Report an unpaired alignment
2709 assert(!msink->maxed());
2710 assert(!msink->state().done());
2711 bool doneUnpaired = false;
2712 //if(mixed || discord) {
2713 // Report alignment for mate #1 as an
2714 // unpaired alignment.
2715 if(!anchor1 || !didAnchor) {
2716 if(anchor1) {
2717 didAnchor = true;
2718 }
2719 const AlnRes& r1 = anchor1 ?
2720 res->alres : oresGap_.alres;
2721 if(!redMate1_.overlap(r1)) {
2722 redMate1_.add(r1);
2723 if(msink->report(0, &r1, NULL)) {
2724 doneUnpaired = true; // Short-circuited
2725 }
2726 }
2727 }
2728 // Report alignment for mate #2 as an
2729 // unpaired alignment.
2730 if(anchor1 || !didAnchor) {
2731 if(!anchor1) {
2732 didAnchor = true;
2733 }
2734 const AlnRes& r2 = anchor1 ?
2735 oresGap_.alres : res->alres;
2736 if(!redMate2_.overlap(r2)) {
2737 redMate2_.add(r2);
2738 if(msink->report(0, NULL, &r2)) {
2739 doneUnpaired = true; // Short-circuited
2740 }
2741 }
2742 }
2743 //} // if(mixed || discord)
2744 bool donePaired = false;
2745 if(pairCl != PE_ALS_DISCORD) {
2746 foundConcordant = true;
2747 if(msink->report(
2748 0,
2749 anchor1 ? &res->alres : &oresGap_.alres,
2750 anchor1 ? &oresGap_.alres : &res->alres))
2751 {
2752 // Short-circuited because a limit, e.g.
2753 // -k, -m or -M, was exceeded
2754 donePaired = true;
2755 } else {
2756 if(tighten > 0 && msink->Mmode() && msink->hasSecondBestPair()) {
2757 // Paired-end alignments should have at least this score from now
2758 TAlScore ps;
2759 if(tighten == 1) {
2760 ps = msink->bestPair();
2761 } else if(tighten == 2) {
2762 ps = msink->secondBestPair();
2763 } else {
2764 TAlScore diff = msink->bestPair() - msink->secondBestPair();
2765 ps = msink->secondBestPair() + (diff * 3)/4;
2766 }
2767 if(tighten == 1 && ps < bestPairScore &&
2768 msink->bestPair() == msink->secondBestPair())
2769 {
2770 ps++;
2771 }
2772 if(tighten >= 2 && ps < bestPairScore) {
2773 ps++;
2774 }
2775 // Anchor mate must have score at least 'ps' minus the best possible
2776 // score for the opposite mate.
2777 TAlScore nc = ps - operfectScore;
2778 if(nc > minsc) {
2779 minsc = nc;
2780 assert_leq(minsc, perfectScore);
2781 if(minsc > res->alres.score().score()) {
2782 // We're done with this anchor
2783 break;
2784 }
2785 }
2786 assert_leq(minsc, perfectScore);
2787 }
2788 }
2789 } // if(pairCl != PE_ALS_DISCORD)
2790 if(donePaired || doneUnpaired) {
2791 return EXTEND_POLICY_FULFILLED;
2792 }
2793 if(msink->state().doneWithMate(anchor1)) {
2794 // We're now done with the mate that we're
2795 // currently using as our anchor. We're not
2796 // with the read overall.
2797 return EXTEND_POLICY_FULFILLED;
2798 }
2799 } else if((mixed || discord) && !didAnchor) {
2800 didAnchor = true;
2801 // Report unpaired hit for anchor
2802 assert(msink != NULL);
2803 assert(res->repOk());
2804 // Check that alignment accurately reflects the
2805 // reference characters aligned to
2806 assert(res->alres.matchesRef(
2807 rd,
2808 ref,
2809 tmp_rf_,
2810 tmp_rdseq_,
2811 tmp_qseq_,
2812 raw_refbuf_,
2813 raw_destU32_,
2814 raw_matches_,
2815 tmp_reflens_,
2816 tmp_refoffs_));
2817 // Report an unpaired alignment
2818 assert(!msink->maxed());
2819 assert(!msink->state().done());
2820 // Report alignment for mate #1 as an
2821 // unpaired alignment.
2822 if(!msink->state().doneUnpaired(anchor1)) {
2823 const AlnRes& r = res->alres;
2824 RedundantAlns& red = anchor1 ? redMate1_ : redMate2_;
2825 const AlnRes* r1 = anchor1 ? &res->alres : NULL;
2826 const AlnRes* r2 = anchor1 ? NULL : &res->alres;
2827 if(!red.overlap(r)) {
2828 red.add(r);
2829 if(msink->report(0, r1, r2)) {
2830 return EXTEND_POLICY_FULFILLED; // Short-circuited
2831 }
2832 }
2833 }
2834 if(msink->state().doneWithMate(anchor1)) {
2835 // Done with mate, but not read overall
2836 return EXTEND_POLICY_FULFILLED;
2837 }
2838 }
2839 }
2840 } while(!oresGap_.empty());
2841 } // if(found && swMateImmediately)
2842 else if(found) {
2843 assert(!msink->state().doneWithMate(anchor1));
2844 // We found an anchor alignment but did not attempt to find
2845 // an alignment for the opposite mate (probably because
2846 // we're done with it)
2847 if(reportImmediately && (mixed || discord)) {
2848 // Report unpaired hit for anchor
2849 assert(msink != NULL);
2850 assert(res->repOk());
2851 // Check that alignment accurately reflects the
2852 // reference characters aligned to
2853 assert(res->alres.matchesRef(
2854 rd,
2855 ref,
2856 tmp_rf_,
2857 tmp_rdseq_,
2858 tmp_qseq_,
2859 raw_refbuf_,
2860 raw_destU32_,
2861 raw_matches_,
2862 tmp_reflens_,
2863 tmp_refoffs_));
2864 // Report an unpaired alignment
2865 assert(!msink->maxed());
2866 assert(!msink->state().done());
2867 // Report alignment for mate #1 as an
2868 // unpaired alignment.
2869 if(!msink->state().doneUnpaired(anchor1)) {
2870 const AlnRes& r = res->alres;
2871 RedundantAlns& red = anchor1 ? redMate1_ : redMate2_;
2872 const AlnRes* r1 = anchor1 ? &res->alres : NULL;
2873 const AlnRes* r2 = anchor1 ? NULL : &res->alres;
2874 if(!red.overlap(r)) {
2875 red.add(r);
2876 if(msink->report(0, r1, r2)) {
2877 return EXTEND_POLICY_FULFILLED; // Short-circuited
2878 }
2879 }
2880 }
2881 if(msink->state().doneWithMate(anchor1)) {
2882 // Done with mate, but not read overall
2883 return EXTEND_POLICY_FULFILLED;
2884 }
2885 }
2886 }
2887 } // while(true)
2888
2889 if(foundConcordant) {
2890 prm.nMateDpSuccs++;
2891 mateStreaks_[i] = 0;
2892 // Register this as a success. Now we need to
2893 // make the streak variables reflect the
2894 // success.
2895 if(state == FOUND_UNGAPPED) {
2896 assert_gt(prm.nUgFail, 0);
2897 assert_gt(prm.nExUgFails, 0);
2898 prm.nExUgFails--;
2899 prm.nExUgSuccs++;
2900 prm.nUgLastSucc = prm.nExUgs-1;
2901 if(prm.nUgFail > prm.nUgFailStreak) {
2902 prm.nUgFailStreak = prm.nUgFail;
2903 }
2904 prm.nUgFail = 0;
2905 } else if(state == FOUND_EE) {
2906 assert_gt(prm.nEeFail, 0);
2907 assert_gt(prm.nExEeFails, 0);
2908 prm.nExEeFails--;
2909 prm.nExEeSuccs++;
2910 prm.nEeLastSucc = prm.nExEes-1;
2911 if(prm.nEeFail > prm.nEeFailStreak) {
2912 prm.nEeFailStreak = prm.nEeFail;
2913 }
2914 prm.nEeFail = 0;
2915 } else {
2916 assert_gt(prm.nDpFail, 0);
2917 assert_gt(prm.nExDpFails, 0);
2918 prm.nExDpFails--;
2919 prm.nExDpSuccs++;
2920 prm.nDpLastSucc = prm.nExDps-1;
2921 if(prm.nDpFail > prm.nDpFailStreak) {
2922 prm.nDpFailStreak = prm.nDpFail;
2923 }
2924 prm.nDpFail = 0;
2925 }
2926 } else {
2927 prm.nMateDpFails++;
2928 mateStreaks_[i]++;
2929 }
2930 // At this point we know that we aren't bailing, and will continue to resolve seed hits.
2931
2932 } // while(!gw.done())
2933 } // for(size_t i = 0; i < gws_.size(); i++)
2934 }
2935 return EXTEND_EXHAUSTED_CANDIDATES;
2936 }
2937
2938 #endif /*ALIGNER_SW_DRIVER_H_*/
2939