1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <limits>
21 #include <ctype.h>
22 #include "aligner_seed2.h"
23 #include "assert_helpers.h"
24 #include "bt2_idx.h"
25
26 /**
27 * Drive the process of descending from all search roots.
28 */
go(const Scoring & sc,const Ebwt & ebwtFw,const Ebwt & ebwtBw,DescentMetrics & met,PerReadMetrics & prm)29 void DescentDriver::go(
30 const Scoring& sc, // scoring scheme
31 const Ebwt& ebwtFw, // forward index
32 const Ebwt& ebwtBw, // mirror index
33 DescentMetrics& met, // metrics
34 PerReadMetrics& prm) // per-read metrics
35 {
36 assert(q_.repOk());
37 // Convert DescentRoots to the initial Descents
38 for(size_t i = 0; i < roots_.size(); i++) {
39 size_t dfsz = df_.size();
40 size_t pfsz = pf_.size();
41 TDescentId id = df_.alloc();
42 Edit e_null;
43 assert(!e_null.inited());
44 bool succ = df_[id].init(
45 q_, // read
46 i, // root and conf id
47 sc, // scoring scheme
48 minsc_, // minimum score
49 maxpen_, // maximum penalty
50 id, // new Descent's id
51 ebwtFw, // forward index
52 ebwtBw, // mirror index
53 re_, // redundancy checker
54 df_, // Descent factory
55 pf_, // DescentPos factory
56 roots_, // DescentRoots
57 confs_, // DescentConfs
58 heap_, // heap
59 alsink_, // alignment sink
60 met, // metrics
61 prm); // per-read metrics
62 if(veryVerbose_) {
63 bool fw = roots_[i].fw;
64 tmpedit_.clear();
65 df_[id].print(
66 &cerr,
67 "",
68 q_,
69 0,
70 0,
71 fw,
72 tmpedit_,
73 0,
74 tmpedit_.size(),
75 tmprfdnastr_);
76 }
77 if(!succ) {
78 // Reclaim memory we had used for this descent and its DescentPos info
79 df_.resize(dfsz);
80 pf_.resize(pfsz);
81 }
82 }
83 // Advance until some stopping condition
84 bool stop = heap_.empty();
85 while(!stop) {
86 // Pop off the highest-priority descent. Note that some outgoing edges
87 // might have since been explored, which could reduce the priority of
88 // the descent once we .
89 TDescentPair p = heap_.pop();
90 df_.alloc(); df_.pop();
91 df_[p.second].followBestOutgoing(
92 q_, // read
93 ebwtFw, // index over text
94 ebwtBw, // index over reverse text
95 sc, // scoring scheme
96 minsc_, // minimum score
97 maxpen_, // maximum penalty
98 re_, // redundancy checker
99 df_, // Descent factory
100 pf_, // DescentPos factory
101 roots_, //
102 confs_, //
103 heap_, // priority queue for Descents
104 alsink_, // alignment sink
105 met, // metrics
106 prm); // per-read metrics
107 stop = heap_.empty();
108 }
109 }
110
111 /**
112 * Perform seed alignment until some stopping condition is satisfied.
113 */
advance(const DescentStoppingConditions & stopc,const Scoring & sc,const Ebwt & ebwtFw,const Ebwt & ebwtBw,DescentMetrics & met,PerReadMetrics & prm)114 int DescentDriver::advance(
115 const DescentStoppingConditions& stopc, // stopping conditions
116 const Scoring& sc, // scoring scheme
117 const Ebwt& ebwtFw, // forward index
118 const Ebwt& ebwtBw, // mirror index
119 DescentMetrics& met, // metrics
120 PerReadMetrics& prm) // per-read metrics
121 {
122 size_t nbwop_i = met.bwops;
123 while(rootsInited_ < roots_.size()) {
124 size_t dfsz = df_.size();
125 size_t pfsz = pf_.size();
126 TDescentId id = df_.alloc();
127 Edit e_null;
128 assert(!e_null.inited());
129 bool succ = df_[id].init(
130 q_, // query
131 rootsInited_, // root and conf id
132 sc, // scoring scheme
133 minsc_, // minimum score
134 maxpen_, // maximum penalty
135 id, // new Descent's id
136 ebwtFw, // forward index
137 ebwtBw, // mirror index
138 re_, // redundancy checker
139 df_, // Descent factory
140 pf_, // DescentPos factory
141 roots_, // DescentRoots
142 confs_, // DescentConfs
143 heap_, // heap
144 alsink_, // alignment sink
145 met, // metrics
146 prm); // per-read metrics
147 if(!succ) {
148 // Reclaim memory we had used for this descent and its DescentPos info
149 df_.resize(dfsz);
150 pf_.resize(pfsz);
151 }
152 rootsInited_++;
153 TAlScore best = std::numeric_limits<TAlScore>::max();
154 if(!heap_.empty()) {
155 best = heap_.top().first.pen;
156 }
157 if(stopc.nfound > 0 && alsink_.nelt() > stopc.nfound) {
158 return DESCENT_DRIVER_ALN;
159 }
160 if(alsink_.stratumDone(best)) {
161 return DESCENT_DRIVER_STRATA;
162 }
163 if(stopc.nbwop > 0 && (met.bwops - nbwop_i) > stopc.nbwop) {
164 return DESCENT_DRIVER_BWOPS;
165 }
166 if(stopc.totsz > 0 && totalSizeBytes() > stopc.totsz) {
167 return DESCENT_DRIVER_MEM;
168 }
169 }
170 // Advance until some stopping condition
171 bool stop = heap_.empty();
172 while(!stop) {
173 // Pop off the highest-priority descent.
174 TDescentPair p = heap_.pop();
175 df_.alloc(); df_.pop(); // Create new descent
176 df_[p.second].followBestOutgoing(
177 q_, // read
178 ebwtFw, // forward index
179 ebwtBw, // backward index
180 sc, // scoring scheme
181 minsc_, // minimum score
182 maxpen_, // maximum penalty
183 re_, // redundancy checker
184 df_, // Descent factory
185 pf_, // DescentPos factory
186 roots_, // search roots
187 confs_, // search root configurations
188 heap_, // descent heap
189 alsink_, // alignment sink
190 met, // metrics
191 prm); // per-read metrics
192 TAlScore best = std::numeric_limits<TAlScore>::max();
193 if(!heap_.empty()) {
194 best = heap_.top().first.pen;
195 }
196 if(stopc.nfound > 0 && alsink_.nelt() > stopc.nfound) {
197 return DESCENT_DRIVER_ALN;
198 }
199 if(alsink_.stratumDone(best)) {
200 return DESCENT_DRIVER_STRATA;
201 }
202 if(stopc.nbwop > 0 && (met.bwops - nbwop_i) > stopc.nbwop) {
203 return DESCENT_DRIVER_BWOPS;
204 }
205 if(stopc.totsz > 0 && totalSizeBytes() > stopc.totsz) {
206 return DESCENT_DRIVER_MEM;
207 }
208 stop = heap_.empty();
209 }
210 return DESCENT_DRIVER_DONE;
211 }
212
213 /**
214 * If this is the final descent in a complete end-to-end alignment, report
215 * the alignment.
216 */
reportAlignment(const Read & q,const Ebwt & ebwtFw,const Ebwt & ebwtBw,TIndexOffU topf,TIndexOffU botf,TIndexOffU topb,TIndexOffU botb,TDescentId id,TRootId rid,const Edit & e,TScore pen,EFactory<Descent> & df,EFactory<DescentPos> & pf,const EList<DescentRoot> & rs,const EList<DescentConfig> & cs)217 bool DescentAlignmentSink::reportAlignment(
218 const Read& q, // query string
219 const Ebwt& ebwtFw, // forward index
220 const Ebwt& ebwtBw, // mirror index
221 TIndexOffU topf, // SA range top in forward index
222 TIndexOffU botf, // SA range bottom in forward index
223 TIndexOffU topb, // SA range top in backward index
224 TIndexOffU botb, // SA range bottom in backward index
225 TDescentId id, // id of leaf Descent
226 TRootId rid, // id of search root
227 const Edit& e, // final edit, if needed
228 TScore pen, // total penalty
229 EFactory<Descent>& df, // factory with Descent
230 EFactory<DescentPos>& pf, // factory with DescentPoss
231 const EList<DescentRoot>& rs, // roots
232 const EList<DescentConfig>& cs) // configs
233 {
234 TDescentId cur = id;
235 ASSERT_ONLY(const Descent& desc = df[id]);
236 const bool fw = rs[rid].fw;
237 ASSERT_ONLY(size_t len = q.length());
238 assert(q.repOk());
239 assert_lt(desc.al5pf(), len);
240 // Adjust al5pi and al5pf to take the final edit into account (if
241 // there is one)
242 // Check if this is redundant with a previous reported alignment
243 Triple<TIndexOffU, TIndexOffU, size_t> lhs(topf, botf, 0);
244 Triple<TIndexOffU, TIndexOffU, size_t> rhs(topb, botb, q.length()-1);
245 if(!lhs_.insert(lhs)) {
246 rhs_.insert(rhs);
247 return false; // Already there
248 }
249 if(!rhs_.insert(rhs)) {
250 return false; // Already there
251 }
252 size_t ei = edits_.size();
253 df[cur].collectEdits(edits_, &e, df);
254 size_t en = edits_.size() - ei;
255 #ifndef NDEBUG
256 {
257 for(size_t i = 1; i < en; i++) {
258 assert_geq(edits_[ei+i].pos, edits_[ei+i-1].pos);
259 }
260 // Now figure out how much we refrained from aligning on either
261 // side.
262 size_t trimLf = 0;
263 size_t trimRg = 0;
264 BTDnaString& rf = tmprfdnastr_;
265 rf.clear();
266 if(!fw) {
267 // Edit offsets are w/r/t 5' end, but desc.print wants them w/r/t
268 // the *left* end of the read sequence that aligned
269 Edit::invertPoss(edits_, len, ei, en, true);
270 }
271 desc.print(NULL, "", q, trimLf, trimRg, fw, edits_, ei, en, rf);
272 if(!fw) {
273 // Invert them back to how they were before
274 Edit::invertPoss(edits_, len, ei, en, true);
275 }
276 ASSERT_ONLY(TIndexOffU toptmp = 0);
277 ASSERT_ONLY(TIndexOffU bottmp = 0);
278 // Check that the edited string occurs in the reference
279 if(!ebwtFw.contains(rf, &toptmp, &bottmp)) {
280 std::cerr << rf << std::endl;
281 assert(false);
282 }
283 }
284 #endif
285 als_.expand();
286 als_.back().init(pen, fw, topf, botf, ei, en);
287 nelt_ += (botf - topf);
288 if(bestPen_ == std::numeric_limits<TAlScore>::max() || pen < bestPen_) {
289 bestPen_ = pen;
290 }
291 if(worstPen_ == std::numeric_limits<TAlScore>::max() || pen > worstPen_) {
292 worstPen_ = pen;
293 }
294 return true;
295 }
296
297 /**
298 * Initialize a new descent branching from the given descent via the given
299 * edit. Return false if the Descent has no outgoing edges (and can
300 * therefore have its memory freed), true otherwise.
301 */
init(const Read & q,TRootId rid,const Scoring & sc,TAlScore minsc,TAlScore maxpen,TReadOff al5pi,TReadOff al5pf,TIndexOffU topf,TIndexOffU botf,TIndexOffU topb,TIndexOffU botb,bool l2r,size_t descid,TDescentId parent,TScore pen,const Edit & e,const Ebwt & ebwtFw,const Ebwt & ebwtBw,DescentRedundancyChecker & re,EFactory<Descent> & df,EFactory<DescentPos> & pf,const EList<DescentRoot> & rs,const EList<DescentConfig> & cs,EHeap<TDescentPair> & heap,DescentAlignmentSink & alsink,DescentMetrics & met,PerReadMetrics & prm)302 bool Descent::init(
303 const Read& q, // query
304 TRootId rid, // root id
305 const Scoring& sc, // scoring scheme
306 TAlScore minsc, // minimum score
307 TAlScore maxpen, // maximum penalty
308 TReadOff al5pi, // offset from 5' of 1st aligned char
309 TReadOff al5pf, // offset from 5' of last aligned char
310 TIndexOffU topf, // SA range top in FW index
311 TIndexOffU botf, // SA range bottom in FW index
312 TIndexOffU topb, // SA range top in BW index
313 TIndexOffU botb, // SA range bottom in BW index
314 bool l2r, // direction this descent will go in
315 size_t descid, // my ID
316 TDescentId parent, // parent ID
317 TScore pen, // total penalties so far
318 const Edit& e, // edit for incoming edge
319 const Ebwt& ebwtFw, // forward index
320 const Ebwt& ebwtBw, // mirror index
321 DescentRedundancyChecker& re, // redundancy checker
322 EFactory<Descent>& df, // Descent factory
323 EFactory<DescentPos>& pf, // DescentPos factory
324 const EList<DescentRoot>& rs, // roots
325 const EList<DescentConfig>& cs, // configs
326 EHeap<TDescentPair>& heap, // heap
327 DescentAlignmentSink& alsink, // alignment sink
328 DescentMetrics& met, // metrics
329 PerReadMetrics& prm) // per-read metrics
330 {
331 assert(q.repOk());
332 rid_ = rid;
333 al5pi_ = al5pi;
334 al5pf_ = al5pf;
335 l2r_ = l2r;
336 topf_ = topf;
337 botf_ = botf;
338 topb_ = topb;
339 botb_ = botb;
340 descid_ = descid;
341 parent_ = parent;
342 pen_ = pen;
343 posid_ = std::numeric_limits<size_t>::max();
344 len_ = 0;
345 out_.clear();
346 edit_ = e;
347 lastRecalc_ = true;
348 gapadd_ = df[parent].gapadd_;
349 if(e.inited()) {
350 if(e.isReadGap()) {
351 gapadd_++;
352 } else if(e.isRefGap()) {
353 gapadd_--;
354 }
355 }
356 bool branches = false, hitEnd = false, done = false;
357 TIndexOffU topf_new = 0, botf_new = 0, topb_new = 0, botb_new = 0;
358 off5p_i_ = 0;
359 #ifndef NDEBUG
360 size_t depth = al5pf_ - al5pi_ + 1;
361 TAlScore maxpend = cs[rid_].cons.get(depth, q.length(), maxpen);
362 assert_geq(maxpend, pen_); // can't have already exceeded max penalty
363 #endif
364 bool matchSucc = followMatches(
365 q,
366 sc,
367 ebwtFw,
368 ebwtBw,
369 re,
370 df,
371 pf,
372 rs,
373 cs,
374 heap,
375 alsink,
376 met,
377 prm,
378 branches,
379 hitEnd,
380 done,
381 off5p_i_,
382 topf_new,
383 botf_new,
384 topb_new,
385 botb_new);
386 bool bounceSucc = false;
387 if(matchSucc && hitEnd && !done) {
388 assert(topf_new > 0 || botf_new > 0);
389 bounceSucc = bounce(
390 q,
391 topf_new,
392 botf_new,
393 topb_new,
394 botb_new,
395 ebwtFw,
396 ebwtBw,
397 sc,
398 minsc, // minimum score
399 maxpen, // maximum penalty
400 re,
401 df,
402 pf,
403 rs,
404 cs,
405 heap,
406 alsink,
407 met, // descent metrics
408 prm); // per-read metrics
409 }
410 if(matchSucc) {
411 // Calculate info about outgoing edges
412 recalcOutgoing(q, sc, minsc, maxpen, re, pf, rs, cs, prm);
413 if(!empty()) {
414 heap.insert(make_pair(out_.bestPri(), descid)); // Add to heap
415 }
416 }
417 return !empty() || bounceSucc;
418 }
419
420 /**
421 * Initialize a new descent beginning at the given root. Return false if
422 * the Descent has no outgoing edges (and can therefore have its memory
423 * freed), true otherwise.
424 */
init(const Read & q,TRootId rid,const Scoring & sc,TAlScore minsc,TAlScore maxpen,size_t descid,const Ebwt & ebwtFw,const Ebwt & ebwtBw,DescentRedundancyChecker & re,EFactory<Descent> & df,EFactory<DescentPos> & pf,const EList<DescentRoot> & rs,const EList<DescentConfig> & cs,EHeap<TDescentPair> & heap,DescentAlignmentSink & alsink,DescentMetrics & met,PerReadMetrics & prm)425 bool Descent::init(
426 const Read& q, // query
427 TRootId rid, // root id
428 const Scoring& sc, // scoring scheme
429 TAlScore minsc, // minimum score
430 TAlScore maxpen, // maximum penalty
431 size_t descid, // id of this Descent
432 const Ebwt& ebwtFw, // forward index
433 const Ebwt& ebwtBw, // mirror index
434 DescentRedundancyChecker& re, // redundancy checker
435 EFactory<Descent>& df, // Descent factory
436 EFactory<DescentPos>& pf, // DescentPos factory
437 const EList<DescentRoot>& rs, // roots
438 const EList<DescentConfig>& cs, // configs
439 EHeap<TDescentPair>& heap, // heap
440 DescentAlignmentSink& alsink, // alignment sink
441 DescentMetrics& met, // metrics
442 PerReadMetrics& prm) // per-read metrics
443 {
444 rid_ = rid;
445 al5pi_ = rs[rid].off5p;
446 al5pf_ = rs[rid].off5p;
447 assert_lt(al5pi_, q.length());
448 assert_lt(al5pf_, q.length());
449 l2r_ = rs[rid].l2r;
450 topf_ = botf_ = topb_ = botb_ = 0;
451 descid_ = descid;
452 parent_ = std::numeric_limits<size_t>::max();
453 pen_ = 0;
454 posid_ = std::numeric_limits<size_t>::max();
455 len_ = 0;
456 out_.clear();
457 edit_.reset();
458 lastRecalc_ = true;
459 gapadd_ = 0;
460 bool branches = false, hitEnd = false, done = false;
461 TIndexOffU topf_new = 0, botf_new = 0, topb_new = 0, botb_new = 0;
462 off5p_i_ = 0;
463 bool matchSucc = followMatches(
464 q,
465 sc,
466 ebwtFw,
467 ebwtBw,
468 re,
469 df,
470 pf,
471 rs,
472 cs,
473 heap,
474 alsink,
475 met,
476 prm,
477 branches,
478 hitEnd,
479 done,
480 off5p_i_,
481 topf_new,
482 botf_new,
483 topb_new,
484 botb_new);
485 bool bounceSucc = false;
486 if(matchSucc && hitEnd && !done) {
487 assert(topf_new > 0 || botf_new > 0);
488 bounceSucc = bounce(
489 q,
490 topf_new,
491 botf_new,
492 topb_new,
493 botb_new,
494 ebwtFw,
495 ebwtBw,
496 sc,
497 minsc, // minimum score
498 maxpen, // maximum penalty
499 re,
500 df,
501 pf,
502 rs,
503 cs,
504 heap,
505 alsink,
506 met, // descent metrics
507 prm); // per-read metrics
508 }
509 // Calculate info about outgoing edges
510 assert(empty());
511 if(matchSucc) {
512 recalcOutgoing(q, sc, minsc, maxpen, re, pf, rs, cs, prm);
513 if(!empty()) {
514 heap.insert(make_pair(out_.bestPri(), descid)); // Add to heap
515 }
516 }
517 return !empty() || bounceSucc;
518 }
519
520 /**
521 * Recalculate our summary of the outgoing edges from this descent. When
522 * deciding what outgoing edges are legal, we abide by constraints.
523 * Typically, they limit the total of the penalties accumulated so far, as
524 * a function of distance from the search root. E.g. a constraint might
525 * disallow any gaps or mismatches within 20 ply of the search root, then
526 * allow 1 mismatch within 30 ply, then allow up to 1 mismatch or 1 gap
527 * within 40 ply, etc.
528 *
529 * Return the total number of valid outgoing edges found.
530 *
531 * TODO: Eliminate outgoing gap edges that are redundant with others owing to
532 * the DNA sequence and the fact that we don't care to distinguish among
533 * "equivalent" homopolymer extensinos and retractions.
534 */
recalcOutgoing(const Read & q,const Scoring & sc,TAlScore minsc,TAlScore maxpen,DescentRedundancyChecker & re,EFactory<DescentPos> & pf,const EList<DescentRoot> & rs,const EList<DescentConfig> & cs,PerReadMetrics & prm)535 size_t Descent::recalcOutgoing(
536 const Read& q, // query string
537 const Scoring& sc, // scoring scheme
538 TAlScore minsc, // minimum score
539 TAlScore maxpen, // maximum penalty
540 DescentRedundancyChecker& re, // redundancy checker
541 EFactory<DescentPos>& pf, // factory with DescentPoss
542 const EList<DescentRoot>& rs, // roots
543 const EList<DescentConfig>& cs, // configs
544 PerReadMetrics& prm) // per-read metrics
545 {
546 assert_eq(botf_ - topf_, botb_ - topb_);
547 assert(out_.empty());
548 assert(repOk(&q));
549 // Get initial 5' and 3' offsets
550 bool fw = rs[rid_].fw;
551 float rootpri = rs[rid_].pri;
552 bool toward3p = (l2r_ == fw);
553 size_t off5p = off5p_i_;
554 assert_geq(al5pf_, al5pi_);
555 size_t off3p = q.length() - off5p - 1;
556 // By "depth" we essentially mean the number of characters already aligned
557 size_t depth, extrai = 0, extraf = 0;
558 size_t cur5pi = al5pi_, cur5pf = al5pf_;
559 if(toward3p) {
560 // Toward 3'
561 cur5pf = off5p;
562 depth = off5p - al5pi_;
563 // Failed to match out to the end?
564 if(al5pf_ < q.length() - 1) {
565 extraf = 1; // extra
566 }
567 } else {
568 // Toward 5'
569 cur5pi = off5p;
570 depth = al5pf_ - off5p;
571 if(al5pi_ > 0) {
572 extrai = 1;
573 }
574 }
575 // Get gap penalties
576 TScore pen_rdg_ex = sc.readGapExtend(), pen_rfg_ex = sc.refGapExtend();
577 TScore pen_rdg_op = sc.readGapOpen(), pen_rfg_op = sc.refGapOpen();
578 // Top and bot in the direction of the descent
579 TIndexOffU top = l2r_ ? topb_ : topf_;
580 TIndexOffU bot = l2r_ ? botb_ : botf_;
581 // Top and bot in the opposite direction
582 TIndexOffU topp = l2r_ ? topf_ : topb_;
583 TIndexOffU botp = l2r_ ? botf_ : botb_;
584 assert_eq(botp - topp, bot - top);
585 DescentEdge edge;
586 size_t nout = 0;
587 // Enumerate all outgoing edges, starting at the root and going out
588 size_t d = posid_;
589 // At first glance, we might think we should be bounded by al5pi_ and
590 // al5pf_, but those delimit the positions that matched between reference
591 // and read. If we hit a position that failed to match as part of
592 // followMatches, then we also want to evaluate ways of leaving that
593 // position, which adds one more position to viist.
594 while(off5p >= al5pi_ - extrai && off5p <= al5pf_ + extraf) {
595 assert_lt(off5p, q.length());
596 assert_lt(off3p, q.length());
597 TScore maxpend = cs[rid_].cons.get(depth, q.length(), maxpen);
598 assert(depth > 0 || maxpend == 0);
599 assert_geq(maxpend, pen_); // can't have already exceeded max penalty
600 TScore diff = maxpend - pen_; // room we have left
601 // Get pointer to SA ranges in the direction of descent
602 const TIndexOffU *t = l2r_ ? pf[d].topb : pf[d].topf;
603 const TIndexOffU *b = l2r_ ? pf[d].botb : pf[d].botf;
604 const TIndexOffU *tp = l2r_ ? pf[d].topf : pf[d].topb;
605 const TIndexOffU *bp = l2r_ ? pf[d].botf : pf[d].botb;
606 assert_eq(pf[d].botf - pf[d].topf, pf[d].botb - pf[d].topb);
607 // What are the read char / quality?
608 std::pair<int, int> p = q.get(off5p, fw);
609 int c = p.first;
610 assert_range(0, 4, c);
611 // Only entertain edits if there is at least one type of edit left and
612 // there is some penalty budget left
613 if(!pf[d].flags.exhausted() && diff > 0) {
614 // What would the penalty be if we mismatched at this position?
615 // This includes the case where the mismatch is for an N in the
616 // read.
617 int qq = p.second;
618 assert_geq(qq, 0);
619 TScore pen_mm = sc.mm(c, qq);
620 if(pen_mm <= diff) {
621 for(int j = 0; j < 4; j++) {
622 if(j == c) continue; // Match, not mismatch
623 if(b[j] <= t[j]) {
624 continue; // No outgoing edge with this nucleotide
625 }
626 if(!pf[d].flags.mmExplore(j)) {
627 continue; // Already been explored
628 }
629 TIndexOffU topf = pf[d].topf[j], botf = pf[d].botf[j];
630 ASSERT_ONLY(TIndexOffU topb = pf[d].topb[j], botb = pf[d].botb[j]);
631 if(re.contains(fw, l2r_, cur5pi, cur5pf, cur5pf - cur5pi + 1 + gapadd_, topf, botf, pen_ + pen_mm)) {
632 prm.nRedSkip++;
633 continue; // Redundant with a path already explored
634 }
635 prm.nRedFail++;
636 TIndexOffU width = b[j] - t[j];
637 Edit edit((uint32_t)off5p, (int)("ACGTN"[j]), (int)("ACGTN"[c]), EDIT_TYPE_MM);
638 DescentPriority pri(pen_ + pen_mm, depth, width, rootpri);
639 assert(topf != 0 || botf != 0);
640 assert(topb != 0 || botb != 0);
641 assert_eq(botb - topb, botf - topf);
642 edge.init(edit, off5p, pri, d
643 #ifndef NDEBUG
644 , d, topf, botf, topb, botb
645 #endif
646 );
647 out_.update(edge);
648 nout++;
649 }
650 }
651 bool gapsAllowed = (off5p >= (size_t)sc.gapbar && off3p >= (size_t)sc.gapbar);
652 if(gapsAllowed) {
653 assert_gt(depth, 0);
654 // An easy redundancy check is: if all ways of proceeding are
655 // matches, then there's no need to entertain gaps here.
656 // Shifting the gap one position further downstream is
657 // guarnteed not to be worse.
658 size_t totwidth = (b[0] - t[0]) +
659 (b[1] - t[1]) +
660 (b[2] - t[2]) +
661 (b[3] - t[3]);
662 assert(c > 3 || b[c] - t[c] <= totwidth);
663 bool allmatch = c < 4 && (totwidth == (b[c] - t[c]));
664 bool rdex = false, rfex = false;
665 size_t cur5pi_i = cur5pi, cur5pf_i = cur5pf;
666 if(toward3p) {
667 cur5pf_i--;
668 } else {
669 cur5pi_i++;
670 }
671 if(off5p == off5p_i_ && edit_.inited()) {
672 // If we're at the root of the descent, and the descent
673 // branched on a gap, then this could be scored as an
674 // extension of that gap.
675 if(pen_rdg_ex <= diff && edit_.isReadGap()) {
676 // Extension of a read gap
677 rdex = true;
678 for(int j = 0; j < 4; j++) {
679 if(b[j] <= t[j]) {
680 continue; // No outgoing edge with this nucleotide
681 }
682 if(!pf[d].flags.rdgExplore(j)) {
683 continue; // Already been explored
684 }
685 TIndexOffU topf = pf[d].topf[j], botf = pf[d].botf[j];
686 ASSERT_ONLY(TIndexOffU topb = pf[d].topb[j], botb = pf[d].botb[j]);
687 assert(topf != 0 || botf != 0);
688 assert(topb != 0 || botb != 0);
689 if(re.contains(fw, l2r_, cur5pi_i, cur5pf_i, cur5pf - cur5pi + 1 + gapadd_, topf, botf, pen_ + pen_rdg_ex)) {
690 prm.nRedSkip++;
691 continue; // Redundant with a path already explored
692 }
693 prm.nRedFail++;
694 TIndexOffU width = b[j] - t[j];
695 // off5p holds the offset from the 5' of the next
696 // character we were trying to align when we decided to
697 // introduce a read gap (before that character). If we
698 // were walking toward the 5' end, we need to increment
699 // by 1.
700 uint32_t off = (uint32_t)off5p + (toward3p ? 0 : 1);
701 Edit edit(off, (int)("ACGTN"[j]), '-', EDIT_TYPE_READ_GAP);
702 assert(edit.pos2 != std::numeric_limits<uint32_t>::max());
703 edit.pos2 = edit_.pos2 + (toward3p ? 1 : -1);
704 DescentPriority pri(pen_ + pen_rdg_ex, depth, width, rootpri);
705 assert(topf != 0 || botf != 0);
706 assert(topb != 0 || botb != 0);
707 assert_eq(botb - topb, botf - topf);
708 edge.init(edit, off5p, pri, d
709 #ifndef NDEBUG
710 , d,
711 topf, botf, topb, botb
712 #endif
713 );
714 out_.update(edge);
715 nout++;
716 }
717 }
718 if(pen_rfg_ex <= diff && edit_.isRefGap()) {
719 // Extension of a reference gap
720 rfex = true;
721 if(pf[d].flags.rfgExplore()) {
722 TIndexOffU topf = l2r_ ? topp : top;
723 TIndexOffU botf = l2r_ ? botp : bot;
724 ASSERT_ONLY(TIndexOffU topb = l2r_ ? top : topp);
725 ASSERT_ONLY(TIndexOffU botb = l2r_ ? bot : botp);
726 assert(topf != 0 || botf != 0);
727 assert(topb != 0 || botb != 0);
728 size_t nrefal = cur5pf - cur5pi + gapadd_;
729 if(!re.contains(fw, l2r_, cur5pi, cur5pf, nrefal, topf, botf, pen_ + pen_rfg_ex)) {
730 TIndexOffU width = bot - top;
731 Edit edit((uint32_t)off5p, '-', (int)("ACGTN"[c]), EDIT_TYPE_REF_GAP);
732 DescentPriority pri(pen_ + pen_rfg_ex, depth, width, rootpri);
733 assert(topf != 0 || botf != 0);
734 assert(topb != 0 || botb != 0);
735 edge.init(edit, off5p, pri, d
736 #ifndef NDEBUG
737 // It's a little unclear what the depth ought to be.
738 // Is it the depth we were at when we did the ref
739 // gap? I.e. the depth of the flags where rfgExplore()
740 // returned true? Or is it the depth where we can
741 // retrieve the appropriate top/bot? We make it the
742 // latter, might wrap around, indicating we should get
743 // top/bot from the descent's topf_, ... fields.
744 , (d == posid_) ? std::numeric_limits<size_t>::max() : (d - 1),
745 topf, botf, topb, botb
746 #endif
747 );
748 out_.update(edge);
749 nout++;
750 prm.nRedFail++;
751 } else {
752 prm.nRedSkip++;
753 }
754 }
755 }
756 }
757 if(!allmatch && pen_rdg_op <= diff && !rdex) {
758 // Opening a new read gap
759 for(int j = 0; j < 4; j++) {
760 if(b[j] <= t[j]) {
761 continue; // No outgoing edge with this nucleotide
762 }
763 if(!pf[d].flags.rdgExplore(j)) {
764 continue; // Already been explored
765 }
766 TIndexOffU topf = pf[d].topf[j], botf = pf[d].botf[j];
767 ASSERT_ONLY(TIndexOffU topb = pf[d].topb[j], botb = pf[d].botb[j]);
768 assert(topf != 0 || botf != 0);
769 assert(topb != 0 || botb != 0);
770 if(re.contains(fw, l2r_, cur5pi_i, cur5pf_i, cur5pf - cur5pi + 1 + gapadd_, topf, botf, pen_ + pen_rdg_op)) {
771 prm.nRedSkip++;
772 continue; // Redundant with a path already explored
773 }
774 prm.nRedFail++;
775 TIndexOffU width = b[j] - t[j];
776 // off5p holds the offset from the 5' of the next
777 // character we were trying to align when we decided to
778 // introduce a read gap (before that character). If we
779 // were walking toward the 5' end, we need to increment
780 // by 1.
781 uint32_t off = (uint32_t)off5p + (toward3p ? 0 : 1);
782 Edit edit(off, (int)("ACGTN"[j]), '-', EDIT_TYPE_READ_GAP);
783 assert(edit.pos2 != std::numeric_limits<uint32_t>::max());
784 DescentPriority pri(pen_ + pen_rdg_op, depth, width, rootpri);
785 assert(topf != 0 || botf != 0);
786 assert(topb != 0 || botb != 0);
787 assert_eq(botb - topb, botf - topf);
788 edge.init(edit, off5p, pri, d
789 #ifndef NDEBUG
790 , d, topf, botf, topb, botb
791 #endif
792 );
793 out_.update(edge);
794 nout++;
795 }
796 }
797 if(!allmatch && pen_rfg_op <= diff && !rfex) {
798 // Opening a new reference gap
799 if(pf[d].flags.rfgExplore()) {
800 TIndexOffU topf = l2r_ ? topp : top;
801 TIndexOffU botf = l2r_ ? botp : bot;
802 ASSERT_ONLY(TIndexOffU topb = l2r_ ? top : topp);
803 ASSERT_ONLY(TIndexOffU botb = l2r_ ? bot : botp);
804 assert(topf != 0 || botf != 0);
805 assert(topb != 0 || botb != 0);
806 size_t nrefal = cur5pf - cur5pi + gapadd_;
807 if(!re.contains(fw, l2r_, cur5pi, cur5pf, nrefal, topf, botf, pen_ + pen_rfg_op)) {
808 TIndexOffU width = bot - top;
809 Edit edit((uint32_t)off5p, '-', (int)("ACGTN"[c]), EDIT_TYPE_REF_GAP);
810 DescentPriority pri(pen_ + pen_rfg_op, depth, width, rootpri);
811 assert(topf != 0 || botf != 0);
812 assert(topb != 0 || botb != 0);
813 edge.init(edit, off5p, pri, d
814 #ifndef NDEBUG
815 // It's a little unclear what the depth ought to be.
816 // Is it the depth we were at when we did the ref
817 // gap? I.e. the depth of the flags where rfgExplore()
818 // returned true? Or is it the depth where we can
819 // retrieve the appropriate top/bot? We make it the
820 // latter, might wrap around, indicating we should get
821 // top/bot from the descent's topf_, ... fields.
822 , (d == posid_) ? std::numeric_limits<size_t>::max() : (d - 1),
823 topf, botf, topb, botb
824 #endif
825 );
826 out_.update(edge);
827 nout++;
828 prm.nRedFail++;
829 } else {
830 prm.nRedSkip++;
831 }
832 }
833 }
834 }
835 }
836 // Update off5p, off3p, depth
837 d++;
838 depth++;
839 assert_leq(depth, al5pf_ - al5pi_ + 2);
840 if(toward3p) {
841 if(off3p == 0) {
842 break;
843 }
844 off5p++;
845 off3p--;
846 cur5pf++;
847 } else {
848 if(off5p == 0) {
849 break;
850 }
851 off3p++;
852 off5p--;
853 cur5pi--;
854 }
855 // Update top and bot
856 if(off5p >= al5pi_ - extrai && off5p <= al5pf_ + extraf) {
857 assert_range(0, 3, c);
858 top = t[c], topp = tp[c];
859 bot = b[c], botp = bp[c];
860 assert_eq(bot-top, botp-topp);
861 }
862 }
863 lastRecalc_ = (nout <= 5);
864 out_.best1.updateFlags(pf);
865 out_.best2.updateFlags(pf);
866 out_.best3.updateFlags(pf);
867 out_.best4.updateFlags(pf);
868 out_.best5.updateFlags(pf);
869 return nout;
870 }
871
print(std::ostream * os,const char * prefix,const Read & q,size_t trimLf,size_t trimRg,bool fw,const EList<Edit> & edits,size_t ei,size_t en,BTDnaString & rf) const872 void Descent::print(
873 std::ostream *os,
874 const char *prefix,
875 const Read& q,
876 size_t trimLf,
877 size_t trimRg,
878 bool fw,
879 const EList<Edit>& edits,
880 size_t ei,
881 size_t en,
882 BTDnaString& rf) const
883 {
884 const BTDnaString& read = fw ? q.patFw : q.patRc;
885 size_t eidx = ei;
886 if(os != NULL) { *os << prefix; }
887 // Print read
888 for(size_t i = 0; i < read.length(); i++) {
889 if(i < trimLf || i >= read.length() - trimRg) {
890 if(os != NULL) { *os << (char)tolower(read.toChar(i)); }
891 continue;
892 }
893 bool del = false, mm = false;
894 while(eidx < ei + en && edits[eidx].pos == i) {
895 if(edits[eidx].isReadGap()) {
896 if(os != NULL) { *os << '-'; }
897 } else if(edits[eidx].isRefGap()) {
898 del = true;
899 assert_eq((int)edits[eidx].qchr, read.toChar(i));
900 if(os != NULL) { *os << read.toChar(i); }
901 } else {
902 mm = true;
903 assert(edits[eidx].isMismatch());
904 assert_eq((int)edits[eidx].qchr, read.toChar(i));
905 if(os != NULL) { *os << (char)edits[eidx].qchr; }
906 }
907 eidx++;
908 }
909 if(!del && !mm) {
910 // Print read character
911 if(os != NULL) { *os << read.toChar(i); }
912 }
913 }
914 if(os != NULL) {
915 *os << endl;
916 *os << prefix;
917 }
918 eidx = ei;
919 // Print match bars
920 for(size_t i = 0; i < read.length(); i++) {
921 if(i < trimLf || i >= read.length() - trimRg) {
922 if(os != NULL) { *os << ' '; }
923 continue;
924 }
925 bool del = false, mm = false;
926 while(eidx < ei + en && edits[eidx].pos == i) {
927 if(edits[eidx].isReadGap()) {
928 if(os != NULL) { *os << ' '; }
929 } else if(edits[eidx].isRefGap()) {
930 del = true;
931 if(os != NULL) { *os << ' '; }
932 } else {
933 mm = true;
934 assert(edits[eidx].isMismatch());
935 if(os != NULL) { *os << ' '; }
936 }
937 eidx++;
938 }
939 if(!del && !mm && os != NULL) { *os << '|'; }
940 }
941 if(os != NULL) {
942 *os << endl;
943 *os << prefix;
944 }
945 eidx = ei;
946 // Print reference
947 for(size_t i = 0; i < read.length(); i++) {
948 if(i < trimLf || i >= read.length() - trimRg) {
949 if(os != NULL) { *os << ' '; }
950 continue;
951 }
952 bool del = false, mm = false;
953 while(eidx < ei + en && edits[eidx].pos == i) {
954 if(edits[eidx].isReadGap()) {
955 rf.appendChar((char)edits[eidx].chr);
956 if(os != NULL) { *os << (char)edits[eidx].chr; }
957 } else if(edits[eidx].isRefGap()) {
958 del = true;
959 if(os != NULL) { *os << '-'; }
960 } else {
961 mm = true;
962 assert(edits[eidx].isMismatch());
963 rf.appendChar((char)edits[eidx].chr);
964 if(os != NULL) { *os << (char)edits[eidx].chr; }
965 }
966 eidx++;
967 }
968 if(!del && !mm) {
969 rf.append(read[i]);
970 if(os != NULL) { *os << read.toChar(i); }
971 }
972 }
973 if(os != NULL) { *os << endl; }
974 }
975
976 /**
977 * Create a new Descent
978 */
bounce(const Read & q,TIndexOffU topf,TIndexOffU botf,TIndexOffU topb,TIndexOffU botb,const Ebwt & ebwtFw,const Ebwt & ebwtBw,const Scoring & sc,TAlScore minsc,TAlScore maxpen,DescentRedundancyChecker & re,EFactory<Descent> & df,EFactory<DescentPos> & pf,const EList<DescentRoot> & rs,const EList<DescentConfig> & cs,EHeap<TDescentPair> & heap,DescentAlignmentSink & alsink,DescentMetrics & met,PerReadMetrics & prm)979 bool Descent::bounce(
980 const Read& q, // query string
981 TIndexOffU topf, // SA range top in fw index
982 TIndexOffU botf, // SA range bottom in fw index
983 TIndexOffU topb, // SA range top in bw index
984 TIndexOffU botb, // SA range bottom in bw index
985 const Ebwt& ebwtFw, // forward index
986 const Ebwt& ebwtBw, // mirror index
987 const Scoring& sc, // scoring scheme
988 TAlScore minsc, // minimum score
989 TAlScore maxpen, // maximum penalty
990 DescentRedundancyChecker& re, // redundancy checker
991 EFactory<Descent>& df, // factory with Descent
992 EFactory<DescentPos>& pf, // factory with DescentPoss
993 const EList<DescentRoot>& rs, // roots
994 const EList<DescentConfig>& cs, // configs
995 EHeap<TDescentPair>& heap, // heap of descents
996 DescentAlignmentSink& alsink, // alignment sink
997 DescentMetrics& met, // metrics
998 PerReadMetrics& prm) // per-read metrics
999 {
1000 assert_gt(botf, topf);
1001 assert(al5pi_ == 0 || al5pf_ == q.length()-1);
1002 assert(!(al5pi_ == 0 && al5pf_ == q.length()-1));
1003 size_t dfsz = df.size();
1004 size_t pfsz = pf.size();
1005 TDescentId id = df.alloc();
1006 Edit e_null;
1007 assert(!e_null.inited());
1008 // Follow matches
1009 bool succ = df[id].init(
1010 q, // query
1011 rid_, // root id
1012 sc, // scoring scheme
1013 minsc, // minimum score
1014 maxpen, // maximum penalty
1015 al5pi_, // new near-5' extreme
1016 al5pf_, // new far-5' extreme
1017 topf, // SA range top in FW index
1018 botf, // SA range bottom in FW index
1019 topb, // SA range top in BW index
1020 botb, // SA range bottom in BW index
1021 !l2r_, // direction this descent will go in; opposite from parent
1022 id, // my ID
1023 descid_, // parent ID
1024 pen_, // total penalties so far - same as parent
1025 e_null, // edit for incoming edge; uninitialized if bounced
1026 ebwtFw, // forward index
1027 ebwtBw, // mirror index
1028 re, // redundancy checker
1029 df, // Descent factory
1030 pf, // DescentPos factory
1031 rs, // DescentRoot list
1032 cs, // DescentConfig list
1033 heap, // heap
1034 alsink, // alignment sink
1035 met, // metrics
1036 prm); // per-read metrics
1037 if(!succ) {
1038 // Reclaim memory we had used for this descent and its DescentPos info
1039 df.resize(dfsz);
1040 pf.resize(pfsz);
1041 }
1042 return succ;
1043 }
1044
1045 /**
1046 * Take the best outgoing edge and place it in the heap. When deciding what
1047 * outgoing edges exist, abide by constraints in DescentConfig. These
1048 * constraints limit total penalty accumulated so far versus distance from
1049 * search root. E.g. a constraint might disallow any gaps or mismatches within
1050 * 20 ply of the root, then allow 1 mismatch within 30 ply, 1 mismatch or 1 gap
1051 * within 40 ply, etc.
1052 */
followBestOutgoing(const Read & q,const Ebwt & ebwtFw,const Ebwt & ebwtBw,const Scoring & sc,TAlScore minsc,TAlScore maxpen,DescentRedundancyChecker & re,EFactory<Descent> & df,EFactory<DescentPos> & pf,const EList<DescentRoot> & rs,const EList<DescentConfig> & cs,EHeap<TDescentPair> & heap,DescentAlignmentSink & alsink,DescentMetrics & met,PerReadMetrics & prm)1053 void Descent::followBestOutgoing(
1054 const Read& q, // query string
1055 const Ebwt& ebwtFw, // forward index
1056 const Ebwt& ebwtBw, // mirror index
1057 const Scoring& sc, // scoring scheme
1058 TAlScore minsc, // minimum score
1059 TAlScore maxpen, // maximum penalty
1060 DescentRedundancyChecker& re, // redundancy checker
1061 EFactory<Descent>& df, // factory with Descent
1062 EFactory<DescentPos>& pf, // factory with DescentPoss
1063 const EList<DescentRoot>& rs, // roots
1064 const EList<DescentConfig>& cs, // configs
1065 EHeap<TDescentPair>& heap, // heap of descents
1066 DescentAlignmentSink& alsink, // alignment sink
1067 DescentMetrics& met, // metrics
1068 PerReadMetrics& prm) // per-read metrics
1069 {
1070 // We assume this descent has been popped off the heap. We'll re-add it if
1071 // it hasn't been exhausted.
1072 assert(q.repOk());
1073 assert(!empty());
1074 assert(!out_.empty());
1075 while(!out_.empty()) {
1076 DescentPriority best = out_.bestPri();
1077 DescentEdge e = out_.rotate();
1078 TReadOff al5pi_new = al5pi_, al5pf_new = al5pf_;
1079 bool fw = rs[rid_].fw;
1080 bool toward3p = (l2r_ == fw);
1081 TReadOff edoff = e.off5p; // 5' offset of edit
1082 assert_leq(edoff, al5pf_ + 1);
1083 assert_geq(edoff + 1, al5pi_);
1084 if(out_.empty()) {
1085 if(!lastRecalc_) {
1086 // This might allocate new Descents
1087 recalcOutgoing(q, sc, minsc, maxpen, re, pf, rs, cs, prm);
1088 if(empty()) {
1089 // Could happen, since some outgoing edges may have become
1090 // redundant in the meantime.
1091 break;
1092 }
1093 } else {
1094 assert(empty());
1095 }
1096 }
1097 TReadOff doff; // edit's offset into this descent
1098 int chr = asc2dna[e.e.chr];
1099 // hitEnd is set to true iff this edit pushes us to the extreme 5' or 3'
1100 // end of the alignment
1101 bool hitEnd = false;
1102 // done is set to true iff this edit aligns the only remaining character of
1103 // the read
1104 bool done = false;
1105 if(toward3p) {
1106 // The 3' extreme of the new Descent is further in (away from the 3'
1107 // end) than the parent's.
1108 al5pf_new = doff = edoff;
1109 if(e.e.isReadGap()) {
1110 // We didn't actually consume the read character at 'edoff', so
1111 // retract al5pf_new by one position. This doesn't effect the
1112 // "depth" (doff) of the SA range we took, though.
1113 assert_gt(al5pf_new, 0);
1114 al5pf_new--;
1115 }
1116 assert_lt(al5pf_new, q.length());
1117 hitEnd = (al5pf_new == q.length() - 1);
1118 done = (hitEnd && al5pi_new == 0);
1119 assert_geq(doff, off5p_i_);
1120 doff = doff - off5p_i_;
1121 assert_leq(doff, len_);
1122 } else {
1123 // The 5' extreme of the new Descent is further in (away from the 5'
1124 // end) than the parent's.
1125 al5pi_new = doff = edoff;
1126 if(e.e.isReadGap()) {
1127 // We didn't actually consume the read character at 'edoff', so
1128 // move al5pi_new closer to the 3' end by one position. This
1129 // doesn't effect the "depth" (doff) of the SA range we took,
1130 // though.
1131 al5pi_new++;
1132 }
1133 hitEnd = (al5pi_new == 0);
1134 done = (hitEnd && al5pf_new == q.length() - 1);
1135 assert_geq(off5p_i_, doff);
1136 doff = off5p_i_ - doff;
1137 assert_leq(doff, len_);
1138 }
1139 // Check if this is redundant with an already-explored path
1140 bool l2r = l2r_; // gets overridden if we bounce
1141 if(!done && hitEnd) {
1142 // Alignment finsihed extending in one direction
1143 l2r = !l2r;
1144 }
1145 size_t dfsz = df.size();
1146 size_t pfsz = pf.size();
1147 TIndexOffU topf, botf, topb, botb;
1148 size_t d = posid_ + doff;
1149 if(e.e.isRefGap()) {
1150 d--; // might underflow
1151 if(doff == 0) {
1152 topf = topf_;
1153 botf = botf_;
1154 topb = topb_;
1155 botb = botb_;
1156 d = std::numeric_limits<size_t>::max();
1157 assert_eq(botf-topf, botb-topb);
1158 } else {
1159 assert_gt(al5pf_new, 0);
1160 assert_gt(d, 0);
1161 chr = pf[d].c;
1162 assert(pf[d].inited());
1163 assert_range(0, 3, chr);
1164 topf = pf[d].topf[chr];
1165 botf = pf[d].botf[chr];
1166 topb = pf[d].topb[chr];
1167 botb = pf[d].botb[chr];
1168 assert_eq(botf-topf, botb-topb);
1169 }
1170 } else {
1171 // A read gap or a mismatch
1172 assert(pf[d].inited());
1173 topf = pf[d].topf[chr];
1174 botf = pf[d].botf[chr];
1175 topb = pf[d].topb[chr];
1176 botb = pf[d].botb[chr];
1177 assert_eq(botf-topf, botb-topb);
1178 }
1179 assert_eq(d, e.d);
1180 assert_eq(topf, e.topf);
1181 assert_eq(botf, e.botf);
1182 assert_eq(topb, e.topb);
1183 assert_eq(botb, e.botb);
1184 if(done) {
1185 // Aligned the entire read end-to-end. Presumably there's no need to
1186 // create a new Descent object. We just report the alignment.
1187 alsink.reportAlignment(
1188 q, // query
1189 ebwtFw, // forward index
1190 ebwtBw, // backward index
1191 topf, // top of SA range in forward index
1192 botf, // bottom of SA range in forward index
1193 topb, // top of SA range in backward index
1194 botb, // bottom of SA range in backward index
1195 descid_, // Descent at the leaf
1196 rid_, // root id
1197 e.e, // extra edit, if necessary
1198 best.pen, // penalty
1199 df, // factory with Descent
1200 pf, // factory with DescentPoss
1201 rs, // roots
1202 cs); // configs
1203 assert(alsink.repOk());
1204 return;
1205 }
1206 assert(al5pi_new != 0 || al5pf_new != q.length() - 1);
1207 TDescentId id = df.alloc();
1208 bool succ = df[id].init(
1209 q, // query
1210 rid_, // root id
1211 sc, // scoring scheme
1212 minsc, // minimum score
1213 maxpen, // maximum penalty
1214 al5pi_new, // new near-5' extreme
1215 al5pf_new, // new far-5' extreme
1216 topf, // SA range top in FW index
1217 botf, // SA range bottom in FW index
1218 topb, // SA range top in BW index
1219 botb, // SA range bottom in BW index
1220 l2r, // direction this descent will go in
1221 id, // my ID
1222 descid_, // parent ID
1223 best.pen, // total penalties so far
1224 e.e, // edit for incoming edge; uninitialized if bounced
1225 ebwtFw, // forward index
1226 ebwtBw, // mirror index
1227 re, // redundancy checker
1228 df, // Descent factory
1229 pf, // DescentPos factory
1230 rs, // DescentRoot list
1231 cs, // DescentConfig list
1232 heap, // heap
1233 alsink, // alignment sink
1234 met, // metrics
1235 prm); // per-read metrics
1236 if(!succ) {
1237 // Reclaim memory we had used for this descent and its DescentPos info
1238 df.resize(dfsz);
1239 pf.resize(pfsz);
1240 }
1241 break;
1242 }
1243 if(!empty()) {
1244 // Re-insert this Descent with its new priority
1245 heap.insert(make_pair(out_.bestPri(), descid_));
1246 }
1247 }
1248
1249 /**
1250 * Given the forward and backward indexes, and given topf/botf/topb/botb, get
1251 * tloc, bloc ready for the next step.
1252 */
nextLocsBi(const Ebwt & ebwtFw,const Ebwt & ebwtBw,SideLocus & tloc,SideLocus & bloc,TIndexOffU topf,TIndexOffU botf,TIndexOffU topb,TIndexOffU botb)1253 void Descent::nextLocsBi(
1254 const Ebwt& ebwtFw, // forward index
1255 const Ebwt& ebwtBw, // mirror index
1256 SideLocus& tloc, // top locus
1257 SideLocus& bloc, // bot locus
1258 TIndexOffU topf, // top in BWT
1259 TIndexOffU botf, // bot in BWT
1260 TIndexOffU topb, // top in BWT'
1261 TIndexOffU botb) // bot in BWT'
1262 {
1263 assert_gt(botf, 0);
1264 // Which direction are we going in next?
1265 if(l2r_) {
1266 // Left to right; use BWT'
1267 if(botb - topb == 1) {
1268 // Already down to 1 row; just init top locus
1269 tloc.initFromRow(topb, ebwtBw.eh(), ebwtBw.ebwt());
1270 bloc.invalidate();
1271 } else {
1272 SideLocus::initFromTopBot(
1273 topb, botb, ebwtBw.eh(), ebwtBw.ebwt(), tloc, bloc);
1274 assert(bloc.valid());
1275 }
1276 } else {
1277 // Right to left; use BWT
1278 if(botf - topf == 1) {
1279 // Already down to 1 row; just init top locus
1280 tloc.initFromRow(topf, ebwtFw.eh(), ebwtFw.ebwt());
1281 bloc.invalidate();
1282 } else {
1283 SideLocus::initFromTopBot(
1284 topf, botf, ebwtFw.eh(), ebwtFw.ebwt(), tloc, bloc);
1285 assert(bloc.valid());
1286 }
1287 }
1288 // Check if we should update the tracker with this refinement
1289 assert(botf - topf == 1 || bloc.valid());
1290 assert(botf - topf > 1 || !bloc.valid());
1291 }
1292
1293 /**
1294 * Advance this descent by following read matches as far as possible.
1295 *
1296 * This routine doesn't have to consider the whole gamut of constraints on
1297 * which outgoing edges can be followed. If it is a root descent, it does have
1298 * to know how deep the no-edit constraint goes, though, so we can decide
1299 * whether using the ftab would potentially jump over relevant branch points.
1300 * Apart from that, though, we simply proceed as far as it can go by matching
1301 * characters in the query, irrespective of the constraints.
1302 * recalcOutgoing(...) and followBestOutgoing(...) do have to consider these
1303 * constraints, though.
1304 *
1305 * Conceptually, as we make descending steps, we have:
1306 * 1. Before each step, a single range indicating how we departed the previous
1307 * step
1308 * 2. As part of each step, a quad of ranges indicating what range would result
1309 * if we proceeded on an a, c, g ot t
1310 *
1311 * Return true iff it is possible to branch from this descent. If we haven't
1312 * exceeded the no-branch depth.
1313 */
followMatches(const Read & q,const Scoring & sc,const Ebwt & ebwtFw,const Ebwt & ebwtBw,DescentRedundancyChecker & re,EFactory<Descent> & df,EFactory<DescentPos> & pf,const EList<DescentRoot> & rs,const EList<DescentConfig> & cs,EHeap<TDescentPair> & heap,DescentAlignmentSink & alsink,DescentMetrics & met,PerReadMetrics & prm,bool & branches,bool & hitEnd,bool & done,TReadOff & off5p_i,TIndexOffU & topf_bounce,TIndexOffU & botf_bounce,TIndexOffU & topb_bounce,TIndexOffU & botb_bounce)1314 bool Descent::followMatches(
1315 const Read& q, // query string
1316 const Scoring& sc, // scoring scheme
1317 const Ebwt& ebwtFw, // forward index
1318 const Ebwt& ebwtBw, // mirror index
1319 DescentRedundancyChecker& re, // redundancy checker
1320 EFactory<Descent>& df, // Descent factory
1321 EFactory<DescentPos>& pf, // DescentPos factory
1322 const EList<DescentRoot>& rs, // roots
1323 const EList<DescentConfig>& cs, // configs
1324 EHeap<TDescentPair>& heap, // heap
1325 DescentAlignmentSink& alsink, // alignment sink
1326 DescentMetrics& met, // metrics
1327 PerReadMetrics& prm, // per-read metrics
1328 bool& branches, // out: true -> there are > 0 ways to branch
1329 bool& hitEnd, // out: true -> hit read end with non-empty range
1330 bool& done, // out: true -> we made a full alignment
1331 TReadOff& off5p_i, // out: initial 5' offset
1332 TIndexOffU& topf_bounce, // out: top of SA range for fw idx for bounce
1333 TIndexOffU& botf_bounce, // out: bot of SA range for fw idx for bounce
1334 TIndexOffU& topb_bounce, // out: top of SA range for bw idx for bounce
1335 TIndexOffU& botb_bounce) // out: bot of SA range for bw idx for bounce
1336 {
1337 // TODO: make these full-fledged parameters
1338 size_t nobranchDepth = 20;
1339 bool stopOnN = true;
1340 assert(q.repOk());
1341 assert(repOk(&q));
1342 assert_eq(ebwtFw.eh().ftabChars(), ebwtBw.eh().ftabChars());
1343 #ifndef NDEBUG
1344 for(int i = 0; i < 4; i++) {
1345 assert_eq(ebwtFw.fchr()[i], ebwtBw.fchr()[i]);
1346 }
1347 #endif
1348 SideLocus tloc, bloc;
1349 TIndexOffU topf = topf_, botf = botf_, topb = topb_, botb = botb_;
1350 bool fw = rs[rid_].fw;
1351 bool toward3p;
1352 size_t off5p;
1353 assert_lt(al5pi_, q.length());
1354 assert_lt(al5pf_, q.length());
1355 while(true) {
1356 toward3p = (l2r_ == fw);
1357 assert_geq(al5pf_, al5pi_);
1358 assert(al5pi_ != 0 || al5pf_ != q.length() - 1);
1359 if(toward3p) {
1360 if(al5pf_ == q.length()-1) {
1361 l2r_ = !l2r_;
1362 continue;
1363 }
1364 if(al5pi_ == al5pf_ && root()) {
1365 off5p = off5p_i = al5pi_;
1366 } else {
1367 off5p = off5p_i = (al5pf_ + 1);
1368 }
1369 } else {
1370 if(al5pi_ == 0) {
1371 l2r_ = !l2r_;
1372 continue;
1373 }
1374 assert_gt(al5pi_, 0);
1375 if(al5pi_ == al5pf_ && root()) {
1376 off5p = off5p_i = al5pi_;
1377 } else {
1378 off5p = off5p_i = (al5pi_ - 1);
1379 }
1380 }
1381 break;
1382 }
1383 size_t off3p = q.length() - off5p - 1;
1384 assert_lt(off5p, q.length());
1385 assert_lt(off3p, q.length());
1386 bool firstPos = true;
1387 assert_eq(0, len_);
1388
1389 // Number of times pf.alloc() is called. So we can sanity check it.
1390 size_t nalloc = 0;
1391 // Set to true as soon as we encounter a branch point along this descent.
1392 branches = false;
1393 // hitEnd is set to true iff this edit pushes us to the extreme 5' or 3'
1394 // end of the alignment
1395 hitEnd = false;
1396 // done is set to true iff this edit aligns the only remaining character of
1397 // the read
1398 done = false;
1399 if(root()) {
1400 assert_eq(al5pi_, al5pf_);
1401 // Check whether/how far we can jump using ftab
1402 int ftabLen = ebwtFw.eh().ftabChars();
1403 bool ftabFits = true;
1404 if(toward3p && ftabLen + off5p > q.length()) {
1405 ftabFits = false;
1406 } else if(!toward3p && off5p < (size_t)ftabLen) {
1407 ftabFits = false;
1408 }
1409 bool useFtab = ftabLen > 1 && (size_t)ftabLen <= nobranchDepth && ftabFits;
1410 bool ftabFailed = false;
1411 if(useFtab) {
1412 prm.nFtabs++;
1413 // Forward index: right-to-left
1414 size_t off_r2l = fw ? off5p : q.length() - off5p - 1;
1415 if(l2r_) {
1416 //
1417 } else {
1418 assert_geq((int)off_r2l, ftabLen - 1);
1419 off_r2l -= (ftabLen - 1);
1420 }
1421 bool ret = ebwtFw.ftabLoHi(fw ? q.patFw : q.patRc, off_r2l,
1422 false, // reverse
1423 topf, botf);
1424 if(!ret) {
1425 // Encountered an N or something else that made it impossible
1426 // to use the ftab
1427 ftabFailed = true;
1428 } else {
1429 if(botf - topf == 0) {
1430 return false;
1431 }
1432 int c_r2l = fw ? q.patFw[off_r2l] : q.patRc[off_r2l];
1433 // Backward index: left-to-right
1434 size_t off_l2r = fw ? off5p : q.length() - off5p - 1;
1435 if(l2r_) {
1436 //
1437 } else {
1438 assert_geq((int)off_l2r, ftabLen - 1);
1439 off_l2r -= (ftabLen - 1);
1440 }
1441 ASSERT_ONLY(bool ret2 = )
1442 ebwtBw.ftabLoHi(fw ? q.patFw : q.patRc, off_l2r,
1443 false, // don't reverse
1444 topb, botb);
1445 assert(ret == ret2);
1446 int c_l2r = fw ? q.patFw[off_l2r + ftabLen - 1] :
1447 q.patRc[off_l2r + ftabLen - 1];
1448 assert_eq(botf - topf, botb - topb);
1449 if(toward3p) {
1450 assert_geq((int)off3p, ftabLen - 1);
1451 off5p += ftabLen; off3p -= ftabLen;
1452 } else {
1453 assert_geq((int)off5p, ftabLen - 1);
1454 off5p -= ftabLen; off3p += ftabLen;
1455 }
1456 len_ += ftabLen;
1457 if(toward3p) {
1458 // By convention, al5pf_ and al5pi_ start out equal, so we only
1459 // advance al5pf_ by ftabLen - 1 (not ftabLen)
1460 al5pf_ += (ftabLen - 1); // -1 accounts for inclusive al5pf_
1461 if(al5pf_ == q.length() - 1) {
1462 hitEnd = true;
1463 done = (al5pi_ == 0);
1464 }
1465 } else {
1466 // By convention, al5pf_ and al5pi_ start out equal, so we only
1467 // advance al5pi_ by ftabLen - 1 (not ftabLen)
1468 al5pi_ -= (ftabLen - 1);
1469 if(al5pi_ == 0) {
1470 hitEnd = true;
1471 done = (al5pf_ == q.length()-1);
1472 }
1473 }
1474 // Allocate DescentPos data structures and leave them empty. We
1475 // jumped over them by doing our lookup in the ftab, so we have no
1476 // info about outgoing edges from them, besides the matching
1477 // outgoing edge from the last pos which is in topf/botf and
1478 // topb/botb.
1479 size_t id = 0;
1480 if(firstPos) {
1481 posid_ = pf.alloc();
1482 pf[posid_].reset();
1483 firstPos = false;
1484 for(int i = 1; i < ftabLen; i++) {
1485 id = pf.alloc();
1486 pf[id].reset();
1487 }
1488 } else {
1489 for(int i = 0; i < ftabLen; i++) {
1490 id = pf.alloc();
1491 pf[id].reset();
1492 }
1493 }
1494 assert_eq(botf-topf, botb-topb);
1495 pf[id].c = l2r_ ? c_l2r : c_r2l;
1496 pf[id].topf[l2r_ ? c_l2r : c_r2l] = topf;
1497 pf[id].botf[l2r_ ? c_l2r : c_r2l] = botf;
1498 pf[id].topb[l2r_ ? c_l2r : c_r2l] = topb;
1499 pf[id].botb[l2r_ ? c_l2r : c_r2l] = botb;
1500 assert(pf[id].inited());
1501 nalloc += ftabLen;
1502 }
1503 }
1504 if(!useFtab || ftabFailed) {
1505 // Can't use ftab, use fchr instead
1506 int rdc = q.getc(off5p, fw);
1507 // If rdc is N, that's pretty bad! That means we placed a root
1508 // right on an N. The only thing we can reasonably do is to pick a
1509 // nucleotide at random and proceed.
1510 if(rdc > 3) {
1511 return false;
1512 }
1513 assert_range(0, 3, rdc);
1514 topf = topb = ebwtFw.fchr()[rdc];
1515 botf = botb = ebwtFw.fchr()[rdc+1];
1516 if(botf - topf == 0) {
1517 return false;
1518 }
1519 if(toward3p) {
1520 off5p++; off3p--;
1521 } else {
1522 off5p--; off3p++;
1523 }
1524 len_++;
1525 if(toward3p) {
1526 if(al5pf_ == q.length()-1) {
1527 hitEnd = true;
1528 done = (al5pi_ == 0);
1529 }
1530 } else {
1531 if(al5pi_ == 0) {
1532 hitEnd = true;
1533 done = (al5pf_ == q.length()-1);
1534 }
1535 }
1536 // Allocate DescentPos data structure. We could fill it with the
1537 // four ranges from fchr if we wanted to, but that will never be
1538 // relevant.
1539 size_t id = 0;
1540 if(firstPos) {
1541 posid_ = id = pf.alloc();
1542 firstPos = false;
1543 } else {
1544 id = pf.alloc();
1545 }
1546 assert_eq(botf-topf, botb-topb);
1547 pf[id].c = rdc;
1548 pf[id].topf[rdc] = topf;
1549 pf[id].botf[rdc] = botf;
1550 pf[id].topb[rdc] = topb;
1551 pf[id].botb[rdc] = botb;
1552 assert(pf[id].inited());
1553 nalloc++;
1554 }
1555 assert_gt(botf, topf);
1556 assert_eq(botf - topf, botb - topb);
1557 // Check if this is redundant with an already-explored path
1558 if(!re.check(fw, l2r_, al5pi_, al5pf_, al5pf_ - al5pi_ + 1 + gapadd_,
1559 topf, botf, pen_))
1560 {
1561 prm.nRedSkip++;
1562 return false;
1563 }
1564 prm.nRedFail++; // not pruned by redundancy list
1565 prm.nRedIns++; // inserted into redundancy list
1566 }
1567 if(done) {
1568 Edit eempty;
1569 alsink.reportAlignment(
1570 q, // query
1571 ebwtFw, // forward index
1572 ebwtBw, // backward index
1573 topf, // top of SA range in forward index
1574 botf, // bottom of SA range in forward index
1575 topb, // top of SA range in backward index
1576 botb, // bottom of SA range in backward index
1577 descid_, // Descent at the leaf
1578 rid_, // root id
1579 eempty, // extra edit, if necessary
1580 pen_, // penalty
1581 df, // factory with Descent
1582 pf, // factory with DescentPoss
1583 rs, // roots
1584 cs); // configs
1585 assert(alsink.repOk());
1586 return true;
1587 } else if(hitEnd) {
1588 assert(botf > 0 || topf > 0);
1589 assert_gt(botf, topf);
1590 topf_bounce = topf;
1591 botf_bounce = botf;
1592 topb_bounce = topb;
1593 botb_bounce = botb;
1594 return true; // Bounced
1595 }
1596 // We just advanced either ftabLen characters, or 1 character,
1597 // depending on whether we used ftab or fchr.
1598 nextLocsBi(ebwtFw, ebwtBw, tloc, bloc, topf, botf, topb, botb);
1599 assert(tloc.valid());
1600 assert(botf - topf == 1 || bloc.valid());
1601 assert(botf - topf > 1 || !bloc.valid());
1602 TIndexOffU t[4], b[4]; // dest BW ranges
1603 TIndexOffU tp[4], bp[4]; // dest BW ranges for "prime" index
1604 ASSERT_ONLY(TIndexOffU lasttot = botf - topf);
1605 bool fail = false;
1606 while(!fail && !hitEnd) {
1607 assert(!done);
1608 int rdc = q.getc(off5p, fw);
1609 int rdq = q.getq(off5p);
1610 assert_range(0, 4, rdc);
1611 assert_gt(botf, topf);
1612 assert(botf - topf == 1 || bloc.valid());
1613 assert(botf - topf > 1 || !bloc.valid());
1614 assert(tloc.valid());
1615 TIndexOffU width = botf - topf;
1616 bool ltr = l2r_;
1617 const Ebwt& ebwt = ltr ? ebwtBw : ebwtFw;
1618 t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0;
1619 int only = -1; // if we only get 1 non-empty range, this is the char
1620 size_t nopts = 1;
1621 if(bloc.valid()) {
1622 // Set up initial values for the primes
1623 if(ltr) {
1624 tp[0] = tp[1] = tp[2] = tp[3] = topf;
1625 bp[0] = bp[1] = bp[2] = bp[3] = botf;
1626 } else {
1627 tp[0] = tp[1] = tp[2] = tp[3] = topb;
1628 bp[0] = bp[1] = bp[2] = bp[3] = botb;
1629 }
1630 // Range delimited by tloc/bloc has size >1. If size == 1,
1631 // we use a simpler query (see if(!bloc.valid()) blocks below)
1632 met.bwops++;
1633 met.bwops_bi++;
1634 prm.nSdFmops++;
1635 if(prm.doFmString) {
1636 prm.fmString.add(false, pen_, 1);
1637 }
1638 ebwt.mapBiLFEx(tloc, bloc, t, b, tp, bp);
1639 // t, b, tp and bp now filled
1640 ASSERT_ONLY(TIndexOffU tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3]));
1641 ASSERT_ONLY(TIndexOffU totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3]));
1642 assert_eq(tot, totp);
1643 assert_leq(tot, lasttot);
1644 ASSERT_ONLY(lasttot = tot);
1645 fail = (rdc > 3 || b[rdc] <= t[rdc]);
1646 size_t nopts = 0;
1647 if(b[0] > t[0]) { nopts++; only = 0; }
1648 if(b[1] > t[1]) { nopts++; only = 1; }
1649 if(b[2] > t[2]) { nopts++; only = 2; }
1650 if(b[3] > t[3]) { nopts++; only = 3; }
1651 if(!fail && b[rdc] - t[rdc] < width) {
1652 branches = true;
1653 }
1654 } else {
1655 tp[0] = tp[1] = tp[2] = tp[3] = bp[0] = bp[1] = bp[2] = bp[3] = 0;
1656 // Range delimited by tloc/bloc has size 1
1657 TIndexOffU ntop = ltr ? topb : topf;
1658 met.bwops++;
1659 met.bwops_1++;
1660 prm.nSdFmops++;
1661 if(prm.doFmString) {
1662 prm.fmString.add(false, pen_, 1);
1663 }
1664 int cc = ebwt.mapLF1(ntop, tloc);
1665 assert_range(-1, 3, cc);
1666 fail = (cc != rdc);
1667 if(fail) {
1668 branches = true;
1669 }
1670 if(cc >= 0) {
1671 only = cc;
1672 t[cc] = ntop; b[cc] = ntop+1;
1673 tp[cc] = ltr ? topf : topb;
1674 bp[cc] = ltr ? botf : botb;
1675 }
1676 }
1677 // Now figure out what to do with our N.
1678 int origRdc = rdc;
1679 if(rdc == 4) {
1680 fail = true;
1681 } else {
1682 topf = ltr ? tp[rdc] : t[rdc];
1683 botf = ltr ? bp[rdc] : b[rdc];
1684 topb = ltr ? t[rdc] : tp[rdc];
1685 botb = ltr ? b[rdc] : bp[rdc];
1686 assert_eq(botf - topf, botb - topb);
1687 }
1688 // The trouble with !stopOnN is that we don't have a way to store the N
1689 // edits. There could be several per Descent.
1690 if(rdc == 4 && !stopOnN && nopts == 1) {
1691 fail = false;
1692 rdc = only;
1693 int pen = sc.n(rdq);
1694 assert_gt(pen, 0);
1695 pen_ += pen;
1696 }
1697 assert_range(0, 4, origRdc);
1698 assert_range(0, 4, rdc);
1699 // If 'fail' is true, we failed to align this read character. We still
1700 // install the SA ranges into the DescentPos and increment len_ in this
1701 // case.
1702
1703 // Convert t, tp, b, bp info tf, bf, tb, bb
1704 TIndexOffU *tf = ltr ? tp : t;
1705 TIndexOffU *bf = ltr ? bp : b;
1706 TIndexOffU *tb = ltr ? t : tp;
1707 TIndexOffU *bb = ltr ? b : bp;
1708 // Allocate DescentPos data structure.
1709 if(firstPos) {
1710 posid_ = pf.alloc();
1711 firstPos = false;
1712 } else {
1713 pf.alloc();
1714 }
1715 nalloc++;
1716 pf[posid_ + len_].reset();
1717 pf[posid_ + len_].c = origRdc;
1718 for(size_t i = 0; i < 4; i++) {
1719 pf[posid_ + len_].topf[i] = tf[i];
1720 pf[posid_ + len_].botf[i] = bf[i];
1721 pf[posid_ + len_].topb[i] = tb[i];
1722 pf[posid_ + len_].botb[i] = bb[i];
1723 assert_eq(pf[posid_ + len_].botf[i] - pf[posid_ + len_].topf[i],
1724 pf[posid_ + len_].botb[i] - pf[posid_ + len_].topb[i]);
1725 }
1726 if(!fail) {
1727 // Check if this is redundant with an already-explored path
1728 size_t al5pf = al5pf_, al5pi = al5pi_;
1729 if(toward3p) {
1730 al5pf++;
1731 } else {
1732 al5pi--;
1733 }
1734 fail = !re.check(fw, l2r_, al5pi, al5pf,
1735 al5pf - al5pi + 1 + gapadd_, topf, botf, pen_);
1736 if(fail) {
1737 prm.nRedSkip++;
1738 } else {
1739 prm.nRedFail++; // not pruned by redundancy list
1740 prm.nRedIns++; // inserted into redundancy list
1741 }
1742 }
1743 if(!fail) {
1744 len_++;
1745 if(toward3p) {
1746 al5pf_++;
1747 off5p++;
1748 off3p--;
1749 if(al5pf_ == q.length() - 1) {
1750 hitEnd = true;
1751 done = (al5pi_ == 0);
1752 }
1753 } else {
1754 assert_gt(al5pi_, 0);
1755 al5pi_--;
1756 off5p--;
1757 off3p++;
1758 if(al5pi_ == 0) {
1759 hitEnd = true;
1760 done = (al5pf_ == q.length() - 1);
1761 }
1762 }
1763 }
1764 if(!fail && !hitEnd) {
1765 nextLocsBi(ebwtFw, ebwtBw, tloc, bloc, tf[rdc], bf[rdc], tb[rdc], bb[rdc]);
1766 }
1767 }
1768 assert_geq(al5pf_, al5pi_);
1769 assert(!root() || al5pf_ - al5pi_ + 1 == nalloc || al5pf_ - al5pi_ + 2 == nalloc);
1770 assert_geq(pf.size(), nalloc);
1771 if(done) {
1772 Edit eempty;
1773 alsink.reportAlignment(
1774 q, // query
1775 ebwtFw, // forward index
1776 ebwtBw, // backward index
1777 topf, // top of SA range in forward index
1778 botf, // bottom of SA range in forward index
1779 topb, // top of SA range in backward index
1780 botb, // bottom of SA range in backward index
1781 descid_, // Descent at the leaf
1782 rid_, // root id
1783 eempty, // extra edit, if necessary
1784 pen_, // penalty
1785 df, // factory with Descent
1786 pf, // factory with DescentPoss
1787 rs, // roots
1788 cs); // configs
1789 assert(alsink.repOk());
1790 return true;
1791 } else if(hitEnd) {
1792 assert(botf > 0 || topf > 0);
1793 assert_gt(botf, topf);
1794 topf_bounce = topf;
1795 botf_bounce = botf;
1796 topb_bounce = topb;
1797 botb_bounce = botb;
1798 return true; // Bounced
1799 }
1800 assert(repOk(&q));
1801 assert(!hitEnd || topf_bounce > 0 || botf_bounce > 0);
1802 return true;
1803 }
1804
1805 #ifdef ALIGNER_SEED2_MAIN
1806
1807 #include <string>
1808 #include "sstring.h"
1809 #include "aligner_driver.h"
1810
1811 using namespace std;
1812
1813 bool gReportOverhangs = true;
1814
1815 /**
1816 * A way of feeding simply tests to the seed alignment infrastructure.
1817 */
main(int argc,char ** argv)1818 int main(int argc, char **argv) {
1819
1820 EList<string> strs;
1821 // GCTATATAGCGCGCTCGCATCATTTTGTGT
1822 strs.push_back(string("CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA"
1823 "NNNNNNNNNN"
1824 "CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA"));
1825 // GCTATATAGCGCGCTTGCATCATTTTGTGT
1826 // ^
1827 bool packed = false;
1828 pair<Ebwt*, Ebwt*> ebwts = Ebwt::fromStrings<SString<char> >(
1829 strs,
1830 packed,
1831 0,
1832 REF_READ_REVERSE,
1833 Ebwt::default_bigEndian,
1834 Ebwt::default_lineRate,
1835 Ebwt::default_offRate,
1836 Ebwt::default_ftabChars,
1837 ".aligner_seed2.cpp.tmp",
1838 Ebwt::default_useBlockwise,
1839 Ebwt::default_bmax,
1840 Ebwt::default_bmaxMultSqrt,
1841 Ebwt::default_bmaxDivN,
1842 Ebwt::default_dcv,
1843 Ebwt::default_seed,
1844 false, // verbose
1845 false, // autoMem
1846 false); // sanity
1847
1848 ebwts.first->loadIntoMemory (0, -1, true, true, true, true, false);
1849 ebwts.second->loadIntoMemory(0, 1, true, true, true, true, false);
1850
1851 int testnum = 0;
1852
1853 // Query is longer than ftab and matches exactly twice
1854 for(int rc = 0; rc < 2; rc++) {
1855 for(int i = 0; i < 2; i++) {
1856 cerr << "Test " << (++testnum) << endl;
1857 cerr << " Query with length greater than ftab" << endl;
1858 DescentMetrics mets;
1859 PerReadMetrics prm;
1860 DescentDriver dr;
1861
1862 // Set up the read
1863 BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
1864 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
1865 if(rc) {
1866 seq.reverseComp();
1867 qual.reverse();
1868 }
1869 dr.initRead(
1870 Read("test", seq.toZBuf(), qual.toZBuf()),
1871 false, // nofw
1872 false, // norc
1873 -30, // minsc
1874 30); // maxpen
1875
1876 // Set up the DescentConfig
1877 DescentConfig conf;
1878 conf.cons.init(Ebwt::default_ftabChars, 1.0);
1879 conf.expol = DESC_EX_NONE;
1880
1881 // Set up the search roots
1882 dr.addRoot(
1883 conf, // DescentConfig
1884 (i == 0) ? 0 : (seq.length() - 1), // 5' offset into read of root
1885 (i == 0) ? true : false, // left-to-right?
1886 rc == 0, // forward?
1887 1, // landing
1888 0.0f); // root priority
1889
1890 // Do the search
1891 Scoring sc = Scoring::base1();
1892 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
1893
1894 // Confirm that an exact-matching alignment was found
1895 assert_eq(1, dr.sink().nrange());
1896 assert_eq(2, dr.sink().nelt());
1897 }
1898 }
1899
1900 // Query has length euqal to ftab and matches exactly twice
1901 for(int i = 0; i < 2; i++) {
1902 cerr << "Test " << (++testnum) << endl;
1903 cerr << " Query with length equal to ftab" << endl;
1904 DescentMetrics mets;
1905 PerReadMetrics prm;
1906 DescentDriver dr;
1907
1908 // Set up the read
1909 BTDnaString seq ("GCTATATAGC", true);
1910 BTString qual("ABCDEFGHIa");
1911 dr.initRead(
1912 Read("test", seq.toZBuf(), qual.toZBuf()),
1913 false, // nofw
1914 false, // norc
1915 -30, // minsc
1916 30); // maxpen
1917
1918 // Set up the DescentConfig
1919 DescentConfig conf;
1920 conf.cons.init(Ebwt::default_ftabChars, 1.0);
1921 conf.expol = DESC_EX_NONE;
1922
1923 // Set up the search roots
1924 dr.addRoot(
1925 conf, // DescentConfig
1926 (i == 0) ? 0 : (seq.length() - 1), // 5' offset into read of root
1927 (i == 0) ? true : false, // left-to-right?
1928 true, // forward?
1929 1, // landing
1930 0.0f); // root priority
1931
1932 // Do the search
1933 Scoring sc = Scoring::base1();
1934 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
1935
1936 // Confirm that an exact-matching alignment was found
1937 assert_eq(1, dr.sink().nrange());
1938 assert_eq(2, dr.sink().nelt());
1939 }
1940
1941 // Query has length less than ftab length and matches exactly twice
1942 for(int i = 0; i < 2; i++) {
1943 cerr << "Test " << (++testnum) << endl;
1944 cerr << " Query with length less than ftab" << endl;
1945 DescentMetrics mets;
1946 PerReadMetrics prm;
1947 DescentDriver dr;
1948
1949 // Set up the read
1950 BTDnaString seq ("GCTATATAG", true);
1951 BTString qual("ABCDEFGHI");
1952 dr.initRead(
1953 Read("test", seq.toZBuf(), qual.toZBuf()),
1954 false, // nofw
1955 false, // norc
1956 -30, // minsc
1957 30); // maxpen
1958
1959 // Set up the DescentConfig
1960 DescentConfig conf;
1961 conf.cons.init(Ebwt::default_ftabChars, 1.0);
1962 conf.expol = DESC_EX_NONE;
1963
1964 // Set up the search roots
1965 dr.addRoot(
1966 conf, // DescentConfig
1967 (i == 0) ? 0 : (seq.length() - 1), // 5' offset into read of root
1968 (i == 0) ? true : false, // left-to-right?
1969 true, // forward?
1970 1, // landing
1971 0.0f); // root priority
1972
1973 // Do the search
1974 Scoring sc = Scoring::base1();
1975 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
1976
1977 // Confirm that an exact-matching alignment was found
1978 assert_eq(1, dr.sink().nrange());
1979 assert_eq(2, dr.sink().nelt());
1980 }
1981
1982 // Search root is in the middle of the read, requiring a bounce
1983 for(int i = 0; i < 2; i++) {
1984 cerr << "Test " << (++testnum) << endl;
1985 cerr << " Search root in middle of read" << endl;
1986 DescentMetrics mets;
1987 PerReadMetrics prm;
1988 DescentDriver dr;
1989
1990 // Set up the read
1991 // 012345678901234567890123456789
1992 BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
1993 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
1994 TIndexOffU top, bot;
1995 top = bot = 0;
1996 bool ret = ebwts.first->contains("GCGCTCGCATCATTTTGTGT", &top, &bot);
1997 cerr << ret << ", " << top << ", " << bot << endl;
1998 dr.initRead(
1999 Read("test", seq.toZBuf(), qual.toZBuf()),
2000 false, // nofw
2001 false, // norc
2002 -30, // minsc
2003 30); // maxpen
2004
2005 // Set up the DescentConfig
2006 DescentConfig conf;
2007 conf.cons.init(Ebwt::default_ftabChars, 1.0);
2008 conf.expol = DESC_EX_NONE;
2009
2010 // Set up the search roots
2011 dr.addRoot(
2012 conf, // DescentConfig
2013 (i == 0) ? 10 : (seq.length() - 1 - 10), // 5' offset into read of root
2014 (i == 0) ? true : false, // left-to-right?
2015 true, // forward?
2016 1, // landing
2017 0.0f); // root priority
2018
2019 // Do the search
2020 Scoring sc = Scoring::base1();
2021 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2022
2023 // Confirm that an exact-matching alignment was found
2024 assert_eq(1, dr.sink().nrange());
2025 assert_eq(2, dr.sink().nelt());
2026 }
2027
2028 delete ebwts.first;
2029 delete ebwts.second;
2030
2031 strs.clear();
2032 strs.push_back(string("CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA"
2033 "NNNNNNNNNN"
2034 "CATGTCAGCTATATAGCG"));
2035 ebwts = Ebwt::fromStrings<SString<char> >(
2036 strs,
2037 packed,
2038 0,
2039 REF_READ_REVERSE,
2040 Ebwt::default_bigEndian,
2041 Ebwt::default_lineRate,
2042 Ebwt::default_offRate,
2043 Ebwt::default_ftabChars,
2044 ".aligner_seed2.cpp.tmp",
2045 Ebwt::default_useBlockwise,
2046 Ebwt::default_bmax,
2047 Ebwt::default_bmaxMultSqrt,
2048 Ebwt::default_bmaxDivN,
2049 Ebwt::default_dcv,
2050 Ebwt::default_seed,
2051 false, // verbose
2052 false, // autoMem
2053 false); // sanity
2054
2055 ebwts.first->loadIntoMemory (0, -1, true, true, true, true, false);
2056 ebwts.second->loadIntoMemory(0, 1, true, true, true, true, false);
2057
2058 // Test to see if the root selector works as we expect
2059 {
2060 BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
2061 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
2062 // 012345678901234567890123456789
2063 // abcdefghiA: 644
2064 // CDEFGHIabc : 454 + 100 = 554
2065 DescentDriver dr;
2066 PrioritizedRootSelector sel(
2067 2.0,
2068 SimpleFunc(SIMPLE_FUNC_CONST, 10.0, 10.0, 10.0, 10.0), // 6 roots
2069 10);
2070 // Set up the read
2071 dr.initRead(
2072 Read("test", seq.toZBuf(), qual.toZBuf()),
2073 false, // nofw
2074 false, // norc
2075 -30, // minsc
2076 30, // maxpen
2077 NULL, // opposite mate
2078 &sel); // root selector
2079 dr.printRoots(std::cerr);
2080 assert_eq(12, dr.roots().size());
2081 assert_eq(652, dr.roots()[0].pri);
2082 assert_eq(652, dr.roots()[1].pri);
2083 }
2084
2085 // Test to see if the root selector works as we expect; this time the
2086 // string is longer (64 chars) and there are a couple Ns
2087 {
2088 BTDnaString seq ("NCTATATAGCGCGCTCGCATCNTTTTGTGTGCTATATAGCGCGCTCGCATCATTTTGTGTTTAT", true);
2089 BTString qual("ABCDEFGHIJKLMNOPabcdefghijklmnopABCDEFGHIJKLMNOPabcdefghijklmnop");
2090 // 0123456789012345678901234567890123456789012345678901234567890123
2091 // bcdefghijklmnop: 1080 - 150 bcdefghijklmnop: 1080 + 150 = 1230
2092 // = 930
2093 DescentDriver dr;
2094 PrioritizedRootSelector sel(
2095 2.0,
2096 SimpleFunc(SIMPLE_FUNC_CONST, 10.0, 10.0, 10.0, 10.0), // 6 * 4 = 24 roots
2097 15); // landing size
2098 // Set up the read
2099 dr.initRead(
2100 Read("test", seq.toZBuf(), qual.toZBuf()),
2101 false, // nofw
2102 false, // norc
2103 -30, // minsc
2104 30, // maxpen
2105 NULL, // opposite mate
2106 &sel); // root selector
2107 dr.printRoots(std::cerr);
2108 assert_eq(24, dr.roots().size());
2109 assert_eq(1230, dr.roots()[0].pri);
2110 assert_eq(1230, dr.roots()[1].pri);
2111 }
2112
2113 // Query is longer than ftab and matches exactly once. One search root for
2114 // forward read.
2115 {
2116 size_t last_topf = std::numeric_limits<size_t>::max();
2117 size_t last_botf = std::numeric_limits<size_t>::max();
2118 for(int i = 0; i < 2; i++) {
2119 BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
2120 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
2121 for(size_t j = 0; j < seq.length(); j++) {
2122 cerr << "Test " << (++testnum) << endl;
2123 cerr << " Query with length greater than ftab and matches exactly once" << endl;
2124 DescentMetrics mets;
2125 PerReadMetrics prm;
2126 DescentDriver dr;
2127
2128 // Set up the read
2129 dr.initRead(
2130 Read("test", seq.toZBuf(), qual.toZBuf()),
2131 false, // nofw
2132 false, // norc
2133 -30, // minsc
2134 30); // maxpen
2135
2136 // Set up the DescentConfig
2137 DescentConfig conf;
2138 conf.cons.init(Ebwt::default_ftabChars, 1.0);
2139 conf.expol = DESC_EX_NONE;
2140
2141 // Set up the search roots
2142 dr.addRoot(
2143 conf, // DescentConfig
2144 j, // 5' offset into read of root
2145 i == 0, // left-to-right?
2146 true, // forward?
2147 1, // landing
2148 0.0f); // root priority
2149
2150 // Do the search
2151 Scoring sc = Scoring::base1();
2152 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2153
2154 // Confirm that an exact-matching alignment was found
2155 assert_eq(1, dr.sink().nrange());
2156 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2157 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2158 assert_eq(1, dr.sink().nelt());
2159 }
2160 }
2161 }
2162
2163 // Query is longer than ftab and its reverse complement matches exactly
2164 // once. Search roots on forward and reverse-comp reads.
2165 {
2166 size_t last_topf = std::numeric_limits<size_t>::max();
2167 size_t last_botf = std::numeric_limits<size_t>::max();
2168 for(int i = 0; i < 2; i++) {
2169 BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
2170 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
2171 for(size_t j = 0; j < seq.length(); j++) {
2172 cerr << "Test " << (++testnum) << endl;
2173 cerr << " Query with length greater than ftab and reverse complement matches exactly once" << endl;
2174 DescentMetrics mets;
2175 PerReadMetrics prm;
2176 DescentDriver dr;
2177
2178 // Set up the read
2179 dr.initRead(
2180 Read("test", seq.toZBuf(), qual.toZBuf()),
2181 false, // nofw
2182 false, // norc
2183 -30, // minsc
2184 30); // maxpen
2185
2186 // Set up the DescentConfig
2187 DescentConfig conf;
2188 conf.cons.init(Ebwt::default_ftabChars, 1.0);
2189 conf.expol = DESC_EX_NONE;
2190
2191 // Set up the search roots
2192 dr.addRoot(
2193 conf, // DescentConfig
2194 j, // 5' offset into read of root
2195 i == 0, // left-to-right?
2196 true, // forward?
2197 1, // landing
2198 0.0f); // root priority
2199 dr.addRoot(
2200 conf, // DescentConfig
2201 j, // 5' offset into read of root
2202 i == 0, // left-to-right?
2203 false, // forward?
2204 1, // landing
2205 1.0f); // root priority
2206
2207 // Do the search
2208 Scoring sc = Scoring::base1();
2209 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2210
2211 // Confirm that an exact-matching alignment was found
2212 assert_eq(1, dr.sink().nrange());
2213 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2214 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2215 assert_eq(1, dr.sink().nelt());
2216 }
2217 }
2218 }
2219
2220 // Query is longer than ftab and matches exactly once with one mismatch
2221 {
2222 size_t last_topf = std::numeric_limits<size_t>::max();
2223 size_t last_botf = std::numeric_limits<size_t>::max();
2224 for(int i = 0; i < 2; i++) {
2225 // Set up the read
2226 // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA
2227 // ||||||||||||||||||||||||||||||
2228 BTDnaString orig("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
2229 // 012345678901234567890123456789
2230 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
2231 for(size_t k = 0; k < orig.length(); k++) {
2232 BTDnaString seq = orig;
2233 seq.set(seq[k] ^ 3, k);
2234 for(size_t j = 0; j < seq.length(); j++) {
2235 // Assume left-to-right
2236 size_t beg = j;
2237 size_t end = j + Ebwt::default_ftabChars;
2238 // Mismatch penalty is 3, so we have to skip starting
2239 // points that are within 2 from the mismatch
2240 if((i > 0 && j > 0) || j == seq.length()-1) {
2241 // Right-to-left
2242 if(beg < Ebwt::default_ftabChars) {
2243 beg = 0;
2244 } else {
2245 beg -= Ebwt::default_ftabChars;
2246 }
2247 end -= Ebwt::default_ftabChars;
2248 }
2249 size_t kk = k;
2250 //if(rc) {
2251 // kk = seq.length() - k - 1;
2252 //}
2253 if(beg <= kk && end > kk) {
2254 continue;
2255 }
2256 if((j > kk) ? (j - kk <= 2) : (kk - j <= 2)) {
2257 continue;
2258 }
2259 cerr << "Test " << (++testnum) << endl;
2260 cerr << " Query with length greater than ftab and matches exactly once with 1mm" << endl;
2261 DescentMetrics mets;
2262 PerReadMetrics prm;
2263 DescentDriver dr;
2264
2265 dr.initRead(
2266 Read("test", seq.toZBuf(), qual.toZBuf()),
2267 false, // nofw
2268 false, // norc
2269 -30, // minsc
2270 30); // maxpen
2271
2272 // Set up the DescentConfig
2273 DescentConfig conf;
2274 // Changed
2275 conf.cons.init(0, 1.0);
2276 conf.expol = DESC_EX_NONE;
2277
2278 // Set up the search roots
2279 dr.addRoot(
2280 conf, // DescentConfig
2281 j, // 5' offset into read of root
2282 i == 0, // left-to-right?
2283 true, // forward?
2284 1, // landing
2285 0.0f); // root priority
2286
2287 // Do the search
2288 Scoring sc = Scoring::base1();
2289 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2290
2291 // Confirm that an exact-matching alignment was found
2292 assert_eq(1, dr.sink().nrange());
2293 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2294 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2295 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
2296 assert_eq(1, dr.sink().nelt());
2297 last_topf = dr.sink()[0].topf;
2298 last_botf = dr.sink()[0].botf;
2299 }
2300 }
2301 }
2302 }
2303
2304 // Query is longer than ftab and matches exactly once with one N mismatch
2305 {
2306 size_t last_topf = std::numeric_limits<size_t>::max();
2307 size_t last_botf = std::numeric_limits<size_t>::max();
2308 for(int i = 0; i < 2; i++) {
2309 // Set up the read
2310 // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA
2311 // ||||||||||||||||||||||||||||||
2312 BTDnaString orig("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
2313 // 012345678901234567890123456789
2314 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
2315 for(size_t k = 0; k < orig.length(); k++) {
2316 BTDnaString seq = orig;
2317 seq.set(4, k);
2318 for(size_t j = 0; j < seq.length(); j++) {
2319 // Assume left-to-right
2320 size_t beg = j;
2321 size_t end = j + Ebwt::default_ftabChars;
2322 // Mismatch penalty is 3, so we have to skip starting
2323 // points that are within 2 from the mismatch
2324 if((i > 0 && j > 0) || j == seq.length()-1) {
2325 // Right-to-left
2326 if(beg < Ebwt::default_ftabChars) {
2327 beg = 0;
2328 } else {
2329 beg -= Ebwt::default_ftabChars;
2330 }
2331 end -= Ebwt::default_ftabChars;
2332 }
2333 if(beg <= k && end > k) {
2334 continue;
2335 }
2336 if((j > k) ? (j - k <= 2) : (k - j <= 2)) {
2337 continue;
2338 }
2339 cerr << "Test " << (++testnum) << endl;
2340 cerr << " Query with length greater than ftab and matches exactly once with 1mm" << endl;
2341 DescentMetrics mets;
2342 PerReadMetrics prm;
2343 DescentDriver dr;
2344
2345 dr.initRead(
2346 Read("test", seq.toZBuf(), qual.toZBuf()),
2347 false, // nofw
2348 false, // norc
2349 -30, // minsc
2350 30); // maxpen
2351
2352 // Set up the DescentConfig
2353 DescentConfig conf;
2354 // Changed
2355 conf.cons.init(0, 1.0);
2356 conf.expol = DESC_EX_NONE;
2357
2358 // Set up the search roots
2359 dr.addRoot(
2360 conf, // DescentConfig
2361 j, // 5' offset into read of root
2362 i == 0, // left-to-right?
2363 true, // forward?
2364 1, // landing
2365 0.0f); // root priority
2366
2367 // Do the search
2368 Scoring sc = Scoring::base1();
2369 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2370
2371 // Confirm that an exact-matching alignment was found
2372 assert_eq(1, dr.sink().nrange());
2373 assert_eq(sc.n(40), dr.sink()[0].pen);
2374 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2375 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2376 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
2377 assert_eq(1, dr.sink().nelt());
2378 last_topf = dr.sink()[0].topf;
2379 last_botf = dr.sink()[0].botf;
2380 }
2381 }
2382 }
2383 }
2384
2385 // Throw a bunch of queries with a bunch of Ns in and try to force an assert
2386 {
2387 RandomSource rnd(79);
2388 for(int i = 0; i < 2; i++) {
2389 // Set up the read
2390 // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA
2391 // ||||||||||||||||||||||||||||||
2392 BTDnaString orig("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
2393 // 012345678901234567890123456789
2394 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
2395 if(i == 1) {
2396 orig.reverseComp();
2397 qual.reverse();
2398 }
2399 for(size_t trials = 0; trials < 100; trials++) {
2400 BTDnaString seq = orig;
2401 size_t ns = 10;
2402 for(size_t k = 0; k < ns; k++) {
2403 size_t pos = rnd.nextU32() % seq.length();
2404 seq.set(4, pos);
2405 }
2406
2407 cerr << "Test " << (++testnum) << endl;
2408 cerr << " Query with a bunch of Ns" << endl;
2409
2410 DescentMetrics mets;
2411 PerReadMetrics prm;
2412 DescentDriver dr;
2413
2414 dr.initRead(
2415 Read("test", seq.toZBuf(), qual.toZBuf()),
2416 false, // nofw
2417 false, // norc
2418 -30, // minsc
2419 30); // maxpen
2420
2421 // Set up the DescentConfig
2422 DescentConfig conf;
2423 // Changed
2424 conf.cons.init(Ebwt::default_ftabChars, 1.0);
2425 conf.expol = DESC_EX_NONE;
2426
2427 // Set up the search roots
2428 for(size_t k = 0; k < ns; k++) {
2429 size_t j = rnd.nextU32() % seq.length();
2430 bool ltr = (rnd.nextU2() == 0) ? true : false;
2431 bool fw = (rnd.nextU2() == 0) ? true : false;
2432 dr.addRoot(
2433 conf, // DescentConfig
2434 j, // 5' offset into read of root
2435 ltr, // left-to-right?
2436 fw, // forward?
2437 1, // landing
2438 0.0f); // root priority
2439 }
2440
2441 // Do the search
2442 Scoring sc = Scoring::base1();
2443 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2444 }
2445 }
2446 }
2447
2448 // Query is longer than ftab and matches exactly once with one mismatch
2449 {
2450 RandomSource rnd(77);
2451 size_t last_topf = std::numeric_limits<size_t>::max();
2452 size_t last_botf = std::numeric_limits<size_t>::max();
2453 for(int i = 0; i < 2; i++) {
2454 // Set up the read
2455 // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA
2456 // ||||||||||||||||||||||||||||||
2457 BTDnaString orig("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
2458 // 012345678901234567890123456789
2459 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
2460 // revcomp: ACACAAAATGATGCGAGCGCGCTATATAGC
2461 // revqual: cbaIHGFEDCBAihgfedcbaIHGFEDCBA
2462 bool fwi = (i == 0);
2463 if(!fwi) {
2464 orig.reverseComp();
2465 }
2466 for(size_t k = 0; k < orig.length(); k++) {
2467 BTDnaString seq = orig;
2468 seq.set(seq[k] ^ 3, k);
2469 cerr << "Test " << (++testnum) << endl;
2470 cerr << " Query with length greater than ftab and matches exactly once with 1mm. Many search roots." << endl;
2471 DescentMetrics mets;
2472 PerReadMetrics prm;
2473 DescentDriver dr;
2474
2475 dr.initRead(
2476 Read("test", seq.toZBuf(), qual.toZBuf()),
2477 false, // nofw
2478 false, // norc
2479 -30, // minsc
2480 30); // maxpen
2481
2482 // Set up the DescentConfig
2483 DescentConfig conf;
2484 // Changed
2485 conf.cons.init(0, 1.0);
2486 conf.expol = DESC_EX_NONE;
2487
2488 // Set up several random search roots
2489 bool onegood = false;
2490 for(size_t y = 0; y < 10; y++) {
2491 size_t j = rnd.nextU32() % seq.length();
2492 bool ltr = (rnd.nextU2() == 0) ? true : false;
2493 bool fw = (rnd.nextU2() == 0) ? true : false;
2494 dr.addRoot(
2495 conf, // DescentConfig
2496 (TReadOff)j, // 5' offset into read of root
2497 ltr, // left-to-right?
2498 fw, // forward?
2499 1, // landing
2500 (float)((float)y * 1.0f)); // root priority
2501 // Assume left-to-right
2502 size_t beg = j;
2503 size_t end = j + Ebwt::default_ftabChars;
2504 // Mismatch penalty is 3, so we have to skip starting
2505 // points that are within 2 from the mismatch
2506 if(!ltr) {
2507 // Right-to-left
2508 if(beg < Ebwt::default_ftabChars) {
2509 beg = 0;
2510 } else {
2511 beg -= Ebwt::default_ftabChars;
2512 }
2513 end -= Ebwt::default_ftabChars;
2514 }
2515 bool good = true;
2516 if(fw != fwi) {
2517 good = false;
2518 }
2519 if(beg <= k && end > k) {
2520 good = false;
2521 }
2522 if((j > k) ? (j - k <= 2) : (k - j <= 2)) {
2523 good = false;
2524 }
2525 if(good) {
2526 onegood = true;
2527 }
2528 }
2529 if(!onegood) {
2530 continue;
2531 }
2532
2533 // Do the search
2534 Scoring sc = Scoring::base1();
2535 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2536
2537 // Confirm that an exact-matching alignment was found
2538 assert_eq(1, dr.sink().nrange());
2539 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2540 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2541 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
2542 assert_eq(1, dr.sink().nelt());
2543 last_topf = dr.sink()[0].topf;
2544 last_botf = dr.sink()[0].botf;
2545 }
2546 }
2547 }
2548
2549 // Query is longer than ftab and matches exactly once with one read gap
2550 {
2551 size_t last_topf = std::numeric_limits<size_t>::max();
2552 size_t last_botf = std::numeric_limits<size_t>::max();
2553 for(int i = 0; i < 2; i++) {
2554 for(int k = 0; k < 2; k++) {
2555 // Set up the read
2556 // GCTATATAGCGCGCCTGCATCATTTTGTGT
2557 // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA
2558 // |||||||||||||||///////////////
2559 BTDnaString seq ("GCTATATAGCGCGCTGCATCATTTTGTGT", true);
2560 // 01234567890123456789012345678
2561 // 87654321098765432109876543210
2562 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIab");
2563 if(k == 1) {
2564 seq.reverseComp();
2565 qual.reverse();
2566 }
2567 assert_eq(seq.length(), qual.length());
2568 // js iterate over offsets from 5' end for the search root
2569 for(size_t j = 0; j < seq.length(); j++) {
2570 // Assume left-to-right
2571 size_t beg = j;
2572 if(k == 1) {
2573 beg = seq.length() - beg - 1;
2574 }
2575 size_t end = beg + Ebwt::default_ftabChars;
2576 // Mismatch penalty is 3, so we have to skip starting
2577 // points that are within 2 from the mismatch
2578 if((i > 0 && j > 0) || j == seq.length()-1) {
2579 // Right-to-left
2580 if(beg < Ebwt::default_ftabChars) {
2581 beg = 0;
2582 } else {
2583 beg -= Ebwt::default_ftabChars;
2584 }
2585 end -= Ebwt::default_ftabChars;
2586 }
2587 assert_geq(end, beg);
2588 if(beg <= 15 && end >= 15) {
2589 continue;
2590 }
2591 cerr << "Test " << (++testnum) << endl;
2592 cerr << " Query matches once with a read gap of length 1" << endl;
2593 DescentMetrics mets;
2594 PerReadMetrics prm;
2595 DescentDriver dr;
2596
2597 Read q("test", seq.toZBuf(), qual.toZBuf());
2598 assert(q.repOk());
2599 dr.initRead(
2600 q, // read
2601 false, // nofw
2602 false, // norc
2603 -30, // minsc
2604 30); // maxpen
2605
2606 // Set up the DescentConfig
2607 DescentConfig conf;
2608 // Changed
2609 conf.cons.init(0, 0.5);
2610 conf.expol = DESC_EX_NONE;
2611
2612 // Set up the search roots
2613 dr.addRoot(
2614 conf, // DescentConfig
2615 j, // 5' offset into read of root
2616 i == 0, // left-to-right?
2617 k == 0, // forward?
2618 1, // landing
2619 0.0f); // root priority
2620
2621 // Do the search
2622 Scoring sc = Scoring::base1();
2623 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2624
2625 // Confirm that an exact-matching alignment was found
2626 assert_eq(1, dr.sink().nrange());
2627 assert_eq(sc.readGapOpen() + 0 * sc.readGapExtend(), dr.sink()[0].pen);
2628 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2629 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2630 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
2631 assert_eq(1, dr.sink().nelt());
2632 last_topf = dr.sink()[0].topf;
2633 last_botf = dr.sink()[0].botf;
2634 }
2635 }}
2636 }
2637
2638 // Query is longer than ftab and matches exactly once with one read gap of
2639 // length 3
2640 {
2641 size_t last_topf = std::numeric_limits<size_t>::max();
2642 size_t last_botf = std::numeric_limits<size_t>::max();
2643 for(int i = 0; i < 2; i++) {
2644 for(int k = 0; k < 2; k++) {
2645 // Set up the read
2646 // GCTATATAGCGCGCGCTCATCATTTTGTGT
2647 // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA
2648 // |||||||||||||| |||||||||||||
2649 BTDnaString seq ("GCTATATAGCGCGC" "CATCATTTTGTGT", true);
2650 // 01234567890123 4567890123456
2651 // 65432109876543 2109876543210
2652 BTString qual("ABCDEFGHIabcde" "fghiABCDEFGHI");
2653 if(k == 1) {
2654 seq.reverseComp();
2655 qual.reverse();
2656 }
2657 for(size_t j = 0; j < seq.length(); j++) {
2658 // Assume left-to-right
2659 size_t beg = j;
2660 if(k == 1) {
2661 beg = seq.length() - beg - 1;
2662 }
2663 size_t end = beg + Ebwt::default_ftabChars;
2664 // Mismatch penalty is 3, so we have to skip starting
2665 // points that are within 2 from the mismatch
2666 if((i > 0 && j > 0) || j == seq.length()-1) {
2667 // Right-to-left
2668 if(beg < Ebwt::default_ftabChars) {
2669 beg = 0;
2670 } else {
2671 beg -= Ebwt::default_ftabChars;
2672 }
2673 end -= Ebwt::default_ftabChars;
2674 }
2675 if(beg <= 14 && end >= 14) {
2676 continue;
2677 }
2678 cerr << "Test " << (++testnum) << endl;
2679 cerr << " Query matches once with a read gap of length 3" << endl;
2680 DescentMetrics mets;
2681 PerReadMetrics prm;
2682 DescentDriver dr;
2683
2684 dr.initRead(
2685 Read("test", seq.toZBuf(), qual.toZBuf()),
2686 false, // nofw
2687 false, // norc
2688 -30, // minsc
2689 30); // maxpen
2690
2691 // Set up the DescentConfig
2692 DescentConfig conf;
2693 // Changed
2694 conf.cons.init(0, 0.2);
2695 conf.expol = DESC_EX_NONE;
2696
2697 // Set up the search roots
2698 dr.addRoot(
2699 conf, // DescentConfig
2700 j, // 5' offset into read of root
2701 i == 0, // left-to-right?
2702 k == 0, // forward?
2703 1, // landing
2704 0.0f); // root priority
2705
2706 // Do the search
2707 Scoring sc = Scoring::base1();
2708 // Need to adjust the mismatch penalty up to avoid alignments
2709 // with lots of mismatches.
2710 sc.setMmPen(COST_MODEL_CONSTANT, 6, 6);
2711 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2712
2713 // Confirm that an exact-matching alignment was found
2714 assert_eq(1, dr.sink().nrange());
2715 assert_eq(sc.readGapOpen() + 2 * sc.readGapExtend(), dr.sink()[0].pen);
2716 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2717 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2718 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
2719 assert_eq(1, dr.sink().nelt());
2720 last_topf = dr.sink()[0].topf;
2721 last_botf = dr.sink()[0].botf;
2722 }
2723 }}
2724 }
2725
2726 // Query is longer than ftab and matches exactly once with one reference gap
2727 {
2728 size_t last_topf = std::numeric_limits<size_t>::max();
2729 size_t last_botf = std::numeric_limits<size_t>::max();
2730 for(int i = 0; i < 2; i++) {
2731 // Set up the read
2732 // Ref: CATGTCAGCTATATAGCGCGC" "TCGCATCATTTTGTGTGTAAACCA
2733 // |||||||||||||| ||||||||||||||||
2734 BTDnaString seq ("GCTATATAGCGCGCA""TCGCATCATTTTGTGT", true);
2735 // 012345678901234 5678901234567890
2736 BTString qual("ABCDEFGHIabcdef""ghiABCDEFGHIabcd");
2737 for(size_t j = 0; j < seq.length(); j++) {
2738 // Assume left-to-right
2739 size_t beg = j;
2740 size_t end = j + Ebwt::default_ftabChars;
2741 // Mismatch penalty is 3, so we have to skip starting
2742 // points that are within 2 from the mismatch
2743 if((i > 0 && j > 0) || j == seq.length()-1) {
2744 // Right-to-left
2745 if(beg < Ebwt::default_ftabChars) {
2746 beg = 0;
2747 } else {
2748 beg -= Ebwt::default_ftabChars;
2749 }
2750 end -= Ebwt::default_ftabChars;
2751 }
2752 if(beg <= 14 && end >= 14) {
2753 continue;
2754 }
2755 cerr << "Test " << (++testnum) << endl;
2756 cerr << " Query matches once with a reference gap of length 1" << endl;
2757 DescentMetrics mets;
2758 PerReadMetrics prm;
2759 DescentDriver dr;
2760
2761 dr.initRead(
2762 Read("test", seq.toZBuf(), qual.toZBuf()),
2763 false, // nofw
2764 false, // norc
2765 -30, // minsc
2766 30); // maxpen
2767
2768 // Set up the DescentConfig
2769 DescentConfig conf;
2770 // Changed
2771 conf.cons.init(1, 0.5);
2772 conf.expol = DESC_EX_NONE;
2773
2774 // Set up the search roots
2775 dr.addRoot(
2776 conf, // DescentConfig
2777 j, // 5' offset into read of root
2778 i == 0, // left-to-right?
2779 true, // forward?
2780 1, // landing
2781 0.0f); // root priority
2782
2783 // Do the search
2784 Scoring sc = Scoring::base1();
2785 // Need to adjust the mismatch penalty up to avoid alignments
2786 // with lots of mismatches.
2787 sc.setMmPen(COST_MODEL_CONSTANT, 6, 6);
2788 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2789
2790 // Confirm that an exact-matching alignment was found
2791 assert_eq(1, dr.sink().nrange());
2792 assert_eq(sc.refGapOpen() + 0 * sc.refGapExtend(), dr.sink()[0].pen);
2793 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2794 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2795 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
2796 assert_eq(1, dr.sink().nelt());
2797 last_topf = dr.sink()[0].topf;
2798 last_botf = dr.sink()[0].botf;
2799 }
2800 }
2801 }
2802
2803 // Query is longer than ftab and matches exactly once with one reference gap
2804 {
2805 size_t last_topf = std::numeric_limits<size_t>::max();
2806 size_t last_botf = std::numeric_limits<size_t>::max();
2807 for(int i = 0; i < 2; i++) {
2808 // Set up the read
2809 // Ref: CATGTCAGCTATATAGCGCGC" "TCGCATCATTTTGTGTGTAAACCA
2810 // |||||||||||||| ||||||||||||||||
2811 BTDnaString seq ("GCTATATAGCGCGCATG""TCGCATCATTTTGTGT", true);
2812 // 01234567890123456 7890123456789012
2813 BTString qual("ABCDEFGHIabcdefgh""iABCDEFGHIabcdef");
2814 for(size_t j = 0; j < seq.length(); j++) {
2815 // Assume left-to-right
2816 size_t beg = j;
2817 size_t end = j + Ebwt::default_ftabChars;
2818 // Mismatch penalty is 3, so we have to skip starting
2819 // points that are within 2 from the mismatch
2820 if((i > 0 && j > 0) || j == seq.length()-1) {
2821 // Right-to-left
2822 if(beg < Ebwt::default_ftabChars) {
2823 beg = 0;
2824 } else {
2825 beg -= Ebwt::default_ftabChars;
2826 }
2827 end -= Ebwt::default_ftabChars;
2828 }
2829 if(beg <= 14 && end >= 14) {
2830 continue;
2831 }
2832 if(beg <= 15 && end >= 15) {
2833 continue;
2834 }
2835 if(beg <= 16 && end >= 16) {
2836 continue;
2837 }
2838 cerr << "Test " << (++testnum) << endl;
2839 cerr << " Query matches once with a reference gap of length 1" << endl;
2840 DescentMetrics mets;
2841 PerReadMetrics prm;
2842 DescentDriver dr;
2843
2844 dr.initRead(
2845 Read("test", seq.toZBuf(), qual.toZBuf()),
2846 false, // nofw
2847 false, // norc
2848 -30, // minsc
2849 30); // maxpen
2850
2851 // Set up the DescentConfig
2852 DescentConfig conf;
2853 // Changed
2854 conf.cons.init(1, 0.25);
2855 conf.expol = DESC_EX_NONE;
2856
2857 // Set up the search roots
2858 dr.addRoot(
2859 conf, // DescentConfig
2860 j, // 5' offset into read of root
2861 i == 0, // left-to-right?
2862 true, // forward?
2863 1, // landing
2864 0.0f); // root priority
2865
2866 // Do the search
2867 Scoring sc = Scoring::base1();
2868 // Need to adjust the mismatch penalty up to avoid alignments
2869 // with lots of mismatches.
2870 sc.setMmPen(COST_MODEL_CONSTANT, 6, 6);
2871 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2872
2873 // Confirm that an exact-matching alignment was found
2874 assert_eq(1, dr.sink().nrange());
2875 assert_eq(sc.refGapOpen() + 2 * sc.refGapExtend(), dr.sink()[0].pen);
2876 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2877 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2878 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
2879 assert_eq(1, dr.sink().nelt());
2880 last_topf = dr.sink()[0].topf;
2881 last_botf = dr.sink()[0].botf;
2882 }
2883 }
2884 }
2885
2886 // Query is longer than ftab and matches exactly once with one read gap,
2887 // one ref gap, and one mismatch
2888 {
2889 size_t last_topf = std::numeric_limits<size_t>::max();
2890 size_t last_botf = std::numeric_limits<size_t>::max();
2891 for(int i = 0; i < 2; i++) {
2892 // Set up the read
2893 // Ref: CATGTCAGCT ATATAGCGCGCT CGCATCATTTTGTGTGTAAACCA
2894 // |||||||||| |||||||||||| |||||| |||||||||||||
2895 BTDnaString seq ("CATGTCAGCT""GATATAGCGCGCT" "GCATCAATTTGTGTGTAAAC", true);
2896 // 0123456789 0123456789012 34567890123456789012
2897 BTString qual("ABCDEFGHIa""bcdefghiACDEF" "GHIabcdefghijkABCDEF");
2898 for(size_t j = 0; j < seq.length(); j++) {
2899 // Assume left-to-right
2900 size_t beg = j;
2901 size_t end = j + Ebwt::default_ftabChars;
2902 // Mismatch penalty is 3, so we have to skip starting
2903 // points that are within 2 from the mismatch
2904 if((i > 0 && j > 0) || j == seq.length()-1) {
2905 // Right-to-left
2906 if(beg < Ebwt::default_ftabChars) {
2907 beg = 0;
2908 } else {
2909 beg -= Ebwt::default_ftabChars;
2910 }
2911 end -= Ebwt::default_ftabChars;
2912 }
2913 if(beg <= 10 && end >= 10) {
2914 continue;
2915 }
2916 if(beg <= 22 && end >= 22) {
2917 continue;
2918 }
2919 if(beg <= 30 && end >= 30) {
2920 continue;
2921 }
2922 cerr << "Test " << (++testnum) << endl;
2923 cerr << " Query matches once with a read gap of length 1" << endl;
2924 DescentMetrics mets;
2925 PerReadMetrics prm;
2926 DescentDriver dr;
2927
2928 dr.initRead(
2929 Read("test", seq.toZBuf(), qual.toZBuf()),
2930 false, // nofw
2931 false, // norc
2932 -50, // minsc
2933 50); // maxpen
2934
2935 // Set up the DescentConfig
2936 DescentConfig conf;
2937 // Changed
2938 conf.cons.init(1, 0.5);
2939 conf.expol = DESC_EX_NONE;
2940
2941 // Set up the search roots
2942 dr.addRoot(
2943 conf, // DescentConfig
2944 j, // 5' offset into read of root
2945 i == 0, // left-to-right?
2946 true, // forward?
2947 1, // landing
2948 0.0f); // root priority
2949
2950 // Do the search
2951 Scoring sc = Scoring::base1();
2952 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
2953
2954 // Confirm that an exact-matching alignment was found
2955 assert_eq(1, dr.sink().nrange());
2956 assert_eq(sc.readGapOpen() + sc.refGapOpen() + sc.mm((int)'d' - 33), dr.sink()[0].pen);
2957 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
2958 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
2959 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
2960 assert_eq(1, dr.sink().nelt());
2961 last_topf = dr.sink()[0].topf;
2962 last_botf = dr.sink()[0].botf;
2963 }
2964 }
2965 }
2966
2967 delete ebwts.first;
2968 delete ebwts.second;
2969
2970 // Ref CATGTCAGCT-ATATAGCGCGCTCGCATCATTTTGTGTGTAAAC
2971 // |||||||||| |||||||||||| |||||| |||||||||||||
2972 // Rd CATGTCAGCTGATATAGCGCGCT-GCATCAATTTGTGTGTAAAC
2973 strs.clear();
2974 strs.push_back(string("CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAAC"
2975 "NNNNNNNNNN"
2976 "CATGTCAGCTGATATAGCGCGCTCGCATCATTTTGTGTGTAAAC" // same but without first ref gap
2977 "N"
2978 "CATGTCAGCTATATAGCGCGCTGCATCATTTTGTGTGTAAAC" // same but without first read gap
2979 "N"
2980 "CATGTCAGCTATATAGCGCGCTCGCATCAATTTGTGTGTAAAC" // same but without first mismatch
2981 "N"
2982 "CATGTCAGCTGATATAGCGCGCTGCATCAATTTGTGTGTAAAC" // Exact match for read
2983 ));
2984 ebwts = Ebwt::fromStrings<SString<char> >(
2985 strs,
2986 packed,
2987 0,
2988 REF_READ_REVERSE,
2989 Ebwt::default_bigEndian,
2990 Ebwt::default_lineRate,
2991 Ebwt::default_offRate,
2992 Ebwt::default_ftabChars,
2993 ".aligner_seed2.cpp.tmp",
2994 Ebwt::default_useBlockwise,
2995 Ebwt::default_bmax,
2996 Ebwt::default_bmaxMultSqrt,
2997 Ebwt::default_bmaxDivN,
2998 Ebwt::default_dcv,
2999 Ebwt::default_seed,
3000 false, // verbose
3001 false, // autoMem
3002 false); // sanity
3003
3004 ebwts.first->loadIntoMemory (0, -1, true, true, true, true, false);
3005 ebwts.second->loadIntoMemory(0, 1, true, true, true, true, false);
3006
3007 // Query is longer than ftab and matches exactly once with one read gap,
3008 // one ref gap, and one mismatch
3009 {
3010 size_t last_topf = std::numeric_limits<size_t>::max();
3011 size_t last_botf = std::numeric_limits<size_t>::max();
3012 for(int i = 0; i < 2; i++) {
3013 // Set up the read
3014 // Ref: CATGTCAGCT ATATAGCGCGCT CGCATCATTTTGTGTGTAAACCA
3015 // |||||||||| |||||||||||| |||||| |||||||||||||
3016 BTDnaString seq ("CATGTCAGCT""GATATAGCGCGCT" "GCATCAATTTGTGTGTAAAC", true);
3017 // 0123456789 0123456789012 34567890123456789012
3018 BTString qual("ABCDEFGHIa""bcdefghiACDEF" "GHIabcdefghijkABCDEF");
3019 for(size_t j = 0; j < seq.length(); j++) {
3020 // Assume left-to-right
3021 size_t beg = j;
3022 size_t end = j + Ebwt::default_ftabChars;
3023 // Mismatch penalty is 3, so we have to skip starting
3024 // points that are within 2 from the mismatch
3025 if((i > 0 && j > 0) || j == seq.length()-1) {
3026 // Right-to-left
3027 if(beg < Ebwt::default_ftabChars) {
3028 beg = 0;
3029 } else {
3030 beg -= Ebwt::default_ftabChars;
3031 }
3032 end -= Ebwt::default_ftabChars;
3033 }
3034 if(beg <= 10 && end >= 10) {
3035 continue;
3036 }
3037 if(beg <= 22 && end >= 22) {
3038 continue;
3039 }
3040 if(beg <= 30 && end >= 30) {
3041 continue;
3042 }
3043 cerr << "Test " << (++testnum) << endl;
3044 cerr << " Query matches once with a read gap of length 1" << endl;
3045 DescentMetrics mets;
3046 PerReadMetrics prm;
3047 DescentDriver dr;
3048
3049 dr.initRead(
3050 Read("test", seq.toZBuf(), qual.toZBuf()),
3051 false, // nofw
3052 false, // norc
3053 -50, // minsc
3054 50); // maxpen
3055
3056 // Set up the DescentConfig
3057 DescentConfig conf;
3058 // Changed
3059 conf.cons.init(1, 0.5);
3060 conf.expol = DESC_EX_NONE;
3061
3062 // Set up the search roots
3063 dr.addRoot(
3064 conf, // DescentConfig
3065 j, // 5' offset into read of root
3066 i == 0, // left-to-right?
3067 true, // forward?
3068 1, // landing
3069 0.0f); // root priority
3070
3071 // Do the search
3072 Scoring sc = Scoring::base1();
3073 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
3074
3075 // Confirm that an exact-matching alignment was found
3076 assert_eq(5, dr.sink().nrange());
3077 assert_eq(0, dr.sink()[0].pen);
3078 assert_eq(min(sc.readGapOpen(), sc.refGapOpen()) + sc.mm((int)'d' - 33), dr.sink()[1].pen);
3079 assert_eq(max(sc.readGapOpen(), sc.refGapOpen()) + sc.mm((int)'d' - 33), dr.sink()[2].pen);
3080 assert_eq(sc.readGapOpen() + sc.refGapOpen(), dr.sink()[3].pen);
3081 assert_eq(sc.readGapOpen() + sc.refGapOpen() + sc.mm((int)'d' - 33), dr.sink()[4].pen);
3082 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
3083 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
3084 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
3085 assert_eq(5, dr.sink().nelt());
3086 last_topf = dr.sink()[0].topf;
3087 last_botf = dr.sink()[0].botf;
3088 }
3089 }
3090 }
3091
3092 // Query is longer than ftab and matches exactly once with one read gap,
3093 // one ref gap, one mismatch, and one N
3094 {
3095 size_t last_topf = std::numeric_limits<size_t>::max();
3096 size_t last_botf = std::numeric_limits<size_t>::max();
3097 for(int i = 0; i < 2; i++) {
3098 // Set up the read
3099 // Ref: CATGTCAGCT ATATAGCGCGCT CGCATCATTTTGTGTGTAAACCA
3100 // |||||||||| |||||||||||| |||||| |||||| ||||||
3101 BTDnaString seq ("CATGTCAGCT""GATATAGCGCGCT" "GCATCAATTTGTGNGTAAAC", true);
3102 // 0123456789 0123456789012 34567890123456789012
3103 BTString qual("ABCDEFGHIa""bcdefghiACDEF" "GHIabcdefghijkABCDEF");
3104 for(size_t j = 0; j < seq.length(); j++) {
3105 // Assume left-to-right
3106 size_t beg = j;
3107 size_t end = j + Ebwt::default_ftabChars;
3108 // Mismatch penalty is 3, so we have to skip starting
3109 // points that are within 2 from the mismatch
3110 if((i > 0 && j > 0) || j == seq.length()-1) {
3111 // Right-to-left
3112 if(beg < Ebwt::default_ftabChars) {
3113 beg = 0;
3114 } else {
3115 beg -= Ebwt::default_ftabChars;
3116 }
3117 end -= Ebwt::default_ftabChars;
3118 }
3119 if(beg <= 10 && end >= 10) {
3120 continue;
3121 }
3122 if(beg <= 22 && end >= 22) {
3123 continue;
3124 }
3125 if(beg <= 30 && end >= 30) {
3126 continue;
3127 }
3128 if(beg <= 36 && end >= 36) {
3129 continue;
3130 }
3131 cerr << "Test " << (++testnum) << endl;
3132 cerr << " Query matches with various patterns of gaps, mismatches and Ns" << endl;
3133 DescentMetrics mets;
3134 PerReadMetrics prm;
3135 DescentDriver dr;
3136
3137 dr.initRead(
3138 Read("test", seq.toZBuf(), qual.toZBuf()),
3139 false, // nofw
3140 false, // norc
3141 -50, // minsc
3142 50); // maxpen
3143
3144 // Set up the DescentConfig
3145 DescentConfig conf;
3146 // Changed
3147 conf.cons.init(1, 0.5);
3148 conf.expol = DESC_EX_NONE;
3149
3150 // Set up the search roots
3151 dr.addRoot(
3152 conf, // DescentConfig
3153 j, // 5' offset into read of root
3154 i == 0, // left-to-right?
3155 true, // forward?
3156 1, // landing
3157 0.0f); // root priority
3158
3159 // Do the search
3160 Scoring sc = Scoring::base1();
3161 sc.setNPen(COST_MODEL_CONSTANT, 1);
3162 dr.go(sc, *ebwts.first, *ebwts.second, mets, prm);
3163
3164 // Confirm that an exact-matching alignment was found
3165 assert_eq(5, dr.sink().nrange());
3166 assert_eq(sc.n(40), dr.sink()[0].pen);
3167 assert_eq(sc.n(40) + min(sc.readGapOpen(), sc.refGapOpen()) + sc.mm((int)'d' - 33), dr.sink()[1].pen);
3168 assert_eq(sc.n(40) + max(sc.readGapOpen(), sc.refGapOpen()) + sc.mm((int)'d' - 33), dr.sink()[2].pen);
3169 assert_eq(sc.n(40) + sc.readGapOpen() + sc.refGapOpen(), dr.sink()[3].pen);
3170 assert_eq(sc.n(40) + sc.readGapOpen() + sc.refGapOpen() + sc.mm((int)'d' - 33), dr.sink()[4].pen);
3171 assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf);
3172 assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf);
3173 cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl;
3174 assert_eq(5, dr.sink().nelt());
3175 last_topf = dr.sink()[0].topf;
3176 last_botf = dr.sink()[0].botf;
3177 }
3178 }
3179 }
3180
3181 delete ebwts.first;
3182 delete ebwts.second;
3183
3184 cerr << "DONE" << endl;
3185 }
3186 #endif
3187
3188