1 /* 2 * range_source.h 3 */ 4 5 #ifndef RANGE_SOURCE_H_ 6 #define RANGE_SOURCE_H_ 7 8 #include <stdint.h> 9 #include <queue> 10 #include <vector> 11 12 #include "ds.h" 13 #include "ebwt.h" 14 #include "range.h" 15 #include "pool.h" 16 #include "edit.h" 17 18 enum AdvanceUntil { 19 ADV_FOUND_RANGE = 1, 20 ADV_COST_CHANGES, 21 ADV_STEP 22 }; 23 24 /** 25 * List of Edits that automatically expands as edits are added. 26 */ 27 struct EditList { 28 EditListEditList29 EditList() : sz_(0), moreEdits_(NULL), yetMoreEdits_(NULL) { } 30 31 /** 32 * Add an edit to the edit list. 33 */ addEditList34 bool add(const Edit& e, AllocOnlyPool<Edit>& pool, size_t qlen) { 35 assert_lt(sz_, qlen + 10); 36 if(sz_ < numEdits) { 37 assert(moreEdits_ == NULL); 38 assert(yetMoreEdits_ == NULL); 39 edits_[sz_++] = e; 40 } else if(sz_ == numEdits) { 41 assert(moreEdits_ == NULL); 42 assert(yetMoreEdits_ == NULL); 43 moreEdits_ = pool.alloc(numMoreEdits); 44 if(moreEdits_ == NULL) { 45 return false; 46 } 47 assert(moreEdits_ != NULL); 48 moreEdits_[0] = e; 49 sz_++; 50 } else if(sz_ < (numEdits + numMoreEdits)) { 51 assert(moreEdits_ != NULL); 52 assert(yetMoreEdits_ == NULL); 53 moreEdits_[sz_ - numEdits] = e; 54 sz_++; 55 } else if(sz_ == (numEdits + numMoreEdits)) { 56 assert(moreEdits_ != NULL); 57 assert(yetMoreEdits_ == NULL); 58 yetMoreEdits_ = pool.alloc((uint32_t)qlen + 10 - numMoreEdits - numEdits); 59 if(yetMoreEdits_ == NULL) { 60 return false; 61 } 62 assert(yetMoreEdits_ != NULL); 63 yetMoreEdits_[0] = e; 64 sz_++; 65 } else { 66 assert(moreEdits_ != NULL); 67 assert(yetMoreEdits_ != NULL); 68 yetMoreEdits_[sz_ - numEdits - numMoreEdits] = e; 69 sz_++; 70 } 71 return true; 72 } 73 74 /** 75 * Return a const reference to the ith Edit in the list. 76 */ getEditList77 const Edit& get(size_t i) const { 78 assert_lt(i, sz_); 79 if(i < numEdits) { 80 return edits_[i]; 81 } else if(i < (numEdits + numMoreEdits)) { 82 assert(moreEdits_ != NULL); 83 return moreEdits_[i-numEdits]; 84 } else { 85 assert(moreEdits_ != NULL); 86 assert(yetMoreEdits_ != NULL); 87 return yetMoreEdits_[i-numEdits-numMoreEdits]; 88 } 89 } 90 91 /** 92 * Get most recently added Edit. 93 */ topEditList94 const Edit& top() const { 95 assert_gt(size(), 0); 96 return get(size()-1); 97 } 98 99 /** 100 * Return true iff no Edits have been added. 101 */ emptyEditList102 bool empty() const { return size() == 0; } 103 104 /** 105 * Set a particular element of the EditList. 106 */ setEditList107 void set(size_t i, const Edit& e) { 108 assert_lt(i, sz_); 109 if(i < numEdits) { 110 edits_[i] = e; 111 } else if(i < (numEdits + numMoreEdits)) { 112 assert(moreEdits_ != NULL); 113 moreEdits_[i-numEdits] = e; 114 } else { 115 assert(moreEdits_ != NULL); 116 assert(yetMoreEdits_ != NULL); 117 yetMoreEdits_[i-numEdits-numMoreEdits] = e; 118 } 119 } 120 121 /** 122 * Remove all Edits from the list. 123 */ clearEditList124 void clear() { 125 sz_ = 0; 126 moreEdits_ = NULL; 127 yetMoreEdits_ = NULL; 128 } 129 130 /** 131 * Return number of Edits in the List. 132 */ sizeEditList133 size_t size() const { return sz_; } 134 135 /** 136 * Free all the heap-allocated edit lists 137 */ freeEditList138 void free(AllocOnlyPool<Edit>& epool, size_t qlen) { 139 if(yetMoreEdits_ != NULL) 140 epool.free(yetMoreEdits_, (uint32_t)qlen + 10 - numMoreEdits - numEdits); 141 if(moreEdits_ != NULL) 142 epool.free(moreEdits_, numMoreEdits); 143 } 144 145 const static size_t numEdits = 6; // part of object allocation 146 const static size_t numMoreEdits = 16; // first extra allocation 147 size_t sz_; // number of Edits stored in the EditList 148 Edit edits_[numEdits]; // first 4 edits; typically, no more are needed 149 Edit *moreEdits_; // if used, size is dictated by numMoreEdits 150 Edit *yetMoreEdits_; // if used, size is dictated by length of read 151 }; 152 153 /** 154 * Holds per-position information about what outgoing paths have been 155 * eliminated and what the quality value(s) is (are) at the position. 156 */ 157 union ElimsAndQual { 158 159 /** 160 * Assuming qual A/C/G/T are already set, set quallo and quallo2 161 * to the additional cost incurred by the least and second-least 162 * costly paths. 163 */ updateLo()164 void updateLo() { 165 flags.quallo = 127; 166 flags.quallo2 = 127; 167 if(!flags.mmA) { 168 // A mismatch to an A in the genome has not been ruled out 169 if(flags.qualA < flags.quallo) { 170 //flags.quallo2 = flags.quallo; 171 flags.quallo = flags.qualA; 172 } 173 //else if(flags.qualA == flags.quallo) { 174 // flags.quallo2 = flags.quallo; 175 //} else if(flags.qualA < flags.quallo2) { 176 // flags.quallo2 = flags.qualA; 177 //} 178 } 179 if(!flags.mmC) { 180 // A mismatch to a C in the genome has not been ruled out 181 if(flags.qualC < flags.quallo) { 182 flags.quallo2 = flags.quallo; 183 flags.quallo = flags.qualC; 184 } else if(flags.qualC == flags.quallo) { 185 flags.quallo2 = flags.quallo; 186 } else if(flags.qualC < flags.quallo2) { 187 flags.quallo2 = flags.qualC; 188 } 189 } 190 if(!flags.mmG) { 191 // A mismatch to a G in the genome has not been ruled out 192 if(flags.qualG < flags.quallo) { 193 flags.quallo2 = flags.quallo; 194 flags.quallo = flags.qualG; 195 } else if(flags.qualG == flags.quallo) { 196 flags.quallo2 = flags.quallo; 197 } else if(flags.qualG < flags.quallo2) { 198 flags.quallo2 = flags.qualG; 199 } 200 } 201 if(!flags.mmT) { 202 // A mismatch to a T in the genome has not been ruled out 203 if(flags.qualT < flags.quallo) { 204 flags.quallo2 = flags.quallo; 205 flags.quallo = flags.qualT; 206 } else if(flags.qualT == flags.quallo) { 207 flags.quallo2 = flags.quallo; 208 } else if(flags.qualT < flags.quallo2) { 209 flags.quallo2 = flags.qualT; 210 } 211 } 212 assert(repOk()); 213 } 214 215 /** 216 * Set all 13 elimination bits of the flags field to 1, indicating 217 * that all outgoing paths are eliminated. 218 */ eliminate()219 inline void eliminate() { 220 join.elims = ((1 << 13) - 1); 221 } 222 223 /** 224 * Internal consistency check. Basically just checks that lo and 225 * lo2 are set correctly. 226 */ repOk()227 bool repOk() const { 228 uint8_t lo = 127; 229 uint8_t lo2 = 127; 230 assert_lt(flags.qualA, 127); 231 assert_lt(flags.qualC, 127); 232 assert_lt(flags.qualG, 127); 233 assert_lt(flags.qualT, 127); 234 if(!flags.mmA) { 235 if(flags.qualA < lo) { 236 lo = flags.qualA; 237 } 238 //else if(flags.qualA == lo || flags.qualA < lo2) { 239 // lo2 = flags.qualA; 240 //} 241 } 242 if(!flags.mmC) { 243 if(flags.qualC < lo) { 244 lo2 = lo; 245 lo = flags.qualC; 246 } else if(flags.qualC == lo || flags.qualC < lo2) { 247 lo2 = flags.qualC; 248 } 249 } 250 if(!flags.mmG) { 251 if(flags.qualG < lo) { 252 lo2 = lo; 253 lo = flags.qualG; 254 } else if(flags.qualG == lo || flags.qualG < lo2) { 255 lo2 = flags.qualG; 256 } 257 } 258 if(!flags.mmT) { 259 if(flags.qualT < lo) { 260 lo2 = lo; 261 lo = flags.qualT; 262 } else if(flags.qualT == lo || flags.qualT < lo2) { 263 lo2 = flags.qualT; 264 } 265 } 266 assert_eq((int)lo, (int)flags.quallo); 267 assert_eq((int)lo2, (int)flags.quallo2); 268 return true; 269 } 270 271 struct { 272 uint64_t mmA : 1; // A in ref aligns to non-A char in read 273 uint64_t mmC : 1; // C in ref aligns to non-C char in read 274 uint64_t mmG : 1; // G in ref aligns to non-G char in read 275 uint64_t mmT : 1; // T in ref aligns to non-T char in read 276 uint64_t snpA : 1; // Same as mmA, but we think it's a SNP and not a miscall 277 uint64_t snpC : 1; // Same as mmC, but we think it's a SNP and not a miscall 278 uint64_t snpG : 1; // Same as mmG, but we think it's a SNP and not a miscall 279 uint64_t snpT : 1; // Same as mmT, but we think it's a SNP and not a miscall 280 uint64_t insA : 1; // A insertion in reference w/r/t read 281 uint64_t insC : 1; // C insertion in reference w/r/t read 282 uint64_t insG : 1; // G insertion in reference w/r/t read 283 uint64_t insT : 1; // T insertion in reference w/r/t read 284 uint64_t del : 1; // deletion of read character 285 uint64_t qualA : 7; // quality penalty for picking A at this position 286 uint64_t qualC : 7; // quality penalty for picking C at this position 287 uint64_t qualG : 7; // quality penalty for picking G at this position 288 uint64_t qualT : 7; // quality penalty for picking T at this position 289 uint64_t quallo : 7; // lowest quality penalty at this position 290 uint64_t quallo2 : 7; // 2nd-lowest quality penalty at this position 291 uint64_t reserved : 9; 292 } flags; 293 struct { 294 uint64_t elims : 13; // all of the edit-elim flags bundled together 295 uint64_t quals : 42; // quality of positions 296 uint64_t reserved : 9; 297 } join; 298 struct { 299 uint64_t mmElims : 4; // substitution flags bundled together 300 uint64_t snpElims : 4; // substitution flags bundled together 301 uint64_t insElims : 4; // inserts-in-reference flags bundled together 302 uint64_t delElims : 1; // deletion of read character 303 uint64_t quals : 42; // quality of positions 304 uint64_t reserved : 9; 305 } join2; 306 }; 307 308 /** 309 * All per-position state, including the ranges calculated for each 310 * character, the quality value at the position, and a set of flags 311 * recording whether we've tried each way of proceeding from this 312 * position. 313 */ 314 struct RangeState { 315 316 /** 317 * Using randomness when picking from among multiple choices, pick 318 * an edit to make. TODO: Only knows how to pick mismatches for 319 * now. 320 */ pickEditRangeState321 Edit pickEdit(int pos, RandomSource& rand, 322 TIndexOffU& top, TIndexOffU& bot, bool indels, 323 bool& last) 324 { 325 Edit ret; 326 ret.type = EDIT_TYPE_MM; 327 ret.pos = pos; 328 ret.chr = 0; 329 ret.qchr = 0; 330 ret.reserved = 0; 331 assert(!eliminated_); 332 assert(!eq.flags.mmA || !eq.flags.mmC || !eq.flags.mmG || !eq.flags.mmT); 333 int num = !eq.flags.mmA + !eq.flags.mmC + !eq.flags.mmG + !eq.flags.mmT; 334 assert_leq(num, 4); 335 assert_gt(num, 0); 336 if(num == 2) eq.flags.quallo2 = 127; 337 // Only need to pick randomly if there's a quality tie 338 if(num > 1) { 339 last = false; // not the last at this pos 340 // Sum up range sizes and do a random weighted pick 341 TIndexOffU tot = 0; 342 bool candA = !eq.flags.mmA; bool candC = !eq.flags.mmC; 343 bool candG = !eq.flags.mmG; bool candT = !eq.flags.mmT; 344 bool candInsA = false, candInsC = false; 345 bool candInsG = false, candInsT = false; 346 bool candDel = false; 347 if(indels) { 348 // Insertions and deletions can only be candidates 349 // if the user asked for indels 350 candInsA = !eq.flags.insA; 351 candInsC = !eq.flags.insC; 352 candInsG = !eq.flags.insG; 353 candInsT = !eq.flags.insT; 354 candDel = !eq.flags.del; 355 } 356 ASSERT_ONLY(int origNum = num); 357 if(candA) { 358 assert_gt(bots[0], tops[0]); 359 tot += (bots[0] - tops[0]); 360 assert_gt(num, 0); 361 ASSERT_ONLY(num--); 362 } 363 if(candC) { 364 assert_gt(bots[1], tops[1]); 365 tot += (bots[1] - tops[1]); 366 assert_gt(num, 0); 367 ASSERT_ONLY(num--); 368 } 369 if(candG) { 370 assert_gt(bots[2], tops[2]); 371 tot += (bots[2] - tops[2]); 372 assert_gt(num, 0); 373 ASSERT_ONLY(num--); 374 } 375 if(candT) { 376 assert_gt(bots[3], tops[3]); 377 tot += (bots[3] - tops[3]); 378 assert_gt(num, 0); 379 ASSERT_ONLY(num--); 380 } 381 if(indels) { 382 if(candInsA) { 383 assert_gt(bots[0], tops[0]); 384 tot += (bots[0] - tops[0]); 385 assert_gt(num, 0); 386 ASSERT_ONLY(num--); 387 } 388 if(candInsC) { 389 assert_gt(bots[1], tops[1]); 390 tot += (bots[1] - tops[1]); 391 assert_gt(num, 0); 392 ASSERT_ONLY(num--); 393 } 394 if(candInsG) { 395 assert_gt(bots[2], tops[2]); 396 tot += (bots[2] - tops[2]); 397 assert_gt(num, 0); 398 ASSERT_ONLY(num--); 399 } 400 if(candInsT) { 401 assert_gt(bots[3], tops[3]); 402 tot += (bots[3] - tops[3]); 403 assert_gt(num, 0); 404 ASSERT_ONLY(num--); 405 } 406 if(candDel) { 407 // Always a candidate? 408 // Always a candidate just within the window? 409 // We can eliminate some indels based on the content downstream, but not many 410 } 411 } 412 413 assert_geq(num, 0); 414 assert_lt(num, origNum); 415 // Throw a dart randomly that hits one of the possible 416 // substitutions, with likelihoods weighted by range size 417 uint32_t dart = rand.nextU32() % tot; 418 if(candA) { 419 if(dart < (bots[0] - tops[0])) { 420 // Eliminate A mismatch 421 top = tops[0]; 422 bot = bots[0]; 423 eq.flags.mmA = 1; 424 assert_lt(eq.join2.mmElims, 15); 425 ret.chr = 'A'; 426 return ret; 427 } 428 dart -= (bots[0] - tops[0]); 429 } 430 if(candC) { 431 if(dart < (bots[1] - tops[1])) { 432 // Eliminate C mismatch 433 top = tops[1]; 434 bot = bots[1]; 435 eq.flags.mmC = 1; 436 assert_lt(eq.join2.mmElims, 15); 437 ret.chr = 'C'; 438 return ret; 439 } 440 dart -= (bots[1] - tops[1]); 441 } 442 if(candG) { 443 if(dart < (bots[2] - tops[2])) { 444 // Eliminate G mismatch 445 top = tops[2]; 446 bot = bots[2]; 447 eq.flags.mmG = 1; 448 assert_lt(eq.join2.mmElims, 15); 449 ret.chr = 'G'; 450 return ret; 451 } 452 dart -= (bots[2] - tops[2]); 453 } 454 if(candT) { 455 assert_lt(dart, (bots[3] - tops[3])); 456 // Eliminate T mismatch 457 top = tops[3]; 458 bot = bots[3]; 459 eq.flags.mmT = 1; 460 assert_lt(eq.join2.mmElims, 15); 461 ret.chr = 'T'; 462 } 463 } else { 464 last = true; // last at this pos 465 // There's only one; pick it! 466 int chr = -1; 467 if(!eq.flags.mmA) { 468 chr = 0; 469 } else if(!eq.flags.mmC) { 470 chr = 1; 471 } else if(!eq.flags.mmG) { 472 chr = 2; 473 } else { 474 assert(!eq.flags.mmT); 475 chr = 3; 476 } 477 ret.chr = "ACGT"[chr]; 478 top = tops[chr]; 479 bot = bots[chr]; 480 //assert_eq(15, eq.join2.mmElims); 481 // Mark entire position as eliminated 482 eliminated_ = true; 483 } 484 return ret; 485 } 486 487 /** 488 * Return true (without assertion) iff this RangeState is 489 * internally consistent. 490 */ repOkRangeState491 bool repOk() { 492 if(eliminated_) return true; 493 // Uneliminated chars must have non-empty ranges 494 if(!eq.flags.mmA || !eq.flags.insA) assert_gt(bots[0], tops[0]); 495 if(!eq.flags.mmC || !eq.flags.insC) assert_gt(bots[1], tops[1]); 496 if(!eq.flags.mmG || !eq.flags.insG) assert_gt(bots[2], tops[2]); 497 if(!eq.flags.mmT || !eq.flags.insT) assert_gt(bots[3], tops[3]); 498 return true; 499 } 500 501 // Outgoing ranges; if the position being described is not a 502 // legitimate jumping-off point for a branch, tops[] and bots[] 503 // will be filled with 0s and all possibilities in eq will be 504 // eliminated 505 TIndexOffU tops[4]; // A, C, G, T top offsets 506 TIndexOffU bots[4]; // A, C, G, T bot offsets 507 ElimsAndQual eq; // Which outgoing paths have been tried already 508 bool eliminated_; // Whether all outgoing paths have been eliminated 509 }; 510 511 /** 512 * Encapsulates a "branch" of the search space; i.e. all of the 513 * information deduced by walking along a path with only matches, along 514 * with information about the decisions that lead to the root of that 515 * path. 516 */ 517 class Branch { 518 typedef std::pair<TIndexOffU, TIndexOffU> UPair; 519 public: Branch()520 Branch() : 521 delayedCost_(0), curtailed_(false), exhausted_(false), 522 prepped_(false), delayedIncrease_(false) { } 523 524 /** 525 * Initialize a new branch object with an empty path. 526 */ 527 bool init(AllocOnlyPool<RangeState>& rsPool, 528 AllocOnlyPool<Edit>& epool, 529 uint32_t id, 530 uint32_t qlen, 531 uint16_t depth0, 532 uint16_t depth1, 533 uint16_t depth2, 534 uint16_t depth3, 535 uint16_t rdepth, 536 uint16_t len, 537 uint16_t cost, 538 uint16_t ham, 539 TIndexOffU itop, 540 TIndexOffU ibot, 541 const EbwtParams& ep, 542 const uint8_t* ebwt, 543 const EditList* edits = NULL) 544 { 545 id_ = id; 546 delayedCost_ = 0; 547 depth0_ = depth0; 548 depth1_ = depth1; 549 depth2_ = depth2; 550 depth3_ = depth3; 551 assert_gt(depth3_, 0); 552 rdepth_ = rdepth; 553 len_ = len; 554 cost_ = cost; 555 ham_ = ham; 556 top_ = itop; 557 bot_ = ibot; 558 559 if(ibot > itop+1) { 560 // Care about both top and bot 561 SideLocus::initFromTopBot(itop, ibot, ep, ebwt, ltop_, lbot_); 562 } else if(ibot > itop) { 563 // Only care about top 564 ltop_.initFromRow(itop, ep, ebwt); 565 lbot_.invalidate(); 566 } 567 if(qlen - rdepth_ > 0) { 568 ranges_ = rsPool.allocC(qlen - rdepth_); // allocated from the RangeStatePool 569 if(ranges_ == NULL) { 570 return false; // RangeStatePool exhausted 571 } 572 rangesSz_ = qlen - rdepth_; 573 } else { 574 ranges_ = NULL; 575 rangesSz_ = 0; 576 } 577 #ifndef NDEBUG 578 for(size_t i = 0; i < (qlen - rdepth_); i++) { 579 for(int j = 0; j < 4; j++) { 580 assert_eq(0, ranges_[i].tops[j]); 581 assert_eq(0, ranges_[i].bots[j]); 582 } 583 } 584 #endif 585 curtailed_ = false; 586 exhausted_ = false; 587 prepped_ = true; 588 delayedIncrease_ = false; 589 edits_.clear(); 590 if(edits != NULL) { 591 const size_t numEdits = edits->size(); 592 for(size_t i = 0; i < numEdits; i++) { 593 edits_.add(edits->get(i), epool, qlen); 594 } 595 } 596 // If we're starting with a non-zero length, that means we're 597 // jumping over a bunch of unrevisitable positions. 598 for(size_t i = 0; i < len_; i++) { 599 ranges_[i].eliminated_ = true; 600 assert(eliminated((int)i)); 601 } 602 assert(repOk(qlen)); 603 return true; 604 } 605 606 /** 607 * Depth of the deepest tip of the branch. 608 */ tipDepth()609 uint16_t tipDepth() const { 610 return rdepth_ + len_; 611 } 612 613 /** 614 * Return true iff all outgoing edges from position i have been 615 * eliminated. 616 */ eliminated(int i)617 inline bool eliminated(int i) const { 618 assert(!exhausted_); 619 if(i <= len_ && i < rangesSz_) { 620 assert(ranges_ != NULL); 621 #ifndef NDEBUG 622 if(!ranges_[i].eliminated_) { 623 // Someone must be as-yet-uneliminated 624 assert(!ranges_[i].eq.flags.mmA || 625 !ranges_[i].eq.flags.mmC || 626 !ranges_[i].eq.flags.mmG || 627 !ranges_[i].eq.flags.mmT); 628 assert_lt(ranges_[i].eq.flags.quallo, 127); 629 } 630 #endif 631 return ranges_[i].eliminated_; 632 } 633 return true; 634 } 635 636 /** 637 * Split off a new branch by selecting a good outgoing path and 638 * creating a new Branch object for it and inserting that branch 639 * into the priority queue. Mark that outgoing path from the 640 * parent branch as eliminated. If the second-best outgoing path 641 * cost more, add the difference to the cost of this branch (since 642 * that's the best we can do starting from here from now on). 643 */ splitBranch(AllocOnlyPool<RangeState> & rpool,AllocOnlyPool<Edit> & epool,AllocOnlyPool<Branch> & bpool,uint32_t pmSz,RandomSource & rand,uint32_t qlen,uint32_t qualLim,int seedLen,bool qualOrder,const EbwtParams & ep,const uint8_t * ebwt,bool ebwtFw,bool verbose,bool quiet)644 Branch* splitBranch(AllocOnlyPool<RangeState>& rpool, 645 AllocOnlyPool<Edit>& epool, 646 AllocOnlyPool<Branch>& bpool, 647 uint32_t pmSz, 648 RandomSource& rand, uint32_t qlen, 649 uint32_t qualLim, int seedLen, 650 bool qualOrder, const EbwtParams& ep, 651 const uint8_t* ebwt, bool ebwtFw, 652 bool verbose, 653 bool quiet) 654 { 655 assert(!exhausted_); 656 assert(ranges_ != NULL); 657 assert(curtailed_); 658 assert_gt(pmSz, 0); 659 Branch *newBranch = bpool.alloc(); 660 if(newBranch == NULL) { 661 return NULL; 662 } 663 assert(newBranch != NULL); 664 uint32_t id = bpool.lastId(); 665 int tiedPositions[3]; 666 int numTiedPositions = 0; 667 // Lowest marginal cost incurred by any of the positions with 668 // non-eliminated outgoing edges 669 uint16_t bestCost = 0xffff; 670 // Next-lowest 671 uint16_t nextCost = 0xffff; 672 int numNotEliminated = 0; 673 int i = (int)depth0_; 674 i = max(0, i - rdepth_); 675 // Iterate over revisitable positions in the path 676 for(; i <= len_; i++) { 677 // If there are still valid options for leaving out of this 678 // position 679 if(!eliminated(i)) { 680 numNotEliminated++; 681 uint16_t stratum = (rdepth_ + i < seedLen) ? (1 << 14) : 0; 682 uint16_t cost = stratum; 683 cost |= (qualOrder ? ranges_[i].eq.flags.quallo : 0); 684 if(cost < bestCost) { 685 // Demote the old best to the next-best 686 nextCost = bestCost; 687 // Update the new best 688 bestCost = cost; 689 numTiedPositions = 1; 690 tiedPositions[0] = i; 691 } else if(cost == bestCost) { 692 // As good as the best so far 693 assert_gt(numTiedPositions, 0); 694 if(numTiedPositions < 3) { 695 tiedPositions[numTiedPositions++] = i; 696 } else { 697 tiedPositions[0] = tiedPositions[1]; 698 tiedPositions[1] = tiedPositions[2]; 699 tiedPositions[2] = i; 700 } 701 } else if(cost < nextCost) { 702 // 'cost' isn't beter than the best, but it is 703 // better than the next-best 704 nextCost = cost; 705 } 706 } 707 } 708 assert_gt(numNotEliminated, 0); 709 assert_gt(numTiedPositions, 0); 710 //if(nextCost != 0xffff) assert_gt(nextCost, bestCost); 711 int r = 0; 712 if(numTiedPositions > 1) { 713 r = rand.nextU32() % numTiedPositions; 714 } 715 int pos = tiedPositions[r]; 716 bool last = false; 717 // Pick an edit from among the edits tied for lowest cost 718 // (using randomness to break ties). If the selected edit is 719 // the last remaining one at this position, 'last' is set to 720 // true. 721 TIndexOffU top = 0, bot = 0; 722 Edit e = ranges_[pos].pickEdit(pos + rdepth_, rand, top, bot, false, last); 723 assert_gt(bot, top); 724 // Create and initialize a new Branch 725 uint16_t newRdepth = rdepth_ + pos + 1; 726 assert_lt((bestCost >> 14), 4); 727 uint32_t hamadd = (bestCost & ~0xc000); 728 uint16_t depth = pos + rdepth_; 729 assert_geq(depth, depth0_); 730 uint16_t newDepth0 = depth0_; 731 uint16_t newDepth1 = depth1_; 732 uint16_t newDepth2 = depth2_; 733 uint16_t newDepth3 = depth3_; 734 if(depth < depth1_) newDepth0 = depth1_; 735 if(depth < depth2_) newDepth1 = depth2_; 736 if(depth < depth3_) newDepth2 = depth3_; 737 assert_eq((uint32_t)(cost_ & ~0xc000), (uint32_t)(ham_ + hamadd)); 738 if(!newBranch->init( 739 rpool, epool, id, qlen, 740 newDepth0, newDepth1, newDepth2, newDepth3, 741 newRdepth, 0, cost_, ham_ + hamadd, 742 top, bot, ep, ebwt, &edits_)) 743 { 744 return NULL; 745 } 746 // Add the new edit 747 newBranch->edits_.add(e, epool, qlen); 748 if(numNotEliminated == 1 && last) { 749 // This branch is totally exhausted; there are no more 750 // valid outgoing paths from any positions within it. 751 // Remove it from the PathManager and mark it as exhausted. 752 // The caller should delete it. 753 exhausted_ = true; 754 if(ranges_ != NULL) { 755 assert_gt(rangesSz_, 0); 756 if(rpool.free(ranges_, rangesSz_)) { 757 ranges_ = NULL; 758 rangesSz_ = 0; 759 } 760 } 761 } 762 else if(numTiedPositions == 1 && last) { 763 // We exhausted the last outgoing edge at the current best 764 // cost; update the best cost to be the next-best 765 assert_neq(0xffff, nextCost); 766 if(bestCost != nextCost) { 767 assert_gt(nextCost, bestCost); 768 delayedCost_ = (cost_ - bestCost + nextCost); 769 delayedIncrease_ = true; 770 } 771 } 772 return newBranch; 773 } 774 775 /** 776 * Free a branch and all its contents. 777 */ free(uint32_t qlen,AllocOnlyPool<RangeState> & rpool,AllocOnlyPool<Edit> & epool,AllocOnlyPool<Branch> & bpool)778 void free(uint32_t qlen, 779 AllocOnlyPool<RangeState>& rpool, 780 AllocOnlyPool<Edit>& epool, 781 AllocOnlyPool<Branch>& bpool) 782 { 783 edits_.free(epool, qlen); 784 if(ranges_ != NULL) { 785 assert_gt(rangesSz_, 0); 786 rpool.free(ranges_, rangesSz_); 787 ranges_ = NULL; 788 rangesSz_ = 0; 789 } 790 bpool.free(this); 791 } 792 793 /** 794 * Pretty-print the state of this branch. 795 */ print(const BTDnaString & qry,const BTString & quals,uint16_t minCost,std::ostream & out,bool halfAndHalf,bool seeded,bool fw,bool ebwtFw)796 void print(const BTDnaString& qry, 797 const BTString& quals, 798 uint16_t minCost, 799 std::ostream& out, 800 bool halfAndHalf, 801 bool seeded, 802 bool fw, 803 bool ebwtFw) 804 { 805 size_t editidx = 0; 806 size_t printed = 0; 807 const size_t qlen = qry.length(); 808 if(exhausted_) out << "E "; 809 else if(curtailed_) out << "C "; 810 else out << " "; 811 if(ebwtFw) out << "<"; 812 else out << ">"; 813 if(fw) out << "F "; 814 else out << "R "; 815 std::stringstream ss; 816 ss << cost_; 817 string s = ss.str(); 818 if(s.length() < 6) { 819 for(size_t i = 0; i < 6 - s.length(); i++) { 820 out << "0"; 821 } 822 } 823 out << s << " "; 824 std::stringstream ss2; 825 ss2 << minCost; 826 s = ss2.str(); 827 if(s.length() < 6) { 828 for(size_t i = 0; i < 6 - s.length(); i++) { 829 out << "0"; 830 } 831 } 832 out << s; 833 if(halfAndHalf) out << " h "; 834 else if(seeded) out << " s "; 835 else out << " "; 836 std::stringstream ss3; 837 const size_t numEdits = edits_.size(); 838 if(rdepth_ > 0) { 839 for(size_t i = 0; i < rdepth_; i++) { 840 if(editidx < numEdits && edits_.get(editidx).pos == i) { 841 ss3 << " " << (char)tolower(edits_.get(editidx).chr); 842 editidx++; 843 } else { 844 ss3 << " " << (char)qry.toChar(qlen - i - 1); 845 } 846 printed++; 847 } 848 ss3 << "|"; 849 } else { 850 ss3 << " "; 851 } 852 for(size_t i = 0; i < len_; i++) { 853 if(editidx < numEdits && edits_.get(editidx).pos == printed) { 854 ss3 << (char)tolower(edits_.get(editidx).chr) << " "; 855 editidx++; 856 } else { 857 ss3 << (char)qry.toChar(qlen - printed - 1) << " "; 858 } 859 printed++; 860 } 861 assert_eq(editidx, edits_.size()); 862 for(size_t i = printed; i < qlen; i++) { 863 ss3 << "= "; 864 } 865 s = ss3.str(); 866 if(ebwtFw) { 867 std::reverse(s.begin(), s.end()); 868 } 869 out << s << endl; 870 } 871 872 /** 873 * Called when the most recent branch extension resulted in an 874 * empty range or some other constraint violation (e.g., a 875 * half-and-half constraint). 876 */ curtail(AllocOnlyPool<RangeState> & rpool,int seedLen,bool qualOrder)877 void curtail(AllocOnlyPool<RangeState>& rpool, int seedLen, bool qualOrder) { 878 assert(!curtailed_); 879 assert(!exhausted_); 880 if(ranges_ == NULL) { 881 exhausted_ = true; 882 curtailed_ = true; 883 return; 884 } 885 uint16_t lowestCost = 0xffff; 886 // Iterate over positions in the path looking for the cost of 887 // the lowest-cost non-eliminated position 888 uint32_t eliminatedStretch = 0; 889 int i = (int)depth0_; 890 i = max(0, i - rdepth_); 891 // TODO: It matters whether an insertion/deletion at given 892 // position would be a gap open or a gap extension 893 for(; i <= len_; i++) { 894 if(!eliminated(i)) { 895 eliminatedStretch = 0; 896 uint16_t stratum = (rdepth_ + i < seedLen) ? (1 << 14) : 0; 897 uint16_t cost = (qualOrder ? /*TODO*/ ranges_[i].eq.flags.quallo : 0) | stratum; 898 if(cost < lowestCost) lowestCost = cost; 899 } else if(i < rangesSz_) { 900 eliminatedStretch++; 901 } 902 } 903 if(lowestCost > 0 && lowestCost != 0xffff) { 904 // This branch's cost will change when curtailed; the 905 // caller should re-insert it into the priority queue so 906 // that the new cost takes effect. 907 cost_ += lowestCost; 908 } else if(lowestCost == 0xffff) { 909 // This branch is totally exhausted; there are no more 910 // valid outgoing paths from any positions within it. 911 // Remove it from the PathManager and mark it as exhausted. 912 // The caller should delete it. 913 exhausted_ = true; 914 if(ranges_ != NULL) { 915 assert_gt(rangesSz_, 0); 916 if(rpool.free(ranges_, rangesSz_)) { 917 ranges_ = NULL; 918 rangesSz_ = 0; 919 } 920 } 921 } else { 922 // Just mark it as curtailed and keep the same cost 923 } 924 if(ranges_ != NULL) { 925 // Try to trim off no-longer-relevant elements of the 926 // ranges_ array 927 assert(!exhausted_); 928 assert_gt(rangesSz_, 0); 929 uint32_t trim = (rangesSz_ - len_ - 1) + eliminatedStretch; 930 assert_leq(trim, rangesSz_); 931 if(rpool.free(ranges_ + rangesSz_ - trim, trim)) { 932 rangesSz_ -= trim; 933 if(rangesSz_ == 0) { 934 ranges_ = NULL; 935 } 936 } 937 } 938 curtailed_ = true; 939 } 940 941 /** 942 * Prep this branch for the next extension by calculating the 943 * SideLocus information and prefetching cache lines from the 944 * appropriate loci. 945 */ prep(const EbwtParams & ep,const uint8_t * ebwt)946 void prep(const EbwtParams& ep, const uint8_t* ebwt) { 947 if(bot_ > top_+1) { 948 SideLocus::initFromTopBot(top_, bot_, ep, ebwt, ltop_, lbot_); 949 } else if(bot_ > top_) { 950 ltop_.initFromRow(top_, ep, ebwt); 951 lbot_.invalidate(); 952 } 953 prepped_ = true; 954 } 955 956 /** 957 * Get the furthest-out RangeState. 958 */ rangeState()959 RangeState* rangeState() { 960 assert(!exhausted_); 961 assert(ranges_ != NULL); 962 assert_lt(len_, rangesSz_); 963 return &ranges_[len_]; 964 } 965 966 /** 967 * Set the elims to match the ranges in ranges_[len_], already 968 * calculated by the caller. Only does mismatches for now. 969 */ installRanges(int c,int nextc,uint32_t qAllow,const uint8_t * qs)970 int installRanges(int c, int nextc, uint32_t qAllow, const uint8_t* qs) { 971 assert(!exhausted_); 972 assert(ranges_ != NULL); 973 RangeState& r = ranges_[len_]; 974 int ret = 0; 975 r.eliminated_ = true; // start with everything eliminated 976 r.eq.eliminate(); // set all elim flags to 1 977 assert_lt(qs[0], 127); 978 assert_lt(qs[1], 127); 979 assert_lt(qs[2], 127); 980 assert_lt(qs[3], 127); 981 assert_eq(qs[0], qs[1]); 982 assert_eq(qs[0], qs[2]); 983 assert_eq(qs[0], qs[3]); 984 r.eq.flags.quallo = qs[0]; 985 if(qs[0] > qAllow) return 0; 986 // Set one/both of these to true to do the accounting for 987 // insertions and deletions as well as mismatches 988 bool doInserts = false; 989 bool doDeletes = false; 990 // We can proceed on an A 991 if(c != 0 && r.bots[0] > r.tops[0] && qs[0] <= qAllow) { 992 r.eliminated_ = false; 993 r.eq.flags.mmA = 0; // A substitution is an option 994 if(doInserts) r.eq.flags.insA = 0; 995 if(doDeletes && nextc == 0) r.eq.flags.del = 0; 996 ret++; 997 } 998 // We can proceed on a C 999 if(c != 1 && r.bots[1] > r.tops[1] && qs[1] <= qAllow) { 1000 r.eliminated_ = false; 1001 r.eq.flags.mmC = 0; // C substitution is an option 1002 if(doInserts) r.eq.flags.insC = 0; 1003 if(doDeletes && nextc == 1) r.eq.flags.del = 0; 1004 ret++; 1005 } 1006 // We can proceed on a G 1007 if(c != 2 && r.bots[2] > r.tops[2] && qs[2] <= qAllow) { 1008 r.eliminated_ = false; 1009 r.eq.flags.mmG = 0; // G substitution is an option 1010 if(doInserts) r.eq.flags.insG = 0; 1011 if(doDeletes && nextc == 2) r.eq.flags.del = 0; 1012 ret++; 1013 } 1014 // We can proceed on a T 1015 if(c != 3 && r.bots[3] > r.tops[3] && qs[3] <= qAllow) { 1016 r.eliminated_ = false; 1017 r.eq.flags.mmT = 0; // T substitution is an option 1018 if(doInserts) r.eq.flags.insT = 0; 1019 if(doDeletes && nextc == 3) r.eq.flags.del = 0; 1020 ret++; 1021 } 1022 return ret; 1023 } 1024 1025 /** 1026 * Extend this branch by one position. 1027 */ extend()1028 void extend() { 1029 assert(!exhausted_); 1030 assert(!curtailed_); 1031 assert(ranges_ != NULL); 1032 assert(repOk()); 1033 prepped_ = false; 1034 len_++; 1035 } 1036 1037 /** 1038 * Do an internal consistency check 1039 */ 1040 bool repOk(uint32_t qlen = 0) const{ 1041 assert_leq(depth0_, depth1_); 1042 assert_leq(depth1_, depth2_); 1043 assert_leq(depth2_, depth3_); 1044 assert_gt(depth3_, 0); 1045 if(qlen > 0) { 1046 assert_leq(edits_.size(), qlen); // might have to relax this with inserts 1047 assert_leq(rdepth_, qlen); 1048 } 1049 for(int i = 0; i < len_; i++) { 1050 if(!eliminated(i)) { 1051 assert_lt(i, (int)(len_)); 1052 assert(ranges_[i].repOk()); 1053 } 1054 } 1055 const size_t numEdits = edits_.size(); 1056 for(size_t i = 0; i < numEdits; i++) { 1057 for(size_t j = i+1; j < numEdits; j++) { 1058 // No two edits should be at the same position (might 1059 // have to relax this with inserts) 1060 assert_neq(edits_.get(i).pos, edits_.get(j).pos); 1061 } 1062 } 1063 assert_lt((cost_ >> 14), 4); 1064 return true; 1065 } 1066 1067 uint32_t id_; // branch id; needed to make the ordering of 1068 // branches that are tied in the priority queue 1069 // totally unambiguous. Otherwise, things start 1070 // getting non-deterministic. 1071 uint16_t depth0_; // no edits at depths < depth0 1072 uint16_t depth1_; // at most 1 edit at depths < depth1 1073 uint16_t depth2_; // at most 2 edits at depths < depth2 1074 uint16_t depth3_; // at most 3 edits at depths < depth3 1075 uint16_t rdepth_; // offset in read space from root of search space 1076 uint16_t len_; // length of the branch 1077 uint16_t cost_; // top 2 bits = stratum, bottom 14 = qual ham 1078 // it's up to Branch to keep this updated with the 1079 // cumulative cost of the best path leaving the 1080 // branch; if the branch hasn't been fully 1081 // extended yet, then that path will always be the 1082 // one that extends it by one more 1083 uint16_t ham_; // quality-weighted hamming distance so far 1084 RangeState *ranges_; // Allocated from the RangeStatePool 1085 uint16_t rangesSz_; 1086 TIndexOffU top_; // top offset leading to the root of this subtree 1087 TIndexOffU bot_; // bot offset leading to the root of this subtree 1088 SideLocus ltop_; 1089 SideLocus lbot_; 1090 EditList edits_; // edits leading to the root of the branch 1091 1092 uint16_t delayedCost_; 1093 1094 bool curtailed_; // can't be extended anymore without using edits 1095 bool exhausted_; // all outgoing edges exhausted, including all edits 1096 bool prepped_; // whether SideLocus's are inited 1097 bool delayedIncrease_; 1098 }; 1099 1100 /** 1101 * Order two Branches based on cost. 1102 */ 1103 class CostCompare { 1104 public: 1105 /** 1106 * true -> b before a 1107 * false -> a before b 1108 */ operator()1109 bool operator()(const Branch* a, const Branch* b) const { 1110 bool aUnextendable = a->curtailed_ || a->exhausted_; 1111 bool bUnextendable = b->curtailed_ || b->exhausted_; 1112 // Branch with the best cost 1113 if(a->cost_ == b->cost_) { 1114 // If one or the other is curtailed, take the one that's 1115 // still getting extended 1116 if(bUnextendable && !aUnextendable) { 1117 // a still being extended, return false 1118 return false; 1119 } 1120 if(aUnextendable && !bUnextendable) { 1121 // b still being extended, return true 1122 return true; 1123 } 1124 // Either both are curtailed or both are still being 1125 // extended, pick based on which one is deeper 1126 if(a->tipDepth() != b->tipDepth()) { 1127 // Expression is true if b is deeper 1128 return a->tipDepth() < b->tipDepth(); 1129 } 1130 // Keep things deterministic by providing an unambiguous 1131 // order using the id_ field 1132 assert_neq(b->id_, a->id_); 1133 return b->id_ < a->id_; 1134 } else { 1135 return b->cost_ < a->cost_; 1136 } 1137 } 1138 equal(const Branch * a,const Branch * b)1139 static bool equal(const Branch* a, const Branch* b) { 1140 return a->cost_ == b->cost_ && a->curtailed_ == b->curtailed_ && a->tipDepth() == b->tipDepth(); 1141 } 1142 }; 1143 1144 /** 1145 * A priority queue for Branch objects; makes it easy to process 1146 * branches in a best-first manner by prioritizing branches with lower 1147 * cumulative costs over branches with higher cumulative costs. 1148 */ 1149 class BranchQueue { 1150 1151 typedef std::pair<int, int> TIntPair; 1152 typedef std::priority_queue<Branch*, std::vector<Branch*>, CostCompare> TBranchQueue; 1153 1154 public: 1155 BranchQueue(bool verbose,bool quiet)1156 BranchQueue(bool verbose, bool quiet) : 1157 sz_(0), branchQ_(), patid_(0), verbose_(verbose), quiet_(quiet) 1158 { } 1159 1160 /** 1161 * Return the front (highest-priority) element of the queue. 1162 */ front()1163 Branch *front() { 1164 Branch *b = branchQ_.top(); 1165 if(verbose_) { 1166 stringstream ss; 1167 ss << patid_ << ": Fronting " << b->id_ << ", " << b << ", " << b->cost_ << ", " << b->exhausted_ << ", " << b->curtailed_ << ", " << sz_ << "->" << (sz_-1); 1168 glog.msg(ss.str()); 1169 } 1170 return b; 1171 } 1172 1173 /** 1174 * Remove and return the front (highest-priority) element of the 1175 * queue. 1176 */ pop()1177 Branch *pop() { 1178 Branch *b = branchQ_.top(); // get it 1179 branchQ_.pop(); // remove it 1180 if(verbose_) { 1181 stringstream ss; 1182 ss << patid_ << ": Popping " << b->id_ << ", " << b << ", " << b->cost_ << ", " << b->exhausted_ << ", " << b->curtailed_ << ", " << sz_ << "->" << (sz_-1); 1183 glog.msg(ss.str()); 1184 } 1185 sz_--; 1186 return b; 1187 } 1188 1189 /** 1190 * Insert a new Branch into the sorted priority queue. 1191 */ push(Branch * b)1192 void push(Branch *b) { 1193 #ifndef NDEBUG 1194 bool bIsBetter = empty() || !CostCompare()(b, branchQ_.top()); 1195 #endif 1196 if(verbose_) { 1197 stringstream ss; 1198 ss << patid_ << ": Pushing " << b->id_ << ", " << b << ", " << b->cost_ << ", " << b->exhausted_ << ", " << b->curtailed_ << ", " << sz_ << "->" << (sz_+1); 1199 glog.msg(ss.str()); 1200 } 1201 branchQ_.push(b); 1202 #ifndef NDEBUG 1203 assert(bIsBetter || branchQ_.top() != b || CostCompare::equal(branchQ_.top(), b)); 1204 assert(!bIsBetter || branchQ_.top() == b || CostCompare::equal(branchQ_.top(), b)); 1205 #endif 1206 sz_++; 1207 } 1208 1209 /** 1210 * Empty the priority queue and reset the count. 1211 */ reset(uint32_t patid)1212 void reset(uint32_t patid) { 1213 patid_ = patid; 1214 branchQ_ = TBranchQueue(); 1215 sz_ = 0; 1216 } 1217 1218 /** 1219 * Return true iff the priority queue of branches is empty. 1220 */ empty()1221 bool empty() const { 1222 bool ret = branchQ_.empty(); 1223 assert(ret || sz_ > 0); 1224 assert(!ret || sz_ == 0); 1225 return ret; 1226 } 1227 1228 /** 1229 * Return the number of Branches in the queue. 1230 */ size()1231 uint32_t size() const { 1232 return sz_; 1233 } 1234 1235 #ifndef NDEBUG 1236 /** 1237 * Consistency check. 1238 */ repOk(std::set<Branch * > & bset)1239 bool repOk(std::set<Branch*>& bset) { 1240 TIntPair pair = bestStratumAndHam(bset); 1241 Branch *b = branchQ_.top(); 1242 assert_eq(pair.first, (b->cost_ >> 14)); 1243 assert_eq(pair.second, (b->cost_ & ~0xc000)); 1244 std::set<Branch*>::iterator it; 1245 for(it = bset.begin(); it != bset.end(); it++) { 1246 assert_gt((*it)->depth3_, 0); 1247 } 1248 return true; 1249 } 1250 #endif 1251 1252 protected: 1253 1254 #ifndef NDEBUG 1255 /** 1256 * Return the stratum and quality-weight (sum of qualities of all 1257 * edited positions) of the lowest-cost branch. 1258 */ bestStratumAndHam(std::set<Branch * > & bset)1259 TIntPair bestStratumAndHam(std::set<Branch*>& bset) const { 1260 TIntPair ret = make_pair(0xffff, 0xffff); 1261 std::set<Branch*>::iterator it; 1262 for(it = bset.begin(); it != bset.end(); it++) { 1263 Branch *b = *it; 1264 int stratum = b->cost_ >> 14; 1265 assert_lt(stratum, 4); 1266 int qual = b->cost_ & ~0xc000; 1267 if(stratum < ret.first || 1268 (stratum == ret.first && qual < ret.second)) 1269 { 1270 ret.first = stratum; 1271 ret.second = qual; 1272 } 1273 } 1274 return ret; 1275 } 1276 #endif 1277 1278 uint32_t sz_; 1279 TBranchQueue branchQ_; // priority queue of branches 1280 uint32_t patid_; 1281 bool verbose_; 1282 bool quiet_; 1283 }; 1284 1285 /** 1286 * A class that both contains Branches and determines how those 1287 * branches are extended to form longer paths. The overall goal is to 1288 * find the best full alignment(s) as quickly as possible so that a 1289 * successful search can finish quickly. A second goal is to ensure 1290 * that the most "promising" paths are tried first so that, if there is 1291 * a limit on the amount of effort spent searching before we give up, 1292 * we can be as sensitive as possible given that limit. 1293 * 1294 * The quality (or cost) of a given alignment path will ultimately be 1295 * configurable. The default cost model is: 1296 * 1297 * 1. Mismatches incur one "edit" penalty and a "quality" penalty with 1298 * magnitude equal to the Phred quality score of the read position 1299 * involved in the edit (note that insertions into the read are a 1300 * little trickier). 1301 * 2. Edit penalties are all more costly than any quality penalty; i.e. 1302 * the policy sorts alignments first by edit penalty, then by 1303 * quality penalty. 1304 * 3. For the Maq-like alignment policy, edit penalties saturate (don't 1305 * get any greater) after leaving the seed region of the alignment. 1306 */ 1307 class PathManager { 1308 1309 public: 1310 PathManager(ChunkPool * cpool_,int * btCnt,bool verbose,bool quiet)1311 PathManager(ChunkPool* cpool_, int *btCnt, bool verbose, bool quiet) : 1312 branchQ_(verbose, quiet), 1313 cpool(cpool_), 1314 bpool(cpool, "branch"), 1315 rpool(cpool, "range state"), 1316 epool(cpool, "edit"), 1317 minCost(0), btCnt_(btCnt), 1318 verbose_(verbose), 1319 quiet_(quiet) 1320 { } 1321 ~PathManager()1322 ~PathManager() { } 1323 1324 /** 1325 * Return the "front" (highest-priority) branch in the collection. 1326 */ front()1327 Branch* front() { 1328 assert(!empty()); 1329 assert_gt(branchQ_.front()->depth3_, 0); 1330 return branchQ_.front(); 1331 } 1332 1333 /** 1334 * Pop the highest-priority (lowest cost) element from the 1335 * priority queue. 1336 */ pop()1337 Branch* pop() { 1338 Branch* b = branchQ_.pop(); 1339 assert_gt(b->depth3_, 0); 1340 #ifndef NDEBUG 1341 // Also remove it from the set 1342 assert(branchSet_.find(b) != branchSet_.end()); 1343 ASSERT_ONLY(size_t setSz = branchSet_.size()); 1344 branchSet_.erase(branchSet_.find(b)); 1345 assert_eq(setSz-1, branchSet_.size()); 1346 if(!branchQ_.empty()) { 1347 // Top shouldn't be b any more 1348 Branch *newtop = branchQ_.front(); 1349 assert(b != newtop); 1350 } 1351 #endif 1352 // Update this PathManager's cost 1353 minCost = branchQ_.front()->cost_; 1354 assert(repOk()); 1355 return b; 1356 } 1357 1358 /** 1359 * Push a new element onto the priority queue. 1360 */ push(Branch * b)1361 void push(Branch *b) { 1362 assert(!b->exhausted_); 1363 assert_gt(b->depth3_, 0); 1364 branchQ_.push(b); 1365 #ifndef NDEBUG 1366 // Also insert it into the set 1367 assert(branchSet_.find(b) == branchSet_.end()); 1368 branchSet_.insert(b); 1369 #endif 1370 // Update this PathManager's cost 1371 minCost = branchQ_.front()->cost_; 1372 } 1373 1374 /** 1375 * Return the number of active branches in the best-first 1376 * BranchQueue. 1377 */ size()1378 uint32_t size() { 1379 return branchQ_.size(); 1380 } 1381 1382 /** 1383 * Reset the PathManager, clearing out the priority queue and 1384 * resetting the RangeStatePool. 1385 */ reset(uint32_t patid)1386 void reset(uint32_t patid) { 1387 branchQ_.reset(patid); // reset the priority queue 1388 assert(branchQ_.empty()); 1389 bpool.reset(); // reset the Branch pool 1390 epool.reset(); // reset the Edit pool 1391 rpool.reset(); // reset the RangeState pool 1392 assert(bpool.empty()); 1393 assert(epool.empty()); 1394 assert(rpool.empty()); 1395 ASSERT_ONLY(branchSet_.clear()); 1396 assert_eq(0, branchSet_.size()); 1397 assert_eq(0, branchQ_.size()); 1398 minCost = 0; 1399 } 1400 1401 #ifndef NDEBUG 1402 /** 1403 * Return true iff Branch b is in the priority queue; 1404 */ contains(Branch * b)1405 bool contains(Branch *b) const { 1406 bool ret = branchSet_.find(b) != branchSet_.end(); 1407 assert(!ret || !b->exhausted_); 1408 return ret; 1409 } 1410 1411 /** 1412 * Do a consistenty-check on the collection of branches contained 1413 * in this PathManager. 1414 */ repOk()1415 bool repOk() { 1416 if(empty()) return true; 1417 assert(branchQ_.repOk(branchSet_)); 1418 return true; 1419 } 1420 #endif 1421 1422 /** 1423 * Return true iff the priority queue of branches is empty. 1424 */ empty()1425 bool empty() const { 1426 bool ret = branchQ_.empty(); 1427 assert_eq(ret, branchSet_.empty()); 1428 return ret; 1429 } 1430 1431 /** 1432 * Curtail the given branch, and possibly remove it from or 1433 * re-insert it into the priority queue. 1434 */ curtail(Branch * br,uint32_t qlen,int seedLen,bool qualOrder)1435 void curtail(Branch *br, uint32_t qlen, int seedLen, bool qualOrder) { 1436 assert(!br->exhausted_); 1437 assert(!br->curtailed_); 1438 uint16_t origCost = br->cost_; 1439 br->curtail(rpool, seedLen, qualOrder); 1440 assert(br->curtailed_); 1441 assert_geq(br->cost_, origCost); 1442 if(br->exhausted_) { 1443 assert(br == front()); 1444 ASSERT_ONLY(Branch *popped =) pop(); 1445 assert(popped == br); 1446 br->free(qlen, rpool, epool, bpool); 1447 } else if(br->cost_ != origCost) { 1448 // Re-insert the newly-curtailed branch 1449 assert(br == front()); 1450 Branch *popped = pop(); 1451 assert(popped == br); 1452 push(popped); 1453 } 1454 } 1455 1456 /** 1457 * If the frontmost branch is a curtailed branch, split off an 1458 * extendable branch and add it to the queue. 1459 */ splitAndPrep(RandomSource & rand,uint32_t qlen,uint32_t qualLim,int seedLen,bool qualOrder,const EbwtParams & ep,const uint8_t * ebwt,bool ebwtFw)1460 bool splitAndPrep(RandomSource& rand, uint32_t qlen, 1461 uint32_t qualLim, int seedLen, 1462 bool qualOrder, 1463 const EbwtParams& ep, const uint8_t* ebwt, 1464 bool ebwtFw) 1465 { 1466 if(empty()) return true; 1467 // This counts as a backtrack 1468 if(btCnt_ != NULL && (*btCnt_ == 0)) { 1469 // Abruptly end search 1470 return false; 1471 } 1472 Branch *f = front(); 1473 assert(!f->exhausted_); 1474 while(f->delayedIncrease_) { 1475 assert(!f->exhausted_); 1476 if(f->delayedIncrease_) { 1477 assert_neq(0, f->delayedCost_); 1478 ASSERT_ONLY(Branch *popped =) pop(); 1479 assert(popped == f); 1480 f->cost_ = f->delayedCost_; 1481 f->delayedIncrease_ = false; 1482 f->delayedCost_ = 0; 1483 push(f); // re-insert it 1484 assert(!empty()); 1485 } 1486 f = front(); 1487 assert(!f->exhausted_); 1488 } 1489 if(f->curtailed_) { 1490 ASSERT_ONLY(uint16_t origCost = f->cost_); 1491 // This counts as a backtrack 1492 if(btCnt_ != NULL) { 1493 if(--(*btCnt_) == 0) { 1494 // Abruptly end search 1495 return false; 1496 } 1497 } 1498 Branch* newbr = splitBranch( 1499 f, rand, qlen, qualLim, seedLen, qualOrder, ep, ebwt, 1500 ebwtFw); 1501 if(newbr == NULL) { 1502 return false; 1503 } 1504 // If f is exhausted, get rid of it immediately 1505 if(f->exhausted_) { 1506 assert(!f->delayedIncrease_); 1507 ASSERT_ONLY(Branch *popped =) pop(); 1508 assert(popped == f); 1509 f->free(qlen, rpool, epool, bpool); 1510 } 1511 assert_eq(origCost, f->cost_); 1512 assert(newbr != NULL); 1513 push(newbr); 1514 assert(newbr == front()); 1515 } 1516 prep(ep, ebwt); 1517 return true; 1518 } 1519 1520 /** 1521 * Return true iff the front element of the queue is prepped. 1522 */ prepped()1523 bool prepped() { 1524 return front()->prepped_; 1525 } 1526 1527 protected: 1528 1529 /** 1530 * Split off an extendable branch from a curtailed branch. 1531 */ splitBranch(Branch * src,RandomSource & rand,uint32_t qlen,uint32_t qualLim,int seedLen,bool qualOrder,const EbwtParams & ep,const uint8_t * ebwt,bool ebwtFw)1532 Branch* splitBranch(Branch* src, RandomSource& rand, uint32_t qlen, 1533 uint32_t qualLim, int seedLen, bool qualOrder, 1534 const EbwtParams& ep, const uint8_t* ebwt, 1535 bool ebwtFw) 1536 { 1537 Branch* dst = src->splitBranch( 1538 rpool, epool, bpool, size(), rand, 1539 qlen, qualLim, seedLen, qualOrder, ep, ebwt, ebwtFw, 1540 verbose_, quiet_); 1541 assert(dst->repOk()); 1542 return dst; 1543 } 1544 1545 /** 1546 * Prep the next branch to be extended in advanceBranch(). 1547 */ prep(const EbwtParams & ep,const uint8_t * ebwt)1548 void prep(const EbwtParams& ep, const uint8_t* ebwt) { 1549 if(!branchQ_.empty()) { 1550 branchQ_.front()->prep(ep, ebwt); 1551 } 1552 } 1553 1554 BranchQueue branchQ_; // priority queue for selecting lowest-cost Branch 1555 // set of branches in priority queue, for sanity checks 1556 ASSERT_ONLY(std::set<Branch*> branchSet_); 1557 1558 public: 1559 1560 ChunkPool *cpool; // pool for generic chunks of memory 1561 AllocOnlyPool<Branch> bpool; // pool for allocating Branches 1562 AllocOnlyPool<RangeState> rpool; // pool for allocating RangeStates 1563 AllocOnlyPool<Edit> epool; // pool for allocating Edits 1564 /// The minimum possible cost for any alignments obtained by 1565 /// advancing further 1566 uint16_t minCost; 1567 1568 protected: 1569 /// Pointer to the aligner's per-read backtrack counter. We 1570 /// increment it in splitBranch. 1571 int *btCnt_; 1572 bool verbose_; 1573 bool quiet_; 1574 }; 1575 1576 /** 1577 * Encapsulates an algorithm that navigates the Bowtie index to produce 1578 * candidate ranges of alignments in the Burrows-Wheeler matrix. A 1579 * higher authority is responsible for reporting hits out of those 1580 * ranges, and stopping when the consumer is satisfied. 1581 */ 1582 class RangeSource { 1583 public: RangeSource()1584 RangeSource() : 1585 done(false), foundRange(false), curEbwt_(NULL) { } 1586 ~RangeSource()1587 virtual ~RangeSource() { } 1588 1589 /// Set query to find ranges for 1590 virtual void setQuery(Read& r, Range *partial) = 0; 1591 /// Set up the range search. 1592 virtual void initBranch(PathManager& pm) = 0; 1593 /// Advance the range search by one memory op 1594 virtual void advanceBranch(int until, uint16_t minCost, PathManager& pm) = 0; 1595 1596 /// Return the last valid range found 1597 virtual Range& range() = 0; 1598 /// Return ptr to index this RangeSource is currently getting ranges from curEbwt()1599 const Ebwt *curEbwt() const { return curEbwt_; } 1600 1601 /// All searching w/r/t the current query is finished 1602 bool done; 1603 /// Set to true iff the last call to advance yielded a range 1604 bool foundRange; 1605 protected: 1606 /// ptr to index this RangeSource is currently getting ranges from 1607 const Ebwt *curEbwt_; 1608 }; 1609 1610 /** 1611 * Abstract parent of RangeSourceDrivers. 1612 */ 1613 template<typename TRangeSource> 1614 class RangeSourceDriver { 1615 public: 1616 RangeSourceDriver(bool _done, uint32_t minCostAdjustment = 0) : foundRange(false)1617 foundRange(false), done(_done), minCostAdjustment_(minCostAdjustment) 1618 { 1619 minCost = minCostAdjustment_; 1620 } 1621 ~RangeSourceDriver()1622 virtual ~RangeSourceDriver() { } 1623 1624 /** 1625 * Prepare this aligner for the next read. 1626 */ setQuery(PatternSourcePerThread * patsrc,Range * r)1627 virtual void setQuery(PatternSourcePerThread* patsrc, Range *r) { 1628 // Clear our buffer of previously-dished-out top offsets 1629 ASSERT_ONLY(allTops_.clear()); 1630 setQueryImpl(patsrc, r); 1631 } 1632 1633 /** 1634 * Prepare this aligner for the next read. 1635 */ 1636 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) = 0; 1637 1638 /** 1639 * Advance the aligner by one memory op. Return true iff we're 1640 * done with this read. 1641 */ advance(int until)1642 virtual void advance(int until) { 1643 advanceImpl(until); 1644 #ifndef NDEBUG 1645 if(this->foundRange) { 1646 // Assert that we have not yet dished out a range with this 1647 // top offset 1648 assert_gt(range().bot, range().top); 1649 assert(range().ebwt != NULL); 1650 TIndexOff top = (TIndexOff)range().top; 1651 top++; // ensure it's not 0 1652 if(!range().ebwt->fw()) top = -top; 1653 assert(allTops_.find(top) == allTops_.end()); 1654 allTops_.insert(top); 1655 } 1656 #endif 1657 } 1658 /** 1659 * Advance the aligner by one memory op. Return true iff we're 1660 * done with this read. 1661 */ 1662 virtual void advanceImpl(int until) = 0; 1663 /** 1664 * Return the range found. 1665 */ 1666 virtual Range& range() = 0; 1667 1668 /** 1669 * Return whether we're generating ranges for the first or the 1670 * second mate. 1671 */ 1672 virtual bool mate1() const = 0; 1673 1674 /** 1675 * Return true iff current pattern is forward-oriented. 1676 */ 1677 virtual bool fw() const = 0; 1678 removeMate(int m)1679 virtual void removeMate(int m) { } 1680 1681 /// Set to true iff we just found a range. 1682 bool foundRange; 1683 1684 /** 1685 * Set to true if all searching w/r/t the current query is 1686 * finished or if there is no current query. 1687 */ 1688 bool done; 1689 1690 /** 1691 * The minimum "combined" stratum/qual cost that could possibly 1692 * result from subsequent calls to advance() for this driver. 1693 */ 1694 uint16_t minCost; 1695 1696 /** 1697 * Adjustment to the minCost given by the caller that constructed 1698 * the object. This is useful if we know the lowest-cost branch is 1699 * likely to cost less than the any of the alignments that could 1700 * possibly result from advancing (e.g. when we're going to force a 1701 * mismatch somewhere down the line). 1702 */ 1703 uint16_t minCostAdjustment_; 1704 1705 protected: 1706 1707 #ifndef NDEBUG 1708 std::set<TIndexOff> allTops_; 1709 #endif 1710 }; 1711 1712 /** 1713 * A concrete driver wrapper for a single RangeSource. 1714 */ 1715 template<typename TRangeSource> 1716 class SingleRangeSourceDriver : public RangeSourceDriver<TRangeSource> { 1717 1718 public: SingleRangeSourceDriver(EbwtSearchParams & params,TRangeSource * rs,bool fw,HitSink & sink,HitSinkPerThread * sinkPt,EList<BTRefString> & os,bool verbose,bool quiet,bool mate1,uint32_t minCostAdjustment,ChunkPool * pool,int * btCnt)1719 SingleRangeSourceDriver( 1720 EbwtSearchParams& params, 1721 TRangeSource* rs, 1722 bool fw, 1723 HitSink& sink, 1724 HitSinkPerThread* sinkPt, 1725 EList<BTRefString >& os, 1726 bool verbose, 1727 bool quiet, 1728 bool mate1, 1729 uint32_t minCostAdjustment, 1730 ChunkPool* pool, 1731 int *btCnt) : 1732 RangeSourceDriver<TRangeSource>(true, minCostAdjustment), 1733 len_(0), mate1_(mate1), 1734 sinkPt_(sinkPt), 1735 params_(params), 1736 fw_(fw), rs_(rs), 1737 ebwtFw_(rs_->curEbwt()->fw()), 1738 pm_(pool, btCnt, verbose, quiet) 1739 { 1740 assert(rs_ != NULL); 1741 } 1742 ~SingleRangeSourceDriver()1743 virtual ~SingleRangeSourceDriver() { 1744 delete rs_; rs_ = NULL; 1745 } 1746 1747 /** 1748 * Prepare this aligner for the next read. 1749 */ setQueryImpl(PatternSourcePerThread * patsrc,Range * r)1750 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) { 1751 this->done = false; 1752 pm_.reset((uint32_t)patsrc->rdid()); 1753 Read* buf = mate1_ ? &patsrc->bufa() : &patsrc->bufb(); 1754 len_ = buf->length(); 1755 rs_->setQuery(*buf, r); 1756 initRangeSource((fw_ == ebwtFw_) ? buf->qual : buf->qualRev); 1757 assert_gt(len_, 0); 1758 if(this->done) return; 1759 ASSERT_ONLY(allTops_.clear()); 1760 if(!rs_->done) { 1761 rs_->initBranch(pm_); // set up initial branch 1762 } 1763 uint16_t icost = (r != NULL) ? r->cost : 0; 1764 this->minCost = max<uint16_t>(icost, this->minCostAdjustment_); 1765 this->done = rs_->done; 1766 this->foundRange = rs_->foundRange; 1767 if(!pm_.empty()) { 1768 assert(!pm_.front()->curtailed_); 1769 assert(!pm_.front()->exhausted_); 1770 } 1771 } 1772 1773 /** 1774 * Advance the aligner by one memory op. Return true iff we're 1775 * done with this read. 1776 */ advanceImpl(int until)1777 virtual void advanceImpl(int until) { 1778 if(this->done || pm_.empty()) { 1779 this->done = true; 1780 return; 1781 } 1782 assert(!pm_.empty()); 1783 assert(!pm_.front()->curtailed_); 1784 assert(!pm_.front()->exhausted_); 1785 params_.setFw(fw_); 1786 // Advance the RangeSource for the forward-oriented read 1787 ASSERT_ONLY(uint16_t oldMinCost = this->minCost); 1788 ASSERT_ONLY(uint16_t oldPmMinCost = pm_.minCost); 1789 rs_->advanceBranch(until, this->minCost, pm_); 1790 this->done = pm_.empty(); 1791 if(pm_.minCost != 0) { 1792 this->minCost = max(pm_.minCost, this->minCostAdjustment_); 1793 } else { 1794 // pm_.minCost is 0 because we reset it due to exceptional 1795 // circumstances 1796 } 1797 #ifndef NDEBUG 1798 { 1799 bool error = false; 1800 if(pm_.minCost != 0 && pm_.minCost < oldPmMinCost) { 1801 cerr << "PathManager's cost went down" << endl; 1802 error = true; 1803 } 1804 if(this->minCost < oldMinCost) { 1805 cerr << "this->minCost cost went down" << endl; 1806 error = true; 1807 } 1808 if(error) { 1809 cerr << "pm.minCost went from " << oldPmMinCost 1810 << " to " << pm_.minCost << endl; 1811 cerr << "this->minCost went from " << oldMinCost 1812 << " to " << this->minCost << endl; 1813 cerr << "this->minCostAdjustment_ == " 1814 << this->minCostAdjustment_ << endl; 1815 } 1816 assert(!error); 1817 } 1818 #endif 1819 this->foundRange = rs_->foundRange; 1820 #ifndef NDEBUG 1821 if(this->foundRange) { 1822 if(until >= ADV_COST_CHANGES) assert_eq(oldMinCost, range().cost); 1823 // Assert that we have not yet dished out a range with this 1824 // top offset 1825 assert_gt(range().bot, range().top); 1826 assert(range().ebwt != NULL); 1827 TIndexOff top = (TIndexOff)range().top; 1828 top++; // ensure it's not 0 1829 if(!range().ebwt->fw()) top = -top; 1830 assert(allTops_.find(top) == allTops_.end()); 1831 allTops_.insert(top); 1832 } 1833 if(!pm_.empty()) { 1834 assert(!pm_.front()->curtailed_); 1835 assert(!pm_.front()->exhausted_); 1836 } 1837 #endif 1838 } 1839 1840 /** 1841 * Return the range found. 1842 */ range()1843 virtual Range& range() { 1844 rs_->range().fw = fw_; 1845 rs_->range().mate1 = mate1_; 1846 return rs_->range(); 1847 } 1848 1849 /** 1850 * Return whether we're generating ranges for the first or the 1851 * second mate. 1852 */ mate1()1853 bool mate1() const { 1854 return mate1_; 1855 } 1856 1857 /** 1858 * Return true iff current pattern is forward-oriented. 1859 */ fw()1860 bool fw() const { 1861 return fw_; 1862 } 1863 1864 virtual void initRangeSource(const BTString& qual) = 0; 1865 1866 protected: 1867 1868 // Progress state 1869 uint32_t len_; 1870 bool mate1_; 1871 1872 // Temporary HitSink; to be deleted 1873 HitSinkPerThread* sinkPt_; 1874 1875 // State for alignment 1876 EbwtSearchParams& params_; 1877 bool fw_; 1878 TRangeSource* rs_; // delete this in destructor 1879 bool ebwtFw_; 1880 PathManager pm_; 1881 ASSERT_ONLY(std::set<TIndexOff> allTops_); 1882 }; 1883 1884 /** 1885 * Encapsulates an algorithm that navigates the Bowtie index to produce 1886 * candidate ranges of alignments in the Burrows-Wheeler matrix. A 1887 * higher authority is responsible for reporting hits out of those 1888 * ranges, and stopping when the consumer is satisfied. 1889 */ 1890 template<typename TRangeSource> 1891 class StubRangeSourceDriver : public RangeSourceDriver<TRangeSource> { 1892 1893 typedef EList<RangeSourceDriver<TRangeSource>*> TRangeSrcDrPtrVec; 1894 1895 public: 1896 StubRangeSourceDriver()1897 StubRangeSourceDriver() : 1898 RangeSourceDriver<TRangeSource>(false) 1899 { 1900 this->done = true; 1901 this->foundRange = false; 1902 } 1903 ~StubRangeSourceDriver()1904 virtual ~StubRangeSourceDriver() { } 1905 1906 /// Set query to find ranges for setQueryImpl(PatternSourcePerThread * patsrc,Range * r)1907 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) { } 1908 1909 /// Advance the range search by one memory op advanceImpl(int until)1910 virtual void advanceImpl(int until) { } 1911 1912 /// Return the last valid range found range()1913 virtual Range& range() { throw 1; } 1914 1915 /** 1916 * Return whether we're generating ranges for the first or the 1917 * second mate. 1918 */ mate1()1919 virtual bool mate1() const { 1920 return true; 1921 } 1922 1923 /** 1924 * Return true iff current pattern is forward-oriented. 1925 */ fw()1926 virtual bool fw() const { 1927 return true; 1928 } 1929 1930 }; 1931 1932 /** 1933 * Encapsulates an algorithm that navigates the Bowtie index to produce 1934 * candidate ranges of alignments in the Burrows-Wheeler matrix. A 1935 * higher authority is responsible for reporting hits out of those 1936 * ranges, and stopping when the consumer is satisfied. 1937 */ 1938 template<typename TRangeSource> 1939 class ListRangeSourceDriver : public RangeSourceDriver<TRangeSource> { 1940 1941 typedef EList<RangeSourceDriver<TRangeSource>*> TRangeSrcDrPtrVec; 1942 1943 public: 1944 ListRangeSourceDriver(const TRangeSrcDrPtrVec & rss)1945 ListRangeSourceDriver(const TRangeSrcDrPtrVec& rss) : 1946 RangeSourceDriver<TRangeSource>(false), 1947 cur_(0), ham_(0), rss_(rss) /* copy */, 1948 patsrc_(NULL), seedRange_(NULL) 1949 { 1950 assert_gt(rss_.size(), 0); 1951 assert(!this->done); 1952 } 1953 ~ListRangeSourceDriver()1954 virtual ~ListRangeSourceDriver() { 1955 for(size_t i = 0; i < rss_.size(); i++) { 1956 delete rss_[i]; 1957 } 1958 } 1959 1960 /// Set query to find ranges for setQueryImpl(PatternSourcePerThread * patsrc,Range * r)1961 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) { 1962 cur_ = 0; // go back to first RangeSource in list 1963 this->done = false; 1964 rss_[cur_]->setQuery(patsrc, r); 1965 patsrc_ = patsrc; // so that we can call setQuery on the other elements later 1966 seedRange_ = r; 1967 this->done = (cur_ == rss_.size()-1) && rss_[cur_]->done; 1968 this->minCost = max(rss_[cur_]->minCost, this->minCostAdjustment_); 1969 this->foundRange = rss_[cur_]->foundRange; 1970 } 1971 1972 /// Advance the range search by one memory op advanceImpl(int until)1973 virtual void advanceImpl(int until) { 1974 assert(!this->done); 1975 assert_lt(cur_, rss_.size()); 1976 if(rss_[cur_]->done) { 1977 // Move on to next RangeSourceDriver 1978 if(cur_ < rss_.size()-1) { 1979 rss_[++cur_]->setQuery(patsrc_, seedRange_); 1980 this->minCost = max(rss_[cur_]->minCost, this->minCostAdjustment_); 1981 this->foundRange = rss_[cur_]->foundRange; 1982 } else { 1983 // No RangeSources in list; done 1984 cur_ = OFF_MASK; 1985 this->done = true; 1986 } 1987 } else { 1988 // Advance current RangeSource 1989 rss_[cur_]->advance(until); 1990 this->done = (cur_ == rss_.size()-1 && rss_[cur_]->done); 1991 this->foundRange = rss_[cur_]->foundRange; 1992 this->minCost = max(rss_[cur_]->minCost, this->minCostAdjustment_); 1993 } 1994 } 1995 1996 /// Return the last valid range found range()1997 virtual Range& range() { return rss_[cur_]->range(); } 1998 1999 /** 2000 * Return whether we're generating ranges for the first or the 2001 * second mate. 2002 */ mate1()2003 virtual bool mate1() const { 2004 return rss_[0]->mate1(); 2005 } 2006 2007 /** 2008 * Return true iff current pattern is forward-oriented. 2009 */ fw()2010 virtual bool fw() const { 2011 return rss_[0]->fw(); 2012 } 2013 2014 protected: 2015 2016 TIndexOffU cur_; 2017 uint32_t ham_; 2018 TRangeSrcDrPtrVec rss_; 2019 PatternSourcePerThread* patsrc_; 2020 Range *seedRange_; 2021 }; 2022 2023 /** 2024 * A RangeSourceDriver that wraps a set of other RangeSourceDrivers and 2025 * chooses which one to advance at any given moment by picking one with 2026 * minimal "cumulative cost" so far. 2027 * 2028 * Note that costs have to be "adjusted" to account for the fact that 2029 * the alignment policy for the underlying RangeSource might force 2030 * mismatches. 2031 */ 2032 template<typename TRangeSource> 2033 class CostAwareRangeSourceDriver : public RangeSourceDriver<TRangeSource> { 2034 2035 typedef RangeSourceDriver<TRangeSource>* TRangeSrcDrPtr; 2036 typedef EList<TRangeSrcDrPtr> TRangeSrcDrPtrVec; 2037 2038 public: 2039 CostAwareRangeSourceDriver(bool strandFix,const TRangeSrcDrPtrVec * rss,bool verbose,bool quiet,bool mixesReads)2040 CostAwareRangeSourceDriver( 2041 bool strandFix, 2042 const TRangeSrcDrPtrVec* rss, 2043 bool verbose, 2044 bool quiet, 2045 bool mixesReads) : 2046 RangeSourceDriver<TRangeSource>(false), 2047 rss_(), active_(), strandFix_(strandFix), 2048 lastRange_(NULL), delayedRange_(NULL), patsrc_(NULL), 2049 verbose_(verbose), quiet_(quiet), mixesReads_(mixesReads) 2050 { 2051 if(rss != NULL) { 2052 rss_ = (*rss); 2053 } 2054 paired_ = false; 2055 this->foundRange = false; 2056 this->done = false; 2057 if(rss_.empty()) { 2058 return; 2059 } 2060 calcPaired(); 2061 active_ = rss_; 2062 this->minCost = 0; 2063 } 2064 2065 /// Destroy all underlying RangeSourceDrivers ~CostAwareRangeSourceDriver()2066 virtual ~CostAwareRangeSourceDriver() { 2067 const size_t rssSz = rss_.size(); 2068 for(size_t i = 0; i < rssSz; i++) { 2069 delete rss_[i]; 2070 } 2071 rss_.clear(); 2072 active_.clear(); 2073 } 2074 2075 /// Set query to find ranges for setQueryImpl(PatternSourcePerThread * patsrc,Range * r)2076 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) { 2077 this->done = false; 2078 this->foundRange = false; 2079 lastRange_ = NULL; 2080 delayedRange_ = NULL; 2081 ASSERT_ONLY(allTopsRc_.clear()); 2082 patsrc_ = patsrc; 2083 rand_.init(patsrc->bufa().seed); 2084 const size_t rssSz = rss_.size(); 2085 if(rssSz == 0) return; 2086 for(size_t i = 0; i < rssSz; i++) { 2087 // Assuming that all 2088 rss_[i]->setQuery(patsrc, r); 2089 } 2090 active_ = rss_; 2091 this->minCost = 0; 2092 sortActives(); 2093 } 2094 2095 /** 2096 * Add a new RangeSource to the list and re-sort it. 2097 */ addSource(TRangeSrcDrPtr p,Range * r)2098 void addSource(TRangeSrcDrPtr p, Range *r) { 2099 assert(!this->foundRange); 2100 this->lastRange_ = NULL; 2101 this->delayedRange_ = NULL; 2102 this->done = false; 2103 if(patsrc_ != NULL) { 2104 p->setQuery(patsrc_, r); 2105 } 2106 rss_.push_back(p); 2107 active_.push_back(p); 2108 calcPaired(); 2109 this->minCost = 0; 2110 sortActives(); 2111 } 2112 2113 /** 2114 * Free and remove all contained RangeSources. 2115 */ clearSources()2116 void clearSources() { 2117 const size_t rssSz = rss_.size(); 2118 for(size_t i = 0; i < rssSz; i++) { 2119 delete rss_[i]; 2120 } 2121 rss_.clear(); 2122 active_.clear(); 2123 paired_ = false; 2124 } 2125 2126 /** 2127 * Return the number of RangeSources contained within. 2128 */ size()2129 size_t size() const { 2130 return rss_.size(); 2131 } 2132 2133 /** 2134 * Return true iff no RangeSources are contained within. 2135 */ empty()2136 bool empty() const { 2137 return rss_.empty(); 2138 } 2139 2140 /** 2141 * Advance the aligner by one memory op. Return true iff we're 2142 * done with this read. 2143 */ advance(int until)2144 virtual void advance(int until) { 2145 ASSERT_ONLY(uint16_t precost = this->minCost); 2146 assert(!this->done); 2147 assert(!this->foundRange); 2148 until = max<int>(until, ADV_COST_CHANGES); 2149 advanceImpl(until); 2150 assert(!this->foundRange || lastRange_ != NULL); 2151 if(this->foundRange) { 2152 assert_eq(range().cost, precost); 2153 } 2154 } 2155 2156 /// Advance the range search advanceImpl(int until)2157 virtual void advanceImpl(int until) { 2158 lastRange_ = NULL; 2159 ASSERT_ONLY(uint16_t iminCost = this->minCost); 2160 const size_t actSz = active_.size(); 2161 assert(sortedActives()); 2162 if(delayedRange_ != NULL) { 2163 assert_eq(iminCost, delayedRange_->cost); 2164 lastRange_ = delayedRange_; 2165 delayedRange_ = NULL; 2166 this->foundRange = true; 2167 assert_eq(range().cost, iminCost); 2168 if(!active_.empty()) { 2169 assert_geq(active_[0]->minCost, this->minCost); 2170 this->minCost = max(active_[0]->minCost, this->minCost); 2171 } else { 2172 this->done = true; 2173 } 2174 return; // found a range 2175 } 2176 assert(delayedRange_ == NULL); 2177 if(mateEliminated() || actSz == 0) { 2178 // No more alternatoves; clear the active set and signal 2179 // we're done 2180 active_.clear(); 2181 this->done = true; 2182 return; 2183 } 2184 // Advance lowest-cost RangeSourceDriver 2185 TRangeSrcDrPtr p = active_[0]; 2186 uint16_t precost = p->minCost; 2187 assert(!p->done || p->foundRange); 2188 if(!p->foundRange) { 2189 p->advance(until); 2190 } 2191 bool needsSort = false; 2192 if(p->foundRange) { 2193 Range *r = &p->range(); 2194 assert_eq(r->cost, iminCost); 2195 needsSort = foundFirstRange(r); // may set delayedRange_; re-sorts active_ 2196 assert_eq(lastRange_->cost, iminCost); 2197 if(delayedRange_ != NULL) assert_eq(delayedRange_->cost, iminCost); 2198 p->foundRange = false; 2199 } 2200 if(p->done || (precost != p->minCost) || needsSort) { 2201 sortActives(); 2202 if(mateEliminated() || active_.empty()) { 2203 active_.clear(); 2204 this->done = (delayedRange_ == NULL); 2205 } 2206 } 2207 assert(sortedActives()); 2208 assert(lastRange_ == NULL || lastRange_->cost == iminCost); 2209 assert(delayedRange_ == NULL || delayedRange_->cost == iminCost); 2210 } 2211 2212 /// Return the last valid range found range()2213 virtual Range& range() { 2214 assert(lastRange_ != NULL); 2215 return *lastRange_; 2216 } 2217 2218 /** 2219 * Return whether we're generating ranges for the first or the 2220 * second mate. 2221 */ mate1()2222 virtual bool mate1() const { 2223 return rss_[0]->mate1(); 2224 } 2225 2226 /** 2227 * Return true iff current pattern is forward-oriented. 2228 */ fw()2229 virtual bool fw() const { 2230 return rss_[0]->fw(); 2231 } 2232 removeMate(int m)2233 virtual void removeMate(int m) { 2234 bool qmate1 = (m == 1); 2235 assert(paired_); 2236 for(size_t i = 0; i < active_.size(); i++) { 2237 if(active_[i]->mate1() == qmate1) { 2238 active_[i]->done = true; 2239 } 2240 } 2241 sortActives(); 2242 assert(mateEliminated()); 2243 } 2244 2245 protected: 2246 2247 /** 2248 * Set paired_ to true iff there are mate1 and mate2 range sources 2249 * in the rss_ vector. 2250 */ calcPaired()2251 void calcPaired() { 2252 const size_t rssSz = rss_.size(); 2253 bool saw1 = false; 2254 bool saw2 = false; 2255 for(size_t i = 0; i < rssSz; i++) { 2256 if(rss_[i]->mate1()) saw1 = true; 2257 else saw2 = true; 2258 } 2259 assert(saw1 || saw2); 2260 paired_ = saw1 && saw2; 2261 } 2262 2263 /** 2264 * Return true iff one mate or the other has been eliminated. 2265 */ mateEliminated()2266 bool mateEliminated() { 2267 if(!paired_) return false; 2268 bool mate1sLeft = false; 2269 bool mate2sLeft = false; 2270 // If this RangeSourceDriver is done, shift everyone else 2271 // up and delete it 2272 const size_t rssSz = active_.size(); 2273 for(size_t i = 0; i < rssSz; i++) { 2274 if(!active_[i]->done) { 2275 if(active_[i]->mate1()) mate1sLeft = true; 2276 else mate2sLeft = true; 2277 } 2278 } 2279 return !mate1sLeft || !mate2sLeft; 2280 } 2281 2282 #ifndef NDEBUG 2283 /** 2284 * Check that the given range has not yet been reported. 2285 */ checkRange(Range * r)2286 bool checkRange(Range* r) { 2287 // Assert that we have not yet dished out a range with this 2288 // top offset 2289 assert_gt(r->bot, r->top); 2290 assert(r->ebwt != NULL); 2291 TIndexOff top = (TIndexOff)r->top; 2292 top++; // ensure it's not 0 2293 if(!r->ebwt->fw()) top = -top; 2294 if(r->fw) { 2295 assert(this->allTops_.find(top) == this->allTops_.end()); 2296 if(!mixesReads_) this->allTops_.insert(top); 2297 } else { 2298 assert(this->allTopsRc_.find(top) == this->allTopsRc_.end()); 2299 if(!mixesReads_) this->allTopsRc_.insert(top); 2300 } 2301 return true; 2302 } 2303 #endif 2304 2305 /** 2306 * We found a range; check whether we should attempt to find a 2307 * range of equal quality from the opposite strand so that we can 2308 * resolve the strand bias. Return true iff we modified the cost 2309 * of one or more items after the first item. 2310 */ foundFirstRange(Range * r)2311 bool foundFirstRange(Range* r) { 2312 assert(r != NULL); 2313 assert(checkRange(r)); 2314 this->foundRange = true; 2315 lastRange_ = r; 2316 if(strandFix_) { 2317 // We found a range but there may be an equally good range 2318 // on the other strand; let's try to get it. 2319 const size_t sz = active_.size(); 2320 for(size_t i = 1; i < sz; i++) { 2321 // Same mate, different orientation? 2322 if(rss_[i]->mate1() == r->mate1 && rss_[i]->fw() != r->fw) { 2323 // Yes; see if it has the same cost 2324 TRangeSrcDrPtr p = active_[i]; 2325 uint16_t minCost = max(this->minCost, p->minCost); 2326 if(minCost > r->cost) break; 2327 // Yes, it has the same cost 2328 assert_eq(minCost, r->cost); // can't be better 2329 // Advance it until it's done, we've found a range, 2330 // or its cost increases 2331 if(this->verbose_) cout << " Looking for opposite range to avoid strand bias:" << endl; 2332 while(!p->done && !p->foundRange) { 2333 p->advance(ADV_COST_CHANGES); 2334 assert_geq(p->minCost, minCost); 2335 if(p->minCost > minCost) break; 2336 } 2337 if(p->foundRange) { 2338 // Found one! Now we have to choose which one 2339 // to give out first; we choose randomly using 2340 // the size of the ranges as weights. 2341 delayedRange_ = &p->range(); 2342 assert(checkRange(delayedRange_)); 2343 size_t tot = (delayedRange_->bot - delayedRange_->top) + 2344 (lastRange_->bot - lastRange_->top); 2345 uint32_t rq = rand_.nextU32() % tot; 2346 // We picked this range, not the first one 2347 if(rq < (delayedRange_->bot - delayedRange_->top)) { 2348 Range *tmp = lastRange_; 2349 lastRange_ = delayedRange_; 2350 delayedRange_ = tmp; 2351 } 2352 p->foundRange = false; 2353 } 2354 // Return true iff we need to force a re-sort 2355 return true; 2356 } 2357 } 2358 // OK, now we have a choice of two equally good ranges from 2359 // each strand. 2360 } 2361 return false; 2362 } 2363 2364 /** 2365 * Sort all of the RangeSourceDriver ptrs in the rss_ array so that 2366 * the one with the lowest cumulative cost is at the top. Break 2367 * ties randomly. Just do selection sort for now; we don't expect 2368 * the list to be long. 2369 */ sortActives()2370 void sortActives() { 2371 TRangeSrcDrPtrVec& vec = active_; 2372 size_t sz = vec.size(); 2373 // Selection sort / removal outer loop 2374 for(size_t i = 0; i < sz;) { 2375 // Remove elements that we're done with 2376 if(vec[i]->done && !vec[i]->foundRange) { 2377 vec.erase(i); 2378 if(sz == 0) break; 2379 else sz--; 2380 continue; 2381 } 2382 uint16_t minCost = vec[i]->minCost; 2383 size_t minOff = i; 2384 // Selection sort inner loop 2385 for(size_t j = i+1; j < sz; j++) { 2386 if(vec[j]->done && !vec[j]->foundRange) { 2387 // We'll get rid of this guy later 2388 continue; 2389 } 2390 if(vec[j]->minCost < minCost) { 2391 minCost = vec[j]->minCost; 2392 minOff = j; 2393 } else if(vec[j]->minCost == minCost) { 2394 // Possibly randomly pick the other 2395 if(rand_.nextU32() & 0x1000) { 2396 minOff = j; 2397 } 2398 } 2399 } 2400 // Do the swap, if necessary 2401 if(i != minOff) { 2402 assert_leq(minCost, vec[i]->minCost); 2403 TRangeSrcDrPtr tmp = vec[i]; 2404 vec[i] = vec[minOff]; 2405 vec[minOff] = tmp; 2406 } 2407 i++; 2408 } 2409 if(delayedRange_ == NULL && sz > 0) { 2410 assert_geq(this->minCost, this->minCostAdjustment_); 2411 assert_geq(vec[0]->minCost, this->minCost); 2412 this->minCost = vec[0]->minCost; 2413 } 2414 assert(sortedActives()); 2415 } 2416 2417 #ifndef NDEBUG 2418 /** 2419 * Check that the rss_ array is sorted by minCost; assert if it's 2420 * not. 2421 */ sortedActives()2422 bool sortedActives() const { 2423 // Selection sort outer loop 2424 const TRangeSrcDrPtrVec& vec = active_; 2425 const size_t sz = vec.size(); 2426 for(size_t i = 0; i < sz; i++) { 2427 assert(!vec[i]->done || vec[i]->foundRange); 2428 for(size_t j = i+1; j < sz; j++) { 2429 assert(!vec[j]->done || vec[j]->foundRange); 2430 assert_leq(vec[i]->minCost, vec[j]->minCost); 2431 } 2432 } 2433 if(delayedRange_ == NULL && sz > 0) { 2434 // Only assert this if there's no delayed range; if there's 2435 // a delayed range, the minCost is its cost, not the 0th 2436 // element's cost 2437 assert_leq(vec[0]->minCost, this->minCost); 2438 } 2439 return true; 2440 } 2441 #endif 2442 2443 /// List of all the drivers 2444 TRangeSrcDrPtrVec rss_; 2445 /// List of all the as-yet-uneliminated drivers 2446 TRangeSrcDrPtrVec active_; 2447 /// Whether the list of drivers contains drivers for both mates 1 and 2 2448 bool paired_; 2449 /// If true, this driver will make an attempt to dish out ranges in 2450 /// a way that approaches the right distribution based on the 2451 /// number of hits on both strands. 2452 bool strandFix_; 2453 uint32_t randSeed_; 2454 /// The random seed from the Aligner, which we use to randomly break ties 2455 RandomSource rand_; 2456 Range *lastRange_; 2457 Range *delayedRange_; 2458 PatternSourcePerThread* patsrc_; 2459 bool verbose_; 2460 bool quiet_; 2461 bool mixesReads_; 2462 ASSERT_ONLY(std::set<TIndexOff> allTopsRc_); 2463 }; 2464 2465 #endif /* RANGE_SOURCE_H_ */ 2466