1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: David Weese <david.weese@fu-berlin.de> 33 // ========================================================================== 34 35 #ifndef SEQAN_HEADER_FIND_SWIFT_H 36 #define SEQAN_HEADER_FIND_SWIFT_H 37 38 namespace seqan 39 { 40 41 ////////////////////////////////////////////////////////////////////////////// 42 // SWIFT to search a text for 43 // - semi-global alignments of one/multiple short sequences 44 // - local epsilon matches of one/multiple short sequences 45 ////////////////////////////////////////////////////////////////////////////// 46 47 // TODO(bkehr): Is this documentatin right? Should the Specializations be Tags? 48 // weese: of minimal length? 49 /*! 50 * @class SwiftPattern 51 * @extends Pattern 52 * @headerfile <seqan/index.h> 53 * @brief Pattern for SWIFT search, must be used together with @link SwiftFinder @endlink. 54 * 55 * @signature template <typename TIndex, typename TSpec> 56 * class Pattern<TIndex, Swift<TSpec> >; 57 * 58 * @tparam TSpec Specifies the type of Swift filter. 59 * @tparam TIndex A q-gram index of needle(s). Types: @link IndexQGram @endlink 60 * 61 * The @link Pattern @endlink must be a @link IndexQGram @endlink over multiple patterns. The allowed error rate must 62 * be given when @link Finder#find @endlink or @link SwiftFinder#windowFindBegin @endlink is called. 63 * 64 * Provides a fast filter alogrithm that guarantees to find all regions overlapping with potential ε-matches. 65 * An ε-match is a matching region of minimal length and an error rate of at most ε. 66 * 67 * @see SwiftFinder 68 * 69 * @var SwiftParameters SwiftPattern::parameters; 70 * @brief The SWIFT parameters to use for configuration. 71 */ 72 73 /*! 74 * @class SwiftFinder 75 * @extends Finder 76 * @headerfile <seqan/index.h> 77 * @brief Finder for SWIFT search, must be used together with @link SwiftPattern @endlink. 78 * 79 * @signature template <typename THaystack, typename TSpec> 80 * class Finder<THaystack, Swift<TSpec> >; 81 * 82 * @tparam TSpec Specifies the type of Swift filter. 83 * @tparam THaystack A haystack type. Types: @link Index @endlink, @link String @endlink, @link StringSet @endlink. 84 * 85 * See @link SwiftPattern @endlink for an overview of the SWIFT algorithm. 86 * 87 * @see SwiftPattern 88 */ 89 90 /*! 91 * @class SwiftLocalPattern 92 * @extends SwiftPattern 93 * @headerfile <seqan/index.h> 94 * 95 * @brief The specialization for the general SWIFT filter that finds ε-matches between haystack and needle. 96 * 97 * @signature template <typename THaystack> 98 * class Finder<THaystack, Swift<SwiftLocal> >; 99 * 100 * @tparam THaystack A haystack type. Types: @link Index @endlink, @link String @endlink, @link StringSet @endlink. 101 */ 102 103 /*! 104 * @class SwiftLocalFinder 105 * @extends SwiftFinder 106 * @headerfile <seqan/index.h> 107 * @brief The specialization for the general SWIFT filter that finds ε-matches between haystack and needle. 108 * 109 * @signature template <typename THaystack> 110 * class Finder<THaystack, Swift<SwiftLocal> >; 111 * 112 * @tparam THaystack A haystack type. Types: @link Index @endlink, @link String @endlink, @link StringSet @endlink. 113 */ 114 115 /*! 116 * @class SwiftSemiGlobalPattern 117 * @extends SwiftPattern 118 * @headerfile <seqan/index.h> 119 * @brief The specialization for the semi-global swift filter that finds regions of the haystack where a needle matches 120 * with an error rate less than \epsilon. 121 * 122 * @signature template <typename TIndex> 123 * class Pattern<TIndex, Swift<SwiftSemiGlobal> >; 124 * 125 * @tparam TIndex A @link IndexQGram q-gram index @endlink of needle(s). 126 */ 127 128 /*! 129 * @class SwiftSemiGlobalFinder 130 * @extends SwiftFinder 131 * @headerfile <seqan/index.h> 132 * @brief The specialization for the semi-global swift filter that finds regions of the haystack where a needle matches 133 * with an error rate less than \epsilon. 134 * 135 * @signature template <typename THaystack> 136 * class Finder<THaystack, Swift<SwiftSemiGlobal> >; 137 * 138 * @tparam THaystack A haystack type. Types: @link Index @endlink, @link String @endlink, @link StringSet @endlink 139 */ 140 141 template < typename TObject, typename TSpec > class Index; 142 template < typename TObject > struct SAValue; 143 144 struct SwiftLocal_; 145 typedef Tag<SwiftLocal_> SwiftLocal; 146 147 template <typename TSpec = void> 148 struct SwiftSemiGlobal_; 149 typedef Tag<SwiftSemiGlobal_<void> > SwiftSemiGlobal; 150 151 struct Hamming_; 152 typedef Tag<SwiftSemiGlobal_<Hamming_> > SwiftSemiGlobalHamming; 153 154 155 template <typename TSpec = SwiftSemiGlobal> 156 struct Swift; 157 158 template <> 159 struct Swift<SwiftSemiGlobal> { 160 enum { SEMIGLOBAL = 1 }; // 0..match eps-match of min.length n0; 1..match the whole read 161 enum { DIAGONAL = 1 }; // 0..use rectangular buckets (QUASAR); 1..use diagonal buckets (SWIFT) 162 enum { QGRAM_ERRORS = 0 }; // q-gram must match exactly 163 enum { HAMMING_ONLY = 0 }; // 0..no indels; 1..allow indels 164 enum { PARAMS_BY_LENGTH = 1 }; // params are determined only by seq.length 165 }; 166 167 template <> 168 struct Swift<SwiftSemiGlobalHamming> { 169 enum { SEMIGLOBAL = 1 }; // 0..match eps-match of min.length n0; 1..match the whole read 170 enum { DIAGONAL = 1 }; // 0..use rectangular buckets (QUASAR); 1..use diagonal buckets (SWIFT) 171 enum { QGRAM_ERRORS = 0 }; // q-gram must match exactly 172 enum { HAMMING_ONLY = 1 }; // 0..no indels; 1..allow indels 173 enum { PARAMS_BY_LENGTH = 1 }; // params are determined only by seq.length 174 }; 175 176 template <> 177 struct Swift<SwiftLocal> { 178 enum { SEMIGLOBAL = 0 }; // 0..match eps-match of min.length n0; 1..match the whole read 179 enum { DIAGONAL = 1 }; // 0..use rectangular buckets (QUASAR); 1..use diagonal buckets (SWIFT) 180 enum { QGRAM_ERRORS = 0 }; // allow 0 errors per q-gram 181 enum { HAMMING_ONLY = 0 }; // 0..no indels; 1..allow indels 182 enum { PARAMS_BY_LENGTH = 0 }; // params are determined by seq.no. 183 }; 184 185 /*! 186 * @class SwiftParameters 187 * @headerfile <seqan/index.h> 188 * @brief Parameters for the SWIFT algorithm. 189 * 190 * @signature struct SwiftParameters; 191 * 192 * @see SwiftPattern 193 * 194 * 195 * @fn SwiftParameters::SwiftParameters 196 * @brief Default constructor. 197 * 198 * @signature SwiftParameters::SwiftParameters(); 199 * 200 * Initializes all member variables, default values are given in the variable documentation. 201 * 202 * 203 * @var int SwiftParameters::minThreshold; 204 * @brief Minimal threshold to use initially, defaults to <tt>1</tt>. 205 * 206 * @var int SwiftParameters::minLog2Delta; 207 * @brief Minimal delta to use, defaults to <tt>16</tt>. 208 * 209 * @var int SwiftParameters::tabooLength; 210 * @brief Taboo length to use, defaults to <tt>1</tt>. 211 * 212 * @var bool SwiftParameters::printDots; 213 * @brief Whether to print dots for the scanning progress, defaults to <tt>false</tt>. 214 * 215 * @var bool SwiftParameters::debug; 216 * @brief Whether to print debug information, defaults to <tt>false</tt>. 217 */ 218 219 struct SwiftParameters 220 { 221 int minThreshold; 222 int minLog2Delta; 223 int tabooLength; 224 bool printDots; 225 bool debug; 226 227 SwiftParameters() : 228 minThreshold(1), // set minimal threshold to 1 229 minLog2Delta(4), // set minimal delta to 16 230 tabooLength(1), // minimal genomic distance between q-gram hits 231 printDots(false), // print a . for every 100kbps mapped genome 232 debug(false) 233 {} 234 }; 235 236 ////////////////////////////////////////////////////////////////////////////// 237 238 template <typename TSpec, typename TSize_, typename TShortSize_ = unsigned short> 239 struct SwiftBucket_ 240 { 241 typedef TSize_ TSize; 242 typedef TShortSize_ TShortSize; 243 244 TSize firstIncrement; 245 TSize lastIncrement; 246 TShortSize counter; // q-gram hits 247 TShortSize threshold; // at least threshold q-gram hits induce an approx match 248 bool notListed; // true if bucket is not listed in the patterns' verify list 249 #ifdef SEQAN_DEBUG_SWIFT 250 TSize _lastIncDiag; 251 #endif 252 }; 253 254 template <typename TSpec_, typename TSize_, typename TShortSize_> 255 struct SwiftBucket_<SwiftSemiGlobal_<TSpec_>, TSize_, TShortSize_> 256 { 257 typedef TSize_ TSize; 258 typedef TShortSize_ TShortSize; 259 260 TSize lastIncrement; 261 TShortSize counter; // q-gram hits 262 TShortSize threshold; // at least threshold q-gram hits induce an approx match 263 #ifdef SEQAN_DEBUG_SWIFT 264 int _lastIncDiag; 265 #endif 266 }; 267 268 template <typename TSpec, typename TSize_, typename TShortSize_ = unsigned short> 269 struct SwiftBucketParams_ 270 { 271 typedef TSize_ TSize; 272 typedef TShortSize_ TShortSize; 273 274 TSize firstBucket; // first SwiftBucket_ entry in pattern.buckets 275 TSize reuseMask; // 2^ceil(log2(x)) reuse every x-th bucket) 276 TShortSize threshold; // at least threshold q-gram hits induce an approx match 277 TShortSize distanceCut; // if lastIncrement is this far or farer away, threshold can't be reached 278 TShortSize delta; // buckets begin at multiples of delta 279 TShortSize overlap; // number of diagonals/columns a bucket shares with its neighbor 280 TShortSize tabooLength; // minimal genomic distance between q-gram hits 281 unsigned char logDelta; // log2(delta) 282 }; 283 284 template <typename TSpec_, typename TSize_, typename TShortSize_> 285 struct SwiftBucketParams_< Swift<Tag<SwiftSemiGlobal_<TSpec_> > >, TSize_, TShortSize_> 286 { 287 typedef TSize_ TSize; 288 typedef TShortSize_ TShortSize; 289 290 TSize firstBucket; // first SwiftBucket_ entry in pattern.buckets 291 TSize reuseMask; // 2^ceil(log2(x)) reuse every x-th bucket) 292 TShortSize threshold; // at least threshold q-gram hits induce an approx match 293 TShortSize delta; // buckets begin at multiples of delta 294 TShortSize overlap; // number of diagonals/columns a bucket shares with its neighbor 295 TShortSize tabooLength; // minimal genomic distance between q-gram hits 296 unsigned char logDelta; // log2(delta) 297 }; 298 299 //____________________________________________________________________________ 300 301 302 template <typename THstkPos> 303 struct SwiftHit_ 304 { 305 THstkPos hstkPos; // parallelogram begin in haystack 306 unsigned ndlSeqNo; // needle sequence number 307 THstkPos ndlPos; // begin position of hit in needle 308 unsigned bucketWidth; // (non-diagonal) bucket width (hitLengthNeedle + delta + overlap (for diagonals)) 309 unsigned hitLengthNeedle; // length of the hit in needle 310 }; 311 312 template <typename THstkPos> 313 struct SwiftHitSemiGlobal_ 314 { 315 THstkPos hstkPos; // parallelogram begin in haystack 316 unsigned ndlSeqNo; // needle sequence number 317 unsigned bucketWidth; // (non-diagonal) bucket width (bktHeight + delta + overlap (for diagonals)) 318 }; 319 320 321 template <typename TFinder, typename TPattern = void> 322 struct FindResult 323 { 324 }; 325 template <typename TFinder, typename TPattern = void> 326 struct WindowFindResult 327 { 328 typedef String<typename FindResult<TFinder, TPattern>::Type> Type; 329 }; 330 331 332 333 template <typename THaystack, typename TPattern, typename TSwiftSpec> 334 struct FindResult<Finder<THaystack, Swift<TSwiftSpec> >, TPattern> 335 { 336 typedef SwiftHit_<int64_t> Type; 337 }; 338 339 template <typename THaystack, typename TPattern, typename TSpec> 340 struct FindResult<Finder<THaystack, Swift<Tag<SwiftSemiGlobal_<TSpec> > > >, TPattern> 341 { 342 typedef SwiftHitSemiGlobal_<int64_t> Type; 343 }; 344 345 346 //____________________________________________________________________________ 347 348 349 /*! 350 * @typedef SwiftFinder::THitString 351 * @brief The type to use for the hit string returned in @link SwiftFinder#getWindowFindHits @endlink. 352 * 353 * @signature typedef (...) SwiftFinder::THitString; 354 */ 355 356 template <typename THaystack, typename TSpec> 357 class Finder<THaystack, Swift<TSpec> > 358 { 359 public: 360 typedef typename Iterator<THaystack, Rooted>::Type TIterator; 361 typedef typename Position<THaystack>::Type THstkPos; 362 typedef typename FindResult<Finder>::Type TSwiftHit; 363 typedef typename WindowFindResult<Finder>::Type THitString; 364 typedef typename Iterator<THitString, Standard>::Type THitIterator; 365 typedef typename SAValue<THaystack>::Type TSAValue; 366 typedef Repeat<TSAValue, unsigned> TRepeat; 367 // TODO(holtgrew): Make this a holder so we can share the actual string when copying. 368 typedef String<TRepeat> TRepeatString; 369 typedef typename Iterator<TRepeatString, Standard>::Type TRepeatIterator; 370 371 TIterator data_iterator; 372 TIterator haystackEnd; 373 bool _needReinit; // if true, the Pattern needs to be reinitialized 374 THitString hits; 375 THitIterator curHit, endHit; 376 THstkPos startPos, curPos, endPos; 377 THstkPos windowStart; 378 THstkPos dotPos, dotPos2; 379 TRepeatString data_repeats; 380 TRepeatIterator curRepeat, endRepeat; 381 382 Finder() : 383 data_iterator(), haystackEnd(), _needReinit(true), curHit(), endHit(), 384 startPos(), curPos(), endPos(), windowStart(), dotPos(), dotPos2(), 385 curRepeat(), endRepeat() 386 {} 387 388 Finder(THaystack & haystack) : 389 data_iterator(begin(haystack, Rooted())), haystackEnd(), _needReinit(true), curHit(), endHit(), 390 startPos(), curPos(), endPos(), windowStart(), dotPos(), dotPos2(), curRepeat(), endRepeat() 391 {} 392 393 template <typename TRepeatSize, typename TPeriodSize> 394 Finder(THaystack &haystack, TRepeatSize minRepeatLen, TPeriodSize maxPeriod) : 395 data_iterator(begin(haystack, Rooted())), haystackEnd(), _needReinit(true), curHit(), endHit(), 396 startPos(), curPos(), endPos(), windowStart(), dotPos(), dotPos2(), curRepeat(), endRepeat() 397 { 398 findRepeats(data_repeats, haystack, minRepeatLen, maxPeriod); 399 } 400 401 Finder(TIterator &iter) : 402 data_iterator(iter), haystackEnd(), _needReinit(true), curHit(), endHit(), 403 startPos(), curPos(), endPos(), windowStart(), dotPos(), dotPos2(), curRepeat(), endRepeat() 404 {} 405 406 Finder(TIterator const & iter): 407 data_iterator(iter), haystackEnd(), _needReinit(true), curHit(), endHit(), 408 startPos(), curPos(), endPos(), windowStart(), dotPos(), dotPos2(), curRepeat(), endRepeat() 409 {} 410 411 Finder(Finder const & orig): 412 data_iterator(orig.data_iterator), 413 haystackEnd(orig.haystackEnd), 414 _needReinit(orig._needReinit), 415 hits(orig.hits), 416 startPos(orig.startPos), 417 curPos(orig.curPos), 418 endPos(orig.endPos), 419 windowStart(orig.windowStart), 420 dotPos(orig.dotPos), 421 dotPos2(orig.dotPos2), 422 data_repeats(orig.data_repeats) 423 { 424 curHit = begin(hits, Standard()) + (orig.curHit - begin(orig.hits, Standard())); 425 endHit = end(hits, Standard()); 426 curRepeat = begin(data_repeats, Standard()) + (orig.curRepeat - begin(orig.data_repeats, Standard())); 427 endRepeat = end(data_repeats, Standard()); 428 } 429 430 inline typename Reference<TIterator>::Type 431 operator* () { return value(hostIterator(*this)); } 432 433 inline typename Reference<TIterator const>::Type 434 operator* () const { return value(hostIterator(*this)); } 435 436 operator TIterator () const { return data_iterator; } 437 438 Finder & operator = (Finder const &orig) 439 { 440 data_iterator = orig.data_iterator; 441 haystackEnd = orig.haystackEnd; 442 _needReinit = orig._needReinit; 443 hits = orig.hits; 444 startPos = orig.startPos; 445 windowStart = orig.windowStart; 446 curPos = orig.curPos; 447 endPos = orig.endPos; 448 dotPos = orig.dotPos; 449 dotPos2 = orig.dotPos2; 450 data_repeats = orig.data_repeats; 451 curHit = begin(hits, Standard()) + (orig.curHit - begin(orig.hits, Standard())); 452 endHit = end(hits, Standard()); 453 curRepeat = begin(data_repeats, Standard()) + (orig.curRepeat - begin(orig.data_repeats, Standard())); 454 endRepeat = end(data_repeats, Standard()); 455 return *this; 456 } 457 }; 458 459 460 //____________________________________________________________________________ 461 462 // forward 463 template < typename TInput, typename TSpec > 464 struct Pipe; 465 466 template < typename TTuples, typename TPipeSpec, typename TSpec > 467 class Finder< Pipe<TTuples, TPipeSpec>, Swift<TSpec> > 468 { 469 public: 470 typedef Pipe<TTuples, TPipeSpec> TInput; 471 typedef typename Size<TInput>::Type THstkPos; 472 typedef typename FindResult<Finder>::Type TSwiftHit; 473 typedef typename WindowFindResult<Finder>::Type THitString; 474 typedef typename Iterator<THitString, Standard>::Type THitIterator; 475 476 TInput ∈ 477 bool _needReinit; // if true, the Pattern needs to be reinitialized 478 THitString hits; 479 THitIterator curHit, endHit; 480 THstkPos curPos, dotPos, dotPos2; 481 482 Finder(TInput &_in): 483 in(_in), 484 _needReinit(true) {} 485 486 Finder(Finder const &orig): 487 in(orig.in), 488 hits(orig.hits), 489 _needReinit(orig._needReinit) 490 { 491 curHit = begin(hits, Standard()) + (orig.curHit - begin(orig.hits, Standard())); 492 endHit = end(hits, Standard()); 493 } 494 }; 495 496 497 //____________________________________________________________________________ 498 499 500 template <typename THaystack, typename TSpec> 501 inline bool 502 atEnd(Finder<THaystack, Swift<TSpec> > & me) 503 { 504 return hostIterator(hostIterator(me)) == hostIterator(me.haystackEnd); 505 } 506 507 template <typename THaystack, typename TSpec> 508 inline void 509 goEnd(Finder<THaystack, Swift<TSpec> > & me) 510 { 511 hostIterator(me) = me.haystackEnd; 512 } 513 514 515 //____________________________________________________________________________ 516 517 518 template <typename TIndex, typename TSpec> 519 class Pattern<TIndex, Swift<TSpec> > 520 { 521 public: 522 typedef typename Size<TIndex>::Type TSize; 523 typedef unsigned TShortSize; 524 typedef typename Fibre<TIndex, Tag<FibreSA_> const >::Type TSA; 525 typedef typename Fibre<TIndex, Tag<Fibre_Shape_> const >::Type TShape; 526 typedef typename Iterator<TSA const, Standard>::Type TIterator; 527 528 typedef SwiftBucket_<TSpec, TSize, TShortSize> TBucket; 529 typedef String<TBucket> TBucketString; 530 typedef SwiftBucketParams_<TSpec, TSize, TShortSize> TBucketParams; 531 typedef String<TBucketParams> TBucketParamsString; 532 typedef String<Pair<unsigned> > TVerifyList; 533 534 TShape shape; 535 TBucketString buckets; 536 TBucketParamsString bucketParams; 537 TVerifyList verifyList; // numbers of buckets that need to be verified 538 SwiftParameters params; 539 unsigned curSeqNo; 540 int64_t curBeginPos, curEndPos; 541 TSize finderPosOffset; // these must be of 542 TSize finderPosNextOffset; // type TSize of TBucket 543 int64_t finderLength; 544 int64_t maxPatternLength; 545 546 double _currentErrorRate; 547 int _currentMinLengthForAll; 548 549 Holder<TIndex> data_host; 550 551 Pattern() 552 { 553 clear(*this); 554 } 555 556 Pattern(TIndex &_index): data_host(_index) 557 { 558 clear(*this); 559 } 560 561 Pattern(TIndex const &_index): data_host(_index) 562 { 563 clear(*this); 564 } 565 }; 566 567 //____________________________________________________________________________ 568 569 570 template <typename TSpec_, typename TSize, typename TShortSize> 571 inline void _printSwiftParams(SwiftBucketParams_<TSpec_, TSize, TShortSize > &bucketParams) 572 { 573 std::cout << " firstBucket: " << bucketParams.firstBucket << std::endl; 574 std::cout << " reuseMask: " << bucketParams.reuseMask << std::endl; 575 std::cout << " distanceCut: " << bucketParams.distanceCut << std::endl; 576 std::cout << " delta: " << bucketParams.delta << std::endl; 577 std::cout << " threshold: " << bucketParams.threshold << std::endl; 578 std::cout << " overlap: " << bucketParams.overlap << std::endl; 579 std::cout << " logDelta: " << (int)bucketParams.logDelta << std::endl << std::endl; 580 } 581 582 template <typename TSpec_, typename TSize, typename TShortSize> 583 inline void _printSwiftParams(SwiftBucketParams_<Tag<SwiftSemiGlobal_<TSpec_> >, TSize, TShortSize > &bucketParams) 584 { 585 std::cout << " firstBucket: " << bucketParams.firstBucket << std::endl; 586 std::cout << " reuseMask: " << bucketParams.reuseMask << std::endl; 587 std::cout << " delta: " << bucketParams.delta << std::endl; 588 std::cout << " threshold: " << bucketParams.threshold << std::endl; 589 std::cout << " overlap: " << bucketParams.overlap << std::endl; 590 std::cout << " logDelta: " << (int)bucketParams.logDelta << std::endl << std::endl; 591 } 592 593 template <typename TIndex, typename TSpec> 594 inline void _printSwiftBuckets(Pattern< TIndex, Swift<TSpec> > &p) 595 { 596 typedef typename Pattern<TIndex, Swift<TSpec> >::TBucketParams TParams; 597 598 unsigned j = 0; 599 TParams *bucketParams = &_swiftBucketParams(p, 0); 600 601 for(unsigned i=0; i<length(p.buckets) && i<10; ++i) 602 { 603 if ((i & bucketParams->reuseMask) == 0) 604 { 605 std::cout << std::endl << "ReadBucket #" << j << " " << '"'; 606 std::cout << indexText(host(p))[j] << '"' << std::endl; 607 std::cout << " length: " << sequenceLength(j, host(p)) << std::endl; 608 bucketParams = &_swiftBucketParams(p, j++); 609 _printSwiftParams(*bucketParams); 610 } 611 612 std::cout << " lastInc: " << (int)p.buckets[i].lastIncrement; 613 std::cout << " \tCounter: " << p.buckets[i].counter << std::endl; 614 } 615 } 616 617 template <typename TIndex, typename TSpec, typename TSize> 618 inline typename Pattern<TIndex, Swift<TSpec> >::TBucketParams & 619 _swiftBucketParams(Pattern<TIndex, Swift<TSpec> > & pattern, TSize seqNo) 620 { 621 if (Swift<TSpec>::PARAMS_BY_LENGTH) 622 return pattern.bucketParams[sequenceLength(seqNo, host(pattern))]; 623 else 624 return pattern.bucketParams[seqNo]; 625 } 626 627 template <typename TIndex, typename TSpec, typename TParams, typename TSize> 628 inline unsigned 629 _swiftBucketNo(Pattern<TIndex, Swift<TSpec> > const &, TParams &bucketParams, TSize seqNo) 630 { 631 if (Swift<TSpec>::PARAMS_BY_LENGTH) 632 return (bucketParams.reuseMask + 1) * seqNo; // assumes the same reuseMask for all reads 633 else 634 return bucketParams.firstBucket; 635 } 636 637 template <typename TIndex, typename TSpec, typename TSeqNo> 638 inline int 639 _qgramLemma(Pattern<TIndex, Swift<TSpec> > const & pattern, TSeqNo seqNo, int errors) 640 { 641 // q-gram lemma: How many conserved q-grams we see at least? 642 // each error destroys at most <weight> many (gapped) q-grams 643 return qgramThreshold(indexShape(host(pattern)), sequenceLength(seqNo, host(pattern)), errors, EditDistance(), ThreshQGramLemma()); 644 } 645 646 /*! 647 * @fn SwiftPattern#setMinThreshold 648 * @headerfile <seqan/index.h> 649 * @brief Set minimal threshold (<i>t</i>) for a given needle. 650 * 651 * This function can be used to make the filter allow less errors for a given needle. 652 * 653 * @signature void setMinThreshold(pattern, seqNo, thresh); 654 * 655 * @param[in,out] pattern The SwiftPattern to modify. 656 * @param[in] seqNo The number to set the threshold for. 657 * @param[in] tresh The threshold to use. 658 */ 659 660 661 template <typename TIndex, typename TSpec, typename TSeqNo, typename TThreshold> 662 inline void 663 setMinThreshold(Pattern<TIndex, Swift<TSpec> > & pattern, TSeqNo seqNo, TThreshold thresh) 664 { 665 typedef Pattern<TIndex, Swift<TSpec> > TPattern; 666 typedef typename TPattern::TBucketParams TBucketParams; 667 typedef typename TPattern::TBucketString TBucketString; 668 typedef typename Iterator<TBucketString, Standard>::Type TBucketIterator; 669 670 TBucketParams &bucketParams = _swiftBucketParams(pattern, seqNo); 671 TBucketIterator it = begin(pattern.buckets, Standard()) + _swiftBucketNo(pattern, bucketParams, seqNo); 672 TBucketIterator itEnd = it + (bucketParams.reuseMask + 1); 673 674 for (; it != itEnd; ++it) 675 { 676 677 // increase the threshold if it is less than the minimal threshold 678 if ((*it).threshold < thresh) 679 { 680 // increase the counter once it has reached the threshold 681 // otherwise we could output the same hit multiple times 682 if ((*it).counter >= (*it).threshold) 683 (*it).counter = thresh; 684 685 (*it).threshold = thresh; 686 } 687 } 688 } 689 690 template <typename TSpec, typename TSize, typename TShortSize, typename TPos> 691 inline void 692 _resetBucket(SwiftBucket_<TSpec, TSize, TShortSize> & bkt, TPos lastIncrement) 693 { 694 bkt.lastIncrement = lastIncrement; 695 bkt.counter = 0; 696 bkt.notListed = true; 697 } 698 699 template <typename TSpec, typename TSize, typename TShortSize, typename TPos, typename TThresh> 700 inline void 701 _resetBucket(SwiftBucket_<TSpec, TSize, TShortSize> & bkt, TPos lastIncrement, TThresh threshold) 702 { 703 bkt.lastIncrement = lastIncrement; 704 bkt.counter = 0; 705 bkt.threshold = threshold; 706 bkt.notListed = true; 707 } 708 709 template <typename TSpec_, typename TSize, typename TShortSize, typename TPos> 710 inline void 711 _resetBucket(SwiftBucket_<SwiftSemiGlobal_<TSpec_>, TSize, TShortSize> & bkt, TPos lastIncrement) 712 { 713 bkt.lastIncrement = lastIncrement; 714 bkt.counter = 0; 715 } 716 717 template <typename TSpec_, typename TSize, typename TShortSize, typename TPos, typename TThresh> 718 inline void 719 _resetBucket(SwiftBucket_<SwiftSemiGlobal_<TSpec_>, TSize, TShortSize> & bkt, TPos lastIncrement, TThresh threshold) 720 { 721 bkt.lastIncrement = lastIncrement; 722 bkt.counter = 0; 723 bkt.threshold = threshold; 724 } 725 726 template <typename TIndex, typename TFloat, typename TSize_, typename TSpec> 727 inline void _patternInit(Pattern<TIndex, Swift<TSpec> > &pattern, TFloat errorRate, TSize_ minLengthForAll) 728 { 729 typedef Pattern<TIndex, Swift<TSpec> > TPattern; 730 typedef typename Size<TIndex>::Type TSize; 731 //typedef typename Fibre<TIndex, QGramSA>::Type TSA; 732 //typedef typename Iterator<TSA, Standard>::Type TSAIter; 733 typedef typename TPattern::TBucket TBucket; 734 typedef typename TBucket::TSize TBucketSize; 735 typedef typename TPattern::TBucketParams TBucketParams; 736 typedef typename TPattern::TBucketString TBucketString; 737 typedef typename Iterator<TBucketString, Standard>::Type TBucketIterator; 738 739 double _newErrorRate = errorRate; 740 TSize seqCount = countSequences(host(pattern)); 741 742 clear(pattern.verifyList); 743 744 if (pattern._currentErrorRate != _newErrorRate || pattern._currentMinLengthForAll != minLengthForAll) 745 { 746 // settings have been changed -> initialize bucket parameters 747 748 pattern._currentErrorRate = _newErrorRate; 749 pattern._currentMinLengthForAll = minLengthForAll; 750 751 indexRequire(host(pattern), QGramSADir()); 752 pattern.shape = indexShape(host(pattern)); 753 754 TSize span = length(pattern.shape); 755 TSize count = 0; 756 TSize bucketsPerCol2Max = 0; 757 TSize maxLength = 0; 758 759 if (Swift<TSpec>::PARAMS_BY_LENGTH) { 760 for(unsigned seqNo = 0; seqNo < seqCount; ++seqNo) { 761 TSize length = sequenceLength(seqNo, host(pattern)); 762 if (maxLength < length) 763 maxLength = length; 764 } 765 resize(pattern.bucketParams, maxLength + 1); 766 } else 767 resize(pattern.bucketParams, seqCount); 768 769 pattern.maxPatternLength = maxLength; 770 pattern.finderPosOffset = 0; 771 pattern.finderPosNextOffset = pattern.finderLength + pattern.maxPatternLength; 772 773 if (Swift<TSpec>::SEMIGLOBAL == 0) 774 { 775 // global matches 776 TSize minLength = minLengthForAll; 777 for(unsigned seqNo = 0; seqNo < seqCount; ++seqNo) 778 { 779 // swift q-gram lemma 780 TBucketParams &bucketParams = _swiftBucketParams(pattern, seqNo); 781 // n..next length that could decrease threshold 782 TSize n = (TSize) ceil((floor(errorRate * minLength) + 1) / errorRate); 783 // minimal threshold is minimum errors of minLength and n 784 int threshold = (TSize) _min( 785 (n + 1) - span * (floor(errorRate * n) + 1), 786 (minLength + 1) - span * (floor(errorRate * minLength) + 1)); 787 788 if (threshold > pattern.params.minThreshold) 789 bucketParams.threshold = threshold; 790 else 791 bucketParams.threshold = pattern.params.minThreshold; 792 793 SEQAN_ASSERT_GT_MSG((1 / errorRate), span, "SWIFT only works if span < 1 / error rate!"); 794 TSize errors = (TSize) floor((2 * bucketParams.threshold + span - 3) / (1 / errorRate - span)); 795 796 797 // a bucket has distanceCut different positions of q-grams 798 // if a q-gram is this far or further away it can't belong to the 799 // same bucket 800 bucketParams.distanceCut = (bucketParams.threshold - 1) + span * errors; 801 802 // from now, errors is the maximal number of indels 803 if (Swift<TSpec>::HAMMING_ONLY != 0) 804 errors = 0; 805 806 TSize bucketsPerCol2; 807 if(Swift<TSpec>::DIAGONAL == 1) 808 { 809 // Use overlapping parallelograms 810 bucketParams.overlap = errors; 811 812 // delta must be a power of 2 and greater than errors 813 bucketParams.logDelta = (TSize) ceil(log((double)errors + 1) / log(2.0)); 814 if (bucketParams.logDelta < pattern.params.minLog2Delta) 815 bucketParams.logDelta = pattern.params.minLog2Delta; 816 bucketParams.delta = 1 << bucketParams.logDelta; 817 bucketParams.tabooLength = pattern.params.tabooLength; 818 819 // maximal number of buckets in one column 820 TSize bucketsPerCol = (sequenceLength(seqNo, host(pattern)) - span + 2 * bucketParams.delta + errors - 1) / bucketParams.delta; 821 bucketsPerCol2 = 1 << (TSize) ceil(log((double)bucketsPerCol) / log(2.0)); // next greater or equal power of 2 822 } 823 else 824 { 825 // TODO: classical swift for rectangular buckets 826 // Use overlapping rectangles 827 //bucketParams.overlap = ; 828 829 // delta must be a power of 2 greater than seq.length + errors (define a minimal delta of 32) 830 //bucketParams.logDelta = ; 831 //if (bucketParams.logDelta < pattern.params.minLog2Delta) 832 // bucketParams.logDelta = pattern.params.minLog2Delta; 833 //bucketParams.delta = 1 << bucketParams.logDelta; 834 bucketsPerCol2 = 1; 835 } 836 837 bucketParams.firstBucket = count; // firstBucket is only used if Swift<TSpec>::PARAMS_BY_LENGTH == 0 838 bucketParams.reuseMask = bucketsPerCol2 - 1; 839 bucketParams.tabooLength = pattern.params.tabooLength; 840 841 if (Swift<TSpec>::PARAMS_BY_LENGTH) { 842 ++count; 843 if (bucketsPerCol2Max < bucketsPerCol2) 844 bucketsPerCol2Max = bucketsPerCol2; 845 } else 846 count += bucketsPerCol2; 847 848 /*if (seqNo<3) 849 _printSwiftParams(bucketParams);*/ 850 } 851 } else 852 for(unsigned seqNo = 0; seqNo < seqCount; ++seqNo) 853 { 854 // get pattern length and max. allowed errors 855 TBucketParams &bucketParams = _swiftBucketParams(pattern, seqNo); 856 TSize length = 0; 857 if (minLengthForAll != static_cast<TSize_>(0)) 858 length = minLengthForAll; 859 else 860 length = sequenceLength(seqNo, host(pattern)); 861 TSize errors = (TSize) floor(errorRate * length); 862 TSize errorsWC = errors / (1 + Swift<TSpec>::QGRAM_ERRORS); 863 864 // q-gram lemma: How many conserved q-grams we see at least? 865 // (define a minimal threshold of 1) 866 int threshold = length - span + 1 - errorsWC * weight(pattern.shape); 867 if (threshold > pattern.params.minThreshold) 868 bucketParams.threshold = threshold; 869 else 870 bucketParams.threshold = pattern.params.minThreshold; 871 872 // from now, errors is the maximal number of indels 873 if (Swift<TSpec>::HAMMING_ONLY != 0) 874 errors = 0; 875 876 // a bucket has distanceCut different positions of q-grams 877 // if a q-gram is this far or farer away it can't belong to the 878 // same bucket 879 // bucketParams.distanceCut = length - (span - 1) + errors; 880 881 TSize bucketsPerCol2; 882 if (Swift<TSpec>::DIAGONAL == 1) 883 { 884 // Use overlapping parallelograms 885 bucketParams.overlap = errors; 886 887 // delta must be a power of 2 greater then errors (define a minimal delta of 8) 888 bucketParams.logDelta = (TSize) ceil(log((double)(errors + 1)) / log(2.0)); 889 if (bucketParams.logDelta < pattern.params.minLog2Delta) 890 bucketParams.logDelta = pattern.params.minLog2Delta; 891 bucketParams.delta = 1 << bucketParams.logDelta; 892 893 // the formula for bucketsPerCol is (worst-case): 894 // (height-(q-1) - 1 - (delta+1-e))/delta + 3 895 // ^-- full paral. in the middle --^ ^-- 2 at the bottom, 1 at the top 896 TSize bucketsPerCol = (sequenceLength(seqNo, host(pattern)) - span + 2 * bucketParams.delta + errors - 1) / bucketParams.delta; 897 bucketsPerCol2 = 1 << (TSize) ceil(log((double)bucketsPerCol) / log(2.0)); 898 } 899 else 900 { 901 // Use overlapping rectangles 902 bucketParams.overlap = length - span + errors; 903 904 // delta must be a power of 2 greater then seq.length + errors (define a minimal delta of 32) 905 bucketParams.logDelta = (TSize) ceil(log((double)(length - span + 1 + errors)) / log(2.0)); 906 if (bucketParams.logDelta < pattern.params.minLog2Delta) 907 bucketParams.logDelta = pattern.params.minLog2Delta; 908 bucketParams.delta = 1 << bucketParams.logDelta; 909 910 bucketsPerCol2 = 2; 911 } 912 913 // SEQAN_ASSERT_LEQ(distanceCut, bucketsPerCol * (TSize) delta); 914 915 bucketParams.firstBucket = count; 916 bucketParams.reuseMask = bucketsPerCol2 - 1; 917 bucketParams.tabooLength = pattern.params.tabooLength; 918 919 if (Swift<TSpec>::PARAMS_BY_LENGTH) { 920 ++count; 921 if (bucketsPerCol2Max < bucketsPerCol2) 922 bucketsPerCol2Max = bucketsPerCol2; 923 } else 924 count += bucketsPerCol2; 925 926 /* if (seqNo<3) 927 _printSwiftParams(bucketParams); 928 */ } 929 930 if (Swift<TSpec>::PARAMS_BY_LENGTH) { 931 count *= bucketsPerCol2Max; 932 for(unsigned i = 0; i < length(pattern.bucketParams); ++i) 933 pattern.bucketParams[i].reuseMask = bucketsPerCol2Max - 1; 934 } 935 resize(pattern.buckets, count); 936 937 TBucketIterator bktEnd, bkt = begin(pattern.buckets, Standard()); 938 for(unsigned seqNo = 0; seqNo < seqCount; ++seqNo) 939 { 940 TBucketParams &bucketParams = _swiftBucketParams(pattern, seqNo); 941 TBucketSize lastIncrement = (TBucketSize)0 - (TBucketSize)bucketParams.tabooLength; 942 for(bktEnd = bkt + bucketParams.reuseMask + 1; bkt != bktEnd; ++bkt) 943 { 944 _resetBucket(*bkt, lastIncrement, bucketParams.threshold); 945 } 946 } 947 } 948 else 949 { 950 // settings are unchanged -> reset buckets 951 952 // finderPosOffset is used to circumvent expensive resetting of all buckets 953 int64_t clearance = pattern.finderLength + pattern.maxPatternLength; 954 pattern.finderPosOffset = pattern.finderPosNextOffset; 955 pattern.finderPosNextOffset += clearance; 956 957 // reset buckets only if an overflow of the finder position would occur 958 // we would not detect an overflow if clearance is larger than the largest TBucketSize value 959 if (pattern.finderPosNextOffset <= pattern.finderPosOffset || (int64_t)(TBucketSize)clearance < (int64_t)clearance) 960 { 961 pattern.finderPosOffset = 0; 962 pattern.finderPosNextOffset = pattern.finderLength + pattern.maxPatternLength; 963 964 TBucketIterator bktEnd, bkt = begin(pattern.buckets, Standard()); 965 for (TSize ndlSeqNo = 0; ndlSeqNo < seqCount; ++ndlSeqNo) 966 { 967 TBucketParams &bucketParams = _swiftBucketParams(pattern, ndlSeqNo); 968 TBucketSize lastIncrement = (TBucketSize)0 - (TBucketSize)bucketParams.tabooLength; 969 for (bktEnd = bkt + (bucketParams.reuseMask + 1); bkt != bktEnd; ++bkt) 970 { 971 _resetBucket(*bkt, lastIncrement); 972 } 973 } 974 } 975 } 976 977 /* 978 std::cerr << "Swift bucket params: " << length(pattern.bucketParams) << std::endl; 979 std::cerr << "Swift buckets: " << length(pattern.buckets) << std::endl; 980 std::cerr << "Buckets per read: " << bucketsPerCol2Max << std::endl; 981 */ 982 } 983 984 985 ///////////////////////////////////////////////////////////// 986 // Creates a new hit and appends it to the finders hit list 987 template < 988 typename THaystack, 989 typename TIndex, 990 typename TSpec, 991 typename TBucket, 992 typename TBucketParams, 993 typename TSize 994 > 995 inline void _createHit( 996 Finder<THaystack, Swift<TSpec> > & finder, 997 Pattern<TIndex, Swift<TSpec> > & pattern, 998 TBucket & bkt, 999 TBucketParams & bucketParams, 1000 int64_t diag, 1001 TSize ndlSeqNo) 1002 { 1003 typedef typename FindResult<Finder<THaystack, Swift<TSpec> >, Pattern<TIndex, Swift<TSpec> > >::Type THit; 1004 int64_t lastInc = (int64_t)bkt.lastIncrement - pattern.finderPosOffset; 1005 int64_t firstInc = (int64_t)bkt.firstIncrement - pattern.finderPosOffset; 1006 1007 if(diag > lastInc) 1008 { 1009 // bucket is reused since last increment 1010 TSize reusePos = (bucketParams.reuseMask + 1) << bucketParams.logDelta; 1011 diag -= (int64_t)ceil((diag-lastInc)/(double)reusePos) * reusePos; 1012 } 1013 1014 // determine width, height, and begin position in needle 1015 TSize width = lastInc - firstInc + length(pattern.shape); 1016 TSize height = width + bucketParams.delta + bucketParams.overlap; 1017 int64_t ndlBegin = lastInc + length(pattern.shape) - diag - height; 1018 1019 // create the hit 1020 THit hit = { // * 1021 firstInc, // bucket begin in haystack * * 1022 ndlSeqNo, // needle seq. number * * 1023 ndlBegin, // bucket begin in needle * * 1024 width, // bucket width (non-diagonal) * * 1025 height // bucket height * * 1026 }; // * 1027 1028 // append the hit to the finders hit list 1029 appendValue(finder.hits, hit); 1030 } 1031 1032 ////////////////////////////////////////////////////////////////////// 1033 // Updates the counters of the buckets in which the q-gram with hash value hash occurs. 1034 // Assures that those updated bucket counters are set to one 1035 // - that exceeded the reuse mask since last increment 1036 // - for which the last increment lies more than distanceCut away. 1037 // If a bucket counter reaches threshold a hit is appended to the hit-list of the finder. 1038 // Returns true if the hit-list of the finder is not empty after this call. 1039 template < 1040 typename TFinder, 1041 typename TIndex, 1042 typename TSpec, 1043 typename THashValue 1044 > 1045 inline bool _swiftMultiProcessQGram( 1046 TFinder & finder, 1047 Pattern<TIndex, Swift<TSpec> > & pattern, 1048 THashValue hash) 1049 { 1050 typedef Pattern<TIndex, Swift<TSpec> > TPattern; 1051 1052 //typedef typename Size<TIndex>::Type TSize; 1053 typedef typename Fibre<TIndex const, QGramSA>::Type TSA; 1054 typedef typename Iterator<TSA, Standard>::Type TSAIter; 1055 typedef typename TPattern::TBucketString TBucketString; 1056 typedef typename Iterator<TBucketString, Standard>::Type TBucketIter; 1057 typedef typename Value<TBucketString>::Type TBucket; 1058 typedef typename TBucket::TShortSize TShortSize; 1059 typedef typename TPattern::TBucketParams TBucketParams; 1060 //typedef typename FindResult<TFinder, TPattern>::Type THit; 1061 1062 TIndex const &index = host(pattern); 1063 1064 // create an iterator over the positions of the q-gram occurrences in pattern 1065 TSAIter saBegin = begin(indexSA(index), Standard()); 1066 TSAIter occ = saBegin + indexDir(index)[getBucket(index.bucketMap, hash)]; 1067 TSAIter occEnd = saBegin + indexDir(index)[getBucket(index.bucketMap, hash) + 1]; 1068 TBucketIter bktBegin = begin(pattern.buckets, Standard()); 1069 Pair<unsigned> ndlPos; 1070 1071 /* std::cerr<<"\t["<<(occEnd-occ)<<"]"<< std::flush; 1072 1073 if ((occEnd-occ)>100) 1074 { 1075 std::cerr<<" "; 1076 for(int i=0;i<length(indexShape(host(pattern)));++i) 1077 std::cerr<<*(hostIterator(hostIterator(finder))+i); 1078 } 1079 */ 1080 1081 // iterate over all q-gram occurrences and do the processing 1082 int64_t curPos = finder.curPos + pattern.finderPosOffset; 1083 for(; occ != occEnd; ++occ) 1084 { 1085 posLocalize(ndlPos, *occ, stringSetLimits(index)); // get pair of SeqNo and Pos in needle 1086 TBucketParams &bucketParams = _swiftBucketParams(pattern, getSeqNo(ndlPos)); 1087 1088 // begin position of the diagonal of q-gram occurrence in haystack (possibly negative) 1089 int64_t diag = finder.curPos; 1090 if (Swift<TSpec>::DIAGONAL == 1) diag -= getSeqOffset(ndlPos); 1091 1092 unsigned bktNo = (diag >> bucketParams.logDelta) & bucketParams.reuseMask; // bucket no of diagonal 1093 unsigned bktOfs = diag & (bucketParams.delta - 1); // offset of diagonal to bucket begin 1094 int64_t bktBeginHstk = diag & ~(int64_t)(bucketParams.delta - 1); // haystack position of bucket begin diagonal 1095 1096 // global (over all pattern sequences) number of current bucket 1097 TBucketIter bkt = bktBegin + (_swiftBucketNo(pattern, bucketParams, getSeqNo(ndlPos)) + bktNo); 1098 1099 TShortSize hitCount; 1100 1101 do { 1102 if ((int64_t)(*bkt).lastIncrement < bktBeginHstk + (int64_t)pattern.finderPosOffset 1103 || (int64_t)((*bkt).lastIncrement + bucketParams.distanceCut) < curPos) 1104 { 1105 // last increment was before the beginning of the current bucket => bucket is reused 1106 // Or last increment was in the same bucket but lies more than distanceCut away 1107 1108 if ((*bkt).counter >= (*bkt).threshold) 1109 { 1110 // create a new hit and append it to the finders hit list 1111 _createHit(finder, pattern, *bkt, bucketParams, bktBeginHstk, getSeqNo(ndlPos)); 1112 } 1113 1114 // reuse bucket 1115 hitCount = 1; 1116 (*bkt).firstIncrement = curPos; 1117 } 1118 else if((int64_t)((*bkt).lastIncrement + bucketParams.tabooLength) > curPos) 1119 { 1120 // bkt counter was already incremented for another q-gram at 1121 // a haystack position that is closer than tabooLength 1122 // we jump directly to 1123 // where we check whether the q-gram falls into another overlapping bucket or not 1124 goto checkOverlap; 1125 } 1126 else 1127 { 1128 if((*bkt).counter == 0) (*bkt).firstIncrement = curPos; 1129 hitCount = (*bkt).counter + 1; 1130 } 1131 1132 (*bkt).lastIncrement = curPos; 1133 (*bkt).counter = hitCount; 1134 #ifdef SEQAN_DEBUG_SWIFT 1135 (*bkt)._lastIncDiag = diag; 1136 #endif 1137 1138 if (hitCount == (*bkt).threshold && (*bkt).notListed) { 1139 // append bkt no to patterns verify list 1140 appendValue(pattern.verifyList, Pair<unsigned>(getSeqNo(ndlPos), bktNo)); 1141 (*bkt).notListed = false; 1142 } 1143 1144 checkOverlap: 1145 // check if q-gram falls into another overlapping bucket 1146 if(bktOfs >= bucketParams.overlap) break; 1147 1148 // set to previous overlapping bucket for next iteration 1149 bktBeginHstk -= bucketParams.delta; 1150 bktOfs += bucketParams.delta; 1151 if(bktNo) { 1152 --bktNo; 1153 --bkt; 1154 } else { 1155 bktNo = bucketParams.reuseMask; 1156 bkt += bktNo; 1157 } 1158 } 1159 while(true); 1160 } 1161 1162 finder.curHit = begin(finder.hits, Standard()); 1163 finder.endHit = end(finder.hits, Standard()); 1164 1165 return !empty(finder.hits); 1166 } 1167 1168 /////////////////////////////////////////////////////////////////// 1169 // Updates the counters of the buckets in which the q-gram with hash value hash occurs. 1170 // Assures that those updated bucket counters that exceeded the reuse mask since last increment are set to one. 1171 // If a bucket counter reaches threshold a hit is appended to the hit-list of the finder. 1172 // Returns true if the hit-list of the finder is not empty after this call. 1173 template < 1174 typename TFinder, 1175 typename TIndex, 1176 typename TSpec_, 1177 typename THValue 1178 > 1179 inline bool _swiftMultiProcessQGram( 1180 TFinder &finder, 1181 Pattern<TIndex, Swift<Tag<SwiftSemiGlobal_<TSpec_> > > > &pattern, 1182 THValue hash) 1183 { 1184 typedef Pattern<TIndex, Swift<Tag<SwiftSemiGlobal_<TSpec_> > > > TPattern; 1185 1186 typedef typename Size<TIndex>::Type TSize; 1187 typedef typename Fibre<TIndex, QGramSA>::Type TSA; 1188 typedef typename Iterator<TSA const, Standard>::Type TSAIter; 1189 typedef typename TPattern::TBucketString TBucketString; 1190 typedef typename Iterator<TBucketString, Standard>::Type TBucketIter; 1191 typedef typename Value<TBucketString>::Type TBucket; 1192 typedef typename TBucket::TShortSize TShortSize; 1193 typedef typename TPattern::TBucketParams TBucketParams; 1194 typedef typename FindResult<TFinder, TPattern>::Type THit; 1195 1196 TIndex const & index = host(pattern); 1197 1198 // create an iterator over the positions of the q-gram occurrences in pattern 1199 TSAIter saBegin = begin(indexSA(index), Standard()); 1200 TSAIter occ = saBegin + indexDir(index)[getBucket(index.bucketMap, hash)]; 1201 TSAIter occEnd = saBegin + indexDir(index)[getBucket(index.bucketMap, hash) + 1]; 1202 TBucketIter bktBegin = begin(pattern.buckets, Standard()); 1203 Pair<unsigned> ndlPos; 1204 1205 /* std::cerr<<"\t["<<(occEnd-occ)<<"]"<< std::flush; 1206 1207 if ((occEnd-occ)>100) 1208 { 1209 std::cerr<<" "; 1210 for(int i=0;i<length(indexShape(host(pattern)));++i) 1211 std::cerr<<*(hostIterator(hostIterator(finder))+i); 1212 } 1213 */ 1214 // iterate over all q-gram occurrences and do the processing 1215 int64_t curPos = finder.curPos + pattern.finderPosOffset; 1216 // bool dbg=pattern.params.debug && (finder.curPos > 496 && finder.curPos < 591); 1217 for(; occ != occEnd; ++occ) 1218 { 1219 posLocalize(ndlPos, *occ, stringSetLimits(index)); 1220 TBucketParams &bucketParams = _swiftBucketParams(pattern, getSeqNo(ndlPos)); 1221 1222 int64_t diag = finder.curPos; 1223 if (Swift<Tag<SwiftSemiGlobal_<TSpec_> > >::DIAGONAL == 1) diag -= getSeqOffset(ndlPos); 1224 1225 unsigned bktNo = (diag >> bucketParams.logDelta) & bucketParams.reuseMask; 1226 unsigned bktOfs = diag & (bucketParams.delta - 1); 1227 int64_t bktBeginHstk = diag & ~(int64_t)(bucketParams.delta - 1); 1228 1229 TBucketIter bkt = bktBegin + (_swiftBucketNo(pattern, bucketParams, getSeqNo(ndlPos)) + bktNo); 1230 TShortSize hitCount; 1231 1232 // if (dbg && (*occ).i1==15) 1233 // std::cout << finder.curPos << '\t' << *occ << '\t' << (*bkt).counter<<std::endl; 1234 // 1235 do 1236 { 1237 if ((int64_t)(*bkt).lastIncrement < bktBeginHstk + (int64_t)pattern.finderPosOffset) 1238 { 1239 // last increment was before the beginning of the current bucket 1240 // (we must ensure that bucketIdx doesn't collide) 1241 hitCount = 1; 1242 } 1243 else 1244 { 1245 if ((int64_t)((*bkt).lastIncrement + bucketParams.tabooLength) > curPos) 1246 goto checkOverlap; // increment only once per sequence 1247 hitCount = (*bkt).counter + 1; 1248 } 1249 1250 (*bkt).lastIncrement = curPos; 1251 (*bkt).counter = hitCount; 1252 #ifdef SEQAN_DEBUG_SWIFT 1253 (*bkt)._lastIncDiag = diag; 1254 #endif 1255 1256 if (hitCount == (*bkt).threshold) 1257 { 1258 1259 TSize height = 0; 1260 if (Swift<Tag<SwiftSemiGlobal_<TSpec_> > >::DIAGONAL == 1) 1261 height = sequenceLength(getSeqNo(ndlPos), host(pattern)) - 1; 1262 1263 #ifdef SEQAN_DEBUG_SWIFT 1264 // upper bucket no. of lastIncr. q-gram 1265 int64_t upperBktNo = ((*bkt).lastIncrement - pattern.finderPosOffset) >> bucketParams.logDelta; 1266 1267 // we must decrement bucket no. until (no. mod reuse == bktNo) 1268 int64_t _bktBeginHstk = 1269 (upperBktNo - ((upperBktNo - bktNo) & bucketParams.reuseMask)) << bucketParams.logDelta; 1270 1271 if ((*bkt)._lastIncDiag - _bktBeginHstk >= bucketParams.delta + bucketParams.overlap || (*bkt)._lastIncDiag < _bktBeginHstk) { 1272 std::cerr << "qgram stored in wrong bucket (diag:" << (*bkt)._lastIncDiag << ", begin:" << _bktBeginHstk; 1273 std::cerr << ", delta:" << bucketParams.delta << ", overlap:" << bucketParams.overlap << ")" << std::endl; 1274 } 1275 #endif 1276 // if (bktBeginHstk >= 0) 1277 // { 1278 THit hit = { 1279 bktBeginHstk, // bucket begin in haystack 1280 getSeqNo(ndlPos), // needle seq. number 1281 static_cast<unsigned>(height + bucketParams.delta + 1282 bucketParams.overlap) // bucket width (non-diagonal) 1283 }; 1284 appendValue(finder.hits, hit); 1285 // } else { 1286 // // match begins left of haystack begin 1287 // THit hit = { 1288 // 0, // bucket begin in haystack 1289 // getSeqNo(ndlPos), // needle seq. number 1290 // height + bucketParams.delta + bucketParams.overlap // bucket width (non-diagonal) 1291 // + (diag & ~(int64_t)(bucketParams.delta - 1)) 1292 // }; 1293 // appendValue(finder.hits, hit); 1294 // } 1295 } 1296 1297 checkOverlap: 1298 if (bktOfs >= bucketParams.overlap) break; 1299 1300 // repeat with the previous overlapping bucket 1301 bktBeginHstk -= bucketParams.delta; 1302 bktOfs += bucketParams.delta; 1303 if (bktNo) { 1304 --bktNo; 1305 --bkt; 1306 } else { 1307 bktNo = bucketParams.reuseMask; 1308 bkt += bktNo; 1309 } 1310 } while (true); 1311 } 1312 1313 finder.curHit = begin(finder.hits, Standard()); 1314 finder.endHit = end(finder.hits, Standard()); 1315 1316 return !empty(finder.hits); 1317 } 1318 1319 //////////////////////////////////////////////////////////////////////////////////// 1320 // resets counter and lastIncrement of all buckets listed in patterns verify list 1321 template < 1322 typename TFinder, 1323 typename TIndex, 1324 typename TSpec 1325 > 1326 inline bool _swiftMultiFlushBuckets( 1327 TFinder & finder, 1328 Pattern<TIndex, Swift<TSpec> > & pattern 1329 ) 1330 { 1331 typedef Pattern<TIndex, Swift<TSpec> > TPattern; 1332 1333 typedef typename TPattern::TBucket TBucket; 1334 typedef typename TBucket::TSize TBucketSize; 1335 typedef typename TPattern::TBucketString TBucketString; 1336 typedef typename Iterator<TBucketString, Standard>::Type TBucketIterator; 1337 typedef typename TPattern::TBucketParams TBucketParams; 1338 1339 typedef typename TPattern::TVerifyList TVerifyList; 1340 typedef typename Iterator<TVerifyList, Standard>::Type TListIterator; 1341 1342 typedef typename Size<TIndex>::Type TSize; 1343 1344 int64_t hstkLength = length(haystack(finder)); 1345 1346 TListIterator verifyBkt = begin(pattern.verifyList, Standard()); 1347 TListIterator verifyListEnd = end(pattern.verifyList, Standard()); 1348 for (; verifyBkt < verifyListEnd; ++verifyBkt) 1349 { 1350 unsigned bktNo = (*verifyBkt).i2; 1351 unsigned ndlSeqNo = (*verifyBkt).i1; 1352 TBucketParams &bucketParams = _swiftBucketParams(pattern, ndlSeqNo); 1353 1354 TBucketIterator bkt = begin(pattern.buckets, Standard()) + _swiftBucketNo(pattern, bucketParams, ndlSeqNo) + bktNo; 1355 if ((*bkt).counter >= (*bkt).threshold) 1356 { 1357 // hstkPos / delta: gives the number of the bucket that is at the top of this column (modulo reuseMask missing) 1358 TSize topBucket = (TSize)(hstkLength >> bucketParams.logDelta); 1359 // number of buckets in last column above the bucket with the number bktNo 1360 TSize bucketNoInCol = (topBucket + bucketParams.reuseMask + 1 - bktNo) & bucketParams.reuseMask; 1361 // begin position of lower diagonal of this bucket in haystack (possibly negative) 1362 int64_t diag = (hstkLength & ~(int64_t)(bucketParams.delta - 1)) - (bucketNoInCol << bucketParams.logDelta); 1363 1364 // create a new hit and append it to the finders hit list 1365 _createHit(finder, pattern, *bkt, bucketParams, diag, ndlSeqNo); 1366 } 1367 _resetBucket(*bkt, (TBucketSize)0 - (TBucketSize)bucketParams.tabooLength); 1368 } 1369 1370 clear(pattern.verifyList); 1371 1372 finder.curHit = begin(finder.hits, Standard()); 1373 finder.endHit = end(finder.hits, Standard()); 1374 1375 return !empty(finder.hits); 1376 } 1377 1378 ////////////////////////////////////////////////////// 1379 // no resetting is needed for the semiglobal version 1380 template < 1381 typename TFinder, 1382 typename TIndex, 1383 typename TSpec_ 1384 > 1385 inline bool _swiftMultiFlushBuckets( 1386 TFinder &, 1387 Pattern<TIndex, Swift<Tag<SwiftSemiGlobal_<TSpec_> > > > &) 1388 { 1389 // there is nothing to be done here as we dump matches immediately after reaching the threshold 1390 return false; 1391 } 1392 1393 template <typename TIndex, typename TSpec> 1394 inline bool 1395 empty(Pattern<TIndex, Swift<TSpec> > & me) 1396 { 1397 return empty(me.bucketParams); 1398 } 1399 1400 template <typename TIndex, typename TSpec> 1401 inline void 1402 clear(Pattern<TIndex, Swift<TSpec> > & me) 1403 { 1404 me.finderPosOffset = 0; 1405 me.finderPosNextOffset = 0; 1406 me.finderLength = 0; 1407 me.maxPatternLength = 0; 1408 me._currentErrorRate = -1; 1409 me._currentMinLengthForAll = -1; 1410 me.curSeqNo = 0; 1411 me.curBeginPos = 0; 1412 me.curEndPos = 0; 1413 clear(me.bucketParams); 1414 clear(me.buckets); 1415 } 1416 1417 //____________________________________________________________________________ 1418 1419 template <typename THaystack, typename TSpec> 1420 inline typename Position<Finder<THaystack, Swift<TSpec> > >::Type 1421 position(Finder<THaystack, Swift<TSpec> > const & finder) 1422 { 1423 typename Finder<THaystack, Swift<TSpec> >::TSwiftHit &hit = *finder.curHit; 1424 return hit.hstkPos + hit.bucketWidth; 1425 } 1426 1427 template <typename THaystack, typename TSpec> 1428 inline typename Position<Finder<THaystack, Swift<TSpec> > >::Type 1429 position(Finder<THaystack, Swift<TSpec> > & finder) 1430 { 1431 return position(const_cast<Finder<THaystack, Swift<TSpec> > const &>(finder)); 1432 } 1433 1434 //____________________________________________________________________________ 1435 1436 template <typename TIndex, typename TSpec> 1437 inline typename SAValue<TIndex>::Type 1438 position(Pattern<TIndex, Swift<TSpec> > const & pattern) 1439 { 1440 int64_t hitEnd = pattern.curEndPos; 1441 int64_t textLength = sequenceLength(pattern.curSeqNo, needle(pattern)); 1442 if(hitEnd > textLength) hitEnd = textLength; 1443 1444 typename SAValue<TIndex >::Type pos; 1445 posLocalToX(pos, Pair<unsigned, int64_t>(pattern.curSeqNo, hitEnd), stringSetLimits(host(pattern))); 1446 return pos; 1447 } 1448 1449 template <typename TIndex, typename TSpec> 1450 inline typename SAValue<TIndex>::Type 1451 position(Pattern<TIndex, Swift<Tag<SwiftSemiGlobal_<TSpec> > > > const & pattern) 1452 { 1453 typedef typename Size<TIndex>::Type TSize; 1454 typename SAValue<TIndex >::Type pos; 1455 posLocalToX(pos, Pair<unsigned, TSize>(pattern.curSeqNo, length(needle(pattern))), stringSetLimits(host(pattern))); 1456 return pos; 1457 } 1458 1459 template <typename TIndex, typename TSpec> 1460 inline typename SAValue<TIndex>::Type 1461 position(Pattern<TIndex, Swift<TSpec> > & pattern) 1462 { 1463 return position(const_cast<Pattern<TIndex, Swift<TSpec> > const &>(pattern)); 1464 } 1465 1466 //____________________________________________________________________________ 1467 1468 template <typename THaystack, typename TSpec> 1469 inline int64_t 1470 beginPosition(Finder<THaystack, Swift<TSpec> > const & finder) 1471 { 1472 return (*finder.curHit).hstkPos; 1473 } 1474 1475 template <typename THaystack, typename TSpec> 1476 inline int64_t 1477 beginPosition(Finder<THaystack, Swift<TSpec> > & finder) 1478 { 1479 return beginPosition(const_cast<Finder<THaystack, Swift<TSpec> > const &>(finder)); 1480 } 1481 1482 //____________________________________________________________________________ 1483 1484 /*! 1485 * @fn SwiftPattern#beginPosition 1486 * @headerfile <seqan/index.h> 1487 * @brief Returns the begin position of the local match in the pattern. 1488 * 1489 * @signature TPosition beginPosition(pattern); 1490 * 1491 * @param[in] pattern The SwiftPattern to query. 1492 * @return TPosition The @link TextConcept#SAValue SA value @endlink of the pattern text. 1493 */ 1494 1495 template <typename TIndex, typename TSpec> 1496 inline typename SAValue<TIndex>::Type 1497 beginPosition(Pattern<TIndex, Swift<TSpec> > const & pattern) 1498 { 1499 int64_t hitBegin = pattern.curBeginPos; 1500 if (hitBegin < 0) hitBegin = 0; 1501 1502 typename SAValue<TIndex >::Type pos; 1503 posLocalToX(pos, Pair<unsigned, int64_t>(pattern.curSeqNo, hitBegin), stringSetLimits(host(pattern))); 1504 return pos; 1505 } 1506 1507 template <typename TIndex, typename TSpec> 1508 inline typename SAValue<TIndex >::Type 1509 beginPosition(Pattern<TIndex, Swift<Tag<SwiftSemiGlobal_<TSpec> > > > const & pattern) 1510 { 1511 typename SAValue<TIndex >::Type pos; 1512 posLocalToX(pos, Pair<unsigned>(pattern.curSeqNo, 0), stringSetLimits(host(pattern))); 1513 return pos; 1514 } 1515 1516 template <typename TIndex, typename TSpec> 1517 inline typename SAValue<TIndex>::Type 1518 beginPosition(Pattern<TIndex, Swift<TSpec> > & pattern) 1519 { 1520 return beginPosition(const_cast<Pattern<TIndex, Swift<TSpec> > const &>(pattern)); 1521 } 1522 1523 //____________________________________________________________________________ 1524 1525 /*! 1526 * @fn SwiftPattern#endPosition 1527 * @headerfile <seqan/index.h> 1528 * @brief Returns the end position of the local match in the pattern. 1529 * 1530 * @signature TPosition endPosition(pattern); 1531 * 1532 * @param[in] pattern The SwiftPattern to query. 1533 * @return TPosition The @link TextConcept#SAValue SA value @endlink of the pattern text. 1534 */ 1535 1536 template <typename THaystack, typename TSpec> 1537 inline typename Position<Finder<THaystack, Swift<TSpec> > >::Type 1538 endPosition(Finder<THaystack, Swift<TSpec> > const & finder) 1539 { 1540 typename Finder<THaystack, Swift<TSpec> >::TSwiftHit &hit = *finder.curHit; 1541 return hit.hstkPos + hit.bucketWidth; 1542 } 1543 1544 template <typename THaystack, typename TSpec> 1545 inline typename Position<Finder<THaystack, Swift<TSpec> > >::Type 1546 endPosition(Finder<THaystack, Swift<TSpec> > & finder) 1547 { 1548 return endPosition(const_cast<Finder<THaystack, Swift<TSpec> > const &>(finder)); 1549 } 1550 1551 //____________________________________________________________________________ 1552 1553 template <typename TIndex, typename TSpec> 1554 inline typename SAValue<TIndex>::Type 1555 endPosition(Pattern<TIndex, Swift<TSpec> > const & pattern) 1556 { 1557 int64_t hitEnd = pattern.curEndPos; 1558 int64_t textLength = sequenceLength(pattern.curSeqNo, needle(pattern)); 1559 if(hitEnd > textLength) hitEnd = textLength; 1560 1561 typename SAValue<TIndex >::Type pos; 1562 posLocalToX(pos, Pair<unsigned, int64_t>(pattern.curSeqNo, hitEnd), stringSetLimits(host(pattern))); 1563 return pos; 1564 } 1565 1566 template <typename TIndex, typename TSpec> 1567 inline typename SAValue<TIndex >::Type 1568 endPosition(Pattern<TIndex, Swift<Tag<SwiftSemiGlobal_<TSpec> > > > const & pattern) 1569 { 1570 typedef typename Size<TIndex>::Type TSize; 1571 typename SAValue<TIndex >::Type pos; 1572 posLocalToX(pos, Pair<unsigned, TSize>(pattern.curSeqNo, length(needle(pattern))), stringSetLimits(host(pattern))); 1573 return pos; 1574 } 1575 1576 template <typename TIndex, typename TSpec> 1577 inline typename SAValue<TIndex>::Type 1578 endPosition(Pattern<TIndex, Swift<TSpec> > & pattern) 1579 { 1580 return endPosition(const_cast<Pattern<TIndex, Swift<TSpec> > const &>(pattern)); 1581 } 1582 1583 //____________________________________________________________________________ 1584 /*! 1585 * @fn SwiftFinder#positionRangeNoClip 1586 * @headerfile <seqan/index.h> 1587 * @brief Returns a pair of the begin and end position in or beyond the haystack or needle for the last hit found. 1588 * 1589 * @signature TPair positionRangeNoClip(finder); 1590 * 1591 * @param[in] finder A SwiftFinder object to query. 1592 * 1593 * @return TPair A pair of the begin and end position in the haystack or needle for the last hit found. These positions 1594 * could be negative or beyond the end of <tt>finder</tt> or <tt>pattern</tt> when using filter 1595 * algorithms. The return type is <tt>Pair<typename SAValue<THost>::Type></tt> if 1596 * <tt>THost</tt> is the type of haystack or needle. 1597 * 1598 * @see SwiftFinder#positionRange 1599 * @see SwiftPattern#positionRangeNoClip 1600 */ 1601 1602 /*! 1603 * @fn SwiftPattern#positionRangeNoClip 1604 * @headerfile <seqan/index.h> 1605 * @brief Returns a pair of the begin and end position in or beyond the haystack or needle for the last hit found. 1606 * 1607 * @signature TPair positionRangeNoClip(pattern) 1608 * 1609 * @param[in] pattern SwiftPattern object to query. 1610 * 1611 * @return TPair A pair of the begin and end position in the haystack or needle for the last hit found. These positions 1612 * could be negative or beyond the end of <tt>finder</tt> or <tt>pattern</tt> when using filter 1613 * algorithms. The return type is <tt>Pair<typename SAValue<THost>::Type></tt> if 1614 * <tt>THost</tt> is the type of haystack or needle. 1615 * 1616 * @see SwiftPattern#positionRange 1617 * @see SwiftFinder#positionRangeNoClip 1618 */ 1619 1620 template <typename THaystack, typename TSpec> 1621 inline Pair<typename Position<Finder<THaystack, Swift<TSpec> > >::Type> 1622 positionRangeNoClip(Finder<THaystack, Swift<TSpec> > const & finder) 1623 { 1624 typedef typename Position<Finder<THaystack, Swift<TSpec> > >::Type TPosition; 1625 typedef Pair<TPosition> TPair; 1626 typename Finder<THaystack, Swift<TSpec> >::TSwiftHit &hit = *finder.curHit; 1627 return TPair((TPosition)hit.hstkPos, (TPosition)(hit.hstkPos + hit.bucketWidth)); 1628 } 1629 1630 template <typename THaystack, typename TSpec> 1631 inline Pair<typename Position<Finder<THaystack, Swift<TSpec> > >::Type> 1632 positionRangeNoClip(Finder<THaystack, Swift<TSpec> > & finder) 1633 { 1634 return positionRangeNoClip(const_cast<Finder<THaystack, Swift<TSpec> > const &>(finder)); 1635 } 1636 1637 //____________________________________________________________________________ 1638 /*! 1639 * @fn SwiftFinder#positionRange 1640 * @headerfile <seqan/index.h> 1641 * @brief Returns a pair of the begin and end position in the haystack or needle for the last hit found. 1642 * 1643 * @signature TPair positionRange(finder); 1644 * 1645 * @param[in] finder A @link Finder @endlink object. 1646 * 1647 * @return TPair A @link Pair @endlink of the begin and end position in the haystack or needle for the last 1648 * hit found. The return type is <tt>Pair<typename SAValue<THost&g;::Type></tt> if 1649 * <tt>THost</tt> is the type of haystack or needle. 1650 * 1651 * @see Finder#beginPosition 1652 * @see Finder#endPosition 1653 * @see SwiftFinder#positionRangeNoClip 1654 */ 1655 1656 /*! 1657 * @fn SwiftPattern#positionRange 1658 * @headerfile <seqan/index.h> 1659 * @brief Returns a pair of the begin and end position in the haystack or needle for the last hit found. 1660 * 1661 * @signature TPair positionRange(pattern); 1662 * 1663 * @param[in] pattern A @link Pattern @endlink object. 1664 * 1665 * @return TPair A @link Pair @endlink of the begin and end position in the haystack or needle for the last 1666 * hit found. The return type is <tt>Pair<typename SAValue<THost&g;::Type></tt> if 1667 * <tt>THost</tt> is the type of haystack or needle. 1668 * 1669 * @see SwiftPattern#positionRangeNoClip 1670 */ 1671 1672 template <typename THaystack, typename TSpec> 1673 inline Pair<typename Position<Finder<THaystack, Swift<TSpec> > >::Type> 1674 positionRange(Finder<THaystack, Swift<TSpec> > const & finder) 1675 { 1676 typedef typename Position<Finder<THaystack, Swift<TSpec> > >::Type TPosition; 1677 typedef Pair<TPosition> TPair; 1678 typename Finder<THaystack, Swift<TSpec> >::TSwiftHit &hit = *finder.curHit; 1679 1680 int64_t hitBegin = hit.hstkPos; 1681 int64_t hitEnd = hit.hstkPos + hit.bucketWidth; 1682 int64_t textEnd = length(haystack(finder)); 1683 1684 if (hitBegin < 0) hitBegin = 0; 1685 if (hitEnd > textEnd) hitEnd = textEnd; 1686 return TPair((TPosition)hitBegin, (TPosition)hitEnd); 1687 } 1688 1689 template <typename THaystack, typename TSpec> 1690 inline Pair<typename Position<Finder<THaystack, Swift<TSpec> > >::Type> 1691 positionRange(Finder<THaystack, Swift<TSpec> > & finder) 1692 { 1693 return positionRange(const_cast<Finder<THaystack, Swift<TSpec> > const &>(finder)); 1694 } 1695 1696 //____________________________________________________________________________ 1697 1698 template <typename TIndex, typename TSpec> 1699 inline Pair<typename SAValue<TIndex>::Type> 1700 positionRange(Pattern<TIndex, Swift<TSpec> > & pattern) 1701 { 1702 return Pair<typename SAValue<TIndex>::Type> (beginPosition(pattern), endPosition(pattern)); 1703 } 1704 1705 //____________________________________________________________________________ 1706 1707 // TODO(holtgrew): Document this properly. 1708 1709 template <typename TSwiftHit, typename TText> 1710 inline typename Infix<TText>::Type 1711 swiftInfixNoClip(TSwiftHit const &hit, TText &text) 1712 { 1713 return infix(text, hit.hstkPos, hit.hstkPos + hit.bucketWidth); 1714 } 1715 1716 template <typename TSwiftHit, typename TText> 1717 inline typename Infix<TText>::Type 1718 swiftInfix(TSwiftHit const & hit, TText & text) 1719 { 1720 int64_t hitBegin = hit.hstkPos; 1721 int64_t hitEnd = hit.hstkPos + hit.bucketWidth; 1722 int64_t textEnd = length(text); 1723 1724 if (hitBegin < 0) hitBegin = 0; 1725 if (hitEnd > textEnd) hitEnd = textEnd; 1726 SEQAN_ASSERT_LEQ(hitBegin, hitEnd); 1727 return infix(text, hitBegin, hitEnd); 1728 } 1729 1730 //____________________________________________________________________________ 1731 1732 // now in find_base.h 1733 template <typename THaystack, typename TSpec> 1734 inline typename Infix<THaystack>::Type 1735 infix(Finder<THaystack, Swift<TSpec> > &finder) 1736 { 1737 typename Parameter_<THaystack>::Type tmpHaystack = haystack(finder); 1738 return swiftInfix(*finder.curHit, tmpHaystack); 1739 } 1740 1741 template <typename THaystack, typename TSpec, typename TText> 1742 inline typename Infix<TText>::Type 1743 infix(Finder<THaystack, Swift<TSpec> > &finder, TText &text) 1744 { 1745 return swiftInfix(*finder.curHit, text); 1746 } 1747 1748 //____________________________________________________________________________ 1749 1750 template <typename THaystack, typename TSpec> 1751 inline typename Infix<THaystack>::Type 1752 infixNoClip(Finder<THaystack, Swift<TSpec> > &finder) 1753 { 1754 return swiftInfixNoClip(*finder.curHit, haystack(finder)); 1755 } 1756 1757 template <typename THaystack, typename TSpec, typename TText> 1758 inline typename Infix<TText>::Type 1759 infixNoClip(Finder<THaystack, Swift<TSpec> > &finder, TText &text) 1760 { 1761 return swiftInfixNoClip(*finder.curHit, text); 1762 } 1763 1764 //____________________________________________________________________________ 1765 1766 template <typename TIndex, typename TSpec, typename TText> 1767 inline typename Infix<TText>::Type 1768 infix(Pattern<TIndex, Swift<TSpec> > const & pattern, TText &text) 1769 { 1770 int64_t hitBegin = pattern.curBeginPos; 1771 int64_t hitEnd = pattern.curEndPos; 1772 int64_t textLength = sequenceLength(pattern.curSeqNo, needle(pattern)); 1773 1774 if (hitEnd > textLength) hitEnd = textLength; 1775 if (hitBegin < 0) hitBegin = 0; 1776 1777 return infix(text, hitBegin, hitEnd); 1778 } 1779 1780 template <typename TIndex, typename TSpec> 1781 inline typename Infix< typename GetSequenceByNo< TIndex const >::Type >::Type 1782 infix(Pattern<TIndex, Swift<TSpec> > const & pattern) 1783 { 1784 return infix(pattern, getSequenceByNo(pattern.curSeqNo, needle(pattern))); 1785 } 1786 1787 template <typename TIndex, typename TSpec> 1788 inline typename Infix< typename GetSequenceByNo< TIndex const >::Type >::Type 1789 infix(Pattern<TIndex, Swift<Tag<SwiftSemiGlobal_<TSpec> > > > const & pattern) 1790 { 1791 return infix(getSequenceByNo(pattern.curSeqNo, needle(pattern)), 0, sequenceLength(pattern.curSeqNo, needle(pattern))); 1792 } 1793 1794 template <typename TIndex, typename TSpec> 1795 inline typename Infix< typename GetSequenceByNo< TIndex const >::Type >::Type 1796 infix(Pattern<TIndex, Swift<TSpec> > & pattern) 1797 { 1798 return infix(const_cast<Pattern<TIndex, Swift<TSpec> > const &>(pattern)); 1799 } 1800 1801 //____________________________________________________________________________ 1802 1803 template <typename THaystack, typename TSpec> 1804 inline void 1805 _printDots(Finder<THaystack, Swift<TSpec> > &finder) 1806 { 1807 while (finder.curPos >= finder.dotPos) 1808 { 1809 finder.dotPos += 100000; 1810 if (finder.dotPos >= finder.dotPos2) 1811 { 1812 std::cerr << (finder.dotPos2 / 1000000) << "M" << std::flush; 1813 finder.dotPos2 += 1000000; 1814 } else 1815 std::cerr << "." << std::flush; 1816 } 1817 } 1818 1819 template <typename TFinder, typename TIndex, typename TSpec> 1820 inline bool 1821 _nextNonRepeatRange( 1822 TFinder &finder, 1823 Pattern<TIndex, Swift<TSpec> > &pattern) 1824 { 1825 //typedef typename TFinder::TRepeat TRepeat; 1826 1827 if (finder.curRepeat == finder.endRepeat) return false; 1828 1829 do 1830 { 1831 finder.startPos = (*finder.curRepeat).endPosition; 1832 if (++finder.curRepeat == finder.endRepeat) 1833 { 1834 finder.endPos = length(host(finder)); 1835 if (finder.startPos + length(pattern.shape) > finder.endPos) 1836 return false; 1837 else 1838 break; 1839 } else 1840 finder.endPos = (*finder.curRepeat).beginPosition; 1841 // repeat until the shape fits in non-repeat range 1842 } while (finder.startPos + length(pattern.shape) > finder.endPos); 1843 1844 finder.curPos = finder.startPos; 1845 hostIterator(finder) = begin(host(finder)) + finder.startPos; 1846 finder.haystackEnd = begin(host(finder)) + (finder.endPos - length(pattern.shape) + 1); 1847 1848 // if (pattern.params.printDots) 1849 // std::cerr << std::endl << " scan range (" << finder.startPos << ", " << finder.endPos << ") " << std::flush; 1850 1851 return true; 1852 } 1853 1854 template <typename TFinder, typename TIndex, typename TSpec> 1855 inline bool 1856 _firstNonRepeatRange( 1857 TFinder &finder, 1858 Pattern<TIndex, Swift<TSpec> > &pattern) 1859 { 1860 //typedef typename TFinder::TRepeat TRepeat; 1861 1862 finder.curRepeat = begin(finder.data_repeats, Standard()); 1863 finder.endRepeat = end(finder.data_repeats, Standard()); 1864 1865 if (finder.curRepeat == finder.endRepeat) 1866 finder.endPos = length(host(finder)); 1867 else 1868 finder.endPos = (*finder.curRepeat).beginPosition; 1869 1870 if (length(pattern.shape) > finder.endPos) 1871 return _nextNonRepeatRange(finder, pattern); 1872 1873 finder.curPos = finder.startPos = 0; 1874 typename Parameter_<typename Host<TFinder>::Type>::Type tmpHost = host(finder); 1875 hostIterator(finder) = begin(tmpHost); 1876 finder.haystackEnd = begin(tmpHost) + (finder.endPos - length(pattern.shape) + 1); 1877 1878 // if (pattern.params.printDots) 1879 // std::cerr << std::endl << " scan range (" << finder.startPos << ", " << finder.endPos << ") " << std::flush; 1880 1881 return true; 1882 } 1883 1884 template <typename TFinder, typename TIndex, typename TSpec> 1885 inline void 1886 _copySwiftHit( 1887 TFinder &finder, 1888 Pattern<TIndex, Swift<TSpec> > &pattern) 1889 { 1890 pattern.curSeqNo = (*finder.curHit).ndlSeqNo; 1891 pattern.curBeginPos = (*finder.curHit).ndlPos; 1892 pattern.curEndPos = (*finder.curHit).ndlPos + (*finder.curHit).hitLengthNeedle; 1893 } 1894 1895 template <typename TFinder, typename TIndex, typename TSpec> 1896 inline void 1897 _copySwiftHit( 1898 TFinder &finder, 1899 Pattern<TIndex, Swift<Tag<SwiftSemiGlobal_<TSpec> > > > &pattern) 1900 { 1901 pattern.curSeqNo = (*finder.curHit).ndlSeqNo; 1902 pattern.curBeginPos = 0; 1903 pattern.curEndPos = length(indexText(needle(pattern))[pattern.curSeqNo]); 1904 } 1905 1906 template <typename TFinder, typename TIndex, typename TSpec> 1907 inline bool 1908 find( 1909 TFinder &finder, 1910 Pattern<TIndex, Swift<Tag<SwiftSemiGlobal_<TSpec> > > > &pattern, 1911 double errorRate) 1912 { 1913 return find(finder, pattern, errorRate, 0); 1914 } 1915 1916 template <typename THaystack, typename TIndex, typename TSpec, typename TSize> 1917 inline bool 1918 find( 1919 Finder<THaystack, Swift<TSpec> > &finder, 1920 Pattern<TIndex, Swift<TSpec> > &pattern, 1921 double errorRate, 1922 TSize minLength) 1923 { 1924 //typedef typename Fibre<TIndex, QGramShape>::Type TShape; 1925 1926 if (empty(finder)) 1927 { 1928 pattern.finderLength = pattern.params.tabooLength + length(container(finder)); 1929 _patternInit(pattern, errorRate, minLength); 1930 _finderSetNonEmpty(finder); 1931 finder.dotPos = 100000; 1932 finder.dotPos2 = 10 * finder.dotPos; 1933 1934 if (!_firstNonRepeatRange(finder, pattern)) return false; 1935 if (_swiftMultiProcessQGram(finder, pattern, hash(pattern.shape, hostIterator(hostIterator(finder))))) 1936 { 1937 _copySwiftHit(finder, pattern); 1938 return true; 1939 } 1940 } 1941 else 1942 { 1943 if (++finder.curHit < finder.endHit) 1944 { 1945 _copySwiftHit(finder, pattern); 1946 return true; 1947 } 1948 } 1949 1950 // all previous matches reported -> search new ones 1951 clear(finder.hits); 1952 1953 // are we at the end of the text? 1954 if (atEnd(finder) && finder.curRepeat == finder.endRepeat) 1955 { 1956 finder.curHit = finder.endHit; 1957 return false; 1958 } 1959 1960 do 1961 { 1962 if (pattern.params.printDots) _printDots(finder); 1963 if (atEnd(++finder)) 1964 { 1965 if (!_nextNonRepeatRange(finder, pattern)) 1966 { 1967 if(_swiftMultiFlushBuckets(finder, pattern)) 1968 { 1969 _copySwiftHit(finder, pattern); 1970 return true; 1971 } 1972 else 1973 return false; 1974 } 1975 hash(pattern.shape, hostIterator(hostIterator(finder))); 1976 } 1977 else 1978 { 1979 ++finder.curPos; 1980 hashNext(pattern.shape, hostIterator(hostIterator(finder))); 1981 } 1982 1983 if (_swiftMultiProcessQGram(finder, pattern, value(pattern.shape))) 1984 { 1985 _copySwiftHit(finder, pattern); 1986 return true; 1987 } 1988 1989 } while (true); 1990 } 1991 1992 template <typename THashes, typename TPipeSpec, typename TIndex, typename TSpec> 1993 inline bool 1994 find( 1995 Finder<Pipe<THashes, TPipeSpec>, Swift<TSpec> > &finder, 1996 Pattern<TIndex, Swift<TSpec> > &pattern, 1997 double errorRate) 1998 { 1999 if (empty(finder)) 2000 { 2001 pattern.finderLength = 0; 2002 _patternInit(pattern, errorRate, 0); 2003 _finderSetNonEmpty(finder); 2004 finder.dotPos = 100000; 2005 finder.dotPos2 = 10 * finder.dotPos; 2006 2007 beginRead(finder.in); 2008 if (eof(finder.in)) 2009 { 2010 endRead(finder.in); 2011 return false; 2012 } 2013 finder.curPos = (*finder.in).i1; 2014 if (_swiftMultiProcessQGram(finder, pattern, hash(pattern.shape, (*finder.in).i2))) 2015 { 2016 _copySwiftHit(finder, pattern); 2017 return true; 2018 } 2019 } else 2020 if (++finder.curHit != finder.endHit) 2021 { 2022 _copySwiftHit(finder, pattern); 2023 return true; 2024 } 2025 2026 clear(finder.hits); 2027 if (eof(finder.in)) return false; 2028 2029 do 2030 { 2031 ++finder.in; 2032 if (eof(finder.in)) 2033 { 2034 endRead(finder.in); 2035 #ifdef SEQAN_DEBUG_SWIFT 2036 _printSwiftBuckets(pattern); 2037 #endif 2038 if(_swiftMultiFlushBuckets(finder, pattern)) 2039 { 2040 _copySwiftHit(finder, pattern); 2041 return true; 2042 } 2043 else 2044 return false; 2045 } 2046 finder.curPos = (*finder.in).i1; 2047 if (pattern.params.printDots) _printDots(finder); 2048 2049 } while (!_swiftMultiProcessQGram(finder, pattern, hash(pattern.shape, (*finder.in).i2))); 2050 2051 _copySwiftHit(finder, pattern); 2052 return true; 2053 } 2054 2055 /*! 2056 * @fn SwiftFinder#windowFindBegin 2057 * @headerfile <seqan/index.h> 2058 * @brief Initializes the pattern. Sets the finder on the begin position. Gets the first non-repeat range and sets it 2059 * in the finder. Used together with @link SwiftFinder#windowFindEnd @endlink. 2060 * 2061 * @signature bool windowFindBegin(finder, pattern, errorRate); 2062 * 2063 * @param[in,out] finder A finder with window interface. 2064 * @param[in,out] pattern A pattern with window interface. Types: @link SwiftPattern @endlink. 2065 * @param[in] errorRate Error rate that is allowed between reads and reference. Should be the same in as in 2066 * @link SwiftFinder#windowFindNext @endlink. Types: <tt>double</tt> 2067 * 2068 * @return bool <tt>true</tt> if there were bases to be scanned and <tt>false</tt> otherwise. 2069 * 2070 * @see SwiftFinder#windowFindNext 2071 * @see SwiftFinder#windowFindEnd 2072 */ 2073 template <typename THaystack, typename TIndex, typename TSpec> 2074 inline bool 2075 windowFindBegin( 2076 Finder<THaystack, Swift<TSpec> > &finder, 2077 Pattern<TIndex, Swift<TSpec> > &pattern, 2078 double errorRate) 2079 { 2080 2081 pattern.finderLength = pattern.params.tabooLength + length(container(finder)); 2082 _patternInit(pattern, errorRate, 0); 2083 _finderSetNonEmpty(finder); 2084 finder.dotPos = 100000; 2085 finder.dotPos2 = 10 * finder.dotPos; 2086 finder.windowStart = 0; 2087 2088 if (!_firstNonRepeatRange(finder, pattern)) return false; 2089 2090 return true; 2091 } 2092 2093 2094 /*! 2095 * @fn SwiftFinder#windowFindNext 2096 * @headerfile <seqan/index.h> 2097 * @brief Searches over the next window with the finder. The found hits can be retrieved with @link 2098 * SwiftFinder#getWindowFindHits @endlink. Used together with @link SwiftFinder#windowFindBegin @endlink and 2099 * @link SwiftFinder#windowFindEnd @endlink. 2100 * 2101 * @signature bool windowFindNext(finder, pattern, finderWindowLength) 2102 * 2103 * @param[in,out] finder A SwiftFinder. 2104 * @param[in,out] pattern A SwiftPattern. 2105 * @param[in] finderWindowLength Number of bases that are scanned beginning from the position the finder is at. 2106 * Including bases that are marked as repeats and that are skipped. Types: 2107 * nolink:unsigned int 2108 * 2109 * @return bool <tt>true</tt>, if there are bases that can be scanned, <tt>false</tt> otherwise. 2110 * 2111 * @see SwiftFinder#windowFindBegin 2112 * @see SwiftFinder#windowFindEnd 2113 * @see SwiftFinder#getWindowFindHits 2114 */ 2115 2116 template <typename THaystack, typename TIndex, typename TSpec, typename TSize> 2117 inline bool 2118 windowFindNext( 2119 Finder<THaystack, Swift<TSpec> > &finder, 2120 Pattern<TIndex, Swift<TSpec> > &pattern, 2121 TSize finderWindowLength 2122 ) 2123 { 2124 2125 typedef typename Fibre<TIndex, QGramShape>::Type TShape; 2126 2127 typedef Finder<THaystack, Swift<TSpec> > TFinder; 2128 typedef typename TFinder::THstkPos THstkPos; 2129 2130 // all previous matches reported -> search new ones 2131 clear(finder.hits); 2132 2133 THstkPos windowEnd = finder.windowStart + finderWindowLength; 2134 2135 // iterate over all non-repeat regions within the window 2136 for (; finder.curPos < windowEnd; ) 2137 { 2138 THstkPos nonRepeatEnd = finder.endPos - length(pattern.shape) + 1; 2139 THstkPos localEnd = _min(windowEnd, nonRepeatEnd); 2140 2141 // filter a non-repeat region within the window 2142 if (finder.curPos < localEnd) 2143 { 2144 TShape &shape = pattern.shape; 2145 _swiftMultiProcessQGram(finder, pattern, hash(shape, hostIterator(hostIterator(finder)))); 2146 2147 for (++finder.curPos, ++finder; finder.curPos < localEnd; ++finder.curPos, ++finder){ 2148 _swiftMultiProcessQGram(finder, pattern, hashNext(shape, hostIterator(hostIterator(finder)))); 2149 } 2150 } 2151 2152 if (pattern.params.printDots) _printDots(finder); 2153 2154 if (finder.curPos >= nonRepeatEnd) 2155 if (!_nextNonRepeatRange(finder, pattern)) 2156 { 2157 finder.windowStart = windowEnd; 2158 return false; 2159 } 2160 } 2161 finder.windowStart = windowEnd; 2162 return true; 2163 } 2164 2165 /*! 2166 * @fn SwiftFinder#windowFindEnd 2167 * @headerfile <seqan/index.h> 2168 * @brief Flushes the finder. Used together with @link SwiftFinder#windowFindBegin @endlink and @link 2169 * SwiftFinder#windowFindNext @endlink. 2170 * 2171 * @signature void windowFindEnd(finder, pattern); 2172 * 2173 * @param[in,out] finder A finder with window interface. 2174 * @param[in,out] pattern A pattern with window interface. Types: @link SwiftPattern @endlink 2175 * 2176 * @see SwiftFinder#windowFindBegin 2177 * @see SwiftFinder#windowFindNext 2178 */ 2179 2180 template <typename THaystack, typename TIndex, typename TSpec> 2181 inline void 2182 windowFindEnd( 2183 Finder<THaystack, Swift<TSpec> > & finder, 2184 Pattern<TIndex, Swift<TSpec> > &pattern) 2185 { 2186 _swiftMultiFlushBuckets(finder, pattern); 2187 } 2188 2189 /*! 2190 * @fn SwiftFinder#getWindowFindHits 2191 * @headerfile <seqan/index.h> 2192 * @brief Returns the string of hits from the finder. 2193 * 2194 * @signature THitString getWindowFindHits(finder); 2195 2196 * @param[in] finder A finder with window interface. 2197 * @return THitString @link String @endlink of hits, see @link SwiftFinder::THitString @endlink. 2198 * 2199 * @see SwiftFinder#windowFindNext 2200 */ 2201 2202 template <typename THaystack, typename TSpec> 2203 inline typename WindowFindResult<Finder<THaystack, Swift<TSpec> >, void>::Type & 2204 getWindowFindHits(Finder<THaystack, Swift<TSpec> > &finder) 2205 { 2206 2207 return finder.hits; 2208 } 2209 2210 /*! 2211 * @fn SwiftPattern#getMaxDeviationOfOrder 2212 * @headerfile <seqan/index.h> 2213 * @brief Returns the maximal out-of-order distance of adjacent hits. 2214 * 2215 * @signature TSize getMaxDeviationOfOrder(pattern); 2216 * 2217 * @param[in] pattern A SwiftPattern. 2218 * @return TSize Returns the maximal distance two adjacent hits can have which are not in increasing order. 2219 * <tt>TSize</tt> is the size type of the underlying index. 2220 */ 2221 2222 template <typename TIndex, typename TSpec> 2223 inline typename Size<TIndex>::Type 2224 getMaxDeviationOfOrder(Pattern<TIndex, Swift<TSpec> > &pattern) 2225 { 2226 // TODO(holtgrew): Can pattern be made const? 2227 return back(pattern.bucketParams).delta + back(pattern.bucketParams).overlap + length(pattern.bucketParams) - 2; 2228 } 2229 2230 2231 }// namespace seqan 2232 2233 #endif //#ifndef SEQAN_HEADER_FIND_SHIFTAND_H 2234 2235