1 /*========================================================================== 2 RazerS - Fast Read Mapping with Controlled Loss Rate 3 http://www.seqan.de/projects/razers.html 4 5 ============================================================================ 6 Copyright (C) 2008 by David Weese 7 8 This program is free software; you can redistribute it and/or 9 modify it under the terms of the GNU Lesser General Public 10 License as published by the Free Software Foundation; either 11 version 3 of the License, or (at your options) any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 Lesser General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. 20 ==========================================================================*/ 21 22 #ifndef SEQAN_HEADER_RAZERS_H 23 #define SEQAN_HEADER_RAZERS_H 24 25 #include <iostream> 26 #include <fstream> 27 28 #ifdef _OPENMP 29 #include <omp.h> 30 #endif 31 32 #include <seqan/find.h> 33 #include <seqan/seq_io.h> 34 #include <seqan/index.h> 35 #include <seqan/index/find_pigeonhole.h> 36 #include <seqan/store.h> 37 #include <seqan/pipe.h> 38 #include <seqan/parallel.h> 39 #include <seqan/seq_io.h> 40 #include <seqan/stream.h> 41 42 #ifdef RAZERS_PROFILE 43 #include "profile_timeline.h" 44 #endif // #ifdef RAZERS_PROFILE 45 46 // No parallelism for less than MIN_PARALLEL_WORK reads. 47 const unsigned MIN_PARALLEL_WORK = 1; //100/*0*/; // TODO(holtgrew): Set to some useful value after development. 48 49 namespace seqan { 50 51 // Compact representation of a match. 52 template <typename TContigPos_> 53 struct MatchRecord 54 { 55 typedef typename MakeSigned_<TContigPos_>::Type TContigPos; 56 57 unsigned contigId; // genome seqNo 58 unsigned readId; // read seqNo 59 TContigPos beginPos; // begin position of the match in the genome 60 TContigPos endPos; // end position of the match in the genome 61 #ifdef RAZERS_DEFER_COMPACTION 62 bool isRegistered; // registered in masking process. 63 #endif // #ifdef RAZERS_DEFER_COMPACTION 64 char orientation; // 'F', 'R', '-' 65 short int score; // Levenshtein distance / score. 66 unsigned pairMatchId; // unique id for the two mate-pair matches (0 if unpaired) 67 int libDiff : 24; // outer distance difference from librarySize 68 int pairScore : 8; // combined score of both mates 69 70 static const unsigned INVALID_ID; 71 MatchRecordMatchRecord72 MatchRecord() : 73 contigId(std::numeric_limits<unsigned>::max()), readId(std::numeric_limits<unsigned>::max()), 74 beginPos(0), endPos(0), 75 #ifdef RAZERS_DEFER_COMPACTION 76 isRegistered(false), 77 #endif // #ifndef RAZERS_DEFER_COMPACTION 78 orientation('-'), score(0), pairMatchId(std::numeric_limits<unsigned>::max()), 79 libDiff(0), pairScore(0) 80 {} 81 }; 82 83 template <typename TStream, typename TPos> 84 TStream & 85 operator<<(TStream & stream, MatchRecord<TPos> & record) 86 { 87 stream << "(contigId=" << record.contigId << ", readId=" << record.readId << ", beginPos=" << record.beginPos << ", endPos = " << record.endPos << ", orientation=" << record.orientation << ", score=" << record.score << ", pairMatchId=" << record.pairMatchId << ", libDiff=" << record.libDiff << ", pairScore=" << record.pairScore << ")"; 88 return stream; 89 } 90 91 template <typename TGPos_> 92 const unsigned MatchRecord<TGPos_>::INVALID_ID = std::numeric_limits<unsigned>::max(); 93 94 #ifdef RAZERS_PROFILE 95 enum 96 { 97 TASK_WAIT, 98 TASK_ON_CONTIG, 99 TASK_INIT, 100 TASK_REVCOMP, 101 TASK_FILTER, 102 TASK_VERIFY, 103 TASK_WRITEBACK, 104 TASK_COMPACT, 105 TASK_DUMP_MATCHES, 106 TASK_LOAD, 107 TASK_SORT, 108 TASK_COPY_FINDER 109 }; 110 #endif // #ifdef RAZERS_PROFILE 111 112 ////////////////////////////////////////////////////////////////////////////// 113 // RazerS modes 114 115 // Alignment mode 116 struct RazerSLocal; 117 struct RazerSGlobal; 118 struct RazerSPrefix; 119 120 // Gap mode 121 struct RazerSGapped; 122 struct RazerSUngapped; 123 124 // Score mode 125 struct RazerSErrors; 126 struct RazerSScore; 127 struct RazerSMAQ; 128 129 template <typename TSpec = Default> 130 struct RazerSQuality; 131 132 template <typename TAlignMode_, typename TGapMode_, typename TScoreMode_, typename TMatchNPolicy_> 133 struct RazerSMode 134 { 135 typedef TAlignMode_ TAlignMode; 136 typedef TGapMode_ TGapMode; 137 typedef TScoreMode_ TScoreMode; 138 typedef TMatchNPolicy_ TMatchNPolicy; 139 }; 140 141 enum AlignMode {RAZERS_LOCAL, RAZERS_PREFIX, RAZERS_GLOBAL}; 142 enum GapMode {RAZERS_GAPPED, RAZERS_UNGAPPED}; 143 enum ScoreMode {RAZERS_ERRORS, RAZERS_SCORE, RAZERS_QUALITY}; 144 enum CompactMatchesMode {COMPACT, COMPACT_FINAL, COMPACT_FINAL_EXTERNAL}; 145 146 147 148 ////////////////////////////////////////////////////////////////////////////// 149 // Default options 150 151 template <bool DONT_VERIFY_ = false, bool DONT_DUMP_RESULTS_ = false> 152 struct RazerSSpec 153 { 154 enum {DONT_VERIFY = DONT_VERIFY_}; // omit verifying potential matches 155 enum {DONT_DUMP_RESULTS = DONT_DUMP_RESULTS_}; // omit dumping results 156 }; 157 158 template <typename TSpec = RazerSSpec<> > 159 struct RazerSCoreOptions 160 { 161 // major options 162 AlignMode alignMode; 163 GapMode gapMode; 164 ScoreMode scoreMode; 165 166 // main options 167 TSpec spec; 168 bool forward; // compute forward oriented read matches 169 bool reverse; // compute reverse oriented read matches 170 double errorRate; // Criteria 1 threshold 171 unsigned maxHits; // output at most maxHits many matches 172 unsigned scoreDistanceRange; // output only the best, second best, ..., scoreDistanceRange best matches 173 int dRange; // used in matchVerify 174 // to a best match with e errors 175 bool purgeAmbiguous; // true..remove reads with more than maxHits best matches, false..keep them 176 CharString output; // name of result file 177 int _debugLevel; // level of verbosity 178 bool printVersion; // print version number 179 int trimLength; // if >0, cut reads to #trimLength characters 180 // controlled pigeonhole extensions 181 double mutationRate; // difference between reference genome and sequenced genome 182 double lossRate; // 1.0 - sensitivity 183 184 // output format options 185 unsigned outputFormat; // 0..Razer format 186 // 1..enhanced Fasta 187 // 2..ELAND format 188 bool dumpAlignment; // compute and dump the match alignments in the result files 189 unsigned genomeNaming; // 0..use Fasta id 190 // 1..enumerate reads beginning with 1 191 // TODO(holtgrew): SAM export should imply --read-naming 3 192 unsigned readNaming; // 0..use Fasta id 193 // 1..enumerate reads beginning with 1 194 // 2..use the read sequence (only for short reads!) 195 // 3..use Fasta id, do not append /L and /R for mate pairs. 196 bool fullFastaId; // read full FastaId or clip after first whitespace 197 unsigned sortOrder; // 0..sort keys: 1. read number, 2. genome position 198 // 1.. 1. genome pos50ition, 2. read number 199 int positionFormat; // 0..gap space 200 // 1..position space 201 const char * runID; // runID needed for gff output 202 bool dontShrinkAlignments; // Required when used for building gold Rabema mapping. 203 bool computeGlobal; // compute global alignment in SAM output 204 205 // filtration parameters 206 std::string shape; // shape (e.g. 11111111111) 207 int threshold; // threshold (minimal threshold, 0 activates pigeonhole mode) 208 int tabooLength; // taboo length 209 int repeatLength; // repeat length threshold 210 double abundanceCut; // abundance threshold 211 int delta; // q-gram delta (in pigeonhole mode), 0=automatic 212 int overlap; // q-gram overlap (in pigeonhole mode), 0=lossless 213 unsigned maxOverlap; // limits the overlap in automatic mode 214 215 // mate-pair parameters 216 int libraryLength; // offset between two mates 217 int libraryError; // offset tolerance 218 unsigned nextPairMatchId; // use this id for the next mate-pair 219 220 // verification parameters 221 unsigned prefixSeedLength; // length of the prefix seed 222 bool matchN; // false..N is always a mismatch, true..N matches with all 223 unsigned char compMask[5]; 224 Score<int, Simple> scoringScheme; 225 int minScore; // minimal alignment score 226 227 // statistics 228 typedef LogProb<> TProb; 229 // typedef double TProb; 230 String<unsigned> readLengths; // read length histogram (i -> #reads of length i) 231 String<double> avrgQuality; // average error quality per base 232 String<TProb> errorProb; // error probability per base 233 CharString errorPrbFileName; 234 CharString mismatchFilename; 235 236 String<double> errorDist; // error distribution 237 int64_t countFiltration; // matches returned by the filter 238 int64_t countVerification; // matches returned by the verifier 239 double timeLoadFiles; // time for loading input files 240 double timeMapReads; // time for mapping reads 241 double timeDumpResults; // time for dumping the results 242 double timeBuildQGramIndex; // time for q-gram index building. 243 double timeCompactMatches; // time for compacting reads 244 double timeMaskDuplicates; // time spent masking duplicates 245 double timeFsCopy; // time spent copying alignments back into the fragment store 246 double timeFiltration; 247 double timeVerification; 248 249 bool maqMapping; 250 int absMaxQualSumErrors; 251 252 bool lowMemory; // set maximum shape weight to 13 to limit size of q-gram index 253 bool fastaIdQual; // hidden option for special fasta+quality format we use 254 255 // misc 256 double noCompactFrac; // If in last noCompactFrac of genome, don't compact. 257 double compactMult; // Multiplicator for compaction threshold. 258 int64_t compactThresh; // compact match array if larger than compactThresh 259 260 // multi-threading 261 262 unsigned threadCount; // Number of threads to use in the parallel version. 263 unsigned windowSize; // Collect SWIFT hits in windows of this length. 264 unsigned verificationPackageSize; // This number of SWIFT hits per verification. 265 unsigned maxVerificationPackageCount; // Maximum number of verification packages to create. 266 int64_t availableMatchesMemorySize; // Memory available for matches. Used for switching to external memory algorithms. -1 for always external, 0 for never. 267 int matchHistoStartThreshold; // Threshold to use for starting histogram. >= 1 268 269 #ifdef RAZERS_OPENADDRESSING 270 double loadFactor; 271 #endif 272 273 // global preprocessing information and maximal allowed errors 274 275 typedef Infix<String<Dna5Q> >::Type TRead; 276 typedef Pattern<TRead const, MyersUkkonen> TMyersPattern; // verifier 277 typedef String<TMyersPattern> TPreprocessing; 278 279 static String<unsigned char> errorCutOff; // ignore matches with >=errorCutOff errors 280 static TPreprocessing forwardPatterns; 281 282 CharString commandLine; 283 std::string version; 284 RazerSCoreOptionsRazerSCoreOptions285 RazerSCoreOptions() 286 { 287 alignMode = RAZERS_GLOBAL; 288 gapMode = RAZERS_GAPPED; 289 scoreMode = RAZERS_ERRORS; 290 291 forward = true; 292 reverse = true; 293 errorRate = 0.05; 294 maxHits = 100; 295 scoreDistanceRange = 0; // disabled 296 dRange = 1 << 30; 297 purgeAmbiguous = false; 298 output = ""; 299 _debugLevel = 0; 300 printVersion = false; 301 trimLength = 0; 302 mutationRate = 0.05; 303 304 outputFormat = 0; 305 dumpAlignment = false; 306 genomeNaming = 0; 307 readNaming = 0; 308 fullFastaId = false; 309 sortOrder = 0; 310 positionFormat = 0; 311 runID = "s"; // 312 dontShrinkAlignments = false; 313 computeGlobal = false; 314 315 matchN = false; 316 shape = "11111111111"; 317 threshold = 1; 318 tabooLength = 1; 319 repeatLength = 1000; 320 abundanceCut = 1; 321 delta = 0; 322 overlap = 0; 323 maxOverlap = 10; 324 325 libraryLength = 220; 326 libraryError = 50; 327 nextPairMatchId = 0; 328 329 prefixSeedLength = 28; // the "artificial" seed length that is used for mapping quality assignment 330 for (unsigned i = 0; i < 4; ++i) 331 compMask[i] = 1 << i; 332 compMask[4] = 0; 333 334 noCompactFrac = 0.05; 335 compactMult = 2.2; 336 compactThresh = 1024; 337 // compactThresh = 40; 338 339 absMaxQualSumErrors = 100; // maximum for sum of mism qualities in total readlength 340 lowMemory = false; // set maximum shape weight to 13 to limit size of q-gram index 341 fastaIdQual = false; 342 343 threadCount = 1; 344 // TODO(holtgrew): Tune this! 345 windowSize = 500000; 346 verificationPackageSize = 100; 347 maxVerificationPackageCount = 100; 348 availableMatchesMemorySize = 0; 349 matchHistoStartThreshold = 5; 350 351 #ifdef RAZERS_OPENADDRESSING 352 loadFactor = 1.6; 353 #endif 354 355 lossRate = 0.0; 356 minScore = 0; 357 countFiltration = 0; 358 countVerification = 0; 359 timeLoadFiles = 0.0; 360 timeMapReads = 0.0; 361 timeDumpResults = 0.0; 362 timeBuildQGramIndex = 0.0; 363 timeCompactMatches = 0.0; 364 timeMaskDuplicates = 0.0; 365 timeFsCopy = 0.0; 366 timeFiltration = 0.0; 367 timeVerification = 0.0; 368 maqMapping = false; 369 } 370 371 }; 372 373 template <typename TSpec = RazerSSpec<> > 374 struct RazerSOptions : RazerSCoreOptions<TSpec> 375 { 376 typedef RazerSCoreOptions<TSpec> TCoreOptions; 377 SeqFileIn readFile; // left read's SeqFile (we have to keep it open and store it here to stream it only once) 378 }; 379 380 template <typename TSpec> 381 String<unsigned char> RazerSCoreOptions<TSpec>::errorCutOff; 382 383 template <typename TSpec> 384 typename RazerSCoreOptions<TSpec>::TPreprocessing RazerSCoreOptions<TSpec>::forwardPatterns; 385 386 ////////////////////////////////////////////////////////////////////////////// 387 // Typedefs 388 389 enum RAZERS_ERROR 390 { 391 RAZERS_INVALID_OPTIONS = -1, 392 RAZERS_READS_FAILED = -2, 393 RAZERS_GENOME_FAILED = -3, 394 RAZERS_INVALID_SHAPE = -4 395 }; 396 397 ////////////////////////////////////////////////////////////////////////////// 398 // Definitions 399 400 template <typename TReadSet, typename TShape, typename TSpec> 401 struct Cargo<Index<TReadSet, IndexQGram<TShape, TSpec> > > 402 { 403 typedef struct 404 { 405 double abundanceCut; 406 int _debugLevel; 407 } Type; 408 }; 409 410 ////////////////////////////////////////////////////////////////////////////// 411 // Memory tuning 412 413 #ifdef RAZERS_MEMOPT 414 415 template <typename TReadSet, typename TShape, typename TSpec> 416 struct SAValue<Index<TReadSet, IndexQGram<TShape, TSpec> > > 417 { 418 typedef Pair< 419 unsigned, 420 unsigned, 421 BitCompressed<24, 8> // max. 16M reads of length < 256 422 > Type; 423 }; 424 425 #else 426 427 template <typename TReadSet, typename TShape, typename TSpec> 428 struct SAValue<Index<TReadSet, IndexQGram<TShape, TSpec> > > 429 { 430 typedef Pair< 431 unsigned, // many reads 432 unsigned // of arbitrary length 433 > Type; 434 }; 435 436 #endif 437 438 template <> 439 struct Size<Dna5String> 440 { 441 typedef unsigned Type; 442 }; 443 444 template <typename TReadSet, typename TShape> 445 struct Size<Index<TReadSet, IndexQGram<TShape> > > 446 { 447 typedef unsigned Type; 448 }; 449 450 template <typename TReadSet, typename TShape> 451 struct Size<Index<TReadSet, IndexQGram<TShape, OpenAddressing> > > 452 { 453 typedef unsigned Type; 454 }; 455 456 457 #ifdef RAZERS_PRUNE_QGRAM_INDEX 458 459 ////////////////////////////////////////////////////////////////////////////// 460 // Repeat masker 461 template <typename TReadSet, typename TShape, typename TSpec> 462 inline bool _qgramDisableBuckets(Index<TReadSet, IndexQGram<TShape, TSpec> > & index) 463 { 464 typedef Index<TReadSet, IndexQGram<TShape, TSpec> > TReadIndex; 465 typedef typename Fibre<TReadIndex, QGramDir>::Type TDir; 466 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 467 typedef typename Value<TDir>::Type TSize; 468 469 TDir & dir = indexDir(index); 470 bool result = false; 471 unsigned counter = 0; 472 TSize thresh = (TSize)(length(index) * cargo(index).abundanceCut); 473 if (thresh < 100) 474 thresh = 100; 475 476 TDirIterator it = begin(dir, Standard()); 477 TDirIterator itEnd = end(dir, Standard()); 478 for (; it != itEnd; ++it) 479 if (*it > thresh) 480 { 481 *it = (TSize) - 1; 482 result = true; 483 ++counter; 484 } 485 486 if (counter > 0 && cargo(index)._debugLevel >= 1) 487 std::cerr << "Removed " << counter << " k-mers" << std::endl; 488 489 return result; 490 } 491 492 #endif 493 494 495 template < 496 typename TFragmentStore_, 497 typename TMatches_, 498 typename TRazerSOptions_, 499 typename TRazerSMode_, 500 typename TFilterPattern_, 501 typename TCounts_ 502 > 503 struct MatchVerifier 504 { 505 typedef TFragmentStore_ TFragmentStore; 506 typedef TMatches_ TMatches; 507 typedef TRazerSOptions_ TOptions; 508 typedef TRazerSMode_ TRazerSMode; 509 typedef TFilterPattern_ TFilterPattern; 510 typedef TCounts_ TCounts; 511 512 typedef typename TRazerSMode::TMatchNPolicy TMatchNPolicy; 513 514 typedef typename TFragmentStore::TReadSeqStore TReadSeqStore; 515 typedef typename Value<TReadSeqStore>::Type const TRead; 516 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore; 517 typedef typename TFragmentStore::TAlignQualityStore TAlignQualityStore; 518 typedef typename TFragmentStore::TContigSeq TContigSeq; 519 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 520 typedef typename Value<TAlignQualityStore>::Type TAlignQuality; 521 typedef typename Size<TContigSeq>::Type TSize; 522 typedef typename MakeSigned_<TSize>::Type TContigPos; 523 typedef ModifiedString<TRead, ModReverse> TRevRead; 524 525 typedef typename Value<TMatches>::Type TMatchRecord; 526 527 #ifdef RAZERS_BANDED_MYERS 528 typedef PatternState_<TRead, Myers<AlignTextBanded<FindInfix, TMatchNPolicy, TMatchNPolicy>, True, void> > TPatternState; 529 typedef PatternState_<TRevRead, Myers<AlignTextBanded<FindPrefix, TMatchNPolicy, TMatchNPolicy>, True, void> > TRPatternState; 530 #else // #ifdef RAZERS_BANDED_MYERS 531 typedef Pattern<TRead, Myers<FindInfix, False, void> > TMyersPattern; 532 typedef Pattern<TRevRead, Myers<FindInfix, False, void> > TRevMyersPattern; 533 typedef typename PatternState<TMyersPattern>::Type TPatternState; 534 typedef typename PatternState<TRevMyersPattern>::Type TRPatternState; 535 #endif // #ifdef RAZERS_BANDED_MYERS 536 537 TMatches * matches; 538 TOptions * options; // RazerS options 539 TFilterPattern * filterPattern; 540 TCounts * cnts; 541 542 TMatchRecord m; 543 TPatternState patternState; 544 TRPatternState revPatternState; 545 TSize genomeLength; 546 TSize rightClip; 547 TContigPos sinkPos; 548 bool onReverseComplement; 549 bool oneMatchPerBucket; 550 551 double compactionTime; 552 553 MatchVerifier() : 554 genomeLength(0), rightClip(0), sinkPos(std::numeric_limits<TContigPos>::max()), onReverseComplement(false), oneMatchPerBucket(false), compactionTime(0) {} 555 556 MatchVerifier(TMatches_ & _matches, TOptions & _options, TFilterPattern & _filterPattern, TCounts & _cnts) : 557 matches(&_matches), 558 options(&_options), 559 filterPattern(&_filterPattern), 560 cnts(&_cnts) 561 { 562 genomeLength = 0; 563 rightClip = 0; 564 sinkPos = std::numeric_limits<TContigPos>::max() >> 1; 565 onReverseComplement = false; 566 oneMatchPerBucket = false; 567 compactionTime = 0; 568 } 569 570 inline void push() 571 { 572 if (onReverseComplement) 573 { 574 // transform coordinates to the forward strand 575 m.beginPos = genomeLength - m.beginPos; 576 m.endPos = genomeLength - m.endPos; 577 std::swap(m.beginPos, m.endPos); 578 m.orientation = 'R'; 579 } 580 else 581 { 582 m.orientation = 'F'; 583 } 584 585 //SEQAN_OMP_PRAGMA(critical) 586 // begin of critical section 587 { 588 if (!options->spec.DONT_DUMP_RESULTS) 589 { 590 // std::cout << "begin: "<<m.beginPos <<"\tendPos: "<<m.endPos << "\terrors: "<<m.score <<std::endl; 591 appendValue(*matches, m, Generous()); 592 593 if ((int64_t)length(*matches) > options->compactThresh) 594 { 595 double beginTime = sysTime(); 596 typename Size<TMatches>::Type oldSize = length(*matches); 597 598 if (IsSameType<typename TRazerSMode::TGapMode, RazerSGapped>::VALUE || options->threshold == 0) 599 maskDuplicates(*matches, *options, TRazerSMode()); // overlapping parallelograms cause duplicates 600 // SEQAN_ASSERT_MSG((back(*matches).endPos - back(*matches).beginPos == 100), "len == %d", int(m.endPos - m.beginPos)); 601 602 compactMatches(*matches, *cnts, *options, TRazerSMode(), *filterPattern, COMPACT); 603 // SEQAN_ASSERT_MSG((back(*matches).endPos - back(*matches).beginPos == 100), "len == %d", int(m.endPos - m.beginPos)); 604 605 if (length(*matches) * 4 > oldSize) // the threshold should not be raised 606 { // fprintf(stderr, "[raising threshold]"); 607 // options->compactThresh += (options->compactThresh >> 1); // if too many matches were removed 608 options->compactThresh = (int64_t)(options->compactThresh * options->compactMult); 609 } 610 611 // if (options._debugLevel >= 2) 612 // std::cerr << '(' << oldSize - length(store.alignedReadStore) << " matches removed)"; 613 double endTime = sysTime(); 614 compactionTime += (endTime - beginTime); 615 } 616 } 617 ++options->countVerification; 618 } 619 // end of critical section 620 } 621 622 }; 623 624 625 626 ////////////////////////////////////////////////////////////////////////////// 627 // Read a list of genome file names 628 template<typename TSpec> 629 int getGenomeFileNameList(CharString filename, StringSet<CharString> & genomeFileNames, RazerSCoreOptions<TSpec> &options) 630 { 631 std::ifstream file; 632 file.open(toCString(filename), std::ios_base::in | std::ios_base::binary); 633 if (!file.is_open()) 634 return RAZERS_GENOME_FAILED; 635 636 DirectionIterator<std::fstream, Input>::Type reader(file); 637 if (!atEnd(reader)) 638 return 0; 639 640 clear(genomeFileNames); 641 if (*reader == '>' && *reader != '@') //if file does not start with a fasta header --> list of multiple reference genome files 642 { 643 if(options._debugLevel >=1) 644 std::cout << std::endl << "Reading multiple genome files:" << std::endl; 645 646 unsigned i = 1; 647 CharString line; 648 while (!atEnd(reader)) 649 { 650 readLine(line, reader); 651 cropOuter(line, IsWhitespace()); 652 appendValue(genomeFileNames, line); 653 if(options._debugLevel >=2) 654 std::cout <<"Genome file #"<< i <<": " << back(genomeFileNames) << std::endl; 655 ++i; 656 } 657 if(options._debugLevel >=1) 658 std::cout << i-1 << " genome files total." << std::endl; 659 } 660 else //if file starts with a fasta header --> regular one-genome-file input 661 appendValue(genomeFileNames, filename, Exact()); 662 file.close(); 663 return 0; 664 } 665 666 ////////////////////////////////////////////////////////////////////////////// 667 // Load multi-Fasta sequences with or w/o quality values 668 template <typename TFSSpec, typename TFSConfig, typename TRazerSOptions> 669 bool loadReads( 670 FragmentStore<TFSSpec, TFSConfig> & store, 671 SeqFileIn &seqFile, 672 TRazerSOptions & options) 673 { 674 bool countN = !(options.matchN || options.outputFormat == 1); 675 676 String<uint64_t> qualSum; 677 String<Dna5Q> seq; 678 CharString qual; 679 CharString seqId; 680 681 unsigned seqCount = 0; 682 unsigned kickoutcount = 0; 683 684 while (!atEnd(seqFile)) 685 { 686 readRecord(seqId, seq, qual, seqFile); 687 ++seqCount; 688 689 if (options.readNaming == 0 || options.readNaming == 3) 690 { 691 if (!options.fullFastaId) 692 cropAfterFirst(seqId, IsWhitespace()); // read Fasta id up to the first whitespace 693 } 694 else 695 { 696 clear(seqId); 697 } 698 699 if (countN) 700 { 701 int count = 0; 702 int cutoffCount = (int)(options.errorRate * length(seq)); 703 for (unsigned j = 0; j < length(seq); ++j) 704 if (getValue(seq, j) == 'N') 705 if (++count > cutoffCount) 706 { 707 clear(seq); 708 clear(seqId); 709 clear(qual); // So no qualities are assigned below. 710 ++kickoutcount; 711 break; 712 } 713 // low qual. reads are empty to output them and their id later as LQ reads 714 // if (count > cutoffCount) continue; 715 } 716 717 // store dna and quality together 718 assignQualities(seq, qual); 719 if (options.trimLength > 0 && length(seq) > (unsigned)options.trimLength) 720 resize(seq, options.trimLength); 721 722 // append read to fragment store 723 appendRead(store, seq, seqId); 724 725 unsigned len = length(seq); 726 if (length(qualSum) <= len) 727 { 728 resize(qualSum, len, 0u); 729 resize(options.readLengths, len + 1, 0u); 730 } 731 ++options.readLengths[len]; 732 733 for (unsigned i = 0; i < len; ++i) 734 qualSum[i] += getQualityValue(seq[i]); 735 } 736 737 // memory optimization 738 // we store reads in a concat-direct stringset and can shrink its size 739 shrinkToFit(store.readSeqStore.concat); 740 shrinkToFit(store.readSeqStore.limits); 741 742 // compute error probabilities 743 resize(options.avrgQuality, length(qualSum)); 744 unsigned coverage = 0; 745 for (unsigned i = length(qualSum); i != 0; ) 746 { 747 coverage += options.readLengths[i]; 748 --i; 749 options.avrgQuality[i] = (double)qualSum[i] / (double)coverage; 750 } 751 estimateErrorDistributionFromQualities(options); 752 753 typedef Shape<Dna, SimpleShape> TShape; 754 typedef typename SAValue<Index<StringSet<Dna5String>, IndexQGram<TShape, OpenAddressing> > >::Type TSAValue; 755 TSAValue sa(0, 0); 756 sa.i1 = ~sa.i1; 757 sa.i2 = ~sa.i2; 758 759 if ((unsigned)sa.i1 < length(store.readSeqStore) - 1) 760 { 761 std::cerr << "Maximal read number of " << (unsigned)sa.i1 + 1 << " exceeded. Please remove \"#define RAZERS_MEMOPT\" in razers.cpp and recompile." << std::endl; 762 seqCount = 0; 763 } 764 if ((unsigned)sa.i2 < length(options.readLengths) - 2) 765 { 766 std::cerr << "Maximal read length of " << (unsigned)sa.i2 + 1 << " bps exceeded. Please remove \"#define RAZERS_MEMOPT\" in razers.cpp and recompile." << std::endl; 767 seqCount = 0; 768 } 769 770 if (options._debugLevel > 1 && kickoutcount > 0) 771 std::cerr << "Ignoring " << kickoutcount << " low quality reads.\n"; 772 773 if (options._debugLevel > 1) 774 { 775 std::cerr << std::endl; 776 std::cerr << "Average quality profile:" << std::endl; 777 unsigned cols = length(options.avrgQuality); 778 if (cols > 40) 779 cols = 40; 780 781 for (int j = 20; j >= 0; --j) 782 { 783 std::cout.width(3); 784 if (j % 5 == 0) 785 std::cout << 2 * j; 786 else 787 std::cout << ' '; 788 std::cout << " | "; 789 for (unsigned i = 0; i < cols; ++i) 790 { 791 unsigned c = i * (length(options.avrgQuality) - 1) / (cols - 1); 792 double x = options.avrgQuality[c]; 793 std::cout << ((2 * j + 1 <= x) ? '*' : ' '); 794 } 795 std::cout << std::endl; 796 } 797 std::cout << " +-"; 798 for (unsigned i = 0; i < cols; ++i) 799 std::cout << ((i % 5 == 0) ? '+' : '-'); 800 std::cout << std::endl << " "; 801 for (unsigned i = 0; i < cols; ++i) 802 { 803 unsigned c = i * (length(options.avrgQuality) - 1) / (cols - 1); 804 std::cout.width(5); 805 if (i % 5 == 0) 806 std::cout << c + 1; 807 } 808 std::cout << std::endl; 809 } 810 811 return seqCount > 0; 812 } 813 814 ////////////////////////////////////////////////////////////////////////////// 815 // Read the first sequence of a multi-sequence file 816 // and return its length 817 inline int estimateReadLength(SeqFileIn &seqFile) 818 { 819 if (atEnd(seqFile)) 820 return 0; 821 822 typedef String<char, Array<1000> > TBuffer; 823 824 // read chunk into buffer 825 TBuffer buffer; 826 resize(buffer, capacity(buffer)); 827 size_t len = seqFile.stream.readsome(&buffer[0], length(buffer)); 828 for (size_t i = 0; i < len; ++i) 829 seqFile.stream.unget(); 830 resize(buffer, len); 831 832 // parse record from buffer 833 DirectionIterator<TBuffer, Input>::Type iter = directionIterator(buffer, Input()); 834 CharString fastaId, seq; 835 readRecord(fastaId, seq, iter, seqFile.format); 836 return length(seq); 837 } 838 839 // Comparators for RazerS1-style matches. 840 841 // TODO(holtgrew): Slightly different comparators than in previous RazerS 3 version, add back the additional checks? 842 843 template <typename TReadMatch> 844 struct LessBeginPos : 845 public std::binary_function<TReadMatch, TReadMatch, bool> 846 { 847 inline bool operator()(TReadMatch const & a, TReadMatch const & b) const 848 { 849 // genome position and orientation 850 if (a.contigId < b.contigId) return true; 851 if (a.contigId > b.contigId) return false; 852 853 if (a.beginPos < b.beginPos) return true; 854 855 return false; 856 } 857 858 }; 859 860 template <typename TReadMatch> 861 struct LessRNoBeginPos : 862 public std::binary_function<TReadMatch, TReadMatch, bool> 863 { 864 inline bool operator()(TReadMatch const & a, TReadMatch const & b) const 865 { 866 // read number 867 if (a.readId < b.readId) return true; 868 if (a.readId > b.readId) return false; 869 870 // genome position and orientation 871 if (a.contigId < b.contigId) return true; 872 if (a.contigId > b.contigId) return false; 873 874 if (a.beginPos < b.beginPos) return true; 875 if (a.beginPos > b.beginPos) return false; 876 877 if (a.orientation == '-') return false; 878 if (b.orientation == '-') return true; 879 880 if (a.orientation < b.orientation) return true; 881 if (a.orientation > b.orientation) return false; 882 883 // quality 884 if (a.score > b.score) return true; 885 if (b.score > a.score) return false; 886 887 if (a.endPos > b.endPos) return true; 888 889 return false; 890 } 891 892 }; 893 894 template <typename TReadMatch> 895 struct LessRNoBeginPosMP : 896 public std::binary_function<TReadMatch, TReadMatch, bool> 897 { 898 inline bool operator()(TReadMatch const & a, TReadMatch const & b) const 899 { 900 // read number 901 if (a.readId < b.readId) return true; 902 if (a.readId > b.readId) return false; 903 904 // genome position and orientation 905 if (a.contigId < b.contigId) return true; 906 if (a.contigId > b.contigId) return false; 907 908 if (a.beginPos < b.beginPos) return true; 909 if (a.beginPos > b.beginPos) return false; 910 911 if (a.orientation == '-') return false; 912 if (b.orientation == '-') return true; 913 914 if (a.orientation < b.orientation) return true; 915 if (a.orientation > b.orientation) return false; 916 917 // quality 918 if (a.pairScore > b.pairScore) return true; 919 if (a.pairScore < b.pairScore) return false; 920 921 if (a.libDiff < b.libDiff) return true; 922 if (a.libDiff > b.libDiff) return false; 923 924 if (a.endPos > b.endPos) return true; 925 926 return false; 927 } 928 929 }; 930 931 // ... to sort matches and remove duplicates with equal gEnd 932 template <typename TReadMatch> 933 struct LessRNoEndPos : 934 public std::binary_function<TReadMatch, TReadMatch, bool> 935 { 936 inline bool operator()(TReadMatch const & a, TReadMatch const & b) const 937 { 938 // read number 939 if (a.readId < b.readId) return true; 940 if (a.readId > b.readId) return false; 941 942 // genome position and orientation 943 if (a.contigId < b.contigId) return true; 944 if (a.contigId > b.contigId) return false; 945 946 if (a.endPos < b.endPos) return true; 947 if (a.endPos > b.endPos) return false; 948 949 if (a.orientation == '-') return false; 950 if (b.orientation == '-') return true; 951 952 if (a.orientation < b.orientation) return true; 953 if (a.orientation > b.orientation) return false; 954 955 // quality 956 if (a.score > b.score) return true; 957 if (b.score > a.score) return false; 958 959 if (a.beginPos < b.beginPos) return true; 960 961 return false; 962 } 963 964 }; 965 966 template <typename TReadMatch> 967 struct LessRNoEndPosMP : 968 public std::binary_function<TReadMatch, TReadMatch, bool> 969 { 970 int libSize; 971 LessRNoEndPosMP(int _libSize) : 972 libSize(_libSize) {} 973 974 inline bool operator()(TReadMatch const & a, TReadMatch const & b) const 975 { 976 // read number 977 if (a.readId < b.readId) return true; 978 if (a.readId > b.readId) return false; 979 980 // genome position and orientation 981 if (a.contigId < b.contigId) return true; 982 if (a.contigId > b.contigId) return false; 983 984 if (a.endPos < b.endPos) return true; 985 if (a.endPos > b.endPos) return false; 986 987 if (a.orientation == '-') return false; 988 if (b.orientation == '-') return true; 989 990 if (a.orientation < b.orientation) return true; 991 if (a.orientation > b.orientation) return false; 992 993 // quality 994 if (a.pairScore > b.pairScore) return true; 995 if (a.pairScore < b.pairScore) return false; 996 997 if (a.libDiff < b.libDiff) return true; 998 if (a.libDiff > b.libDiff) return false; 999 1000 if (a.beginPos < b.beginPos) return true; 1001 1002 return false; 1003 } 1004 1005 }; 1006 1007 template <typename TReadMatch> 1008 struct LessScoreBackport : 1009 public std::binary_function<TReadMatch, TReadMatch, bool> 1010 { 1011 inline bool operator()(TReadMatch const & a, TReadMatch const & b) const 1012 { 1013 // read number 1014 if (a.readId < b.readId) return true; 1015 if (a.readId > b.readId) return false; 1016 1017 // quality 1018 if (a.orientation == '-') return false; 1019 if (b.orientation == '-') return true; 1020 1021 if (a.score > b.score) return true; 1022 if (b.score > a.score) return false; 1023 1024 // Sort by leftmost begin pos, longest end pos on ties. 1025 if (a.contigId < b.contigId) return true; 1026 if (a.contigId > b.contigId) return false; 1027 1028 if (a.orientation < b.orientation) return true; 1029 if (a.orientation > b.orientation) return false; 1030 1031 if (a.beginPos < b.beginPos) return true; 1032 if (a.beginPos > b.beginPos) return false; 1033 1034 if (a.endPos < b.endPos) return false; 1035 if (a.endPos > b.endPos) return true; 1036 1037 return false; 1038 } 1039 1040 }; 1041 1042 // TODO(holtgrew): Merge with above. 1043 1044 template <typename TReadMatch> 1045 struct LessScoreBackport3Way : 1046 public std::binary_function<TReadMatch, TReadMatch, int> 1047 { 1048 inline int operator()(TReadMatch const & a, TReadMatch const & b) const 1049 { 1050 // read number 1051 if (a.readId < b.readId) return -1; 1052 if (a.readId > b.readId) return 1; 1053 1054 // quality 1055 if (a.orientation != '-' || b.orientation != '-') 1056 { 1057 if (a.orientation == '-') return -1; 1058 if (b.orientation == '-') return 1; 1059 } 1060 1061 if (a.score > b.score) return -1; 1062 if (b.score > a.score) return 1; 1063 1064 // Sort by leftmost begin pos, longest end pos on ties. 1065 if (a.contigId < b.contigId) return -1; 1066 if (a.contigId > b.contigId) return 1; 1067 1068 if (a.orientation < b.orientation) return -1; 1069 if (a.orientation > b.orientation) return 1; 1070 1071 if (a.beginPos < b.beginPos) return -1; 1072 if (a.beginPos > b.beginPos) return 1; 1073 1074 if (a.endPos < b.endPos) return 1; 1075 if (a.endPos > b.endPos) return -1; 1076 1077 return 0; 1078 } 1079 1080 }; 1081 1082 1083 // Comparators for Fragment Store 1084 1085 template <typename TAlignedReadStore, typename TLessScore> 1086 struct LessRNoGPos : 1087 public std::binary_function<typename Value<TAlignedReadStore>::Type, typename Value<TAlignedReadStore>::Type, bool> 1088 { 1089 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 1090 TLessScore lessScore; 1091 1092 LessRNoGPos(TLessScore const & _lessScore) : 1093 lessScore(_lessScore) {} 1094 1095 inline bool operator()(TAlignedRead const & a, TAlignedRead const & b) const 1096 { 1097 // read number 1098 if (a.readId < b.readId) return true; 1099 if (a.readId > b.readId) return false; 1100 1101 // contig number 1102 if (a.contigId < b.contigId) return true; 1103 if (a.contigId > b.contigId) return false; 1104 1105 // beginning position 1106 typename TAlignedRead::TPos ba = _min(a.beginPos, a.endPos); 1107 typename TAlignedRead::TPos bb = _min(b.beginPos, b.endPos); 1108 if (ba < bb) return true; 1109 if (ba > bb) return false; 1110 1111 // orientation 1112 bool oa = a.beginPos < a.endPos; 1113 bool ob = b.beginPos < b.endPos; 1114 if (oa != ob) return oa; 1115 1116 int result = lessScore.compare(a, b); 1117 if (result == 0) 1118 { 1119 // prefer reads that support more of the reference 1120 return _max(a.beginPos, a.endPos) > _max(b.beginPos, b.endPos); 1121 } 1122 return result == -1; 1123 } 1124 1125 }; 1126 1127 // ... to sort matches and remove duplicates with equal gEnd 1128 template <typename TAlignedReadStore, typename TLessScore> 1129 struct LessRNoGEndPos : 1130 public std::binary_function<typename Value<TAlignedReadStore>::Type, typename Value<TAlignedReadStore>::Type, bool> 1131 { 1132 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 1133 TLessScore lessScore; 1134 1135 LessRNoGEndPos(TLessScore const & _lessScore) : 1136 lessScore(_lessScore) {} 1137 1138 inline bool operator()( 1139 typename Value<TAlignedReadStore>::Type const & a, 1140 typename Value<TAlignedReadStore>::Type const & b) const 1141 { 1142 // read number 1143 if (a.readId < b.readId) return true; 1144 if (a.readId > b.readId) return false; 1145 1146 // contig number 1147 if (a.contigId < b.contigId) return true; 1148 if (a.contigId > b.contigId) return false; 1149 1150 // end position 1151 typename TAlignedRead::TPos ea = _max(a.beginPos, a.endPos); 1152 typename TAlignedRead::TPos eb = _max(b.beginPos, b.endPos); 1153 if (ea < eb) return true; 1154 if (ea > eb) return false; 1155 1156 // orientation 1157 bool oa = a.beginPos < a.endPos; 1158 bool ob = b.beginPos < b.endPos; 1159 if (oa != ob) return oa; 1160 1161 int result = lessScore.compare(a, b); 1162 if (result == 0) 1163 { 1164 // prefer reads that support more of the reference 1165 return _min(a.beginPos, a.endPos) < _min(b.beginPos, b.endPos); 1166 } 1167 return result == -1; 1168 } 1169 1170 }; 1171 1172 template <typename TAlignedReadStore, typename TAlignedReadQualityStore, typename TRazerSMode> 1173 struct LessScore : 1174 public std::binary_function<typename Value<TAlignedReadStore>::Type, typename Value<TAlignedReadStore>::Type, bool> 1175 { 1176 TAlignedReadQualityStore & qualStore; 1177 1178 LessScore(TAlignedReadQualityStore & _qualStore) : 1179 qualStore(_qualStore) {} 1180 1181 inline int compare( 1182 typename Value<TAlignedReadStore>::Type const & a, 1183 typename Value<TAlignedReadStore>::Type const & b) const 1184 { 1185 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 1186 1187 // read number 1188 if (a.readId < b.readId) return -1; 1189 if (a.readId > b.readId) return 1; 1190 1191 // quality 1192 if (a.id == TAlignedRead::INVALID_ID) return 1; 1193 if (b.id == TAlignedRead::INVALID_ID) return -1; 1194 1195 typename GetValue<TAlignedReadQualityStore>::Type qa = getValue(qualStore, a.id); 1196 typename GetValue<TAlignedReadQualityStore>::Type qb = getValue(qualStore, b.id); 1197 if (qa.pairScore > qb.pairScore) return -1; 1198 if (qa.pairScore < qb.pairScore) return 1; 1199 1200 if (qa.score > qb.score) return -1; 1201 if (qb.score > qa.score) return 1; 1202 1203 return 0; 1204 } 1205 1206 inline bool operator()( 1207 typename Value<TAlignedReadStore>::Type const & a, 1208 typename Value<TAlignedReadStore>::Type const & b) const 1209 { 1210 return compare(a, b) == -1; 1211 } 1212 1213 }; 1214 1215 // longest prefix mapping 1216 template <typename TAlignedReadStore, typename TAlignedReadQualityStore, typename TGapMode, typename TScoreMode, typename TMatchNPolicy> 1217 struct LessScore<TAlignedReadStore, TAlignedReadQualityStore, RazerSMode<RazerSPrefix, TGapMode, TScoreMode, TMatchNPolicy> >: 1218 public std::binary_function<typename Value<TAlignedReadStore>::Type, typename Value<TAlignedReadStore>::Type, bool> 1219 { 1220 TAlignedReadQualityStore & qualStore; 1221 1222 LessScore(TAlignedReadQualityStore & _qualStore) : 1223 qualStore(_qualStore) {} 1224 1225 inline int compare( 1226 typename Value<TAlignedReadStore>::Type const & a, 1227 typename Value<TAlignedReadStore>::Type const & b) const 1228 { 1229 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 1230 1231 // read number 1232 if (a.readId < b.readId) return -1; 1233 if (a.readId > b.readId) return 1; 1234 1235 // quality 1236 if (a.id == TAlignedRead::INVALID_ID) return 1; 1237 if (b.id == TAlignedRead::INVALID_ID) return -1; 1238 1239 typename GetValue<TAlignedReadQualityStore>::Type qa = getValue(qualStore, a.id); 1240 typename GetValue<TAlignedReadQualityStore>::Type qb = getValue(qualStore, b.id); 1241 if (qa.errors < qb.errors) return -1; 1242 if (qa.errors > qb.errors) return 1; 1243 1244 if (qa.score > qb.score) return -1; 1245 if (qb.score > qa.score) return 1; 1246 1247 return 0; 1248 } 1249 1250 inline bool operator()( 1251 typename Value<TAlignedReadStore>::Type const & a, 1252 typename Value<TAlignedReadStore>::Type const & b) const 1253 { 1254 return compare(a, b) == -1; 1255 } 1256 1257 }; 1258 1259 ////////////////////////////////////////////////////////////////////////////// 1260 1261 template <typename TAlignedReadQualityStore, typename TRazerSMode> 1262 struct BinFunctorDefault 1263 { 1264 TAlignedReadQualityStore & qualStore; 1265 1266 BinFunctorDefault(TAlignedReadQualityStore & _qualStore) : 1267 qualStore(_qualStore) {} 1268 1269 template <typename TAlignedRead> 1270 inline int operator()(TAlignedRead & alignedRead) const 1271 { 1272 return qualStore[alignedRead.id].errors; 1273 } 1274 1275 }; 1276 1277 1278 ////////////////////////////////////////////////////////////////////////////// 1279 // Mark duplicate matches for deletion 1280 template <typename TMatches, typename TIterator, typename TOptions, typename TRazerSMode> 1281 void maskDuplicates(TMatches &, TIterator const itBegin, TIterator const itEnd, TOptions & options, TRazerSMode) 1282 { 1283 typedef typename Value<TMatches>::Type TMatch; 1284 typedef typename TMatch::TContigPos TContigPos; 1285 1286 TContigPos beginPos, endPos; 1287 unsigned contigId, readId; 1288 char orientation; 1289 unsigned masked; 1290 TIterator it; 1291 double beginTime = sysTime(); 1292 1293 ////////////////////////////////////////////////////////////////////////////// 1294 // remove matches with equal ends 1295 1296 // we can skip one sort step in no-gap mode and with pigeonhole filter 1297 if (IsSameType<typename TRazerSMode::TGapMode, RazerSGapped>::VALUE || options.threshold != 0) 1298 { 1299 #ifdef RAZERS_PROFILE 1300 timelineBeginTask(TASK_SORT); 1301 #endif 1302 if (options.libraryLength >= 0) 1303 std::sort(itBegin, itEnd, LessRNoEndPosMP<TMatch>(options.libraryLength)); 1304 else 1305 std::sort(itBegin, itEnd, LessRNoEndPos<TMatch>()); 1306 #ifdef RAZERS_PROFILE 1307 timelineEndTask(TASK_SORT); 1308 #endif 1309 1310 beginPos = -1; 1311 endPos = -1; 1312 contigId = TMatch::INVALID_ID; 1313 readId = TMatch::INVALID_ID; 1314 orientation = '-'; 1315 masked = 0; 1316 it = itBegin; 1317 1318 for (; it != itEnd; ++it) 1319 { 1320 if ((*it).pairMatchId != TMatch::INVALID_ID && (it->readId & 1) != 0) 1321 continue; // remove only single reads or left mates 1322 1323 TContigPos itEndPos = _max((*it).beginPos, (*it).endPos); 1324 if (endPos == itEndPos && orientation == (*it).orientation && 1325 contigId == (*it).contigId && readId == (*it).readId) 1326 { 1327 (*it).orientation = '-'; 1328 masked += 1; 1329 continue; 1330 } 1331 readId = (*it).readId; 1332 contigId = (*it).contigId; 1333 endPos = itEndPos; 1334 orientation = (*it).orientation; 1335 } 1336 } 1337 1338 ////////////////////////////////////////////////////////////////////////////// 1339 // remove matches with equal begins 1340 1341 #ifdef RAZERS_PROFILE 1342 timelineBeginTask(TASK_SORT); 1343 #endif // #ifdef RAZERS_PROFILE 1344 if (options.libraryLength >= 0) 1345 std::sort(itBegin, itEnd, LessRNoBeginPosMP<TMatch>()); 1346 else 1347 std::sort(itBegin, itEnd, LessRNoBeginPos<TMatch>()); 1348 // std::cerr << "(SORTING " << itEnd-itBegin << " MATCHES)"; 1349 // sortAlignedReads(store.alignedReadStore, TLessBeginPos(TLessScore(store.alignQualityStore))); 1350 #ifdef RAZERS_PROFILE 1351 timelineEndTask(TASK_SORT); 1352 #endif // #ifdef RAZERS_PROFILE 1353 1354 beginPos = -1; 1355 endPos = -1; 1356 contigId = TMatch::INVALID_ID; 1357 readId = TMatch::INVALID_ID; 1358 orientation = '-'; 1359 masked = 0; 1360 it = itBegin; 1361 1362 for (; it != itEnd; ++it) 1363 { 1364 if ((*it).orientation == '-') 1365 continue; 1366 if ((*it).pairMatchId != TMatch::INVALID_ID && (it->readId & 1) != 0) 1367 continue; // remove only single reads or left mates 1368 1369 TContigPos itBeginPos = _min((*it).beginPos, (*it).endPos); 1370 if (beginPos == itBeginPos && readId == (*it).readId && 1371 contigId == (*it).contigId && orientation == ((*it).beginPos < (*it).endPos)) 1372 { 1373 (*it).orientation = '-'; 1374 masked += 1; 1375 continue; 1376 } 1377 readId = (*it).readId; 1378 contigId = (*it).contigId; 1379 beginPos = itBeginPos; 1380 orientation = (*it).beginPos < (*it).endPos; 1381 } 1382 1383 #ifdef RAZERS_DEFER_COMPACTION 1384 #ifdef RAZERS_PROFILE 1385 timelineBeginTask(TASK_SORT); 1386 #endif // #ifdef RAZERS_PROFILE 1387 ////////////////////////////////////////////////////////////////////////////// 1388 // sort matches by begin position when using defered compaction 1389 std::sort(itBegin, itEnd, LessBeginPos<TMatch>()); 1390 #ifdef RAZERS_PROFILE 1391 timelineEndTask(TASK_SORT); 1392 #endif // #ifdef RAZERS_PROFILE 1393 #endif // #ifdef RAZERS_DEFER_COMPACTION 1394 1395 options.timeMaskDuplicates += sysTime() - beginTime; 1396 if (options._debugLevel >= 2) 1397 fprintf(stderr, " [%u matches masked]", masked); 1398 } 1399 1400 template <typename TMatches, typename TOptions, typename TRazerSMode> 1401 void maskDuplicates(TMatches & matches, TOptions & options, TRazerSMode const & mode) 1402 { 1403 maskDuplicates(matches, begin(matches, Standard()), end(matches, Standard()), options, mode); 1404 } 1405 1406 /* 1407 ////////////////////////////////////////////////////////////////////////////// 1408 // Count matches for each number of errors 1409 template <typename TFragmentStore, typename TCounts, typename TBinFunctor, typename TRazerSMode> 1410 void countMatches(TFragmentStore &store, TCounts &cnt, TBinFunctor &binF, TRazerSMode) 1411 { 1412 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore; 1413 typedef typename TFragmentStore::TAlignQualityStore TAlignQualityStore; 1414 1415 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 1416 typedef typename Iterator<TAlignedReadStore, Standard>::Type TIterator; 1417 typedef typename Value<TCounts>::Type TRow; 1418 typedef typename Value<TRow>::Type TValue; 1419 1420 sortAlignedReads(store.alignedReadStore, LessScore<TAlignedReadStore, TAlignQualityStore, TRazerSMode>(store.alignQualityStore)); 1421 1422 TIterator it = begin(store.alignedReadStore, Standard()); 1423 TIterator itEnd = end(store.alignedReadStore, Standard()); 1424 1425 unsigned readId = TAlignedRead::INVALID_ID; 1426 int lastBin = -1; 1427 int64_t count = 0; 1428 1429 String<TValue> row, empty; 1430 for (; it != itEnd; ++it) 1431 { 1432 if ((*it).id == TAlignedRead::INVALID_ID) continue; 1433 int bin = binF((*it).id); 1434 1435 if (readId == (*it).readId) 1436 { 1437 if (lastBin == bin) 1438 ++count; 1439 else 1440 { 1441 appendValue(row, TValue(bin, count), Generous()); 1442 lastBin = bin; 1443 count = 1; 1444 } 1445 } 1446 else 1447 { 1448 while (length(cnt) < readId) 1449 appendValue(cnt, empty, Generous()); 1450 appendValue(cnt, row, Generous()); 1451 clear(row); 1452 readId = (*it).readId; 1453 lastBin = bin; 1454 count = 1; 1455 } 1456 } 1457 while (length(cnt) < readId) 1458 appendValue(cnt, empty, Generous()); 1459 appendValue(cnt, row, Generous()); 1460 } 1461 */ 1462 ////////////////////////////////////////////////////////////////////////////// 1463 // Count matches for each number of errors 1464 template <typename TFragmentStore, typename TCounts, typename TRazerSMode> 1465 void countMatches(TFragmentStore & store, TCounts & cnt, TRazerSMode const &) 1466 { 1467 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore; 1468 typedef typename TFragmentStore::TAlignQualityStore TAlignQualityStore; 1469 1470 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 1471 typedef typename Iterator<TAlignedReadStore, Standard>::Type TIterator; 1472 typedef typename Value<TCounts>::Type TRow; 1473 typedef typename Value<TRow>::Type TValue; 1474 1475 TIterator it = begin(store.alignedReadStore, Standard()); 1476 TIterator itEnd = end(store.alignedReadStore, Standard()); 1477 1478 unsigned readId = TAlignedRead::INVALID_ID; 1479 short errors = -1; 1480 int64_t count = 0; 1481 int64_t maxVal = std::numeric_limits<TValue>::max(); 1482 1483 #ifdef RAZERS_PROFILE 1484 timelineBeginTask(TASK_SORT); 1485 #endif // #ifdef RAZERS_PROFILE 1486 std::sort(begin(store.alignedReadStore, Standard()), end(store.alignedReadStore, Standard()), LessScore<TAlignedReadStore, TAlignQualityStore, TRazerSMode>(store.alignQualityStore)); 1487 //sortAlignedReads(store.alignedReadStore, LessScore<TAlignedReadStore, TAlignQualityStore, TRazerSMode>(store.alignQualityStore)); 1488 #ifdef RAZERS_PROFILE 1489 timelineEndTask(TASK_SORT); 1490 #endif // #ifdef RAZERS_PROFILE 1491 1492 for (; it != itEnd; ++it) 1493 { 1494 if ((*it).id == TAlignedRead::INVALID_ID) 1495 continue; 1496 if (readId == (*it).readId && errors == store.alignQualityStore[(*it).id].errors) 1497 ++count; 1498 else 1499 { 1500 if (readId != TAlignedRead::INVALID_ID && (unsigned)errors < length(cnt)) 1501 cnt[errors][readId] = (maxVal < count) ? (TValue)maxVal : (TValue)count; 1502 readId = (*it).readId; 1503 errors = store.alignQualityStore[(*it).id].errors; 1504 count = 1; 1505 } 1506 } 1507 if (readId != TAlignedRead::INVALID_ID && (unsigned)errors < length(cnt)) 1508 cnt[errors][readId] = (TValue)count; 1509 } 1510 1511 ////////////////////////////////////////////////////////////////////////////// 1512 1513 template <typename TFilterPattern, typename TReadNo, typename TMaxErrors> 1514 inline void 1515 setMaxErrors(TFilterPattern &, TReadNo, TMaxErrors) 1516 {} 1517 1518 template <typename TIndex, typename TPigeonholeSpec, typename TReadNo, typename TMaxErrors> 1519 inline void 1520 setMaxErrors(Pattern<TIndex, Pigeonhole<TPigeonholeSpec> > & filterPattern, TReadNo readNo, TMaxErrors maxErrors) 1521 { 1522 if (maxErrors < 0) 1523 maskPatternSequence(filterPattern, readNo, maxErrors >= 0); 1524 } 1525 1526 template <typename TIndex, typename TSwiftSpec, typename TReadNo, typename TMaxErrors> 1527 inline void 1528 setMaxErrors(Pattern<TIndex, Swift<TSwiftSpec> > & filterPattern, TReadNo readNo, TMaxErrors maxErrors) 1529 { 1530 // if (readNo==643) 1531 // std::cout<<"dman"<<std::endl; 1532 int minT = _qgramLemma(filterPattern, readNo, maxErrors); 1533 if (minT > 1) 1534 { 1535 // std::cout<<" read:"<<readNo<<" newThresh:"<<minT; 1536 if (maxErrors < 0) 1537 minT = std::numeric_limits<int>::max(); 1538 setMinThreshold(filterPattern, readNo, (unsigned)minT); 1539 } 1540 } 1541 1542 ////////////////////////////////////////////////////////////////////////////// 1543 // Remove low quality matches 1544 template < 1545 typename TMatches, 1546 typename TCounts, 1547 typename TSpec, 1548 typename TAlignMode, 1549 typename TGapMode, 1550 typename TScoreMode, 1551 typename TSwift, 1552 typename TMatchNPolicy 1553 > 1554 void compactMatches( 1555 TMatches & matches, 1556 TCounts &, 1557 RazerSCoreOptions<TSpec> & options, 1558 RazerSMode<TAlignMode, TGapMode, TScoreMode, TMatchNPolicy> const &, 1559 TSwift & swift, 1560 CompactMatchesMode compactMode) 1561 { 1562 // fprintf(stderr, "[compact]"); 1563 double beginTime = sysTime(); 1564 typedef typename Value<TMatches>::Type TMatch; 1565 typedef typename Iterator<TMatches, Standard>::Type TIterator; 1566 //typedef RazerSMode<TAlignMode, TGapMode, TScoreMode, TMatchNPolicy> TRazerSMode; 1567 1568 unsigned readNo = -1; 1569 unsigned hitCount = 0; 1570 unsigned hitCountCutOff = options.maxHits; 1571 int scoreCutOff = std::numeric_limits<int>::min(); 1572 int scoreRangeBest = (IsSameType<TAlignMode, RazerSGlobal>::VALUE && !IsSameType<TScoreMode, RazerSScore>::VALUE) ? -(int)options.scoreDistanceRange : std::numeric_limits<int>::max(); 1573 ignoreUnusedVariableWarning(scoreRangeBest); 1574 ignoreUnusedVariableWarning(compactMode); 1575 1576 #ifdef RAZERS_PROFILE 1577 timelineBeginTask(TASK_SORT); 1578 #endif // #ifdef RAZERS_PROFILE 1579 #ifdef RAZERS_EXTERNAL_MATCHES 1580 if (compactMode == COMPACT_FINAL_EXTERNAL) 1581 { 1582 typedef Pipe<TMatches, Source<> > TSource; 1583 typedef LessScoreBackport3Way<TMatch> TLess; 1584 typedef Pool<TMatch, SorterSpec<SorterConfigSize<TLess, typename Size<TSource>::Type> > > TSorterPool; 1585 1586 TSource source(matches); 1587 TSorterPool sorter; 1588 sorter << source; 1589 matches << sorter; 1590 1591 for (unsigned i = 1; i < length(matches); ++i) 1592 SEQAN_ASSERT_LEQ(TLess() (matches[i - 1], matches[i]), 0); 1593 } 1594 else 1595 { 1596 #endif // #ifdef RAZERS_EXTERNAL_MATCHES 1597 std::sort(begin(matches, Standard()), end(matches, Standard()), LessScoreBackport<TMatch>()); 1598 // sortAlignedReads(store.alignedReadStore, LessScore<TAlignedReadStore, TAlignQualityStore, TRazerSMode>(store.alignQualityStore)); 1599 #ifdef RAZERS_EXTERNAL_MATCHES 1600 } 1601 1602 #endif // #ifdef RAZERS_EXTERNAL_MATCHES 1603 #ifdef RAZERS_PROFILE 1604 timelineEndTask(TASK_SORT); 1605 #endif // #ifdef RAZERS_PROFILE 1606 1607 TIterator it = begin(matches, Standard()); 1608 TIterator itEnd = end(matches, Standard()); 1609 TIterator dit = it; 1610 TIterator ditBeg = it; 1611 // fprintf(stderr, "[%u matches to compact]", unsigned(itEnd - it)); 1612 unsigned disabled = 0; 1613 1614 for (; it != itEnd; ++it) 1615 { 1616 if ((*it).orientation == '-') 1617 continue; 1618 int score = (*it).score; 1619 int errors = -(*it).score; 1620 ignoreUnusedVariableWarning(errors); 1621 1622 //if (readNo == 643) std::cerr <<"["<<score<<","<<errors<<"] "<<std::flush; 1623 if (readNo == (*it).readId && (*it).pairMatchId == TMatch::INVALID_ID) // Only compact unpaired matches. 1624 { 1625 if (score <= scoreCutOff) 1626 { 1627 // std::cout<<"decreased errCutOff["<<readNo<<"] from "<<(unsigned)options.errorCutOff[readNo]; 1628 options.errorCutOff[readNo] = -scoreCutOff; 1629 // std::cout<<"to "<<(unsigned)options.errorCutOff[readNo]<<std::endl; 1630 setMaxErrors(swift, readNo, -scoreCutOff - 1); 1631 while (it != itEnd && readNo == (*it).readId) 1632 ++it; 1633 --it; 1634 continue; 1635 } 1636 if (++hitCount >= hitCountCutOff) 1637 { 1638 #ifdef RAZERS_MASK_READS 1639 if (hitCount == hitCountCutOff) 1640 { 1641 // we have enough, now look for better matches 1642 if (options.purgeAmbiguous && (options.scoreDistanceRange == 0 || score > scoreRangeBest)) 1643 { 1644 // std::cerr << "PURGED " << readNo << std::endl; 1645 // std::cout<<"decreased errCutOff["<<readNo<<"] from "<<(unsigned)options.errorCutOff[readNo]; 1646 options.errorCutOff[readNo] = 0; 1647 // std::cout<<"to "<<(unsigned)options.errorCutOff[readNo]<<std::endl; 1648 setMaxErrors(swift, readNo, -1); 1649 ++disabled; 1650 // if (options._debugLevel >= 2) 1651 // std::cerr << "(read #" << readNo << " disabled)"; 1652 } 1653 else 1654 // we only need better matches 1655 if (IsSameType<TScoreMode, RazerSErrors>::VALUE) 1656 { 1657 // std::cerr << "LIMITED " << readNo << std::endl; 1658 // std::cout<<"decreased errCutOff["<<readNo<<"] from "<<(unsigned)options.errorCutOff[readNo]; 1659 options.errorCutOff[readNo] = errors; 1660 // std::cout<<"to "<<(unsigned)options.errorCutOff[readNo]<<std::endl; 1661 setMaxErrors(swift, readNo, errors - 1); 1662 if (errors == 0) 1663 ++disabled; 1664 // if (errors == 0 && options._debugLevel >= 2) 1665 // std::cerr << "(read #" << readNo << " disabled)"; 1666 } 1667 1668 if (options.purgeAmbiguous && (compactMode == COMPACT_FINAL || compactMode == COMPACT_FINAL_EXTERNAL)) 1669 { 1670 if (options.scoreDistanceRange == 0 || score > scoreRangeBest || compactMode == COMPACT_FINAL || compactMode == COMPACT_FINAL_EXTERNAL) 1671 { 1672 dit = ditBeg; 1673 } 1674 else 1675 { 1676 *dit = *it; 1677 ++dit; 1678 } 1679 } 1680 } 1681 #endif 1682 continue; 1683 } 1684 } 1685 else 1686 { 1687 readNo = (*it).readId; 1688 hitCount = 0; 1689 if (options.scoreDistanceRange > 0) 1690 scoreCutOff = score - options.scoreDistanceRange; 1691 ditBeg = dit; 1692 } 1693 *dit = *it; 1694 ++dit; 1695 } 1696 unsigned origSize = length(matches); 1697 resize(matches, dit - begin(matches, Standard())); 1698 // compactAlignedReads(store); 1699 options.timeCompactMatches += sysTime() - beginTime; 1700 // fprintf(stderr, "[compacted in %f s]", endTime - beginTime); 1701 unsigned newSize = length(matches); 1702 if (options._debugLevel >= 2) 1703 { 1704 fprintf(stderr, " [%u matches removed]", unsigned(origSize - newSize)); 1705 if (disabled > 0) 1706 fprintf(stderr, " [%u reads disabled]", disabled); 1707 } 1708 } 1709 1710 ////////////////////////////////////////////////////////////////////////////// 1711 // Remove low quality matches 1712 template < 1713 typename TMatches, 1714 typename TCounts, 1715 typename TSpec, 1716 typename TGapMode, 1717 typename TSwift, 1718 typename TMatchNPolicy 1719 > 1720 void compactMatches( 1721 TMatches & matches, 1722 TCounts & cnts, 1723 RazerSCoreOptions<TSpec> &, 1724 RazerSMode<RazerSGlobal, TGapMode, RazerSQuality<RazerSMAQ>, TMatchNPolicy> const &, 1725 TSwift &, 1726 CompactMatchesMode compactMode) 1727 { 1728 typedef typename Value<TMatches>::Type TMatch; 1729 typedef typename Iterator<TMatches, Standard>::Type TIterator; 1730 //typedef RazerSMode<RazerSGlobal, TGapMode, RazerSQuality<RazerSMAQ>, TMatchNPolicy> TRazerSMode; 1731 1732 unsigned readNo = -1; 1733 1734 #ifdef RAZERS_PROFILE 1735 timelineBeginTask(TASK_SORT); 1736 #endif // #ifdef RAZERS_PROFILE 1737 std::sort( 1738 begin(matches, Standard()), 1739 end(matches, Standard()), 1740 LessScoreBackport<TMatch>()); 1741 // sortAlignedReads(store.alignedReadStore, LessScore<TAlignedReadStore, TAlignQualityStore, TRazerSMode>(store.alignQualityStore)); 1742 #ifdef RAZERS_PROFILE 1743 timelineEndTask(TASK_SORT); 1744 #endif // #ifdef RAZERS_PROFILE 1745 1746 TIterator it = begin(matches, Standard()); 1747 TIterator itEnd = end(matches, Standard()); 1748 TIterator dit = it; 1749 1750 //number of errors may not exceed 31! 1751 bool second = true; 1752 for (; it != itEnd; ++it) 1753 { 1754 if ((*it).orientation == '-') 1755 continue; 1756 if (readNo == (*it).readId) 1757 { 1758 //second best match 1759 if (second) 1760 { 1761 second = false; 1762 if ((cnts[1][(*it).readId] & 31) > (*it).editDist) 1763 { 1764 //this second best match is better than any second best match before 1765 cnts[1][(*it).readId] = (*it).editDist; // second best dist is this editDist 1766 // count is 0 (will be updated if countFirstTwo) 1767 } 1768 if (compactMode == COMPACT_FINAL) 1769 if ((cnts[1][(*it).readId] >> 5) != 2047) 1770 cnts[1][(*it).readId] += 32; 1771 } 1772 else 1773 { 1774 if ((*it).editDist <= (cnts[0][(*it).readId] & 31)) 1775 if (cnts[0][(*it).readId] >> 5 != 2047) 1776 cnts[0][(*it).readId] += 32; 1777 if ((*it).editDist <= (cnts[1][(*it).readId] & 31)) 1778 if ((cnts[1][(*it).readId] >> 5) != 2047) 1779 cnts[1][(*it).readId] += 32; 1780 continue; 1781 } 1782 } 1783 else //best match 1784 { 1785 second = true; 1786 readNo = (*it).readId; 1787 //cnts has 16bits, 11:5 for count:dist 1788 if ((cnts[0][(*it).readId] & 31) > (*it).editDist) 1789 { 1790 //this match is better than any match before 1791 cnts[1][(*it).readId] = cnts[0][(*it).readId]; // best before is now second best 1792 // (count will be updated when match is removed) 1793 cnts[0][(*it).readId] = (*it).editDist; // best dist is this editDist 1794 // count is 0 (will be updated if countFirstTwo) 1795 } 1796 if (compactMode == COMPACT_FINAL) 1797 if ((cnts[0][(*it).readId] >> 5) != 2047) 1798 cnts[0][(*it).readId] += 32; 1799 // shift 5 to the right, add 1, shift 5 to the left, and keep count 1800 } 1801 *dit = *it; 1802 ++dit; 1803 } 1804 1805 resize(matches, dit - begin(matches, Standard())); 1806 } 1807 1808 /* // fallback 1809 template < 1810 typename TFragmentStore, 1811 typename TCounts, 1812 typename TSpec, 1813 typename TAlignMode, 1814 typename TGapMode, 1815 typename TScoreMode, 1816 typename TSwift 1817 > 1818 void compactMatches( 1819 TFragmentStore &, 1820 TCounts &, 1821 RazerSCoreOptions<TSpec> &, 1822 RazerSMode<TAlignMode, TGapMode, TScoreMode> const, 1823 TSwift &, 1824 CompactMatchesMode) 1825 { 1826 } 1827 */ 1828 1829 ////////////////////////////////////////////////////////////////////////////// 1830 // Best Hamming prefix verification 1831 template < 1832 typename TMatchVerifier, 1833 typename TGenome, 1834 typename TRead, 1835 typename TMatchNPolicy> 1836 inline bool 1837 matchVerify( 1838 TMatchVerifier & verifier, 1839 Segment<TGenome, InfixSegment> inf, // potential match genome region 1840 unsigned readId, // read number 1841 TRead const & read, // read 1842 RazerSMode<RazerSPrefix, RazerSUngapped, RazerSErrors, TMatchNPolicy> const &) // Hamming only 1843 { 1844 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1845 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator; 1846 typedef typename Iterator<TRead const, Standard>::Type TReadIterator; 1847 1848 // unsigned maxErrors = (unsigned)(verifier.options->prefixSeedLength * verifier.options->errorRate); 1849 unsigned maxErrors = verifier.options->errorCutOff[readId]; 1850 if (maxErrors == 0) 1851 return false; 1852 1853 --maxErrors; 1854 1855 // verify 1856 TReadIterator ritBeg = begin(read, Standard()); 1857 TReadIterator ritEnd = end(read, Standard()); 1858 unsigned ndlLength = ritEnd - ritBeg; 1859 1860 if (length(inf) < ndlLength) 1861 return false; 1862 1863 TGenomeIterator git = begin(inf, Standard()); 1864 TGenomeIterator gitEnd = end(inf, Standard()) - (ndlLength - 1); 1865 1866 unsigned errorThresh = (verifier.oneMatchPerBucket) ? std::numeric_limits<unsigned>::max() : maxErrors; 1867 unsigned minErrors = maxErrors + 2; 1868 int bestHitLength = 0; 1869 1870 for (; git < gitEnd; ++git) 1871 { 1872 unsigned errors = 0; 1873 TReadIterator r = ritBeg; 1874 TGenomeIterator g = git; 1875 for (; r != ritEnd; ++r, ++g) 1876 if ((verifier.options->compMask[ordValue(*g)] & verifier.options->compMask[ordValue(*r)]) == 0) 1877 { 1878 if (r - ritBeg < (int)verifier.options->prefixSeedLength) // seed 1879 { 1880 if (++errors > maxErrors) // doesn't work for islands with errorThresh > maxErrors 1881 break; 1882 } 1883 else 1884 break; 1885 } 1886 1887 if (errors < minErrors) 1888 { 1889 minErrors = errors; 1890 bestHitLength = r - ritBeg; 1891 verifier.m.beginPos = git - begin(host(inf), Standard()); 1892 } 1893 else if (errors == minErrors && bestHitLength < r - ritBeg) 1894 { 1895 bestHitLength = r - ritBeg; 1896 verifier.m.beginPos = git - begin(host(inf), Standard()); 1897 } 1898 else if (errorThresh < errors) 1899 { 1900 if (minErrors <= maxErrors) 1901 { 1902 verifier.m.endPos = verifier.m.beginPos + bestHitLength; 1903 verifier.q.pairScore = verifier.q.score = bestHitLength; 1904 verifier.q.errors = minErrors; 1905 1906 if (maxErrors > minErrors + verifier.options->dRange) 1907 maxErrors = minErrors + verifier.options->dRange; 1908 minErrors = maxErrors + 2; 1909 1910 verifier.push(); 1911 } 1912 } 1913 } 1914 1915 if (minErrors <= maxErrors) 1916 { 1917 verifier.m.endPos = verifier.m.beginPos + bestHitLength; 1918 verifier.q.pairScore = verifier.q.score = bestHitLength; 1919 verifier.q.errors = minErrors; 1920 if (!verifier.oneMatchPerBucket) 1921 { 1922 if (maxErrors > minErrors + verifier.options->dRange) 1923 maxErrors = minErrors + verifier.options->dRange; 1924 verifier.push(); 1925 // update maximal errors per read 1926 unsigned cutOff = maxErrors + 1; 1927 if ((unsigned)verifier.options->errorCutOff[readId] > cutOff) 1928 verifier.options->errorCutOff[readId] = cutOff; 1929 } 1930 return true; 1931 } 1932 return false; 1933 } 1934 1935 template <typename TRazerSMode> 1936 struct UseQualityValues__ 1937 { 1938 enum 1939 { 1940 VALUE = false 1941 }; 1942 }; 1943 template <typename TAlignMode, typename TGapMode, typename TSpec, typename TMatchNPolicy> 1944 struct UseQualityValues__<RazerSMode<TAlignMode, TGapMode, RazerSQuality<TSpec>, TMatchNPolicy> > 1945 { 1946 enum 1947 { 1948 VALUE = true 1949 }; 1950 }; 1951 1952 ////////////////////////////////////////////////////////////////////////////// 1953 // Hamming verification 1954 template < 1955 typename TMatchVerifier, 1956 typename TGenome, 1957 typename TRead, 1958 typename TScoreMode, 1959 typename TMatchNPolicy> 1960 inline bool 1961 matchVerify( 1962 TMatchVerifier & verifier, 1963 Segment<TGenome, InfixSegment> inf, // potential match genome region 1964 unsigned readId, // read number 1965 TRead const & read, // reads 1966 RazerSMode<RazerSGlobal, RazerSUngapped, TScoreMode, TMatchNPolicy> const &) // Semi-global, no gaps 1967 { 1968 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1969 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator; 1970 typedef typename Iterator<TRead const, Standard>::Type TReadIterator; 1971 typedef RazerSMode<RazerSGlobal, RazerSUngapped, TScoreMode, TMatchNPolicy> TRazerSMode; 1972 1973 #ifdef RAZERS_DEBUG 1974 std::cout << "Verify: " << std::endl; 1975 std::cout << "Genome: " << inf << "\t" << beginPosition(inf) << "," << endPosition(inf) << std::endl; 1976 std::cout << "Read: " << read << std::endl; 1977 #endif 1978 1979 int mismatchDelta, scoreInit; 1980 int minScore; 1981 if (IsSameType<TScoreMode, RazerSErrors>::VALUE) 1982 { 1983 minScore = verifier.options->errorCutOff[readId]; 1984 if (minScore == 0) 1985 return false; 1986 1987 minScore = -minScore + 1; 1988 } 1989 else if (UseQualityValues__<TRazerSMode>::VALUE) 1990 minScore = -verifier.options->absMaxQualSumErrors; 1991 else if (IsSameType<TScoreMode, RazerSScore>::VALUE) 1992 { 1993 minScore = verifier.options->minScore; 1994 mismatchDelta = scoreMatch(verifier.options->scoringScheme) - scoreMismatch(verifier.options->scoringScheme); 1995 scoreInit = scoreMatch(verifier.options->scoringScheme) * length(read); 1996 } 1997 1998 // verify 1999 TReadIterator ritBeg = begin(read, Standard()); 2000 TReadIterator ritEnd = end(read, Standard()); 2001 unsigned ndlLength = ritEnd - ritBeg; 2002 2003 if (length(inf) < ndlLength) 2004 return false; 2005 2006 TGenomeIterator git = begin(inf, Standard()); 2007 TGenomeIterator gitEnd = end(inf, Standard()) - (ndlLength - 1); 2008 2009 int maxScore = minScore - 1; 2010 int scoreThresh = (verifier.oneMatchPerBucket) ? std::numeric_limits<int>::max() : minScore; 2011 int score, errors; 2012 2013 for (; git < gitEnd; ++git) 2014 { 2015 if (!IsSameType<TScoreMode, RazerSScore>::VALUE) 2016 score = 0; 2017 else 2018 score = scoreInit; 2019 2020 if (!IsSameType<TScoreMode, RazerSErrors>::VALUE) 2021 errors = 0; 2022 2023 TGenomeIterator g = git; 2024 for (TReadIterator r = ritBeg; r != ritEnd; ++r, ++g) 2025 if ((verifier.options->compMask[ordValue(*g)] & verifier.options->compMask[ordValue(*r)]) == 0) 2026 { 2027 if (IsSameType<TScoreMode, RazerSErrors>::VALUE) 2028 { 2029 // A. Count mismatches only 2030 --score; 2031 } 2032 else 2033 { 2034 ++errors; 2035 if (UseQualityValues__<TRazerSMode>::VALUE) 2036 // B. Count mismatches and mismatch qualities 2037 score -= getQualityValue(*g); 2038 else if (IsSameType<TScoreMode, RazerSScore>::VALUE) 2039 // C. Count mismatches and alignment score 2040 score -= mismatchDelta; 2041 else 2042 SEQAN_FAIL("Unsupported score mode!"); 2043 } 2044 if (score < minScore) // doesn't work for islands with errorThresh > maxErrors 2045 break; 2046 } 2047 2048 if (score > maxScore) 2049 { 2050 maxScore = score; 2051 if (IsSameType<TScoreMode, RazerSErrors>::VALUE) 2052 verifier.m.score = score; 2053 else 2054 verifier.m.score = errors; 2055 verifier.m.beginPos = git - begin(host(inf), Standard()); 2056 } 2057 #ifdef RAZERS_ISLAND_CRITERION 2058 else if (scoreThresh > score) 2059 { 2060 if (maxScore >= minScore) 2061 { 2062 // for RazerSErrors bestErrors == -maxScore 2063 verifier.m.endPos = verifier.m.beginPos + ndlLength; 2064 verifier.m.pairScore = verifier.m.score = maxScore; 2065 if (!verifier.oneMatchPerBucket) 2066 { 2067 if (minScore < maxScore - verifier.options->dRange) 2068 minScore = maxScore - verifier.options->dRange; 2069 verifier.push(); 2070 } 2071 maxScore = minScore - 1; 2072 } 2073 } 2074 #else 2075 (void)scoreThresh; 2076 #endif 2077 } 2078 2079 if (maxScore >= minScore) 2080 { 2081 verifier.m.endPos = verifier.m.beginPos + ndlLength; 2082 verifier.m.pairScore = verifier.m.score = maxScore; 2083 if (!verifier.oneMatchPerBucket) 2084 { 2085 if (minScore < maxScore - verifier.options->dRange) 2086 minScore = maxScore - verifier.options->dRange; 2087 verifier.push(); 2088 2089 // update maximal errors per read 2090 unsigned cutOff = -minScore + 1; 2091 if ((unsigned)verifier.options->errorCutOff[readId] > cutOff) 2092 verifier.options->errorCutOff[readId] = cutOff; 2093 } 2094 return true; 2095 } 2096 return false; 2097 } 2098 2099 ////////////////////////////////////////////////////////////////////////////// 2100 // Edit distance verification 2101 template < 2102 typename TMatchVerifier, 2103 typename TGenome, 2104 typename TRead, 2105 typename TMatchNPolicy> 2106 inline bool 2107 matchVerify( 2108 TMatchVerifier & verifier, 2109 Segment<TGenome, InfixSegment> inf, // potential match genome region 2110 unsigned readId, // read number 2111 TRead const & read, // reads 2112 RazerSMode<RazerSGlobal, RazerSGapped, RazerSErrors, TMatchNPolicy> const &) // Mismatches and Indels 2113 { 2114 if (empty(inf)) 2115 return false; 2116 2117 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 2118 typedef typename Prefix<TRead const>::Type TReadPrefix SEQAN_UNUSED_TYPEDEF; 2119 typedef typename Position<TGenomeInfix>::Type TPosition; 2120 typedef typename MakeSigned_<TPosition>::Type TDistance; 2121 2122 // find read match end 2123 typedef Finder<TGenomeInfix> TMyersFinder; 2124 typedef typename TMatchVerifier::TOptions::TMyersPattern TMyersPattern SEQAN_UNUSED_TYPEDEF; 2125 typedef typename TMatchVerifier::TPatternState TPatternState; 2126 2127 // find read match begin 2128 // TODO(holtgrew): Use reverse-search here, as well! 2129 typedef ModifiedString<TGenomeInfix, ModReverse> TGenomeInfixRev; 2130 typedef Finder<TGenomeInfixRev> TMyersFinderRev; 2131 2132 #ifdef RAZERS_NOOUTERREADGAPS 2133 typedef ModifiedString<TReadPrefix, ModReverse> TReadRev; 2134 #else 2135 typedef ModifiedString<TRead const, ModReverse> TReadRev; 2136 #endif 2137 typedef Pattern<TReadRev, MyersUkkonenGlobal> TMyersPatternRev; 2138 2139 unsigned ndlLength = length(read); 2140 int maxScore = std::numeric_limits<int>::min(); 2141 int minScore = verifier.options->errorCutOff[readId]; 2142 if (minScore == 0) 2143 return false; 2144 2145 minScore = -minScore + 1; 2146 2147 TDistance minSinkDistance = std::numeric_limits<TDistance>::max(); 2148 TPosition maxPos = 0; 2149 TPosition lastPos = length(inf); 2150 #ifdef RAZERS_ISLAND_CRITERION 2151 unsigned minDistance = (verifier.oneMatchPerBucket) ? lastPos : 1; 2152 #endif 2153 2154 #ifdef RAZERS_NOOUTERREADGAPS 2155 TGenomeInfix origInf(inf); 2156 setEndPosition(inf, endPosition(inf) - 1); 2157 --ndlLength; 2158 TReadPrefix readPrefix = prefix(read, ndlLength); 2159 #else 2160 TRead readPrefix(read); // here only infixes (no sequence) is copied 2161 #endif 2162 2163 TMyersFinder myersFinder(inf); 2164 #ifndef RAZERS_BANDED_MYERS 2165 TMyersPattern & myersPattern = verifier.options->forwardPatterns[readId]; 2166 #endif // #ifdef RAZERS_BANDED_MYERS 2167 TPatternState & state = verifier.patternState; 2168 2169 #ifdef RAZERS_DEBUG 2170 std::cout << "Verify: " << std::endl; 2171 std::cout << "Genome: " << inf << "\t" << beginPosition(inf) << "," << endPosition(inf) << std::endl; 2172 std::cout << "Read: " << read << "(id: " << readId << ")" << std::endl; 2173 #endif 2174 2175 // find end of best semi-global alignment 2176 #ifdef RAZERS_BANDED_MYERS 2177 while (find(myersFinder, readPrefix, state, minScore)) 2178 #else // #ifdef RAZERS_BANDED_MYERS 2179 while (find(myersFinder, myersPattern, state, minScore)) 2180 #endif // #ifdef RAZERS_BANDED_MYERS 2181 { 2182 TPosition const pos = position(hostIterator(myersFinder)); 2183 int score = getScore(state); 2184 2185 #ifdef RAZERS_NOOUTERREADGAPS 2186 // Manually align the last base of the read 2187 // 2188 // In this case myersPattern contains the whole read without the 2189 // last base. We compare the bases and adjust the score. 2190 // We also have to adjust inf and remove the last base of the 2191 // genomic region that has to be verified. 2192 SEQAN_ASSERT_LT(pos + 1, length(origInf)); 2193 if ((verifier.options->compMask[ordValue(origInf[pos + 1])] & verifier.options->compMask[ordValue(back(read))]) == 0) 2194 if (--score < minScore) 2195 continue; 2196 #endif 2197 #ifdef RAZERS_ISLAND_CRITERION 2198 if (lastPos + minDistance < pos) 2199 { 2200 if (minScore <= maxScore) 2201 { 2202 verifier.m.endPos = beginPosition(inf) + maxPos + 1; 2203 verifier.m.pairScore = verifier.m.score = maxScore; 2204 // verifier.m.errors = -maxScore; 2205 2206 if (maxScore == 0) 2207 verifier.m.beginPos = verifier.m.endPos - ndlLength; 2208 else 2209 { 2210 // find beginning of best semi-global alignment 2211 TPosition infBeginPos = beginPosition(inf); 2212 TPosition infEndPos = endPosition(inf); 2213 TPosition newInfEndPos = verifier.m.endPos; 2214 2215 #ifdef RAZERS_BANDED_MYERS 2216 verifier.revPatternState.leftClip = infEndPos - newInfEndPos + verifier.rightClip; 2217 #endif 2218 setEndPosition(inf, newInfEndPos); 2219 2220 // // limit the beginning to needle length plus errors (== -maxScore) 2221 // if (length(inf) > ndlLength - maxScore) 2222 // setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore); 2223 2224 // we eventually have to search before the beginning of our parallelogram 2225 // otherwise alignments of an island in the previous parallelogram 2226 // could be cut and prevent that an island in this parallelgram is found 2227 if (endPosition(inf) > (unsigned)(ndlLength - maxScore)) 2228 setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore); 2229 else 2230 setBeginPosition(inf, 0); 2231 2232 #ifdef RAZERS_NOOUTERREADGAPS 2233 // The best score must be corrected to hold the score of the prefix w/o the last read base 2234 if ((verifier.options->compMask[ordValue(origInf[maxPos + 1])] & verifier.options->compMask[ordValue(back(read))]) == 0) 2235 ++maxScore; 2236 #endif 2237 2238 TReadRev readRev(readPrefix); 2239 TGenomeInfixRev infRev(inf); 2240 TMyersPatternRev myersPatternRev(readRev); 2241 TMyersFinderRev myersFinderRev(infRev); 2242 2243 verifier.m.beginPos = verifier.m.endPos; 2244 #ifdef RAZERS_BANDED_MYERS 2245 while (find(myersFinderRev, readRev, verifier.revPatternState, maxScore)) 2246 { 2247 if (maxScore <= getScore(verifier.revPatternState)) 2248 { 2249 maxScore = getScore(verifier.revPatternState); 2250 verifier.m.beginPos = verifier.m.endPos - (position(myersFinderRev) + 1); 2251 } 2252 } 2253 #else 2254 _patternMatchNOfPattern(myersPatternRev, verifier.options->matchN); 2255 _patternMatchNOfFinder(myersPatternRev, verifier.options->matchN); 2256 while (find(myersFinderRev, myersPatternRev, maxScore)) 2257 { 2258 if (maxScore <= getScore(myersPatternRev)) 2259 { 2260 maxScore = getScore(myersPatternRev); 2261 verifier.m.beginPos = verifier.m.endPos - (position(myersFinderRev) + 1); 2262 } 2263 } 2264 #endif 2265 setBeginPosition(inf, infBeginPos); 2266 setEndPosition(inf, infEndPos); 2267 #ifdef RAZERS_BANDED_MYERS 2268 if (verifier.m.beginPos == verifier.m.endPos) 2269 continue; 2270 #endif 2271 } 2272 // minDistance implicitly forbids to get here with verifier.oneMatchPerBucket == true 2273 SEQAN_ASSERT_NOT(verifier.oneMatchPerBucket); 2274 SEQAN_ASSERT_LT(verifier.m.beginPos, verifier.m.endPos); 2275 2276 #ifdef RAZERS_NOOUTERREADGAPS 2277 // The match end position must be increased by the omitted base. 2278 ++verifier.m.endPos; 2279 #endif 2280 if (minScore < verifier.m.score - verifier.options->dRange) 2281 minScore = verifier.m.score - verifier.options->dRange; 2282 2283 verifier.push(); 2284 maxScore = minScore - 1; 2285 minSinkDistance = std::numeric_limits<TDistance>::max(); 2286 } 2287 } 2288 #endif // #ifdef RAZERS_ISLAND_CRITERION 2289 2290 // minimize distance between sink and estimated match begin 2291 TDistance sinkDistance = verifier.sinkPos - ((TDistance)(beginPosition(inf) + pos) - (TDistance)ndlLength); 2292 // sinkDistance = maxValue(sinkDistance); 2293 if (sinkDistance < (TDistance)0) 2294 sinkDistance = -sinkDistance; 2295 2296 if (score > maxScore || (score == maxScore && sinkDistance <= minSinkDistance)) 2297 { 2298 maxScore = score; 2299 minSinkDistance = sinkDistance; 2300 maxPos = pos; 2301 } 2302 lastPos = pos; 2303 } 2304 2305 if (minScore <= maxScore) 2306 { 2307 verifier.m.endPos = beginPosition(inf) + maxPos + 1; 2308 verifier.m.pairScore = verifier.m.score = maxScore; 2309 // verifier.m.errors = -maxScore; 2310 2311 if (maxScore == 0) 2312 verifier.m.beginPos = verifier.m.endPos - ndlLength; 2313 else 2314 { 2315 // find beginning of best semi-global alignment 2316 TPosition newInfEndPos = verifier.m.endPos; 2317 2318 #ifdef RAZERS_BANDED_MYERS 2319 verifier.revPatternState.leftClip = endPosition(inf) - newInfEndPos + verifier.rightClip; 2320 #endif 2321 setEndPosition(inf, newInfEndPos); 2322 2323 // // limit the beginning to needle length plus errors (== -maxScore) 2324 // if (length(inf) > ndlLength - maxScore) 2325 // setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore); 2326 2327 // we eventually have to search before the beginning of our parallelogram 2328 // otherwise alignments of an island in the previous parallelogram 2329 // could be cut and prevent that an island in this parallelgram is found 2330 if (endPosition(inf) > (unsigned)(ndlLength - maxScore)) 2331 setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore); 2332 else 2333 setBeginPosition(inf, 0); 2334 2335 #ifdef RAZERS_NOOUTERREADGAPS 2336 // The best score must be corrected to hold the score of the prefix w/o the last read base 2337 if ((verifier.options->compMask[ordValue(origInf[maxPos + 1])] & verifier.options->compMask[ordValue(back(read))]) == 0) 2338 ++maxScore; 2339 #endif 2340 2341 TReadRev readRev(readPrefix); 2342 TGenomeInfixRev infRev(inf); 2343 TMyersPatternRev myersPatternRev(readRev); 2344 TMyersFinderRev myersFinderRev(infRev); 2345 2346 verifier.m.beginPos = verifier.m.endPos; 2347 #ifdef RAZERS_BANDED_MYERS 2348 while (find(myersFinderRev, readRev, verifier.revPatternState, maxScore)) 2349 { 2350 if (maxScore <= getScore(verifier.revPatternState)) 2351 { 2352 maxScore = getScore(verifier.revPatternState); 2353 verifier.m.beginPos = verifier.m.endPos - (position(myersFinderRev) + 1); 2354 } 2355 } 2356 #else 2357 _patternMatchNOfPattern(myersPatternRev, verifier.options->matchN); 2358 _patternMatchNOfFinder(myersPatternRev, verifier.options->matchN); 2359 while (find(myersFinderRev, myersPatternRev, maxScore)) 2360 { 2361 if (maxScore <= getScore(myersPatternRev)) 2362 { 2363 maxScore = getScore(myersPatternRev); 2364 verifier.m.beginPos = verifier.m.endPos - (position(myersFinderRev) + 1); 2365 } 2366 } 2367 #endif 2368 #ifdef RAZERS_BANDED_MYERS 2369 if (verifier.m.beginPos == verifier.m.endPos) 2370 { 2371 #ifdef RAZERS_DEBUG 2372 std::cout << "FAILED2" << std::endl; 2373 #endif 2374 return false; 2375 } 2376 #endif // RAZERS_BANDED_MYERS 2377 } 2378 SEQAN_ASSERT_LT(verifier.m.beginPos, verifier.m.endPos); 2379 2380 #ifdef RAZERS_NOOUTERREADGAPS 2381 // The match end position must be increased by the omitted base. 2382 ++verifier.m.endPos; 2383 #endif 2384 2385 if (!verifier.oneMatchPerBucket) 2386 { 2387 if (minScore < verifier.m.score - verifier.options->dRange) 2388 minScore = verifier.m.score - verifier.options->dRange; 2389 verifier.push(); 2390 2391 // update maximal errors per read 2392 unsigned cutOff = -minScore + 1; 2393 if ((unsigned)verifier.options->errorCutOff[readId] > cutOff) 2394 { 2395 // std::cout<<"maxScore="<<verifier.m.score << " minScore=" << minScore<<std::endl; 2396 // std::cout<<"decreased errCutOff["<<readId<<"] from "<<(unsigned)verifier.options->errorCutOff[readId]; 2397 verifier.options->errorCutOff[readId] = cutOff; 2398 // std::cout<<"to "<<(unsigned)verifier.options->errorCutOff[readId]<<std::endl; 2399 } 2400 } 2401 2402 #ifdef RAZERS_DEBUG 2403 std::cout << "OK" << std::endl; 2404 #endif 2405 return true; 2406 } 2407 #ifdef RAZERS_DEBUG 2408 std::cout << "FAILED3" << std::endl; 2409 #endif 2410 return false; 2411 } 2412 2413 template < 2414 typename TMatchVerifier, 2415 typename TGenome, 2416 typename TRead, 2417 typename TAlignMode, 2418 typename TGapMode, 2419 typename TScoreMode, 2420 typename TMatchNPolicy> 2421 inline bool 2422 matchVerify( 2423 TMatchVerifier &, 2424 Segment<TGenome, InfixSegment>, // potential match genome region 2425 unsigned, // read number 2426 TRead const &, // read 2427 RazerSMode<TAlignMode, TGapMode, TScoreMode, TMatchNPolicy> const &) 2428 { 2429 SEQAN_FAIL("Verification not implemented!"); 2430 return false; 2431 } 2432 2433 template <typename TOptions> 2434 inline void 2435 estimateErrorDistributionFromQualities(TOptions & options) 2436 { 2437 typedef typename TOptions::TProb TFloat; 2438 2439 resize(options.errorProb, length(options.avrgQuality)); 2440 // std::cout<< "ERROR PROBS:"<<std::endl; 2441 for (unsigned i = 0; i < length(options.avrgQuality); ++i) 2442 { 2443 // qual = -10 log_10 p 2444 // = -10 log p / log 10 2445 // p = exp^(qual * log 10 / -10) 2446 // log p = qual * log 10 / -10; 2447 double e = options.avrgQuality[i] * log(10.0) / -10.0; 2448 TFloat sequencingError; 2449 sequencingError.data_value = e; 2450 // sequencingError = exp(e); 2451 options.errorProb[i] = (TFloat)1.0 - ((TFloat)1.0 - sequencingError) * ((TFloat)1.0 - options.mutationRate); 2452 // std::cout<<e<<':'<<options.errorProb[i]<<'\t'; 2453 } 2454 // std::cout<<std::endl<<std::endl; 2455 } 2456 2457 ////////////////////////////////////////////////////////////////////////////// 2458 // Customize filter 2459 template <typename TDelta, typename TOptions> 2460 void computeQGramLengths(TDelta & minDelta, TOptions const & options) 2461 { 2462 const unsigned maxLength1 = length(options.readLengths); 2463 // const unsigned maxLength = maxLength1 - 1; 2464 // const unsigned maxErrors = (unsigned) floor(options.errorRate * maxLength); 2465 // const unsigned maxErrors1 = maxErrors + 1; 2466 2467 unsigned seqCount = 0; 2468 String<unsigned> maxDelta; 2469 resize(minDelta, options.maxOverlap + 1, std::numeric_limits<unsigned>::max()); 2470 resize(maxDelta, options.maxOverlap + 1, 3); 2471 2472 // compute delta (==stepSize) for different overlaps 2473 for (unsigned len = 0; len < maxLength1; ++len) 2474 { 2475 if (options.readLengths[len] == 0) 2476 continue; 2477 2478 seqCount += options.readLengths[len]; 2479 for (unsigned ol = 0; ol <= options.maxOverlap; ++ol) 2480 { 2481 // sequence must have sufficient length 2482 if (len <= ol) 2483 continue; 2484 2485 // cut overlap many characters from the end 2486 unsigned errors = (unsigned) floor(options.errorRate * len); 2487 unsigned delta = (len - ol) / (errors + 1); 2488 2489 // ignore too short delta-grams 2490 if (delta < 3) 2491 continue; 2492 if (minDelta[ol] > delta) 2493 minDelta[ol] = delta; 2494 if (maxDelta[ol] < delta) 2495 maxDelta[ol] = delta; 2496 } 2497 } 2498 2499 // std::cout<< "Deltas:"<<std::endl; 2500 // for (unsigned ol = 0; ol < length(minDelta); ++ol) 2501 // { 2502 // if (minDelta[ol] < 3) minDelta[ol] = maxDelta[ol]; 2503 // std::cout<<minDelta[ol]+ol<<'\t'; 2504 // } 2505 // std::cout<<std::endl<<std::endl; 2506 } 2507 2508 template <typename TEstLosses, typename TDelta, typename TOptions> 2509 unsigned estimatePigeonholeLosses(TEstLosses & estLosses, TDelta const & delta, TOptions const & options) 2510 { 2511 typedef typename Value<TEstLosses>::Type TFloat; 2512 2513 const unsigned maxLength1 = length(options.readLengths); 2514 const unsigned maxLength = maxLength1 - 1; 2515 const unsigned maxErrors = (unsigned) floor(options.errorRate * maxLength); 2516 const unsigned maxErrors1 = maxErrors + 1; 2517 2518 // ------------------------------------------------------------------------------------------------ 2519 // compute probs to have 0,1,...,maxErrors errors in the prefix of length 1,...,maxLength 2520 // ------------------------------------------------------------------------------------------------ 2521 String<TFloat> p_prefix; 2522 resize(p_prefix, maxLength * maxErrors1); 2523 p_prefix[0] = (TFloat)1.0 - options.errorProb[0]; // 0 error 2524 p_prefix[1] = options.errorProb[0]; // 1 error 2525 for (unsigned e = 2; e <= maxErrors; ++e) // 2 errors are not possible in a sequence of length 1 2526 p_prefix[e] = 0.0; 2527 for (unsigned i = 1, idx = maxErrors1; i < maxLength; ++i) 2528 { 2529 p_prefix[idx] = p_prefix[idx - maxErrors1] * ((TFloat)1.0 - options.errorProb[i]); 2530 ++idx; 2531 for (unsigned e = 1; e <= maxErrors; ++e, ++idx) 2532 p_prefix[idx] = p_prefix[idx - maxErrors1] * ((TFloat)1.0 - options.errorProb[i]) + p_prefix[idx - maxErrors1 - 1] * options.errorProb[i]; 2533 } 2534 2535 // ------------------------------------------------------------------------------------------------ 2536 // compute expected numbers of reads with 0,1,...,maxErrors errors 2537 // ------------------------------------------------------------------------------------------------ 2538 2539 clear(estLosses); 2540 resize(estLosses, maxErrors1 + 2, 0.0); 2541 double maxLoss = 0.0; 2542 // std::cout << "len:"<<0<<" "<<options.readLengths[0]<<std::endl; 2543 for (unsigned len = 1; len <= maxLength; ++len) 2544 { 2545 if (options.readLengths[len] == 0) 2546 continue; 2547 // std::cout << "len:"<<len<<" "<<options.readLengths[len]<<'\t'<<(TFloat)options.readLengths[len]<<'\t'<<(double)(TFloat)options.readLengths[len]<<'\t'<<std::endl; 2548 2549 unsigned errors = (unsigned) floor(options.errorRate * len); 2550 TFloat sum = 0.0; 2551 for (unsigned e = 0; e <= errors; ++e) 2552 { 2553 sum += p_prefix[(len - 1) * maxErrors1 + e]; 2554 estLosses[2 + e] += p_prefix[(len - 1) * maxErrors1 + e] * (TFloat)options.readLengths[len]; 2555 } 2556 maxLoss += (double)sum * options.readLengths[len]; 2557 } 2558 maxLoss *= options.lossRate; 2559 2560 unsigned overlap = 0; 2561 2562 // ------------------------------------------------------------------------------------------------ 2563 // compute loss for each overlap and select best the shape 2564 // ------------------------------------------------------------------------------------------------ 2565 for (unsigned ol = 0; ol < length(delta); ++ol) 2566 { 2567 unsigned stepSize = delta[ol]; 2568 unsigned q = stepSize + ol; 2569 2570 // if we had the same q-gram before 2571 if (ol > 0 && delta[ol - 1] + ol - 1 == q) 2572 continue; 2573 2574 // don't allow more than two overlapping q-grams 2575 if (2 * ol > stepSize) 2576 break; 2577 2578 // ------------------------------------------------------------------------------------------------ 2579 // compute probs to have 0,1,...,maxErrors errors in a segment 2580 // ------------------------------------------------------------------------------------------------ 2581 2582 String<TFloat> p; 2583 resize(p, maxLength * maxErrors1); 2584 for (unsigned i = 0, idx = 0; i < maxLength; ++i) 2585 { 2586 if (i % stepSize == 0 || (i >= stepSize && i % stepSize == ol)) 2587 { 2588 // initialization 2589 p[idx++] = (TFloat)1.0 - options.errorProb[i]; // 0 error 2590 p[idx++] = options.errorProb[i]; // 1 error 2591 for (unsigned e = 2; e <= maxErrors; ++e, ++idx) // 2 errors are not possible in a sequence of length 1 2592 p[idx] = 0.0; 2593 } 2594 else 2595 { 2596 p[idx] = p[idx - maxErrors1] * ((TFloat)1.0 - options.errorProb[i]); 2597 ++idx; 2598 for (unsigned e = 1; e <= maxErrors; ++e, ++idx) 2599 { 2600 // a sequence of length i with e errors has either: 2601 // - e errors in prefix i-1 and no error at position i 2602 // - e-1 errors in prefix i-1 and an error at position i 2603 p[idx] = p[idx - maxErrors1] * ((TFloat)1.0 - options.errorProb[i]) + p[idx - maxErrors1 - 1] * options.errorProb[i]; 2604 } 2605 } 2606 } 2607 2608 String<TFloat> p_last; 2609 resize(p_last, maxLength * maxErrors1); 2610 for (unsigned i = 0, idx = 0; i < maxLength; ++i) 2611 { 2612 if (i == 0 || (i >= stepSize && i % stepSize == ol)) 2613 { 2614 // initialization 2615 p_last[idx++] = (TFloat)1.0 - options.errorProb[i]; // 0 error 2616 p_last[idx++] = options.errorProb[i]; // 1 error 2617 for (unsigned e = 2; e <= maxErrors; ++e, ++idx) // 2 errors are not possible in a sequence of length 1 2618 p_last[idx] = 0.0; 2619 } 2620 else 2621 { 2622 p_last[idx] = p_last[idx - maxErrors1] * ((TFloat)1.0 - options.errorProb[i]); 2623 ++idx; 2624 for (unsigned e = 1; e <= maxErrors; ++e, ++idx) 2625 p_last[idx] = p_last[idx - maxErrors1] * ((TFloat)1.0 - options.errorProb[i]) + p_last[idx - maxErrors1 - 1] * options.errorProb[i]; 2626 } 2627 } 2628 2629 // ------------------------------------------------------------------------------------------------ 2630 // compute probs to lose every prefix of pigeonhole segments (q-grams) with 0,...,maxErrors errors 2631 // ------------------------------------------------------------------------------------------------ 2632 unsigned maxXErrors1 = _min(maxErrors, ol) + 1; 2633 unsigned maxSegments = (maxLength - ol) / stepSize; 2634 String<TFloat> M; 2635 resize(M, maxSegments * maxErrors1 * maxXErrors1, 0.0); 2636 2637 unsigned idx = maxXErrors1; 2638 for (unsigned e0 = 1; e0 <= maxErrors; ++e0, idx += maxXErrors1) 2639 { 2640 if (q != stepSize) 2641 { 2642 unsigned maxX = _min(e0, ol); 2643 for (unsigned x0 = 0; x0 <= maxX; ++x0) 2644 { 2645 M[idx + x0] = p[(stepSize - 1) * maxErrors1 + (e0 - x0)] * p[(q - 1) * maxErrors1 + x0]; 2646 // std::cout<<"M[0,"<<e0<<','<<x0<<"]="<<M[idx + x0]<<std::endl; 2647 } 2648 } 2649 else 2650 M[idx] = p[(stepSize - 1) * maxErrors1 + e0]; 2651 // std::cout<<"p(C_"<<0<<'='<<e0<<")="<<p[( (stepSize - 1)) * maxErrors1 + e0]<<std::endl; 2652 // std::cout<<"p(X_"<<0<<'='<<e0<<")="<<p[( (q - 1)) * maxErrors1 + e0]<<std::endl; 2653 } 2654 /* 2655 SEQAN_OMP_PRAGMA(critical) 2656 { 2657 for (unsigned e0 = 1; e0 <= maxErrors; ++e0) 2658 for (unsigned i = 0; i < maxLength; ++i) 2659 std::cerr<<"p("<<i<<'='<<e0<<")="<<p[i * maxErrors1 + e0]<<std::endl; 2660 for (unsigned e0 = 1; e0 <= maxErrors; ++e0) 2661 for (unsigned i = 0; i < maxLength; ++i) 2662 std::cerr<<"p_last("<<i<<'='<<e0<<")="<<p_last[i * maxErrors1 + e0]<<std::endl; 2663 } 2664 */ 2665 // std::cout<<"=============================="<<std::endl; 2666 2667 // M[s,e,x] at position ((s * maxErrors1) + e) * maxErrors1 + x 2668 for (unsigned s = 1; s < maxSegments; ++s) 2669 for (unsigned e = 0; e <= maxErrors; ++e, idx += maxXErrors1) 2670 { 2671 if (ol != 0) 2672 { 2673 unsigned maxX = _min(e, ol); 2674 for (unsigned x = 0; x <= maxX; ++x) 2675 { 2676 TFloat sum = 0.0; 2677 for (unsigned _e = 0; _e <= e - x; ++_e) 2678 { 2679 unsigned max_X = _min(_e, ol); 2680 for (unsigned _x = 0; _x <= max_X; ++_x) 2681 if (_e - _x < e) // at least one error in seed X_{s-1} C_s X_s 2682 sum += M[((s - 1) * maxErrors1 + _e) * maxXErrors1 + _x] 2683 * p[(stepSize * s + (stepSize - 1)) * maxErrors1 + (e - _e - x)] 2684 * p[(stepSize * s + (q - 1)) * maxErrors1 + x]; 2685 } 2686 2687 M[idx + x] = sum; 2688 // std::cerr<<"M["<<s<<','<<e<<','<<x<<"]="<<M[idx + x]<<std::endl; 2689 } 2690 // std::cerr<<"p(C_"<<s<<'='<<e<<")="<<p[(stepSize * s + (stepSize - 1)) * maxErrors1 + e]<<std::endl; 2691 // std::cerr<<"p(X_"<<s<<'='<<e<<")="<<p[(stepSize * s + (q - 1)) * maxErrors1 + e]<<std::endl; 2692 } 2693 else 2694 { 2695 TFloat sum = 0.0; 2696 for (unsigned _e = 0; _e < e; ++_e) 2697 sum += M[((s - 1) * maxErrors1 + _e) * maxXErrors1] 2698 * p[(stepSize * s + (stepSize - 1)) * maxErrors1 + (e - _e)]; 2699 2700 M[idx] = sum; 2701 // std::cerr<<"M["<<s<<','<<e<<",0]="<<M[idx]<<std::endl; 2702 } 2703 } 2704 2705 // // no overlap => no loss => no estimation required 2706 // if (ol == 0) continue; 2707 2708 // ------------------------------------------------------------------------------------------------ 2709 // sum up the expected numbers of lost reads for every read length 0,...,maxLength 2710 // ------------------------------------------------------------------------------------------------ 2711 double loss = 0.0; 2712 // std::cout<< "LOSSES FOR OVERLAP " << ol <<":"<<std::endl; 2713 2714 appendValue(estLosses, (double)ol); 2715 appendValue(estLosses, (double)q); 2716 resize(estLosses, length(estLosses) + maxErrors1, 0.0); 2717 for (unsigned len = 1; len <= maxLength; ++len) 2718 { 2719 if (options.readLengths[len] == 0) 2720 continue; 2721 if (len < q) 2722 continue; 2723 2724 unsigned errors = (unsigned) floor(options.errorRate * len); 2725 unsigned segments = (len - ol) / stepSize; 2726 2727 // TFloat divider = p_prefix[(len - 1) * maxErrors1]; 2728 // std::cout<<"DIVIDER"<<0<< ':'<<p_prefix[(len - 1) * maxErrors1]<<std::endl; 2729 // for (unsigned e = 1; e <= errors; ++e) 2730 // { 2731 // divider += p_prefix[(len - 1) * maxErrors1 + e]; 2732 // std::cout<<"DIVIDER"<<e<< ':'<<p_prefix[(len - 1) * maxErrors1 + e]<<std::endl; 2733 // } 2734 // std::cout<<"DIVIDERSUM:"<<divider<<std::endl; 2735 2736 TFloat lossPerLength = 0.0; 2737 unsigned segmentedLen = segments * stepSize + ol; 2738 for (unsigned e1 = 0; e1 <= errors; ++e1) 2739 { 2740 unsigned maxX = _min(e1, ol); 2741 for (unsigned x = 0; x <= maxX; ++x) 2742 { 2743 TFloat sum = 0.0; 2744 if (segmentedLen < len) 2745 { 2746 unsigned maxE2 = _min(errors - e1, len - segmentedLen); 2747 for (unsigned e2 = 0; e2 <= maxE2; ++e2) 2748 { 2749 sum += p_last[(len - 1) * maxErrors1 + e2]; 2750 estLosses[length(estLosses) - maxErrors1 + e1 + e2] += (M[((segments - 1) * maxErrors1 + e1) * maxXErrors1 + x] * p_last[(len - 1) * maxErrors1 + e2] /* / divider */) * options.readLengths[len]; 2751 } 2752 } 2753 else 2754 { 2755 sum = 1.0; 2756 estLosses[length(estLosses) - maxErrors1 + e1] += (M[((segments - 1) * maxErrors1 + e1) * maxXErrors1 + x] /* / divider */) * options.readLengths[len]; 2757 } 2758 lossPerLength += M[((segments - 1) * maxErrors1 + e1) * maxXErrors1 + x] * sum; 2759 } 2760 } 2761 2762 // lossPerLength /= divider; 2763 loss += options.readLengths[len] * (double)lossPerLength; 2764 // std::cout<<len<<':'<<options.readLengths[len] * (double)lossPerLength<<'\t'; 2765 2766 // if (len >= delta[ol] * (errors + 2) + ol) continue; 2767 // loss += options.readLengths[len] * (q0[len * options.maxOverlap + ol - 1] / p[len * maxErrors1 + errors]); 2768 } 2769 if (loss > maxLoss) 2770 break; 2771 // std::cout<<std::endl<<std::endl; 2772 2773 overlap = ol; 2774 } 2775 return overlap; 2776 } 2777 2778 template <typename TIndex, typename TPigeonholeSpec, typename TOptions> 2779 void _applyFilterOptions(Pattern<TIndex, Pigeonhole<TPigeonholeSpec> > & filterPattern, TOptions const & options) 2780 { 2781 if (options.lossRate == 0.0) 2782 { 2783 filterPattern.params.delta = options.delta; 2784 filterPattern.params.overlap = options.overlap; 2785 _patternInit(filterPattern, options.errorRate); 2786 2787 if (options._debugLevel >= 2) 2788 { 2789 SEQAN_OMP_PRAGMA(critical) 2790 { 2791 CharString str; 2792 shapeToString(str, filterPattern.shape); 2793 std::cout << std::endl << "Pigeonhole settings:" << std::endl; 2794 std::cout << " shape: " << length(str) << '\t' << str << std::endl; 2795 std::cout << " stepsize: " << getStepSize(host(filterPattern)) << std::endl; 2796 } 2797 return; 2798 } 2799 } 2800 2801 typedef typename TOptions::TProb TFloat; 2802 2803 String<TFloat> estLosses; 2804 const unsigned maxErrors = (unsigned) floor(options.errorRate * length(options.readLengths)); 2805 const unsigned maxErrors1 = maxErrors + 1; 2806 2807 String<unsigned> delta; 2808 computeQGramLengths(delta, options); 2809 unsigned maxWeight = _pigeonholeMaxShapeWeight(indexShape(host(filterPattern))); 2810 2811 if (delta[0] < maxWeight && options.lossRate != 0.0) 2812 { 2813 // lossy filtration 2814 2815 // we can stop if the proposed weight exceeds or reaches the maximum 2816 for (unsigned ol = 0; ol < length(delta); ++ol) 2817 if (delta[ol] + ol > maxWeight) 2818 { 2819 resize(delta, ol); 2820 break; 2821 } 2822 2823 filterPattern.params.overlap = estimatePigeonholeLosses(estLosses, delta, options); 2824 _patternInit(filterPattern, options.errorRate); 2825 2826 if (options._debugLevel >= 2) 2827 { 2828 SEQAN_OMP_PRAGMA(critical) 2829 { 2830 std::cout << " e | e error reads | loss ol =" << std::setw(2) << estLosses[maxErrors1 + 2]; 2831 for (unsigned i = 2 * (maxErrors1 + 2); i < length(estLosses); i += maxErrors1 + 2) 2832 std::cout << " | ol =" << std::setw(2) << estLosses[i]; 2833 std::cout << std::endl; 2834 std::cout << " | | q =" << std::setw(2) << estLosses[maxErrors1 + 3]; 2835 for (unsigned i = 2 * (maxErrors1 + 2); i < length(estLosses); i += maxErrors1 + 2) 2836 std::cout << " | q =" << std::setw(2) << estLosses[i + 1]; 2837 std::cout << std::endl; 2838 std::cout << " ------+---------------"; 2839 for (unsigned i = maxErrors1 + 2; i < length(estLosses); i += maxErrors1 + 2) 2840 std::cout << "+-------------"; 2841 std::cout << std::endl; 2842 std::cout.setf(std::ios::fixed); 2843 TFloat sumReads; 2844 for (unsigned e = 0; e <= maxErrors; ++e) 2845 { 2846 std::cout.fill(' '); 2847 std::cout << std::setprecision(0); 2848 std::cout << std::setw(6) << e; 2849 std::cout << " |"; 2850 std::cout << std::setw(14) << (unsigned)estLosses[e + 2]; 2851 std::cout << std::setprecision(2); 2852 for (unsigned i = maxErrors1 + 2 + e + 2; i < length(estLosses); i += maxErrors1 + 2) 2853 std::cout << " |" << std::setw(12) << estLosses[i]; 2854 std::cout << std::endl; 2855 sumReads += estLosses[e + 2]; 2856 } 2857 std::cout << " ------+---------------"; 2858 for (unsigned i = maxErrors1 + 2; i < length(estLosses); i += maxErrors1 + 2) 2859 std::cout << "+-------------"; 2860 std::cout << std::endl; 2861 2862 std::cout << std::setprecision(0); 2863 std::cout << " total |" << std::setw(14) << (unsigned)sumReads << ' '; 2864 std::cout << std::setprecision(2); 2865 for (unsigned i = maxErrors1 + 4; i < length(estLosses); i += maxErrors1 + 2) 2866 { 2867 sumReads = 0.0; 2868 for (unsigned e = 0; e <= maxErrors; ++e) 2869 sumReads += estLosses[i + e]; 2870 std::cout << '|' << std::setw(12) << sumReads; 2871 std::cout << ((filterPattern.params.overlap == (unsigned)estLosses[i - 2]) ? '*' : ' '); 2872 } 2873 std::cout << std::endl; 2874 2875 CharString str; 2876 shapeToString(str, filterPattern.shape); 2877 std::cout << std::endl << "Pigeonhole settings:" << std::endl; 2878 std::cout << " shape: " << length(str) << '\t' << str << std::endl; 2879 std::cout << " stepsize: " << getStepSize(host(filterPattern)) << std::endl; 2880 } 2881 } 2882 } 2883 else 2884 { 2885 // lossless filtration 2886 filterPattern.params.overlap = 0; 2887 } 2888 2889 } 2890 2891 template <typename TIndex, typename TSwiftSpec, typename TOptions> 2892 void _applyFilterOptions(Pattern<TIndex, Swift<TSwiftSpec> > & filterPattern, TOptions const & options) 2893 { 2894 filterPattern.params.minThreshold = options.threshold; 2895 filterPattern.params.tabooLength = options.tabooLength; 2896 _patternInit(filterPattern, options.errorRate, 0); 2897 } 2898 2899 ////////////////////////////////////////////////////////////////////////////// 2900 // Find read matches in a single genome sequence 2901 template < 2902 typename TMatches, 2903 typename TFragmentStore, 2904 typename TReadIndex, 2905 typename TFilterSpec, 2906 typename TCounts, 2907 typename TRazerSOptions, 2908 typename TRazerSMode> 2909 void _mapSingleReadsToContig( 2910 TMatches & matches, 2911 TFragmentStore & store, 2912 unsigned contigId, // ... and its sequence number 2913 Pattern<TReadIndex, TFilterSpec> & filterPattern, 2914 TCounts & cnts, 2915 char orientation, // q-gram index of reads 2916 TRazerSOptions & options, 2917 TRazerSMode const & mode) 2918 { 2919 // FILTRATION 2920 typedef typename TFragmentStore::TContigSeq TContigSeq; 2921 typedef Finder<TContigSeq, TFilterSpec> TFilterFinder; 2922 typedef Pattern<TReadIndex, TFilterSpec> TFilterPattern; 2923 2924 // VERIFICATION 2925 typedef MatchVerifier< 2926 TFragmentStore, 2927 TMatches, 2928 TRazerSOptions, 2929 TRazerSMode, 2930 TFilterPattern, 2931 TCounts> TVerifier; 2932 typedef typename Fibre<TReadIndex, FibreText>::Type TReadSet; 2933 2934 // iterate all genomic sequences 2935 if (options._debugLevel >= 1) 2936 { 2937 std::cerr << std::endl << "Process genome seq #" << contigId; 2938 if (orientation == 'F') 2939 std::cerr << "[fwd]"; 2940 else 2941 std::cerr << "[rev]"; 2942 } 2943 lockContig(store, contigId); 2944 TContigSeq & contigSeq = store.contigStore[contigId].seq; 2945 if (orientation == 'R') 2946 reverseComplement(contigSeq); 2947 2948 TReadSet & readSet = indexText(host(filterPattern)); 2949 TFilterFinder filterFinder(contigSeq, options.repeatLength, 1); 2950 TVerifier verifier(matches, options, filterPattern, cnts); 2951 2952 #ifdef RAZERS_BANDED_MYERS 2953 typename Size<TContigSeq>::Type contigLength = length(contigSeq); 2954 #endif 2955 2956 // initialize verifier 2957 verifier.onReverseComplement = (orientation == 'R'); 2958 verifier.genomeLength = length(contigSeq); 2959 verifier.m.contigId = contigId; 2960 2961 double beginTime = sysTime(); 2962 // Build q-gram index separately, so we can better compute the time for it. 2963 indexRequire(host(filterPattern), QGramSADir()); 2964 options.timeBuildQGramIndex += sysTime() - beginTime; 2965 2966 // iterate all verification regions returned by SWIFT 2967 while (find(filterFinder, filterPattern, options.errorRate)) 2968 { 2969 // std::cout << "read id = " << (*filterFinder.curHit).ndlSeqNo << ", " << beginPosition(filterFinder) << std::endl; 2970 2971 // if (length(infix(filterFinder)) < length(readSet[(*filterFinder.curHit).ndlSeqNo])) 2972 // continue; // Skip if hit length < read length. TODO(holtgrew): David has to fix something in banded myers to make this work. 2973 #ifdef RAZERS_BANDED_MYERS 2974 verifier.patternState.leftClip = (beginPosition(filterFinder) >= 0) ? 0 : -beginPosition(filterFinder); // left clip if match begins left of the genome 2975 verifier.rightClip = (endPosition(filterFinder) <= contigLength) ? 0 : endPosition(filterFinder) - contigLength; // right clip if match end right of the genome 2976 #endif 2977 verifier.m.readId = (*filterFinder.curHit).ndlSeqNo; 2978 if (!options.spec.DONT_VERIFY) 2979 matchVerify(verifier, infix(filterFinder), verifier.m.readId, readSet[verifier.m.readId], mode); 2980 ++options.countFiltration; 2981 } 2982 if (!unlockAndFreeContig(store, contigId)) // if the contig is still used 2983 if (orientation == 'R') 2984 reverseComplement(contigSeq); 2985 // we have to restore original orientation 2986 } 2987 2988 ////////////////////////////////////////////////////////////////////////////// 2989 // Find read matches in many genome sequences 2990 template < 2991 typename TFSSpec, 2992 typename TFSConfig, 2993 typename TCounts, 2994 typename TSpec, 2995 typename TAlignMode, 2996 typename TGapMode, 2997 typename TScoreMode, 2998 typename TReadIndex, 2999 typename TMatchNPolicy, 3000 typename TFilterSpec> 3001 int _mapSingleReads( 3002 FragmentStore<TFSSpec, TFSConfig> & store, 3003 TCounts & cnts, 3004 RazerSCoreOptions<TSpec> & options, 3005 RazerSMode<TAlignMode, TGapMode, TScoreMode, TMatchNPolicy> const & mode, 3006 TReadIndex & readIndex, 3007 TFilterSpec) 3008 { 3009 typedef FragmentStore<TFSSpec, TFSConfig> TFragmentStore; 3010 //typedef typename TFragmentStore::TReadSeqStore TReadSeqStore; 3011 typedef Pattern<TReadIndex, TFilterSpec> TFilterPattern; // filter 3012 3013 //typedef typename Value<TReadSeqStore>::Type const TRead; 3014 //typedef Pattern<TRead, MyersUkkonen> TMyersPattern; // verifier 3015 // typedef Pattern<TRead, Myers<FindInfix, False, void> > TMyersPattern; // verifier 3016 3017 typedef typename TFragmentStore::TContigSeq TContigSeq; 3018 typedef typename Position<TContigSeq>::Type TContigPos; 3019 typedef MatchRecord<TContigPos> TMatchRecord; 3020 3021 // configure Swift pattern 3022 TFilterPattern filterPattern(readIndex); 3023 3024 // configure filter pattern 3025 // (if this is a pigeonhole filter, all sequences must be appended first) 3026 _applyFilterOptions(filterPattern, options); 3027 filterPattern.params.printDots = options._debugLevel > 0; 3028 3029 // clear stats 3030 options.countFiltration = 0; 3031 options.countVerification = 0; 3032 options.timeMapReads = 0; 3033 options.timeDumpResults = 0; 3034 options.timeBuildQGramIndex = 0; 3035 options.timeCompactMatches = 0; 3036 options.timeMaskDuplicates = 0; 3037 options.timeFsCopy = 0; 3038 options.timeFiltration = 0; 3039 options.timeVerification = 0; 3040 SEQAN_PROTIMESTART(find_time); 3041 3042 // We collect the matches in a more compact data structure than the 3043 // AlignedReadStoreElement from FragmentStore. 3044 String<TMatchRecord> matches; 3045 3046 // iterate over genome sequences 3047 for (unsigned contigId = 0; contigId < length(store.contigStore); ++contigId) 3048 { 3049 // lock to prevent releasing and loading the same contig twice 3050 // (once per _mapSingleReadsToContig call) 3051 lockContig(store, contigId); 3052 #ifndef RAZERS_WINDOW 3053 if (options.forward) 3054 _mapSingleReadsToContig(matches, store, contigId, filterPattern, cnts, 'F', options, mode); 3055 if (options.reverse) 3056 _mapSingleReadsToContig(matches, store, contigId, filterPattern, cnts, 'R', options, mode); 3057 #else 3058 if (options.forward) 3059 _mapSingleReadsToContigWindow(store, contigId, filterPattern, cnts, 'F', options, mode); 3060 if (options.reverse) 3061 _mapSingleReadsToContigWindow(store, contigId, filterPattern, cnts, 'R', options, mode); 3062 #endif 3063 unlockAndFreeContig(store, contigId); 3064 } 3065 3066 options.timeMapReads = SEQAN_PROTIMEDIFF(find_time); 3067 if (options._debugLevel >= 1) 3068 std::cerr << std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << std::endl; 3069 3070 double beginCopyTime = sysTime(); 3071 // Final mask duplicates and compact matches. 3072 Nothing nothing; 3073 if (IsSameType<TGapMode, RazerSGapped>::VALUE || options.threshold == 0) 3074 maskDuplicates(matches, options, mode); 3075 compactMatches(matches, cnts, options, mode, nothing, COMPACT_FINAL); 3076 // Write back to store. 3077 reserve(store.alignedReadStore, length(matches), Exact()); 3078 reserve(store.alignQualityStore, length(matches), Exact()); 3079 typedef typename Iterator<String<TMatchRecord>, Standard>::Type TIterator; 3080 typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedReadStoreElem; 3081 typedef typename Value<typename TFragmentStore::TAlignQualityStore>::Type TAlignedQualStoreElem; 3082 for (TIterator it = begin(matches), itEnd = end(matches); it != itEnd; ++it) 3083 { 3084 SEQAN_ASSERT_NEQ(it->orientation, '-'); 3085 SEQAN_ASSERT_LEQ(it->beginPos, it->endPos); 3086 if (it->orientation == 'R') 3087 std::swap(it->beginPos, it->endPos); 3088 appendValue(store.alignedReadStore, TAlignedReadStoreElem(length(store.alignQualityStore), it->readId, it->contigId, it->beginPos, it->endPos)); 3089 appendValue(store.alignQualityStore, TAlignedQualStoreElem(it->pairScore, it->score, -it->score)); 3090 } 3091 options.timeFsCopy = sysTime() - beginCopyTime; 3092 3093 if (options._debugLevel >= 1) 3094 { 3095 std::cerr << "Masking duplicates took \t" << options.timeMaskDuplicates << " seconds" << std::endl; 3096 std::cerr << "Compacting matches took \t" << options.timeCompactMatches << " seconds" << std::endl; 3097 std::cerr << "Building q-gram index took \t" << options.timeBuildQGramIndex << " seconds" << std::endl; 3098 std::cerr << "Time for copying back \t" << options.timeFsCopy << " seconds" << std::endl; 3099 std::cerr << "Time for filtration \t" << options.timeFiltration << " seconds" << std::endl; 3100 std::cerr << "Time for verifications \t" << options.timeVerification << " seconds" << std::endl; 3101 std::cerr << std::endl; 3102 std::cerr << "___FILTRATION_STATS____" << std::endl; 3103 std::cerr << "Filtration counter: " << options.countFiltration << std::endl; 3104 std::cerr << "Successful verifications: " << options.countVerification << std::endl; 3105 } 3106 return 0; 3107 } 3108 3109 ////////////////////////////////////////////////////////////////////////////// 3110 // Wrapper for SWIFT (default) 3111 template < 3112 typename TFSSpec, 3113 typename TFSConfig, 3114 typename TCounts, 3115 typename TSpec, 3116 typename TShape, 3117 typename TAlignMode, 3118 typename TGapMode, 3119 typename TScoreMode, 3120 typename TMatchNPolicy, 3121 typename TFilterSpec> 3122 int _mapSingleReads( 3123 FragmentStore<TFSSpec, TFSConfig> & store, 3124 TCounts & cnts, 3125 RazerSCoreOptions<TSpec> & options, 3126 TShape const & shape, 3127 RazerSMode<TAlignMode, TGapMode, TScoreMode, TMatchNPolicy> const & mode, 3128 TFilterSpec) 3129 { 3130 typedef FragmentStore<TFSSpec, TFSConfig> TFragmentStore; 3131 typedef typename TFragmentStore::TReadSeqStore TReadSeqStore; 3132 #ifndef RAZERS_OPENADDRESSING 3133 typedef Index<TReadSeqStore, IndexQGram<TShape> > TIndex; // q-gram index 3134 #else 3135 typedef Index<TReadSeqStore, IndexQGram<TShape, OpenAddressing> > TIndex; 3136 #endif 3137 3138 // configure q-gram index 3139 TIndex swiftIndex(store.readSeqStore, shape); 3140 #ifdef RAZERS_OPENADDRESSING 3141 swiftIndex.alpha = options.loadFactor; 3142 #endif 3143 cargo(swiftIndex).abundanceCut = options.abundanceCut; 3144 cargo(swiftIndex)._debugLevel = options._debugLevel; 3145 3146 return _mapSingleReads(store, cnts, options, mode, swiftIndex, TFilterSpec()); 3147 } 3148 3149 ////////////////////////////////////////////////////////////////////////////// 3150 // Wrapper for SWIFT with Micro RNA 3151 template < 3152 typename TFSSpec, 3153 typename TFSConfig, 3154 typename TCounts, 3155 typename TSpec, 3156 typename TShape, 3157 typename TGapMode, 3158 typename TScoreMode, 3159 typename TMatchNPolicy, 3160 typename TFilterSpec> 3161 int _mapSingleReads( 3162 FragmentStore<TFSSpec, TFSConfig> & store, 3163 TCounts & cnts, 3164 RazerSCoreOptions<TSpec> & options, 3165 TShape const & shape, 3166 RazerSMode<RazerSPrefix, TGapMode, TScoreMode, TMatchNPolicy> const & mode, 3167 TFilterSpec) 3168 { 3169 typedef FragmentStore<TFSSpec, TFSConfig> TFragmentStore; 3170 typedef typename TFragmentStore::TReadSeqStore TReadSeqStore; 3171 3172 // typedef typename Value<TReadSeqStore>::Type TRead; 3173 // typedef typename Infix<TRead>::Type TReadInfix; 3174 // typedef StringSet<TReadInfix> TReadSet; 3175 typedef TReadSeqStore TReadSet; 3176 typedef Index<TReadSet, IndexQGram<TShape> > TIndex; // q-gram index 3177 3178 // TReadSet readSet; 3179 // unsigned readCount = length(store.readSeqStore); 3180 // resize(readSet, readCount, Exact()); 3181 // 3182 // for (unsigned i = 0; i < readCount; ++i) 3183 // assign(readSet[i], prefix(store.readSeqStore[i], _min(length(store.readSeqStore[i]), options.prefixSeedLength))); 3184 // 3185 // // configure q-gram index 3186 // TIndex swiftIndex(readSet, shape); 3187 TIndex swiftIndex(store.readSeqStore, shape); 3188 cargo(swiftIndex).abundanceCut = options.abundanceCut; 3189 cargo(swiftIndex)._debugLevel = options._debugLevel; 3190 3191 return _mapSingleReads(store, cnts, options, mode, swiftIndex, TFilterSpec()); 3192 } 3193 3194 ////////////////////////////////////////////////////////////////////////////// 3195 // Wrapper for different filters specs 3196 template < 3197 typename TFSSpec, 3198 typename TFSConfig, 3199 typename TCounts, 3200 typename TSpec, 3201 typename TShape, 3202 typename TAlignMode, 3203 typename TGapMode, 3204 typename TScoreMode, 3205 typename TMatchNPolicy> 3206 int _mapSingleReads( 3207 FragmentStore<TFSSpec, TFSConfig> & store, 3208 TCounts & cnts, 3209 RazerSCoreOptions<TSpec> & options, 3210 TShape const & shape, 3211 RazerSMode<TAlignMode, TGapMode, TScoreMode, TMatchNPolicy> const & mode) 3212 { 3213 if (options.threshold > 0) 3214 { 3215 typedef typename If<IsSameType<TGapMode, RazerSGapped>, SwiftSemiGlobal, SwiftSemiGlobalHamming>::Type TSwiftSpec; 3216 return _mapSingleReads(store, cnts, options, shape, mode, Swift<TSwiftSpec>()); 3217 } 3218 else 3219 { 3220 typedef typename If<IsSameType<TGapMode, RazerSGapped>, void, Hamming_>::Type TPigeonholeSpec; 3221 return _mapSingleReads(store, cnts, options, Shape<Dna, OneGappedShape>(), mode, Pigeonhole<TPigeonholeSpec>()); 3222 } 3223 } 3224 3225 ////////////////////////////////////////////////////////////////////////////// 3226 // Wrapper for different filters specs 3227 template < 3228 typename TFSSpec, 3229 typename TFSConfig, 3230 typename TCounts, 3231 typename TSpec, 3232 typename TShape, 3233 typename TAlignMode, 3234 typename TGapMode, 3235 typename TScoreMode, 3236 typename TMatchNPolicy> 3237 int _mapMatePairReads( 3238 FragmentStore<TFSSpec, TFSConfig> & store, 3239 TCounts & cnts, 3240 RazerSCoreOptions<TSpec> & options, 3241 TShape const & shape, 3242 RazerSMode<TAlignMode, TGapMode, TScoreMode, TMatchNPolicy> const & mode) 3243 { 3244 if (options.threshold > 0) 3245 { 3246 typedef typename If<IsSameType<TGapMode, RazerSGapped>, SwiftSemiGlobal, SwiftSemiGlobalHamming>::Type TSwiftSpec; 3247 return _mapMatePairReads(store, cnts, options, shape, mode, Swift<TSwiftSpec>()); 3248 } 3249 else 3250 { 3251 typedef typename If<IsSameType<TGapMode, RazerSGapped>, void, Hamming_>::Type TPigeonholeSpec; 3252 return _mapMatePairReads(store, cnts, options, Shape<Dna, OneGappedShape>(), mode, Pigeonhole<TPigeonholeSpec>()); 3253 } 3254 } 3255 3256 ////////////////////////////////////////////////////////////////////////////// 3257 // Wrapper for single/mate-pair mapping 3258 template < 3259 typename TFSSpec, 3260 typename TFSConfig, 3261 typename TCounts, 3262 typename TSpec, 3263 typename TShape, 3264 typename TRazerSMode> 3265 int _mapReads( 3266 FragmentStore<TFSSpec, TFSConfig> & store, 3267 TCounts & cnts, 3268 RazerSCoreOptions<TSpec> & options, 3269 TShape const & shape, 3270 TRazerSMode const & mode) 3271 { 3272 //typedef FragmentStore<TFSSpec, TFSConfig> TFragmentStore; 3273 //typedef typename TFragmentStore::TReadSeqStore TReadSeqStore; 3274 //typedef typename Value<TReadSeqStore>::Type TRead; 3275 //typedef Pattern<TRead const, MyersUkkonen> TMyersPattern; // verifier 3276 3277 3278 options.dRange = options.scoreDistanceRange; 3279 if (options.dRange == 0) 3280 options.dRange = 1 << 30; 3281 if (options.maxHits == 1 && !options.purgeAmbiguous) 3282 options.dRange = -1; 3283 else 3284 --options.dRange; 3285 3286 3287 int readCount = length(store.readSeqStore); 3288 resize(options.errorCutOff, readCount, Exact()); 3289 3290 #ifndef RAZERS_BANDED_MYERS 3291 if (options.gapMode == RAZERS_GAPPED) 3292 resize(options.forwardPatterns, readCount, Exact()); 3293 #endif 3294 3295 // set absolute error cut-offs 3296 Dna5String tmp; 3297 3298 SEQAN_OMP_PRAGMA(parallel for private(tmp)) 3299 for (int i = 0; i < readCount; ++i) 3300 { 3301 unsigned err = (unsigned)(length(store.readSeqStore[i]) * options.errorRate); 3302 options.errorCutOff[i] = (err < 255) ? err + 1 : 255; 3303 3304 // optionally compute preprocessing information 3305 #ifndef RAZERS_BANDED_MYERS 3306 if (!empty(store.readSeqStore[i])) 3307 continue; 3308 _patternMatchNOfPattern(options.forwardPatterns[i], options.matchN); 3309 _patternMatchNOfFinder(options.forwardPatterns[i], options.matchN); 3310 3311 #ifdef RAZERS_NOOUTERREADGAPS 3312 if (options.libraryLength >= 0 && (i & 1) == 1) 3313 { 3314 tmp = store.readSeqStore[i]; 3315 reverseComplement(tmp); 3316 setHost(options.forwardPatterns[i], prefix(tmp, length(tmp) - 1)); 3317 } 3318 else 3319 setHost(options.forwardPatterns[i], prefix(store.readSeqStore[i], length(store.readSeqStore[i]) - 1)); 3320 #else 3321 if (options.libraryLength >= 0 && (i & 1) == 1) 3322 { 3323 tmp = store.readSeqStore[i]; 3324 reverseComplement(tmp); 3325 setHost(options.forwardPatterns[i], tmp); 3326 } 3327 else 3328 setHost(options.forwardPatterns[i], store.readSeqStore[i]); 3329 #endif 3330 #endif // #ifdef RAZERS_BANDED_MYERS 3331 } 3332 3333 #ifdef _OPENMP 3334 if (options.threadCount == 0 || length(store.readNameStore) < MIN_PARALLEL_WORK) 3335 #endif 3336 { 3337 // Sequential RazerS 3338 #ifdef RAZERS_MATEPAIRS 3339 if (options.libraryLength >= 0) 3340 return _mapMatePairReads(store, cnts, options, shape, mode); 3341 else 3342 #endif // #ifndef RAZERS_MATEPAIRS 3343 return _mapSingleReads(store, cnts, options, shape, mode); 3344 } 3345 #ifdef _OPENMP 3346 else 3347 { 3348 // Parallel RazerS 3349 #ifdef RAZERS_MATEPAIRS 3350 if (options.libraryLength >= 0) 3351 return _mapMatePairReadsParallel(store, cnts, options, shape, mode); 3352 else 3353 #endif // #ifndef RAZERS_MATEPAIRS 3354 return _mapSingleReadsParallel(store, cnts, options, shape, mode); 3355 } 3356 #endif 3357 } 3358 3359 ////////////////////////////////////////////////////////////////////////////// 3360 // Wrapper for different shapes 3361 template <typename TFSSpec, typename TFSConfig, typename TCounts, typename TSpec, typename TRazersMode> 3362 int _mapReads( 3363 FragmentStore<TFSSpec, TFSConfig> & store, 3364 TCounts & cnts, 3365 RazerSCoreOptions<TSpec> & options, 3366 TRazersMode const & mode) 3367 { 3368 Shape<Dna, SimpleShape> ungapped; 3369 Shape<Dna, OneGappedShape> onegapped; 3370 Shape<Dna, GenericShape> gapped; 3371 3372 // 2x3 SPECIALIZATION 3373 3374 // select best-fitting shape 3375 if (stringToShape(ungapped, options.shape)) 3376 return _mapReads(store, cnts, options, ungapped, mode); 3377 3378 // if (stringToShape(onegapped, options.shape)) 3379 // return _mapReads(store, cnts, options, onegapped, mode); 3380 if (stringToShape(gapped, options.shape)) 3381 return _mapReads(store, cnts, options, gapped, mode); 3382 3383 return RAZERS_INVALID_SHAPE; 3384 } 3385 3386 ////////////////////////////////////////////////////////////////////////////// 3387 // Wrapper for different score modes 3388 template <typename TFSSpec, typename TFSConfig, typename TCounts, typename TSpec, typename TAlignMode, typename TGapMode, typename TMatchNPolicy> 3389 int _mapReads( 3390 FragmentStore<TFSSpec, TFSConfig> & store, 3391 TCounts & cnts, 3392 RazerSCoreOptions<TSpec> & options, 3393 RazerSMode<TAlignMode, TGapMode, Nothing, TMatchNPolicy> const) 3394 { 3395 if (options.scoreMode == RAZERS_ERRORS) 3396 return _mapReads(store, cnts, options, RazerSMode<TAlignMode, TGapMode, RazerSErrors, TMatchNPolicy>()); 3397 3398 /* if (options.scoreMode == RAZERS_SCORE) 3399 return _mapReads(store, cnts, options, RazerSMode<TAlignMode, TGapMode, RazerSScore, TMatchNPolicy>()); 3400 if (options.scoreMode == RAZERS_QUALITY) 3401 return _mapReads(store, cnts, options, RazerSMode<TAlignMode, TGapMode, RazerSQuality<>, TMatchNPolicy>()); 3402 */ return RAZERS_INVALID_OPTIONS; 3403 } 3404 3405 ////////////////////////////////////////////////////////////////////////////// 3406 // Wrapper for different gap and align modes 3407 template <typename TFSSpec, typename TFSConfig, typename TCounts, typename TSpec> 3408 int _mapReads( 3409 FragmentStore<TFSSpec, TFSConfig> & store, 3410 TCounts & cnts, 3411 RazerSCoreOptions<TSpec> & options) 3412 { 3413 if (options.matchN) 3414 { 3415 if (options.gapMode == RAZERS_GAPPED) 3416 { 3417 // if (options.alignMode == RAZERS_LOCAL) 3418 // return _mapReads(store, cnts, options, RazerSMode<RazerSLocal, RazerSGapped, Nothing, NMatchesAll_>()); 3419 // if (options.alignMode == RAZERS_PREFIX) 3420 // return _mapReads(store, cnts, options, RazerSMode<RazerSPrefix, RazerSGapped, Nothing, NMatchesAll_>()); 3421 if (options.alignMode == RAZERS_GLOBAL) 3422 return _mapReads(store, cnts, options, RazerSMode<RazerSGlobal, RazerSGapped, Nothing, NMatchesAll_>()); 3423 } 3424 else 3425 { 3426 // if (options.alignMode == RAZERS_LOCAL) 3427 // return _mapReads(store, cnts, options, RazerSMode<RazerSLocal, RazerSUngapped, Nothing, NMatchesAll_>()); 3428 // if (options.alignMode == RAZERS_PREFIX) 3429 // return _mapReads(store, cnts, options, RazerSMode<RazerSPrefix, RazerSUngapped, Nothing, NMatchesAll_>()); 3430 if (options.alignMode == RAZERS_GLOBAL) 3431 return _mapReads(store, cnts, options, RazerSMode<RazerSGlobal, RazerSUngapped, Nothing, NMatchesAll_>()); 3432 } 3433 } 3434 else 3435 { 3436 if (options.gapMode == RAZERS_GAPPED) 3437 { 3438 // if (options.alignMode == RAZERS_LOCAL) 3439 // return _mapReads(store, cnts, options, RazerSMode<RazerSLocal, RazerSGapped, Nothing, NMatchesNone_>()); 3440 // if (options.alignMode == RAZERS_PREFIX) 3441 // return _mapReads(store, cnts, options, RazerSMode<RazerSPrefix, RazerSGapped, Nothing, NMatchesNone_>()); 3442 if (options.alignMode == RAZERS_GLOBAL) 3443 return _mapReads(store, cnts, options, RazerSMode<RazerSGlobal, RazerSGapped, Nothing, NMatchesNone_>()); 3444 } 3445 else 3446 { 3447 // if (options.alignMode == RAZERS_LOCAL) 3448 // return _mapReads(store, cnts, options, RazerSMode<RazerSLocal, RazerSUngapped, Nothing, NMatchesNone_>()); 3449 // if (options.alignMode == RAZERS_PREFIX) 3450 // return _mapReads(store, cnts, options, RazerSMode<RazerSPrefix, RazerSUngapped, Nothing, NMatchesNone_>()); 3451 if (options.alignMode == RAZERS_GLOBAL) 3452 return _mapReads(store, cnts, options, RazerSMode<RazerSGlobal, RazerSUngapped, Nothing, NMatchesNone_>()); 3453 } 3454 } 3455 return RAZERS_INVALID_OPTIONS; 3456 } 3457 3458 } 3459 3460 #endif 3461