1 /*========================================================================== 2 RazerS - Fast Read Mapping with Controlled Loss Rate 3 http://www.seqan.de/projects/razers.html 4 5 ============================================================================ 6 Copyright (C) 2008 by David Weese 7 8 This program is free software; you can redistribute it and/or 9 modify it under the terms of the GNU Lesser General Public 10 License as published by the Free Software Foundation; either 11 version 3 of the License, or (at your options) any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 Lesser General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. 20 ==========================================================================*/ 21 22 #ifndef SEQAN_HEADER_RAZERS_H 23 #define SEQAN_HEADER_RAZERS_H 24 25 #include <iostream> 26 #include <fstream> 27 28 #include <seqan/find.h> 29 #include <seqan/seq_io.h> 30 #include <seqan/index.h> 31 #include <seqan/seq_io.h> 32 33 #ifdef RAZERS_PARALLEL 34 #include "tbb/spin_mutex.h" 35 #endif 36 37 namespace seqan 38 { 39 40 ////////////////////////////////////////////////////////////////////////////// 41 // Default options 42 43 #ifdef RAZERS_OPENADDRESSING 44 typedef OpenAddressing TQGramIndexSpec; 45 #else 46 typedef Default TQGramIndexSpec; 47 #endif 48 49 template < bool DONT_VERIFY_ = false, bool DONT_DUMP_RESULTS_ = false > 50 struct RazerSSpec 51 { 52 enum { DONT_VERIFY = DONT_VERIFY_ }; // omit verifying potential matches 53 enum { DONT_DUMP_RESULTS = DONT_DUMP_RESULTS_ }; // omit dumping results 54 }; 55 56 template < typename TSpec = RazerSSpec<> > 57 struct RazerSOptions 58 { 59 // main options 60 TSpec spec; 61 bool forward; // compute forward oriented read matches 62 bool reverse; // compute reverse oriented read matches 63 double errorRate; // Criteria 1 threshold 64 unsigned maxHits; // output at most maxHits many matches 65 unsigned distanceRange; // output only the best, second best, ..., distanceRange best matches 66 bool purgeAmbiguous; // true..remove reads with more than maxHits best matches, false..keep them 67 CharString output; // name of result file 68 int _debugLevel; // level of verbosity 69 bool printVersion; // print version number 70 bool hammingOnly; // no indels 71 int trimLength; // if >0, cut reads to #trimLength characters 72 CharString outputUnmapped; // name of file storing unmapped reads 73 74 75 // output format options 76 unsigned outputFormat; // 0..Razer format 77 // 1..enhanced Fasta 78 // 2..ELAND format 79 bool dumpAlignment; // compute and dump the match alignments in the result files 80 unsigned genomeNaming; // 0..use Fasta id 81 // 1..enumerate reads beginning with 1 82 unsigned readNaming; // 0..use Fasta id 83 // 1..enumerate reads beginning with 1 84 // 2..use the read sequence (only for short reads!) 85 unsigned sortOrder; // 0..sort keys: 1. read number, 2. genome position 86 // 1.. 1. genome pos50ition, 2. read number 87 unsigned positionFormat; // 0..gap space 88 // 1..position space 89 const char *runID; // runID needed for gff output 90 91 // filtration parameters 92 ::std::string shape; // shape (e.g. 11111111111) 93 int threshold; // threshold 94 int tabooLength; // taboo length 95 int repeatLength; // repeat length threshold 96 double abundanceCut; // abundance threshold 97 98 // mate-pair parameters 99 int libraryLength; // offset between two mates 100 int libraryError; // offset tolerance 101 unsigned nextMatePairId; // use this id for the next mate-pair 102 103 // verification parameters 104 bool matchN; // false..N is always a mismatch, true..N matches with all 105 unsigned char compMask[5]; 106 107 // statistics 108 int64_t FP; // false positives (threshold reached, no match) 109 int64_t TP; // true positives (threshold reached, match) 110 double timeLoadFiles; // time for loading input files 111 double timeMapReads; // time for mapping reads 112 double timeDumpResults; // time for dumping the results 113 114 #ifdef RAZERS_DIRECT_MAQ_MAPPING 115 bool maqMapping; 116 int maxMismatchQualSum; 117 int mutationRateQual; 118 int absMaxQualSumErrors; 119 unsigned artSeedLength; 120 bool noBelowIdentity; 121 #endif 122 bool readsWithQualities; 123 124 #ifdef RAZERS_MICRO_RNA 125 bool microRNA; 126 unsigned rnaSeedLength; 127 bool exactSeed; 128 #endif 129 130 bool lowMemory; // set maximum shape weight to 13 to limit size of q-gram index 131 bool fastaIdQual; // hidden option for special fasta+quality format we use 132 int minClippedLen; 133 134 // misc 135 unsigned compactThresh; // compact match array if larger than compactThresh 136 int maxReadLength; 137 138 int penaltyC; // puts penalty on the existence on a middle gap in split mapping mode 139 unsigned minMatchLen; // minimum prefix/suffix match length in split mapping mode 140 int maxPrefixErrors; // 141 int maxSuffixErrors; 142 int maxGap; 143 int minGap; 144 ::std::string shapeR; // shape (e.g. 11111111111) 145 int thresholdR; // threshold 146 unsigned int specifiedGenomeLen; 147 bool anchored; 148 int maxReadRegionsEnd; 149 int minReadRegionsStart; 150 151 152 // multi-threading 153 154 #ifdef RAZERS_PARALLEL 155 typedef ::tbb::spin_mutex TMutex; 156 157 TMutex *patternMutex; 158 TMutex optionsMutex; 159 TMutex matchMutex; 160 #endif 161 RazerSOptionsRazerSOptions162 RazerSOptions() 163 { 164 forward = true; 165 reverse = true; 166 errorRate = 0.08; 167 maxHits = 100; 168 distanceRange = 0; // disabled 169 purgeAmbiguous = false; 170 output = ""; 171 _debugLevel = 0; 172 printVersion = false; 173 hammingOnly = false; 174 outputUnmapped = ""; 175 trimLength = 0; 176 177 outputFormat = 0; 178 dumpAlignment = false; 179 genomeNaming = 0; 180 readNaming = 0; 181 sortOrder = 0; 182 positionFormat = 0; 183 runID = "s"; // 184 185 matchN = false; 186 187 shape = "11111111111"; 188 threshold = 1; 189 tabooLength = 1; 190 repeatLength = 1000; 191 abundanceCut = 1; 192 193 libraryLength = 220; 194 libraryError = 50; 195 nextMatePairId = 1; 196 197 for (unsigned i = 0; i < 4; ++i) 198 compMask[i] = 1 << i; 199 compMask[4] = 0; 200 201 compactThresh = 1024; 202 #ifdef RAZERS_DIRECT_MAQ_MAPPING 203 maqMapping = false; 204 maxMismatchQualSum = 70; 205 mutationRateQual = 30; 206 artSeedLength = 28; // the "artificial" seed length that is used for mapping quality assignment 207 // (28bp is maq default) 208 absMaxQualSumErrors = 100; // maximum for sum of mism qualities in total readlength 209 noBelowIdentity = false; 210 #endif 211 readsWithQualities = false; 212 213 #ifdef RAZERS_MICRO_RNA 214 microRNA = false; 215 rnaSeedLength = 16; 216 exactSeed = true; 217 #endif 218 penaltyC = 2; 219 minMatchLen = 18; 220 shapeR = "11111111111"; 221 thresholdR = 1; 222 maxGap = 10000; 223 minGap = 0; 224 maxPrefixErrors = 1; 225 maxSuffixErrors = 1; 226 specifiedGenomeLen = 3000; //in Mb, whole human genome default 227 anchored = false; 228 maxReadRegionsEnd = 0; 229 minReadRegionsStart = 0; // 230 231 maxReadLength = 0; 232 lowMemory = false; // set maximum shape weight to 13 to limit size of q-gram index 233 fastaIdQual = false; 234 minClippedLen = 0; 235 236 FP = 0; 237 TP = 0; 238 timeLoadFiles = 0.0; 239 timeMapReads = 0.0; 240 timeDumpResults = 0; 241 } 242 }; 243 244 struct MicroRNA{}; 245 246 #ifdef RAZERS_MICRO_RNA 247 #define RAZERS_EXTENDED_MATCH 248 #endif 249 250 #ifdef RAZERS_DIRECT_MAQ_MAPPING 251 #define RAZERS_EXTENDED_MATCH 252 #endif 253 254 #ifdef RAZERS_SPLICED 255 #define RAZERS_EXTENDED_MATCH 256 #endif 257 258 ////////////////////////////////////////////////////////////////////////////// 259 // Typedefs 260 261 // definition of a Read match 262 template <typename TGPos_> 263 struct ReadMatch 264 { 265 typedef typename MakeSigned_<TGPos_>::Type TGPos; 266 267 unsigned gseqNo; // genome seqNo 268 unsigned rseqNo; // read seqNo 269 TGPos gBegin; // begin position of the match in the genome 270 TGPos gEnd; // end position of the match in the genome 271 #ifdef RAZERS_MATEPAIRS 272 unsigned pairId; // unique id for the two mate-pair matches (0 if unpaired) 273 int mateDelta:23; // outer coordinate delta to the other mate 274 unsigned pairScore:9; // combined score of both mates max 256 275 #endif 276 unsigned short editDist; // Levenshtein distance 277 #ifdef RAZERS_EXTENDED_MATCH 278 short mScore; 279 short seedEditDist:8; 280 #endif 281 #ifdef RAZERS_SPLICED 282 char traceExtension; 283 short gSeedLen:8;// used as gMinMatchLen to store genomic end position of seed match 284 #endif 285 char orientation; // 'F'..forward strand, 'R'..reverse comp. strand 286 }; 287 288 enum RAZERS_ERROR 289 { 290 RAZERS_INVALID_OPTIONS = -1, 291 RAZERS_READS_FAILED = -2, 292 RAZERS_GENOME_FAILED = -3, 293 RAZERS_INVALID_SHAPE = -4 294 }; 295 296 ////////////////////////////////////////////////////////////////////////////// 297 // Definitions 298 299 typedef Dna5String TGenome; 300 typedef StringSet<TGenome> TGenomeSet; 301 // typedef Dna5String TRead; 302 typedef String<Dna5Q> TRead; 303 #ifdef RAZERS_CONCATREADS 304 typedef StringSet<TRead, Owner<ConcatDirect<> > > TReadSet; 305 #else 306 typedef StringSet<TRead> TReadSet; 307 #endif 308 309 typedef ReadMatch<Difference<TGenome>::Type> TMatch; // a single match 310 typedef String<TMatch/*, MMap<>*/ > TMatches; // array of matches 311 //chrNo, startPos, endPos 312 // typedef String<Pair<unsigned,MakeSigned_<Difference<TGenome>::Type>::Type > > TReadRegions; 313 314 typedef MakeSigned_<Difference<TGenome>::Type>::Type TSignedPos; 315 typedef Pair<unsigned,TSignedPos,BitPacked<2,BitsPerValue<TSignedPos>::VALUE> > TFlagPos; 316 317 //chrNo, flag to remember flag, startPos of mate 318 typedef String<Pair<unsigned,TFlagPos > > TReadRegions; 319 320 321 template <typename TSpec> 322 struct Cargo< Index<TReadSet, TSpec> > { 323 typedef struct { 324 double abundanceCut; 325 int _debugLevel; 326 } Type; 327 }; 328 329 template <typename TAnyReadSet, typename TSpec> 330 struct Cargo< Index<TAnyReadSet, TSpec> > { 331 typedef struct { 332 double abundanceCut; 333 int _debugLevel; 334 } Type; 335 }; 336 ////////////////////////////////////////////////////////////////////////////// 337 // Memory tuning 338 339 #ifdef RAZERS_MEMOPT 340 341 template <typename TSpec> 342 struct SAValue< Index<TReadSet, TSpec> > { 343 typedef Pair< 344 unsigned, 345 unsigned, 346 BitPacked<24, 8> // max. 16M reads of length < 256 347 > Type; 348 }; 349 //454 350 // template <typename TSpec> 351 // struct SAValue< Index<TReadSet, TSpec> > { 352 // typedef Pair< 353 // unsigned, 354 // unsigned, 355 // BitPacked<22, 10> // max. 4M reads of length < 1024 356 // > Type; 357 // }; 358 359 #else 360 361 template <typename TSpec> 362 struct SAValue< Index<TReadSet, TSpec> > { 363 typedef Pair< 364 unsigned, // many reads 365 unsigned, // of arbitrary length 366 Pack 367 > Type; 368 }; 369 370 #endif 371 372 template <> 373 struct Size<Dna5String> 374 { 375 typedef unsigned Type; 376 }; 377 378 template <typename TShape, typename TSpec> 379 struct Size< Index<TReadSet, IndexQGram<TShape, TSpec> > > 380 { 381 typedef unsigned Type; 382 }; 383 384 385 #ifdef RAZERS_PRUNE_QGRAM_INDEX 386 387 ////////////////////////////////////////////////////////////////////////////// 388 // Repeat masker 389 template <typename TShape, typename TSpec> 390 inline bool _qgramDisableBuckets(Index<TReadSet, IndexQGram<TShape, TSpec> > &index) 391 { 392 typedef Index<TReadSet, IndexQGram<TShape, TSpec> > TReadIndex; 393 typedef typename Fibre<TReadIndex, QGramDir>::Type TDir; 394 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 395 typedef typename Value<TDir>::Type TSize; 396 397 TDir &dir = indexDir(index); 398 bool result = false; 399 unsigned counter = 0; 400 TSize thresh = (TSize)(length(index) * cargo(index).abundanceCut); 401 if (thresh < 100) thresh = 100; 402 403 TDirIterator it = begin(dir, Standard()); 404 TDirIterator itEnd = end(dir, Standard()); 405 for (; it != itEnd; ++it) 406 if (*it > thresh) 407 { 408 *it = (TSize)-1; 409 result = true; 410 ++counter; 411 } 412 413 if (counter > 0 && cargo(index)._debugLevel >= 1) 414 ::std::cerr << "Removed " << counter << " k-mers" << ::std::endl; 415 416 return result; 417 } 418 419 #endif 420 421 422 template<typename TIdString, typename TSeqString, typename TQString, typename TOptions> 423 bool 424 _clipReads(TIdString & fastaID, TSeqString & seq, TQString & qual, TOptions & options) 425 { 426 typedef typename Value<TIdString>::Type TChar; 427 428 int tagStart = length(fastaID); 429 int clipFront = -1; 430 int clipBack = -1; 431 for(unsigned i = 0; i < length(fastaID); ++i) 432 { 433 TChar c = fastaID[i]; 434 if(c == 'c') 435 { 436 tagStart = i-1; 437 if(infix(fastaID,i,i+5)=="clip=") 438 { 439 i += 5; 440 String<TChar> clipStr = ""; 441 while(i < length(fastaID) && _parseIsDigit(fastaID[i])) 442 { 443 append(clipStr,fastaID[i]); 444 ++i; 445 } 446 std::istringstream istr(toCString(clipStr)); 447 istr >> clipFront; 448 if(i < length(fastaID) && fastaID[i] == ',') ++i; 449 clipStr = ""; 450 while(i < length(fastaID) && _parseIsDigit(fastaID[i])) 451 { 452 append(clipStr,fastaID[i]); 453 ++i; 454 } 455 std::istringstream istr2(toCString(clipStr)); 456 istr2 >> clipBack; 457 break; 458 } 459 } 460 } 461 if(clipFront<0 && clipBack<0) return true; //no clip tag found 462 if(clipFront<0) clipFront = 0; 463 if(clipBack<0) clipBack = 0; 464 resize(fastaID,tagStart); // only resize fastaID, as it might contain the quality string 465 466 if(options.minClippedLen == 0) // clipping tag was found, but clipping option is turned off 467 return true; 468 469 if(((int)length(seq)-clipBack-clipFront) < options.minClippedLen) // sequence is too short after clipping 470 return false; 471 472 // int newLen = (int)length(seq)-clipBack-clipFront; 473 474 //clip 475 if(length(qual) == 0) // meaning that quality is in fasta header 476 { 477 // first adapt the fasta header 478 TIdString tmp = fastaID; 479 fastaID = infix(tmp,0,length(tmp)-length(seq)); 480 append(fastaID,infix(tmp,length(tmp)-length(seq)+clipFront,length(tmp)-clipBack)); 481 } 482 else 483 qual = infix(qual,clipFront,length(qual)-clipBack); 484 485 // now adapt the sequence 486 seq = infix(seq,clipFront,length(seq)-clipBack); 487 488 return true; 489 490 } 491 492 493 ////////////////////////////////////////////////////////////////////////////// 494 // Load multi-Fasta sequences with or w/o quality values 495 template <typename TReadSet, typename TNameSet, typename TRazerSOptions> 496 bool loadReads( 497 TReadSet &reads, 498 TNameSet &fastaIDs, 499 char const * fileName, 500 TRazerSOptions &options) 501 { 502 bool countN = !(options.matchN || options.outputFormat == 1); 503 if (!empty(options.outputUnmapped)) countN = false; 504 #ifdef RAZERS_MICRO_RNA 505 if(options.microRNA) countN = false; 506 #endif 507 508 SeqFileIn seqFile; 509 510 bool success; 511 if (!isEqual(fileName, "-")) 512 success = open(seqFile, fileName); 513 else 514 success = open(seqFile, std::cin); 515 if (!success) 516 return false; 517 518 CharString fastaId; 519 String<Dna5Q> seq; 520 CharString qual; 521 522 unsigned kickoutcount = 0; 523 while (!atEnd(seqFile)) 524 { 525 readRecord(fastaId, seq, qual, seqFile); 526 if (options.readNaming == 0 527 #ifdef RAZERS_DIRECT_MAQ_MAPPING 528 || options.fastaIdQual 529 #endif 530 ) 531 appendValue(fastaIDs, fastaId); // append Fasta id 532 if(!empty(qual)) options.readsWithQualities = true; 533 #ifdef RAZERS_DIRECT_MAQ_MAPPING 534 //check if sequence has a clip tag 535 if(options.minClippedLen > 0) 536 { 537 if(!_clipReads(back(fastaIDs),seq,qual,options)) 538 { 539 clear(seq); 540 clear(qual); 541 ++kickoutcount; 542 } 543 } 544 if(options.fastaIdQual && !empty(seq)) 545 { 546 if(options.minClippedLen == 0)_clipReads(back(fastaIDs),seq,qual,options); // if the header wasnt clipped before, then clip now!! necessary for quality in header 547 qual = suffix(back(fastaIDs),length(back(fastaIDs))-length(seq)); 548 if(options.readNaming == 0) 549 back(fastaIDs) = prefix(back(fastaIDs),length(back(fastaIDs))-length(seq)); 550 else clear(back(fastaIDs)); 551 } 552 #endif 553 if (countN) 554 { 555 int count = 0; 556 int cutoffCount = (int)(options.errorRate * length(seq)); 557 for (unsigned j = 0; j < length(seq); ++j) 558 if (getValue(seq, j) == 'N') 559 if (++count > cutoffCount) 560 { 561 clear(seq); 562 clear(qual); 563 ++kickoutcount; 564 break; 565 } 566 // low qual. reads are empty to output them and their id later as LQ reads 567 // if (count > cutoffCount) continue; 568 } 569 #ifdef RAZERS_MICRO_RNA 570 if(options.microRNA && length(seq)<options.rnaSeedLength) 571 { 572 clear(seq); 573 clear(qual); 574 } 575 #endif 576 577 // store dna and quality together 578 for (unsigned j = 0; j < length(qual) && j < length(seq); ++j) 579 assignQualityValue(seq[j], (int)(ordValue(qual[j]) - 33)); 580 /* if(ordValue(seq[j])>3) 581 setValue(hybridSeq[j],(unsigned int)164); //N=164 such that &3 == 0 582 else 583 setValue(hybridSeq[j],(unsigned int)((ordValue(qual[j]) - 33) * 4 + ordValue(seq[j]))); 584 */ 585 if (options.trimLength > 0 && length(seq) > (unsigned)options.trimLength) 586 resize(seq, options.trimLength); 587 if((int)length(seq) > options.maxReadLength) 588 options.maxReadLength = length(seq); 589 590 appendValue(reads, seq, Generous()); 591 } 592 593 if (options._debugLevel > 1 && kickoutcount > 0) 594 ::std::cerr << "Ignoring " << kickoutcount << " low quality reads.\n"; 595 return !empty(reads); 596 } 597 598 ////////////////////////////////////////////////////////////////////////////// 599 // Read the first sequence of a multi-sequence file 600 // and return its length 601 inline int estimateReadLength(char const *fname) 602 { 603 SeqFileIn seqFile; 604 if (!open(seqFile, fname) || atEnd(seqFile)) 605 return RAZERS_READS_FAILED; 606 607 // parse record from file 608 CharString fastaId, seq; 609 readRecord(fastaId, seq, seqFile); 610 return length(seq); 611 } 612 613 ////////////////////////////////////////////////////////////////////////////// 614 // Load multi-Fasta sequences from multiple files 615 template <typename TGenomeSet> 616 bool loadGenomes(TGenomeSet &genomes, StringSet<CharString> &fileNameList) 617 { 618 SeqFileIn seqFile; 619 StringSet<CharString> ids; 620 for (size_t i = 0; i != length(fileNameList); ++i) 621 { 622 if (!open(seqFile, toCString(fileNameList[i]))) 623 return false; 624 625 readRecords(ids, genomes, seqFile); 626 clear(ids); 627 close(seqFile); 628 } 629 return length(genomes); 630 } 631 632 #ifdef RAZERS_MICRO_RNA 633 template <typename TReadMatch> 634 struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > 635 { 636 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 637 { 638 // read number 639 if (a.rseqNo < b.rseqNo) return true; 640 if (a.rseqNo > b.rseqNo) return false; 641 642 // genome position and orientation 643 if (a.gseqNo < b.gseqNo) return true; 644 if (a.gseqNo > b.gseqNo) return false; 645 if (a.gBegin < b.gBegin) return true; 646 if (a.gBegin > b.gBegin) return false; 647 if (a.orientation < b.orientation) return true; 648 if (a.orientation > b.orientation) return false; 649 650 651 if(a.editDist < b.editDist) return true; 652 if(a.editDist > b.editDist) return false; 653 return (a.mScore > b.mScore); 654 } 655 }; 656 657 template <typename TReadMatch> 658 struct LessRNoEdistHLen : public ::std::binary_function < TReadMatch, TReadMatch, bool > 659 { 660 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 661 { 662 // read number 663 if (a.rseqNo < b.rseqNo) return true; 664 if (a.rseqNo > b.rseqNo) return false; 665 666 if(a.editDist < b.editDist) return true; 667 if(a.editDist > b.editDist) return false; 668 return (a.mScore > b.mScore); 669 670 } 671 }; 672 #else 673 674 675 template <typename TReadMatch> 676 struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > 677 { 678 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 679 { 680 // read number 681 if (a.rseqNo < b.rseqNo) return true; 682 if (a.rseqNo > b.rseqNo) return false; 683 684 // genome position and orientation 685 if (a.gseqNo < b.gseqNo) return true; 686 if (a.gseqNo > b.gseqNo) return false; 687 if (a.gBegin < b.gBegin) return true; 688 if (a.gBegin > b.gBegin) return false; 689 if (a.orientation < b.orientation) return true; 690 if (a.orientation > b.orientation) return false; 691 692 // quality 693 #ifdef RAZERS_MATEPAIRS 694 return a.pairScore > b.pairScore; 695 #else 696 return a.editDist < b.editDist; 697 #endif 698 } 699 }; 700 #endif 701 702 // ... to sort matches and remove duplicates with equal gEnd 703 template <typename TReadMatch> 704 struct LessRNoGEndPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > 705 { 706 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 707 { 708 // read number 709 if (a.rseqNo < b.rseqNo) return true; 710 if (a.rseqNo > b.rseqNo) return false; 711 712 // genome position and orientation 713 if (a.gseqNo < b.gseqNo) return true; 714 if (a.gseqNo > b.gseqNo) return false; 715 if (a.gEnd < b.gEnd) return true; 716 if (a.gEnd > b.gEnd) return false; 717 if (a.orientation < b.orientation) return true; 718 if (a.orientation > b.orientation) return false; 719 720 // quality 721 #ifdef RAZERS_MATEPAIRS 722 return a.pairScore > b.pairScore; 723 #else 724 return a.editDist < b.editDist; 725 #endif 726 } 727 }; 728 729 template <typename TReadMatch> 730 struct LessErrors : public ::std::binary_function < TReadMatch, TReadMatch, bool > 731 { 732 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 733 { 734 // read number 735 if (a.rseqNo < b.rseqNo) return true; 736 if (a.rseqNo > b.rseqNo) return false; 737 738 // quality 739 #ifdef RAZERS_MATEPAIRS 740 return a.pairScore > b.pairScore; 741 #else 742 return a.editDist < b.editDist; 743 #endif 744 } 745 }; 746 747 template <typename TReadMatch> 748 struct LessSplicedScore : public ::std::binary_function < TReadMatch, TReadMatch, bool > 749 { 750 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 751 { 752 // read number 753 if (a.rseqNo < b.rseqNo ) return true; 754 if (a.rseqNo > b.rseqNo ) return false; 755 //if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true; 756 //if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false; 757 758 // quality 759 if (a.pairScore > b.pairScore) return true; 760 if (a.pairScore < b.pairScore) return false; 761 //CHECK: if (abs(a.mateDelta) < abs(b.mateDelta)) return true; 762 //CHECK: if (abs(a.mateDelta) > abs(b.mateDelta)) return falsee; 763 764 return a.pairId < b.pairId; 765 } 766 }; 767 768 769 template <typename TReadMatch> 770 struct LessSplicedScoreGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > 771 { 772 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 773 { 774 // read number 775 if (a.rseqNo < b.rseqNo ) return true; 776 if (a.rseqNo > b.rseqNo ) return false; 777 //if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true; 778 //if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false; 779 780 // quality 781 if (a.pairScore > b.pairScore) return true; 782 if (a.pairScore < b.pairScore) return false; 783 if (a.pairId < b.pairId) return true; 784 if (a.pairId > b.pairId) return false; 785 if (a.gBegin < b.gBegin) return true; 786 if (a.gBegin > b.gBegin) return false; 787 return a.editDist < b.editDist; 788 } 789 }; 790 791 792 793 #ifdef RAZERS_DIRECT_MAQ_MAPPING 794 795 struct QualityBasedScoring{}; 796 797 template <typename TReadMatch> 798 struct LessRNoMQ : public ::std::binary_function < TReadMatch, TReadMatch, bool > 799 { 800 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 801 { 802 // read number 803 if (a.rseqNo < b.rseqNo) return true; 804 if (a.rseqNo > b.rseqNo) return false; 805 806 // quality 807 if (a.mScore < b.mScore) return true; // sum of quality values of mismatches (the smaller the better) 808 if (a.mScore > b.mScore) return false; 809 810 return (a.editDist < b.editDist); // seedEditDist? 811 // genome position and orientation 812 /* if (a.gseqNo < b.gseqNo) return true; 813 if (a.gseqNo > b.gseqNo) return false; 814 if (a.gBegin < b.gBegin) return true; 815 if (a.gBegin > b.gBegin) return false; 816 if (a.orientation < b.orientation) return true; 817 if (a.orientation > b.orientation) return false; 818 */ 819 } 820 }; 821 #endif 822 823 ////////////////////////////////////////////////////////////////////////////// 824 // Remove duplicate matches and leave at most maxHits many distanceRange 825 // best matches per read 826 template < typename TMatches > 827 void maskDuplicates(TMatches &matches) 828 { 829 typedef typename Value<TMatches>::Type TMatch; 830 typedef typename Iterator<TMatches, Standard>::Type TIterator; 831 832 ////////////////////////////////////////////////////////////////////////////// 833 // remove matches with equal ends 834 835 ::std::sort( 836 begin(matches, Standard()), 837 end(matches, Standard()), 838 LessRNoGEndPos<TMatch>()); 839 840 typename TMatch::TGPos gBegin = -1; 841 typename TMatch::TGPos gEnd = -1; 842 unsigned gseqNo = -1; 843 unsigned readNo = -1; 844 char orientation = '-'; 845 846 TIterator it = begin(matches, Standard()); 847 TIterator itEnd = end(matches, Standard()); 848 849 for (; it != itEnd; ++it) 850 { 851 #ifdef RAZERS_MATEPAIRS 852 if ((*it).pairId != 0) continue; 853 #endif 854 if (gEnd == (*it).gEnd && orientation == (*it).orientation && 855 gseqNo == (*it).gseqNo && readNo == (*it).rseqNo) 856 { 857 (*it).orientation = '-'; 858 continue; 859 } 860 readNo = (*it).rseqNo; 861 gseqNo = (*it).gseqNo; 862 gEnd = (*it).gEnd; 863 orientation = (*it).orientation; 864 } 865 866 ////////////////////////////////////////////////////////////////////////////// 867 // remove matches with equal begins 868 869 ::std::sort( 870 begin(matches, Standard()), 871 end(matches, Standard()), 872 LessRNoGPos<TMatch>()); 873 874 orientation = '-'; 875 876 it = begin(matches, Standard()); 877 itEnd = end(matches, Standard()); 878 879 for (; it != itEnd; ++it) 880 { 881 if ((*it).orientation == '-' 882 #ifdef RAZERS_MATEPAIRS 883 || ((*it).pairId != 0) 884 #endif 885 ) continue; 886 if (gBegin == (*it).gBegin && readNo == (*it).rseqNo && 887 gseqNo == (*it).gseqNo && orientation == (*it).orientation) 888 { 889 (*it).orientation = '-'; 890 continue; 891 } 892 readNo = (*it).rseqNo; 893 gseqNo = (*it).gseqNo; 894 gBegin = (*it).gBegin; 895 orientation = (*it).orientation; 896 } 897 898 ::std::sort( 899 begin(matches, Standard()), 900 end(matches, Standard()), 901 LessErrors<TMatch>()); 902 } 903 904 ////////////////////////////////////////////////////////////////////////////// 905 // Count matches for each number of errors 906 template < typename TMatches, typename TCounts > 907 void countMatches(TMatches &matches, TCounts &cnt) 908 { 909 //typedef typename Value<TMatches>::Type TMatch; 910 typedef typename Iterator<TMatches, Standard>::Type TIterator; 911 typedef typename Value<TCounts>::Type TRow; 912 typedef typename Value<TRow>::Type TValue; 913 914 TIterator it = begin(matches, Standard()); 915 TIterator itEnd = end(matches, Standard()); 916 917 unsigned readNo = -1; 918 short editDist = -1; 919 int64_t count = 0; 920 int64_t maxVal = std::numeric_limits<TValue>::max(); 921 922 for (; it != itEnd; ++it) 923 { 924 if ((*it).orientation == '-') continue; 925 if (readNo == (*it).rseqNo && editDist == (*it).editDist) 926 ++count; 927 else 928 { 929 if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt)) 930 cnt[editDist][readNo] = (maxVal < count)? maxVal : count; 931 readNo = (*it).rseqNo; 932 editDist = (*it).editDist; 933 count = 1; 934 } 935 } 936 if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt)) 937 cnt[editDist][readNo] = count; 938 939 940 } 941 942 ////////////////////////////////////////////////////////////////////////////// 943 944 template < typename TReadNo, typename TMaxErrors > 945 inline void 946 setMaxErrors(Nothing &, TReadNo, TMaxErrors) 947 { 948 } 949 950 template < typename TSwift, typename TReadNo, typename TMaxErrors > 951 inline void 952 setMaxErrors(TSwift &swift, TReadNo readNo, TMaxErrors maxErrors) 953 { 954 int minT = _qgramLemma(swift, readNo, maxErrors); 955 if (minT > 1) 956 { 957 if (maxErrors < 0) minT = std::numeric_limits<int>::max(); 958 // ::std::cout<<" read:"<<readNo<<" newThresh:"<<minT; 959 setMinThreshold(swift, readNo, (unsigned)minT); 960 } 961 } 962 963 ////////////////////////////////////////////////////////////////////////////// 964 // Remove low quality matches 965 template < typename TMatches, typename TCounts, typename TSpec, typename TSwift > 966 void compactMatches(TMatches &matches, TCounts & 967 #ifdef RAZERS_DIRECT_MAQ_MAPPING 968 cnts 969 #endif 970 , RazerSOptions<TSpec> &options 971 , bool compactFinal , 972 TSwift & 973 #if defined RAZERS_DIRECT_MAQ_MAPPING || defined RAZERS_MASK_READS 974 swift 975 #endif 976 ) 977 { 978 // Get rid of "unused variable" warnings. This is hard to read 979 // and should not be done anywhere else. Better not use ifdefs. 980 (void)compactFinal; 981 #ifdef RAZERS_DIRECT_MAQ_MAPPING 982 if(options.maqMapping) compactMatches(matches, cnts,options,compactFinal,swift,true); 983 #endif 984 //typedef typename Value<TMatches>::Type TMatch; 985 typedef typename Iterator<TMatches, Standard>::Type TIterator; 986 987 #ifdef RAZERS_MICRO_RNA 988 if(options.microRNA) 989 ::std::sort( 990 begin(matches, Standard()), 991 end(matches, Standard()), 992 LessRNoEdistHLen<TMatch>()); 993 int bestMScore = 0; 994 #endif 995 996 unsigned readNo = -1; 997 unsigned hitCount = 0; 998 unsigned hitCountCutOff = options.maxHits; 999 int64_t disabled = 0; 1000 #ifdef RAZERS_MICRO_RNA 1001 if(options.microRNA && options.purgeAmbiguous) 1002 ++hitCountCutOff; // we keep one more match than we actually want, so we can later decide 1003 // whether the read mapped more than maxhits times 1004 #endif 1005 int editDistCutOff = std::numeric_limits<int>::max(); 1006 1007 TIterator it = begin(matches, Standard()); 1008 TIterator itEnd = end(matches, Standard()); 1009 TIterator dit = it; 1010 TIterator ditBeg = it; 1011 1012 for (; it != itEnd; ++it) 1013 { 1014 if ((*it).orientation == '-') continue; 1015 if (readNo == (*it).rseqNo 1016 #ifdef RAZERS_MATEPAIRS 1017 && (*it).pairId == 0 1018 #endif 1019 ) 1020 { 1021 if ((*it).editDist >= editDistCutOff) continue; 1022 #ifdef RAZERS_MICRO_RNA 1023 if ( (*it).mScore < bestMScore) continue; 1024 #endif 1025 if (++hitCount >= hitCountCutOff) 1026 { 1027 #ifdef RAZERS_MASK_READS 1028 if (hitCount == hitCountCutOff) 1029 { 1030 // we have enough, now look for better matches 1031 int maxErrors = (*it).editDist - 1; 1032 if (options.purgeAmbiguous && (options.distanceRange == 0 || (*it).editDist < options.distanceRange)) 1033 maxErrors = -1; 1034 1035 setMaxErrors(swift, readNo, maxErrors); 1036 1037 if (maxErrors == -1) 1038 ++disabled; 1039 // ::std::cerr << "(read #" << readNo << " disabled)"; 1040 1041 if (options.purgeAmbiguous) 1042 { 1043 if (options.distanceRange == 0 || (*it).editDist < options.distanceRange || compactFinal) 1044 dit = ditBeg; 1045 else { 1046 *dit = *it; 1047 ++dit; 1048 } 1049 } 1050 } 1051 #endif 1052 continue; 1053 } 1054 } 1055 else 1056 { 1057 readNo = (*it).rseqNo; 1058 hitCount = 0; 1059 if (options.distanceRange > 0) 1060 editDistCutOff = (*it).editDist + options.distanceRange; 1061 #ifdef RAZERS_MICRO_RNA 1062 bestMScore = (*it).mScore; 1063 #endif 1064 ditBeg = dit; 1065 } 1066 *dit = *it; 1067 ++dit; 1068 } 1069 if (options._debugLevel >= 2) 1070 { 1071 std::cerr << " [" << length(matches) - (dit - begin(matches, Standard())) << " matches removed]"; 1072 std::cerr << " [" << disabled << " reads disabled]"; 1073 } 1074 1075 resize(matches, dit - begin(matches, Standard())); 1076 } 1077 1078 1079 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1080 ////////////////////////////////////////////////////////////////////////////// 1081 // Remove low quality matches 1082 template < typename TMatches, typename TCounts, typename TSpec, typename TSwift > 1083 void compactMatches(TMatches &matches, TCounts &cnts, RazerSOptions<TSpec> &, bool, TSwift &, bool dontCountFirstTwo) 1084 { 1085 typedef typename Value<TMatches>::Type TMatch; 1086 typedef typename Iterator<TMatches, Standard>::Type TIterator; 1087 1088 ::std::sort( 1089 begin(matches, Standard()), 1090 end(matches, Standard()), 1091 LessRNoMQ<TMatch>()); 1092 1093 unsigned readNo = -1; 1094 1095 TIterator it = begin(matches, Standard()); 1096 TIterator itEnd = end(matches, Standard()); 1097 TIterator dit = it; 1098 1099 //number of errors may not exceed 31! 1100 bool second = true; 1101 for (; it != itEnd; ++it) 1102 { 1103 if ((*it).orientation == '-') continue; 1104 if (readNo == (*it).rseqNo) 1105 { 1106 //second best match 1107 if (second) 1108 { 1109 second = false; 1110 if((cnts[1][(*it).rseqNo] & 31) > (*it).editDist) 1111 { 1112 //this second best match is better than any second best match before 1113 cnts[1][(*it).rseqNo] = (*it).editDist; // second best dist is this editDist 1114 // count is 0 (will be updated if countFirstTwo) 1115 } 1116 if(!dontCountFirstTwo) 1117 if((cnts[1][(*it).rseqNo]>>5) != 2047) cnts[1][(*it).rseqNo] += 32; 1118 } 1119 else 1120 { 1121 if ((*it).editDist <= (cnts[0][(*it).rseqNo] & 31) ) 1122 if(cnts[0][(*it).rseqNo]>>5 != 2047) 1123 cnts[0][(*it).rseqNo] +=32; 1124 if ((*it).editDist <= (cnts[1][(*it).rseqNo] & 31) ) 1125 if((cnts[1][(*it).rseqNo]>>5) != 2047) 1126 cnts[1][(*it).rseqNo] +=32; 1127 continue; 1128 } 1129 } else 1130 { //best match 1131 second = true; 1132 readNo = (*it).rseqNo; 1133 //cnts has 16bits, 11:5 for count:dist 1134 if((cnts[0][(*it).rseqNo] & 31) > (*it).editDist) 1135 { 1136 //this match is better than any match before 1137 cnts[1][(*it).rseqNo] = cnts[0][(*it).rseqNo]; // best before is now second best 1138 // (count will be updated when match is removed) 1139 cnts[0][(*it).rseqNo] = (*it).editDist; // best dist is this editDist 1140 // count is 0 (will be updated if countFirstTwo) 1141 } 1142 if(!dontCountFirstTwo) 1143 if((cnts[0][(*it).rseqNo]>>5) != 2047) cnts[0][(*it).rseqNo] += 32; // shift 5 to the right, add 1, shift 5 to the left, and keep count 1144 } 1145 *dit = *it; 1146 ++dit; 1147 } 1148 1149 resize(matches, dit - begin(matches, Standard())); 1150 } 1151 #endif 1152 1153 1154 #ifdef RAZERS_MICRO_RNA 1155 1156 template < typename TMatches, typename TSpec > 1157 void purgeAmbiguousRnaMatches(TMatches &matches, RazerSOptions<TSpec> &options) 1158 { 1159 typedef typename Value<TMatches>::Type TMatch; 1160 typedef typename Iterator<TMatches, Standard>::Type TIterator; 1161 1162 ::std::sort(begin(matches, Standard()),end(matches, Standard()),LessRNoEdistHLen<TMatch>()); 1163 int bestMScore = 0; 1164 1165 unsigned readNo = -1; 1166 unsigned hitCount = 0; 1167 unsigned hitCountCutOff = options.maxHits; 1168 int editDistCutOff = std::numeric_limits<int>::max(); 1169 1170 TIterator it = begin(matches, Standard()); 1171 TIterator itEnd = end(matches, Standard()); 1172 TIterator dit = it; 1173 TIterator ditBeg = it; 1174 1175 for (; it != itEnd; ++it) 1176 { 1177 if ((*it).orientation == '-') continue; 1178 if (readNo == (*it).rseqNo) 1179 { 1180 if ((*it).editDist >= editDistCutOff) continue; 1181 if ( (*it).mScore < bestMScore) continue; 1182 if (++hitCount >= hitCountCutOff) 1183 { 1184 if (hitCount == hitCountCutOff) 1185 dit = ditBeg; 1186 continue; 1187 } 1188 } 1189 else 1190 { 1191 readNo = (*it).rseqNo; 1192 hitCount = 0; 1193 if (options.distanceRange > 0) 1194 editDistCutOff = (*it).editDist + options.distanceRange; 1195 bestMScore = (*it).mScore; 1196 ditBeg = dit; 1197 } 1198 *dit = *it; 1199 ++dit; 1200 } 1201 resize(matches, dit - begin(matches, Standard())); 1202 } 1203 1204 1205 1206 ////////////////////////////////////////////////////////////////////////////// 1207 // Hamming verification recording sum of mismatching qualities in m.mScore 1208 template < 1209 typename TMatch, 1210 typename TGenome, 1211 typename TReadSet, 1212 typename TSpec > 1213 inline bool 1214 matchVerify( 1215 TMatch &m, // resulting match 1216 Segment<TGenome, InfixSegment> inf, // potential match genome region 1217 unsigned rseqNo, // read number 1218 TReadSet &readSet, // reads 1219 RazerSOptions<TSpec> const &options, // RazerS options 1220 MicroRNA) // MaqMapping 1221 { 1222 1223 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1224 typedef typename Value<TReadSet>::Type const TRead; 1225 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator; 1226 typedef typename Iterator<TRead, Standard>::Type TReadIterator; 1227 1228 unsigned ndlLength = sequenceLength(rseqNo, readSet); 1229 if (length(inf) < ndlLength) return false; 1230 1231 // verify 1232 TRead &read = readSet[rseqNo]; 1233 TReadIterator ritBeg = begin(read, Standard()); 1234 TReadIterator ritEnd = end(read, Standard()); 1235 TGenomeIterator git = begin(inf, Standard()); 1236 TGenomeIterator gitEnd = end(inf, Standard()) - (ndlLength - 1); 1237 1238 // this is max number of errors the seed should have 1239 unsigned maxErrorsSeed = (unsigned)(options.rnaSeedLength * options.errorRate); 1240 unsigned minSeedErrors = maxErrorsSeed + 1; 1241 unsigned bestHitLength = 0; 1242 1243 for (; git < gitEnd; ++git) 1244 { 1245 bool hit = true; 1246 unsigned hitLength = 0; 1247 unsigned count = 0; 1248 unsigned seedErrors = 0; 1249 TGenomeIterator g = git; //maq would count errors in the first 28bp only (at least initially. not for output) 1250 for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g) 1251 { 1252 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0) 1253 { 1254 if (count < options.rnaSeedLength) // the maq (28bp-)seed 1255 { 1256 if(++seedErrors > maxErrorsSeed) 1257 { 1258 hit = false; 1259 break; 1260 } 1261 } 1262 else 1263 break; 1264 } 1265 ++count; 1266 } 1267 if (hit) hitLength = count; 1268 if (hitLength > bestHitLength ) //simply take the longest hit 1269 { 1270 minSeedErrors = seedErrors; 1271 bestHitLength = hitLength; 1272 m.gBegin = git - begin(host(inf), Standard()); 1273 } 1274 } 1275 1276 // std::cout << "options.absMaxQualSumErrors" << options.absMaxQualSumErrors << std::endl; 1277 // std::cout << "maxSeedErrors" << maxErrorsSeed << std::endl; 1278 // std::cout << "minErrors" << minSeedErrors << std::endl; 1279 // if(derBesgte) ::std::cout << minErrors <<"minErrors\n"; 1280 if (minSeedErrors > maxErrorsSeed) return false; 1281 1282 m.gEnd = m.gBegin + bestHitLength; 1283 m.editDist = minSeedErrors; // errors in seed or total number of errors? 1284 m.mScore = bestHitLength; 1285 m.seedEditDist = minSeedErrors; 1286 return true; 1287 } 1288 1289 #endif 1290 1291 1292 1293 ////////////////////////////////////////////////////////////////////////////// 1294 // Hamming verification 1295 template < 1296 typename TMatch, 1297 typename TGenome, 1298 typename TReadSet, 1299 typename TMyersPatterns, 1300 typename TSpec > 1301 inline bool 1302 matchVerify( 1303 TMatch &m, // resulting match 1304 Segment<TGenome, InfixSegment> inf, // potential match genome region 1305 unsigned rseqNo, // read number 1306 TReadSet &readSet, // reads 1307 TMyersPatterns const &, // MyersBitVector preprocessing data 1308 RazerSOptions<TSpec> const &options, // RazerS options 1309 SwiftSemiGlobalHamming) // Hamming only 1310 { 1311 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1312 if(options.maqMapping) 1313 return matchVerify(m,inf,rseqNo,readSet,options,QualityBasedScoring()); 1314 #endif 1315 1316 #ifdef RAZERS_MICRO_RNA 1317 if(options.microRNA) 1318 return matchVerify(m,inf,rseqNo,readSet,options,MicroRNA()); 1319 #endif 1320 1321 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1322 typedef typename Value<TReadSet>::Type const TRead; 1323 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator; 1324 typedef typename Iterator<TRead, Standard>::Type TReadIterator; 1325 1326 #ifdef RAZERS_DEBUG 1327 ::std::cout<<"Verify: "<<::std::endl; 1328 ::std::cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl; 1329 ::std::cout<<"Read: "<<readSet[rseqNo] << ::std::endl; 1330 #endif 1331 1332 unsigned ndlLength = sequenceLength(rseqNo, readSet); 1333 if (length(inf) < ndlLength) return false; 1334 1335 // verify 1336 TRead &read = readSet[rseqNo]; 1337 TReadIterator ritBeg = begin(read, Standard()); 1338 TReadIterator ritEnd = end(read, Standard()); 1339 TGenomeIterator git = begin(inf, Standard()); 1340 TGenomeIterator gitEnd = end(inf, Standard()) - (ndlLength - 1); 1341 1342 unsigned maxErrors = (unsigned)(ndlLength * options.errorRate); 1343 unsigned minErrors = maxErrors + 1; 1344 1345 for (; git < gitEnd; ++git) 1346 { 1347 unsigned errors = 0; 1348 TGenomeIterator g = git; 1349 for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g) 1350 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0) 1351 if (++errors > maxErrors) 1352 break; 1353 if (minErrors > errors) 1354 { 1355 minErrors = errors; 1356 m.gBegin = git - begin(host(inf), Standard()); 1357 } 1358 } 1359 1360 if (minErrors > maxErrors) return false; 1361 1362 m.gEnd = m.gBegin + ndlLength; 1363 m.editDist = minErrors; 1364 return true; 1365 } 1366 1367 1368 ////////////////////////////////////////////////////////////////////////////// 1369 // Edit distance verification 1370 template < 1371 typename TMatch, 1372 typename TGenome, 1373 typename TReadSet, 1374 typename TMyersPatterns, 1375 typename TSpec > 1376 inline bool 1377 matchVerify( 1378 TMatch &m, // resulting match 1379 Segment<TGenome, InfixSegment> inf, // potential match genome region 1380 unsigned rseqNo, // read number 1381 TReadSet &readSet, // reads 1382 TMyersPatterns &forwardPatterns, // MyersBitVector preprocessing data 1383 RazerSOptions<TSpec> const &options, // RazerS options 1384 SwiftSemiGlobal) // Swift specialization 1385 { 1386 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1387 typedef typename Value<TReadSet>::Type TRead; 1388 1389 // find read match end 1390 typedef Finder<TGenomeInfix> TMyersFinder; 1391 typedef typename Value<TMyersPatterns>::Type TMyersPattern; 1392 1393 // find read match begin 1394 typedef ModifiedString<TGenomeInfix, ModReverse> TGenomeInfixRev; 1395 typedef ModifiedString<TRead, ModReverse> TReadRev; 1396 typedef Finder<TGenomeInfixRev> TMyersFinderRev; 1397 typedef Pattern<TReadRev, MyersUkkonenGlobal> TMyersPatternRev; 1398 1399 TMyersFinder myersFinder(inf); 1400 TMyersPattern &myersPattern = forwardPatterns[rseqNo]; 1401 1402 #ifdef RAZERS_DEBUG 1403 ::std::cout<<"Verify: "<<::std::endl; 1404 ::std::cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl; 1405 ::std::cout<<"Read: "<<readSet[rseqNo]<<::std::endl; 1406 #endif 1407 1408 unsigned ndlLength = sequenceLength(rseqNo, readSet); 1409 int maxScore = std::numeric_limits<int>::min(); 1410 int minScore = -(int)(ndlLength * options.errorRate); 1411 TMyersFinder maxPos; 1412 1413 // find end of best semi-global alignment 1414 while (find(myersFinder, myersPattern, minScore)) 1415 if (maxScore <= getScore(myersPattern)) 1416 { 1417 maxScore = getScore(myersPattern); 1418 maxPos = myersFinder; 1419 } 1420 1421 if (maxScore < minScore) return false; 1422 m.editDist = (unsigned)-maxScore; 1423 setEndPosition(inf, m.gEnd = (beginPosition(inf) + position(maxPos) + 1)); 1424 1425 // limit the beginning to needle length plus errors (== -maxScore) 1426 if (length(inf) > ndlLength - maxScore) 1427 setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore); 1428 1429 // find beginning of best semi-global alignment 1430 TGenomeInfixRev infRev(inf); 1431 TReadRev readRev(readSet[rseqNo]); 1432 TMyersFinderRev myersFinderRev(infRev); 1433 TMyersPatternRev myersPatternRev(readRev); 1434 1435 _patternMatchNOfPattern(myersPatternRev, options.matchN); 1436 _patternMatchNOfFinder(myersPatternRev, options.matchN); 1437 while (find(myersFinderRev, myersPatternRev, maxScore)) 1438 m.gBegin = m.gEnd - (position(myersFinderRev) + 1); 1439 1440 return true; 1441 } 1442 1443 1444 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1445 ////////////////////////////////////////////////////////////////////////////// 1446 // Hamming verification recording sum of mismatching qualities in m.mScore 1447 template < 1448 typename TMatch, 1449 typename TGenome, 1450 typename TReadSet, 1451 typename TSpec > 1452 inline bool 1453 matchVerify( 1454 TMatch &m, // resulting match 1455 Segment<TGenome, InfixSegment> inf, // potential match genome region 1456 unsigned rseqNo, // read number 1457 TReadSet &readSet, // reads 1458 RazerSOptions<TSpec> const &options, // RazerS options 1459 QualityBasedScoring) // MaqMapping 1460 { 1461 1462 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1463 typedef typename Value<TReadSet>::Type const TRead; 1464 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator; 1465 typedef typename Iterator<TRead, Standard>::Type TReadIterator; 1466 1467 // #ifdef RAZERS_DEBUG 1468 // cout<<"Verify: "<<::std::endl; 1469 // cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl; 1470 // cout<<"Read: "<<host(myersPattern)<<::std::endl; 1471 // #endif 1472 1473 // bool derBesgte = false; 1474 //if(rseqNo == 2) derBesgte = true; 1475 // if(derBesgte) ::std::cout << "der besagte\n"; 1476 unsigned ndlLength = sequenceLength(rseqNo, readSet); 1477 if (length(inf) < ndlLength) return false; 1478 1479 // verify 1480 TRead &read = readSet[rseqNo]; 1481 TReadIterator ritBeg = begin(read, Standard()); 1482 TReadIterator ritEnd = end(read, Standard()); 1483 TGenomeIterator git = begin(inf, Standard()); 1484 TGenomeIterator gitEnd = end(inf, Standard()) - (ndlLength - 1); 1485 TGenomeIterator bestIt = begin(inf, Standard()); 1486 1487 // this is max number of errors the 28bp 'seed' should have 1488 //assuming that maxErrors-1 matches can be found with 100% SN 1489 unsigned maxErrorsSeed = (unsigned)(options.artSeedLength * options.errorRate) + 1; 1490 unsigned maxErrorsTotal = (unsigned)(ndlLength * 0.25); //options.maxErrorRate); 1491 unsigned minErrors = maxErrorsTotal + 1; 1492 int minQualSumErrors = options.absMaxQualSumErrors + 10; 1493 unsigned minSeedErrors = maxErrorsSeed + 1; 1494 1495 for (; git < gitEnd; ++git) 1496 { 1497 bool hit = true; 1498 unsigned errors = 0; 1499 unsigned count = 0; 1500 unsigned seedErrors = 0; 1501 int qualSumErrors = 0; 1502 TGenomeIterator g = git; //maq would count errors in the first 28bp only (at least initially. not for output) 1503 for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g) 1504 { 1505 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0) 1506 { 1507 // ::std::cout << count << "<-"; 1508 if (++errors > maxErrorsTotal) 1509 { 1510 hit = false; 1511 break; 1512 } 1513 //int qualityValue = (int)((unsigned char)*r >> 3); 1514 //qualSumErrors += (qualityValue < options.mutationRateQual) ? qualityValue : options.mutationRateQual; 1515 qualSumErrors += (getQualityValue(*r) < options.mutationRateQual) ? getQualityValue(*r) : options.mutationRateQual; 1516 if(qualSumErrors > options.absMaxQualSumErrors || qualSumErrors > minQualSumErrors) 1517 { 1518 hit = false; 1519 break; 1520 } 1521 if (count < options.artSeedLength) // the maq (28bp-)seed 1522 { 1523 if(++seedErrors > maxErrorsSeed) 1524 { 1525 hit = false; 1526 break; 1527 } 1528 if(qualSumErrors > options.maxMismatchQualSum) 1529 { 1530 hit = false; 1531 break; 1532 }// discard match, if 'seed' is bad (later calculations are done with the quality sum over the whole read) 1533 } 1534 } 1535 ++count; 1536 } 1537 if (hit && (qualSumErrors < minQualSumErrors /*&& seedErrors <=maxErrorsSeed*/) ) //oder (seedErrors < minSeedErrors) 1538 { 1539 minErrors = errors; 1540 minSeedErrors = seedErrors; 1541 minQualSumErrors = qualSumErrors; 1542 m.gBegin = git - begin(host(inf), Standard()); 1543 bestIt = git; 1544 } 1545 } 1546 if (minQualSumErrors > options.absMaxQualSumErrors || minSeedErrors > maxErrorsSeed || minErrors > maxErrorsTotal) return false; 1547 1548 m.gEnd = m.gBegin + ndlLength; 1549 m.editDist = minErrors; // errors in seed or total number of errors? 1550 m.mScore = minQualSumErrors; 1551 m.seedEditDist = minSeedErrors; 1552 return true; 1553 } 1554 1555 1556 ////////////////////////////////////////////////////////////////////////////// 1557 // Edit distance verification 1558 template < 1559 typename TMatch, 1560 typename TGenome, 1561 typename TReadSet, 1562 typename TMyersPatterns, 1563 typename TSpec > 1564 inline bool 1565 matchVerify( 1566 TMatch &m, // resulting match 1567 Segment<TGenome, InfixSegment> inf, // potential match genome region 1568 unsigned rseqNo, // read number 1569 TReadSet &readSet, // reads 1570 TMyersPatterns const & pat, // MyersBitVector preprocessing data 1571 RazerSOptions<TSpec> const &options, // RazerS options 1572 SwiftSemiGlobal const &swiftsemi, // Hamming only 1573 QualityBasedScoring) // Swift specialization 1574 { 1575 //if(!options.maqMapping) 1576 return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi); 1577 //else 1578 // return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi); // todo! 1579 } 1580 #endif 1581 1582 1583 1584 1585 #ifndef RAZERS_PARALLEL 1586 ////////////////////////////////////////////////////////////////////////////// 1587 // Find read matches in one genome sequence 1588 template < 1589 typename TMatches, 1590 typename TGenome, 1591 typename TReadIndex, 1592 typename TSwiftSpec, 1593 typename TVerifier, 1594 typename TCounts, 1595 typename TSpec > 1596 void mapSingleReads( 1597 TMatches &matches, // resulting matches 1598 TGenome &genome, // genome ... 1599 unsigned gseqNo, // ... and its sequence number 1600 Pattern<TReadIndex, Swift<TSwiftSpec> > &swiftPattern, 1601 TVerifier &forwardPatterns, 1602 TCounts & cnts, 1603 char orientation, // q-gram index of reads 1604 RazerSOptions<TSpec> &options) 1605 { 1606 typedef typename Fibre<TReadIndex, FibreText>::Type TReadSet; 1607 typedef typename Size<TGenome>::Type TSize; 1608 typedef typename Value<TMatches>::Type TMatch; 1609 1610 1611 // FILTRATION 1612 typedef Finder<TGenome, Swift<TSwiftSpec> > TSwiftFinder; 1613 //typedef Pattern<TReadIndex, Swift<TSwiftSpec> > TSwiftPattern; 1614 1615 // iterate all genomic sequences 1616 if (options._debugLevel >= 1) 1617 { 1618 ::std::cerr << ::std::endl << "Process genome seq #" << gseqNo; 1619 if (orientation == 'F') 1620 ::std::cerr << "[fwd]"; 1621 else 1622 ::std::cerr << "[rev]"; 1623 } 1624 1625 TReadSet &readSet = host(host(swiftPattern)); 1626 TSwiftFinder swiftFinder(genome, options.repeatLength, 1); 1627 1628 TMatch m = { // to supress uninitialized warnings 1629 0, 0, 0, 0, 1630 #ifdef RAZERS_MATEPAIRS 1631 0, 0, 0, 1632 #endif 1633 0, 1634 #ifdef RAZERS_EXTENDED_MATCH 1635 0, 0, 1636 #endif 1637 #ifdef RAZERS_SPLICED 1638 0, 0, 1639 #endif 1640 0 1641 }; 1642 1643 SEQAN_PROTIMESTART(map_contig_time); 1644 1645 // iterate all verification regions returned by SWIFT 1646 TSize gLength = length(genome); 1647 int64_t localTP = 0; 1648 int64_t localFP = 0; 1649 1650 #ifdef RAZERS_MICRO_RNA 1651 while (find(swiftFinder, swiftPattern, 0.2)) 1652 #else 1653 while (find(swiftFinder, swiftPattern, options.errorRate)) 1654 #endif 1655 { 1656 unsigned rseqNo = (*swiftFinder.curHit).ndlSeqNo; 1657 if (!options.spec.DONT_VERIFY && 1658 matchVerify(m, infix(swiftFinder), rseqNo, readSet, forwardPatterns, options, TSwiftSpec())) 1659 { 1660 // transform coordinates to the forward strand 1661 if (orientation == 'R') 1662 { 1663 TSize temp = m.gBegin; 1664 m.gBegin = gLength - m.gEnd; 1665 m.gEnd = gLength - temp; 1666 } 1667 m.gseqNo = gseqNo; 1668 m.rseqNo = rseqNo; 1669 m.orientation = orientation; 1670 #ifdef RAZERS_MATEPAIRS 1671 m.pairId = 0; 1672 m.pairScore = 0 - m.editDist; 1673 #endif 1674 1675 if (!options.spec.DONT_DUMP_RESULTS) 1676 { 1677 appendValue(matches, m, Generous()); 1678 if (length(matches) > options.compactThresh) 1679 { 1680 #ifndef RAZERS_MICRO_RNA 1681 typename Size<TMatches>::Type oldSize = length(matches); 1682 #endif 1683 maskDuplicates(matches); // overlapping parallelograms cause duplicates 1684 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1685 if(options.maqMapping) 1686 compactMatches(matches, cnts, options, false, swiftPattern, true); 1687 else 1688 #endif 1689 compactMatches(matches, cnts, options, false, swiftPattern); 1690 #ifdef RAZERS_MICRO_RNA 1691 options.compactThresh = 2 * length(matches); 1692 #else 1693 if (length(matches) * 4 > oldSize) // the threshold should not be raised 1694 options.compactThresh += (options.compactThresh >> 1); // if too many matches were removed 1695 #endif 1696 } 1697 } 1698 1699 ++localTP; 1700 // ::std::cerr << "\"" << infix(swiftFinder) << "\" "; 1701 // ::std::cerr << hstkPos << " + "; 1702 // ::std::cerr << ::std::endl; 1703 } else { 1704 ++localFP; 1705 // ::std::cerr << "\"" << infx(swiftFinder) << "\" \"" << infix(swiftPattern) << "\" "; 1706 // ::std::cerr << rseqNo << " : "; 1707 // ::std::cerr << hstkPos << " + "; 1708 // ::std::cerr << bucketWidth << " " << TP << ::std::endl; 1709 } 1710 } 1711 options.TP += localTP; 1712 options.FP += localFP; 1713 if (options._debugLevel >= 2) 1714 { 1715 double spec = 100; 1716 double time = SEQAN_PROTIMEDIFF(map_contig_time); 1717 if (localFP+localTP != 0) 1718 spec = (1000*localTP/(localFP+localTP))/10.0; 1719 ::std::cerr << " [" << (int)((gLength / time)/100.0)/10.0 << "kbp/s " << time << "s] [" << spec << "%SP " << localFP+localTP << "V]"; 1720 } 1721 } 1722 #endif 1723 1724 1725 1726 #ifdef RAZERS_MICRO_RNA 1727 1728 // multiple sequences 1729 template < 1730 typename TSA, 1731 typename TStringSet, 1732 typename TShape, 1733 typename TDir, 1734 typename TValue, 1735 typename TWithConstraints 1736 > 1737 inline void 1738 _qgramFillSuffixArray( 1739 TSA &sa, 1740 TStringSet &stringSet, 1741 TShape &shape, 1742 TDir &dir, 1743 Nothing, 1744 TWithConstraints const, 1745 TValue prefixLen, 1746 MicroRNA) 1747 { 1748 typedef typename Value<TStringSet>::Type TString; 1749 typedef typename Iterator<TString const, Standard>::Type TIterator; 1750 typedef typename Value<TDir>::Type TSize; 1751 typedef typename Value<TShape>::Type THash; 1752 1753 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1754 { 1755 TString const &sequence = value(stringSet, seqNo); 1756 if (length(sequence) < length(shape)) continue; 1757 int num_qgrams = prefixLen - (int)length(shape) + 1; 1758 1759 typename Value<TSA>::Type localPos; 1760 assignValueI1(localPos, seqNo); 1761 assignValueI2(localPos, 0); 1762 1763 TIterator itText = begin(sequence, Standard()); 1764 if (TWithConstraints::VALUE) { 1765 THash h = hash(shape, itText) + 1; // first hash 1766 if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos; // if bucket is enabled 1767 } else 1768 sa[dir[hash(shape, itText) + 1]++] = localPos; // first hash 1769 1770 for(int i = 1; i < num_qgrams; ++i) 1771 { 1772 ++itText; 1773 assignValueI2(localPos, i); 1774 if (TWithConstraints::VALUE) { 1775 THash h = hashNext(shape, itText) + 1; // next hash 1776 if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos; // if bucket is enabled 1777 } else 1778 sa[dir[hashNext(shape, itText) + 1]++] = localPos; // next hash 1779 } 1780 } 1781 } 1782 1783 template < typename TDir, typename TStringSet, typename TShape, typename TValue > 1784 inline void 1785 _qgramCountQGrams(TDir &dir, TStringSet &stringSet, TShape &shape, TValue prefixLen, MicroRNA) 1786 { 1787 typedef typename Value<TStringSet>::Type TString; 1788 typedef typename Iterator<TString const, Standard>::Type TIterator; 1789 typedef typename Value<TDir>::Type TSize; 1790 1791 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1792 { 1793 TString const &sequence = value(stringSet, seqNo); 1794 if (length(sequence) < length(shape)) continue; 1795 TSize num_qgrams = prefixLen - length(shape) + 1; 1796 1797 TIterator itText = begin(sequence, Standard()); 1798 ++dir[hash(shape, itText)]; 1799 for(TSize i = 1; i < num_qgrams; ++i) 1800 { 1801 ++itText; 1802 ++dir[hashNext(shape, itText)]; 1803 } 1804 } 1805 } 1806 1807 1808 template < typename TIndex, typename TValue> 1809 void createQGramIndex(TIndex &index, TValue prefixLen, MicroRNA) 1810 { 1811 typename Fibre<TIndex, QGramText>::Type &text = indexText(index); 1812 typename Fibre<TIndex, QGramSA>::Type &sa = indexSA(index); 1813 typename Fibre<TIndex, QGramDir>::Type &dir = indexDir(index); 1814 typename Fibre<TIndex, QGramShape>::Type &shape = indexShape(index); 1815 1816 Nothing nothing; 1817 1818 // 1. clear counters 1819 arrayFill(begin(dir, Standard()), end(dir, Standard()), 0); 1820 1821 // 2. count q-grams 1822 _qgramCountQGrams(dir, text, shape, prefixLen,MicroRNA()); 1823 1824 if (_qgramDisableBuckets(index)) 1825 { 1826 // 3. cumulative sum 1827 _qgramCummulativeSum(dir, True()); 1828 1829 // 4. fill suffix array 1830 _qgramFillSuffixArray(sa, text, shape, dir, nothing, True(), prefixLen, MicroRNA()); 1831 1832 // 5. correct disabled buckets 1833 _qgramPostprocessBuckets(dir); 1834 } 1835 else 1836 { 1837 // 3. cumulative sum 1838 _qgramCummulativeSum(dir, False()); 1839 1840 // 4. fill suffix array 1841 _qgramFillSuffixArray(sa, text, shape, dir, nothing, False(),prefixLen, MicroRNA()); 1842 } 1843 } 1844 1845 1846 #endif 1847 1848 ////////////////////////////////////////////////////////////////////////////// 1849 // Find read matches in many genome sequences (import from Fasta) 1850 template < 1851 typename TMatches, 1852 typename TReadSet, 1853 typename TCounts, 1854 typename TSpec, 1855 typename TShape, 1856 typename TSwiftSpec > 1857 int mapSingleReads( 1858 TMatches & matches, 1859 StringSet<CharString> & genomeFileNameList, 1860 StringSet<CharString> & genomeNames, // genome names, taken from the Fasta file 1861 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap, 1862 TReadSet const & readSet, 1863 TCounts & cnts, 1864 RazerSOptions<TSpec> & options, 1865 TShape const & shape, 1866 Swift<TSwiftSpec> const) 1867 { 1868 typedef typename Value<TReadSet>::Type TRead; 1869 typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> > TIndex; // q-gram index 1870 typedef Pattern<TIndex, Swift<TSwiftSpec> > TSwiftPattern; // filter 1871 typedef Pattern<TRead, MyersUkkonen> TMyersPattern; // verifier 1872 1873 /* // try opening each genome file once before running the whole mapping procedure 1874 int filecount = 0; 1875 int numFiles = length(genomeFileNameList); 1876 while(filecount < numFiles) 1877 { 1878 ::std::ifstream file; 1879 file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary); 1880 if (!file.is_open()) 1881 return RAZERS_GENOME_FAILED; 1882 file.close(); 1883 ++filecount; 1884 } 1885 */ 1886 1887 // configure q-gram index 1888 TIndex swiftIndex(readSet, shape); 1889 #ifdef RAZERS_OPENADDRESSING 1890 swiftIndex.alpha = 2; 1891 #endif 1892 cargo(swiftIndex).abundanceCut = options.abundanceCut; 1893 cargo(swiftIndex)._debugLevel = options._debugLevel; 1894 1895 // configure Swift 1896 TSwiftPattern swiftPattern(swiftIndex); 1897 swiftPattern.params.minThreshold = options.threshold; 1898 swiftPattern.params.tabooLength = options.tabooLength; 1899 swiftPattern.params.printDots = options._debugLevel > 0; 1900 1901 // init edit distance verifiers 1902 unsigned readCount = countSequences(swiftIndex); 1903 String<TMyersPattern> forwardPatterns; 1904 options.compMask[4] = (options.matchN)? 15: 0; 1905 if (!options.hammingOnly) 1906 { 1907 resize(forwardPatterns, readCount, Exact()); 1908 for(unsigned i = 0; i < readCount; ++i) 1909 { 1910 setHost(forwardPatterns[i], indexText(swiftIndex)[i]); 1911 _patternMatchNOfPattern(forwardPatterns[i], options.matchN); 1912 _patternMatchNOfFinder(forwardPatterns[i], options.matchN); 1913 } 1914 } 1915 #ifdef RAZERS_MICRO_RNA 1916 typename Size<TIndex>::Type qgram_count = 0; 1917 if(options.microRNA) 1918 { 1919 for(unsigned i = 0; i < countSequences(swiftIndex); ++i) 1920 if (sequenceLength(i, swiftIndex) >= options.rnaSeedLength) 1921 qgram_count += options.rnaSeedLength - (length(shape) - 1); 1922 resize(indexSA(swiftIndex), qgram_count, Exact()); 1923 resize(indexDir(swiftIndex), _fullDirLength(swiftIndex), Exact()); 1924 createQGramIndex(swiftIndex,options.rnaSeedLength,MicroRNA()); 1925 } 1926 #endif 1927 1928 1929 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1930 if(options.maqMapping) 1931 { 1932 resize(cnts, 2); 1933 for (unsigned i = 0; i < length(cnts); ++i) 1934 resize(cnts[i], readCount, 31); //initialize with maxeditDist, 11:5 for count:dist 1935 } 1936 #endif 1937 1938 // clear stats 1939 options.FP = 0; 1940 options.TP = 0; 1941 options.timeMapReads = 0; 1942 options.timeDumpResults = 0; 1943 1944 unsigned numFiles = length(genomeFileNameList); 1945 unsigned gseqNo = 0; 1946 1947 // open genome files, one by one 1948 for (unsigned filecount = 0; filecount < numFiles; ++filecount) 1949 { 1950 // open genome file 1951 SeqFileIn file; 1952 if (!open(file, toCString(genomeFileNameList[filecount]))) 1953 return RAZERS_GENOME_FAILED; 1954 1955 // remove the directory prefix of current genome file 1956 ::std::string genomeFile(toCString(genomeFileNameList[filecount])); 1957 size_t lastPos = genomeFile.find_last_of('/') + 1; 1958 if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1; 1959 if (lastPos == genomeFile.npos) lastPos = 0; 1960 ::std::string genomeName = genomeFile.substr(lastPos); 1961 1962 CharString id; 1963 Dna5String genome; 1964 unsigned gseqNoWithinFile = 0; 1965 // iterate over genome sequences 1966 SEQAN_PROTIMESTART(find_time); 1967 for(; !atEnd(file); ++gseqNo) 1968 { 1969 readRecord(id, genome, file); // read Fasta id and sequence 1970 if (options.genomeNaming == 0) 1971 { 1972 cropAfterFirst(id, IsWhitespace()); 1973 appendValue(genomeNames, id, Generous()); 1974 } 1975 1976 gnoToFileMap.insert(::std::make_pair(gseqNo,::std::make_pair(genomeName,gseqNoWithinFile))); 1977 1978 if (options.forward) 1979 mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options); 1980 1981 if (options.reverse) 1982 { 1983 reverseComplement(genome); 1984 mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options); 1985 } 1986 ++gseqNoWithinFile; 1987 1988 } 1989 options.timeMapReads += SEQAN_PROTIMEDIFF(find_time); 1990 } 1991 1992 if (options._debugLevel >= 1) 1993 ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl; 1994 if (options._debugLevel >= 2) { 1995 ::std::cerr << ::std::endl; 1996 ::std::cerr << "___FILTRATION_STATS____" << ::std::endl; 1997 ::std::cerr << "Swift FP: " << options.FP << ::std::endl; 1998 ::std::cerr << "Swift TP: " << options.TP << ::std::endl; 1999 } 2000 return 0; 2001 } 2002 2003 2004 ////////////////////////////////////////////////////////////////////////////// 2005 // Find read matches in many genome sequences (given as StringSet) 2006 template < 2007 typename TMatches, 2008 typename TGenomeSet, 2009 typename TReadSet, 2010 typename TCounts, 2011 typename TSpec, 2012 typename TShape, 2013 typename TSwiftSpec > 2014 int mapSingleReads( 2015 TMatches & matches, 2016 TGenomeSet & genomeSet, 2017 TReadSet const & readSet, 2018 TCounts & cnts, 2019 RazerSOptions<TSpec> & options, 2020 TShape const & shape, 2021 Swift<TSwiftSpec> const) 2022 { 2023 typedef typename Value<TReadSet>::Type TRead; 2024 typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> > TIndex; // q-gram index 2025 typedef Pattern<TIndex, Swift<TSwiftSpec> > TSwiftPattern; // filter 2026 typedef Pattern<TRead, MyersUkkonen> TMyersPattern; // verifier 2027 2028 // configure q-gram index 2029 TIndex swiftIndex(readSet, shape); 2030 cargo(swiftIndex).abundanceCut = options.abundanceCut; 2031 cargo(swiftIndex)._debugLevel = options._debugLevel; 2032 2033 // configure Swift 2034 TSwiftPattern swiftPattern(swiftIndex); 2035 swiftPattern.params.minThreshold = options.threshold; 2036 swiftPattern.params.tabooLength = options.tabooLength; 2037 2038 // init edit distance verifiers 2039 String<TMyersPattern> forwardPatterns; 2040 options.compMask[4] = (options.matchN)? 15: 0; 2041 if (!options.hammingOnly) 2042 { 2043 unsigned readCount = countSequences(swiftIndex); 2044 resize(forwardPatterns, readCount, Exact()); 2045 for(unsigned i = 0; i < readCount; ++i) 2046 { 2047 setHost(forwardPatterns[i], indexText(swiftIndex)[i]); 2048 _patternMatchNOfPattern(forwardPatterns[i], options.matchN); 2049 _patternMatchNOfFinder(forwardPatterns[i], options.matchN); 2050 } 2051 } 2052 2053 // clear stats 2054 options.FP = 0; 2055 options.TP = 0; 2056 options.timeMapReads = 0; 2057 options.timeDumpResults = 0; 2058 2059 CharString id; 2060 #ifdef RAZERS_DIRECT_MAQ_MAPPING 2061 if(options.maqMapping) 2062 { 2063 resize(cnts, 2); 2064 for (unsigned i = 0; i < length(cnts); ++i) 2065 resize(cnts[i], length(readSet), 31); 2066 } 2067 #endif 2068 2069 2070 2071 // iterate over genome sequences 2072 SEQAN_PROTIMESTART(find_time); 2073 for(unsigned gseqNo = 0; gseqNo < length(genomeSet); ++gseqNo) 2074 { 2075 if (options.forward) 2076 mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options); 2077 2078 if (options.reverse) 2079 { 2080 reverseComplement(genomeSet[gseqNo]); 2081 mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options); 2082 reverseComplement(genomeSet[gseqNo]); 2083 } 2084 2085 } 2086 options.timeMapReads += SEQAN_PROTIMEDIFF(find_time); 2087 2088 if (options._debugLevel >= 1) 2089 ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl; 2090 if (options._debugLevel >= 2) { 2091 ::std::cerr << ::std::endl; 2092 ::std::cerr << "___FILTRATION_STATS____" << ::std::endl; 2093 ::std::cerr << "Swift FP: " << options.FP << ::std::endl; 2094 ::std::cerr << "Swift TP: " << options.TP << ::std::endl; 2095 } 2096 return 0; 2097 } 2098 2099 ////////////////////////////////////////////////////////////////////////////// 2100 // Wrapper for single/mate-pair mapping 2101 template < 2102 typename TMatches, 2103 typename TReadSet, 2104 typename TCounts, 2105 typename TSpec, 2106 typename TShape, 2107 typename TSwiftSpec > 2108 int mapReads( 2109 TMatches & matches, 2110 StringSet<CharString> & genomeFileNameList, 2111 StringSet<CharString> & genomeNames, // genome names, taken from the Fasta file 2112 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap, 2113 TReadSet & readSet, 2114 TCounts & cnts, 2115 RazerSOptions<TSpec> & options, 2116 TShape const & shape, 2117 Swift<TSwiftSpec> const) 2118 { 2119 2120 #ifdef RAZERS_MATEPAIRS 2121 if (options.libraryLength >= 0) 2122 return mapMatePairReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift<TSwiftSpec>()); 2123 else 2124 #endif 2125 return mapSingleReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift<TSwiftSpec>()); 2126 } 2127 2128 template < 2129 typename TMatches, 2130 typename TGenomeSet, 2131 typename TReadSet, 2132 typename TCounts, 2133 typename TSpec, 2134 typename TShape, 2135 typename TSwiftSpec > 2136 int mapReads( 2137 TMatches & matches, 2138 TGenomeSet & genomeSet, 2139 TReadSet & readSet, 2140 TCounts & cnts, 2141 RazerSOptions<TSpec> & options, 2142 TShape const & shape, 2143 Swift<TSwiftSpec> const) 2144 { 2145 #ifdef RAZERS_MATEPAIRS 2146 if (options.libraryLength >= 0) 2147 return mapMatePairReads(matches, genomeSet, readSet, cnts, options, shape, Swift<TSwiftSpec>()); 2148 else 2149 #endif 2150 return mapSingleReads(matches, genomeSet, readSet, cnts, options, shape, Swift<TSwiftSpec>()); 2151 } 2152 2153 ////////////////////////////////////////////////////////////////////////////// 2154 // Wrapper for different template specializations 2155 template <typename TMatches, typename TReadSet, typename TCounts, typename TSpec> 2156 int mapReads( 2157 TMatches & matches, 2158 StringSet<CharString> & genomeFileNameList, 2159 StringSet<CharString> & genomeNames, // genome names, taken from the Fasta file 2160 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap, 2161 TReadSet & readSet, 2162 TReadRegions & readRegions, 2163 TCounts & cnts, 2164 RazerSOptions<TSpec> & options) 2165 { 2166 // first check whether split mapping (--> two shapes) or normal mapping 2167 #ifdef RAZERS_SPLICED 2168 if (options.minMatchLen > 0) 2169 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options); 2170 #else 2171 (void)readRegions; 2172 #endif 2173 2174 Shape<Dna, SimpleShape> ungapped; 2175 Shape<Dna, OneGappedShape> onegapped; 2176 Shape<Dna, GenericShape> gapped; 2177 2178 // 2x3 SPECIALIZATION 2179 2180 if (options.hammingOnly) 2181 { 2182 // select best-fitting shape 2183 if (stringToShape(ungapped, options.shape)) 2184 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobalHamming>()); 2185 2186 if (stringToShape(onegapped, options.shape)) 2187 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobalHamming>()); 2188 2189 if (stringToShape(gapped, options.shape)) 2190 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift<SwiftSemiGlobalHamming>()); 2191 } 2192 else 2193 { 2194 if (stringToShape(ungapped, options.shape)) 2195 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobal>()); 2196 2197 if (stringToShape(onegapped, options.shape)) 2198 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobal>()); 2199 2200 if (stringToShape(gapped, options.shape)) 2201 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift<SwiftSemiGlobal>()); 2202 } 2203 2204 return RAZERS_INVALID_SHAPE; 2205 } 2206 2207 template <typename TMatches, typename TGenomeSet, typename TReadSet, typename TReadRegions, typename TCounts, typename TSpec> 2208 int mapReads( 2209 TMatches & matches, 2210 TGenomeSet & genomeSet, 2211 TReadSet & readSet, 2212 TReadRegions & 2213 #ifdef RAZERS_SPLICED 2214 readRegions 2215 #endif 2216 , 2217 TCounts & cnts, 2218 RazerSOptions<TSpec> & options) 2219 { 2220 2221 2222 // first check whether split mapping (--> two shapes) or normal mapping 2223 #ifdef RAZERS_SPLICED 2224 if (options.minMatchLen > 0) 2225 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options); 2226 #endif 2227 2228 2229 Shape<Dna, SimpleShape> ungapped; 2230 Shape<Dna, OneGappedShape> onegapped; 2231 Shape<Dna, GenericShape> gapped; 2232 2233 // 2x3 SPECIALIZATION 2234 2235 if (options.hammingOnly) 2236 { 2237 // select best-fitting shape 2238 if (stringToShape(ungapped, options.shape)) 2239 return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobalHamming>()); 2240 2241 if (stringToShape(onegapped, options.shape)) 2242 return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobalHamming>()); 2243 2244 if (stringToShape(gapped, options.shape)) 2245 return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift<SwiftSemiGlobalHamming>()); 2246 } 2247 else 2248 { 2249 if (stringToShape(ungapped, options.shape)) 2250 return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobal>()); 2251 2252 if (stringToShape(onegapped, options.shape)) 2253 return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobal>()); 2254 2255 if (stringToShape(gapped, options.shape)) 2256 return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift<SwiftSemiGlobal>()); 2257 } 2258 2259 return RAZERS_INVALID_SHAPE; 2260 } 2261 2262 } 2263 2264 #endif 2265