1 /*========================================================================== 2 RazerS - Fast Read Mapping with Controlled Loss Rate 3 http://www.seqan.de/projects/razers.html 4 5 ============================================================================ 6 Copyright (C) 2008 by David Weese 7 8 This program is free software; you can redistribute it and/or 9 modify it under the terms of the GNU Lesser General Public 10 License as published by the Free Software Foundation; either 11 version 3 of the License, or (at your options) any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 Lesser General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. 20 ==========================================================================*/ 21 22 #ifndef SEQAN_HEADER_RAZERS_H 23 #define SEQAN_HEADER_RAZERS_H 24 25 #include <iostream> 26 #include <fstream> 27 28 #include <seqan/find.h> 29 #include <seqan/index.h> 30 31 #ifdef RAZERS_PARALLEL 32 #include "tbb/spin_mutex.h" 33 #endif 34 35 namespace SEQAN_NAMESPACE_MAIN 36 { 37 38 ////////////////////////////////////////////////////////////////////////////// 39 // Default options 40 41 #ifdef RAZERS_OPENADDRESSING 42 typedef OpenAddressing TQGramIndexSpec; 43 #else 44 typedef Default TQGramIndexSpec; 45 #endif 46 47 template < bool DONT_VERIFY_ = false, bool DONT_DUMP_RESULTS_ = false > 48 struct RazerSSpec 49 { 50 enum { DONT_VERIFY = DONT_VERIFY_ }; // omit verifying potential matches 51 enum { DONT_DUMP_RESULTS = DONT_DUMP_RESULTS_ }; // omit dumping results 52 }; 53 54 template < typename TSpec = RazerSSpec<> > 55 struct RazerSOptions 56 { 57 // main options 58 TSpec spec; 59 bool forward; // compute forward oriented read matches 60 bool reverse; // compute reverse oriented read matches 61 double errorRate; // Criteria 1 threshold 62 unsigned maxHits; // output at most maxHits many matches 63 unsigned distanceRange; // output only the best, second best, ..., distanceRange best matches 64 bool purgeAmbiguous; // true..remove reads with more than maxHits best matches, false..keep them 65 CharString output; // name of result file 66 int _debugLevel; // level of verbosity 67 bool printVersion; // print version number 68 bool hammingOnly; // no indels 69 int trimLength; // if >0, cut reads to #trimLength characters 70 71 // output format options 72 unsigned outputFormat; // 0..Razer format 73 // 1..enhanced Fasta 74 // 2..ELAND format 75 bool dumpAlignment; // compute and dump the match alignments in the result files 76 unsigned genomeNaming; // 0..use Fasta id 77 // 1..enumerate reads beginning with 1 78 unsigned readNaming; // 0..use Fasta id 79 // 1..enumerate reads beginning with 1 80 // 2..use the read sequence (only for short reads!) 81 unsigned sortOrder; // 0..sort keys: 1. read number, 2. genome position 82 // 1.. 1. genome pos50ition, 2. read number 83 unsigned positionFormat; // 0..gap space 84 // 1..position space 85 const char *runID; // runID needed for gff output 86 87 // filtration parameters 88 ::std::string shape; // shape (e.g. 11111111111) 89 int threshold; // threshold 90 int tabooLength; // taboo length 91 int repeatLength; // repeat length threshold 92 double abundanceCut; // abundance threshold 93 94 // mate-pair parameters 95 int libraryLength; // offset between two mates 96 int libraryError; // offset tolerance 97 unsigned nextMatePairId; // use this id for the next mate-pair 98 99 // verification parameters 100 bool matchN; // false..N is always a mismatch, true..N matches with all 101 unsigned char compMask[5]; 102 103 // statistics 104 __int64 FP; // false positives (threshold reached, no match) 105 __int64 TP; // true positives (threshold reached, match) 106 double timeLoadFiles; // time for loading input files 107 double timeMapReads; // time for mapping reads 108 double timeDumpResults; // time for dumping the results 109 110 #ifdef RAZERS_DIRECT_MAQ_MAPPING 111 bool maqMapping; 112 int maxMismatchQualSum; 113 int mutationRateQual; 114 int absMaxQualSumErrors; 115 unsigned artSeedLength; 116 bool noBelowIdentity; 117 #endif 118 119 #ifdef RAZERS_MICRO_RNA 120 bool microRNA; 121 unsigned rnaSeedLength; 122 bool exactSeed; 123 #endif 124 125 bool lowMemory; // set maximum shape weight to 13 to limit size of q-gram index 126 bool fastaIdQual; // hidden option for special fasta+quality format we use 127 int minClippedLen; 128 129 // misc 130 unsigned compactThresh; // compact match array if larger than compactThresh 131 int maxReadLength; 132 133 unsigned minMatchLen; 134 int maxPrefixErrors; 135 int maxSuffixErrors; 136 int maxGap; 137 int minGap; 138 ::std::string shapeR; // shape (e.g. 11111111111) 139 int thresholdR; // threshold 140 unsigned int specifiedGenomeLen; 141 142 143 // multi-threading 144 145 #ifdef RAZERS_PARALLEL 146 typedef ::tbb::spin_mutex TMutex; 147 148 TMutex *patternMutex; 149 TMutex optionsMutex; 150 TMutex matchMutex; 151 #endif 152 RazerSOptionsRazerSOptions153 RazerSOptions() 154 { 155 forward = true; 156 reverse = true; 157 errorRate = 0.08; 158 maxHits = 100; 159 distanceRange = 0; // disabled 160 purgeAmbiguous = false; 161 output = ""; 162 _debugLevel = 0; 163 printVersion = false; 164 hammingOnly = false; 165 trimLength = 0; 166 167 outputFormat = 0; 168 dumpAlignment = false; 169 genomeNaming = 0; 170 readNaming = 0; 171 sortOrder = 0; 172 positionFormat = 0; 173 runID = "s"; // 174 175 matchN = false; 176 177 shape = "11111111111"; 178 threshold = 1; 179 tabooLength = 1; 180 repeatLength = 1000; 181 abundanceCut = 1; 182 183 libraryLength = 220; 184 libraryError = 50; 185 nextMatePairId = 1; 186 187 for (unsigned i = 0; i < 4; ++i) 188 compMask[i] = 1 << i; 189 compMask[4] = 0; 190 191 compactThresh = 1024; 192 #ifdef RAZERS_DIRECT_MAQ_MAPPING 193 maqMapping = false; 194 maxMismatchQualSum = 70; 195 mutationRateQual = 30; 196 artSeedLength = 28; // the "artificial" seed length that is used for mapping quality assignment 197 // (28bp is maq default) 198 absMaxQualSumErrors = 100; // maximum for sum of mism qualities in total readlength 199 noBelowIdentity = false; 200 #endif 201 202 #ifdef RAZERS_MICRO_RNA 203 microRNA = false; 204 rnaSeedLength = 16; 205 exactSeed = true; 206 #endif 207 minMatchLen = 23; 208 shapeR = "11111111111"; 209 thresholdR = 1; 210 maxGap = 10000; 211 minGap = 0; 212 maxPrefixErrors = -1; 213 maxSuffixErrors = -1; 214 specifiedGenomeLen = 3000000000U; //whole human genome default 215 216 maxReadLength = 0; 217 lowMemory = false; // set maximum shape weight to 13 to limit size of q-gram index 218 fastaIdQual = false; 219 minClippedLen = 0; 220 221 } 222 }; 223 224 struct MicroRNA{}; 225 226 #ifdef RAZERS_MICRO_RNA 227 #define RAZERS_EXTENDED_MATCH 228 #endif 229 230 #ifdef RAZERS_DIRECT_MAQ_MAPPING 231 #define RAZERS_EXTENDED_MATCH 232 #endif 233 234 #ifdef RAZERS_SPLICED 235 #define RAZERS_EXTENDED_MATCH 236 #endif 237 238 ////////////////////////////////////////////////////////////////////////////// 239 // Typedefs 240 241 // definition of a Read match 242 template <typename TGPos_> 243 struct ReadMatch 244 { 245 typedef typename MakeSigned_<TGPos_>::Type TGPos; 246 247 unsigned gseqNo; // genome seqNo 248 unsigned rseqNo; // read seqNo 249 TGPos gBegin; // begin position of the match in the genome 250 TGPos gEnd; // end position of the match in the genome 251 #ifdef RAZERS_MATEPAIRS 252 unsigned pairId; // unique id for the two mate-pair matches (0 if unpaired) 253 int mateDelta:24; // outer coordinate delta to the other mate 254 int pairScore:8; // combined score of both mates 255 #endif 256 unsigned short editDist; // Levenshtein distance 257 #ifdef RAZERS_EXTENDED_MATCH 258 short mScore; 259 short seedEditDist:8; 260 #endif 261 #ifdef RAZERS_SPLICED 262 char traceExtension; 263 short gSeedLen:8;// used as gMinMatchLen to store genomic end position of seed match 264 #endif 265 char orientation; // 'F'..forward strand, 'R'..reverse comp. strand 266 }; 267 268 enum RAZERS_ERROR 269 { 270 RAZERS_INVALID_OPTIONS = -1, 271 RAZERS_READS_FAILED = -2, 272 RAZERS_GENOME_FAILED = -3, 273 RAZERS_INVALID_SHAPE = -4 274 }; 275 276 ////////////////////////////////////////////////////////////////////////////// 277 // Definitions 278 279 typedef Dna5String TGenome; 280 typedef StringSet<TGenome> TGenomeSet; 281 // typedef Dna5String TRead; 282 typedef String<Dna5Q> TRead; 283 #ifdef RAZERS_CONCATREADS 284 typedef StringSet<TRead, Owner<ConcatDirect<> > > TReadSet; 285 #else 286 typedef StringSet<TRead> TReadSet; 287 #endif 288 289 typedef ReadMatch<Difference<TGenome>::Type> TMatch; // a single match 290 typedef String<TMatch/*, MMap<>*/ > TMatches; // array of matches 291 292 293 template <typename TSpec> 294 struct Cargo< Index<TReadSet, TSpec> > { 295 typedef struct { 296 double abundanceCut; 297 int _debugLevel; 298 } Type; 299 }; 300 301 template <typename TAnyReadSet, typename TSpec> 302 struct Cargo< Index<TAnyReadSet, TSpec> > { 303 typedef struct { 304 double abundanceCut; 305 int _debugLevel; 306 } Type; 307 }; 308 ////////////////////////////////////////////////////////////////////////////// 309 // Memory tuning 310 311 #ifdef RAZERS_MEMOPT 312 313 template <typename TSpec> 314 struct SAValue< Index<TReadSet, TSpec> > { 315 typedef Pair< 316 unsigned, 317 unsigned, 318 BitCompressed<24, 8> // max. 16M reads of length < 256 319 > Type; 320 }; 321 //454 322 // template <typename TSpec> 323 // struct SAValue< Index<TReadSet, TSpec> > { 324 // typedef Pair< 325 // unsigned, 326 // unsigned, 327 // BitCompressed<22, 10> // max. 4M reads of length < 1024 328 // > Type; 329 // }; 330 331 #else 332 333 template <typename TSpec> 334 struct SAValue< Index<TReadSet, TSpec> > { 335 typedef Pair< 336 unsigned, // many reads 337 unsigned, // of arbitrary length 338 Compressed 339 > Type; 340 }; 341 342 #endif 343 344 template <> 345 struct Size<Dna5String> 346 { 347 typedef unsigned Type; 348 }; 349 350 template <typename TShape, typename TSpec> 351 struct Size< Index<TReadSet, IndexQGram<TShape, TSpec> > > 352 { 353 typedef unsigned Type; 354 }; 355 356 357 #ifdef RAZERS_PRUNE_QGRAM_INDEX 358 359 ////////////////////////////////////////////////////////////////////////////// 360 // Repeat masker 361 template <typename TShape, typename TSpec> 362 inline bool _qgramDisableBuckets(Index<TReadSet, IndexQGram<TShape, TSpec> > &index) 363 { 364 typedef Index<TReadSet, IndexQGram<TShape, TSpec> > TReadIndex; 365 typedef typename Fibre<TReadIndex, QGramDir>::Type TDir; 366 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 367 typedef typename Value<TDir>::Type TSize; 368 369 TDir &dir = indexDir(index); 370 bool result = false; 371 unsigned counter = 0; 372 TSize thresh = (TSize)(length(index) * cargo(index).abundanceCut); 373 if (thresh < 100) thresh = 100; 374 375 TDirIterator it = begin(dir, Standard()); 376 TDirIterator itEnd = end(dir, Standard()); 377 for (; it != itEnd; ++it) 378 if (*it > thresh) 379 { 380 *it = (TSize)-1; 381 result = true; 382 ++counter; 383 } 384 385 if (counter > 0 && cargo(index)._debugLevel >= 1) 386 ::std::cerr << "Removed " << counter << " k-mers" << ::std::endl; 387 388 return result; 389 } 390 391 #endif 392 393 394 template<typename TIdString, typename TSeqString, typename TQString, typename TOptions> 395 bool 396 _clipReads(TIdString & fastaID, TSeqString & seq, TQString & qual, TOptions & options) 397 { 398 typedef typename Value<TIdString>::Type TChar; 399 400 int tagStart = length(fastaID); 401 int clipFront = -1; 402 int clipBack = -1; 403 for(unsigned i = 0; i < length(fastaID); ++i) 404 { 405 TChar c = fastaID[i]; 406 if(c == 'c') 407 { 408 tagStart = i-1; 409 if(infix(fastaID,i,i+5)=="clip=") 410 { 411 i += 5; 412 String<TChar> clipStr = ""; 413 while(i < length(fastaID) && _parseIsDigit(fastaID[i])) 414 { 415 append(clipStr,fastaID[i]); 416 ++i; 417 } 418 std::istringstream istr(toCString(clipStr)); 419 istr >> clipFront; 420 if(i < length(fastaID) && fastaID[i] == ',') ++i; 421 clipStr = ""; 422 while(i < length(fastaID) && _parseIsDigit(fastaID[i])) 423 { 424 append(clipStr,fastaID[i]); 425 ++i; 426 } 427 std::istringstream istr2(toCString(clipStr)); 428 istr2 >> clipBack; 429 break; 430 } 431 } 432 } 433 if(clipFront<0 && clipBack<0) return true; //no clip tag found 434 if(clipFront<0) clipFront = 0; 435 if(clipBack<0) clipBack = 0; 436 resize(fastaID,tagStart); // only resize fastaID, as it might contain the quality string 437 438 if(options.minClippedLen == 0) // clipping tag was found, but clipping option is turned off 439 return true; 440 441 if(((int)length(seq)-clipBack-clipFront) < options.minClippedLen) // sequence is too short after clipping 442 return false; 443 444 // int newLen = (int)length(seq)-clipBack-clipFront; 445 446 //clip 447 if(length(qual) == 0) // meaning that quality is in fasta header 448 { 449 // first adapt the fasta header 450 TIdString tmp = fastaID; 451 fastaID = infix(tmp,0,length(tmp)-length(seq)); 452 append(fastaID,infix(tmp,length(tmp)-length(seq)+clipFront,length(tmp)-clipBack)); 453 } 454 else 455 qual = infix(qual,clipFront,length(qual)-clipBack); 456 457 // now adapt the sequence 458 seq = infix(seq,clipFront,length(seq)-clipBack); 459 460 return true; 461 462 } 463 464 465 ////////////////////////////////////////////////////////////////////////////// 466 // Load multi-Fasta sequences with or w/o quality values 467 template <typename TReadSet, typename TNameSet, typename TRazerSOptions> 468 bool loadReads( 469 TReadSet &reads, 470 TNameSet &fastaIDs, 471 const char *fileName, 472 TRazerSOptions &options) 473 { 474 bool countN = !(options.matchN || options.outputFormat == 1); 475 #ifdef RAZERS_MICRO_RNA 476 if(options.microRNA) countN = false; 477 #endif 478 479 MultiFasta multiFasta; 480 if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) return false; 481 482 AutoSeqFormat format; 483 guessFormat(multiFasta.concat, format); 484 split(multiFasta, format); 485 486 unsigned seqCount = length(multiFasta); 487 #ifndef RAZERS_CONCATREADS 488 resize(reads, seqCount, Exact()); 489 #endif 490 if (options.readNaming == 0) 491 resize(fastaIDs, seqCount, Exact()); 492 493 String<Dna5Q> seq; 494 CharString qual; 495 496 unsigned kickoutcount = 0; 497 for(unsigned i = 0; i < seqCount; ++i) 498 { 499 if (options.readNaming == 0 500 #ifdef RAZERS_DIRECT_MAQ_MAPPING 501 || options.fastaIdQual 502 #endif 503 ) 504 assignSeqId(fastaIDs[i], multiFasta[i], format); // read Fasta id 505 assignSeq(seq, multiFasta[i], format); // read Read sequence 506 assignQual(qual, multiFasta[i], format); // read ascii quality values 507 #ifdef RAZERS_DIRECT_MAQ_MAPPING 508 //check if sequence has a clip tag 509 if(options.minClippedLen > 0) 510 { 511 if(!_clipReads(fastaIDs[i],seq,qual,options)) 512 { 513 clear(seq); 514 clear(qual); 515 ++kickoutcount; 516 } 517 } 518 if(options.fastaIdQual && !empty(seq)) 519 { 520 if(options.minClippedLen == 0)_clipReads(fastaIDs[i],seq,qual,options); // if the header wasnt clipped before, then clip now!! necessary for quality in header 521 qual = suffix(fastaIDs[i],length(fastaIDs[i])-length(seq)); 522 if(options.readNaming == 0) 523 fastaIDs[i] = prefix(fastaIDs[i],length(fastaIDs[i])-length(seq)); 524 else clear(fastaIDs[i]); 525 } 526 #endif 527 if (countN) 528 { 529 int count = 0; 530 int cutoffCount = (int)(options.errorRate * length(seq)); 531 for (unsigned j = 0; j < length(seq); ++j) 532 if (getValue(seq, j) == 'N') 533 if (++count > cutoffCount) 534 { 535 clear(seq); 536 clear(qual); 537 ++kickoutcount; 538 break; 539 } 540 // low qual. reads are empty to output them and their id later as LQ reads 541 // if (count > cutoffCount) continue; 542 } 543 #ifdef RAZERS_MICRO_RNA 544 if(options.microRNA && length(seq)<options.rnaSeedLength) 545 { 546 clear(seq); 547 clear(qual); 548 } 549 #endif 550 551 // store dna and quality together 552 for (unsigned j = 0; j < length(qual) && j < length(seq); ++j) 553 assignQualityValue(seq[j], (int)(ordValue(qual[j]) - 33)); 554 /* if(ordValue(seq[j])>3) 555 setValue(hybridSeq[j],(unsigned int)164); //N=164 such that &3 == 0 556 else 557 setValue(hybridSeq[j],(unsigned int)((ordValue(qual[j]) - 33) * 4 + ordValue(seq[j]))); 558 */ 559 if (options.trimLength > 0 && length(seq) > (unsigned)options.trimLength) 560 resize(seq, options.trimLength); 561 if((int)length(seq) > options.maxReadLength) 562 options.maxReadLength = length(seq); 563 564 #ifdef RAZERS_CONCATREADS 565 appendValue(reads, seq, Generous()); 566 #else 567 assign(reads[i], seq, Exact()); 568 #endif 569 } 570 #ifdef RAZERS_CONCATREADS 571 reserve(reads.concat, length(reads.concat), Exact()); 572 #endif 573 574 if (options._debugLevel > 1 && kickoutcount > 0) 575 ::std::cerr << "Ignoring " << kickoutcount << " low quality reads.\n"; 576 return (seqCount > 0); 577 } 578 579 ////////////////////////////////////////////////////////////////////////////// 580 // Read the first sequence of a multi-sequence file 581 // and return its length 582 inline int estimateReadLength(char const *fileName) 583 { 584 MultiFasta multiFasta; 585 if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) // open the whole file 586 return RAZERS_READS_FAILED; 587 588 AutoSeqFormat format; 589 guessFormat(multiFasta.concat, format); // guess file format 590 split(multiFasta, format); // divide into single sequences 591 592 if (length(multiFasta) == 0) 593 return 0; 594 595 Dna5String firstRead; 596 assignSeq(firstRead, multiFasta[0], format); // read the first sequence 597 return length(firstRead); 598 } 599 600 601 /* 602 ////////////////////////////////////////////////////////////////////////////// 603 // Load multi-Fasta sequences 604 template <typename TGenomeSet> 605 bool loadGenomes(TGenomeSet &genomes, const char *fileName) 606 { 607 MultiFasta multiFasta; 608 if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) return false; 609 split(multiFasta, Fasta()); 610 611 unsigned seqCount = length(multiFasta); 612 resize(genomes, seqCount, Exact()); 613 for(unsigned i = 0; i < seqCount; ++i) 614 assignSeq(genomes[i], multiFasta[i], Fasta()); // read Genome sequence 615 616 return (seqCount > 0); 617 }*/ 618 619 ////////////////////////////////////////////////////////////////////////////// 620 // Load multi-Fasta sequences from multiple files 621 template <typename TGenomeSet> 622 bool loadGenomes(TGenomeSet &genomes, StringSet<CharString> &fileNameList) 623 { 624 unsigned gSeqNo = 0; 625 unsigned filecount = 0; 626 while(filecount < length(fileNameList)) 627 { 628 MultiFasta multiFasta; 629 if (!open(multiFasta.concat, toCString(fileNameList[filecount]), OPEN_RDONLY)) return false; 630 split(multiFasta, Fasta()); 631 632 unsigned seqCount = length(multiFasta); 633 if(length(genomes) < gSeqNo+seqCount) 634 resize(genomes,gSeqNo+seqCount); 635 for(unsigned i = 0; i < seqCount; ++i) 636 assignSeq(genomes[gSeqNo+i], multiFasta[i], Fasta()); // read Genome sequence 637 gSeqNo += seqCount; 638 ++filecount; 639 } 640 resize(genomes,gSeqNo); 641 return (gSeqNo > 0); 642 } 643 644 #ifdef RAZERS_MICRO_RNA 645 template <typename TReadMatch> 646 struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > 647 { 648 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 649 { 650 // read number 651 if (a.rseqNo < b.rseqNo) return true; 652 if (a.rseqNo > b.rseqNo) return false; 653 654 // genome position and orientation 655 if (a.gseqNo < b.gseqNo) return true; 656 if (a.gseqNo > b.gseqNo) return false; 657 if (a.gBegin < b.gBegin) return true; 658 if (a.gBegin > b.gBegin) return false; 659 if (a.orientation < b.orientation) return true; 660 if (a.orientation > b.orientation) return false; 661 662 663 if(a.editDist < b.editDist) return true; 664 if(a.editDist > b.editDist) return false; 665 return (a.mScore > b.mScore); 666 } 667 }; 668 669 template <typename TReadMatch> 670 struct LessRNoEdistHLen : public ::std::binary_function < TReadMatch, TReadMatch, bool > 671 { 672 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 673 { 674 // read number 675 if (a.rseqNo < b.rseqNo) return true; 676 if (a.rseqNo > b.rseqNo) return false; 677 678 if(a.editDist < b.editDist) return true; 679 if(a.editDist > b.editDist) return false; 680 return (a.mScore > b.mScore); 681 682 } 683 }; 684 #else 685 686 687 template <typename TReadMatch> 688 struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > 689 { 690 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 691 { 692 // read number 693 if (a.rseqNo < b.rseqNo) return true; 694 if (a.rseqNo > b.rseqNo) return false; 695 696 // genome position and orientation 697 if (a.gseqNo < b.gseqNo) return true; 698 if (a.gseqNo > b.gseqNo) return false; 699 if (a.gBegin < b.gBegin) return true; 700 if (a.gBegin > b.gBegin) return false; 701 if (a.orientation < b.orientation) return true; 702 if (a.orientation > b.orientation) return false; 703 704 // quality 705 #ifdef RAZERS_MATEPAIRS 706 return a.pairScore > b.pairScore; 707 #else 708 return a.editDist < b.editDist; 709 #endif 710 } 711 }; 712 #endif 713 714 // ... to sort matches and remove duplicates with equal gEnd 715 template <typename TReadMatch> 716 struct LessRNoGEndPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > 717 { 718 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 719 { 720 // read number 721 if (a.rseqNo < b.rseqNo) return true; 722 if (a.rseqNo > b.rseqNo) return false; 723 724 // genome position and orientation 725 if (a.gseqNo < b.gseqNo) return true; 726 if (a.gseqNo > b.gseqNo) return false; 727 if (a.gEnd < b.gEnd) return true; 728 if (a.gEnd > b.gEnd) return false; 729 if (a.orientation < b.orientation) return true; 730 if (a.orientation > b.orientation) return false; 731 732 // quality 733 #ifdef RAZERS_MATEPAIRS 734 return a.pairScore > b.pairScore; 735 #else 736 return a.editDist < b.editDist; 737 #endif 738 } 739 }; 740 741 template <typename TReadMatch> 742 struct LessErrors : public ::std::binary_function < TReadMatch, TReadMatch, bool > 743 { 744 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 745 { 746 // read number 747 if (a.rseqNo < b.rseqNo) return true; 748 if (a.rseqNo > b.rseqNo) return false; 749 750 // quality 751 #ifdef RAZERS_MATEPAIRS 752 return a.pairScore > b.pairScore; 753 #else 754 return a.editDist < b.editDist; 755 #endif 756 } 757 }; 758 759 template <typename TReadMatch> 760 struct LessSplicedScore : public ::std::binary_function < TReadMatch, TReadMatch, bool > 761 { 762 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 763 { 764 // read number 765 if (a.rseqNo < b.rseqNo ) return true; 766 if (a.rseqNo > b.rseqNo ) return false; 767 //if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true; 768 //if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false; 769 770 // quality 771 if (a.pairScore > b.pairScore) return true; 772 if (a.pairScore < b.pairScore) return false; 773 return a.pairId < b.pairId; 774 } 775 }; 776 777 778 template <typename TReadMatch> 779 struct LessSplicedScoreGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > 780 { 781 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 782 { 783 // read number 784 if (a.rseqNo < b.rseqNo ) return true; 785 if (a.rseqNo > b.rseqNo ) return false; 786 //if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true; 787 //if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false; 788 789 // quality 790 if (a.pairScore > b.pairScore) return true; 791 if (a.pairScore < b.pairScore) return false; 792 if (a.pairId < b.pairId) return true; 793 if (a.pairId > b.pairId) return false; 794 if (a.gBegin < b.gBegin) return true; 795 if (a.gBegin > b.gBegin) return false; 796 return a.editDist < b.editDist; 797 } 798 }; 799 800 801 802 #ifdef RAZERS_DIRECT_MAQ_MAPPING 803 804 struct QualityBasedScoring{}; 805 806 template <typename TReadMatch> 807 struct LessRNoMQ : public ::std::binary_function < TReadMatch, TReadMatch, bool > 808 { 809 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const 810 { 811 // read number 812 if (a.rseqNo < b.rseqNo) return true; 813 if (a.rseqNo > b.rseqNo) return false; 814 815 // quality 816 if (a.mScore < b.mScore) return true; // sum of quality values of mismatches (the smaller the better) 817 if (a.mScore > b.mScore) return false; 818 819 return (a.editDist < b.editDist); // seedEditDist? 820 // genome position and orientation 821 /* if (a.gseqNo < b.gseqNo) return true; 822 if (a.gseqNo > b.gseqNo) return false; 823 if (a.gBegin < b.gBegin) return true; 824 if (a.gBegin > b.gBegin) return false; 825 if (a.orientation < b.orientation) return true; 826 if (a.orientation > b.orientation) return false; 827 */ 828 } 829 }; 830 #endif 831 832 ////////////////////////////////////////////////////////////////////////////// 833 // Remove duplicate matches and leave at most maxHits many distanceRange 834 // best matches per read 835 template < typename TMatches > 836 void maskDuplicates(TMatches &matches) 837 { 838 typedef typename Value<TMatches>::Type TMatch; 839 typedef typename Iterator<TMatches, Standard>::Type TIterator; 840 841 ////////////////////////////////////////////////////////////////////////////// 842 // remove matches with equal ends 843 844 ::std::sort( 845 begin(matches, Standard()), 846 end(matches, Standard()), 847 LessRNoGEndPos<TMatch>()); 848 849 typename TMatch::TGPos gBegin = -1; 850 typename TMatch::TGPos gEnd = -1; 851 unsigned gseqNo = -1; 852 unsigned readNo = -1; 853 char orientation = '-'; 854 855 TIterator it = begin(matches, Standard()); 856 TIterator itEnd = end(matches, Standard()); 857 858 for (; it != itEnd; ++it) 859 { 860 #ifdef RAZERS_MATEPAIRS 861 if ((*it).pairId != 0) continue; 862 #endif 863 if (gEnd == (*it).gEnd && orientation == (*it).orientation && 864 gseqNo == (*it).gseqNo && readNo == (*it).rseqNo) 865 { 866 (*it).orientation = '-'; 867 continue; 868 } 869 readNo = (*it).rseqNo; 870 gseqNo = (*it).gseqNo; 871 gEnd = (*it).gEnd; 872 orientation = (*it).orientation; 873 } 874 875 ////////////////////////////////////////////////////////////////////////////// 876 // remove matches with equal begins 877 878 ::std::sort( 879 begin(matches, Standard()), 880 end(matches, Standard()), 881 LessRNoGPos<TMatch>()); 882 883 orientation = '-'; 884 885 it = begin(matches, Standard()); 886 itEnd = end(matches, Standard()); 887 888 for (; it != itEnd; ++it) 889 { 890 if ((*it).orientation == '-' 891 #ifdef RAZERS_MATEPAIRS 892 || ((*it).pairId != 0) 893 #endif 894 ) continue; 895 if (gBegin == (*it).gBegin && readNo == (*it).rseqNo && 896 gseqNo == (*it).gseqNo && orientation == (*it).orientation) 897 { 898 (*it).orientation = '-'; 899 continue; 900 } 901 readNo = (*it).rseqNo; 902 gseqNo = (*it).gseqNo; 903 gBegin = (*it).gBegin; 904 orientation = (*it).orientation; 905 } 906 907 ::std::sort( 908 begin(matches, Standard()), 909 end(matches, Standard()), 910 LessErrors<TMatch>()); 911 } 912 913 ////////////////////////////////////////////////////////////////////////////// 914 // Count matches for each number of errors 915 template < typename TMatches, typename TCounts > 916 void countMatches(TMatches &matches, TCounts &cnt) 917 { 918 typedef typename Value<TMatches>::Type TMatch; 919 typedef typename Iterator<TMatches, Standard>::Type TIterator; 920 typedef typename Value<TCounts>::Type TRow; 921 typedef typename Value<TRow>::Type TValue; 922 923 TIterator it = begin(matches, Standard()); 924 TIterator itEnd = end(matches, Standard()); 925 926 unsigned readNo = -1; 927 short editDist = -1; 928 __int64 count = 0; 929 __int64 maxVal = MaxValue<TValue>::VALUE; 930 931 for (; it != itEnd; ++it) 932 { 933 if ((*it).orientation == '-') continue; 934 if (readNo == (*it).rseqNo && editDist == (*it).editDist) 935 ++count; 936 else 937 { 938 if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt)) 939 cnt[editDist][readNo] = (maxVal < count)? maxVal : count; 940 readNo = (*it).rseqNo; 941 editDist = (*it).editDist; 942 count = 1; 943 } 944 } 945 if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt)) 946 cnt[editDist][readNo] = count; 947 948 949 } 950 951 ////////////////////////////////////////////////////////////////////////////// 952 953 template < typename TReadNo, typename TMaxErrors > 954 inline void 955 setMaxErrors(Nothing &, TReadNo, TMaxErrors) 956 { 957 } 958 959 template < typename TSwift, typename TReadNo, typename TMaxErrors > 960 inline void 961 setMaxErrors(TSwift &swift, TReadNo readNo, TMaxErrors maxErrors) 962 { 963 int minT = _qgramLemma(swift, readNo, maxErrors); 964 if (minT > 1) 965 { 966 if (maxErrors < 0) minT = MaxValue<int>::VALUE; 967 // ::std::cout<<" read:"<<readNo<<" newThresh:"<<minT; 968 setMinThreshold(swift, readNo, (unsigned)minT); 969 } 970 } 971 972 ////////////////////////////////////////////////////////////////////////////// 973 // Remove low quality matches 974 template < typename TMatches, typename TCounts, typename TSpec, typename TSwift > 975 void compactMatches(TMatches &matches, TCounts & 976 #ifdef RAZERS_DIRECT_MAQ_MAPPING 977 cnts 978 #endif 979 , RazerSOptions<TSpec> &options 980 , bool compactFinal , 981 TSwift & 982 #if defined RAZERS_DIRECT_MAQ_MAPPING || defined RAZERS_MASK_READS 983 swift 984 #endif 985 ) 986 { 987 // Get rid of "unused variable" warnings. This is hard to read 988 // and should not be done anywhere else. Better not use ifdefs. 989 (void)compactFinal; 990 #ifdef RAZERS_DIRECT_MAQ_MAPPING 991 if(options.maqMapping) compactMatches(matches, cnts,options,compactFinal,swift,true); 992 #endif 993 typedef typename Value<TMatches>::Type TMatch; 994 typedef typename Iterator<TMatches, Standard>::Type TIterator; 995 996 #ifdef RAZERS_MICRO_RNA 997 if(options.microRNA) 998 ::std::sort( 999 begin(matches, Standard()), 1000 end(matches, Standard()), 1001 LessRNoEdistHLen<TMatch>()); 1002 int bestMScore = 0; 1003 #endif 1004 1005 unsigned readNo = -1; 1006 unsigned hitCount = 0; 1007 unsigned hitCountCutOff = options.maxHits; 1008 __int64 disabled = 0; 1009 #ifdef RAZERS_MICRO_RNA 1010 if(options.microRNA && options.purgeAmbiguous) 1011 ++hitCountCutOff; // we keep one more match than we actually want, so we can later decide 1012 // whether the read mapped more than maxhits times 1013 #endif 1014 int editDistCutOff = MaxValue<int>::VALUE; 1015 1016 TIterator it = begin(matches, Standard()); 1017 TIterator itEnd = end(matches, Standard()); 1018 TIterator dit = it; 1019 TIterator ditBeg = it; 1020 1021 for (; it != itEnd; ++it) 1022 { 1023 if ((*it).orientation == '-') continue; 1024 if (readNo == (*it).rseqNo 1025 #ifdef RAZERS_MATEPAIRS 1026 && (*it).pairId == 0 1027 #endif 1028 ) 1029 { 1030 if ((*it).editDist >= editDistCutOff) continue; 1031 #ifdef RAZERS_MICRO_RNA 1032 if ( (*it).mScore < bestMScore) continue; 1033 #endif 1034 if (++hitCount >= hitCountCutOff) 1035 { 1036 #ifdef RAZERS_MASK_READS 1037 if (hitCount == hitCountCutOff) 1038 { 1039 // we have enough, now look for better matches 1040 int maxErrors = (*it).editDist - 1; 1041 if (options.purgeAmbiguous && (options.distanceRange == 0 || (*it).editDist < options.distanceRange)) 1042 maxErrors = -1; 1043 1044 setMaxErrors(swift, readNo, maxErrors); 1045 1046 if (maxErrors == -1) 1047 ++disabled; 1048 // ::std::cerr << "(read #" << readNo << " disabled)"; 1049 1050 if (options.purgeAmbiguous) 1051 { 1052 if (options.distanceRange == 0 || (*it).editDist < options.distanceRange || compactFinal) 1053 dit = ditBeg; 1054 else { 1055 *dit = *it; 1056 ++dit; 1057 } 1058 } 1059 } 1060 #endif 1061 continue; 1062 } 1063 } 1064 else 1065 { 1066 readNo = (*it).rseqNo; 1067 hitCount = 0; 1068 if (options.distanceRange > 0) 1069 editDistCutOff = (*it).editDist + options.distanceRange; 1070 #ifdef RAZERS_MICRO_RNA 1071 bestMScore = (*it).mScore; 1072 #endif 1073 ditBeg = dit; 1074 } 1075 *dit = *it; 1076 ++dit; 1077 } 1078 if (options._debugLevel >= 2) 1079 { 1080 std::cerr << " [" << length(matches) - (dit - begin(matches, Standard())) << " matches removed]"; 1081 std::cerr << " [" << disabled << " reads disabled]"; 1082 } 1083 1084 resize(matches, dit - begin(matches, Standard())); 1085 } 1086 1087 1088 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1089 ////////////////////////////////////////////////////////////////////////////// 1090 // Remove low quality matches 1091 template < typename TMatches, typename TCounts, typename TSpec, typename TSwift > 1092 void compactMatches(TMatches &matches, TCounts &cnts, RazerSOptions<TSpec> &, bool, TSwift &, bool dontCountFirstTwo) 1093 { 1094 typedef typename Value<TMatches>::Type TMatch; 1095 typedef typename Iterator<TMatches, Standard>::Type TIterator; 1096 1097 ::std::sort( 1098 begin(matches, Standard()), 1099 end(matches, Standard()), 1100 LessRNoMQ<TMatch>()); 1101 1102 unsigned readNo = -1; 1103 1104 TIterator it = begin(matches, Standard()); 1105 TIterator itEnd = end(matches, Standard()); 1106 TIterator dit = it; 1107 1108 //number of errors may not exceed 31! 1109 bool second = true; 1110 for (; it != itEnd; ++it) 1111 { 1112 if ((*it).orientation == '-') continue; 1113 if (readNo == (*it).rseqNo) 1114 { 1115 //second best match 1116 if (second) 1117 { 1118 second = false; 1119 if((cnts[1][(*it).rseqNo] & 31) > (*it).editDist) 1120 { 1121 //this second best match is better than any second best match before 1122 cnts[1][(*it).rseqNo] = (*it).editDist; // second best dist is this editDist 1123 // count is 0 (will be updated if countFirstTwo) 1124 } 1125 if(!dontCountFirstTwo) 1126 if((cnts[1][(*it).rseqNo]>>5) != 2047) cnts[1][(*it).rseqNo] += 32; 1127 } 1128 else 1129 { 1130 if ((*it).editDist <= (cnts[0][(*it).rseqNo] & 31) ) 1131 if(cnts[0][(*it).rseqNo]>>5 != 2047) 1132 cnts[0][(*it).rseqNo] +=32; 1133 if ((*it).editDist <= (cnts[1][(*it).rseqNo] & 31) ) 1134 if((cnts[1][(*it).rseqNo]>>5) != 2047) 1135 cnts[1][(*it).rseqNo] +=32; 1136 continue; 1137 } 1138 } else 1139 { //best match 1140 second = true; 1141 readNo = (*it).rseqNo; 1142 //cnts has 16bits, 11:5 for count:dist 1143 if((cnts[0][(*it).rseqNo] & 31) > (*it).editDist) 1144 { 1145 //this match is better than any match before 1146 cnts[1][(*it).rseqNo] = cnts[0][(*it).rseqNo]; // best before is now second best 1147 // (count will be updated when match is removed) 1148 cnts[0][(*it).rseqNo] = (*it).editDist; // best dist is this editDist 1149 // count is 0 (will be updated if countFirstTwo) 1150 } 1151 if(!dontCountFirstTwo) 1152 if((cnts[0][(*it).rseqNo]>>5) != 2047) cnts[0][(*it).rseqNo] += 32; // shift 5 to the right, add 1, shift 5 to the left, and keep count 1153 } 1154 *dit = *it; 1155 ++dit; 1156 } 1157 1158 resize(matches, dit - begin(matches, Standard())); 1159 } 1160 #endif 1161 1162 1163 #ifdef RAZERS_MICRO_RNA 1164 1165 template < typename TMatches, typename TSpec > 1166 void purgeAmbiguousRnaMatches(TMatches &matches, RazerSOptions<TSpec> &options) 1167 { 1168 typedef typename Value<TMatches>::Type TMatch; 1169 typedef typename Iterator<TMatches, Standard>::Type TIterator; 1170 1171 ::std::sort(begin(matches, Standard()),end(matches, Standard()),LessRNoEdistHLen<TMatch>()); 1172 int bestMScore = 0; 1173 1174 unsigned readNo = -1; 1175 unsigned hitCount = 0; 1176 unsigned hitCountCutOff = options.maxHits; 1177 int editDistCutOff = MaxValue<int>::VALUE; 1178 1179 TIterator it = begin(matches, Standard()); 1180 TIterator itEnd = end(matches, Standard()); 1181 TIterator dit = it; 1182 TIterator ditBeg = it; 1183 1184 for (; it != itEnd; ++it) 1185 { 1186 if ((*it).orientation == '-') continue; 1187 if (readNo == (*it).rseqNo) 1188 { 1189 if ((*it).editDist >= editDistCutOff) continue; 1190 if ( (*it).mScore < bestMScore) continue; 1191 if (++hitCount >= hitCountCutOff) 1192 { 1193 if (hitCount == hitCountCutOff) 1194 dit = ditBeg; 1195 continue; 1196 } 1197 } 1198 else 1199 { 1200 readNo = (*it).rseqNo; 1201 hitCount = 0; 1202 if (options.distanceRange > 0) 1203 editDistCutOff = (*it).editDist + options.distanceRange; 1204 bestMScore = (*it).mScore; 1205 ditBeg = dit; 1206 } 1207 *dit = *it; 1208 ++dit; 1209 } 1210 resize(matches, dit - begin(matches, Standard())); 1211 } 1212 1213 1214 1215 ////////////////////////////////////////////////////////////////////////////// 1216 // Hamming verification recording sum of mismatching qualities in m.mScore 1217 template < 1218 typename TMatch, 1219 typename TGenome, 1220 typename TReadSet, 1221 typename TSpec > 1222 inline bool 1223 matchVerify( 1224 TMatch &m, // resulting match 1225 Segment<TGenome, InfixSegment> inf, // potential match genome region 1226 unsigned rseqNo, // read number 1227 TReadSet &readSet, // reads 1228 RazerSOptions<TSpec> const &options, // RazerS options 1229 MicroRNA) // MaqMapping 1230 { 1231 1232 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1233 typedef typename Value<TReadSet>::Type const TRead; 1234 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator; 1235 typedef typename Iterator<TRead, Standard>::Type TReadIterator; 1236 1237 unsigned ndlLength = sequenceLength(rseqNo, readSet); 1238 if (length(inf) < ndlLength) return false; 1239 1240 // verify 1241 TRead &read = readSet[rseqNo]; 1242 TReadIterator ritBeg = begin(read, Standard()); 1243 TReadIterator ritEnd = end(read, Standard()); 1244 TGenomeIterator git = begin(inf, Standard()); 1245 TGenomeIterator gitEnd = end(inf, Standard()) - (ndlLength - 1); 1246 1247 // this is max number of errors the seed should have 1248 unsigned maxErrorsSeed = (unsigned)(options.rnaSeedLength * options.errorRate); 1249 unsigned minSeedErrors = maxErrorsSeed + 1; 1250 unsigned bestHitLength = 0; 1251 1252 for (; git < gitEnd; ++git) 1253 { 1254 bool hit = true; 1255 unsigned hitLength = 0; 1256 unsigned count = 0; 1257 unsigned seedErrors = 0; 1258 TGenomeIterator g = git; //maq would count errors in the first 28bp only (at least initially. not for output) 1259 for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g) 1260 { 1261 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0) 1262 { 1263 if (count < options.rnaSeedLength) // the maq (28bp-)seed 1264 { 1265 if(++seedErrors > maxErrorsSeed) 1266 { 1267 hit = false; 1268 break; 1269 } 1270 } 1271 else 1272 break; 1273 } 1274 ++count; 1275 } 1276 if (hit) hitLength = count; 1277 if (hitLength > bestHitLength ) //simply take the longest hit 1278 { 1279 minSeedErrors = seedErrors; 1280 bestHitLength = hitLength; 1281 m.gBegin = git - begin(host(inf), Standard()); 1282 } 1283 } 1284 1285 // std::cout << "options.absMaxQualSumErrors" << options.absMaxQualSumErrors << std::endl; 1286 // std::cout << "maxSeedErrors" << maxErrorsSeed << std::endl; 1287 // std::cout << "minErrors" << minSeedErrors << std::endl; 1288 // if(derBesgte) ::std::cout << minErrors <<"minErrors\n"; 1289 if (minSeedErrors > maxErrorsSeed) return false; 1290 1291 m.gEnd = m.gBegin + bestHitLength; 1292 m.editDist = minSeedErrors; // errors in seed or total number of errors? 1293 m.mScore = bestHitLength; 1294 m.seedEditDist = minSeedErrors; 1295 return true; 1296 } 1297 1298 #endif 1299 1300 1301 1302 ////////////////////////////////////////////////////////////////////////////// 1303 // Hamming verification 1304 template < 1305 typename TMatch, 1306 typename TGenome, 1307 typename TReadSet, 1308 typename TMyersPatterns, 1309 typename TSpec > 1310 inline bool 1311 matchVerify( 1312 TMatch &m, // resulting match 1313 Segment<TGenome, InfixSegment> inf, // potential match genome region 1314 unsigned rseqNo, // read number 1315 TReadSet &readSet, // reads 1316 TMyersPatterns const &, // MyersBitVector preprocessing data 1317 RazerSOptions<TSpec> const &options, // RazerS options 1318 SwiftSemiGlobalHamming) // Hamming only 1319 { 1320 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1321 if(options.maqMapping) 1322 return matchVerify(m,inf,rseqNo,readSet,options,QualityBasedScoring()); 1323 #endif 1324 1325 #ifdef RAZERS_MICRO_RNA 1326 if(options.microRNA) 1327 return matchVerify(m,inf,rseqNo,readSet,options,MicroRNA()); 1328 #endif 1329 1330 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1331 typedef typename Value<TReadSet>::Type const TRead; 1332 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator; 1333 typedef typename Iterator<TRead, Standard>::Type TReadIterator; 1334 1335 #ifdef RAZERS_DEBUG 1336 ::std::cout<<"Verify: "<<::std::endl; 1337 ::std::cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl; 1338 ::std::cout<<"Read: "<<readSet[rseqNo] << ::std::endl; 1339 #endif 1340 1341 unsigned ndlLength = sequenceLength(rseqNo, readSet); 1342 if (length(inf) < ndlLength) return false; 1343 1344 // verify 1345 TRead &read = readSet[rseqNo]; 1346 TReadIterator ritBeg = begin(read, Standard()); 1347 TReadIterator ritEnd = end(read, Standard()); 1348 TGenomeIterator git = begin(inf, Standard()); 1349 TGenomeIterator gitEnd = end(inf, Standard()) - (ndlLength - 1); 1350 1351 unsigned maxErrors = (unsigned)(ndlLength * options.errorRate); 1352 unsigned minErrors = maxErrors + 1; 1353 1354 for (; git < gitEnd; ++git) 1355 { 1356 unsigned errors = 0; 1357 TGenomeIterator g = git; 1358 for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g) 1359 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0) 1360 if (++errors > maxErrors) 1361 break; 1362 if (minErrors > errors) 1363 { 1364 minErrors = errors; 1365 m.gBegin = git - begin(host(inf), Standard()); 1366 } 1367 } 1368 1369 if (minErrors > maxErrors) return false; 1370 1371 m.gEnd = m.gBegin + ndlLength; 1372 m.editDist = minErrors; 1373 return true; 1374 } 1375 1376 1377 ////////////////////////////////////////////////////////////////////////////// 1378 // Edit distance verification 1379 template < 1380 typename TMatch, 1381 typename TGenome, 1382 typename TReadSet, 1383 typename TMyersPatterns, 1384 typename TSpec > 1385 inline bool 1386 matchVerify( 1387 TMatch &m, // resulting match 1388 Segment<TGenome, InfixSegment> inf, // potential match genome region 1389 unsigned rseqNo, // read number 1390 TReadSet &readSet, // reads 1391 TMyersPatterns &forwardPatterns, // MyersBitVector preprocessing data 1392 RazerSOptions<TSpec> const &options, // RazerS options 1393 SwiftSemiGlobal) // Swift specialization 1394 { 1395 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1396 typedef typename Value<TReadSet>::Type TRead; 1397 1398 // find read match end 1399 typedef Finder<TGenomeInfix> TMyersFinder; 1400 typedef typename Value<TMyersPatterns>::Type TMyersPattern; 1401 1402 // find read match begin 1403 typedef ModifiedString<TGenomeInfix, ModReverse> TGenomeInfixRev; 1404 typedef ModifiedString<TRead, ModReverse> TReadRev; 1405 typedef Finder<TGenomeInfixRev> TMyersFinderRev; 1406 typedef Pattern<TReadRev, MyersUkkonenGlobal> TMyersPatternRev; 1407 1408 TMyersFinder myersFinder(inf); 1409 TMyersPattern &myersPattern = forwardPatterns[rseqNo]; 1410 1411 #ifdef RAZERS_DEBUG 1412 ::std::cout<<"Verify: "<<::std::endl; 1413 ::std::cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl; 1414 ::std::cout<<"Read: "<<readSet[rseqNo]<<::std::endl; 1415 #endif 1416 1417 unsigned ndlLength = sequenceLength(rseqNo, readSet); 1418 int maxScore = MinValue<int>::VALUE; 1419 int minScore = -(int)(ndlLength * options.errorRate); 1420 TMyersFinder maxPos; 1421 1422 // find end of best semi-global alignment 1423 while (find(myersFinder, myersPattern, minScore)) 1424 if (maxScore <= getScore(myersPattern)) 1425 { 1426 maxScore = getScore(myersPattern); 1427 maxPos = myersFinder; 1428 } 1429 1430 if (maxScore < minScore) return false; 1431 m.editDist = (unsigned)-maxScore; 1432 setEndPosition(inf, m.gEnd = (beginPosition(inf) + position(maxPos) + 1)); 1433 1434 // limit the beginning to needle length plus errors (== -maxScore) 1435 if (length(inf) > ndlLength - maxScore) 1436 setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore); 1437 1438 // find beginning of best semi-global alignment 1439 TGenomeInfixRev infRev(inf); 1440 TReadRev readRev(readSet[rseqNo]); 1441 TMyersFinderRev myersFinderRev(infRev); 1442 TMyersPatternRev myersPatternRev(readRev); 1443 1444 _patternMatchNOfPattern(myersPatternRev, options.matchN); 1445 _patternMatchNOfFinder(myersPatternRev, options.matchN); 1446 while (find(myersFinderRev, myersPatternRev, maxScore)) 1447 m.gBegin = m.gEnd - (position(myersFinderRev) + 1); 1448 1449 return true; 1450 } 1451 1452 1453 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1454 ////////////////////////////////////////////////////////////////////////////// 1455 // Hamming verification recording sum of mismatching qualities in m.mScore 1456 template < 1457 typename TMatch, 1458 typename TGenome, 1459 typename TReadSet, 1460 typename TSpec > 1461 inline bool 1462 matchVerify( 1463 TMatch &m, // resulting match 1464 Segment<TGenome, InfixSegment> inf, // potential match genome region 1465 unsigned rseqNo, // read number 1466 TReadSet &readSet, // reads 1467 RazerSOptions<TSpec> const &options, // RazerS options 1468 QualityBasedScoring) // MaqMapping 1469 { 1470 1471 typedef Segment<TGenome, InfixSegment> TGenomeInfix; 1472 typedef typename Value<TReadSet>::Type const TRead; 1473 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator; 1474 typedef typename Iterator<TRead, Standard>::Type TReadIterator; 1475 1476 // #ifdef RAZERS_DEBUG 1477 // cout<<"Verify: "<<::std::endl; 1478 // cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl; 1479 // cout<<"Read: "<<host(myersPattern)<<::std::endl; 1480 // #endif 1481 1482 // bool derBesgte = false; 1483 //if(rseqNo == 2) derBesgte = true; 1484 // if(derBesgte) ::std::cout << "der besagte\n"; 1485 unsigned ndlLength = sequenceLength(rseqNo, readSet); 1486 if (length(inf) < ndlLength) return false; 1487 1488 // verify 1489 TRead &read = readSet[rseqNo]; 1490 TReadIterator ritBeg = begin(read, Standard()); 1491 TReadIterator ritEnd = end(read, Standard()); 1492 TGenomeIterator git = begin(inf, Standard()); 1493 TGenomeIterator gitEnd = end(inf, Standard()) - (ndlLength - 1); 1494 TGenomeIterator bestIt = begin(inf, Standard()); 1495 1496 // this is max number of errors the 28bp 'seed' should have 1497 //assuming that maxErrors-1 matches can be found with 100% SN 1498 unsigned maxErrorsSeed = (unsigned)(options.artSeedLength * options.errorRate) + 1; 1499 unsigned maxErrorsTotal = (unsigned)(ndlLength * 0.25); //options.maxErrorRate); 1500 unsigned minErrors = maxErrorsTotal + 1; 1501 int minQualSumErrors = options.absMaxQualSumErrors + 10; 1502 unsigned minSeedErrors = maxErrorsSeed + 1; 1503 1504 for (; git < gitEnd; ++git) 1505 { 1506 bool hit = true; 1507 unsigned errors = 0; 1508 unsigned count = 0; 1509 unsigned seedErrors = 0; 1510 int qualSumErrors = 0; 1511 TGenomeIterator g = git; //maq would count errors in the first 28bp only (at least initially. not for output) 1512 for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g) 1513 { 1514 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0) 1515 { 1516 // ::std::cout << count << "<-"; 1517 if (++errors > maxErrorsTotal) 1518 { 1519 hit = false; 1520 break; 1521 } 1522 //int qualityValue = (int)((unsigned char)*r >> 3); 1523 //qualSumErrors += (qualityValue < options.mutationRateQual) ? qualityValue : options.mutationRateQual; 1524 qualSumErrors += (getQualityValue(*r) < options.mutationRateQual) ? getQualityValue(*r) : options.mutationRateQual; 1525 if(qualSumErrors > options.absMaxQualSumErrors || qualSumErrors > minQualSumErrors) 1526 { 1527 hit = false; 1528 break; 1529 } 1530 if (count < options.artSeedLength) // the maq (28bp-)seed 1531 { 1532 if(++seedErrors > maxErrorsSeed) 1533 { 1534 hit = false; 1535 break; 1536 } 1537 if(qualSumErrors > options.maxMismatchQualSum) 1538 { 1539 hit = false; 1540 break; 1541 }// discard match, if 'seed' is bad (later calculations are done with the quality sum over the whole read) 1542 } 1543 } 1544 ++count; 1545 } 1546 if (hit && (qualSumErrors < minQualSumErrors /*&& seedErrors <=maxErrorsSeed*/) ) //oder (seedErrors < minSeedErrors) 1547 { 1548 minErrors = errors; 1549 minSeedErrors = seedErrors; 1550 minQualSumErrors = qualSumErrors; 1551 m.gBegin = git - begin(host(inf), Standard()); 1552 bestIt = git; 1553 } 1554 } 1555 if (minQualSumErrors > options.absMaxQualSumErrors || minSeedErrors > maxErrorsSeed || minErrors > maxErrorsTotal) return false; 1556 1557 m.gEnd = m.gBegin + ndlLength; 1558 m.editDist = minErrors; // errors in seed or total number of errors? 1559 m.mScore = minQualSumErrors; 1560 m.seedEditDist = minSeedErrors; 1561 return true; 1562 } 1563 1564 1565 ////////////////////////////////////////////////////////////////////////////// 1566 // Edit distance verification 1567 template < 1568 typename TMatch, 1569 typename TGenome, 1570 typename TReadSet, 1571 typename TMyersPatterns, 1572 typename TSpec > 1573 inline bool 1574 matchVerify( 1575 TMatch &m, // resulting match 1576 Segment<TGenome, InfixSegment> inf, // potential match genome region 1577 unsigned rseqNo, // read number 1578 TReadSet &readSet, // reads 1579 TMyersPatterns const & pat, // MyersBitVector preprocessing data 1580 RazerSOptions<TSpec> const &options, // RazerS options 1581 SwiftSemiGlobal const &swiftsemi, // Hamming only 1582 QualityBasedScoring) // Swift specialization 1583 { 1584 //if(!options.maqMapping) 1585 return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi); 1586 //else 1587 // return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi); // todo! 1588 } 1589 #endif 1590 1591 1592 1593 1594 #ifndef RAZERS_PARALLEL 1595 ////////////////////////////////////////////////////////////////////////////// 1596 // Find read matches in one genome sequence 1597 template < 1598 typename TMatches, 1599 typename TGenome, 1600 typename TReadIndex, 1601 typename TSwiftSpec, 1602 typename TVerifier, 1603 typename TCounts, 1604 typename TSpec > 1605 void mapSingleReads( 1606 TMatches &matches, // resulting matches 1607 TGenome &genome, // genome ... 1608 unsigned gseqNo, // ... and its sequence number 1609 Pattern<TReadIndex, Swift<TSwiftSpec> > &swiftPattern, 1610 TVerifier &forwardPatterns, 1611 TCounts & cnts, 1612 char orientation, // q-gram index of reads 1613 RazerSOptions<TSpec> &options) 1614 { 1615 typedef typename Fibre<TReadIndex, FibreText>::Type TReadSet; 1616 typedef typename Size<TGenome>::Type TSize; 1617 typedef typename Value<TMatches>::Type TMatch; 1618 1619 1620 // FILTRATION 1621 typedef Finder<TGenome, Swift<TSwiftSpec> > TSwiftFinder; 1622 typedef Pattern<TReadIndex, Swift<TSwiftSpec> > TSwiftPattern; 1623 1624 // iterate all genomic sequences 1625 if (options._debugLevel >= 1) 1626 { 1627 ::std::cerr << ::std::endl << "Process genome seq #" << gseqNo; 1628 if (orientation == 'F') 1629 ::std::cerr << "[fwd]"; 1630 else 1631 ::std::cerr << "[rev]"; 1632 } 1633 1634 TReadSet &readSet = host(host(swiftPattern)); 1635 TSwiftFinder swiftFinder(genome, options.repeatLength, 1); 1636 1637 TMatch m = { // to supress uninitialized warnings 1638 0, 0, 0, 0, 1639 #ifdef RAZERS_MATEPAIRS 1640 0, 0, 0, 1641 #endif 1642 0, 1643 #ifdef RAZERS_EXTENDED_MATCH 1644 0, 0, 1645 #endif 1646 #ifdef RAZERS_SPLICED 1647 0, 0, 1648 #endif 1649 0 1650 }; 1651 1652 SEQAN_PROTIMESTART(map_contig_time); 1653 1654 // iterate all verification regions returned by SWIFT 1655 TSize gLength = length(genome); 1656 __int64 localTP = 0; 1657 __int64 localFP = 0; 1658 1659 #ifdef RAZERS_MICRO_RNA 1660 while (find(swiftFinder, swiftPattern, 0.2)) 1661 #else 1662 while (find(swiftFinder, swiftPattern, options.errorRate)) 1663 #endif 1664 { 1665 unsigned rseqNo = (*swiftFinder.curHit).ndlSeqNo; 1666 if (!options.spec.DONT_VERIFY && 1667 matchVerify(m, infix(swiftFinder), rseqNo, readSet, forwardPatterns, options, TSwiftSpec())) 1668 { 1669 // transform coordinates to the forward strand 1670 if (orientation == 'R') 1671 { 1672 TSize temp = m.gBegin; 1673 m.gBegin = gLength - m.gEnd; 1674 m.gEnd = gLength - temp; 1675 } 1676 m.gseqNo = gseqNo; 1677 m.rseqNo = rseqNo; 1678 m.orientation = orientation; 1679 #ifdef RAZERS_MATEPAIRS 1680 m.pairId = 0; 1681 m.pairScore = 0 - m.editDist; 1682 #endif 1683 1684 if (!options.spec.DONT_DUMP_RESULTS) 1685 { 1686 appendValue(matches, m, Generous()); 1687 if (length(matches) > options.compactThresh) 1688 { 1689 #ifndef RAZERS_MICRO_RNA 1690 typename Size<TMatches>::Type oldSize = length(matches); 1691 #endif 1692 maskDuplicates(matches); // overlapping parallelograms cause duplicates 1693 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1694 if(options.maqMapping) 1695 compactMatches(matches, cnts, options, false, swiftPattern, true); 1696 else 1697 #endif 1698 compactMatches(matches, cnts, options, false, swiftPattern); 1699 #ifdef RAZERS_MICRO_RNA 1700 options.compactThresh = 2 * length(matches); 1701 #else 1702 if (length(matches) * 4 > oldSize) // the threshold should not be raised 1703 options.compactThresh += (options.compactThresh >> 1); // if too many matches were removed 1704 #endif 1705 } 1706 } 1707 1708 ++localTP; 1709 // ::std::cerr << "\"" << infix(swiftFinder) << "\" "; 1710 // ::std::cerr << hstkPos << " + "; 1711 // ::std::cerr << ::std::endl; 1712 } else { 1713 ++localFP; 1714 // ::std::cerr << "\"" << infx(swiftFinder) << "\" \"" << infix(swiftPattern) << "\" "; 1715 // ::std::cerr << rseqNo << " : "; 1716 // ::std::cerr << hstkPos << " + "; 1717 // ::std::cerr << bucketWidth << " " << TP << ::std::endl; 1718 } 1719 } 1720 options.TP += localTP; 1721 options.FP += localFP; 1722 if (options._debugLevel >= 2) 1723 { 1724 double spec = 100; 1725 double time = SEQAN_PROTIMEDIFF(map_contig_time); 1726 if (localFP+localTP != 0) 1727 spec = (1000*localTP/(localFP+localTP))/10.0; 1728 ::std::cerr << " [" << (int)((gLength / time)/100.0)/10.0 << "kbp/s " << time << "s] [" << spec << "%SP " << localFP+localTP << "V]"; 1729 } 1730 } 1731 #endif 1732 1733 1734 1735 #ifdef RAZERS_MICRO_RNA 1736 1737 // multiple sequences 1738 template < 1739 typename TSA, 1740 typename TStringSet, 1741 typename TShape, 1742 typename TDir, 1743 typename TValue, 1744 typename TWithConstraints 1745 > 1746 inline void 1747 _qgramFillSuffixArray( 1748 TSA &sa, 1749 TStringSet &stringSet, 1750 TShape &shape, 1751 TDir &dir, 1752 Nothing, 1753 TWithConstraints const, 1754 TValue prefixLen, 1755 MicroRNA) 1756 { 1757 SEQAN_CHECKPOINT 1758 typedef typename Value<TStringSet>::Type TString; 1759 typedef typename Iterator<TString const, Standard>::Type TIterator; 1760 typedef typename Value<TDir>::Type TSize; 1761 typedef typename Value<TShape>::Type THash; 1762 1763 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1764 { 1765 TString const &sequence = value(stringSet, seqNo); 1766 if (length(sequence) < length(shape)) continue; 1767 int num_qgrams = prefixLen - (int)length(shape) + 1; 1768 1769 typename Value<TSA>::Type localPos; 1770 assignValueI1(localPos, seqNo); 1771 assignValueI2(localPos, 0); 1772 1773 TIterator itText = begin(sequence, Standard()); 1774 if (TWithConstraints::VALUE) { 1775 THash h = hash(shape, itText) + 1; // first hash 1776 if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos; // if bucket is enabled 1777 } else 1778 sa[dir[hash(shape, itText) + 1]++] = localPos; // first hash 1779 1780 for(int i = 1; i < num_qgrams; ++i) 1781 { 1782 ++itText; 1783 assignValueI2(localPos, i); 1784 if (TWithConstraints::VALUE) { 1785 THash h = hashNext(shape, itText) + 1; // next hash 1786 if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos; // if bucket is enabled 1787 } else 1788 sa[dir[hashNext(shape, itText) + 1]++] = localPos; // next hash 1789 } 1790 } 1791 } 1792 1793 template < typename TDir, typename TStringSet, typename TShape, typename TValue > 1794 inline void 1795 _qgramCountQGrams(TDir &dir, TStringSet &stringSet, TShape &shape, TValue prefixLen, MicroRNA) 1796 { 1797 SEQAN_CHECKPOINT 1798 typedef typename Value<TStringSet>::Type TString; 1799 typedef typename Iterator<TString const, Standard>::Type TIterator; 1800 typedef typename Value<TDir>::Type TSize; 1801 1802 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1803 { 1804 TString const &sequence = value(stringSet, seqNo); 1805 if (length(sequence) < length(shape)) continue; 1806 TSize num_qgrams = prefixLen - length(shape) + 1; 1807 1808 TIterator itText = begin(sequence, Standard()); 1809 ++dir[hash(shape, itText)]; 1810 for(TSize i = 1; i < num_qgrams; ++i) 1811 { 1812 ++itText; 1813 ++dir[hashNext(shape, itText)]; 1814 } 1815 } 1816 } 1817 1818 1819 template < typename TIndex, typename TValue> 1820 void createQGramIndex(TIndex &index, TValue prefixLen, MicroRNA) 1821 { 1822 SEQAN_CHECKPOINT 1823 typename Fibre<TIndex, QGramText>::Type &text = indexText(index); 1824 typename Fibre<TIndex, QGramSA>::Type &sa = indexSA(index); 1825 typename Fibre<TIndex, QGramDir>::Type &dir = indexDir(index); 1826 typename Fibre<TIndex, QGramShape>::Type &shape = indexShape(index); 1827 1828 Nothing nothing; 1829 1830 // 1. clear counters 1831 arrayFill(begin(dir, Standard()), end(dir, Standard()), 0); 1832 1833 // 2. count q-grams 1834 _qgramCountQGrams(dir, text, shape, prefixLen,MicroRNA()); 1835 1836 if (_qgramDisableBuckets(index)) 1837 { 1838 // 3. cumulative sum 1839 _qgramCummulativeSum(dir, True()); 1840 1841 // 4. fill suffix array 1842 _qgramFillSuffixArray(sa, text, shape, dir, nothing, True(), prefixLen, MicroRNA()); 1843 1844 // 5. correct disabled buckets 1845 _qgramPostprocessBuckets(dir); 1846 } 1847 else 1848 { 1849 // 3. cumulative sum 1850 _qgramCummulativeSum(dir, False()); 1851 1852 // 4. fill suffix array 1853 _qgramFillSuffixArray(sa, text, shape, dir, nothing, False(),prefixLen, MicroRNA()); 1854 } 1855 } 1856 1857 1858 #endif 1859 1860 1861 1862 ////////////////////////////////////////////////////////////////////////////// 1863 // Find read matches in many genome sequences (import from Fasta) 1864 template < 1865 typename TMatches, 1866 typename TReadSet, 1867 typename TCounts, 1868 typename TSpec, 1869 typename TShape, 1870 typename TSwiftSpec > 1871 int mapSingleReads( 1872 TMatches & matches, 1873 StringSet<CharString> & genomeFileNameList, 1874 StringSet<CharString> & genomeNames, // genome names, taken from the Fasta file 1875 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap, 1876 TReadSet const & readSet, 1877 TCounts & cnts, 1878 RazerSOptions<TSpec> & options, 1879 TShape const & shape, 1880 Swift<TSwiftSpec> const) 1881 { 1882 typedef typename Value<TReadSet>::Type TRead; 1883 typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> > TIndex; // q-gram index 1884 typedef Pattern<TIndex, Swift<TSwiftSpec> > TSwiftPattern; // filter 1885 typedef Pattern<TRead, MyersUkkonen> TMyersPattern; // verifier 1886 1887 /* // try opening each genome file once before running the whole mapping procedure 1888 int filecount = 0; 1889 int numFiles = length(genomeFileNameList); 1890 while(filecount < numFiles) 1891 { 1892 ::std::ifstream file; 1893 file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary); 1894 if (!file.is_open()) 1895 return RAZERS_GENOME_FAILED; 1896 file.close(); 1897 ++filecount; 1898 } 1899 */ 1900 1901 // configure q-gram index 1902 TIndex swiftIndex(readSet, shape); 1903 #ifdef RAZERS_OPENADDRESSING 1904 swiftIndex.alpha = 2; 1905 #endif 1906 cargo(swiftIndex).abundanceCut = options.abundanceCut; 1907 cargo(swiftIndex)._debugLevel = options._debugLevel; 1908 1909 // configure Swift 1910 TSwiftPattern swiftPattern(swiftIndex); 1911 swiftPattern.params.minThreshold = options.threshold; 1912 swiftPattern.params.tabooLength = options.tabooLength; 1913 swiftPattern.params.printDots = options._debugLevel > 0; 1914 1915 // init edit distance verifiers 1916 unsigned readCount = countSequences(swiftIndex); 1917 String<TMyersPattern> forwardPatterns; 1918 options.compMask[4] = (options.matchN)? 15: 0; 1919 if (!options.hammingOnly) 1920 { 1921 resize(forwardPatterns, readCount, Exact()); 1922 for(unsigned i = 0; i < readCount; ++i) 1923 { 1924 setHost(forwardPatterns[i], indexText(swiftIndex)[i]); 1925 _patternMatchNOfPattern(forwardPatterns[i], options.matchN); 1926 _patternMatchNOfFinder(forwardPatterns[i], options.matchN); 1927 } 1928 } 1929 #ifdef RAZERS_MICRO_RNA 1930 typename Size<TIndex>::Type qgram_count = 0; 1931 if(options.microRNA) 1932 { 1933 for(unsigned i = 0; i < countSequences(swiftIndex); ++i) 1934 if (sequenceLength(i, swiftIndex) >= options.rnaSeedLength) 1935 qgram_count += options.rnaSeedLength - (length(shape) - 1); 1936 resize(indexSA(swiftIndex), qgram_count, Exact()); 1937 resize(indexDir(swiftIndex), _fullDirLength(swiftIndex), Exact()); 1938 createQGramIndex(swiftIndex,options.rnaSeedLength,MicroRNA()); 1939 } 1940 #endif 1941 1942 1943 #ifdef RAZERS_DIRECT_MAQ_MAPPING 1944 if(options.maqMapping) 1945 { 1946 resize(cnts, 2); 1947 for (unsigned i = 0; i < length(cnts); ++i) 1948 resize(cnts[i], readCount, 31); //initialize with maxeditDist, 11:5 for count:dist 1949 } 1950 #endif 1951 1952 // clear stats 1953 options.FP = 0; 1954 options.TP = 0; 1955 options.timeMapReads = 0; 1956 options.timeDumpResults = 0; 1957 1958 unsigned filecount = 0; 1959 unsigned numFiles = length(genomeFileNameList); 1960 unsigned gseqNo = 0; 1961 1962 // open genome files, one by one 1963 while (filecount < numFiles) 1964 { 1965 // open genome file 1966 ::std::ifstream file; 1967 file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary); 1968 if (!file.is_open()) 1969 return RAZERS_GENOME_FAILED; 1970 1971 // remove the directory prefix of current genome file 1972 ::std::string genomeFile(toCString(genomeFileNameList[filecount])); 1973 size_t lastPos = genomeFile.find_last_of('/') + 1; 1974 if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1; 1975 if (lastPos == genomeFile.npos) lastPos = 0; 1976 ::std::string genomeName = genomeFile.substr(lastPos); 1977 1978 1979 CharString id; 1980 Dna5String genome; 1981 unsigned gseqNoWithinFile = 0; 1982 // iterate over genome sequences 1983 SEQAN_PROTIMESTART(find_time); 1984 for(; !_streamEOF(file); ++gseqNo) 1985 { 1986 if (options.genomeNaming == 0) 1987 { 1988 //readID(file, id, Fasta()); // read Fasta id 1989 readShortID(file, id, Fasta()); // read Fasta id up to first whitespace 1990 appendValue(genomeNames, id, Generous()); 1991 } 1992 read(file, genome, Fasta()); // read Fasta sequence 1993 1994 gnoToFileMap.insert(::std::make_pair<unsigned,::std::pair< ::std::string,unsigned> >(gseqNo,::std::make_pair< ::std::string,unsigned>(genomeName,gseqNoWithinFile))); 1995 1996 if (options.forward) 1997 mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options); 1998 1999 if (options.reverse) 2000 { 2001 reverseComplement(genome); 2002 mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options); 2003 } 2004 ++gseqNoWithinFile; 2005 2006 } 2007 options.timeMapReads += SEQAN_PROTIMEDIFF(find_time); 2008 file.close(); 2009 ++filecount; 2010 } 2011 2012 if (options._debugLevel >= 1) 2013 ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl; 2014 if (options._debugLevel >= 2) { 2015 ::std::cerr << ::std::endl; 2016 ::std::cerr << "___FILTRATION_STATS____" << ::std::endl; 2017 ::std::cerr << "Swift FP: " << options.FP << ::std::endl; 2018 ::std::cerr << "Swift TP: " << options.TP << ::std::endl; 2019 } 2020 return 0; 2021 } 2022 2023 2024 ////////////////////////////////////////////////////////////////////////////// 2025 // Find read matches in many genome sequences (given as StringSet) 2026 template < 2027 typename TMatches, 2028 typename TGenomeSet, 2029 typename TReadSet, 2030 typename TCounts, 2031 typename TSpec, 2032 typename TShape, 2033 typename TSwiftSpec > 2034 int mapSingleReads( 2035 TMatches & matches, 2036 TGenomeSet & genomeSet, 2037 TReadSet const & readSet, 2038 TCounts & cnts, 2039 RazerSOptions<TSpec> & options, 2040 TShape const & shape, 2041 Swift<TSwiftSpec> const) 2042 { 2043 typedef typename Value<TReadSet>::Type TRead; 2044 typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> > TIndex; // q-gram index 2045 typedef Pattern<TIndex, Swift<TSwiftSpec> > TSwiftPattern; // filter 2046 typedef Pattern<TRead, MyersUkkonen> TMyersPattern; // verifier 2047 2048 // configure q-gram index 2049 TIndex swiftIndex(readSet, shape); 2050 cargo(swiftIndex).abundanceCut = options.abundanceCut; 2051 cargo(swiftIndex)._debugLevel = options._debugLevel; 2052 2053 // configure Swift 2054 TSwiftPattern swiftPattern(swiftIndex); 2055 swiftPattern.params.minThreshold = options.threshold; 2056 swiftPattern.params.tabooLength = options.tabooLength; 2057 2058 // init edit distance verifiers 2059 String<TMyersPattern> forwardPatterns; 2060 options.compMask[4] = (options.matchN)? 15: 0; 2061 if (!options.hammingOnly) 2062 { 2063 unsigned readCount = countSequences(swiftIndex); 2064 resize(forwardPatterns, readCount, Exact()); 2065 for(unsigned i = 0; i < readCount; ++i) 2066 { 2067 setHost(forwardPatterns[i], indexText(swiftIndex)[i]); 2068 _patternMatchNOfPattern(forwardPatterns[i], options.matchN); 2069 _patternMatchNOfFinder(forwardPatterns[i], options.matchN); 2070 } 2071 } 2072 2073 // clear stats 2074 options.FP = 0; 2075 options.TP = 0; 2076 options.timeMapReads = 0; 2077 options.timeDumpResults = 0; 2078 2079 CharString id; 2080 #ifdef RAZERS_DIRECT_MAQ_MAPPING 2081 if(options.maqMapping) 2082 { 2083 resize(cnts, 2); 2084 for (unsigned i = 0; i < length(cnts); ++i) 2085 resize(cnts[i], length(readSet), 31); 2086 } 2087 #endif 2088 2089 2090 2091 // iterate over genome sequences 2092 SEQAN_PROTIMESTART(find_time); 2093 for(unsigned gseqNo = 0; gseqNo < length(genomeSet); ++gseqNo) 2094 { 2095 if (options.forward) 2096 mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options); 2097 2098 if (options.reverse) 2099 { 2100 reverseComplement(genomeSet[gseqNo]); 2101 mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options); 2102 reverseComplement(genomeSet[gseqNo]); 2103 } 2104 2105 } 2106 options.timeMapReads += SEQAN_PROTIMEDIFF(find_time); 2107 2108 if (options._debugLevel >= 1) 2109 ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl; 2110 if (options._debugLevel >= 2) { 2111 ::std::cerr << ::std::endl; 2112 ::std::cerr << "___FILTRATION_STATS____" << ::std::endl; 2113 ::std::cerr << "Swift FP: " << options.FP << ::std::endl; 2114 ::std::cerr << "Swift TP: " << options.TP << ::std::endl; 2115 } 2116 return 0; 2117 } 2118 2119 ////////////////////////////////////////////////////////////////////////////// 2120 // Wrapper for single/mate-pair mapping 2121 template < 2122 typename TMatches, 2123 typename TReadSet, 2124 typename TCounts, 2125 typename TSpec, 2126 typename TShape, 2127 typename TSwiftSpec > 2128 int mapReads( 2129 TMatches & matches, 2130 StringSet<CharString> & genomeFileNameList, 2131 StringSet<CharString> & genomeNames, // genome names, taken from the Fasta file 2132 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap, 2133 TReadSet & readSet, 2134 TCounts & cnts, 2135 RazerSOptions<TSpec> & options, 2136 TShape const & shape, 2137 Swift<TSwiftSpec> const) 2138 { 2139 2140 #ifdef RAZERS_MATEPAIRS 2141 if (options.libraryLength >= 0) 2142 return mapMatePairReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift<TSwiftSpec>()); 2143 else 2144 #endif 2145 return mapSingleReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift<TSwiftSpec>()); 2146 } 2147 2148 template < 2149 typename TMatches, 2150 typename TGenomeSet, 2151 typename TReadSet, 2152 typename TCounts, 2153 typename TSpec, 2154 typename TShape, 2155 typename TSwiftSpec > 2156 int mapReads( 2157 TMatches & matches, 2158 TGenomeSet & genomeSet, 2159 TReadSet & readSet, 2160 TCounts & cnts, 2161 RazerSOptions<TSpec> & options, 2162 TShape const & shape, 2163 Swift<TSwiftSpec> const) 2164 { 2165 #ifdef RAZERS_MATEPAIRS 2166 if (options.libraryLength >= 0) 2167 return mapMatePairReads(matches, genomeSet, readSet, cnts, options, shape, Swift<TSwiftSpec>()); 2168 else 2169 #endif 2170 return mapSingleReads(matches, genomeSet, readSet, cnts, options, shape, Swift<TSwiftSpec>()); 2171 } 2172 2173 ////////////////////////////////////////////////////////////////////////////// 2174 // Wrapper for different template specializations 2175 template <typename TMatches, typename TReadSet, typename TCounts, typename TSpec> 2176 int mapReads( 2177 TMatches & matches, 2178 StringSet<CharString> & genomeFileNameList, 2179 StringSet<CharString> & genomeNames, // genome names, taken from the Fasta file 2180 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap, 2181 TReadSet & readSet, 2182 TCounts & cnts, 2183 RazerSOptions<TSpec> & options) 2184 { 2185 // first check whether split mapping (--> two shapes) or normal mapping 2186 #ifdef RAZERS_SPLICED 2187 if (options.minMatchLen > 0) 2188 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options); 2189 #endif 2190 2191 Shape<Dna, SimpleShape> ungapped; 2192 Shape<Dna, OneGappedShape> onegapped; 2193 Shape<Dna, GenericShape> gapped; 2194 2195 // 2x3 SPECIALIZATION 2196 2197 if (options.hammingOnly) 2198 { 2199 // select best-fitting shape 2200 if (stringToShape(ungapped, options.shape)) 2201 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobalHamming>()); 2202 2203 if (stringToShape(onegapped, options.shape)) 2204 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobalHamming>()); 2205 2206 if (stringToShape(gapped, options.shape)) 2207 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift<SwiftSemiGlobalHamming>()); 2208 } 2209 else 2210 { 2211 if (stringToShape(ungapped, options.shape)) 2212 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobal>()); 2213 2214 if (stringToShape(onegapped, options.shape)) 2215 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobal>()); 2216 2217 if (stringToShape(gapped, options.shape)) 2218 return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift<SwiftSemiGlobal>()); 2219 } 2220 2221 return RAZERS_INVALID_SHAPE; 2222 } 2223 2224 template <typename TMatches, typename TGenomeSet, typename TReadSet, typename TCounts, typename TSpec> 2225 int mapReads( 2226 TMatches & matches, 2227 TGenomeSet & genomeSet, 2228 TReadSet & readSet, 2229 TCounts & cnts, 2230 RazerSOptions<TSpec> & options) 2231 { 2232 2233 2234 // first check whether split mapping (--> two shapes) or normal mapping 2235 #ifdef RAZERS_SPLICED 2236 if (options.minMatchLen > 0) 2237 return mapSplicedReads(matches, genomeSet, readSet, cnts, options); 2238 #endif 2239 2240 2241 Shape<Dna, SimpleShape> ungapped; 2242 Shape<Dna, OneGappedShape> onegapped; 2243 Shape<Dna, GenericShape> gapped; 2244 2245 // 2x3 SPECIALIZATION 2246 2247 if (options.hammingOnly) 2248 { 2249 // select best-fitting shape 2250 if (stringToShape(ungapped, options.shape)) 2251 return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobalHamming>()); 2252 2253 if (stringToShape(onegapped, options.shape)) 2254 return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobalHamming>()); 2255 2256 if (stringToShape(gapped, options.shape)) 2257 return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift<SwiftSemiGlobalHamming>()); 2258 } 2259 else 2260 { 2261 if (stringToShape(ungapped, options.shape)) 2262 return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobal>()); 2263 2264 if (stringToShape(onegapped, options.shape)) 2265 return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobal>()); 2266 2267 if (stringToShape(gapped, options.shape)) 2268 return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift<SwiftSemiGlobal>()); 2269 } 2270 2271 return RAZERS_INVALID_SHAPE; 2272 } 2273 2274 } 2275 2276 #endif 2277