1 /*========================================================================== 2 RazerS - Fast Read Mapping with Controlled Loss Rate 3 http://www.seqan.de/projects/razers.html 4 5 ============================================================================ 6 Copyright (C) 2008 by David Weese 7 8 This program is free software; you can redistribute it and/or 9 modify it under the terms of the GNU Lesser General Public 10 License as published by the Free Software Foundation; either 11 version 3 of the License, or (at your options) any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 Lesser General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. 20 ==========================================================================*/ 21 22 #ifndef SEQAN_HEADER_RAZERS_MATEPAIRS_H 23 #define SEQAN_HEADER_RAZERS_MATEPAIRS_H 24 25 #include <seqan/misc/misc_dequeue.h> 26 27 namespace SEQAN_NAMESPACE_MAIN 28 { 29 30 // We require mate-pairs to be stored together in one read string. 31 // Pair i has mates at positions 2*i and 2*i+1 in the read string. 32 33 34 ////////////////////////////////////////////////////////////////////////////// 35 // Definitions 36 37 typedef StringSet<TRead const, Dependent<> > TMPReadSet; 38 39 #ifdef RAZERS_MEMOPT 40 41 template <typename TShape> 42 struct SAValue< Index<TMPReadSet, IndexQGram<TShape> > > { 43 typedef Pair< 44 unsigned, 45 unsigned, 46 BitCompressed<24, 8> // max. 16M reads of length < 256 47 > Type; 48 }; 49 50 #else 51 52 template <typename TShape> 53 struct SAValue< Index<TMPReadSet, IndexQGram<TShape> > > { 54 typedef Pair< 55 unsigned, // many reads 56 unsigned, // of arbitrary length 57 Compressed 58 > Type; 59 }; 60 61 #endif 62 63 64 template <typename TShape, typename TSpec> 65 struct Cargo< Index<TMPReadSet, IndexQGram<TShape, TSpec> > > { 66 typedef struct { 67 double abundanceCut; 68 int _debugLevel; 69 } Type; 70 }; 71 72 #ifdef RAZERS_PRUNE_QGRAM_INDEX 73 74 ////////////////////////////////////////////////////////////////////////////// 75 // Repeat masker 76 template <typename TShape> 77 inline bool _qgramDisableBuckets(Index<TMPReadSet, IndexQGram<TShape> > &index) 78 { 79 typedef Index<TMPReadSet, IndexQGram<TShape> > TReadIndex; 80 typedef typename Fibre<TReadIndex, QGramDir>::Type TDir; 81 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 82 typedef typename Value<TDir>::Type TSize; 83 84 TDir &dir = indexDir(index); 85 bool result = false; 86 unsigned counter = 0; 87 TSize thresh = (TSize)(length(index) * cargo(index).abundanceCut); 88 if (thresh < 100) thresh = 100; 89 90 TDirIterator it = begin(dir, Standard()); 91 TDirIterator itEnd = end(dir, Standard()); 92 for (; it != itEnd; ++it) 93 if (*it > thresh) 94 { 95 *it = (TSize)-1; 96 result = true; 97 ++counter; 98 } 99 100 if (counter > 0 && cargo(index)._debugLevel >= 1) 101 ::std::cerr << "Removed " << counter << " k-mers" << ::std::endl; 102 103 return result; 104 } 105 106 #endif 107 108 ////////////////////////////////////////////////////////////////////////////// 109 // Load multi-Fasta sequences 110 template <typename TFSSpec, typename TFSConfig, typename TRazerSOptions> 111 bool loadReads( 112 FragmentStore<TFSSpec, TFSConfig> & store, 113 const char * fileNameL, // left mates file 114 const char * fileNameR, // right mates file 115 TRazerSOptions & options) 116 { 117 bool countN = !(options.matchN || options.outputFormat == 1); 118 119 MultiFasta leftMates; 120 MultiFasta rightMates; 121 122 if (!open(leftMates.concat, fileNameL, OPEN_RDONLY)) return false; 123 if (!open(rightMates.concat, fileNameR, OPEN_RDONLY)) return false; 124 125 AutoSeqFormat formatL; 126 guessFormat(leftMates.concat, formatL); 127 split(leftMates, formatL); 128 129 AutoSeqFormat formatR; 130 guessFormat(rightMates.concat, formatR); 131 split(rightMates, formatR); 132 133 unsigned seqCount = length(leftMates); 134 if (seqCount != length(rightMates)) 135 if (options._debugLevel > 1) 136 { 137 ::std::cerr << "Numbers of mates differ: " << seqCount << "(left) != " << length(rightMates) << "(right).\n"; 138 return false; 139 } 140 141 String<Dna5Q> seq[2]; 142 CharString qual[2]; 143 CharString id[2]; 144 145 unsigned kickoutcount = 0; 146 unsigned maxReadLength = 0; 147 for(unsigned i = 0; i < seqCount; ++i) 148 { 149 if (options.readNaming == 0 || options.readNaming == 3) 150 { 151 assignSeqId(id[0], leftMates[i], formatL); // read left Fasta id 152 assignSeqId(id[1], rightMates[i], formatR); // read right Fasta id 153 if (options.readNaming == 0) 154 { 155 append(id[0], "/L"); 156 append(id[1], "/R"); 157 } 158 } 159 160 assignSeq(seq[0], leftMates[i], formatL); // read left Read sequence 161 assignSeq(seq[1], rightMates[i], formatR); // read right Read sequence 162 assignQual(qual[0], leftMates[i], formatL); // read left ascii quality values 163 assignQual(qual[1], rightMates[i], formatR); // read right ascii quality values 164 165 if (countN) 166 { 167 for (int j = 0; j < 2; ++j) 168 { 169 int maxBase = (int)(0.8 * length(seq[j])); 170 int allowed[5] = 171 { maxBase, maxBase, maxBase, maxBase, (int)(options.errorRate * length(seq[j]))}; 172 for (unsigned k = 0; k < length(seq[j]); ++k) 173 if (--allowed[ordValue(getValue(seq[j], k))] == 0) 174 { 175 // std::cout << "Ignoring mate-pair: " << seq[0] << " " << seq[1] << std::endl; 176 clear(seq[0]); 177 clear(seq[1]); 178 clear(id[0]); 179 clear(id[1]); 180 ++kickoutcount; 181 break; 182 } 183 } 184 } 185 186 for (int j = 0; j < 2; ++j) 187 { 188 // store dna and quality together 189 assignQualities(seq[j], qual[j]); 190 191 if (options.trimLength > 0 && length(seq[j]) > (unsigned)options.trimLength) 192 resize(seq[j], options.trimLength); 193 } 194 appendMatePair(store, seq[0], seq[1], id[0], id[1]); 195 if (maxReadLength < length(seq[0])) 196 maxReadLength = length(seq[0]); 197 if (maxReadLength < length(seq[1])) 198 maxReadLength = length(seq[1]); 199 } 200 // memory optimization 201 reserve(store.readSeqStore.concat, length(store.readSeqStore.concat), Exact()); 202 // reserve(store.readNameStore.concat, length(store.readNameStore.concat), Exact()); 203 204 typedef Shape<Dna, SimpleShape> TShape; 205 typedef typename SAValue< Index<TMPReadSet, IndexQGram<TShape, OpenAddressing> > >::Type TSAValue; 206 TSAValue sa(0, 0); 207 sa.i1 = ~sa.i1; 208 sa.i2 = ~sa.i2; 209 210 if ((unsigned)sa.i1 < length(store.readSeqStore) - 1) 211 { 212 ::std::cerr << "Maximal read number of " << (unsigned)sa.i1 + 1 << " exceeded. Please remove \"#define RAZERS_MEMOPT\" in razers.cpp and recompile." << ::std::endl; 213 seqCount = 0; 214 } 215 if ((unsigned)sa.i2 < maxReadLength - 1) 216 { 217 ::std::cerr << "Maximal read length of " << (unsigned)sa.i2 + 1 << " bps exceeded. Please remove \"#define RAZERS_MEMOPT\" in razers.cpp and recompile." << ::std::endl; 218 seqCount = 0; 219 } 220 221 if (options._debugLevel > 1 && kickoutcount > 0) 222 ::std::cerr << "Ignoring " << kickoutcount << " low quality mate-pairs.\n"; 223 return (seqCount > 0); 224 } 225 226 template <typename TFragmentStore> 227 struct LessPairScore : 228 public ::std::binary_function < 229 typename Value<typename TFragmentStore::TAlignedReadStore>::Type, 230 typename Value<typename TFragmentStore::TAlignedReadStore>::Type, 231 bool > 232 { 233 TFragmentStore &store; 234 235 LessPairScore(TFragmentStore &_store): 236 store(_store) {} 237 238 inline bool operator() ( 239 typename Value<typename TFragmentStore::TAlignedReadStore>::Type const &a, 240 typename Value<typename TFragmentStore::TAlignedReadStore>::Type const &b) const 241 { 242 typedef typename TFragmentStore::TReadStore TReadStore; 243 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore; 244 typedef typename TFragmentStore::TAlignQualityStore TAlignQualityStore; 245 typedef typename Value<TReadStore>::Type TRead; 246 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 247 typedef typename Value<TAlignQualityStore>::Type TQual; 248 typedef typename Id<TRead>::Type TId; 249 250 // pair number 251 if (b.readId == TAlignedRead::INVALID_ID) return false; 252 if (a.readId == TAlignedRead::INVALID_ID) return true; 253 TRead const &ra = store.readStore[a.readId]; 254 TRead const &rb = store.readStore[b.readId]; 255 if (ra.matePairId < rb.matePairId) return true; 256 if (ra.matePairId > rb.matePairId) return false; 257 258 // quality 259 if (a.id == TAlignedRead::INVALID_ID) return false; 260 if (b.id == TAlignedRead::INVALID_ID) return true; 261 TQual const &qa = store.alignQualityStore[a.id]; 262 TQual const &qb = store.alignQualityStore[b.id]; 263 if (qa.pairScore > qb.pairScore) return true; 264 if (qa.pairScore < qb.pairScore) return false; 265 266 return a.pairMatchId < b.pairMatchId; 267 } 268 }; 269 270 271 ////////////////////////////////////////////////////////////////////////////// 272 // Remove low quality matches 273 template < typename TFragmentStore, typename TCounts, typename TSpec, typename TSwiftL, typename TSwiftR > 274 void compactPairMatches( 275 TFragmentStore & store, 276 TCounts &, 277 RazerSOptions<TSpec> & options, 278 TSwiftL & swiftL, 279 TSwiftR & swiftR) 280 { 281 typedef typename TFragmentStore::TReadStore TReadStore; 282 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore; 283 typedef typename TFragmentStore::TAlignQualityStore TAlignQualityStore; 284 typedef typename Value<TReadStore>::Type TRead; 285 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 286 typedef typename Value<TAlignQualityStore>::Type TQual; 287 typedef typename Iterator<TAlignedReadStore, Standard>::Type TIterator; 288 289 unsigned matePairId = -2; 290 unsigned hitCount = 0; 291 unsigned hitCountCutOff = options.maxHits; 292 int scoreDistCutOff = MinValue<int>::VALUE; 293 294 TIterator it = begin(store.alignedReadStore, Standard()); 295 TIterator itEnd = end(store.alignedReadStore, Standard()); 296 TIterator dit = it; 297 TIterator ditBeg = it; 298 299 // sort 300 sortAlignedReads(store.alignedReadStore, LessPairScore<TFragmentStore>(store)); 301 302 for (; it != itEnd; ++it) 303 { 304 if ((*it).id == TAlignedRead::INVALID_ID || (*it).readId == TAlignedRead::INVALID_ID) continue; 305 TRead &r = store.readStore[(*it).readId]; 306 TQual &q = store.alignQualityStore[(*it).id]; 307 if (matePairId == r.matePairId) 308 { 309 if (q.pairScore <= scoreDistCutOff) continue; 310 if (++hitCount >= hitCountCutOff) 311 { 312 #ifdef RAZERS_MASK_READS 313 if (hitCount == hitCountCutOff) 314 { 315 // we have enough, now look for better matches 316 int maxErrors = -1 - q.pairScore; 317 if (options.purgeAmbiguous) 318 maxErrors = -1; 319 320 setMaxErrors(swiftL, matePairId, maxErrors); 321 setMaxErrors(swiftR, matePairId, maxErrors); 322 323 if (maxErrors == -1 && options._debugLevel >= 2) 324 ::std::cerr << "(pair #" << matePairId << " disabled)"; 325 326 if (options.purgeAmbiguous) 327 dit = ditBeg; 328 } 329 #endif 330 continue; 331 } 332 } 333 else 334 { 335 matePairId = r.matePairId; 336 hitCount = 0; 337 if (options.distanceRange > 0) 338 scoreDistCutOff = q.pairScore - options.distanceRange; 339 ditBeg = dit; 340 } 341 *dit = *it; ++dit; ++it; 342 *dit = *it; ++dit; 343 } 344 resize(store.alignedReadStore, dit - begin(store.alignedReadStore, Standard())); 345 compactAlignedReads(store); 346 } 347 348 349 350 #ifndef RAZERS_PARALLEL 351 ////////////////////////////////////////////////////////////////////////////// 352 // Find read matches in one genome sequence 353 template < 354 typename TFSSpec, 355 typename TFSConfig, 356 typename TGenome, 357 typename TReadIndex, 358 typename TSwiftSpec, 359 typename TPreprocessing, 360 typename TCounts, 361 typename TRazerSOptions > 362 void mapMatePairReads( 363 FragmentStore<TFSSpec, TFSConfig> & store, 364 TGenome & genome, // genome ... 365 unsigned contigId, // ... and its sequence number 366 Pattern<TReadIndex, Swift<TSwiftSpec> > & swiftPatternL, 367 Pattern<TReadIndex, Swift<TSwiftSpec> > & swiftPatternR, 368 TPreprocessing & preprocessingL, 369 TPreprocessing & preprocessingR, 370 TCounts & cnts, 371 char orientation, // q-gram index of reads 372 TRazerSOptions & options) 373 { 374 typedef FragmentStore<TFSSpec, TFSConfig> TFragmentStore; 375 typedef typename TFragmentStore::TMatePairStore TMatePairStore; 376 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore; 377 typedef typename TFragmentStore::TAlignQualityStore TAlignQualityStore; 378 typedef typename Value<TMatePairStore>::Type TMatePair; 379 typedef typename Value<TAlignedReadStore>::Type TAlignedRead; 380 typedef typename Value<TAlignQualityStore>::Type TAlignQuality; 381 typedef typename Fibre<TReadIndex, FibreText>::Type TReadSet; 382 typedef typename Id<TAlignedRead>::Type TId; 383 384 typedef typename Size<TGenome>::Type TSize; 385 typedef typename Position<TGenome>::Type TGPos; 386 typedef typename MakeSigned_<TGPos>::Type TSignedGPos; 387 typedef typename Infix<TGenome>::Type TGenomeInf; 388 389 // FILTRATION 390 typedef Finder<TGenome, Swift<TSwiftSpec> > TSwiftFinderL; 391 typedef Finder<TGenomeInf, Swift<TSwiftSpec> > TSwiftFinderR; 392 typedef Pattern<TReadIndex, Swift<TSwiftSpec> > TSwiftPattern; 393 394 // MATE-PAIR FILTRATION 395 typedef Triple<__int64, TAlignedRead, TAlignQuality> TDequeueValue; 396 typedef Dequeue<TDequeueValue> TDequeue; 397 typedef typename TDequeue::TIter TDequeueIterator; 398 399 // VERIFICATION 400 typedef MatchVerifier < 401 TFragmentStore, 402 TRazerSOptions, 403 TPreprocessing, 404 TSwiftPattern, 405 TCounts > TVerifier; 406 407 const unsigned NOT_VERIFIED = 1u << (8*sizeof(unsigned)-1); 408 409 // iterate all genomic sequences 410 if (options._debugLevel >= 1) 411 { 412 ::std::cerr << ::std::endl << "Process genome seq #" << contigId; 413 if (orientation == 'F') 414 ::std::cerr << "[fwd]"; 415 else 416 ::std::cerr << "[rev]"; 417 } 418 419 TReadSet &readSetL = host(host(swiftPatternL)); 420 TReadSet &readSetR = host(host(swiftPatternR)); 421 TVerifier verifierL(store, options, preprocessingL, swiftPatternL, cnts); 422 TVerifier verifierR(store, options, preprocessingR, swiftPatternR, cnts); 423 424 verifierL.oneMatchPerBucket = true; 425 verifierR.oneMatchPerBucket = true; 426 verifierL.m.contigId = contigId; 427 verifierR.m.contigId = contigId; 428 429 if (empty(readSetL)) 430 return; 431 432 // distance <= libLen + libErr + 2*(parWidth-readLen) - shapeLen 433 // distance >= libLen - libErr - 2*parWidth + shapeLen 434 TSize readLength = length(readSetL[0]); 435 TSignedGPos maxDistance = options.libraryLength + options.libraryError - 2 * (int)readLength - (int)length(indexShape(host(swiftPatternL))); 436 TSignedGPos minDistance = options.libraryLength - options.libraryError + (int)length(indexShape(host(swiftPatternL))); 437 TGPos scanShift = (minDistance < 0)? 0: minDistance; 438 439 // exit if contig is shorter than library size 440 if (length(genome) <= scanShift) 441 return; 442 443 TGenomeInf genomeInf = infix(genome, scanShift, length(genome)); 444 TSwiftFinderL swiftFinderL(genome, options.repeatLength, 1); 445 TSwiftFinderR swiftFinderR(genomeInf, options.repeatLength, 1); 446 447 TDequeue fifo; // stores left-mate potential matches 448 String<__int64> lastPotMatchNo; // last number of a left-mate potential 449 __int64 lastNo = 0; // last number over all left-mate pot. matches in the queue 450 __int64 firstNo = 0; // first number over all left-mate pot. match in the queue 451 Pair<TGPos> gPair; 452 453 resize(lastPotMatchNo, length(host(swiftPatternL)), (__int64)-1, Exact()); 454 455 TSize gLength = length(genome); 456 457 TAlignedRead mR; 458 TAlignQuality qR; 459 TDequeueValue fL(-1, mR, qR); // to supress uninitialized warnings 460 461 // unsigned const preFetchMatches = 2048; 462 463 // iterate all verification regions returned by SWIFT 464 while (find(swiftFinderR, swiftPatternR, options.errorRate)) 465 { 466 unsigned matePairId = swiftPatternR.curSeqNo; 467 TGPos rEndPos = endPosition(swiftFinderR) + scanShift; 468 TGPos doubleParWidth = 2 * (*swiftFinderR.curHit).bucketWidth; 469 470 // remove out-of-window left mates from fifo 471 while (!empty(fifo) && (TSignedGPos)front(fifo).i2.endPos + maxDistance + (TSignedGPos)doubleParWidth < (TSignedGPos)rEndPos) 472 { 473 popFront(fifo); 474 ++firstNo; 475 } 476 /* 477 if (empty(fifo) || back(fifo).endPos + minDistance < (TSignedGPos)(rEndPos + doubleParWidth)) 478 for (unsigned i = 0; i < preFetchMatches; ++i) 479 if (find(swiftFinderL, swiftPatternL, options.errorRate, false)) 480 pushBack(fifo, mL); 481 else 482 break; 483 */ 484 // add within-window left mates to fifo 485 while (empty(fifo) || (TSignedGPos)back(fifo).i2.endPos + minDistance < (TSignedGPos)(rEndPos + doubleParWidth)) 486 { 487 if (find(swiftFinderL, swiftPatternL, options.errorRate)) 488 { 489 gPair = positionRange(swiftFinderL); 490 if ((TSignedGPos)gPair.i2 + maxDistance + (TSignedGPos)doubleParWidth >= (TSignedGPos)rEndPos) 491 { 492 // link in 493 fL.i1 = lastPotMatchNo[swiftPatternL.curSeqNo]; 494 lastPotMatchNo[swiftPatternL.curSeqNo] = lastNo++; 495 496 fL.i2.readId = store.matePairStore[swiftPatternL.curSeqNo].readId[0] | NOT_VERIFIED; 497 fL.i2.beginPos = gPair.i1; 498 fL.i2.endPos = gPair.i2; 499 500 pushBack(fifo, fL); 501 } 502 } else 503 break; 504 } 505 506 int bestLeftScore = MinValue<int>::VALUE; 507 int bestLibSizeError = MaxValue<int>::VALUE; 508 TDequeueIterator bestLeft = TDequeueIterator(); 509 510 TDequeueIterator it; 511 unsigned leftReadId = store.matePairStore[matePairId].readId[0]; 512 __int64 lastPositive = (__int64)-1; 513 for (__int64 i = lastPotMatchNo[matePairId]; firstNo <= i; i = (*it).i1) 514 { 515 it = &value(fifo, i - firstNo); 516 517 // search left mate 518 // if (((*it).i2.readId & ~NOT_VERIFIED) == leftReadId) 519 { 520 // verify left mate (equal seqNo), if not done already 521 if ((*it).i2.readId & NOT_VERIFIED) 522 { 523 524 // if (matchVerify( 525 // (*it).i2, (*it).i3, infix(genome, (TSignedGPos)(*it).i2.beginPos, (TSignedGPos)(*it).i2.endPos), 526 // matePairId, readSetL, forwardPatternsL, 527 // options, TSwiftSpec())) 528 if (matchVerify(verifierL, infix(genome, (TSignedGPos)(*it).i2.beginPos, (TSignedGPos)(*it).i2.endPos), 529 matePairId, readSetL, TSwiftSpec())) 530 { 531 verifierL.m.readId = (*it).i2.readId & ~NOT_VERIFIED; // has been verified positively 532 (*it).i2 = verifierL.m; 533 (*it).i3 = verifierL.q; 534 535 // short-cut negative matches 536 if (lastPositive == (__int64)-1) 537 lastPotMatchNo[matePairId] = i; 538 else 539 value(fifo, lastPositive - firstNo).i1 = i; 540 lastPositive = i; 541 } else 542 (*it).i2.readId = ~NOT_VERIFIED; // has been verified negatively 543 } 544 /* 545 if ((*it).i2.readId == leftReadId) 546 { 547 bestLeft = it; 548 bestLeftScore = (*it).i3.score; 549 break; 550 } 551 */ 552 if ((*it).i2.readId == leftReadId) 553 { 554 int score = (*it).i3.score; 555 if (bestLeftScore <= score) 556 { 557 int libSizeError = options.libraryLength - (int)((__int64)mR.endPos - (__int64)(*it).i2.beginPos); 558 if (libSizeError < 0) libSizeError = -libSizeError; 559 if (bestLeftScore == score) 560 { 561 if (bestLibSizeError > libSizeError) 562 { 563 bestLibSizeError = libSizeError; 564 bestLeft = it; 565 } 566 } 567 else 568 { 569 bestLeftScore = score; 570 bestLibSizeError = libSizeError; 571 bestLeft = it; 572 if (bestLeftScore == 0) break; // TODO: replace if we have real qualities 573 } 574 } 575 } 576 } 577 // else 578 // std::cout << "HUH?" << std::endl; 579 } 580 581 // short-cut negative matches 582 if (lastPositive == (__int64)-1) 583 lastPotMatchNo[matePairId] = (__int64)-1; 584 else 585 value(fifo, lastPositive - firstNo).i1 = (__int64)-1; 586 587 // verify right mate, if left mate matches 588 if (bestLeftScore != MinValue<int>::VALUE) 589 { 590 // if (matchVerify( 591 // mR, qR, infix(swiftFinderR, genomeInf), 592 // matePairId, readSetR, forwardPatternsR, 593 // options, TSwiftSpec())) 594 if (matchVerify(verifierR, infix(swiftFinderR, genomeInf), 595 matePairId, readSetR, TSwiftSpec())) 596 { 597 // distance between left mate beginning and right mate end 598 __int64 dist = (__int64)verifierR.m.endPos - (__int64)(*bestLeft).i2.beginPos; 599 if (dist <= options.libraryLength + options.libraryError && 600 options.libraryLength <= dist + options.libraryError) 601 { 602 mR = verifierR.m; 603 qR = verifierR.q; 604 605 fL.i2 = (*bestLeft).i2; 606 fL.i3 = (*bestLeft).i3; 607 608 // transform mate readNo to global readNo 609 TMatePair &mp = store.matePairStore[matePairId]; 610 fL.i2.readId = mp.readId[0]; 611 mR.readId = mp.readId[1]; 612 613 // transform coordinates to the forward strand 614 if (orientation == 'F') 615 { 616 TSize temp = mR.beginPos; 617 mR.beginPos = mR.endPos; 618 mR.endPos = temp; 619 } else 620 { 621 fL.i2.beginPos = gLength - fL.i2.beginPos; 622 fL.i2.endPos = gLength - fL.i2.endPos; 623 TSize temp = mR.beginPos; 624 mR.beginPos = gLength - mR.endPos; 625 mR.endPos = gLength - temp; 626 dist = -dist; 627 } 628 629 // set a unique pair id 630 fL.i2.pairMatchId = mR.pairMatchId = options.nextPairMatchId; 631 if (++options.nextPairMatchId == TAlignedRead::INVALID_ID) 632 options.nextPairMatchId = 0; 633 634 // score the whole match pair 635 fL.i3.pairScore = qR.pairScore = fL.i3.score + qR.score; 636 637 // both mates match with correct library size 638 /* std::cout << "found " << matePairId << " on " << orientation << contigId; 639 std::cout << " dist:" << dist; 640 if (orientation=='F') 641 std::cout << " \t_" << fL.i2.beginPos+1 << "_" << mR.endPos; 642 else 643 std::cout << " \t_" << mR.beginPos+1 << "_" << mL.endPos; 644 // std::cout << " L_" << (*bestLeft).beginPos << "_" << (*bestLeft).endPos << "_" << (*bestLeft).editDist; 645 // std::cout << " R_" << mR.beginPos << "_" << mR.endPos << "_" << mR.editDist; 646 std::cout << std::endl; 647 */ 648 if (!options.spec.DONT_DUMP_RESULTS) 649 { 650 fL.i2.id = length(store.alignedReadStore); 651 appendValue(store.alignedReadStore, fL.i2, Generous()); 652 appendValue(store.alignQualityStore, fL.i3, Generous()); 653 mR.id = length(store.alignedReadStore); 654 appendValue(store.alignedReadStore, mR, Generous()); 655 appendValue(store.alignQualityStore, qR, Generous()); 656 657 if (length(store.alignedReadStore) > options.compactThresh) 658 { 659 typename Size<TAlignedReadStore>::Type oldSize = length(store.alignedReadStore); 660 // maskDuplicates(matches); // overlapping parallelograms cause duplicates 661 compactPairMatches(store, cnts, options, swiftPatternL, swiftPatternR); 662 663 if (length(store.alignedReadStore) * 4 > oldSize) // the threshold should not be raised 664 options.compactThresh += (options.compactThresh >> 1); // if too many matches were removed 665 666 if (options._debugLevel >= 2) 667 ::std::cerr << '(' << oldSize - length(store.alignedReadStore) << " matches removed)"; 668 } 669 } 670 ++options.countVerification; 671 } 672 ++options.countFiltration; 673 } 674 } 675 } 676 } 677 #endif 678 679 680 ////////////////////////////////////////////////////////////////////////////// 681 // Find read matches in many genome sequences (import from Fasta) 682 template < 683 typename TFSSpec, 684 typename TFSConfig, 685 typename TGNoToFile, 686 typename TCounts, 687 typename TSpec, 688 typename TShape, 689 typename TSwiftSpec > 690 int mapMatePairReads( 691 FragmentStore<TFSSpec, TFSConfig> & store, 692 StringSet<CharString> & genomeFileNameList, 693 String<TGNoToFile> & gnoToFileMap, 694 TCounts & cnts, 695 RazerSOptions<TSpec> & options, 696 TShape const & shape, 697 Swift<TSwiftSpec> const) 698 { 699 typedef FragmentStore<TFSSpec, TFSConfig> TFragmentStore; 700 typedef typename TFragmentStore::TReadSeqStore TReadSeqStore; 701 702 typedef typename Value<TReadSeqStore>::Type TRead; 703 typedef StringSet<TRead> TReadSet; 704 typedef Index<TReadSet, IndexQGram<TShape> > TIndex; // q-gram index 705 typedef Pattern<TIndex, Swift<TSwiftSpec> > TSwiftPattern; // filter 706 typedef Pattern<TRead, MyersUkkonen> TMyersPattern; // verifier 707 708 // std::cout << "SA-TYPE:" <<sizeof(typename SAValue<TIndex>::Type)<<std::endl; 709 710 // split mate-pairs over two indices 711 TReadSet readSetL, readSetR; 712 unsigned pairCount = length(store.matePairStore); 713 resize(readSetL, pairCount, Exact()); 714 resize(readSetR, pairCount, Exact()); 715 716 for (unsigned i = 0; i < pairCount; ++i) 717 { 718 assign(readSetL[i], store.readSeqStore[store.matePairStore[i].readId[0]]); 719 assign(readSetR[i], store.readSeqStore[store.matePairStore[i].readId[1]]); 720 } 721 reverseComplement(readSetR); 722 723 // configure q-gram index 724 TIndex swiftIndexL(readSetL, shape); 725 TIndex swiftIndexR(readSetR, shape); 726 reverse(indexShape(swiftIndexR)); // right mate qualities are reversed -> reverse right shape 727 728 cargo(swiftIndexL).abundanceCut = options.abundanceCut; 729 cargo(swiftIndexR).abundanceCut = options.abundanceCut; 730 cargo(swiftIndexL)._debugLevel = options._debugLevel; 731 cargo(swiftIndexR)._debugLevel = options._debugLevel; 732 733 // configure Swift 734 TSwiftPattern swiftPatternL(swiftIndexL); 735 TSwiftPattern swiftPatternR(swiftIndexR); 736 swiftPatternL.params.minThreshold = options.threshold; 737 swiftPatternR.params.minThreshold = options.threshold; 738 swiftPatternL.params.tabooLength = options.tabooLength; 739 swiftPatternR.params.tabooLength = options.tabooLength; 740 swiftPatternL.params.printDots = false; 741 swiftPatternR.params.printDots = options._debugLevel > 0; 742 743 // init edit distance verifiers 744 String<TMyersPattern> forwardPatternsL; 745 String<TMyersPattern> forwardPatternsR; 746 options.compMask[4] = (options.matchN)? 15: 0; 747 if (!options.hammingOnly) 748 { 749 resize(forwardPatternsL, pairCount, Exact()); 750 resize(forwardPatternsR, pairCount, Exact()); 751 for(unsigned i = 0; i < pairCount; ++i) 752 { 753 #ifdef RAZERS_NOOUTERREADGAPS 754 if (!empty(readSetL[i]) && !empty(readSetR[i])) { 755 setHost(forwardPatternsL[i], prefix(readSetL[i], length(readSetL[i]) - 1)); 756 setHost(forwardPatternsR[i], prefix(readSetR[i], length(readSetR[i]) - 1)); 757 } 758 #else 759 setHost(forwardPatternsL[i], readSetL[i]); 760 setHost(forwardPatternsR[i], readSetR[i]); 761 #endif 762 _patternMatchNOfPattern(forwardPatternsL[i], options.matchN); 763 _patternMatchNOfPattern(forwardPatternsR[i], options.matchN); 764 _patternMatchNOfFinder(forwardPatternsL[i], options.matchN); 765 _patternMatchNOfFinder(forwardPatternsR[i], options.matchN); 766 } 767 } 768 769 // clear stats 770 options.countFiltration = 0; 771 options.countVerification = 0; 772 options.timeMapReads = 0; 773 options.timeDumpResults = 0; 774 775 unsigned filecount = 0; 776 unsigned numFiles = length(genomeFileNameList); 777 unsigned contigId = 0; 778 779 // open genome files, one by one 780 while (filecount < numFiles) 781 { 782 // open genome file 783 ::std::ifstream file; 784 file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary); 785 if (!file.is_open()) 786 return RAZERS_GENOME_FAILED; 787 788 // remove the directory prefix of current genome file 789 ::std::string genomeFile(toCString(genomeFileNameList[filecount])); 790 size_t lastPos = genomeFile.find_last_of('/') + 1; 791 if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1; 792 if (lastPos == genomeFile.npos) lastPos = 0; 793 ::std::string genomeName = genomeFile.substr(lastPos); 794 795 CharString id; 796 Dna5String genome; 797 unsigned gseqNoWithinFile = 0; 798 // iterate over genome sequences 799 SEQAN_PROTIMESTART(find_time); 800 for(; !_streamEOF(file); ++contigId) 801 { 802 if (options.genomeNaming == 0) 803 { 804 //readID(file, id, Fasta()); // read Fasta id 805 readShortID(file, id, Fasta()); // read Fasta id up to first whitespace 806 appendValue(store.contigNameStore, id, Generous()); 807 } 808 read(file, genome, Fasta()); // read Fasta sequence 809 810 appendValue(gnoToFileMap, TGNoToFile(genomeName, gseqNoWithinFile)); 811 812 if (options.forward) 813 mapMatePairReads(store, genome, contigId, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'F', options); 814 815 if (options.reverse) 816 { 817 reverseComplement(genome); 818 mapMatePairReads(store, genome, contigId, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'R', options); 819 } 820 ++gseqNoWithinFile; 821 822 } 823 options.timeMapReads += SEQAN_PROTIMEDIFF(find_time); 824 file.close(); 825 ++filecount; 826 } 827 828 compactPairMatches(store, cnts, options, swiftPatternL, swiftPatternR); 829 reverseComplement(readSetR); 830 831 if (options._debugLevel >= 1) 832 ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl; 833 if (options._debugLevel >= 2) { 834 ::std::cerr << ::std::endl; 835 ::std::cerr << "___FILTRATION_STATS____" << ::std::endl; 836 ::std::cerr << "Filtration counter: " << options.countFiltration << ::std::endl; 837 ::std::cerr << "Verfication counter: " << options.countVerification << ::std::endl; 838 } 839 return 0; 840 } 841 842 843 } 844 845 #endif 846