1 /*==========================================================================
2 SplitRazerS - Fast Split Read Mapping
3
4 ============================================================================
5 Copyright (C) 2008 by Anne-Katrin Emde
6
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your options) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 ==========================================================================*/
20
21
22 #ifndef SEQAN_HEADER_RAZERS_SPLICED_H
23 #define SEQAN_HEADER_RAZERS_SPLICED_H
24
25 //#define RAZERS_DEBUG
26 //#define RAZERS_DEBUG_LIGHT
27
28 #include <seqan/store.h>
29 #include <seqan/align.h>
30 #include <seqan/graph_types.h>
31 #include <seqan/graph_algorithms.h>
32 #include <seqan/align.h>
33 #include <seqan/seeds.h>
34 #include <seqan/bam_io.h>
35
36 namespace seqan
37 {
38
39
40
41 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
42 // Match statistics stuff
43 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
44
45 //statistics, expected number of matches
choose(int n,int k)46 double choose(int n, int k)
47 {
48 double cnk = 1.0;
49
50 if (k * 2 > n)
51 k = n - k;
52
53 for (int i = 1; i <= k; n--, i++)
54 {
55 cnk /= i;
56 cnk *= n;
57 }
58 return cnk;
59 }
60
61
62 //#R - read length
63 //#T - total sequence length
64 //#I - length of insertion
65 //#D - length of deletion
66 //#M1 - minimal prefix match length
67 //#M2 - minimal suffix match length
68
69
70 //## first function denotes the random match to an iid sequence
71 template<typename TSize, typename TValue>
72 double
_probability(TSize R,TSize M1,TSize M2,TValue d,TValue d_m1,TValue d_m2)73 _probability(TSize R, TSize M1, TSize M2, TValue d, TValue d_m1, TValue d_m2)
74 {
75 double sum_prob = 0;
76 for (unsigned i1 = 0; i1 <= d_m1; ++i1)
77 {
78 for (unsigned i2 = 0; i2 <= d_m2; ++i2)
79 {
80 for (unsigned j = 0; j <= d-i1-i2; ++j)
81 {
82 if(R-M1-M2>=j)
83 sum_prob +=
84 (double)(choose(M1,i1)* pow((double)0.25,(double)R-i1-i2-j) *
85 choose(M2,i2) * pow((double)0.75,(double)i1+i2+j) *
86 choose(R-M1-M2,j));
87 }
88 }
89 }
90 return sum_prob;
91 }
92
93
94 //#InsCuts describes the possible number of cuts
95 //#for a read of length R and an insertion of length I
96 template<typename TSize>
97 TSize
_insCuts(TSize R,TSize M1,TSize M2,TSize I)98 _insCuts(TSize R, TSize M1, TSize M2, TSize I){
99 return (R-I-M1-M2+1);
100 }
101
102
103
104 //#function for calculating expected matches with I insertions
105 template<typename TSize, typename TValue>
106 double
_expMatchesIns(TSize R,TSize M1,TSize M2,TSize I,TValue d,TValue d_m1,TValue d_m2,TSize S,TSize T)107 _expMatchesIns(TSize R, TSize M1, TSize M2, TSize I, TValue d,TValue d_m1,TValue d_m2, TSize S, TSize T)
108 {
109 return((double)T*((S-R-I-1)*_probability((R-I),M1,M2,d,d_m1,d_m2))*_insCuts(R,M1,M2,I));
110 }
111
112
113 //#DelCuts describes the possible number of cuts
114 //#for a read of length R in case of a deletion
115 template<typename TSize>
116 TSize
_delCuts(TSize R,TSize M1,TSize M2)117 _delCuts(TSize R, TSize M1, TSize M2)
118 {
119 return (R-M1-M2+1);
120 }
121
122
123 //#Del describes the possible number of configurations
124 //#for a string of length R in a sequence of length S
125 template<typename TSize>
126 TSize
_del(TSize R,TSize S,TSize maxD,TSize minD)127 _del(TSize R, TSize S, TSize maxD, TSize minD)
128 {
129 //return((S-R)*(S-R+1)/2);
130 return (maxD-minD+1)*(S-R+1) - ((maxD+1)*maxD)/2 + ((minD-1)*minD)/2;
131 }
132
133
134 //#function for calculating expected matches with deletion of size D
135 template<typename TSize, typename TValue>
136 double
_expMatchesDel(TSize R,TSize M1,TSize M2,TValue d,TValue d_m1,TValue d_m2,TSize S,TSize T,TSize maxD,TSize minD)137 _expMatchesDel(TSize R, TSize M1, TSize M2, TValue d, TValue d_m1, TValue d_m2, TSize S, TSize T, TSize maxD, TSize minD )
138 {
139 return ((double)T*(_probability(R,M1,M2,d,d_m1,d_m2))*_delCuts(R,M1,M2)*_del(R,S,maxD,minD));
140 }
141
142 template<typename TReadSet, typename TSize, typename TOptions>
143 void
expNumRandomMatches(TReadSet & readSet,TSize genomeLen,TOptions & options)144 expNumRandomMatches(TReadSet &readSet, TSize genomeLen, TOptions & options)
145 {
146 TSize M1 = options.minMatchLen;
147 TSize M2 = options.minMatchLen;
148 TSize d_m1 = (int) options.maxPrefixErrors;
149 TSize d_m2 = (int) options.maxSuffixErrors;
150 TSize numReads = length(readSet);
151 genomeLen = 1000000 * options.specifiedGenomeLen; // go from Mb to bp
152 if(options.anchored) genomeLen = options.maxGap + 2*options.libraryError; // upper bound
153 TSize readLen = (numReads > 0) ? length(readSet[0]) : 0;
154 TSize numErrors = static_cast<int>(readLen * options.errorRate);
155 TSize maxD = options.maxGap;
156 TSize minD = options.minGap;
157 //expected number of random deletion matches:
158 double delMatches = _expMatchesDel(readLen,M1,M2,numErrors,d_m1, d_m2, genomeLen, numReads, maxD, minD);
159 if (options.reverse) delMatches *= 2;
160
161 //expected number of random deletion matches:
162 double insMatches = 0;
163 for(TSize insLen = 1; insLen <=readLen-M1-M2; ++insLen)
164 {
165 numErrors = static_cast<int>((readLen-insLen) * options.errorRate);
166 insMatches += _expMatchesIns(readLen,M1,M2,insLen,numErrors,d_m1, d_m2, genomeLen, numReads);
167 }
168 if (options.reverse) insMatches *= 2;
169
170 ::std::cout << "Expected number of random deletion-indicating matches: " << delMatches << std::endl;
171 ::std::cout << "Expected number of random insertion-indicating matches: " << insMatches << std::endl;
172 }
173
174
175 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
176
177 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
178
179
180 // SPLIT MAPPING
181 // We build two q-gram indices, one for the prefix, the other for the suffix of the read
182
183 struct LongestPrefix{};
184 struct LongestSuffix{};
185
186 struct OrientationReverse{};
187 struct OrientationForward{};
188
189 //////////////////////////////////////////////////////////////////////////////
190 // Load anchored reads from SAM file
191 template <typename TReadSet, typename TNameSet,typename TReadRegions, typename TRazerSOptions>
loadReadsSam(TReadSet & reads,TNameSet & fastaIDs,TReadRegions & readRegions,const char * fileName,TRazerSOptions & options)192 bool loadReadsSam(
193 TReadSet &reads,
194 TNameSet &fastaIDs,
195 TReadRegions &readRegions,
196 const char *fileName,
197 TRazerSOptions &options)
198 {
199 typedef typename Value<TReadRegions>::Type TRegion;
200 //typedef typename Value<TReadSet>::Type TRead;
201 typedef typename Value<TNameSet>::Type TReadName;
202 //typedef typename Value<TRegion,2>::Type TContigPos;
203 typedef typename Value<TRegion,2>::Type TFlagPos;
204 typedef typename Value<TFlagPos,2>::Type TContigPos;
205
206 bool countN = !(options.matchN || options.outputFormat == 1 );
207 if (!empty(CharString(options.outputUnmapped))) countN = false;
208
209 BamFileIn file;
210 if (!open(file, fileName)) return false;
211
212 options.maxReadRegionsEnd = 0;
213 options.minReadRegionsStart = std::numeric_limits<int>::max();
214 TContigPos regionBegin = options.minReadRegionsStart;
215 TContigPos regionEnd = options.maxReadRegionsEnd;
216
217 //typename std::ifstream::pos_type lineStart = (*file).tellg();
218 //lineStart = lineStart - (std::ifstream::pos_type) 1;
219 // (*file).seekp(lineStart);
220
221
222 int i = 0;
223 //int lastFlag = -1;
224 //TReadName lastQname;
225 //bool lastWasAnchored = false;
226 int kickoutcount = 0;
227
228 // Read header.
229 BamHeader header;
230 readHeader(header, file);
231
232 // Read records.
233 BamAlignmentRecord record;
234 while (!atEnd(file))
235 {
236 readRecord(record, file);
237 if (record.rID == -1)
238 continue; // Skip if orphan.
239
240 // Get the query name, remove everything after the first space.
241 TReadName qname = record.qName;
242 cropAfterFirst(qname, IsWhitespace());
243
244 // Evaluate flag.
245 if (!hasFlagMultiple(record) || !hasFlagUnmapped(record))
246 continue; // Skip if not unmapped mate.
247 bool reverse = hasFlagRC(record); // if reverse this read is expected to match downstream of its mate
248 int bitFlag = 0; // has two bits
249 if (reverse) bitFlag += 2; // first bits says whether reversed (or not ->0)
250 if ((record.flag & (1 << 7)) == (1 << 7)) bitFlag +=1; // second bit says whether second mate (or first ->0)
251
252 // Read reference name. Same behaviour as for query name: Read up to
253 // the first whitespace character and skip to next tab char.
254 String<char> chrname = contigNames(context(file))[record.rID];
255 //need gnameToIdMap !!
256
257 // Get read begin position.
258 TContigPos beginPos = record.beginPos;
259
260 // Get CIGAR string.
261 String<CigarElement<> > cigar = record.cigar;
262
263 // read in sequence
264 String<Dna5Q> readSeq = record.seq;
265 SEQAN_ASSERT_GT(length(readSeq), 0u);
266 int readLength = (int)length(readSeq);
267 if (countN)
268 {
269 int count = 0;
270 int cutoffCount = (int)(options.errorRate * readLength);
271 int j = 0;
272 for (; j < readLength; ++j)
273 if (getValue(readSeq, j) == 'N')
274 if (++count > cutoffCount)
275 {
276 ++kickoutcount;
277 break;
278 }
279 if(j<readLength) continue; // read is skipped because of low quality
280 }
281
282 // and associated qualities
283 assignQualities(readSeq, record.qual);
284
285
286 // now store the information
287 if (options.readNaming == 0
288 #ifdef RAZERS_DIRECT_MAQ_MAPPING
289 || options.fastaIdQual
290 #endif
291 ) //15578976
292 appendValue(fastaIDs, qname); // append read name Fasta id
293
294
295 // if (options.trimLength > 0 && readLength > (unsigned)options.trimLength)
296 // resize(readSeq, options.trimLength);
297 if((int)length(readSeq) > options.maxReadLength)
298 options.maxReadLength = length(readSeq);
299
300 //reverseComplement(readSeq);
301 appendValue(reads, readSeq, Generous());
302
303 appendValue(readRegions,TRegion(0,TFlagPos(0,0)));
304 readRegions[i].i2.i1 = bitFlag;
305 readRegions[i].i2.i2 = (signed)beginPos;
306 if (reverse)
307 {
308 // libraryLength is outer fragment distance
309 regionBegin = _max((int)beginPos + readLength,(int)beginPos+options.libraryLength-readLength-options.libraryError);
310 regionEnd = (signed)beginPos+options.libraryLength+options.libraryError+options.maxGap;
311 // expected begin position of read
312 // readRegions[i].i2 = (signed)beginPos + options.libraryLength - readLength;
313
314 }
315 else // read should map upstream of mapped mate --> set the vorzeichen bit
316 {
317 regionBegin = _max((int)0,(int)beginPos-options.libraryLength+readLength-options.libraryError-options.maxGap);
318 regionEnd = _min((int)beginPos,(int)beginPos-options.libraryLength+2*readLength+options.libraryError);
319 // expected end position of read
320 //readRegions[i].i2 = - _min((int)beginPos,(int)beginPos-options.libraryLength+2*readLength);//+options.maxGap);
321 }
322 // readRegions[i].i1 = 0; //currently just ment for one chr at a time... need to add genomeId map
323 if(regionEnd > (TContigPos) options.maxReadRegionsEnd) options.maxReadRegionsEnd = (unsigned) regionEnd;
324 if(regionBegin < (TContigPos)options.minReadRegionsStart) options.minReadRegionsStart = (unsigned) regionBegin;
325 //lastFlag = flag;
326 //lastQname = qname;
327 ++i;
328 }
329 if (options._debugLevel > 1 && kickoutcount > 0)
330 ::std::cerr << "Ignoring " << kickoutcount << " low quality reads.\n";
331 return (i > 0);
332
333 }
334
335
336
337 //////////////////////////////////////////////////////////////////////////////
338 // Remove low quality matches # necessary to have an own splicedmatch function?
339 // planned specs: SpliceSite, General, ...
340 template < typename TMatches, typename TCounts, typename TSpec, typename TSwiftL, typename TSwiftR >
compactSplicedMatches(TMatches & matches,TCounts &,RazerSOptions<TSpec> & options,bool,TSwiftL & swiftL,TSwiftR & swiftR)341 void compactSplicedMatches(TMatches &matches,
342 TCounts & /*cnts*/,
343 RazerSOptions<TSpec> &options,
344 bool ,
345 TSwiftL &swiftL,
346 TSwiftR &swiftR)
347 {
348 typedef typename Value<TMatches>::Type TMatch;
349 typedef typename Iterator<TMatches, Standard>::Type TIterator;
350
351 unsigned readNo = -1;
352 unsigned hitCount = 0;
353 unsigned hitCountCutOff = options.maxHits;
354 int scoreDistCutOff = std::numeric_limits<int>::min();
355
356 TIterator it = begin(matches, Standard());
357 TIterator itEnd = end(matches, Standard());
358 TIterator dit = it;
359 TIterator ditBeg = it;
360
361
362 // sort
363 // ::std::sort(it, itEnd, LessSplicedErrors<TMatch>());
364 ::std::sort(it, itEnd, LessSplicedScore<TMatch>());
365 int counter = 0;
366 for (; it != itEnd; ++it)
367 {
368 ++counter;
369 if ((*it).orientation == '-') { ++it; continue; }
370 if (readNo == (*it).rseqNo)
371 {
372 if ((int)(*it).pairScore <= scoreDistCutOff)
373 {
374 ++it;
375 continue;
376 }
377 if (++hitCount >= hitCountCutOff)
378 {
379 #ifdef RAZERS_MASK_READS
380 if (hitCount == hitCountCutOff)
381 {
382 // we have enough, now look for better matches
383 int minScore = (*it).pairScore - 1;
384 if (options.purgeAmbiguous &&
385 (options.distanceRange == 0 || minScore >= options.maxReadLength - (int)options.distanceRange))
386 {
387 setMaxErrors(swiftL, readNo, -1);
388 setMaxErrors(swiftR, readNo, -1);
389 if (options._debugLevel >= 2) ::std::cerr << "(read #" << readNo << " disabled)";
390 dit = ditBeg;
391 }
392 else
393 {
394 // enough optimal read matches found, disable further search but keep maxHits+1 matches
395 if(minScore == options.maxReadLength - 1)
396 {
397 setMaxErrors(swiftL, readNo, -1);
398 setMaxErrors(swiftR, readNo, -1);
399 if (options._debugLevel >= 2) ::std::cerr << "(read #" << readNo << " disabled)";
400 }
401
402 *dit = *it; ++dit; ++it;
403 *dit = *it; ++dit;
404 continue;
405 }
406
407 }
408 #endif
409 ++it;
410 continue;
411 }
412 }
413 else
414 {
415 readNo = (*it).rseqNo;// >> 1;
416 hitCount = 0;
417 if (options.distanceRange > 0)
418 scoreDistCutOff = (*it).pairScore - options.distanceRange;
419 ditBeg = dit;
420 }
421 *dit = *it; ++dit; ++it;
422 *dit = *it; ++dit;
423 }
424 resize(matches, dit - begin(matches, Standard()));
425
426 }
427
428
429
430
431 //////////////////////////////////////////////////////////////////////////////
432 // final compacting of spliced matches
433 template < typename TMatches, typename TCounts, typename TSpec>
compactAndCountSplicedMatches(TMatches & matches,TCounts & states,RazerSOptions<TSpec> & options,bool)434 void compactAndCountSplicedMatches(TMatches &matches,
435 TCounts & states,
436 RazerSOptions<TSpec> &options,
437 bool )
438 {
439 typedef typename Value<TMatches>::Type TMatch;
440 typedef typename Iterator<TMatches, Standard>::Type TIterator;
441
442 unsigned readNo = -1;
443 unsigned hitCount = 0;
444 unsigned hitCountCutOff = options.maxHits;
445 int scoreDistCutOff = std::numeric_limits<int>::min();
446
447 clear(states);
448
449 int bestScore = 0;
450 int state = -1; // 0 = unique, 1 = multi, 2 = suboptimal
451 int numSuccesful = 0;
452
453 TIterator it = begin(matches, Standard());
454 TIterator itEnd = end(matches, Standard());
455 TIterator dit = it;
456 TIterator ditBeg = it;
457 // int lastPairId = 0;
458 // sort
459 ::std::sort(it, itEnd, LessSplicedScoreGPos<TMatch>());
460 // ::std::sort(it, itEnd, LessSplicedErrorsGPos<TMatch>());
461 int counter = 0;
462 for (; it != itEnd; ++it)
463 {
464 ++counter;
465 if ((*it).orientation == '-') { ++it; continue; }
466 if (readNo == (*it).rseqNo) // current match is either multi or suboptimal
467 {
468 if ((int)(*it).pairScore <= scoreDistCutOff)
469 {
470 ++it;
471 continue;
472 }
473 if (++hitCount >= hitCountCutOff)
474 {
475 if (hitCount == hitCountCutOff)
476 {
477 if ((int)(*it).pairScore == bestScore)
478 {
479 if(state == 0)
480 state = 1; // the match before is multi
481 }
482
483 if(options.purgeAmbiguous)
484 {
485 dit = ditBeg;
486 resize(states,numSuccesful);
487 state = -1;
488 }
489
490 }
491 ++it;
492 continue;
493 }
494 if ((int)(*it).pairScore == bestScore)
495 {
496 if(state == 0)
497 {
498 appendValue(states, 1); // the match before is multi
499 state = 1; // current one is multi too
500 }
501 else // state > 0
502 appendValue(states, state); // either current match is multi or suboptimal, same state
503 //std::cout << "state = "<< state << " for PairId = " << lastPairId << std::endl;
504 }
505 if ((int)(*it).pairScore < bestScore)
506 {
507 appendValue(states, state); // either current match is suboptimal, state before is what it was
508 //std::cout << "state = "<< state << " for PairId = " << lastPairId << std::endl;
509 state = 2; // the current one is suboptimal
510 }
511 // lastPairId = (*it).pairId;
512 }
513 else
514 {
515 if(state != -1)
516 {
517 appendValue(states, state); // append state of the match before
518 //std::cout << "state = "<< state << " for PairId = " << lastPairId << std::endl;
519 }
520 readNo = (*it).rseqNo;// >> 1;
521 hitCount = 0;
522 if (options.distanceRange > 0)
523 scoreDistCutOff = (*it).pairScore - options.distanceRange;
524 ditBeg = dit;
525 numSuccesful = (dit - begin(matches, Standard()))/2;
526 //std::cout << "numSuccesful=" << numSuccesful << std::endl;
527 bestScore = (*it).pairScore;
528 // lastPairId = (*it).pairId;
529 state = 0;
530 }
531 *dit = *it; ++dit; ++it;
532 *dit = *it; ++dit;
533
534 }
535 if(state != -1)
536 {
537 appendValue(states, state);
538 //std::cout << "state = "<< state << " for PairId = " << lastPairId << std::endl;
539 }
540
541 resize(matches, dit - begin(matches, Standard()));
542 // std::cout << "lengthmatches = " << length(matches) << " numStates = " << length(states) << std::endl;
543 }
544
545
546 template<typename TAlign, typename TPosition>
547 int
countErrorsInAlign(TAlign & align,TPosition end_)548 countErrorsInAlign(TAlign & align, TPosition end_)
549 {
550
551 //typedef typename Source<TAlign>::Type TSource;
552 //typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
553
554 typedef typename Row<TAlign>::Type TRow;
555 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
556
557 TAlignIterator ali_it0 = iter(row(align,0),0);
558 TAlignIterator ali_it1 = iter(row(align,1),0);
559 TAlignIterator ali_it0_stop = iter(row(align,0),end_);
560 TAlignIterator ali_it1_stop = iter(row(align,1),end_);
561
562
563 int errors = 0;
564 while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
565 {
566 while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
567 {
568 if(*ali_it1 != *ali_it0)
569 ++errors;
570 ++ali_it0;
571 ++ali_it1;
572 }
573 while(ali_it0!=ali_it0_stop && isGap(ali_it0))
574 {
575 ++ali_it0;
576 ++ali_it1;
577 ++errors;
578 }
579 while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
580 {
581 ++ali_it0;
582 ++ali_it1;
583 ++errors;
584 }
585 }
586 while(ali_it0!=ali_it0_stop)
587 {
588 ++ali_it0;
589 ++errors;
590 }
591 while(ali_it1 != ali_it1_stop)
592 {
593 ++ali_it1;
594 ++errors;
595 }
596
597 return errors;
598 }
599
600
601
602 //////////////////////////////////////////////////////////////////////////////
603 // Edit distance verification for longest suffix/prefix
604 template <
605 typename TMatch,
606 typename TGenome,
607 typename TReadSet,
608 typename TMyersPatterns,
609 typename TSpec,
610 typename TSufPrefSpec
611 >
612 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> inf,unsigned rseqNo,TReadSet & readSet,TMyersPatterns & forwardPatterns,RazerSOptions<TSpec> const & options,SwiftSemiGlobal,TSufPrefSpec)613 matchVerify(
614 TMatch &m, // resulting match
615 Segment<TGenome, InfixSegment> inf, // potential match genome region
616 unsigned rseqNo, // read number
617 TReadSet &readSet, // reads
618 TMyersPatterns &forwardPatterns, // MyersBitVector preprocessing data
619 RazerSOptions<TSpec> const &options,// RazerS options
620 SwiftSemiGlobal, // Edit distance
621 TSufPrefSpec) // Swift specialization
622 {
623 typedef Segment<TGenome, InfixSegment> TGenomeInfix;
624 typedef typename Value<TReadSet>::Type TRead;
625
626 // find read match end
627 typedef Finder<TGenomeInfix> TMyersFinder;
628 typedef typename Value<TMyersPatterns>::Type TMyersPattern;
629
630 // find read match begin
631 typedef ModifiedString<TGenomeInfix, ModReverse> TGenomeInfixRev;
632 typedef ModifiedString<TRead, ModReverse> TReadRev;
633 typedef Finder<TGenomeInfixRev> TMyersFinderRev;
634 typedef Pattern<TReadRev, MyersUkkonenGlobal> TMyersPatternRev;
635
636 TMyersFinder myersFinder(inf);
637 TMyersPattern &myersPattern = forwardPatterns[rseqNo]; //have to make sure this only contains the prefix
638
639 #ifdef RAZERS_DEBUG
640 ::std::cout << "Verify: " << ::std::endl;
641 ::std::cout << "Genome: " << inf << "\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
642 ::std::cout << "Read: " << readSet[rseqNo] << ::std::endl;
643 #endif
644
645 unsigned ndlLength = _min(sequenceLength(rseqNo, readSet),options.minMatchLen);
646 int maxScore = std::numeric_limits<int>::min();
647 int minScore = - maxNumSeedErrors(options,TSufPrefSpec());
648
649 TMyersFinder maxPos;
650
651 // find end of best semi-global alignment
652 while (find(myersFinder, myersPattern, minScore))
653 {
654 if (maxScore <= getScore(myersPattern))
655 {
656 maxScore = getScore(myersPattern);
657 maxPos = myersFinder;
658 }
659 }
660
661
662 if (maxScore < minScore)
663 return false;
664
665 m.editDist = (unsigned)-maxScore;
666 TGenomeInfix oriInf = inf;
667 setEndPosition(inf, m.gEnd = (beginPosition(inf) + position(maxPos) + 1));
668
669 // limit the beginning to needle length plus errors (== -maxScore)
670 if (length(inf) > ndlLength - maxScore)
671 setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore);
672
673 // find beginning of best semi-global alignment
674 TGenomeInfixRev infRev(inf);
675 TMyersFinderRev myersFinderRev(infRev);
676 TReadRev readRev;
677 TRead readInf; // Needs to be a global variable, since ModifiedString cannot hold a pointer to a temporary.
678 if(IsSameType<TSufPrefSpec,LongestSuffix>::VALUE)
679 readInf = infix(readSet[rseqNo],length(readSet[rseqNo])-options.minMatchLen,length(readSet[rseqNo]));
680 else
681 readInf = infix(readSet[rseqNo],0,options.minMatchLen);
682 setHost(readRev, readInf);
683
684 TMyersPatternRev myersPatternRev(readRev);
685
686 _patternMatchNOfPattern(myersPatternRev, options.matchN);
687 _patternMatchNOfFinder(myersPatternRev, options.matchN);
688 while (find(myersFinderRev, myersPatternRev, maxScore))
689 m.gBegin = m.gEnd - (position(myersFinderRev) + 1);
690
691 m.mScore = ndlLength;
692 m.seedEditDist = m.editDist;
693 m.gSeedLen = m.gEnd - m.gBegin;
694
695 #ifdef RAZERS_DEBUG
696 ::std::cout << " before extendMatch " << ::std::endl;
697 ::std::cout << " match: " << ::std::endl;
698 ::std::cout << " mScore= " <<m.mScore << ::std::endl;
699 ::std::cout << " gBegin= " <<m.gBegin << ::std::endl;
700 ::std::cout << " gEnd= " <<m.gEnd << ::std::endl;
701 ::std::cout << " edit= " <<m.editDist << ::std::endl;
702 #endif
703
704 //TODO: give only part of read to extension!!!
705
706 //now extend the seed
707 extendMatch(readSet,rseqNo,oriInf,m,options,TSufPrefSpec());
708
709 #ifdef RAZERS_DEBUG
710 ::std::cout << " match: " << ::std::endl;
711 ::std::cout << " mScore= " <<m.mScore << ::std::endl;
712 ::std::cout << " gBegin= " <<m.gBegin << ::std::endl;
713 ::std::cout << " gEnd= " <<m.gEnd << ::std::endl;
714 ::std::cout << " edit= " <<m.editDist << ::std::endl;
715 #endif
716
717 return true;
718 }
719
720 template<typename TOptions>
721 int
maxNumSeedErrors(TOptions & options,LongestSuffix)722 maxNumSeedErrors(TOptions &options, LongestSuffix)
723 {
724 return (int)options.maxSuffixErrors;
725 }
726
727 template<typename TOptions>
728 int
maxNumSeedErrors(TOptions & options,LongestPrefix)729 maxNumSeedErrors(TOptions &options, LongestPrefix)
730 {
731 return (int)options.maxPrefixErrors;
732 }
733
734
735
736 template<typename TReadSet, typename TSize, typename TInf, typename TMatch, typename TOptions>
737 void
extendMatch(TReadSet & readSet,TSize rseqNo,TInf & inf,TMatch & m,TOptions & options,LongestSuffix)738 extendMatch(TReadSet &readSet, TSize rseqNo, TInf & inf, TMatch &m, TOptions &options, LongestSuffix)
739 {
740 #ifdef RAZERS_DEBUG
741 ::std::cout << " extending match left" << ::std::endl;
742 #endif
743
744 // XXXIMPROV
745 // unsigned lDim0 = _max(0,(int)length(readSet[rseqNo])- 2 * (int)options.minMatchLen);
746 unsigned lDim0 = _max(0,(int)length(readSet[rseqNo])-(int)options.minMatchLen);
747 unsigned lDim1 = m.gBegin - beginPosition(inf);
748 // XXXIMPROV
749 // unsigned rDim0 = length(readSet[rseqNo])-1-options.minMatchLen;
750 unsigned rDim0 = length(readSet[rseqNo])-1;
751 unsigned rDim1 = m.gEnd - beginPosition(inf)-1;
752
753 // Seed<int,SimpleSeed> seed(lDim0, lDim1, rDim0, rDim1);
754 Seed<Simple> seed(lDim0, lDim1, rDim0+1, rDim1+1);
755 Score<int> scoreMatrix(0,-1,-1,-1);
756 int scoreDropOff = static_cast<int>((sequenceLength(rseqNo,readSet) * options.errorRate) - m.editDist);
757
758 #ifdef RAZERS_DEBUG
759 ::std::cout << "beginPos = " << beginPosition(inf) << std::endl;
760 ::std::cout << "endPos = " << endPosition(inf) << std::endl;
761 ::std::cout << " lDim0: " << lDim0 << ::std::endl;
762 ::std::cout << " lDim1: " << lDim1 << ::std::endl;
763 ::std::cout << " rDim0: " << rDim0 << ::std::endl;
764 ::std::cout << " rDim1: " << rDim1 << ::std::endl;
765 ::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
766 ::std::cout << " readInf: "<< infix(readSet[rseqNo],lDim0,rDim0+1) << ::std::endl;
767 ::std::cout << " gInfInf: "<< infix(inf,lDim1,rDim1+1) << ::std::endl;
768 ::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
769 ::std::cout << " gInf: "<< inf << ::std::endl;
770 #endif
771
772 int extScore = 0;
773 //extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, readSet[rseqNo],inf,0,GappedXDrop());
774
775 typedef typename Prefix<typename Value<TReadSet>::Type >::Type TQueryPrefix;
776 typedef typename Prefix<TInf>::Type TDatabasePrefix;
777
778 TQueryPrefix queryPrefix = prefix(readSet[rseqNo], beginPositionH(seed));
779 TDatabasePrefix databasePrefix = prefix(inf, beginPositionV(seed));
780 extScore = _extendSeedGappedXDropOneDirection(seed, databasePrefix, queryPrefix, EXTEND_LEFT, scoreMatrix, scoreDropOff);
781
782 // m.gBegin = leftDim1(seed) + beginPosition(inf);
783 // m.mScore = rightDim0(seed) - leftDim0(seed) + 1;
784
785 m.gBegin = beginPositionV(seed) + beginPosition(inf);
786 m.mScore = endPositionH(seed) - beginPositionH(seed);
787 m.editDist -= extScore;
788
789 #ifdef RAZERS_DEBUG
790 ::std::cout << " lDim0: " << beginPositionH(seed) << ::std::endl;
791 ::std::cout << " lDim1: " << beginPositionV(seed) << ::std::endl;
792 ::std::cout << " rDim0: " << endPositionH(seed) << ::std::endl;
793 ::std::cout << " rDim1: " << endPositionV(seed) << ::std::endl;
794 ::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
795 ::std::cout << " readInf: "<< infix(readSet[rseqNo],beginPositionH(seed),endPositionH(seed)+1) << ::std::endl;
796 ::std::cout << " gInfInf: "<< infix(inf,beginPositionV(seed),endPositionV(seed)+1) << ::std::endl;
797 ::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
798 ::std::cout << " gInf: "<< inf << ::std::endl;
799 #endif
800 }
801
802 template<typename TReadSet, typename TSize, typename TInf, typename TMatch, typename TOptions>
803 void
extendMatch(TReadSet & readSet,TSize rseqNo,TInf & inf,TMatch & m,TOptions & options,LongestPrefix)804 extendMatch(TReadSet &readSet, TSize rseqNo, TInf & inf, TMatch &m, TOptions &options, LongestPrefix)
805 {
806 #ifdef RAZERS_DEBUG
807 ::std::cout << " extending match right" << ::std::endl;
808 #endif
809
810 unsigned lDim0 = 0;
811 unsigned lDim1 = m.gBegin - beginPosition(inf);
812 unsigned rDim0 = _min(options.minMatchLen,length(readSet[rseqNo])) - 1;
813 unsigned rDim1 = m.gEnd - beginPosition(inf) - 1;
814 // Seed<int,SimpleSeed> seed(lDim0, lDim1, rDim0, rDim1);
815 Seed<Simple> seed(lDim0, lDim1, rDim0+1, rDim1+1);
816 Score<int> scoreMatrix(0,-1,-1,-1);
817 int scoreDropOff = static_cast<int>((sequenceLength(rseqNo,readSet) * options.errorRate) - m.editDist);
818
819 #ifdef RAZERS_DEBUG
820 ::std::cout << "beginPos = " << beginPosition(inf) << std::endl;
821 ::std::cout << "endPos = " << endPosition(inf) << std::endl;
822 ::std::cout << " lDim0: " << lDim0 << ::std::endl;
823 ::std::cout << " lDim1: " << lDim1 << ::std::endl;
824 ::std::cout << " rDim0: " << rDim0 << ::std::endl;
825 ::std::cout << " rDim1: " << rDim1 << ::std::endl;
826 ::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
827 ::std::cout << " readInf: "<< infix(readSet[rseqNo],lDim0,rDim0+1) << ::std::endl;
828 ::std::cout << " gInfInf: "<< infix(inf,lDim1,rDim1+1) << ::std::endl;
829 ::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
830 ::std::cout << " gInf: "<< inf << ::std::endl;
831 #endif
832
833 int extScore = 0;
834 // XXXIMPROV
835 // extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, prefix(readSet[rseqNo],length(readSet[rseqNo])-options.minMatchLen),inf,1,GappedXDrop());
836
837 typedef typename Suffix<typename Value<TReadSet>::Type >::Type TQuerySuffix;
838 typedef typename Suffix<TInf>::Type TDatabaseSuffix;
839
840 TQuerySuffix querySuffix = suffix(readSet[rseqNo], endPositionH(seed));
841 TDatabaseSuffix databaseSuffix = suffix(inf, endPositionV(seed));
842 extScore = _extendSeedGappedXDropOneDirection(seed, databaseSuffix, querySuffix, EXTEND_RIGHT, scoreMatrix, scoreDropOff);
843
844
845 //extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, readSet[rseqNo],inf,1,GappedXDrop());
846 //m.gEnd = rightDim1(seed) + 1 + beginPosition(inf);
847 //m.mScore = rightDim0(seed) - leftDim0(seed) + 1;
848 m.gEnd = endPositionV(seed) + beginPosition(inf);
849 m.mScore = endPositionH(seed) - beginPositionH(seed);
850 m.editDist -= extScore;
851
852 #ifdef RAZERS_DEBUG
853 ::std::cout << " lDim0: " << beginPositionH(seed) << ::std::endl;
854 ::std::cout << " lDim1: " << beginPositionV(seed) << ::std::endl;
855 ::std::cout << " rDim0: " << endPositionH(seed) << ::std::endl;
856 ::std::cout << " rDim1: " << endPositionV(seed) << ::std::endl;
857 ::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
858 ::std::cout << " readInf: "<< infix(readSet[rseqNo],beginPositionH(seed),endPositionH(seed)) << ::std::endl;
859 ::std::cout << " gInfInf: "<< infix(inf,beginPositionV(seed),endPositionV(seed)) << ::std::endl;
860 ::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
861 ::std::cout << " gInf: "<< inf << ::std::endl;
862 #endif
863 }
864
865
866
867
868 template <
869 typename TMatch,
870 typename TGenome,
871 typename TReadSet,
872 typename TPattern,
873 typename TSpec >
874 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> genomeInf,unsigned rseqNo,TReadSet & readSet,TPattern &,RazerSOptions<TSpec> const & options,SwiftSemiGlobalHamming,LongestPrefix)875 matchVerify(
876 TMatch &m, // resulting match
877 Segment<TGenome,InfixSegment> genomeInf, // potential match genome region
878 unsigned rseqNo, // read number
879 TReadSet& readSet, // original readSet
880 TPattern&,
881 RazerSOptions<TSpec> const &options, // RazerS options
882 SwiftSemiGlobalHamming, // HammingDistance
883 LongestPrefix) // LongestPrefix
884 {
885
886 typedef Segment<TGenome, InfixSegment> TGenomeInfix;
887 //typedef typename Size<TGenomeInfix>::Type TSize;
888 //typedef typename Value<TGenomeInfix>::Type TDna;
889 //typedef typename Position<TGenomeInfix>::Type TPosition;
890 typedef typename Value<TReadSet>::Type TRead;
891 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator;
892 typedef typename Infix<TRead>::Type TReadInf;
893 typedef typename Iterator<TReadInf, Standard>::Type TReadIterator;
894
895 if (length(genomeInf) < options.minMatchLen) return false;
896 TReadInf read = infix(readSet[rseqNo],0,length(readSet[rseqNo])-options.minMatchLen);
897
898
899 TReadIterator ritBeg = begin(read, Standard());
900 TReadIterator ritEnd = end(read, Standard());
901 TGenomeIterator git = begin(genomeInf, Standard());
902 TGenomeIterator gitEnd = end(genomeInf, Standard()) - (length(read) - 1);
903
904 // this is max number of errors the seed should have
905 // unsigned maxErrorsSeed = (unsigned)(options.minMatchLen * options.errorRate);
906 unsigned maxTotalErrors = (unsigned)(length(readSet[rseqNo]) * options.errorRate);
907 unsigned maxErrorsSeed = options.maxPrefixErrors;
908 if(maxErrorsSeed > maxTotalErrors) maxErrorsSeed = maxTotalErrors;
909 // unsigned minSeedErrors = maxErrorsSeed + 1;
910 unsigned minTotalErrors = maxTotalErrors + 1;
911 unsigned bestHitLength = 0;
912
913 for (; git < gitEnd; ++git)
914 {
915 bool hit = true;
916 unsigned hitLength = 0;
917 unsigned count = 0;
918 unsigned seedErrors = 0;
919 unsigned totalErrors = 0;
920 TGenomeIterator g = git;
921 for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
922 {
923 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
924 {
925 if (count < options.minMatchLen)
926 {
927 ++totalErrors;
928 if(++seedErrors > maxErrorsSeed)
929 {
930 hit = false;
931 break;
932 }
933 }
934 else
935 {
936 if(++totalErrors > maxTotalErrors)
937 {
938 // we exclude this last error position
939 --totalErrors;
940 break;
941 }
942 }
943 }
944 ++count;
945 }
946 if (hit) hitLength = count;
947 if (hitLength > bestHitLength ) //simply take the longest hit
948 {
949 // minSeedErrors = seedErrors;
950 minTotalErrors = totalErrors;
951 bestHitLength = hitLength;
952 m.gBegin = git - begin(host(genomeInf), Standard());
953 }
954 }
955
956
957
958 if(bestHitLength < options.minMatchLen)
959 return false;
960
961 m.gEnd = m.gBegin + bestHitLength;
962 m.mScore = bestHitLength;
963 m.editDist = minTotalErrors;
964
965 #ifdef RAZERS_DEBUG
966 std::cout << "m.gBeg =" << m.gBegin << "\n";
967 std::cout << "m.gEnd =" << m.gEnd << "\n";
968 std::cout << "m.edit =" << m.editDist << "\n";
969 std::cout << "m.mScore=" << m.mScore << "\n\n";
970 #endif
971
972 return true;
973
974 }
975
976
977 template <
978 typename TMatch,
979 typename TGenome,
980 typename TReadSet,
981 typename TPattern,
982 typename TSpec >
983 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> genomeInf,unsigned rseqNo,TReadSet & readSet,TPattern &,RazerSOptions<TSpec> const & options,SwiftSemiGlobalHamming,LongestSuffix)984 matchVerify(
985 TMatch &m, // resulting match
986 Segment<TGenome,InfixSegment> genomeInf, // potential match genome region
987 unsigned rseqNo, // read number
988 TReadSet & readSet, // original readSet
989 TPattern&,
990 RazerSOptions<TSpec> const &options, // RazerS options
991 SwiftSemiGlobalHamming,
992 LongestSuffix) // LongestSuffix
993 {
994
995 typedef Segment<TGenome, InfixSegment> TGenomeInfix;
996 //typedef typename Size<TGenomeInfix>::Type TSize;
997 //typedef typename Value<TGenomeInfix>::Type TDna;
998 //typedef typename Position<TGenomeInfix>::Type TPosition;
999 typedef typename Value<TReadSet>::Type TRead;
1000 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator;
1001 typedef typename Infix<TRead>::Type TReadInf;
1002 typedef typename Iterator<TReadInf, Standard>::Type TReadIterator;
1003
1004 if (length(genomeInf) < options.minMatchLen) return false;
1005 TRead read = infix(readSet[rseqNo],options.minMatchLen,length(readSet[rseqNo]));
1006
1007
1008 #ifdef RAZERS_DEBUG
1009 bool debug = true;
1010 if(debug)
1011 {
1012 ::std::cout<< "suffixmatching\n";
1013 ::std::cout << "genome=" << genomeInf << "\nread =" << read <<"\n";
1014 }
1015 #endif
1016
1017 TReadIterator ritEnd = end(read, Standard())-1;
1018 TReadIterator ritBeg = begin(read, Standard());
1019 TGenomeIterator git = end(genomeInf, Standard())-1;
1020 TGenomeIterator gitBeg = begin(genomeInf, Standard()) + options.minMatchLen;
1021
1022 // this is max number of errors the seed should have
1023 // unsigned maxErrorsSeed = (unsigned)(options.minMatchLen * options.errorRate);
1024 unsigned maxTotalErrors = (unsigned)(length(readSet[rseqNo]) * options.errorRate);
1025 unsigned maxErrorsSeed = options.maxSuffixErrors;
1026 if(maxErrorsSeed > maxTotalErrors) maxErrorsSeed = maxTotalErrors;
1027 // unsigned minSeedErrors = maxErrorsSeed + 1;
1028 unsigned minTotalErrors = maxTotalErrors + 1;
1029 unsigned bestHitLength = 0;
1030
1031 for (; git > gitBeg; --git)
1032 {
1033 bool hit = true;
1034 unsigned hitLength = 0;
1035 unsigned count = 0;
1036 unsigned seedErrors = 0;
1037 unsigned totalErrors = 0;
1038 TGenomeIterator g = git;
1039 for(TReadIterator r = ritEnd; r >= ritBeg; --r, --g)
1040 {
1041 //if(debug)::std::cout << *r << "\t" << *g << "\n";
1042 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
1043 {
1044 if (count < options.minMatchLen)
1045 {
1046 ++totalErrors;
1047 if(++seedErrors > maxErrorsSeed)
1048 {
1049 hit = false;
1050 break;
1051 }
1052 }
1053 else
1054 {
1055 if(++totalErrors > maxTotalErrors)
1056 {
1057 // we exclude this last error position
1058 --totalErrors;
1059 break;
1060 }
1061 }
1062 }
1063 ++count;
1064 }
1065 if (hit) hitLength = count;
1066 if (hitLength > bestHitLength ) //simply take the longest hit
1067 {
1068 // minSeedErrors = seedErrors;
1069 minTotalErrors = totalErrors;
1070 bestHitLength = hitLength;
1071 m.gEnd = git - begin(host(genomeInf), Standard()) + 1;
1072
1073 }
1074 }
1075
1076
1077 if(bestHitLength < options.minMatchLen)
1078 return false;
1079
1080
1081 m.gBegin = m.gEnd - bestHitLength;
1082 m.mScore = bestHitLength;
1083 m.editDist = minTotalErrors;
1084
1085 #ifdef RAZERS_DEBUG
1086 std::cout << "m.gBeg =" << m.gBegin << "\n";
1087 std::cout << "m.gEnd =" << m.gEnd << "\n";
1088 std::cout << "m.edit =" << m.editDist << "\n";
1089 std::cout << "m.mScore=" << m.mScore << "\n\n";
1090 #endif
1091
1092 return true;
1093
1094 }
1095
1096
1097 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1098
1099
1100
1101 // Edit distance match combination forward
1102 template <typename TScore>
1103 bool
findBestSplitPosition(String<Pair<TScore,int>> & maxColsL,String<Pair<TScore,int>> & maxColsR,int & rowPosL1,int rowPosL2,int & rowPosR1,int rowPosR2,int seq0Len,int & traceExt,OrientationForward,SwiftSemiGlobal)1104 findBestSplitPosition(String<Pair<TScore,int> > & maxColsL,
1105 String<Pair<TScore,int> > & maxColsR,
1106 int & rowPosL1,
1107 int rowPosL2,
1108 int & rowPosR1,
1109 int rowPosR2,
1110 int seq0Len,
1111 int & traceExt,
1112 OrientationForward,
1113 SwiftSemiGlobal)
1114 {
1115
1116 #ifdef RAZERS_DEBUG
1117 ::std::cout << "findBestSplitEditForward\n";
1118 #endif
1119
1120 TScore maxSum = std::numeric_limits<TScore>::min();
1121 int bestL = rowPosL1;
1122 int bestR = rowPosR1;
1123 int bestTraceExtL = rowPosL1;
1124 int bestTraceExtR = rowPosL1;
1125 while (rowPosL1 <= rowPosL2 && rowPosR1 >= rowPosR2)
1126 {
1127 // this is to prevent same bases from being used in both prefix and suffix match
1128 // this works, because we store the FIRST bestScore in each row
1129 if (!(maxColsL[rowPosL1].i2 + maxColsR[rowPosR1].i2 <= seq0Len))
1130 {
1131 ++rowPosL1;
1132 --rowPosR1;
1133 continue;
1134 }
1135 if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 > maxSum)
1136 {
1137 maxSum = maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1;
1138 bestL = rowPosL1;
1139 bestR = rowPosR1;
1140 bestTraceExtL = rowPosL1;
1141 }
1142 else if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 == maxSum)
1143 bestTraceExtR = rowPosL1;
1144
1145 ++rowPosL1;
1146 --rowPosR1;
1147 }
1148 traceExt = bestTraceExtR - bestTraceExtL;
1149 rowPosL1 = bestL;
1150 rowPosR1 = bestR;
1151
1152 return true;
1153 }
1154
1155 // ---------------------------------------------------------------------------
1156 // BEGIN BREAKPOINT COMPUTATION CODE FROM ALIGN MODULE
1157 // ---------------------------------------------------------------------------
1158
1159 // TODO(holtgrew): These functions also has to be copied to msplazers.
1160 // TODO(holtgrew): This breakpoint computation code is basically reusable, but needs some love (i.e. documentation, demos, interfaces)
1161
1162 template <typename TAlign, typename TStringSet, typename TTrace, typename TValPair, typename TIndexPair, typename TDiagonal>
1163 inline void
_alignBandedNeedlemanWunschTrace(TAlign & align,TStringSet const & str,TTrace const & trace,TValPair const & overallMaxValue,TIndexPair const & overallMaxIndex,TDiagonal const diagL,TDiagonal const diagU)1164 _alignBandedNeedlemanWunschTrace(TAlign& align,
1165 TStringSet const& str,
1166 TTrace const& trace,
1167 TValPair const& overallMaxValue,
1168 TIndexPair const& overallMaxIndex,
1169 TDiagonal const diagL,
1170 TDiagonal const diagU)
1171 {
1172 typedef typename Value<TStringSet>::Type TString;
1173 typedef typename Id<TStringSet>::Type TId;
1174 typedef typename Size<TTrace>::Type TSize;
1175 typedef typename Value<TTrace>::Type TTraceValue;
1176
1177 // Traceback values
1178 TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2;
1179
1180 // Initialization
1181 TString const& str1 = str[0];
1182 TString const& str2 = str[1];
1183 TId id1 = positionToId(const_cast<TStringSet&>(str), 0);
1184 TId id2 = positionToId(const_cast<TStringSet&>(str), 1);
1185 TSize len1 = length(str1) + 1;
1186 TSize len2 = length(str2) + 1;
1187 TSize lo_row = (diagU <= 0) ? -1 * diagU : 0;
1188 TSize diagonalWidth = (TSize) (diagU - diagL + 1);
1189
1190 //// Debug stuff
1191 //TColumn originalMat;
1192 //resize(originalMat, len1 * len2);
1193 //TSize count = 0;
1194 //for(TSize i=0; i<len2; ++i) {
1195 // for(TSize j=0; j<len1; ++j) {
1196 // value(originalMat, i * len1 + j) = count;
1197 // std::cout << count << ',';
1198 // ++count;
1199 // }
1200 // std::cout << std::endl;
1201 //}
1202 //std::cout << std::endl;
1203
1204 // Start the trace from the cell with the max value
1205 TSize row = (overallMaxValue[0] > overallMaxValue[1]) ? overallMaxIndex[0] : overallMaxIndex[2];
1206 TSize col = (overallMaxValue[0] > overallMaxValue[1]) ? overallMaxIndex[1] : overallMaxIndex[3];
1207
1208 // Handle the skipped sequence parts
1209 TSize actualRow = row + lo_row;
1210 TSize actualCol = col + diagL + actualRow;
1211 if (actualCol + 1 < len1) _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, (len1 - (actualCol + 1)), Horizontal);
1212 if (actualRow + 1 < len2) _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, (len2 - (actualRow + 1)), Vertical);
1213
1214 if ((actualRow != 0) && (actualCol != 0)) {
1215 // Find initial direction
1216 TTraceValue tv = trace[row * diagonalWidth + col];
1217 if (tv == Horizontal) --col;
1218 else if (tv == Vertical) {--row; ++col;}
1219 else --row;
1220
1221 // Walk until we hit a border
1222 TSize seqLen = 1;
1223 TTraceValue newTv = tv;
1224 while(true) {
1225 actualRow = row + lo_row;
1226 actualCol = col + diagL + actualRow;
1227 newTv = trace[row * diagonalWidth + col];
1228
1229 // Check if we hit a border
1230 if ((actualRow == 0) || (actualCol == 0)) break;
1231 else {
1232 //std::cout << row << ',' << col << ':' << value(originalMat, actualRow * len1 + actualCol) << std::endl;
1233 if (tv == Diagonal) {
1234 if (newTv == Horizontal) {
1235 _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1236 --col; seqLen = 1;
1237 } else if (newTv == Vertical) {
1238 _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1239 --row; ++col; seqLen = 1;
1240 } else {
1241 --row; ++seqLen;
1242 }
1243 } else {
1244 if (tv == Horizontal) {
1245 if (newTv == Diagonal) {
1246 _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1247 --row; seqLen = 1;
1248 } else if (newTv == Vertical) {
1249 _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1250 --row; ++col; seqLen = 1;
1251 } else {
1252 --col; ++seqLen;
1253 }
1254 } else {
1255 if (newTv == Diagonal) {
1256 _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1257 --row; seqLen = 1;
1258 } else if (newTv == Horizontal) {
1259 _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1260 --col; seqLen = 1;
1261 } else {
1262 --row; ++col; ++seqLen;
1263 }
1264 }
1265 }
1266 tv = newTv;
1267 }
1268 }
1269
1270 // Align left overs
1271 if (seqLen) _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1272 }
1273
1274 // Handle the remaining sequence
1275 if (actualCol != 0) _alignTracePrint(align, str[0], str[1], (TId) id1, (TSize) 0, (TId) 0, (TSize) 0, (TSize) actualCol, Horizontal);
1276 else if (actualRow != 0) _alignTracePrint(align, str[0], str[1], (TId) 0, (TSize) 0, (TId) id2, (TSize) 0, (TSize) actualRow, Vertical);
1277 }
1278
1279 template <typename TTrace, typename TStringSet, typename TScore, typename TValPair, typename TIndexPair, typename TDiagonal, typename TAlignConfig>
1280 inline typename Value<TScore>::Type
_alignBandedNeedlemanWunsch(TTrace & trace,TStringSet const & str,TScore const & sc,TValPair & overallMaxValue,TIndexPair & overallMaxIndex,TDiagonal diagL,TDiagonal diagU,TAlignConfig const,String<Pair<typename Value<TScore>::Type,int>> & maxCols,unsigned minColNum)1281 _alignBandedNeedlemanWunsch(TTrace& trace,
1282 TStringSet const& str,
1283 TScore const & sc,
1284 TValPair& overallMaxValue,
1285 TIndexPair& overallMaxIndex,
1286 TDiagonal diagL,
1287 TDiagonal diagU,
1288 TAlignConfig const,
1289 String<Pair<typename Value<TScore>::Type,int> > & maxCols,
1290 unsigned minColNum)
1291 {
1292 typedef typename Value<TTrace>::Type TTraceValue;
1293 typedef typename Value<TScore>::Type TScoreValue;
1294 typedef typename Value<TStringSet>::Type TString;
1295 typedef typename Size<TTrace>::Type TSize;
1296
1297 // ::std::cout << "hierrrrrrrrrr banded ali\n";
1298 // Initialization
1299 TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2;
1300 TString const& str1 = str[0];
1301 TString const& str2 = str[1];
1302 TSize len1 = length(str1) + 1;
1303 TSize len2 = length(str2) + 1;
1304 // ::std::cout << "len1="<<len1 << " len2=" << len2 << ::std::endl;
1305 TSize diagonalWidth = (TSize) (diagU - diagL + 1);
1306 TSize hi_diag = diagonalWidth;
1307 TSize lo_diag = 0;
1308 if (diagL > 0) lo_diag = 0;
1309 else lo_diag = (diagU < 0) ? hi_diag : (TSize) (1 - diagL);
1310 TSize lo_row = (diagU <= 0) ? -diagU : 0;
1311 TSize hi_row = len2;
1312 if (len1 - diagL < hi_row) hi_row = len1 - diagL;
1313 TSize height = hi_row - lo_row;
1314
1315 clear(maxCols);
1316
1317 typedef String<TScoreValue> TRow;
1318 TRow mat;
1319 resize(mat, diagonalWidth);
1320 resize(trace, height * diagonalWidth);
1321 // ::std::cout <<height << "<-hieght\n";
1322 overallMaxValue[0] = std::numeric_limits<TScoreValue>::min();
1323 overallMaxValue[1] = std::numeric_limits<TScoreValue>::min();
1324 overallMaxIndex[0] = diagonalWidth; overallMaxIndex[1] = height;
1325 overallMaxIndex[2] = diagonalWidth; overallMaxIndex[3] = height;
1326
1327 //// Debug stuff
1328 //String<TScoreValue> originalMat;
1329 //resize(originalMat, len1 * len2);
1330 //TSize count = 0;
1331 //for(TSize i=0; i<len2; ++i) {
1332 // for(TSize j=0; j<len1; ++j) {
1333 // value(originalMat, i * len1 + j) = count;
1334 // std::cerr << count << ',';
1335 // ++count;
1336 // }
1337 // std::cerr << std::endl;
1338 //}
1339 //std::cerr << std::endl;
1340
1341 // Classical DP with affine gap costs
1342 typedef typename Iterator<TRow, Standard>::Type TRowIter;
1343 typedef typename Iterator<TTrace, Standard>::Type TTraceIter;
1344 TSize actualCol = 0;
1345 TSize actualRow = 0;
1346 TScoreValue verti_val = 0;
1347 TScoreValue hori_val = 0;
1348 for(TSize row = 0; row < height; ++row) {
1349 TScoreValue maxRowVal = std::numeric_limits<TScoreValue>::min();
1350 unsigned maxRowCol = 0;
1351 actualRow = row + lo_row;
1352 if (lo_diag > 0) --lo_diag;
1353 if (row + lo_row >= len1 - diagU) --hi_diag;
1354 TTraceIter traceIt = begin(trace, Standard()) + row * diagonalWidth + lo_diag;
1355 TRowIter matIt = begin(mat, Standard()) + lo_diag;
1356 hori_val = std::numeric_limits<TScoreValue>::min();
1357 for(TSize col = lo_diag; col<hi_diag; ++col, ++matIt, ++traceIt) {
1358 actualCol = col + diagL + actualRow;
1359 //std::cerr << row << ',' << col << ':' << value(originalMat, actualRow * len1 + actualCol) << std::endl;
1360
1361 if ((actualRow != 0) && (actualCol != 0)) {
1362 // Get the new maximum for mat
1363 *matIt += score(const_cast<TScore&>(sc), sequenceEntryForScore(const_cast<TScore&>(sc), str1, ((int) actualCol - 1)),
1364 sequenceEntryForScore(const_cast<TScore&>(sc), str2, ((int) actualRow - 1)));
1365 *traceIt = Diagonal;
1366 if ((verti_val = (col < diagonalWidth - 1) ? *(matIt+1) +
1367 scoreGapExtendVertical(sc, sequenceEntryForScore(sc, str1, ((int) actualCol - 1)),
1368 sequenceEntryForScore(sc, str2, ((int) actualRow - 1))) : std::numeric_limits<TScoreValue>::min()) > *matIt)
1369 {
1370 *matIt = verti_val;
1371 *traceIt = Vertical;
1372 }
1373 if ((hori_val = (col > 0) ? hori_val +
1374 scoreGapExtendHorizontal(sc, sequenceEntryForScore(sc, str1, ((int) actualCol - 1)),
1375 sequenceEntryForScore(sc, str2, ((int) actualRow - 1))) : std::numeric_limits<TScoreValue>::min()) > *matIt)
1376 {
1377 *matIt = hori_val;
1378 *traceIt = Horizontal;
1379 }
1380 hori_val = *matIt;
1381 } else {
1382 // Usual initialization for first row and column
1383 if (actualRow == 0)
1384 _initFirstRow(TAlignConfig(), *matIt, (TScoreValue) actualCol *
1385 scoreGapExtendHorizontal(sc, sequenceEntryForScore(sc, str1, std::max(0,((int) actualCol - 1))),
1386 sequenceEntryForScore(sc, str2, 0)));
1387 else {
1388 _initFirstColumn(TAlignConfig(), *matIt, (TScoreValue) actualRow *
1389 scoreGapExtendVertical(sc, sequenceEntryForScore(sc, str1, 0),
1390 sequenceEntryForScore(sc, str2, std::max(0,((int) actualRow - 1)))));
1391 hori_val = *matIt;
1392 }
1393 }
1394 if(*matIt > maxRowVal && actualCol >= minColNum)
1395 {
1396 maxRowVal = *matIt;
1397 maxRowCol = actualCol;
1398 }
1399 // Store the maximum
1400 if (actualCol == len1 - 1) _lastColumn(TAlignConfig(), overallMaxValue, overallMaxIndex, *matIt, row, col);
1401 if (actualRow == len2 - 1) _lastRow(TAlignConfig(), overallMaxValue, overallMaxIndex, *matIt, row, col);
1402 //std::cerr << row << ',' << col << ':' << *matIt << std::endl;
1403 }
1404 appendValue(maxCols,Pair<typename Value<TScore>::Type,int>(maxRowVal,maxRowCol));
1405 // std::cout << maxRowVal << ","<<maxRowCol << std::endl;
1406 }
1407 return (overallMaxValue[0] > overallMaxValue[1]) ? overallMaxValue[0] : overallMaxValue[1];
1408 }
1409
1410 template<typename TAlign, typename TStringSet, typename TScore, typename TAlignConfig, typename TDiagonal>
1411 inline typename Value<TScore>::Type
_globalAlignment(TAlign & align,TStringSet const & str,TScore const & sc,TAlignConfig const,TDiagonal diag1,TDiagonal diag2,String<Pair<typename Value<TScore>::Type,int>> & maxCols,unsigned minColNum,NeedlemanWunsch)1412 _globalAlignment(TAlign& align,
1413 TStringSet const& str,
1414 TScore const& sc,
1415 TAlignConfig const,
1416 TDiagonal diag1,
1417 TDiagonal diag2,
1418 String<Pair<typename Value<TScore>::Type,int> > &maxCols,
1419 unsigned minColNum,
1420 NeedlemanWunsch)
1421 {
1422 typedef typename Value<TScore>::Type TScoreValue;
1423 typedef typename Size<TStringSet>::Type TSize;
1424
1425 // Maximum value
1426 TScoreValue overallMaxValue[2];
1427 TSize overallMaxIndex[4];
1428
1429 // Create the trace
1430 String<TraceBack> trace;
1431 TScoreValue maxScore = _alignBandedNeedlemanWunsch(trace, str, sc, overallMaxValue, overallMaxIndex, (int) diag1, (int) diag2, TAlignConfig(),maxCols, minColNum);
1432
1433 // Follow the trace and create the graph
1434 _alignBandedNeedlemanWunschTrace(align, str, trace, overallMaxValue, overallMaxIndex, (int) diag1, (int) diag2);
1435
1436 return maxScore;
1437 }
1438
1439 template<typename TStringSet, typename TScore, typename TAlignConfig, typename TDiagonal>
1440 inline typename Value<TScore>::Type
_globalAlignment(TStringSet const & str,TScore const & sc,TAlignConfig const,TDiagonal diag1,TDiagonal diag2,String<Pair<typename Value<TScore>::Type,int>> & maxCols,unsigned minColNum,NeedlemanWunsch)1441 _globalAlignment(TStringSet const& str,
1442 TScore const& sc,
1443 TAlignConfig const,
1444 TDiagonal diag1,
1445 TDiagonal diag2,
1446 String<Pair<typename Value<TScore>::Type,int> > &maxCols,
1447 unsigned minColNum,
1448 NeedlemanWunsch)
1449 {
1450 typedef typename Value<TScore>::Type TScoreValue;
1451 typedef typename Size<TStringSet>::Type TSize;
1452
1453 // Maximum value
1454 TScoreValue overallMaxValue[2];
1455 TSize overallMaxIndex[4];
1456
1457 // Calculate the score
1458 String<TraceBack> trace;
1459 return _alignBandedNeedlemanWunsch(trace, str, sc, overallMaxValue, overallMaxIndex, (int) diag1, (int) diag2, TAlignConfig(),maxCols,minColNum);
1460 }
1461
1462 // ---------------------------------------------------------------------------
1463 // END BREAKPOINT COMPUTATION CODE FROM ALIGN MODULE
1464 // ---------------------------------------------------------------------------
1465
1466 // Edit distance match combination reverse
1467 template <typename TScore>
1468 bool
findBestSplitPosition(String<Pair<TScore,int>> & maxColsL,String<Pair<TScore,int>> & maxColsR,int & rowPosL1,int rowPosL2,int & rowPosR1,int rowPosR2,int seq0Len,int & traceExt,OrientationReverse,SwiftSemiGlobal)1469 findBestSplitPosition(String<Pair<TScore,int> > & maxColsL,
1470 String<Pair<TScore,int> > & maxColsR,
1471 int & rowPosL1,
1472 int rowPosL2,
1473 int & rowPosR1,
1474 int rowPosR2,
1475 int seq0Len,
1476 int & traceExt,
1477 OrientationReverse,
1478 SwiftSemiGlobal)
1479 {
1480 #ifdef RAZERS_DEBUG
1481 ::std::cout << "findBestSplitEditReverse\n";
1482 #endif
1483
1484 TScore maxSum = std::numeric_limits<TScore>::min();
1485 int bestL = rowPosL2;
1486 int bestR = rowPosR2;
1487 int bestTraceExtR = rowPosL1;
1488 int bestTraceExtL = rowPosL1;
1489
1490 while (rowPosL1 <= rowPosL2 && rowPosR1 >= rowPosR2)
1491 {
1492 if(!(maxColsL[rowPosL1].i2 + maxColsR[rowPosR1].i2 <= seq0Len))
1493 {
1494 ++rowPosL1;
1495 --rowPosR1;
1496 continue;
1497 }
1498 if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 >= maxSum)
1499 {
1500 maxSum = maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1;
1501 bestL = rowPosL1;
1502 bestR = rowPosR1;
1503 bestTraceExtR = rowPosL1;
1504 if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 > maxSum) bestTraceExtL = rowPosL1;
1505 }
1506 ++rowPosL1;
1507 --rowPosR1;
1508 }
1509 rowPosL1 = bestL;
1510 rowPosR1 = bestR;
1511 traceExt = bestTraceExtR - bestTraceExtL;
1512 return true;
1513 }
1514
1515
1516
1517
1518
1519 // Edit distance match combination wrapper
1520 template <typename TMatch, typename TRead, typename TGenome, typename TSpec>
1521 bool
combineLeftRight(TMatch & mR,TMatch & mL,TRead & read,TGenome & genome,RazerSOptions<TSpec> & options,char orientation,SwiftSemiGlobal)1522 combineLeftRight(TMatch & mR,
1523 TMatch & mL,
1524 TRead & read,
1525 TGenome & genome,
1526 RazerSOptions<TSpec> &options,
1527 char orientation,
1528 SwiftSemiGlobal)
1529 {
1530
1531 #ifdef RAZERS_DEBUG
1532 ::std::cout << "combinLeftRightEdit\n";
1533 #endif
1534 Score<int> scoreType(0,-1,-1,-1);
1535 typedef typename Infix<TGenome>::Type TGenomeInf;
1536 typedef ModifiedString<TGenomeInf,ModReverse> TGenomeInfRev;
1537
1538 TGenomeInf genomeInfL = infix(genome, mL.gBegin+mL.gSeedLen, mL.gEnd);
1539 TGenomeInf readInfL = infix(read,options.minMatchLen,mL.mScore);
1540 TGenomeInfRev genomeInfR(infix(genome, mR.gBegin, mR.gEnd-mR.gSeedLen));
1541 TGenomeInfRev readInfR(infix(read,length(read)-mR.mScore,length(read)-options.minMatchLen));
1542
1543 #ifdef RAZERS_DEBUG
1544 bool debug = true;
1545 std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1546 std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1547 std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1548 std::cout << "mL.editDist =" << mL.editDist << "\n";
1549
1550 std::cout << "mR.mScore =" << mR.mScore << "\n";
1551 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1552 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1553 std::cout << "mR.editDist =" << mR.editDist << "\n";
1554 #endif
1555
1556 int readLength = length(read);
1557 unsigned maxErrors = static_cast<unsigned>(readLength * options.errorRate);
1558
1559 // neither insertion nor deletion
1560 if(mR.gEnd - mL.gBegin == readLength)
1561 {
1562 #ifdef RAZERS_DEBUG
1563 ::std::cout << "normal\n";
1564 #endif
1565 if(true)return false;
1566 unsigned halfReadLen = readLength >> 1;
1567
1568 Align<String<Dna5>, ArrayGaps> align;
1569 assignSource(row(align, 0), infix(read,0,readLength));
1570 assignSource(row(align, 1), infix(genome, mL.gBegin, mR.gEnd));
1571 int sc = globalAlignment(align, scoreType, AlignConfig<false,false,false,false>(), NeedlemanWunsch());
1572 if(-sc > (int)maxErrors) return false;
1573
1574 mL.gEnd = mL.gBegin + toSourcePosition(row(align, 1),toViewPosition(row(align, 0), halfReadLen-1));
1575 mR.gBegin = mL.gEnd;
1576 mL.mScore = halfReadLen;
1577 mR.mScore = readLength - mL.mScore;
1578 mL.editDist = countErrorsInAlign(align,toViewPosition(row(align, 0), halfReadLen));
1579 mR.editDist = -sc-mL.editDist;
1580
1581 #ifdef RAZERS_DEBUG
1582 std::cout << "mL.mScore =" << mL.mScore << "\n";
1583 std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1584 std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1585 std::cout << "mL.editDist =" << mL.editDist << "\n";
1586
1587 std::cout << "mR.mScore =" << mR.mScore << "\n";
1588 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1589 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1590 std::cout << "mR.editDist =" << mR.editDist << "\n";
1591 #endif
1592
1593 }
1594
1595 //potential insertion // for edit distance this is a heuristic
1596 if(mR.gEnd - mL.gBegin < readLength)
1597 {
1598 #ifdef RAZERS_DEBUG
1599 ::std::cout << "insertion\n";
1600 #endif
1601 if(mR.gEnd - mL.gBegin < static_cast<long int>(2*options.minMatchLen)) //too close together // actually minus allowed seed errors
1602 return false;
1603
1604 if(mL.gEnd < mR.gBegin) //prefix and suffix match do not meet
1605 return false;
1606
1607 if(mR.mScore + mL.mScore == mR.gEnd - mL.gBegin)
1608 if(mL.gEnd == mR.gBegin)
1609 if(mR.editDist + mL.editDist <= maxErrors ) //prefix and suffix match meet and do not overlap --> perfect
1610 return true;
1611
1612 int diag1L = -static_cast<int>(maxErrors) + mL.seedEditDist;
1613 int diag2L = maxErrors - mL.seedEditDist;
1614 int diag1R = -static_cast<int>(maxErrors) + mR.seedEditDist;
1615 int diag2R = maxErrors - mR.seedEditDist;
1616 int minColNum = 0;
1617
1618 // genomeInf is the shorter sequence --> find best split position on genome
1619 // rows in alignment matrix represent genome position
1620 StringSet<TGenomeInf,Dependent<> > strSetL;
1621 appendValue(strSetL,readInfL);
1622 appendValue(strSetL,genomeInfL);
1623 Graph<Alignment<StringSet<TGenomeInf,Dependent<> >, void> > alignL(strSetL);
1624 String<Pair<int,int> > maxColsL;
1625 _globalAlignment(alignL,strSetL,scoreType,AlignConfig<false,false,false,false>(),diag1L,diag2L,maxColsL,minColNum,NeedlemanWunsch());
1626
1627 StringSet<TGenomeInfRev,Dependent<> > strSetR;
1628 appendValue(strSetR,readInfR);
1629 appendValue(strSetR,genomeInfR);
1630 Graph<Alignment<StringSet<TGenomeInfRev,Dependent<> >, void > > alignR(strSetR);
1631 String<Pair<int,int> > maxColsR;
1632 _globalAlignment(alignR,strSetR,scoreType,AlignConfig<false,false,false,false>(),diag1R,diag2R,maxColsR,minColNum,NeedlemanWunsch());
1633
1634 //::std::cout << alignL;
1635 //::std::cout << alignR;
1636
1637 // our begin and start positions are defined by the read positions
1638 // go from read source to view position to genome source position
1639 int rowPosL1 = 0;
1640 if (mL.gSeedLen < (int)mR.gBegin - mL.gBegin) rowPosL1 = mR.gBegin - mL.gBegin - mL.gSeedLen;//or from first possible overlap pos
1641 int rowPosR2 = 0;
1642 if (mR.gSeedLen < (int)mR.gEnd - mL.gEnd) rowPosR2 = mR.gEnd - mL.gEnd - mR.gSeedLen;
1643
1644 int rowPosL2 = mR.gEnd - mR.gSeedLen - rowPosR2 - mL.gBegin - mL.gSeedLen;
1645 int rowPosR1 = mR.gEnd - mR.gSeedLen - rowPosL1 - mL.gBegin - mL.gSeedLen;
1646
1647
1648 #ifdef RAZERS_DEBUG
1649 ::std::cout << "vorher\nrowPosL1=" << rowPosL1 << ::std::endl;
1650 ::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1651 ::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1652 ::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1653 #endif
1654
1655 // compute best split position
1656 int seq0Len = readLength - 2*options.minMatchLen;
1657 int traceExt = 0;
1658 if(orientation == 'R')
1659 findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,traceExt,OrientationReverse(),SwiftSemiGlobal());
1660 else
1661 findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,traceExt,OrientationForward(),SwiftSemiGlobal());
1662
1663 #ifdef RAZERS_DEBUG
1664 ::std::cout << "nachher\nrowPosL1=" << rowPosL1 << ::std::endl;
1665 ::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1666 ::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1667 ::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1668 ::std::cout << "mR.editDist=" << mR.editDist << ::std::endl;
1669 ::std::cout << "mL.editDist=" << mL.editDist << ::std::endl;
1670 ::std::cout << "maxErros=" << maxErrors << ::std::endl;
1671 #endif
1672 mL.editDist = mR.seedEditDist - maxColsL[rowPosL1].i1; //scores are negative
1673 mR.editDist = mL.seedEditDist - maxColsR[rowPosR1].i1;
1674 if(mR.editDist + mL.editDist > maxErrors )
1675 return false;
1676
1677 mR.traceExtension = mL.traceExtension = traceExt;
1678
1679 // best split position in genome is saved in rowPosL1 (and rowPosR1)
1680 mL.mScore = options.minMatchLen + maxColsL[rowPosL1].i2;
1681 mL.gEnd = mL.gBegin + mL.gSeedLen + rowPosL1; //read position of best genomic split
1682
1683 mR.mScore = options.minMatchLen + maxColsR[rowPosR1].i2;
1684 mR.gBegin = mR.gEnd - mR.gSeedLen - rowPosR1; //read position of best genomic split
1685
1686 #ifdef RAZERS_DEBUG
1687 if(mL.editDist > 50 || mR.mScore < options.minMatchLen || mL.mScore < options.minMatchLen)
1688 {
1689 ::std::cout << "-maxColsL[rowPosL1].i1=" << -maxColsL[rowPosL1].i1 << " -maxColsL[rowPosL1].i2=" << -maxColsL[rowPosL1].i2 << " rowPosL1="<<rowPosL1;
1690 ::std::cout << " maxColsLLen="<< length(maxColsL) << ::std::endl;
1691 ::std::cout << "-maxColsR[rowPosR1].i1=" << -maxColsR[rowPosR1].i1 << "-maxColsR[rowPosR1].i2=" << -maxColsR[rowPosR1].i2 << " rowPosR1="<<rowPosR1;
1692 ::std::cout << " maxColsRLen="<< length(maxColsR) << ::std::endl;
1693 }
1694 #endif
1695 }
1696
1697 //potential deletion
1698 if(mR.gEnd - mL.gBegin > readLength)
1699 {
1700 #ifdef RAZERS_DEBUG
1701 ::std::cout << "deletion\n";
1702 #endif
1703 if(mR.mScore + mL.mScore < readLength)
1704 return false;
1705 if(mR.mScore + mL.mScore == readLength && mR.editDist + mL.editDist > maxErrors) //the prefix and suffix match meet, but too many errors
1706 return false;
1707
1708 int diag1L = -static_cast<int>(maxErrors) + mL.seedEditDist;
1709 int diag2L = maxErrors - mL.seedEditDist;
1710 int diag1R = -static_cast<int>(maxErrors) + mR.seedEditDist;
1711 int diag2R = maxErrors - mR.seedEditDist;
1712 int minColNum = 0;
1713
1714 // readInf is the shorter sequence --> find best split position on read
1715 // rows in alignment matrix represent read position
1716 StringSet<TGenomeInf> strL;
1717 appendValue(strL,genomeInfL);
1718 appendValue(strL,readInfL);
1719 String<Pair<int,int> > maxColsL;
1720 _globalAlignment(strL,scoreType,AlignConfig<false,false,false,false>(),diag1L,diag2L,maxColsL,minColNum,NeedlemanWunsch());
1721
1722 StringSet<TGenomeInfRev> strR;
1723 appendValue(strR,genomeInfR);
1724 appendValue(strR,readInfR);
1725 String<Pair<int,int> > maxColsR;
1726
1727 _globalAlignment(strR,scoreType,AlignConfig<false,false,false,false>(),diag1R,diag2R,maxColsR,minColNum,NeedlemanWunsch());
1728
1729 int rowPosL1 = ((int)options.minMatchLen > readLength-mR.mScore) ? (int)0 : readLength-mR.mScore-options.minMatchLen;
1730 int rowPosL2 = (readLength-(int)options.minMatchLen < mL.mScore) ? readLength-(int)2*options.minMatchLen : mL.mScore - options.minMatchLen;
1731 int rowPosR1 = (int)readLength - 2*options.minMatchLen - rowPosL1;
1732 int rowPosR2 = (int)readLength - 2*options.minMatchLen - rowPosL2;
1733
1734 #ifdef RAZERS_DEBUG
1735 ::std::cout << "rowPosL1=" << rowPosL1 << ::std::endl;
1736 ::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1737 ::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1738 ::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1739
1740 std::cout << "before split:\nmL.mScore =" << mL.mScore << "\n";
1741 std::cout << "mL.gBegin =" << length(genome)-mL.gBegin << "\n";
1742 std::cout << "mL.gEnd =" << length(genome)-mL.gEnd << "\n";
1743 std::cout << "mL.editDist =" << mL.editDist << "\n";
1744
1745 std::cout << "mR.mScore =" << mR.mScore << "\n";
1746 std::cout << "mR.gBegin =" << length(genome)-mR.gBegin << "\n";
1747 std::cout << "mR.gEnd =" << length(genome)-mR.gEnd << "\n";
1748 std::cout << "mR.editDist =" << mR.editDist << "\n";
1749 #endif
1750
1751 // compute best split position
1752 int seq0Len = mR.gEnd - mL.gBegin;
1753 int traceExt = 0;
1754 if(orientation == 'R')
1755 findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,traceExt,OrientationReverse(),SwiftSemiGlobal());
1756 else
1757 findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,traceExt,OrientationForward(),SwiftSemiGlobal());
1758
1759
1760 // best split position in read is saved in rowPosL1 (and rowPosR1)
1761 mL.editDist = mL.seedEditDist - maxColsL[rowPosL1].i1;
1762 mR.editDist = mR.seedEditDist - maxColsR[rowPosR1].i1;
1763
1764
1765 if(mR.editDist + mL.editDist > maxErrors)
1766 return false;
1767
1768 mR.mScore = options.minMatchLen + rowPosR1;
1769 mR.gBegin = mR.gEnd - maxColsR[rowPosR1].i2 - mR.gSeedLen; //genomic position of best read split
1770 mL.mScore = options.minMatchLen + rowPosL1;
1771 mR.traceExtension = mL.traceExtension = traceExt;
1772
1773 mL.gEnd = mL.gBegin + maxColsL[rowPosL1].i2 + mL.gSeedLen; //genomic position of best read split
1774 #ifdef RAZERS_DEBUG
1775 ::std::cout << "rowPosL1=" << rowPosL1 << ::std::endl;
1776 ::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1777 ::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1778 ::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1779
1780 std::cout << "after split:\nmL.mScore =" << mL.mScore << "\n";
1781 std::cout << "mL.gBegin =" << length(genome)-mL.gBegin << "\n";
1782 std::cout << "mL.gEnd =" << length(genome)-mL.gEnd << "\n";
1783 std::cout << "mL.editDist =" << mL.editDist << "\n";
1784
1785 std::cout << "mR.mScore =" << mR.mScore << "\n";
1786 std::cout << "mR.gBegin =" << length(genome)-mR.gBegin << "\n";
1787 std::cout << "mR.gEnd =" << length(genome)-mR.gEnd << "\n";
1788 std::cout << "mR.editDist =" << mR.editDist << "\n";
1789 #endif
1790
1791 }
1792 maxErrors = static_cast<unsigned>((mR.mScore + mL.mScore) * options.errorRate);
1793 if(mR.editDist + mL.editDist > maxErrors) return false; //make sure percent identity critrium is fulfilled on matched part of the read
1794
1795 return true;
1796 }
1797
1798
1799
1800 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1801 // Hamming distance match combination
1802
1803 template <typename TMatch, typename TOriRead, typename TGenome, typename TSpec>
1804 bool
combineLeftRight(TMatch & mR,TMatch & mL,TOriRead const & oriRead,TGenome & genome,RazerSOptions<TSpec> & options,char orientation,SwiftSemiGlobalHamming)1805 combineLeftRight(TMatch & mR,
1806 TMatch & mL,
1807 TOriRead const& oriRead,
1808 TGenome & genome,
1809 RazerSOptions<TSpec> &options,
1810 char orientation,
1811 SwiftSemiGlobalHamming)
1812 {
1813
1814 #ifdef RAZERS_DEBUG
1815 ::std::cout << "combineLeftRightHamming\n";
1816 #endif
1817
1818
1819 typedef typename Infix<TOriRead const>::Type TRead;
1820 TRead read = infix(oriRead,0,length(oriRead));
1821
1822 typedef typename Infix<TGenome>::Type TGenomeInf;
1823 TGenomeInf genomeInf = infix(genome, mL.gBegin, mR.gEnd);
1824 int readLength = length(read);
1825 unsigned maxErrors = static_cast<unsigned>(readLength * options.errorRate);
1826
1827
1828 #ifdef RAZERS_DEBUG
1829 std::cout << "readLength=" << readLength << "\n";
1830 std::cout << "sumLen=" << mL.mScore + mR.mScore << "\n";
1831 std::cout << "gInfLength=" << length(genomeInf) << "\n";
1832 std::cout << "gInf=" << genomeInf << "\n";
1833
1834 std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1835 std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1836 std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1837 std::cout << "mL.editDist =" << mL.editDist << "\n";
1838 std::cout << "readPref=" << prefix(oriRead,mL.mScore) << "\n";
1839
1840 std::cout << "mR.mScore =" << mR.mScore << "\n";
1841 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1842 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1843 std::cout << "mR.editDist =" << mR.editDist << "\n";
1844 std::cout << "readSuff=" << suffix(oriRead,length(oriRead)-mR.mScore) << "\n";
1845 #endif
1846
1847
1848 // neither insertion nor deletion
1849 if(mR.gEnd - mL.gBegin == readLength)
1850 {
1851 #ifdef RAZERS_DEBUG
1852 ::std::cout << "normal\n";
1853 #endif
1854 unsigned halfReadLen = readLength >> 1;
1855 unsigned prefixErrors = 0;
1856 unsigned suffixErrors = 0;
1857 for (unsigned i = 0 ; i < length(genomeInf); ++i)
1858 {
1859 if ((options.compMask[ordValue(read[i])] & options.compMask[ordValue(genomeInf[i])]) == 0)
1860 {
1861 if(i >= halfReadLen)
1862 ++suffixErrors;
1863 else
1864 ++prefixErrors;
1865 }
1866 }
1867
1868 if (suffixErrors+prefixErrors <= maxErrors)
1869 {
1870 mL.mScore = halfReadLen;
1871 mR.mScore = length(read)- halfReadLen;
1872 mR.gBegin = mR.gEnd - mR.mScore;
1873 mL.gEnd = mL.gBegin + mL.mScore;
1874 mL.editDist = prefixErrors;
1875 mR.editDist = suffixErrors;
1876 }
1877 else return false;
1878
1879 }
1880 //potential insertion
1881 if(mR.gEnd - mL.gBegin < readLength)
1882 {
1883 #ifdef RAZERS_DEBUG
1884 ::std::cout << "insertion\n";
1885 #endif
1886
1887 if(mR.gEnd - mL.gBegin < static_cast<long int>(2*options.minMatchLen))//too close together
1888 return false;
1889
1890 if(mR.mScore + mL.mScore < mR.gEnd - mL.gBegin) //prefix and suffix match do not meet
1891 return false;
1892
1893 if((mR.mScore + mL.mScore == mR.gEnd - mL.gBegin) && (mR.editDist + mL.editDist > maxErrors)) //prefix and suffix match meet but too many errors
1894 return false;
1895
1896 // if((mR.gEnd - mL.gBegin <= mL.mScore) || (mR.gEnd - mL.gBegin <= mR.mScore))//too close together
1897 // return false;
1898 int traceExt = 0;
1899 bool result = findBestSplitPosition(read,genomeInf,mL.mScore,mR.mScore,mL.editDist,mR.editDist, traceExt, options, orientation, SwiftSemiGlobalHamming());
1900 if(!result || mR.editDist + mL.editDist > maxErrors) return false;
1901 mR.gBegin = mR.gEnd - mR.mScore;
1902 mL.gEnd = mL.gBegin + mL.mScore;
1903 mR.traceExtension = mL.traceExtension = traceExt;
1904
1905
1906 }
1907 //potential deletion
1908 if(mR.gEnd - mL.gBegin > readLength)
1909 {
1910 #ifdef RAZERS_DEBUG
1911 ::std::cout << "deletion\n";
1912 #endif
1913
1914 if(mR.mScore + mL.mScore < readLength)
1915 return false;
1916
1917 if((mR.mScore + mL.mScore == readLength) && (mR.editDist + mL.editDist > maxErrors)) //the prefix and suffix match meet and do not overlap --> perfect
1918 return false;
1919
1920 int traceExt = 0;
1921 bool result = findBestSplitPosition(genomeInf,read,mL.mScore,mR.mScore,mL.editDist,mR.editDist, traceExt, options, orientation,SwiftSemiGlobalHamming());
1922
1923 if(!result || mR.editDist + mL.editDist > maxErrors) return false;
1924
1925 mR.traceExtension = mL.traceExtension = traceExt;
1926 mR.gBegin = mR.gEnd - mR.mScore;
1927 mL.gEnd = mL.gBegin + mL.mScore;
1928
1929 }
1930 #ifdef RAZERS_DEBUG
1931 std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1932 std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1933 std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1934 std::cout << "mL.editDist =" << mL.editDist << "\n";
1935
1936 std::cout << "mR.mScore =" << mR.mScore << "\n";
1937 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1938 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1939 std::cout << "mR.editDist =" << mR.editDist << "\n";
1940 #endif
1941
1942 maxErrors = static_cast<unsigned>((mR.mScore + mL.mScore) * options.errorRate);
1943 if(mR.editDist + mL.editDist > maxErrors) return false; //make sure percent identity critrium is fulfilled on matched part of the read
1944
1945 return true;
1946 }
1947
1948
1949 // find the best split position for a split match
1950 // positions are relative to shorter sequence
1951 // (deletion --> read is shorter, insertion --> genomeInf is shorter)
1952 template <typename TSize, typename TLongerSegment, typename TShorterSegment, typename TErrors, typename TOptions>
1953 bool
findBestSplitPosition(TLongerSegment & longSeg,TShorterSegment & shortSeg,TSize & mLmScore,TSize & mRmScore,TErrors & errorsLeft,TErrors & errorsRight,int & traceExt,TOptions & options,char orientation,SwiftSemiGlobalHamming)1954 findBestSplitPosition(TLongerSegment &longSeg,
1955 TShorterSegment &shortSeg,
1956 TSize & mLmScore,
1957 TSize & mRmScore,
1958 TErrors & errorsLeft,
1959 TErrors & errorsRight,
1960 int & traceExt,
1961 TOptions & options,
1962 char orientation,
1963 SwiftSemiGlobalHamming)
1964 {
1965
1966 #ifdef RAZERS_DEBUG
1967 ::std::cout << "findBestSplitHamming"<<orientation<<"\n";
1968 #endif
1969
1970 // usually, both types should be the same, but you never know...
1971 //typedef typename Iterator<TLongerSegment const>::Type TLongIterator;
1972 //typedef typename Iterator<TShorterSegment const>::Type TShortIterator;
1973 typedef typename Size<TShorterSegment const>::Type TShortSize;
1974 typedef typename Size<TLongerSegment const>::Type TLongSize;
1975
1976
1977 TShortSize shortLen = length(shortSeg);
1978 TLongSize longLen = length(longSeg);
1979 TLongSize lenDiff = longLen - shortLen;
1980
1981 TLongSize leftLongBegin = _max((int)options.minMatchLen,(int)shortLen-mRmScore);
1982 TLongSize leftLongEnd = _min((int)mLmScore,(int)shortLen-(int)options.minMatchLen);
1983 TLongSize leftLongPos = leftLongBegin;
1984
1985 TLongSize rightLongPos = leftLongBegin + lenDiff;
1986 TShortSize shortPos = leftLongBegin;
1987
1988 int bestSumErrors = 0;
1989 // int bestLErrors = 0;
1990 // int bestRErrors = 0;
1991 int bestPos = shortPos;
1992 int errorsL = 0;
1993 int errorsR = 0;
1994 int errorsPosL = 0;
1995 int errorsPosR = 0;
1996
1997 // determine trace extensions
1998 int bestTraceExtL = shortPos;
1999 int bestTraceExtR = shortPos;
2000
2001 #ifdef RAZERS_DEBUG
2002 std::cout << "before\nerrorsLeft= " << errorsLeft << std::endl;
2003 std::cout << "errorsRight= " << errorsRight << std::endl;
2004 std::cout << "mLmScore= " << mLmScore << std::endl;
2005 std::cout << "mRmScore= " << mRmScore << std::endl;
2006
2007 std::cout << "leftLongPos " << leftLongPos << std::endl;
2008 std::cout << "leftLongEnd " << leftLongEnd << std::endl;
2009 std::cout << "rightLongPos " << rightLongPos << std::endl;
2010 std::cout << "shortPos " << shortPos << std::endl;
2011
2012 std::cout << longSeg << "=longSeg\n";
2013 std::cout << shortSeg << "=shortSeg\n";
2014 #endif
2015
2016 //find best split position --> min. sum errors
2017 while(leftLongPos < leftLongEnd)
2018 {
2019 if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[leftLongPos])]) == 0)
2020 {
2021 ++errorsL;
2022 ++errorsPosL;
2023 }
2024 if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[rightLongPos])]) == 0)
2025 {
2026 --errorsPosR;
2027 ++errorsR;
2028 }
2029 if(errorsPosL+errorsPosR < bestSumErrors
2030 || (orientation == 'R' && errorsPosL+errorsPosR == bestSumErrors))
2031 {
2032 bestSumErrors = errorsPosL+errorsPosR;
2033 // bestLErrors = errorsPosL;
2034 // bestRErrors = errorsPosR;
2035 bestPos = shortPos + 1;
2036 if(errorsPosL+errorsPosR < bestSumErrors) bestTraceExtL = bestPos;
2037 }
2038 if(errorsPosL+errorsPosR == bestSumErrors)
2039 {
2040 bestTraceExtR = shortPos + 1;
2041 }
2042 ++leftLongPos;
2043 ++rightLongPos;
2044 ++shortPos;
2045 }
2046
2047 // trace extension:
2048 traceExt = bestTraceExtR - bestTraceExtL;
2049
2050 //update to new match lengths
2051 mLmScore = bestPos;
2052 mRmScore = shortLen - bestPos;
2053
2054 //count numErrors for left and right match
2055 //(have to count completely new, because mScore may be longer than shortLen --> no able to track errors outside segment)
2056 errorsRight = errorsLeft = 0;
2057 for(leftLongPos = 0, shortPos = 0; leftLongPos < (unsigned)mLmScore; ++leftLongPos, ++shortPos)
2058 {
2059 if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[leftLongPos])]) == (unsigned) 0)
2060 ++errorsLeft;
2061
2062 }
2063 for(rightLongPos = 0, shortPos = 0; rightLongPos < (unsigned)mRmScore; ++rightLongPos, ++shortPos)
2064 {
2065 if((options.compMask[ordValue(shortSeg[shortLen-1-shortPos])] & options.compMask[ordValue(longSeg[longLen-1-rightLongPos])]) == (unsigned) 0)
2066 ++errorsRight;
2067
2068 }
2069
2070 #ifdef RAZERS_DEBUG
2071 std::cout << "bestSumErrors= " << bestSumErrors << std::endl;
2072 std::cout << "errorsPosR= " << errorsPosR << std::endl;
2073 std::cout << "errorsPosL= " << errorsPosL << std::endl;
2074 std::cout << "after\nerrorsLeft= " << errorsLeft << std::endl;
2075 std::cout << "errorsRight= " << errorsRight << std::endl;
2076 std::cout << "mLmScore= " << mLmScore << std::endl;
2077 std::cout << "mRmScore= " << mRmScore << std::endl;
2078 std::cout << "traceExt= " << traceExt << std::endl;
2079 #endif
2080
2081 return true;
2082 }
2083
2084
2085
2086
2087
2088 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2089 // Find read matches in many genome sequences (import from Fasta)
2090 template <
2091 typename TMatches,
2092 typename TReadSet_,
2093 typename TCounts,
2094 typename TSpec,
2095 typename TShapeL,
2096 typename TShapeR,
2097 typename TSwiftSpec >
mapSplicedReads(TMatches & matches,StringSet<CharString> & genomeFileNameList,StringSet<CharString> & genomeNames,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReadSet_ & readSet,TReadRegions & readRegions,TCounts & cnts,RazerSOptions<TSpec> & options,TShapeL const & shapeL,TShapeR const & shapeR,Swift<TSwiftSpec> const)2098 int mapSplicedReads(
2099 TMatches & matches,
2100 StringSet<CharString> & genomeFileNameList,
2101 StringSet<CharString> & genomeNames, // genome names, taken from the Fasta file
2102 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
2103 TReadSet_ & readSet,
2104 TReadRegions & readRegions,
2105 TCounts & cnts,
2106 RazerSOptions<TSpec> & options,
2107 TShapeL const & shapeL,
2108 TShapeR const & shapeR,
2109 Swift<TSwiftSpec> const)
2110 {
2111
2112
2113 typedef typename Value<TReadSet_>::Type TRead;
2114 typedef StringSet<typename Infix<TRead>::Type> TReadSet;
2115 typedef Index<TReadSet, IndexQGram<TShapeL, TQGramIndexSpec> > TIndexL; // q-gram index left
2116 typedef Index<TReadSet, IndexQGram<TShapeR, TQGramIndexSpec> > TIndexR; // q-gram index right
2117 typedef Pattern<TIndexL, Swift<TSwiftSpec> > TSwiftPatternL; // filter //should be the same thing for left and right
2118 typedef Pattern<TIndexR, Swift<TSwiftSpec> > TSwiftPatternR; // filter
2119 typedef Pattern<TRead, MyersUkkonen> TMyersPattern; // verifier
2120
2121
2122 // split reads over two indices, one for prefixes, the other for suffixes
2123 TReadSet readSetL, readSetR;
2124 unsigned readCount = length(readSet);
2125 resize(readSetL, readCount);
2126 resize(readSetR, readCount);
2127
2128 if(options._debugLevel > 0)
2129 {
2130 int64_t genomeLen = static_cast<int64_t>(3000000000lu) * 2; // ufff make that an option
2131 expNumRandomMatches(readSet, genomeLen, options);
2132 }
2133
2134 if(options._debugLevel > 0 )
2135 std::cout << "Performing spliced mapping.\n";
2136 for (unsigned i = 0; i < readCount; ++i)
2137 {
2138 if(length(readSet[i])>=2*options.minMatchLen)
2139 {
2140 assign(readSetL[i], infix(readSet[i],0,options.minMatchLen), Exact());
2141 assign(readSetR[i], infix(readSet[i],length(readSet[i])-options.minMatchLen,length(readSet[i])), Exact());
2142 }
2143 else
2144 {
2145 assign(readSetL[i], infix(readSet[i],0,0), Exact());
2146 assign(readSetR[i], infix(readSet[i],0,0), Exact());
2147 }
2148 }
2149
2150
2151 if(options._debugLevel > 1)::std::cout << "Make index left right\n";
2152 // configure q-gram index
2153 TIndexL swiftIndexL(readSetL, shapeL);
2154 TIndexR swiftIndexR(readSetR, shapeR);
2155
2156 #ifdef RAZERS_OPENADDRESSING
2157 swiftIndexL.alpha = 2;
2158 swiftIndexR.alpha = 2;
2159 #endif
2160
2161 cargo(swiftIndexL).abundanceCut = options.abundanceCut;
2162 cargo(swiftIndexR).abundanceCut = options.abundanceCut;
2163 cargo(swiftIndexL)._debugLevel = 0;
2164 cargo(swiftIndexR)._debugLevel = options._debugLevel;
2165
2166 // configure Swift
2167 TSwiftPatternL swiftPatternL(swiftIndexL);
2168 TSwiftPatternR swiftPatternR(swiftIndexR);
2169 swiftPatternL.params.minThreshold = options.threshold;
2170 swiftPatternR.params.minThreshold = options.thresholdR;
2171 swiftPatternL.params.tabooLength = options.tabooLength;
2172 swiftPatternR.params.tabooLength = options.tabooLength;
2173
2174 // init edit distance verifiers
2175 String<TMyersPattern> forwardPatternsL;
2176 String<TMyersPattern> forwardPatternsR;
2177 options.compMask[4] = (options.matchN)? 15: 0;
2178 if (!options.hammingOnly)
2179 {
2180 resize(forwardPatternsL, readCount, Exact());
2181 resize(forwardPatternsR, readCount, Exact());
2182 for(unsigned i = 0; i < readCount; ++i)
2183 {
2184 setHost(forwardPatternsL[i], indexText(swiftIndexL)[i]);
2185 setHost(forwardPatternsR[i], indexText(swiftIndexR)[i]);
2186 _patternMatchNOfPattern(forwardPatternsL[i], options.matchN);
2187 _patternMatchNOfPattern(forwardPatternsR[i], options.matchN);
2188 _patternMatchNOfFinder(forwardPatternsL[i], options.matchN);
2189 _patternMatchNOfFinder(forwardPatternsR[i], options.matchN);
2190 }
2191 }
2192
2193 if(options._debugLevel > 1)::std::cout << "Patterns created\n";
2194
2195 // clear stats
2196 options.FP = 0;
2197 options.TP = 0;
2198 options.timeMapReads = 0;
2199 options.timeDumpResults = 0;
2200
2201 unsigned numFiles = length(genomeFileNameList);
2202 unsigned gseqNo = 0;
2203
2204 // open genome files, one by one
2205 for (unsigned filecount = 0; filecount < numFiles; ++filecount)
2206 {
2207 // open genome file
2208 SeqFileIn file;
2209 if (!open(file, toCString(genomeFileNameList[filecount])))
2210 return RAZERS_GENOME_FAILED;
2211
2212 // remove the directory prefix of current genome file
2213 ::std::string genomeFile(toCString(genomeFileNameList[filecount]));
2214 size_t lastPos = genomeFile.find_last_of('/') + 1;
2215 if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1;
2216 if (lastPos == genomeFile.npos) lastPos = 0;
2217 ::std::string genomeName = genomeFile.substr(lastPos);
2218
2219 CharString id;
2220 //Dna5String genome;
2221 String<Dna5Q> genome;
2222 unsigned gseqNoWithinFile = 0;
2223 SEQAN_PROTIMESTART(find_time);
2224
2225 // iterate over genome sequences
2226 for(; !atEnd(file); ++gseqNo)
2227 {
2228 readRecord(id, genome, file); // read Fasta id and sequence
2229 if (options.genomeNaming == 0)
2230 {
2231 cropAfterFirst(id, IsWhitespace()); // crop id after the first whitespace
2232 appendValue(genomeNames, id, Generous());
2233 }
2234 gnoToFileMap.insert(::std::make_pair(gseqNo,::std::make_pair(genomeName,gseqNoWithinFile)));
2235
2236 if (options.forward)
2237 mapSplicedReads(matches, genome, gseqNo, readSet, readRegions, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'F', options);
2238
2239 if (options.reverse)
2240 {
2241 reverseComplement(genome);
2242 mapSplicedReads(matches, genome, gseqNo, readSet, readRegions, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'R', options);
2243 }
2244 ++gseqNoWithinFile;
2245
2246 }
2247 options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
2248 }
2249
2250 compactSplicedMatches(matches, cnts, options, false, swiftPatternL, swiftPatternR);
2251
2252 if (options._debugLevel >= 1)
2253 ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl;
2254 if (options._debugLevel >= 2) {
2255 ::std::cerr << ::std::endl;
2256 ::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
2257 ::std::cerr << "Swift FP: " << options.FP << ::std::endl;
2258 ::std::cerr << "Swift TP: " << options.TP << ::std::endl;
2259 }
2260 return 0;
2261 }
2262
2263
2264 // returns start pos furthest to the left
2265 template<typename TValue, typename TRegion, typename TOptions>
2266 inline typename Value<typename Value<TRegion,2>::Type,2>::Type
regionStartPos(TRegion & readRegion,TValue readLength,TOptions & options)2267 regionStartPos(TRegion & readRegion,
2268 TValue readLength,
2269 TOptions & options)
2270 {
2271 typedef typename Value<typename Value<TRegion,2>::Type,2>::Type TSignedPos;
2272
2273 if(readRegion.i2.i1 < 2) // i2 stores expected end pos of match
2274 return _max((TSignedPos)0,(TSignedPos)(readRegion.i2.i2 - options.libraryLength + readLength - options.libraryError - options.maxGap));
2275 else // i2 stores expected start pos
2276 return (readRegion.i2.i2 + options.libraryLength - readLength - options.libraryError);
2277
2278 }
2279
2280 // returns end pos furthest to the right
2281 template<typename TValue, typename TRegion, typename TOptions>
2282 inline typename Value<typename Value<TRegion,2>::Type,2>::Type
regionEndPos(TRegion & readRegion,TValue readLength,TOptions & options)2283 regionEndPos(TRegion & readRegion,
2284 TValue readLength,
2285 TOptions & options)
2286 {
2287 typedef typename Value<typename Value<TRegion,2>::Type,2>::Type TSignedPos;
2288
2289 if(readRegion.i2.i1 < 2) // i2 stores expected end pos of match
2290 return (TSignedPos)readRegion.i2.i2-options.libraryLength+2*readLength + options.libraryError;
2291 else // i2 stores expected start pos
2292 return readRegion.i2.i2+ options.libraryLength + options.libraryError + options.maxGap;
2293
2294 }
2295
2296
2297
2298 template<typename TMatch, typename TValue, typename TRegion, typename TOptions>
2299 inline bool
isValidRegion(TMatch & mL,TMatch & mR,TValue readLength,TRegion & readRegion,TOptions & options)2300 isValidRegion(TMatch & mL,
2301 TMatch & mR,
2302 TValue readLength,
2303 TRegion & readRegion,
2304 TOptions & options)
2305 {
2306 typedef typename Value<typename Value<TRegion,2>::Type,2>::Type TSignedPos;
2307
2308 if(readRegion.i2.i1 < 2) // mR.gEnd needs to lie within a specific region
2309 {
2310 TSignedPos expEndPos = readRegion.i2.i2 - options.libraryLength + 2*readLength;
2311 if (mR.gEnd < expEndPos - options.libraryError ||
2312 mR.gEnd > expEndPos + options.libraryError )
2313 return false;
2314 }
2315 else // mL.gBegin needs to lie within a specific region
2316 {
2317 TSignedPos expStartPos = readRegion.i2.i2 + options.libraryLength - readLength;
2318 if (mL.gBegin < expStartPos - options.libraryError ||
2319 mL.gBegin > expStartPos + options.libraryError )
2320 return false;
2321 }
2322 return true;
2323 }
2324
2325 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2326 // Find read matches in one genome sequence
2327 template <
2328 typename TMatches,
2329 typename TGenome,
2330 typename TOriReadSet,
2331 typename TReadIndexL,
2332 typename TReadIndexR,
2333 typename TSwiftSpec,
2334 typename TVerifier,
2335 typename TCounts,
2336 typename TSpec >
mapSplicedReads(TMatches & matches,TGenome & genome,unsigned gseqNo,TOriReadSet & readSet,TReadRegions & readRegions,Pattern<TReadIndexL,Swift<TSwiftSpec>> & swiftPatternL,Pattern<TReadIndexR,Swift<TSwiftSpec>> & swiftPatternR,TVerifier & forwardPatternsL,TVerifier & forwardPatternsR,TCounts & cnts,char orientation,RazerSOptions<TSpec> & options)2337 void mapSplicedReads(
2338 TMatches &matches, // resulting matches
2339 TGenome &genome, // genome ...
2340 unsigned gseqNo, // ... and its sequence number
2341 TOriReadSet &readSet, // reads
2342 TReadRegions & readRegions,
2343 Pattern<TReadIndexL, Swift<TSwiftSpec> > &swiftPatternL, // left index
2344 Pattern<TReadIndexR, Swift<TSwiftSpec> > &swiftPatternR, // right index
2345 TVerifier &forwardPatternsL,
2346 TVerifier &forwardPatternsR,
2347 TCounts & cnts,
2348 char orientation,
2349 RazerSOptions<TSpec> &options)
2350 {
2351 typedef typename Value<TOriReadSet>::Type TRead;
2352 typedef typename Fibre<TReadIndexL, FibreText>::Type TReadSetL;
2353 typedef typename Fibre<TReadIndexR, FibreText>::Type TReadSetR;
2354 typedef typename Size<TGenome>::Type TSize;
2355 typedef typename Position<TGenome>::Type TGPos;
2356 typedef typename MakeSigned_<TGPos>::Type TSignedGPos;
2357 typedef typename Value<TMatches>::Type TMatch;
2358 typedef typename Infix<TGenome>::Type TGenomeInf;
2359
2360 // Prefix-Suffix filtration
2361 //typedef Finder<TGenome, Swift<TSwiftSpec> > TSwiftFinderL;
2362 typedef Finder<TGenomeInf, Swift<TSwiftSpec> > TSwiftFinderL;
2363 typedef Finder<TGenomeInf, Swift<TSwiftSpec> > TSwiftFinderR;
2364 //typedef Pattern<TReadIndexL, Swift<TSwiftSpec> > TSwiftPatternL;
2365 //typedef Pattern<TReadIndexR, Swift<TSwiftSpec> > TSwiftPatternR;
2366
2367 typedef Pair<int64_t, TMatch> TDequeueValue;
2368 typedef Dequeue<TDequeueValue> TDequeue;
2369 typedef typename TDequeue::TIter TDequeueIterator;
2370
2371 const unsigned NOT_VERIFIED = 1u << (8*sizeof(unsigned)-1);
2372
2373 // iterate all genomic sequences
2374 if (options._debugLevel >= 1)
2375 {
2376 ::std::cerr << ::std::endl << "Process genome seq #" << gseqNo;
2377 if (orientation == 'F')
2378 ::std::cerr << "[fwd]";
2379 else
2380 ::std::cerr << "[rev]";
2381 }
2382
2383 TReadSetL &readSetL = host(host(swiftPatternL));
2384 TReadSetR &readSetR = host(host(swiftPatternR));
2385
2386 if (empty(readSetL) || empty(readSetR))
2387 return;
2388
2389
2390 TSignedGPos maxDistance = options.maxGap;
2391 // raus:TSignedGPos maxDistance = options.maxGap + (int)options.maxReadLength;
2392 //TSignedGPos minDistance = options.minGap;// + 2*options.minMatchLen ;
2393 TSignedGPos minDistance = 2*options.minMatchLen ;
2394 if(!options.hammingOnly)
2395 minDistance -= (options.maxPrefixErrors + options.maxSuffixErrors);
2396
2397 // exit if contig is shorter than minDistance
2398 if (length(genome) <= (unsigned)2*options.minMatchLen)
2399 return;
2400
2401 TGPos scanBegin = 0;
2402 TGPos scanEnd = length(genome);
2403 if(!empty(readRegions))
2404 {
2405 if(options._debugLevel > 1)
2406 {
2407 std::cout << "MaxRegionEndPos=" << options.maxReadRegionsEnd << std::endl;
2408 std::cout << "MinRegionStartPos=" << options.minReadRegionsStart << std::endl;
2409 }
2410 scanBegin = options.minReadRegionsStart;
2411 if (scanBegin > length(genome))
2412 scanBegin = length(genome);
2413 scanEnd = options.maxReadRegionsEnd;
2414 if (scanEnd > length(genome))
2415 scanEnd = length(genome);
2416 }
2417 TGenomeInf genomeInf = infix(genome, scanBegin, scanEnd);
2418 //TSwiftFinderL swiftFinderL(genome, options.repeatLength, 1);
2419 TSwiftFinderL swiftFinderL(genomeInf, options.repeatLength, 1);
2420 TSwiftFinderR swiftFinderR(genomeInf, options.repeatLength, 1);
2421
2422 TDequeue fifo; // stores potential prefix matches
2423 String<int64_t> lastPotMatchNo; // last number of a potential prefix match
2424 int64_t lastNo = 0; // last number over all potential prefix matches in the queue
2425 int64_t firstNo = 0; // first number over all potential prefix matches in the queue
2426 Pair<TGPos> gPair;
2427
2428 resize(lastPotMatchNo, length(host(swiftPatternL)), (int64_t)-1, Exact());
2429
2430 String<Pair<TGPos> > lastRightMatch; // begin and end of last verified suffix match
2431 resize(lastRightMatch, length(host(swiftPatternL)), Pair<TGPos>(0,0), Exact());
2432
2433 TSize gLength = length(genome);
2434 if(orientation == 'R') scanBegin = gLength - scanEnd;
2435 TMatch mR = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2436 TMatch temp = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2437 TDequeueValue fL(-1, mR);
2438 fL.i2.gseqNo = gseqNo;
2439 mR.gseqNo = gseqNo;
2440 fL.i2.orientation = orientation;
2441 mR.orientation = orientation;
2442
2443 double maxErrorRateL = (double)options.maxPrefixErrors/options.minMatchLen;
2444 double maxErrorRateR = (double)options.maxSuffixErrors/options.minMatchLen;
2445
2446 double totalTime = sysTime();
2447 double totalTimeCombine = 0.0;
2448 double totalTimeLeftVerify = 0.0;
2449 double totalTimeLeftFilter = 0.0;
2450 double totalTimeRightVerify = 0.0;
2451
2452 Pair<TGPos,TGPos> lastLeftMatch(0,0);
2453 // iterate all verification regions returned by SWIFT
2454 while (find(swiftFinderR, swiftPatternR, maxErrorRateR))
2455 {
2456 unsigned rseqNo = swiftPatternR.curSeqNo;
2457 TGPos rEndPos = endPosition(swiftFinderR) + scanBegin;
2458 TGPos rBeginPos = beginPosition(swiftFinderR) + scanBegin;
2459 // std::cout << "rEnd=" << rEndPos << "\t";
2460 //TGPos doubleParWidth = 2 * (*swiftFinderR.curHit).bucketWidth;
2461 TRead const &read = readSet[rseqNo];
2462 unsigned readLength = length(readSet[rseqNo]);
2463 TSignedGPos extensionOffset = length(read)-2*options.minMatchLen;
2464 if(!options.hammingOnly)
2465 extensionOffset += static_cast<TGPos>(floor(options.errorRate*length(read)));
2466 // passed all valid regions
2467 if(!empty(readRegions) && (TSignedGPos)rBeginPos > (TSignedGPos)options.maxReadRegionsEnd)
2468 break;
2469 if(!empty(readRegions) &&
2470 ((TSignedGPos)rEndPos < (TSignedGPos)regionStartPos(readRegions[rseqNo],readLength,options) || (TSignedGPos)rBeginPos > (TSignedGPos)regionEndPos(readRegions[rseqNo],readLength,options))) //parallelogram must lie within possible mapping region
2471 continue;
2472
2473 // Check this again...
2474 // remove out-of-window prefixes from fifo
2475 while(!empty(fifo) && front(fifo).i2.gEnd + extensionOffset + maxDistance < (TSignedGPos) rBeginPos)
2476 //raus:while (!empty(fifo) && front(fifo).i2.gBegin + maxDistance + (TSignedGPos)doubleParWidth < (TSignedGPos)rEndPos)
2477 {
2478 popFront(fifo);
2479 ++firstNo;
2480 }
2481
2482 // Check this again...
2483 // add within-window prefixes to fifo
2484 double vTimeBegin1 = sysTime();
2485 while (empty(fifo) || back(fifo).i2.gBegin + minDistance <= (TSignedGPos)rEndPos )
2486 {
2487 if (find(swiftFinderL, swiftPatternL, maxErrorRateL))
2488 {
2489 gPair = positionRange(swiftFinderL);
2490 gPair.i1 += scanBegin;
2491 gPair.i2 += scanBegin;
2492 // std::cout << "lBegin=" << gPair.i1 << "\t";
2493 if(!empty(readRegions) && (TSignedGPos)gPair.i1 > (TSignedGPos) options.maxReadRegionsEnd)
2494 break;
2495 if ((TSignedGPos)gPair.i2 + maxDistance + extensionOffset >= (TSignedGPos)rEndPos
2496 && (empty(readRegions) ||
2497 ((TSignedGPos)gPair.i2 > (TSignedGPos)regionStartPos(readRegions[swiftPatternL.curSeqNo],readLength,options)
2498 && (TSignedGPos)gPair.i1 < (TSignedGPos)regionEndPos(readRegions[swiftPatternL.curSeqNo],readLength,options)))) //parallelogram must lie within possible mapping region
2499 //raus:if ((TSignedGPos)gPair.i2 + maxDistance + (TSignedGPos)doubleParWidth >= (TSignedGPos)rEndPos)
2500 {
2501 // link in
2502 fL.i1 = lastPotMatchNo[swiftPatternL.curSeqNo]; //link to last previous potential match
2503 lastPotMatchNo[swiftPatternL.curSeqNo] = lastNo++; //++ general counter and remember last pot match of curSeqNo
2504
2505 fL.i2.rseqNo = swiftPatternL.curSeqNo | NOT_VERIFIED; // set first bit
2506 fL.i2.gBegin = gPair.i1;
2507 fL.i2.gEnd = gPair.i2;
2508 pushBack(fifo, fL);
2509 }
2510 #ifdef RAZERS_DEBUG
2511 ::std::cout << "Discard potential match, out of window\n";
2512 #endif
2513 }
2514 else
2515 break;
2516 }
2517 double vTimeEnd1 = sysTime();
2518 totalTimeLeftFilter += (vTimeEnd1 - vTimeBegin1);
2519
2520
2521 TDequeueIterator it;
2522 int64_t lastPositive = (int64_t)-1;
2523
2524 TSize counter = 0;
2525 bool noMatchRight = false;
2526 bool notYetVerifiedRight = true;
2527 lastLeftMatch.i1 = 0;
2528 lastLeftMatch.i2 = 0;
2529
2530 // walk through all potential prefix matches
2531 // if suffix is positive, verify prefixes (if not verfied already), mark as positive or negative
2532 for (int64_t i = lastPotMatchNo[rseqNo]; firstNo <= i; i = (*it).i1)
2533 {
2534 //CHECK HIER raus do suffix match verification only once
2535 if(notYetVerifiedRight)
2536 {
2537 notYetVerifiedRight = false;
2538 TGPos minBeginPos = beginPosition(infix(swiftFinderR, genomeInf));
2539 if((int)minBeginPos - extensionOffset > 0)
2540 minBeginPos = minBeginPos - extensionOffset;
2541 else minBeginPos = 0;
2542 double vTimeBegin = sysTime();
2543 if (!matchVerify(mR,
2544 infix(genome,minBeginPos,endPosition(infix(swiftFinderR, genomeInf))),
2545 rseqNo,
2546 readSet,//readSetR,
2547 forwardPatternsR,
2548 options,
2549 TSwiftSpec(),
2550 LongestSuffix()))
2551 {
2552 double vTimeEnd = sysTime();
2553 totalTimeRightVerify += (vTimeEnd - vTimeBegin);
2554 noMatchRight = true;
2555 // continue;
2556 }
2557 else
2558 {
2559 double vTimeEnd = sysTime();
2560 totalTimeRightVerify += (vTimeEnd - vTimeBegin);
2561 if (lastRightMatch[rseqNo].i1 == (TGPos)mR.gBegin && lastRightMatch[rseqNo].i2 == (TGPos)mR.gEnd)
2562 noMatchRight = true;
2563 lastRightMatch[rseqNo].i1 = mR.gBegin;
2564 lastRightMatch[rseqNo].i2 = mR.gEnd;
2565
2566 // check here if the match lies within the candidate region!!!
2567 }
2568 }
2569
2570
2571
2572 // std::cout << "i=" <<i << "\t";
2573 it = &value(fifo, i - firstNo);
2574 //CHECK HIER raus noMatchRight --> \FCberspringen korrekt?
2575 if (noMatchRight || (*it).i2.gBegin + minDistance > (TSignedGPos)rEndPos)
2576 {
2577 if (lastPositive == (int64_t)-1)
2578 lastPotMatchNo[rseqNo] = i;
2579 else
2580 value(fifo, lastPositive - firstNo).i1 = i;
2581 lastPositive = i;
2582 continue;
2583 }
2584
2585 // verify left mate (equal seqNo), if not done already
2586 if ((*it).i2.rseqNo & NOT_VERIFIED)
2587 {
2588 TGPos maxEndPos = (*it).i2.gEnd + extensionOffset;
2589 if(maxEndPos > length(genome)) maxEndPos = length(genome);
2590 double vTimeBegin = sysTime();
2591 if (matchVerify( (*it).i2,
2592 infix(genome, (*it).i2.gBegin, maxEndPos),
2593 rseqNo,
2594 readSet, //readSetL
2595 forwardPatternsL,
2596 options,
2597 TSwiftSpec(),
2598 LongestPrefix()) &&
2599 !(lastLeftMatch.i1 == (TGPos)(*it).i2.gBegin && lastLeftMatch.i2 == (TGPos)(*it).i2.gEnd ))
2600 {
2601 double vTimeEnd = sysTime();
2602 totalTimeLeftVerify += (vTimeEnd - vTimeBegin);
2603 (*it).i2.rseqNo &= ~NOT_VERIFIED; // has been verified positively // unset first bit
2604 lastLeftMatch.i1 = (*it).i2.gBegin;
2605 lastLeftMatch.i2 = (*it).i2.gEnd;
2606 // short-cut negative matches
2607 if (lastPositive == (int64_t)-1)
2608 lastPotMatchNo[rseqNo] = i;
2609 else
2610 value(fifo, lastPositive - firstNo).i1 = i;
2611 lastPositive = i;
2612 }
2613 else
2614 {
2615 double vTimeEnd = sysTime();
2616 totalTimeLeftVerify += (vTimeEnd - vTimeBegin);
2617 (*it).i2.rseqNo = ~NOT_VERIFIED; // has been verified negatively
2618 }
2619
2620 }
2621
2622 // prefix match was found
2623 if ((*it).i2.rseqNo == rseqNo)
2624 {
2625 lastLeftMatch.i1 = (*it).i2.gBegin;
2626 lastLeftMatch.i2 = (*it).i2.gEnd;
2627
2628 // dont shortcut too much
2629 if (lastPositive == (int64_t)-1 || i < lastPositive)
2630 lastPositive = i;
2631
2632 /* // CHECK HIER rein
2633 // do suffix match verification
2634 if(notYetVerifiedRight)
2635 {
2636 notYetVerifiedRight = false;
2637 TGPos minBeginPos = beginPosition(infix(swiftFinderR, genomeInf));
2638 if((int)minBeginPos - extensionOffset > 0)
2639 minBeginPos = minBeginPos - extensionOffset;
2640 else minBeginPos = 0;
2641 double vTimeBegin = sysTime();
2642 if (!matchVerify(mR,
2643 infix(genome,minBeginPos,endPosition(infix(swiftFinderR, genomeInf))),
2644 rseqNo,
2645 readSet,//readSetR,
2646 forwardPatternsR,
2647 options,
2648 TSwiftSpec(),
2649 LongestSuffix()))
2650 {
2651 double vTimeEnd = sysTime();
2652 totalTimeRightVerify += (vTimeEnd - vTimeBegin);
2653 noMatchRight = true;
2654 continue;
2655 }
2656 else
2657 {
2658 double vTimeEnd = sysTime();
2659 totalTimeRightVerify += (vTimeEnd - vTimeBegin);
2660 if (lastRightMatch[rseqNo].i1 == (TGPos)mR.gBegin && lastRightMatch[rseqNo].i2 == (TGPos)mR.gEnd)
2661 noMatchRight = true;
2662 lastRightMatch[rseqNo].i1 = mR.gBegin;
2663 lastRightMatch[rseqNo].i2 = mR.gEnd;
2664 }
2665 }
2666 */
2667 //else check if prefix and suffix match fit together
2668 if(!noMatchRight)
2669 {
2670 int outerDistance = mR.gEnd - (*it).i2.gBegin;
2671 #ifdef RAZERS_DEBUG
2672 std::cout << "before\nmL.mScore =" << (*it).i2.mScore << "\n";
2673 std::cout << "mL.gBegin =" << (*it).i2.gBegin << "\n";
2674 std::cout << "mL.gEnd =" << (*it).i2.gEnd << "\n";
2675 std::cout << "mL.editDist =" << (*it).i2.editDist << "\n";
2676
2677 std::cout << "mR.mScore =" << mR.mScore << "\n";
2678 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
2679 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
2680 std::cout << "mR.editDist =" << mR.editDist << "\n";
2681 std::cout << "outerDist =" << outerDistance << "\n";
2682 #endif
2683 if (outerDistance < (int)(2 * options.minMatchLen)) //subtract suffix/prefix-errors in case of edit distance
2684 continue;
2685 // ::std::cout << "outerDistance->" << outerDistance << std::endl;
2686 int outerDistanceError = length(readSet[rseqNo]) -(int)(outerDistance);
2687 if (outerDistanceError < 0) outerDistanceError = -outerDistanceError;
2688 if ((outerDistanceError > (int) options.maxGap) || // f\FCr edit + static_cast<TGPos>(floor(options.errorRate*length(read))
2689 (outerDistanceError < (int) options.minGap)) // f\FCr edit - static_cast<TGPos>(floor(options.errorRate*length(read))
2690 continue;
2691
2692 TMatch mRtmp = mR;
2693 TMatch mLtmp = (*it).i2;
2694 if(!empty(readRegions) &&
2695 !isValidRegion(mLtmp,mRtmp,readLength,readRegions[rseqNo],options))
2696 // ((TSignedGPos)mLtmp.gBegin < (TSignedGPos)readRegions[rseqNo].i2 ||
2697 // (TSignedGPos)mRtmp.gEnd > (TSignedGPos)readRegions[rseqNo].i3)) //match must lie within possible mapping region
2698 continue;
2699 double t1 = sysTime();
2700 if(options.spec.DONT_VERIFY ||
2701 !combineLeftRight(mRtmp,mLtmp,read,genome,options,orientation,TSwiftSpec()))
2702 {
2703 ++options.FP;
2704 double t2 = sysTime();
2705 totalTimeCombine += (t2 - t1) ;
2706 continue;
2707 }
2708 else {
2709 double t2 = sysTime();
2710 totalTimeCombine += (t2 - t1) ;
2711 ++options.TP;
2712 }
2713 ++counter;
2714
2715 //assign an alignment score
2716 //mRtmp.alignmentScore = (options.matchScore * (sumLength - sumDist)) + (options.mismatchScore * sumDist);
2717 //if(outerDistanceError != 0) mRtmp.alignmentScore += ((outerDistanceError - 1) * options.gapExtend) + options.gapOpen;
2718 //mLtmp.alignmentScore = mRtmp.alignmentScore;
2719
2720 //mRtmp.alignmentScore = (options.matchScore * (sumLength - sumDist)) + (options.mismatchScore * sumDist);
2721 //if(outerDistanceError != 0)
2722 //{
2723 // int maxErrors = (int)(options.errorRate*length(readSet[rseqNo]));
2724 // double gapScoreConvergeTo = ((options.minMatchLen*2 - maxErrors) * options.matchScore) + (maxErrors * options.mismatchScore) ;
2725 // double gapPenalty = gapScoreConvergeTo*(1-exp((double)-options.indelLambda*outerDistanceError));
2726 // mRtmp.alignmentScore -= (int)gapPenalty;
2727 //}
2728 //mLtmp.alignmentScore = mRtmp.alignmentScore;
2729
2730 if (orientation == 'R')
2731 {
2732 TSize temp = mLtmp.gBegin;
2733 mLtmp.gBegin = gLength - mLtmp.gEnd;
2734 mLtmp.gEnd = gLength - temp;
2735 temp = mRtmp.gBegin;
2736 mRtmp.gBegin = gLength - mRtmp.gEnd;
2737 mRtmp.gEnd = gLength - temp;
2738 outerDistance = -outerDistance;
2739 }
2740 // set a unique pair id
2741 mLtmp.pairId = mRtmp.pairId = options.nextMatePairId;
2742 if (++options.nextMatePairId == 0)
2743 options.nextMatePairId = 1;
2744
2745 // score the whole match pair
2746 //mLtmp.pairScore = mRtmp.pairScore = 0 - mLtmp.editDist - mRtmp.editDist;
2747 // score the whole match pair by # of matches bases - # mismatched bases (not entirely correct for edit distance)
2748 #ifdef TRY_SCORES
2749 double identityScore = ( (100.00 - (100.00* (double)(mLtmp.editDist + mRtmp.editDist)/(mRtmp.mScore + mLtmp.mScore))) - 80.0 ) * 5.0;
2750 if(options._debugLevel > 1 )std::cout << "identityScore: " << identityScore << std::endl;
2751
2752 double aliScore = mRtmp.mScore + mLtmp.mScore - 2* mLtmp.editDist - 2* mRtmp.editDist;
2753 aliScore = (double)(aliScore*100)/readLength;
2754 mLtmp.pairScore = mRtmp.pairScore = (int)(identityScore+aliScore)/2.0;
2755 #else
2756 mLtmp.pairScore = mRtmp.pairScore = mRtmp.mScore + mLtmp.mScore - 2* mLtmp.editDist - 2* mRtmp.editDist;
2757 #endif
2758
2759
2760 if(outerDistanceError != 0)
2761 { //subtract one if there is an indel in the middle (such that perfect matches are better than indel matches..)
2762 mLtmp.pairScore -= (int)((double)options.penaltyC * length(readSet[rseqNo])/100) ;
2763 mRtmp.pairScore -= (int)((double)options.penaltyC * length(readSet[rseqNo])/100) ;
2764 }
2765 #ifdef RAZERS_DEBUG_LIGHT
2766 bool falsch = false;
2767 if((unsigned)(mRtmp.mScore + mLtmp.mScore) > length(readSet[rseqNo])) falsch = true;
2768 if(mRtmp.mScore> length(readSet[rseqNo])-options.minMatchLen || mRtmp.mScore < options.minMatchLen) falsch = true;
2769 if((int)mRtmp.gEnd - (int)mRtmp.gBegin < 0 || (int)mRtmp.gEnd - (int)mRtmp.gBegin > 500) falsch = true;
2770 if(mRtmp.editDist > 200) falsch = true;
2771
2772 if(mLtmp.mScore> length(readSet[rseqNo])-options.minMatchLen ||mLtmp.mScore < options.minMatchLen) falsch = true;
2773 if((int)mLtmp.gEnd - (int)mLtmp.gBegin < 0 || (int)mLtmp.gEnd - (int)mLtmp.gBegin > 500) falsch = true;
2774 if(mLtmp.editDist > 200) falsch = true;
2775
2776 if(falsch)
2777 {
2778 ::std::cout << "rseqNo="<<rseqNo <<::std::endl;
2779 std::cout << "mL.mScore =" << mLtmp.mScore << "\n";
2780 std::cout << "mL.gBegin =" << mLtmp.gBegin << "\n";
2781 std::cout << "mL.gEnd =" << mLtmp.gEnd << "\n";
2782 std::cout << "mL.editDist =" << mLtmp.editDist << "\n";
2783
2784 std::cout << "mR.mScore =" << mRtmp.mScore << "\n";
2785 std::cout << "mR.gBegin =" << mRtmp.gBegin << "\n";
2786 std::cout << "mR.gEnd =" << mRtmp.gEnd << "\n";
2787 std::cout << "mR.editDist =" << mRtmp.editDist << "\n";
2788
2789
2790
2791 if(-mLtmp.pairScore > (int) (options.errorRate * length(readSet[rseqNo])))
2792 {
2793 ::std::cout << "mLtmp.pairScore = " << mLtmp.pairScore;
2794 ::std::cout << "\treadLen = " << length(readSet[rseqNo]);
2795 ::std::cout << "\trseqNo = " << rseqNo << ::std::endl;
2796 }
2797 std::cout << "mScoreSum =" << mLtmp.mScore + mRtmp.mScore << "\t";
2798 std::cout << "readLength =" << length(readSet[rseqNo]) << "\t";
2799 std::cout << "mL.mScore =" << mLtmp.mScore << "\t";
2800 std::cout << "mL.gBegin =" << mLtmp.gBegin << "\t";
2801 std::cout << "mL.gEnd =" << mLtmp.gEnd << "\t";
2802 std::cout << "mL.editDist =" << mLtmp.editDist << "\n";
2803
2804 std::cout << "mR.mScore =" << mRtmp.mScore << "\t";
2805 std::cout << "mR.gBegin =" << mRtmp.gBegin << "\t";
2806 std::cout << "mR.gEnd =" << mRtmp.gEnd << "\t";
2807 std::cout << "mR.editDist =" << mRtmp.editDist << "\n";
2808
2809 Align<String<Dna5Q>, ArrayGaps> alignL;
2810 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
2811 if (options.hammingOnly)
2812 scoreType.data_mismatch = -1;
2813 resize(rows(alignL), 2);
2814 assignSource(row(alignL, 0), infix(readSet[rseqNo],0,mLtmp.mScore));
2815 assignSource(row(alignL, 1), infix(genome,mLtmp.gBegin, mLtmp.gEnd));
2816 if (orientation == 'R')
2817 reverseComplement(source(row(alignL, 1)));
2818
2819 globalAlignment(alignL, scoreType);
2820 std::cout << alignL;
2821
2822 Align<String<Dna5Q>, ArrayGaps> alignR;
2823 resize(rows(alignR), 2);
2824 assignSource(row(alignR, 0), infix(readSet[rseqNo],length(readSet[rseqNo])-mRtmp.mScore,length(readSet[rseqNo])));
2825 assignSource(row(alignR, 1), infix(genome,mRtmp.gBegin, mRtmp.gEnd));
2826 if (orientation == 'R')
2827 reverseComplement(source(row(alignR, 1)));
2828
2829 globalAlignment(alignR, scoreType);
2830 std::cout << alignR;
2831
2832 std::cout << "Komplettali:\n";
2833 Align<String<Dna5Q>, ArrayGaps> align;
2834 resize(rows(align), 2);
2835 assignSource(row(align, 0), infix(readSet[rseqNo],0,length(readSet[rseqNo])));
2836 assignSource(row(align, 1), infix(genome,_min(mRtmp.gBegin,mLtmp.gBegin), _max(mRtmp.gEnd,mLtmp.gEnd)));
2837 if (orientation == 'R')
2838 reverseComplement(source(row(align, 1)));
2839
2840 globalAlignment(align, scoreType);
2841 std::cout << align;
2842 }
2843 #endif
2844
2845 // relative positions
2846 mLtmp.mateDelta = outerDistance;
2847 mRtmp.mateDelta = -outerDistance;
2848
2849 mLtmp.rseqNo = mRtmp.rseqNo = rseqNo;
2850
2851 if (!empty(readRegions) && options.anchored && options.outputFormat != 4)
2852 {
2853 if(readRegions[rseqNo].i2.i1 > 1) // match actually is on reverse strand
2854 {
2855 temp = mLtmp;
2856 mLtmp = mRtmp;
2857 mLtmp = temp;
2858 mLtmp.orientation = mRtmp.orientation = 'R';
2859 }
2860 }
2861
2862 if (!options.spec.DONT_DUMP_RESULTS)
2863 {
2864 appendValue(matches, mLtmp, Generous());
2865 appendValue(matches, mRtmp, Generous());
2866 }
2867 }
2868 }
2869 }
2870 if (length(matches) > options.compactThresh)
2871 {
2872 typename Size<TMatches>::Type oldSize = length(matches);
2873 // maskDuplicates(matches); // overlapping parallelograms cause duplicates for edit distance //TODO: implement!
2874 compactSplicedMatches(matches, cnts, options, false, swiftPatternL, swiftPatternR);
2875 options.compactThresh += (options.compactThresh >> 1);
2876 if (options._debugLevel >= 2)
2877 ::std::cerr << '(' << oldSize - length(matches) << " matches removed)";
2878 }
2879
2880 // short-cut negative matches
2881 if (lastPositive == (int64_t)-1)
2882 lastPotMatchNo[rseqNo] = (int64_t)-1;
2883 else
2884 value(fifo, lastPositive - firstNo).i1 = (int64_t)-1; // the first positive's link to previous is removed
2885
2886
2887 }//swiftFinderR
2888 double totalTimeEnd = sysTime();
2889 if(options._debugLevel > 1)
2890 {
2891 std::cout << "TotalTime:"<< totalTimeEnd - totalTime << std::endl;
2892 std::cout << "TotalTimeLeftFilter:" <<totalTimeLeftFilter << std::endl;
2893 std::cout << "TotalTimeLeftVerify:"<< totalTimeLeftVerify << std::endl;
2894 std::cout << "TotalTimeRightVerify:"<< totalTimeRightVerify << std::endl;
2895 std::cout << "TotalTimeCombine:"<< totalTimeCombine << std::endl;
2896 }
2897
2898
2899 }//function
2900
2901
2902
2903
2904 //////////////////////////////////////////////////////////////////////////////
2905 // Find split read matches in many genome sequences (given as StringSet)
2906 template <
2907 typename TMatches,
2908 typename TGenomeSet,
2909 typename TReadSet,
2910 typename TCounts,
2911 typename TSpec,
2912 typename TShapeL,
2913 typename TShapeR,
2914 typename TSwiftSpec >
mapSplicedReads(TMatches & matches,TGenomeSet & genomeSet,TReadSet const & readSet,TReadRegions & readRegions,TCounts & cnts,RazerSOptions<TSpec> & options,TShapeL const & shapeL,TShapeR const & shapeR,Swift<TSwiftSpec> const)2915 int mapSplicedReads(
2916 TMatches & matches,
2917 TGenomeSet & genomeSet,
2918 TReadSet const & readSet,
2919 TReadRegions & readRegions,
2920 TCounts & cnts,
2921 RazerSOptions<TSpec> & options,
2922 TShapeL const & shapeL,
2923 TShapeR const & shapeR,
2924 Swift<TSwiftSpec> const)
2925 {
2926 typedef typename Value<TReadSet>::Type TRead;
2927 typedef Index<TReadSet, IndexQGram<TShapeL, TQGramIndexSpec> > TIndexL; // q-gram index
2928 typedef Index<TReadSet, IndexQGram<TShapeR, TQGramIndexSpec> > TIndexR; // q-gram index
2929 typedef Pattern<TIndexL, Swift<TSwiftSpec> > TSwiftPatternL; // filter
2930 typedef Pattern<TIndexR, Swift<TSwiftSpec> > TSwiftPatternR; // filter
2931 typedef Pattern<TRead, MyersUkkonen> TMyersPattern; // verifier
2932
2933 unsigned readCount = length(readSet);
2934
2935 // configure q-gram index
2936 TIndexL swiftIndexL(readSet, shapeL);
2937 cargo(swiftIndexL).abundanceCut = options.abundanceCut;
2938 cargo(swiftIndexL)._debugLevel = options._debugLevel;
2939
2940 TIndexR swiftIndexR(readSet, shapeR);
2941 cargo(swiftIndexR).abundanceCut = options.abundanceCut;
2942 cargo(swiftIndexR)._debugLevel = options._debugLevel;
2943
2944 // configure Swift
2945 TSwiftPatternL swiftPatternL(swiftIndexL);
2946 swiftPatternL.params.minThreshold = options.threshold;
2947 swiftPatternL.params.tabooLength = options.tabooLength;
2948
2949 TSwiftPatternR swiftPatternR(swiftIndexR);
2950 swiftPatternL.params.minThreshold = options.thresholdR;
2951 swiftPatternL.params.tabooLength = options.tabooLength;
2952
2953 // init edit distance verifiers
2954 String<TMyersPattern> forwardPatternsL;
2955 String<TMyersPattern> forwardPatternsR;
2956 options.compMask[4] = (options.matchN)? 15: 0;
2957 if (!options.hammingOnly)
2958 {
2959 resize(forwardPatternsL, readCount, Exact());
2960 resize(forwardPatternsR, readCount, Exact());
2961 for(unsigned i = 0; i < readCount; ++i)
2962 {
2963 setHost(forwardPatternsL[i], indexText(swiftIndexL)[i]);
2964 setHost(forwardPatternsR[i], indexText(swiftIndexR)[i]);
2965 _patternMatchNOfPattern(forwardPatternsL[i], options.matchN);
2966 _patternMatchNOfPattern(forwardPatternsR[i], options.matchN);
2967 _patternMatchNOfFinder(forwardPatternsL[i], options.matchN);
2968 _patternMatchNOfFinder(forwardPatternsR[i], options.matchN);
2969 }
2970 }
2971
2972 // clear stats
2973 options.FP = 0;
2974 options.TP = 0;
2975 options.timeMapReads = 0;
2976 options.timeDumpResults = 0;
2977
2978 CharString id;
2979
2980 // iterate over genome sequences
2981 SEQAN_PROTIMESTART(find_time);
2982 for(unsigned gseqNo = 0; gseqNo < length(genomeSet); ++gseqNo)
2983 {
2984 if (options.forward)
2985 mapSplicedReads(matches, genomeSet[gseqNo], gseqNo, readSet, readRegions, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'F', options);
2986
2987 if (options.reverse)
2988 {
2989 reverseComplement(genomeSet[gseqNo]);
2990 mapSplicedReads(matches, genomeSet[gseqNo], gseqNo, readSet, readRegions, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'R', options);
2991 reverseComplement(genomeSet[gseqNo]);
2992 }
2993
2994 }
2995 options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
2996
2997 if (options._debugLevel >= 1)
2998 ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl;
2999 if (options._debugLevel >= 2) {
3000 ::std::cerr << ::std::endl;
3001 ::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
3002 ::std::cerr << "Swift FP: " << options.FP << ::std::endl;
3003 ::std::cerr << "Swift TP: " << options.TP << ::std::endl;
3004 }
3005 return 0;
3006 }
3007
3008
3009 //////////////////////////////////////////////////////////////////////////////
3010 // Wrapper for different template specializations
3011 template <typename TMatches, typename TReadSet, typename TCounts, typename TSpec>
mapSplicedReads(TMatches & matches,StringSet<CharString> & genomeFileNameList,StringSet<CharString> & genomeNames,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReadSet & readSet,TReadRegions & readRegions,TCounts & cnts,RazerSOptions<TSpec> & options)3012 int mapSplicedReads(
3013 TMatches & matches,
3014 StringSet<CharString> & genomeFileNameList,
3015 StringSet<CharString> & genomeNames, // genome names, taken from the Fasta file
3016 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
3017 TReadSet & readSet,
3018 TReadRegions & readRegions,
3019 TCounts & cnts,
3020 RazerSOptions<TSpec> & options)
3021 {
3022
3023 Shape<Dna, SimpleShape> ungappedL;
3024 Shape<Dna, OneGappedShape> onegappedL;
3025 Shape<Dna, GenericShape> gappedL;
3026
3027 Shape<Dna, SimpleShape> ungappedR;
3028 Shape<Dna, OneGappedShape> onegappedR;
3029 Shape<Dna, GenericShape> gappedR;
3030
3031 // 2x3x3 SPECIALIZATION
3032
3033 if (options.hammingOnly)
3034 {
3035 // select best-fitting combination of shape
3036 if (stringToShape(ungappedL, options.shape))
3037 {
3038 if (stringToShape(ungappedR, options.shapeR))
3039 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3040 if (stringToShape(onegappedR, options.shapeR))
3041 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3042 if (stringToShape(gappedR, options.shapeR))
3043 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3044 }
3045
3046 if (stringToShape(onegappedL, options.shape))
3047 {
3048 if (stringToShape(ungappedR, options.shapeR))
3049 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3050 if (stringToShape(onegappedR, options.shapeR))
3051 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3052 if (stringToShape(gappedR, options.shapeR))
3053 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3054 }
3055
3056 if (stringToShape(gappedL, options.shape))
3057 {
3058 if (stringToShape(ungappedR, options.shapeR))
3059 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3060 if (stringToShape(onegappedR, options.shapeR))
3061 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3062 if (stringToShape(gappedR, options.shapeR))
3063 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3064 }
3065 }
3066 else
3067 {
3068 // select best-fitting combination of shape
3069 if (stringToShape(ungappedL, options.shape))
3070 {
3071 if (stringToShape(ungappedR, options.shapeR))
3072 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, ungappedR, Swift<SwiftSemiGlobal>());
3073 if (stringToShape(onegappedR, options.shapeR))
3074 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, onegappedR, Swift<SwiftSemiGlobal>());
3075 if (stringToShape(gappedR, options.shapeR))
3076 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, gappedR, Swift<SwiftSemiGlobal>());
3077 }
3078
3079 if (stringToShape(onegappedL, options.shape))
3080 {
3081 if (stringToShape(ungappedR, options.shapeR))
3082 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, ungappedR, Swift<SwiftSemiGlobal>());
3083 if (stringToShape(onegappedR, options.shapeR))
3084 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, onegappedR, Swift<SwiftSemiGlobal>());
3085 if (stringToShape(gappedR, options.shapeR))
3086 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, gappedR, Swift<SwiftSemiGlobal>());
3087 }
3088
3089 if (stringToShape(gappedL, options.shape))
3090 {
3091 if (stringToShape(ungappedR, options.shapeR))
3092 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, ungappedR, Swift<SwiftSemiGlobal>());
3093 if (stringToShape(onegappedR, options.shapeR))
3094 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, onegappedR, Swift<SwiftSemiGlobal>());
3095 if (stringToShape(gappedR, options.shapeR))
3096 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, gappedR, Swift<SwiftSemiGlobal>());
3097 }
3098 }
3099
3100 return RAZERS_INVALID_SHAPE;
3101 }
3102
3103
3104 template <typename TMatches, typename TGenomeSet, typename TReadSet, typename TCounts, typename TSpec>
mapSplicedReads(TMatches & matches,TGenomeSet & genomeSet,TReadSet & readSet,TReadRegions & readRegions,TCounts & cnts,RazerSOptions<TSpec> & options)3105 int mapSplicedReads(
3106 TMatches & matches,
3107 TGenomeSet & genomeSet,
3108 TReadSet & readSet,
3109 TReadRegions & readRegions,
3110 TCounts & cnts,
3111 RazerSOptions<TSpec> & options)
3112 {
3113
3114
3115 Shape<Dna, SimpleShape> ungappedL;
3116 Shape<Dna, OneGappedShape> onegappedL;
3117 Shape<Dna, GenericShape> gappedL;
3118
3119
3120 Shape<Dna, SimpleShape> ungappedR;
3121 Shape<Dna, OneGappedShape> onegappedR;
3122 Shape<Dna, GenericShape> gappedR;
3123
3124 // 2x3x3 SPECIALIZATION
3125
3126 if (options.hammingOnly)
3127 {
3128 // select best-fitting combination of shape
3129 if (stringToShape(ungappedL, options.shape))
3130 {
3131 if (stringToShape(ungappedR, options.shapeR))
3132 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3133 if (stringToShape(onegappedR, options.shapeR))
3134 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3135 if (stringToShape(gappedR, options.shapeR))
3136 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3137 }
3138
3139 if (stringToShape(onegappedL, options.shape))
3140 {
3141 if (stringToShape(ungappedR, options.shapeR))
3142 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3143 if (stringToShape(onegappedR, options.shapeR))
3144 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3145 if (stringToShape(gappedR, options.shapeR))
3146 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3147 }
3148
3149 if (stringToShape(gappedL, options.shape))
3150 {
3151 if (stringToShape(ungappedR, options.shapeR))
3152 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3153 if (stringToShape(onegappedR, options.shapeR))
3154 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3155 if (stringToShape(gappedR, options.shapeR))
3156 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3157 }
3158 }
3159 else
3160 {
3161 // select best-fitting combination of shape
3162 if (stringToShape(ungappedL, options.shape))
3163 {
3164 if (stringToShape(ungappedR, options.shapeR))
3165 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, ungappedR, Swift<SwiftSemiGlobal>());
3166 if (stringToShape(onegappedR, options.shapeR))
3167 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, onegappedR, Swift<SwiftSemiGlobal>());
3168 if (stringToShape(gappedR, options.shapeR))
3169 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, gappedR, Swift<SwiftSemiGlobal>());
3170 }
3171
3172 if (stringToShape(onegappedL, options.shape))
3173 {
3174 if (stringToShape(ungappedR, options.shapeR))
3175 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, ungappedR, Swift<SwiftSemiGlobal>());
3176 if (stringToShape(onegappedR, options.shapeR))
3177 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, onegappedR, Swift<SwiftSemiGlobal>());
3178 if (stringToShape(gappedR, options.shapeR))
3179 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, gappedR, Swift<SwiftSemiGlobal>());
3180 }
3181
3182 if (stringToShape(gappedL, options.shape))
3183 {
3184 if (stringToShape(ungappedR, options.shapeR))
3185 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, ungappedR, Swift<SwiftSemiGlobal>());
3186 if (stringToShape(onegappedR, options.shapeR))
3187 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, onegappedR, Swift<SwiftSemiGlobal>());
3188 if (stringToShape(gappedR, options.shapeR))
3189 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, gappedR, Swift<SwiftSemiGlobal>());
3190 }
3191 }
3192
3193 return RAZERS_INVALID_SHAPE;
3194 }
3195
3196
3197
3198 }//namespace
3199
3200 #endif
3201