1 /*==========================================================================
2 RazerS Spliced - Fast Split Read Mapping
3
4 ============================================================================
5 Copyright (C) 2008 by Anne-Katrin Emde
6
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your options) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 ==========================================================================*/
20
21 #ifndef SEQAN_HEADER_RAZERS_SPLICED_H
22 #define SEQAN_HEADER_RAZERS_SPLICED_H
23
24 //#define RAZERS_DEBUG
25 //#define RAZERS_DEBUG_LIGHT
26
27 #include <seqan/align.h>
28 #include <seqan/graph_types.h>
29 #include <seqan/graph_algorithms.h>
30 #include <seqan/graph_align.h>
31 #include <seqan/seeds.h>
32
33 namespace seqan
34 {
35
36
37
38 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
39 // Match statistics stuff
40 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
41
42 //statistics, expected number of matches
choose(int n,int k)43 double choose(int n, int k)
44 {
45 double cnk = 1.0;
46
47 if (k * 2 > n)
48 k = n - k;
49
50 for (int i = 1; i <= k; n--, i++)
51 {
52 cnk /= i;
53 cnk *= n;
54 }
55 return cnk;
56 }
57
58
59 //#R - read length
60 //#T - total sequence length
61 //#I - length of insertion
62 //#D - length of deletion
63 //#M1 - minimal prefix match length
64 //#M2 - minimal suffix match length
65
66
67 //## first function denotes the random match to an iid sequence
68 template<typename TSize, typename TValue>
69 double
_probability(TSize R,TSize M1,TSize M2,TValue d,TValue d_m1,TValue d_m2)70 _probability(TSize R, TSize M1, TSize M2, TValue d, TValue d_m1, TValue d_m2)
71 {
72 double sum_prob = 0;
73 for (unsigned i1 = 0; i1 < d_m1; ++i1)
74 {
75 for (unsigned i2 = 0; i2 < d_m2; ++i2)
76 {
77 for (unsigned j = 0; j < d-i1-i2; ++j)
78 {
79 sum_prob += (double)(choose(M1,i1)* pow((double)0.25,(double)R-i1-i2-j) * choose(M2,i2) * pow((double)0.75,(double)i1+i2+j) * choose(R-M1-M2,j));
80 }
81 }
82 }
83 return sum_prob;
84 }
85
86
87 //#InsCuts describes the possible number of cuts
88 //#for a read of length R and an insertion of length I
89 template<typename TSize>
90 TSize
_insCuts(TSize R,TSize M1,TSize M2,TSize I)91 _insCuts(TSize R, TSize M1, TSize M2, TSize I){
92 return (R-I-M1-M2+1);
93 }
94
95
96
97 //#function for calculating expected matches with I insertions
98 template<typename TSize, typename TValue>
99 double
_expMatchesIns(TSize R,TSize M1,TSize M2,TSize I,TValue d,TValue d_m1,TValue d_m2,TSize S,TSize T)100 _expMatchesIns(TSize R, TSize M1, TSize M2, TSize I, TValue d,TValue d_m1,TValue d_m2, TSize S, TSize T)
101 {
102 return((double)T*((S-R-I-1)*_probability((R-I),M1,M2,d,d_m1,d_m2))*_insCuts(R,M1,M2,I));
103 }
104
105
106 //#DelCuts describes the possible number of cuts
107 //#for a read of length R in case of a deletion
108 template<typename TSize>
109 TSize
_delCuts(TSize R,TSize M1,TSize M2)110 _delCuts(TSize R, TSize M1, TSize M2)
111 {
112 return (R-M1-M2+1);
113 }
114
115
116 //#Del describes the possible number of configurations
117 //#for a string of length R
118 template<typename TSize>
119 TSize
_del(TSize R,TSize S)120 _del(TSize R, TSize S)
121 {
122 return((S-R)*(S-R+1)/2);
123 }
124
125
126 //#function for calculating expected matches with deletion of size D
127 template<typename TSize, typename TValue>
128 double
_expMatchesDel(TSize R,TSize M1,TSize M2,TValue d,TValue d_m1,TValue d_m2,TSize S,TSize T)129 _expMatchesDel(TSize R, TSize M1, TSize M2, TValue d, TValue d_m1, TValue d_m2, TSize S, TSize T)
130 {
131 return ((double)T*(_probability(R,M1,M2,d,d_m1,d_m2))*_delCuts(R,M1,M2)*_del(R,S));
132 }
133
134 template<typename TReadSet, typename TSize, typename TOptions>
135 void
expNumRandomMatches(TReadSet & readSet,TSize genomeLen,TOptions & options)136 expNumRandomMatches(TReadSet &readSet, TSize genomeLen, TOptions & options)
137 {
138 TSize M1 = options.minMatchLen;
139 TSize M2 = options.minMatchLen;
140 TSize d_m1 = (int) M1 * options.errorRate;
141 TSize d_m2 = (int) M2 * options.errorRate;
142 TSize numReads = length(readSet);
143 TSize readLen = (numReads > 0) ? length(readSet[0]) : 0;
144 TSize numErrors = (int) readLen * options.errorRate;
145
146 //expected number of random deletion matches:
147 double delMatches = _expMatchesDel(readLen,M1,M2,numErrors,d_m1, d_m2, genomeLen, numReads);
148
149 //expected number of random deletion matches:
150 double insMatches = 0;
151 for(TSize insLen = 1; insLen <=readLen-M1-M2; ++insLen)
152 insMatches += _expMatchesIns(readLen,M1,M2,numErrors,insLen,d_m1, d_m2, genomeLen, numReads);
153
154 ::std::cout << "Expected number of random deletion-indicating matches: " << delMatches << std::endl;
155 ::std::cout << "Expected number of random insertion-indicating matches: " << insMatches << std::endl;
156 }
157
158
159 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
160
161 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
162
163
164 // SPLIT MAPPING
165 // We build two q-gram indices, one for the prefix, the other for the suffix of the read
166
167 struct LongestPrefix{};
168 struct LongestSuffix{};
169
170 struct OrientationReverse{};
171 struct OrientationForward{};
172
173
174
175
176 //////////////////////////////////////////////////////////////////////////////
177 // Remove low quality matches # necessary to have an own splicedmatch function?
178 // planned specs: SpliceSite, General, ...
179 template < typename TMatches, typename TCounts, typename TSpec, typename TSwiftL, typename TSwiftR >
compactSplicedMatches(TMatches & matches,TCounts &,RazerSOptions<TSpec> & options,bool compactFinal,TSwiftL & swiftL,TSwiftR & swiftR)180 void compactSplicedMatches(TMatches &matches,
181 TCounts & /*cnts*/,
182 RazerSOptions<TSpec> &options,
183 bool compactFinal,
184 TSwiftL &swiftL,
185 TSwiftR &swiftR)
186 {
187 typedef typename Value<TMatches>::Type TMatch;
188 typedef typename Iterator<TMatches, Standard>::Type TIterator;
189
190 unsigned readNo = -1;
191 unsigned hitCount = 0;
192 unsigned hitCountCutOff = options.maxHits;
193 int scoreDistCutOff = std::numeric_limits<int>::min();
194
195 TIterator it = begin(matches, Standard());
196 TIterator itEnd = end(matches, Standard());
197 TIterator dit = it;
198 TIterator ditBeg = it;
199
200 // sort
201 ::std::sort(it, itEnd, LessSplicedErrors<TMatch>());
202 // ::std::sort(it, itEnd, LessSplicedScore<TMatch>());
203 int counter = 0;
204 for (; it != itEnd; ++it)
205 {
206 ++counter;
207 if ((*it).orientation == '-') { ++it; continue; }
208 if (readNo == (*it).rseqNo)
209 {
210 if ((*it).pairScore <= scoreDistCutOff)
211 {
212 ++it;
213 continue;
214 }
215 if (++hitCount >= hitCountCutOff)
216 {
217 #ifdef RAZERS_MASK_READS
218 if (hitCount == hitCountCutOff)
219 {
220 // we have enough, now look for better matches
221 int maxErrors = - (*it).pairScore - 1;
222 if (options.purgeAmbiguous && (options.distanceRange == 0 || maxErrors <= (int) options.distanceRange))
223 maxErrors = -1;
224
225 setMaxErrors(swiftL, readNo, maxErrors);
226 setMaxErrors(swiftR, readNo, maxErrors);
227
228 if (maxErrors == -1 && options._debugLevel >= 2)
229 ::std::cerr << "(read #" << readNo << " disabled)";
230
231 if(options.purgeAmbiguous)
232 {
233 if (options.distanceRange == 0 || -(*it).pairScore < (int) options.distanceRange || compactFinal)
234 dit = ditBeg;
235 else {
236 *dit = *it; ++dit; ++it;
237 *dit = *it; ++dit;
238 }
239 }
240
241 }
242 #endif
243 ++it;
244 continue;
245 }
246 }
247 else
248 {
249 readNo = (*it).rseqNo;// >> 1;
250 hitCount = 0;
251 if (options.distanceRange > 0)
252 scoreDistCutOff = (*it).pairScore - options.distanceRange;
253 ditBeg = dit;
254 }
255 *dit = *it; ++dit; ++it;
256 *dit = *it; ++dit;
257 }
258 resize(matches, dit - begin(matches, Standard()));
259 }
260
261
262
263 //////////////////////////////////////////////////////////////////////////////
264 // Remove low quality matches # necessary to have an own splicedmatch function?
265 template < typename TMatches, typename TCounts, typename TSpec, typename TSwiftL, typename TSwiftR >
compactSplicedMatchesPurgeAmbiguous(TMatches & matches,TCounts &,RazerSOptions<TSpec> & options,TSwiftL &,TSwiftR &)266 void compactSplicedMatchesPurgeAmbiguous(TMatches &matches, TCounts & /*cnts*/, RazerSOptions<TSpec> &options, TSwiftL &, TSwiftR &)
267 {
268 typedef typename Value<TMatches>::Type TMatch;
269 typedef typename Iterator<TMatches, Standard>::Type TIterator;
270
271 unsigned readNo = -1;
272 unsigned hitCount = 0;
273 unsigned hitCountCutOff = options.maxHits;
274 int scoreDistCutOff = std::numeric_limits<int>::min();
275
276 TIterator it = begin(matches, Standard());
277 TIterator itEnd = end(matches, Standard());
278 TIterator dit = it;
279 TIterator ditBeg = it;
280
281 // sort
282 ::std::sort(it, itEnd, LessSplicedErrors<TMatch>());
283 // ::std::sort(it, itEnd, LessSplicedScore<TMatch>());
284 int counter = 0;
285 for (; it != itEnd; ++it)
286 {
287 ++counter;
288 if ((*it).orientation == '-') continue;
289 if (readNo == (*it).rseqNo)
290 {
291 if ((*it).pairScore <= scoreDistCutOff)
292 {
293 ++it;
294 continue;
295 }
296 if (++hitCount >= hitCountCutOff)
297 {
298 if (hitCount == hitCountCutOff)
299 dit = ditBeg;
300 ++it;
301 continue;
302 }
303 }
304 else
305 {
306 readNo = (*it).rseqNo;
307 hitCount = 0;
308 if (options.distanceRange > 0)
309 scoreDistCutOff = (*it).pairScore - options.distanceRange;
310 ditBeg = dit;
311 }
312 *dit = *it; ++dit; ++it;
313 *dit = *it; ++dit;
314 }
315 resize(matches, dit - begin(matches, Standard()));
316 }
317
318
319 template <typename TReadMatch>
320 struct LessReadNoPairErrors : public ::std::binary_function < TReadMatch, TReadMatch, bool >
321 {
operatorLessReadNoPairErrors322 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
323 {
324 // read number
325 if (a.rseqNo < b.rseqNo) return true;
326 if (a.rseqNo > b.rseqNo) return false;
327
328 // quality
329 if (a.pairScore > b.pairScore) return true;
330 if (a.pairScore < b.pairScore) return false;
331 if (a.pairId < b.pairId) return true;
332 if (a.pairId > b.pairId) return false;
333 return a.editDist < b.editDist;
334 }
335 };
336
337
338
339 //////////////////////////////////////////////////////////////////////////////
340 // Count matches for each number of errors
341 template < typename TMatches, typename TCounts >
countSplitMatches(TMatches & matches,TCounts & cnt)342 void countSplitMatches(TMatches &matches, TCounts &cnt)
343 {
344 typedef typename Value<TMatches>::Type TMatch;
345 typedef typename Iterator<TMatches, Standard>::Type TIterator;
346 typedef typename Value<TCounts>::Type TRow;
347 typedef typename Value<TRow>::Type TValue;
348
349
350 TIterator it = begin(matches, Standard());
351 TIterator itEnd = end(matches, Standard());
352
353 ::std::sort(it, itEnd, LessReadNoPairErrors<TMatch>());
354
355 unsigned readNo = -1;
356 short editDist = -1;
357 int64_t count = 0;
358 int64_t maxVal = std::numeric_limits<TValue>::max();
359
360 for (; it != itEnd; ++it)
361 {
362 if ((*it).orientation == '-') continue;
363 if (readNo == (*it).rseqNo &&
364 -editDist == (*it).pairScore)
365 ++count;
366 else
367 {
368 if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt))
369 cnt[editDist][readNo] = (maxVal < count)? maxVal : count;
370 readNo = (*it).rseqNo;
371 editDist = -(*it).pairScore;
372 count = 1;
373 }
374 }
375 if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt))
376 cnt[editDist][readNo] = count;
377 }
378
379
380 template<typename TAlign, typename TPosition>
381 int
countErrorsInAlign(TAlign & align,TPosition end_)382 countErrorsInAlign(TAlign & align, TPosition end_)
383 {
384
385 typedef typename Source<TAlign>::Type TSource;
386 typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
387
388 typedef typename Row<TAlign>::Type TRow;
389 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
390
391 TAlignIterator ali_it0 = iter(row(align,0),0);
392 TAlignIterator ali_it1 = iter(row(align,1),0);
393 TAlignIterator ali_it0_stop = iter(row(align,0),end_);
394 TAlignIterator ali_it1_stop = iter(row(align,1),end_);
395
396
397 int errors = 0;
398 while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
399 {
400 while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
401 {
402 if(*ali_it1 != *ali_it0)
403 ++errors;
404 ++ali_it0;
405 ++ali_it1;
406 }
407 while(ali_it0!=ali_it0_stop && isGap(ali_it0))
408 {
409 ++ali_it0;
410 ++ali_it1;
411 ++errors;
412 }
413 while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
414 {
415 ++ali_it0;
416 ++ali_it1;
417 ++errors;
418 }
419 }
420 while(ali_it0!=ali_it0_stop)
421 {
422 ++ali_it0;
423 ++errors;
424 }
425 while(ali_it1 != ali_it1_stop)
426 {
427 ++ali_it1;
428 ++errors;
429 }
430
431 return errors;
432 }
433
434
435
436 //////////////////////////////////////////////////////////////////////////////
437 // Edit distance verification for longest suffix/prefix
438 template <
439 typename TMatch,
440 typename TGenome,
441 typename TReadSet,
442 typename TMyersPatterns,
443 typename TSpec,
444 typename TSufPrefSpec
445 >
446 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> inf,unsigned rseqNo,TReadSet & readSet,TMyersPatterns & forwardPatterns,RazerSOptions<TSpec> const & options,SwiftSemiGlobal,TSufPrefSpec)447 matchVerify(
448 TMatch &m, // resulting match
449 Segment<TGenome, InfixSegment> inf, // potential match genome region
450 unsigned rseqNo, // read number
451 TReadSet &readSet, // reads
452 TMyersPatterns &forwardPatterns, // MyersBitVector preprocessing data
453 RazerSOptions<TSpec> const &options,// RazerS options
454 SwiftSemiGlobal, // Edit distance
455 TSufPrefSpec) // Swift specialization
456 {
457 typedef Segment<TGenome, InfixSegment> TGenomeInfix;
458 typedef typename Value<TReadSet>::Type TRead;
459
460 // find read match end
461 typedef Finder<TGenomeInfix> TMyersFinder;
462 typedef typename Value<TMyersPatterns>::Type TMyersPattern;
463
464 // find read match begin
465 typedef ModifiedString<TGenomeInfix, ModReverse> TGenomeInfixRev;
466 typedef ModifiedString<TRead, ModReverse> TReadRev;
467 typedef Finder<TGenomeInfixRev> TMyersFinderRev;
468 typedef Pattern<TReadRev, MyersUkkonenGlobal> TMyersPatternRev;
469
470 TMyersFinder myersFinder(inf);
471 TMyersPattern &myersPattern = forwardPatterns[rseqNo]; //have to make sure this only contains the prefix
472
473 #ifdef RAZERS_DEBUG
474 ::std::cout << "Verify: " << ::std::endl;
475 ::std::cout << "Genome: " << inf << "\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
476 ::std::cout << "Read: " << readSet[rseqNo] << ::std::endl;
477 #endif
478
479 unsigned ndlLength = _min(sequenceLength(rseqNo, readSet),options.minMatchLen);
480 int maxScore = std::numeric_limits<int>::min();
481 int minScore = -(int)(ndlLength * options.errorRate);
482 TMyersFinder maxPos;
483
484 // find end of best semi-global alignment
485 while (find(myersFinder, myersPattern, minScore))
486 {
487 if (maxScore <= getScore(myersPattern))
488 {
489 maxScore = getScore(myersPattern);
490 maxPos = myersFinder;
491 }
492 }
493
494
495 if (maxScore < minScore)
496 return false;
497
498 m.editDist = (unsigned)-maxScore;
499 TGenomeInfix oriInf = inf;
500 setEndPosition(inf, m.gEnd = (beginPosition(inf) + position(maxPos) + 1));
501
502 // limit the beginning to needle length plus errors (== -maxScore)
503 if (length(inf) > ndlLength - maxScore)
504 setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore);
505
506 // find beginning of best semi-global alignment
507 TGenomeInfixRev infRev(inf);
508 TMyersFinderRev myersFinderRev(infRev);
509 TReadRev readRev;
510 TRead readInf;
511 if(IsSameType<TSufPrefSpec,LongestSuffix>::VALUE)
512 readInf = infix(readSet[rseqNo],length(readSet[rseqNo])-options.minMatchLen,length(readSet[rseqNo]));
513 else
514 readInf = infix(readSet[rseqNo],0,options.minMatchLen);
515 setHost(readRev, readInf);
516
517 TMyersPatternRev myersPatternRev(readRev);
518
519 _patternMatchNOfPattern(myersPatternRev, options.matchN);
520 _patternMatchNOfFinder(myersPatternRev, options.matchN);
521 while (find(myersFinderRev, myersPatternRev, maxScore))
522 m.gBegin = m.gEnd - (position(myersFinderRev) + 1);
523
524 m.mScore = ndlLength;
525 m.seedEditDist = m.editDist;
526 m.gSeedLen = m.gEnd - m.gBegin;
527
528 #ifdef RAZERS_DEBUG
529 ::std::cout << " before extendMatch " << ::std::endl;
530 ::std::cout << " match: " << ::std::endl;
531 ::std::cout << " mScore= " <<m.mScore << ::std::endl;
532 ::std::cout << " gBegin= " <<m.gBegin << ::std::endl;
533 ::std::cout << " gEnd= " <<m.gEnd << ::std::endl;
534 ::std::cout << " edit= " <<m.editDist << ::std::endl;
535 #endif
536
537 //TODO: give only part of read to extension!!!
538
539 //now extend the seed
540 extendMatch(readSet,rseqNo,oriInf,m,options,TSufPrefSpec());
541
542 #ifdef RAZERS_DEBUG
543 ::std::cout << " match: " << ::std::endl;
544 ::std::cout << " mScore= " <<m.mScore << ::std::endl;
545 ::std::cout << " gBegin= " <<m.gBegin << ::std::endl;
546 ::std::cout << " gEnd= " <<m.gEnd << ::std::endl;
547 ::std::cout << " edit= " <<m.editDist << ::std::endl;
548 #endif
549
550 return true;
551 }
552
553 template<typename TReadSet, typename TSize, typename TInf, typename TMatch, typename TOptions>
554 void
extendMatch(TReadSet & readSet,TSize rseqNo,TInf & inf,TMatch & m,TOptions & options,LongestSuffix)555 extendMatch(TReadSet &readSet, TSize rseqNo, TInf & inf, TMatch &m, TOptions &options, LongestSuffix)
556 {
557 #ifdef RAZERS_DEBUG
558 ::std::cout << " extending match left" << ::std::endl;
559 #endif
560
561 unsigned lDim0 = _max(0,(int)length(readSet[rseqNo])-(int)options.minMatchLen);
562 unsigned lDim1 = m.gBegin - beginPosition(inf);
563 unsigned rDim0 = length(readSet[rseqNo])-1;
564 unsigned rDim1 = m.gEnd - beginPosition(inf)-1;
565 Seed<int,SimpleSeed> seed(lDim0, lDim1, rDim0, rDim1);
566 Score<int> scoreMatrix(0,-1,-1,-1);
567 int scoreDropOff = (sequenceLength(rseqNo,readSet) * options.errorRate) - m.editDist;
568
569 #ifdef RAZERS_DEBUG
570 ::std::cout << "beginPos = " << beginPosition(inf) << std::endl;
571 ::std::cout << "endPos = " << endPosition(inf) << std::endl;
572 ::std::cout << " lDim0: " << lDim0 << ::std::endl;
573 ::std::cout << " lDim1: " << lDim1 << ::std::endl;
574 ::std::cout << " rDim0: " << rDim0 << ::std::endl;
575 ::std::cout << " rDim1: " << rDim1 << ::std::endl;
576 ::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
577 ::std::cout << " readInf: "<< infix(readSet[rseqNo],lDim0,rDim0+1) << ::std::endl;
578 ::std::cout << " gInfInf: "<< infix(inf,lDim1,rDim1+1) << ::std::endl;
579 ::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
580 ::std::cout << " gInf: "<< inf << ::std::endl;
581 #endif
582
583 int extScore = 0;
584 extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, readSet[rseqNo],inf,0,GappedXDrop());
585 m.gBegin = leftDim1(seed) + beginPosition(inf);
586 m.mScore = rightDim0(seed) - leftDim0(seed) + 1;
587 m.editDist -= extScore;
588
589 #ifdef RAZERS_DEBUG
590 ::std::cout << " lDim0: " << leftDim0(seed) << ::std::endl;
591 ::std::cout << " lDim1: " << leftDim1(seed) << ::std::endl;
592 ::std::cout << " rDim0: " << rightDim0(seed) << ::std::endl;
593 ::std::cout << " rDim1: " << rightDim1(seed) << ::std::endl;
594 ::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
595 ::std::cout << " readInf: "<< infix(readSet[rseqNo],leftDim0(seed),rightDim0(seed)+1) << ::std::endl;
596 ::std::cout << " gInfInf: "<< infix(inf,leftDim1(seed),rightDim1(seed)+1) << ::std::endl;
597 ::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
598 ::std::cout << " gInf: "<< inf << ::std::endl;
599 #endif
600 }
601
602 template<typename TReadSet, typename TSize, typename TInf, typename TMatch, typename TOptions>
603 void
extendMatch(TReadSet & readSet,TSize rseqNo,TInf & inf,TMatch & m,TOptions & options,LongestPrefix)604 extendMatch(TReadSet &readSet, TSize rseqNo, TInf & inf, TMatch &m, TOptions &options, LongestPrefix)
605 {
606 #ifdef RAZERS_DEBUG
607 ::std::cout << " extending match right" << ::std::endl;
608 #endif
609
610 unsigned lDim0 = 0;
611 unsigned lDim1 = m.gBegin - beginPosition(inf);
612 unsigned rDim0 = _min(options.minMatchLen,length(readSet[rseqNo])) - 1;
613 unsigned rDim1 = m.gEnd - beginPosition(inf) - 1;
614 Seed<int,SimpleSeed> seed(lDim0, lDim1, rDim0, rDim1);
615 Score<int> scoreMatrix(0,-1,-1,-1);
616 int scoreDropOff = (sequenceLength(rseqNo,readSet) * options.errorRate) - m.editDist;
617
618 #ifdef RAZERS_DEBUG
619 ::std::cout << "beginPos = " << beginPosition(inf) << std::endl;
620 ::std::cout << "endPos = " << endPosition(inf) << std::endl;
621 ::std::cout << " lDim0: " << lDim0 << ::std::endl;
622 ::std::cout << " lDim1: " << lDim1 << ::std::endl;
623 ::std::cout << " rDim0: " << rDim0 << ::std::endl;
624 ::std::cout << " rDim1: " << rDim1 << ::std::endl;
625 ::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
626 ::std::cout << " readInf: "<< infix(readSet[rseqNo],lDim0,rDim0+1) << ::std::endl;
627 ::std::cout << " gInfInf: "<< infix(inf,lDim1,rDim1+1) << ::std::endl;
628 ::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
629 ::std::cout << " gInf: "<< inf << ::std::endl;
630 #endif
631
632 int extScore = 0;
633 extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, readSet[rseqNo],inf,1,GappedXDrop());
634 m.gEnd = rightDim1(seed) + 1 + beginPosition(inf);
635 m.mScore = rightDim0(seed) - leftDim0(seed) + 1;
636 m.editDist -= extScore;
637
638 #ifdef RAZERS_DEBUG
639 ::std::cout << " lDim0: " << leftDim0(seed) << ::std::endl;
640 ::std::cout << " lDim1: " << leftDim1(seed) << ::std::endl;
641 ::std::cout << " rDim0: " << rightDim0(seed) << ::std::endl;
642 ::std::cout << " rDim1: " << rightDim1(seed) << ::std::endl;
643 ::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
644 ::std::cout << " readInf: "<< infix(readSet[rseqNo],leftDim0(seed),rightDim0(seed)+1) << ::std::endl;
645 ::std::cout << " gInfInf: "<< infix(inf,leftDim1(seed),rightDim1(seed)+1) << ::std::endl;
646 ::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
647 ::std::cout << " gInf: "<< inf << ::std::endl;
648 #endif
649 }
650
651
652 //template <
653 // typename TMatch,
654 // typename TGenomeSegment,
655 // typename TReadSet,
656 // typename TDummy,
657 // typename TSpec >
658 //inline bool
659 //matchVerify(
660 // TMatch &m, // resulting match
661 // TGenomeSegment inf, // potential match genome region
662 // unsigned rseqNo, // read number
663 // TReadSet &readSet, // reads
664 // TDummy&,
665 // RazerSOptions<TSpec> const &options, // RazerS options
666 // LongestSuffix) // LongestEditPrefix within errorRate
667 //{
668 // ::std::cout << "vcejsfhkjdfhksh11111\n";
669 // if( matchVerify(m,inf,rseqNo,
670 // readSet, false,
671 // options,LongestHammingSuffix()))
672 // {
673 // // unsigned tmp = m.gBegin;
674 // // m.gBegin = m.gEnd;
675 // // m.gEnd = tmp;
676 // return true;
677 // }
678 // else return false;
679 //
680 //}
681 //
682 //
683 //template <
684 // typename TMatch,
685 // typename TGenomeSegment,
686 // typename TReadSet,
687 // typename TDummy,
688 // typename TSpec >
689 //inline bool
690 //matchVerify(
691 // TMatch &m, // resulting match
692 // TGenomeSegment inf, // potential match genome region
693 // unsigned rseqNo, // read number
694 // TReadSet &readSet, // reads
695 // TDummy & ,
696 // RazerSOptions<TSpec> const &options, // RazerS options
697 // LongestPrefix) // LongestEditPrefix within errorRate
698 //{
699 // ::std::cout << "vcejsfhkjdfhksh222222\n";
700 // return matchVerify(m,inf,rseqNo,
701 // readSet, true,
702 // options,LongestHammingPrefix());
703 //}
704
705
706 template <
707 typename TMatch,
708 typename TGenome,
709 typename TReadSet,
710 typename TPattern,
711 typename TSpec >
712 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> genomeInf,unsigned rseqNo,TReadSet & readSet,TPattern &,RazerSOptions<TSpec> const & options,SwiftSemiGlobalHamming,LongestPrefix)713 matchVerify(
714 TMatch &m, // resulting match
715 Segment<TGenome,InfixSegment> genomeInf, // potential match genome region
716 unsigned rseqNo, // read number
717 TReadSet& readSet, // original readSet
718 TPattern&,
719 RazerSOptions<TSpec> const &options, // RazerS options
720 SwiftSemiGlobalHamming, // HammingDistance
721 LongestPrefix) // LongestPrefix
722 {
723
724 typedef Segment<TGenome, InfixSegment> TGenomeInfix;
725 typedef typename Size<TGenomeInfix>::Type TSize;
726 typedef typename Value<TGenomeInfix>::Type TDna;
727 typedef typename Position<TGenomeInfix>::Type TPosition;
728 typedef typename Value<TReadSet>::Type TRead;
729 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator;
730 typedef typename Infix<TRead>::Type TReadInf;
731 typedef typename Iterator<TReadInf, Standard>::Type TReadIterator;
732
733 if (length(genomeInf) < options.minMatchLen) return false;
734 TReadInf read = infix(readSet[rseqNo],0,length(readSet[rseqNo])-options.minMatchLen);
735
736
737 TReadIterator ritBeg = begin(read, Standard());
738 TReadIterator ritEnd = end(read, Standard());
739 TGenomeIterator git = begin(genomeInf, Standard());
740 TGenomeIterator gitEnd = end(genomeInf, Standard()) - (length(read) - 1);
741
742 // this is max number of errors the seed should have
743 unsigned maxErrorsSeed = (unsigned)(options.minMatchLen * options.errorRate);
744 unsigned maxTotalErrors = (unsigned)(length(read) * options.errorRate);
745 unsigned minSeedErrors = maxErrorsSeed + 1;
746 unsigned minTotalErrors = maxTotalErrors + 1;
747 unsigned bestHitLength = 0;
748
749 for (; git < gitEnd; ++git)
750 {
751 bool hit = true;
752 unsigned hitLength = 0;
753 unsigned count = 0;
754 unsigned seedErrors = 0;
755 unsigned totalErrors = 0;
756 TGenomeIterator g = git;
757 for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
758 {
759 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
760 {
761 if (count < options.minMatchLen)
762 {
763 ++totalErrors;
764 if(++seedErrors > maxErrorsSeed)
765 {
766 hit = false;
767 break;
768 }
769 }
770 else
771 {
772 if(++totalErrors > maxTotalErrors)
773 {
774 // we are excluding this last error position
775 --totalErrors;
776 break;
777 }
778 }
779 }
780 ++count;
781 }
782 if (hit) hitLength = count;
783 if (hitLength > bestHitLength ) //simply take the longest hit
784 {
785 minSeedErrors = seedErrors;
786 minTotalErrors = totalErrors;
787 bestHitLength = hitLength;
788 m.gBegin = git - begin(host(genomeInf), Standard());
789 }
790 }
791
792
793
794 if(bestHitLength < options.minMatchLen)
795 return false;
796
797 m.gEnd = m.gBegin + bestHitLength;
798 m.mScore = bestHitLength;
799 m.editDist = minTotalErrors;
800
801 #ifdef RAZERS_DEBUG
802 std::cout << "m.gBeg =" << m.gBegin << "\n";
803 std::cout << "m.gEnd =" << m.gEnd << "\n";
804 std::cout << "m.edit =" << m.editDist << "\n";
805 std::cout << "m.mScore=" << m.mScore << "\n\n";
806 #endif
807
808 return true;
809
810 }
811
812
813 template <
814 typename TMatch,
815 typename TGenome,
816 typename TReadSet,
817 typename TPattern,
818 typename TSpec >
819 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> genomeInf,unsigned rseqNo,TReadSet & readSet,TPattern &,RazerSOptions<TSpec> const & options,SwiftSemiGlobalHamming,LongestSuffix)820 matchVerify(
821 TMatch &m, // resulting match
822 Segment<TGenome,InfixSegment> genomeInf, // potential match genome region
823 unsigned rseqNo, // read number
824 TReadSet & readSet, // original readSet
825 TPattern&,
826 RazerSOptions<TSpec> const &options, // RazerS options
827 SwiftSemiGlobalHamming,
828 LongestSuffix) // LongestSuffix
829 {
830
831 typedef Segment<TGenome, InfixSegment> TGenomeInfix;
832 typedef typename Size<TGenomeInfix>::Type TSize;
833 typedef typename Value<TGenomeInfix>::Type TDna;
834 typedef typename Position<TGenomeInfix>::Type TPosition;
835 typedef typename Value<TReadSet>::Type TRead;
836 typedef typename Iterator<TGenomeInfix, Standard>::Type TGenomeIterator;
837 typedef typename Infix<TRead>::Type TReadInf;
838 typedef typename Iterator<TReadInf, Standard>::Type TReadIterator;
839
840 if (length(genomeInf) < options.minMatchLen) return false;
841 bool debug = false;
842 TRead read = infix(readSet[rseqNo],options.minMatchLen,length(readSet[rseqNo]));
843
844
845 if(debug)
846 {
847 ::std::cout<< "suffixmatching\n";
848 ::std::cout << "genome=" << genomeInf << "\nread =" << read <<"\n";
849 }
850
851 TReadIterator ritEnd = end(read, Standard())-1;
852 TReadIterator ritBeg = begin(read, Standard());
853 TGenomeIterator git = end(genomeInf, Standard())-1;
854 TGenomeIterator gitBeg = begin(genomeInf, Standard()) + options.minMatchLen;
855
856 // this is max number of errors the seed should have
857 unsigned maxErrorsSeed = (unsigned)(options.minMatchLen * options.errorRate);
858 unsigned maxTotalErrors = (unsigned)(length(read) * options.errorRate);
859 unsigned minSeedErrors = maxErrorsSeed + 1;
860 unsigned minTotalErrors = maxTotalErrors + 1;
861 unsigned bestHitLength = 0;
862
863 for (; git > gitBeg; --git)
864 {
865 bool hit = true;
866 unsigned hitLength = 0;
867 unsigned count = 0;
868 unsigned seedErrors = 0;
869 unsigned totalErrors = 0;
870 TGenomeIterator g = git;
871 for(TReadIterator r = ritEnd; r >= ritBeg; --r, --g)
872 {
873 if(debug)::std::cout << *r << "\t" << *g << "\n";
874 if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
875 {
876 if (count < options.minMatchLen)
877 {
878 ++totalErrors;
879 if(++seedErrors > maxErrorsSeed)
880 {
881 // if(debug) ::std::cout << "-->no\n";
882 hit = false;
883 break;
884 }
885 }
886 else
887 {
888 if(++totalErrors > maxTotalErrors)
889 {
890 // we are excluding this last error position
891 --totalErrors;
892 break;
893 }
894 }
895 }
896 ++count;
897 }
898 if (hit) hitLength = count;
899 if (hitLength > bestHitLength ) //simply take the longest hit
900 {
901 minSeedErrors = seedErrors;
902 minTotalErrors = totalErrors;
903 bestHitLength = hitLength;
904 m.gEnd = git - begin(host(genomeInf), Standard()) + 1;
905 if(debug) ::std::cout << "m.gEnd=" << m.gEnd << ::std::endl;
906
907 }
908 }
909
910
911 if(bestHitLength < options.minMatchLen)
912 return false;
913
914
915 m.gBegin = m.gEnd - bestHitLength;
916 m.mScore = bestHitLength;
917 m.editDist = minTotalErrors;
918
919 #ifdef RAZERS_DEBUG
920 std::cout << "m.gBeg =" << m.gBegin << "\n";
921 std::cout << "m.gEnd =" << m.gEnd << "\n";
922 std::cout << "m.edit =" << m.editDist << "\n";
923 std::cout << "m.mScore=" << m.mScore << "\n\n";
924 #endif
925
926 return true;
927
928 }
929
930
931 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
932
933
934
935
936 template <typename TScore>
937 bool
findBestSplitPosition(String<Pair<TScore,int>> & maxColsL,String<Pair<TScore,int>> & maxColsR,int & rowPosL1,int rowPosL2,int & rowPosR1,int rowPosR2,int seq0Len,OrientationForward,SwiftSemiGlobal)938 findBestSplitPosition(String<Pair<TScore,int> > & maxColsL,
939 String<Pair<TScore,int> > & maxColsR,
940 int & rowPosL1,
941 int rowPosL2,
942 int & rowPosR1,
943 int rowPosR2,
944 int seq0Len,
945 OrientationForward,
946 SwiftSemiGlobal)
947 {
948
949 #ifdef RAZERS_DEBUG
950 ::std::cout << "findBestSplitEditForward\n";
951 #endif
952
953 TScore maxSum = std::numeric_limits<TScore>::min();
954 int bestL = rowPosL1;
955 int bestR = rowPosR1;
956 while (rowPosL1 <= rowPosL2 && rowPosR1 >= rowPosR2)
957 {
958 if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 > maxSum
959 && (maxColsL[rowPosL1].i2 + maxColsR[rowPosR1].i2 <= seq0Len)) // this is to prevent same bases from being used in both prefix and suffix match
960 { // this works, because we store the FIRST bestScore in each row, i.e.
961 maxSum = maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1;
962 bestL = rowPosL1;
963 bestR = rowPosR1;
964 }
965 ++rowPosL1;
966 --rowPosR1;
967 }
968 rowPosL1 = bestL;
969 rowPosR1 = bestR;
970
971 return true;
972 }
973
974
975
976
977 // Edit distance match combination
978 template <typename TScore>
979 bool
findBestSplitPosition(String<Pair<TScore,int>> & maxColsL,String<Pair<TScore,int>> & maxColsR,int & rowPosL1,int rowPosL2,int & rowPosR1,int rowPosR2,int seq0Len,OrientationReverse,SwiftSemiGlobal)980 findBestSplitPosition(String<Pair<TScore,int> > & maxColsL,
981 String<Pair<TScore,int> > & maxColsR,
982 int & rowPosL1,
983 int rowPosL2,
984 int & rowPosR1,
985 int rowPosR2,
986 int seq0Len,
987 OrientationReverse,
988 SwiftSemiGlobal)
989 {
990 #ifdef RAZERS_DEBUG
991 ::std::cout << "findBestSplitEditReverse\n";
992 #endif
993
994 TScore maxSum = std::numeric_limits<TScore>::min();
995 int bestL = rowPosL2;
996 int bestR = rowPosR2;
997
998 while (rowPosL1 <= rowPosL2 && rowPosR1 >= rowPosR2)
999 {
1000 if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 >= maxSum
1001 && (maxColsL[rowPosL1].i2 + maxColsR[rowPosR1].i2 <= seq0Len))
1002 {
1003 maxSum = maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1;
1004 bestL = rowPosL1;
1005 bestR = rowPosR1;
1006 }
1007 ++rowPosL1;
1008 --rowPosR1;
1009 }
1010 rowPosL1 = bestL;
1011 rowPosR1 = bestR;
1012
1013 return true;
1014 }
1015
1016
1017
1018
1019
1020
1021 template <typename TMatch, typename TRead, typename TGenome, typename TSpec>
1022 bool
combineLeftRight(TMatch & mR,TMatch & mL,TRead & read,TGenome & genome,RazerSOptions<TSpec> & options,char orientation,SwiftSemiGlobal)1023 combineLeftRight(TMatch & mR,
1024 TMatch & mL,
1025 TRead & read,
1026 TGenome & genome,
1027 RazerSOptions<TSpec> &options,
1028 char orientation,
1029 SwiftSemiGlobal)
1030 {
1031
1032 #ifdef RAZERS_DEBUG
1033 ::std::cout << "combinLeftRightEdit\n";
1034 #endif
1035 Score<int> scoreType(0,-1,-1,-1);
1036 typedef typename Infix<TGenome>::Type TGenomeInf;
1037 typedef ModifiedString<TGenomeInf,ModReverse> TGenomeInfRev;
1038
1039 TGenomeInf genomeInfL = infix(genome, mL.gBegin+mL.gSeedLen, mL.gEnd);
1040 TGenomeInf readInfL = infix(read,options.minMatchLen,mL.mScore);
1041 TGenomeInfRev genomeInfR(infix(genome, mR.gBegin, mR.gEnd-mR.gSeedLen));
1042 TGenomeInfRev readInfR(infix(read,length(read)-mR.mScore,length(read)-options.minMatchLen));
1043
1044 #ifdef RAZERS_DEBUG
1045 bool debug = true;
1046 std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1047 std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1048 std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1049 std::cout << "mL.editDist =" << mL.editDist << "\n";
1050
1051 std::cout << "mR.mScore =" << mR.mScore << "\n";
1052 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1053 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1054 std::cout << "mR.editDist =" << mR.editDist << "\n";
1055 #endif
1056
1057 int readLength = length(read);
1058 unsigned maxErrors = readLength * options.errorRate;
1059
1060 // neither insertion nor deletion
1061 if(mR.gEnd - mL.gBegin == readLength)
1062 {
1063 #ifdef RAZERS_DEBUG
1064 ::std::cout << "normal\n";
1065 #endif
1066 if(true)return false;
1067 unsigned halfReadLen = readLength >> 1;
1068
1069 Align<String<Dna5>, ArrayGaps> align;
1070 assignSource(row(align, 0), infix(read,0,readLength));
1071 assignSource(row(align, 1), infix(genome, mL.gBegin, mR.gEnd));
1072 int sc = globalAlignment(align, scoreType, AlignConfig<false,false,false,false>(), NeedlemanWunsch());
1073 if(-sc > (int)maxErrors) return false;
1074
1075 mL.gEnd = mL.gBegin + toSourcePosition(row(align, 1),toViewPosition(row(align, 0), halfReadLen-1));
1076 mR.gBegin = mL.gEnd;
1077 mL.mScore = halfReadLen;
1078 mR.mScore = readLength - mL.mScore;
1079 mL.editDist = countErrorsInAlign(align,toViewPosition(row(align, 0), halfReadLen));
1080 mR.editDist = -sc-mL.editDist;
1081
1082 #ifdef RAZERS_DEBUG
1083 std::cout << "mL.mScore =" << mL.mScore << "\n";
1084 std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1085 std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1086 std::cout << "mL.editDist =" << mL.editDist << "\n";
1087
1088 std::cout << "mR.mScore =" << mR.mScore << "\n";
1089 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1090 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1091 std::cout << "mR.editDist =" << mR.editDist << "\n";
1092 #endif
1093
1094 }
1095
1096 //potential insertion
1097 if(mR.gEnd - mL.gBegin < readLength)
1098 {
1099 #ifdef RAZERS_DEBUG
1100 ::std::cout << "insertion\n";
1101 #endif
1102 if(mR.gEnd - mL.gBegin < 2*options.minMatchLen) //too close together // actually minus allowed seed errors
1103 return false;
1104
1105 if(mL.gEnd < mR.gBegin) //prefix and suffix match do not meet
1106 return false;
1107
1108 if(mR.mScore + mL.mScore == mR.gEnd - mL.gBegin)
1109 if(mL.gEnd == mR.gBegin)
1110 if(mR.editDist + mL.editDist <= maxErrors ) //prefix and suffix match meet and do not overlap --> perfect
1111 return true;
1112
1113 // if((mR.gEnd - mL.gBegin <= mL.mScore) || (mR.gEnd - mL.gBegin <= mR.mScore))//too close together // heuristic way to kick out repeat matches early on
1114 // return false;
1115
1116 int diag1L = -maxErrors + mL.seedEditDist;
1117 int diag2L = maxErrors - mL.seedEditDist;
1118 int diag1R = -maxErrors + mR.seedEditDist;
1119 int diag2R = maxErrors - mR.seedEditDist;
1120 int minColNum = 0;
1121
1122 // genomeInf is the shorter sequence --> find best split position on genome
1123 // rows in alignment matrix represent genome position
1124 StringSet<TGenomeInf,Dependent<> > strSetL;
1125 appendValue(strSetL,readInfL);
1126 appendValue(strSetL,genomeInfL);
1127 Graph<Alignment<StringSet<TGenomeInf,Dependent<> >, void> > alignL(strSetL);
1128 String<Pair<int,int> > maxColsL;
1129 _globalAlignment(alignL,strSetL,scoreType,AlignConfig<false,false,false,false>(),diag1L,diag2L,maxColsL,minColNum,BandedNeedlemanWunsch());
1130
1131 StringSet<TGenomeInfRev,Dependent<> > strSetR;
1132 appendValue(strSetR,readInfR);
1133 appendValue(strSetR,genomeInfR);
1134 Graph<Alignment<StringSet<TGenomeInfRev,Dependent<> >, void > > alignR(strSetR);
1135 String<Pair<int,int> > maxColsR;
1136 _globalAlignment(alignR,strSetR,scoreType,AlignConfig<false,false,false,false>(),diag1R,diag2R,maxColsR,minColNum,BandedNeedlemanWunsch());
1137
1138 //::std::cout << alignL;
1139 //::std::cout << alignR;
1140
1141 // our begin and start positions are defined by the read positions
1142 // go from read source to view position to genome source position
1143 int rowPosL1 = 0;
1144 if (mL.gSeedLen < (int)mR.gBegin - mL.gBegin) rowPosL1 = mR.gBegin - mL.gBegin - mL.gSeedLen;//or from first possible overlap pos
1145 int rowPosR2 = 0;
1146 if (mR.gSeedLen < (int)mR.gEnd - mL.gEnd) rowPosR2 = mR.gEnd - mL.gEnd - mR.gSeedLen;
1147
1148 int rowPosL2 = mR.gEnd - mR.gSeedLen - rowPosR2 - mL.gBegin - mL.gSeedLen;
1149 int rowPosR1 = mR.gEnd - mR.gSeedLen - rowPosL1 - mL.gBegin - mL.gSeedLen;
1150
1151
1152 #ifdef RAZERS_DEBUG
1153 ::std::cout << "vorher\nrowPosL1=" << rowPosL1 << ::std::endl;
1154 ::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1155 ::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1156 ::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1157 #endif
1158
1159 // compute best split position
1160 int seq0Len = readLength;
1161 if(orientation == 'R')
1162 findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,OrientationReverse(),SwiftSemiGlobal());
1163 else
1164 findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,OrientationForward(),SwiftSemiGlobal());
1165
1166 mL.editDist = mR.seedEditDist - maxColsL[rowPosL1].i1; //scores are negative
1167 mR.editDist = mL.seedEditDist - maxColsR[rowPosR1].i1;
1168 if(mR.editDist + mL.editDist > maxErrors )
1169 return false;
1170
1171 #ifdef RAZERS_DEBUG
1172 ::std::cout << "nachher\nrowPosL1=" << rowPosL1 << ::std::endl;
1173 ::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1174 ::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1175 ::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1176 #endif
1177
1178 // best split position in genome is saved in rowPosL1 (and rowPosR1)
1179 mL.mScore = options.minMatchLen + maxColsL[rowPosL1].i2;
1180 mL.gEnd = mL.gBegin + mL.gSeedLen + rowPosL1; //read position of best genomic split
1181
1182 mR.mScore = options.minMatchLen + maxColsR[rowPosR1].i2;
1183 mR.gBegin = mR.gEnd - mR.gSeedLen - rowPosR1; //read position of best genomic split
1184
1185 #ifdef RAZERS_DEBUG
1186 if(mL.editDist > 50 || mR.mScore < options.minMatchLen || mL.mScore < options.minMatchLen)
1187 {
1188 ::std::cout << "-maxColsL[rowPosL1].i1=" << -maxColsL[rowPosL1].i1 << " -maxColsL[rowPosL1].i2=" << -maxColsL[rowPosL1].i2 << " rowPosL1="<<rowPosL1;
1189 ::std::cout << " maxColsLLen="<< length(maxColsL) << ::std::endl;
1190 ::std::cout << "-maxColsR[rowPosR1].i1=" << -maxColsR[rowPosR1].i1 << "-maxColsR[rowPosR1].i2=" << -maxColsR[rowPosR1].i2 << " rowPosR1="<<rowPosR1;
1191 ::std::cout << " maxColsRLen="<< length(maxColsR) << ::std::endl;
1192 }
1193 #endif
1194 }
1195
1196 //potential deletion
1197 if(mR.gEnd - mL.gBegin > readLength)
1198 {
1199 #ifdef RAZERS_DEBUG
1200 ::std::cout << "deletion\n";
1201 #endif
1202 if(mR.mScore + mL.mScore < readLength)
1203 return false;
1204 if(mR.mScore + mL.mScore == readLength && mR.editDist + mL.editDist > maxErrors) //the prefix and suffix match meet, but too many errors
1205 return false;
1206
1207 int diag1L = -maxErrors + mL.seedEditDist;
1208 int diag2L = maxErrors - mL.seedEditDist;
1209 int diag1R = -maxErrors + mR.seedEditDist;
1210 int diag2R = maxErrors - mR.seedEditDist;
1211 int minColNum = 0;
1212
1213 // readInf is the shorter sequence --> find best split position on read
1214 // rows in alignment matrix represent read position
1215 StringSet<TGenomeInf> strL;
1216 appendValue(strL,genomeInfL);
1217 appendValue(strL,readInfL);
1218 String<Pair<int,int> > maxColsL;
1219 _globalAlignment(strL,scoreType,AlignConfig<false,false,false,false>(),diag1L,diag2L,maxColsL,minColNum,BandedNeedlemanWunsch());
1220
1221 StringSet<TGenomeInfRev> strR;
1222 appendValue(strR,genomeInfR);
1223 appendValue(strR,readInfR);
1224 String<Pair<int,int> > maxColsR;
1225 _globalAlignment(strR,scoreType,AlignConfig<false,false,false,false>(),diag1R,diag2R,maxColsR,minColNum,BandedNeedlemanWunsch());
1226
1227 int rowPosL1 = ((int)options.minMatchLen > readLength-mR.mScore) ? (int)0 : readLength-mR.mScore-options.minMatchLen;
1228 int rowPosL2 = (readLength-(int)options.minMatchLen < mL.mScore) ? readLength-(int)2*options.minMatchLen : mL.mScore - options.minMatchLen;
1229 int rowPosR1 = (int)readLength - 2*options.minMatchLen - rowPosL1;
1230 int rowPosR2 = (int)readLength - 2*options.minMatchLen - rowPosL2;
1231
1232 #ifdef RAZERS_DEBUG
1233 ::std::cout << "rowPosL1=" << rowPosL1 << ::std::endl;
1234 ::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1235 ::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1236 ::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1237
1238 std::cout << "before split:\nmL.mScore =" << mL.mScore << "\n";
1239 std::cout << "mL.gBegin =" << length(genome)-mL.gBegin << "\n";
1240 std::cout << "mL.gEnd =" << length(genome)-mL.gEnd << "\n";
1241 std::cout << "mL.editDist =" << mL.editDist << "\n";
1242
1243 std::cout << "mR.mScore =" << mR.mScore << "\n";
1244 std::cout << "mR.gBegin =" << length(genome)-mR.gBegin << "\n";
1245 std::cout << "mR.gEnd =" << length(genome)-mR.gEnd << "\n";
1246 std::cout << "mR.editDist =" << mR.editDist << "\n";
1247 #endif
1248
1249 // compute best split position
1250 int seq0Len = mR.gEnd - mL.gBegin;
1251 if(orientation == 'R')
1252 findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,OrientationReverse(),SwiftSemiGlobal());
1253 else
1254 findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,OrientationForward(),SwiftSemiGlobal());
1255
1256
1257 // best split position in read is saved in rowPosL1 (and rowPosR1)
1258 mL.editDist = mL.seedEditDist - maxColsL[rowPosL1].i1;
1259 mR.editDist = mR.seedEditDist - maxColsR[rowPosR1].i1;
1260
1261
1262 if(mR.editDist + mL.editDist > maxErrors)
1263 return false;
1264
1265 mR.mScore = options.minMatchLen + rowPosR1;
1266 mR.gBegin = mR.gEnd - maxColsR[rowPosR1].i2 - mR.gSeedLen; //genomic position of best read split
1267 mL.mScore = options.minMatchLen + rowPosL1;
1268 mL.gEnd = mL.gBegin + maxColsL[rowPosL1].i2 + mL.gSeedLen; //genomic position of best read split
1269
1270 }
1271
1272 return true;
1273 }
1274
1275
1276
1277 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1278 // Hamming distance match combination
1279
1280 template <typename TMatch, typename TOriRead, typename TGenome, typename TSpec>
1281 bool
combineLeftRight(TMatch & mR,TMatch & mL,TOriRead const & oriRead,TGenome & genome,RazerSOptions<TSpec> & options,char orientation,SwiftSemiGlobalHamming)1282 combineLeftRight(TMatch & mR,
1283 TMatch & mL,
1284 TOriRead const& oriRead,
1285 TGenome & genome,
1286 RazerSOptions<TSpec> &options,
1287 char orientation,
1288 SwiftSemiGlobalHamming)
1289 {
1290
1291 #ifdef RAZERS_DEBUG
1292 ::std::cout << "combineLeftRightHamming\n";
1293 #endif
1294
1295
1296 typedef typename Infix<TOriRead const>::Type TRead;
1297 TRead read = infix(oriRead,0,length(oriRead));
1298
1299 typedef typename Infix<TGenome>::Type TGenomeInf;
1300 TGenomeInf genomeInf = infix(genome, mL.gBegin, mR.gEnd);
1301 int readLength = length(read);
1302 unsigned maxErrors = readLength * options.errorRate;
1303
1304
1305 #ifdef RAZERS_DEBUG
1306 std::cout << "readLength=" << readLength << "\n";
1307 std::cout << "sumLen=" << mL.mScore + mR.mScore << "\n";
1308 std::cout << "gInfLength=" << length(genomeInf) << "\n";
1309 std::cout << "gInf=" << genomeInf << "\n";
1310
1311 std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1312 std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1313 std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1314 std::cout << "mL.editDist =" << mL.editDist << "\n";
1315 std::cout << "readPref=" << prefix(oriRead,mL.mScore) << "\n";
1316
1317 std::cout << "mR.mScore =" << mR.mScore << "\n";
1318 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1319 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1320 std::cout << "mR.editDist =" << mR.editDist << "\n";
1321 std::cout << "readSuff=" << suffix(oriRead,length(oriRead)-mR.mScore) << "\n";
1322 #endif
1323
1324
1325 // neither insertion nor deletion
1326 if(mR.gEnd - mL.gBegin == readLength)
1327 {
1328 #ifdef RAZERS_DEBUG
1329 ::std::cout << "normal\n";
1330 #endif
1331 unsigned halfReadLen = readLength >> 1;
1332 unsigned prefixErrors = 0;
1333 unsigned suffixErrors = 0;
1334 for (unsigned i = 0 ; i < length(genomeInf); ++i)
1335 {
1336 if ((options.compMask[ordValue(read[i])] & options.compMask[ordValue(genomeInf[i])]) == 0)
1337 {
1338 if(i >= halfReadLen)
1339 ++suffixErrors;
1340 else
1341 ++prefixErrors;
1342 }
1343 }
1344
1345 if (suffixErrors+prefixErrors <= maxErrors)
1346 {
1347 mL.mScore = halfReadLen;
1348 mR.mScore = length(read)- halfReadLen;
1349 mR.gBegin = mR.gEnd - mR.mScore;
1350 mL.gEnd = mL.gBegin + mL.mScore;
1351 mL.editDist = prefixErrors;
1352 mR.editDist = suffixErrors;
1353 }
1354 else return false;
1355
1356 }
1357 //potential insertion
1358 if(mR.gEnd - mL.gBegin < readLength)
1359 {
1360 #ifdef RAZERS_DEBUG
1361 ::std::cout << "insertion\n";
1362 #endif
1363
1364 if(mR.gEnd - mL.gBegin < 2*options.minMatchLen)//too close together
1365 return false;
1366
1367 if(mR.mScore + mL.mScore < mR.gEnd - mL.gBegin) //prefix and suffix match do not meet
1368 return false;
1369
1370 if((mR.mScore + mL.mScore == mR.gEnd - mL.gBegin) && (mR.editDist + mL.editDist > maxErrors)) //prefix and suffix match meet but too many errors
1371 return false;
1372
1373 // if((mR.gEnd - mL.gBegin <= mL.mScore) || (mR.gEnd - mL.gBegin <= mR.mScore))//too close together
1374 // return false;
1375
1376 bool result = findBestSplitPosition(read,genomeInf,mL.mScore,mR.mScore,mL.editDist,mR.editDist, options, orientation, SwiftSemiGlobalHamming());
1377 if(!result || mR.editDist + mL.editDist > maxErrors) return false;
1378 mR.gBegin = mR.gEnd - mR.mScore;
1379 mL.gEnd = mL.gBegin + mL.mScore;
1380
1381 }
1382 //potential deletion
1383 if(mR.gEnd - mL.gBegin > readLength)
1384 {
1385 #ifdef RAZERS_DEBUG
1386 ::std::cout << "deletion\n";
1387 #endif
1388
1389 if(mR.mScore + mL.mScore < readLength)
1390 return false;
1391
1392 if((mR.mScore + mL.mScore == readLength) && (mR.editDist + mL.editDist > maxErrors)) //the prefix and suffix match meet and do not overlap --> perfect
1393 return false;
1394
1395 bool result = findBestSplitPosition(genomeInf,read,mL.mScore,mR.mScore,mL.editDist,mR.editDist, options, orientation,SwiftSemiGlobalHamming());
1396
1397 if(!result || mR.editDist + mL.editDist > maxErrors) return false;
1398
1399 mR.gBegin = mR.gEnd - mR.mScore;
1400 mL.gEnd = mL.gBegin + mL.mScore;
1401
1402 }
1403 #ifdef RAZERS_DEBUG
1404 std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1405 std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1406 std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1407 std::cout << "mL.editDist =" << mL.editDist << "\n";
1408
1409 std::cout << "mR.mScore =" << mR.mScore << "\n";
1410 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1411 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1412 std::cout << "mR.editDist =" << mR.editDist << "\n";
1413 #endif
1414
1415
1416 return true;
1417 }
1418
1419
1420 // find the best split position for a split match
1421 // positions are relative to shorter sequence
1422 // (deletion --> read is shorter, insertion --> genomeInf is shorter)
1423 template <typename TSize, typename TLongerSegment, typename TShorterSegment, typename TErrors, typename TOptions>
1424 bool
findBestSplitPosition(TLongerSegment & longSeg,TShorterSegment & shortSeg,TSize & mLmScore,TSize & mRmScore,TErrors & errorsLeft,TErrors & errorsRight,TOptions & options,char orientation,SwiftSemiGlobalHamming)1425 findBestSplitPosition(TLongerSegment &longSeg,
1426 TShorterSegment &shortSeg,
1427 TSize & mLmScore,
1428 TSize & mRmScore,
1429 TErrors & errorsLeft,
1430 TErrors & errorsRight,
1431 TOptions & options,
1432 char orientation,
1433 SwiftSemiGlobalHamming)
1434 {
1435
1436 #ifdef RAZERS_DEBUG
1437 ::std::cout << "findBestSplitHamming"<<orientation<<"\n";
1438 #endif
1439
1440 // usually, both types should be the same, but you never know...
1441 typedef typename Iterator<TLongerSegment const>::Type TLongIterator;
1442 typedef typename Iterator<TShorterSegment const>::Type TShortIterator;
1443 typedef typename Size<TShorterSegment const>::Type TShortSize;
1444 typedef typename Size<TLongerSegment const>::Type TLongSize;
1445
1446
1447 TShortSize shortLen = length(shortSeg);
1448 TLongSize longLen = length(longSeg);
1449 TLongSize lenDiff = longLen - shortLen;
1450
1451 TLongSize leftLongBegin = _max((int)options.minMatchLen,(int)shortLen-mRmScore);
1452 TLongSize leftLongEnd = _min((int)mLmScore,(int)shortLen-(int)options.minMatchLen);
1453 TLongSize leftLongPos = leftLongBegin;
1454
1455 TLongSize rightLongPos = leftLongBegin + lenDiff;
1456 TShortSize shortPos = leftLongBegin;
1457
1458 int bestSumErrors = 0;
1459 int bestLErrors = 0;
1460 int bestRErrors = 0;
1461 int bestPos = shortPos;
1462 int errorsL = 0;
1463 int errorsR = 0;
1464 int errorsPosL = 0;
1465 int errorsPosR = 0;
1466
1467 #ifdef RAZERS_DEBUG
1468 std::cout << "before\nerrorsLeft= " << errorsLeft << std::endl;
1469 std::cout << "errorsRight= " << errorsRight << std::endl;
1470 std::cout << "mLmScore= " << mLmScore << std::endl;
1471 std::cout << "mRmScore= " << mRmScore << std::endl;
1472
1473 std::cout << "leftLongPos " << leftLongPos << std::endl;
1474 std::cout << "leftLongEnd " << leftLongEnd << std::endl;
1475 std::cout << "rightLongPos " << rightLongPos << std::endl;
1476 std::cout << "shortPos " << shortPos << std::endl;
1477
1478 std::cout << longSeg << "=longSeg\n";
1479 std::cout << shortSeg << "=shortSeg\n";
1480 #endif
1481
1482 //find best split position --> min. sum errors
1483 while(leftLongPos < leftLongEnd)
1484 {
1485 if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[leftLongPos])]) == 0)
1486 {
1487 ++errorsL;
1488 ++errorsPosL;
1489 }
1490 if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[rightLongPos])]) == 0)
1491 {
1492 --errorsPosR;
1493 ++errorsR;
1494 }
1495 if(errorsPosL+errorsPosR < bestSumErrors
1496 || (orientation == 'R' && errorsPosL+errorsPosR == bestSumErrors))
1497 {
1498 bestSumErrors = errorsPosL+errorsPosR;
1499 bestLErrors = errorsPosL;
1500 bestRErrors = errorsPosR;
1501 bestPos = shortPos + 1;
1502 }
1503 ++leftLongPos;
1504 ++rightLongPos;
1505 ++shortPos;
1506 }
1507
1508 //update to new match lengths
1509 mLmScore = bestPos;
1510 mRmScore = shortLen - bestPos;
1511
1512 //count numErrors for left and right match
1513 //(have to count completely new, because mScore may be longer than shortLen --> no able to track errors outside segment)
1514 errorsRight = errorsLeft = 0;
1515 for(leftLongPos = 0, shortPos = 0; leftLongPos < (unsigned)mLmScore; ++leftLongPos, ++shortPos)
1516 {
1517 if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[leftLongPos])]) == (unsigned) 0)
1518 ++errorsLeft;
1519
1520 }
1521 for(rightLongPos = 0, shortPos = 0; rightLongPos < (unsigned)mRmScore; ++rightLongPos, ++shortPos)
1522 {
1523 if((options.compMask[ordValue(shortSeg[shortLen-1-shortPos])] & options.compMask[ordValue(longSeg[longLen-1-rightLongPos])]) == (unsigned) 0)
1524 ++errorsRight;
1525
1526 }
1527
1528 #ifdef RAZERS_DEBUG
1529 std::cout << "bestSumErrors= " << bestSumErrors << std::endl;
1530 std::cout << "errorsPosR= " << errorsPosR << std::endl;
1531 std::cout << "errorsPosL= " << errorsPosL << std::endl;
1532 std::cout << "after\nerrorsLeft= " << errorsLeft << std::endl;
1533 std::cout << "errorsRight= " << errorsRight << std::endl;
1534 std::cout << "mLmScore= " << mLmScore << std::endl;
1535 std::cout << "mRmScore= " << mRmScore << std::endl;
1536 #endif
1537
1538 return true;
1539 }
1540
1541
1542
1543
1544
1545 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1546 // Find read matches in many genome sequences (import from Fasta)
1547 template <
1548 typename TMatches,
1549 typename TReadSet_,
1550 typename TCounts,
1551 typename TSpec,
1552 typename TShape,
1553 typename TSwiftSpec >
mapSplicedReads(TMatches & matches,StringSet<CharString> & genomeFileNameList,StringSet<CharString> & genomeNames,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReadSet_ & readSet,TCounts & cnts,RazerSOptions<TSpec> & options,TShape const & shape,Swift<TSwiftSpec> const)1554 int mapSplicedReads(
1555 TMatches & matches,
1556 StringSet<CharString> & genomeFileNameList,
1557 StringSet<CharString> & genomeNames, // genome names, taken from the Fasta file
1558 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
1559 TReadSet_ & readSet,
1560 TCounts & cnts,
1561 RazerSOptions<TSpec> & options,
1562 TShape const & shape,
1563 Swift<TSwiftSpec> const)
1564 {
1565
1566
1567 typedef typename Value<TReadSet_>::Type TRead;
1568 typedef StringSet<typename Infix<TRead>::Type> TReadSet;
1569 typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> > TIndex; // q-gram index
1570 typedef Pattern<TIndex, Swift<TSwiftSpec> > TSwiftPattern; // filter
1571 typedef Pattern<TRead, MyersUkkonen> TMyersPattern; // verifier
1572
1573
1574 // split reads over two indices, one for prefixes, the other for suffixes
1575 TReadSet readSetL, readSetR;
1576 unsigned readCount = length(readSet);
1577 resize(readSetL, readCount);
1578 resize(readSetR, readCount);
1579
1580 if(options._debugLevel > 0)
1581 {
1582 int64_t genomeLen = 3000000000 * 2; // ufff make that an option
1583 expNumRandomMatches(readSet, genomeLen, options);
1584 }
1585
1586 if(options._debugLevel > 0 )
1587 std::cout << "Performing spliced mapping.\n";
1588 for (unsigned i = 0; i < readCount; ++i)
1589 {
1590 if(length(readSet[i])>=2*options.minMatchLen)
1591 {
1592 assign(readSetL[i], infix(readSet[i],0,options.minMatchLen), Exact());
1593 assign(readSetR[i], infix(readSet[i],length(readSet[i])-options.minMatchLen,length(readSet[i])), Exact());
1594 }
1595 else
1596 {
1597 assign(readSetL[i], infix(readSet[i],0,0), Exact());
1598 assign(readSetR[i], infix(readSet[i],0,0), Exact());
1599 }
1600 }
1601
1602
1603 if(options._debugLevel > 1)::std::cout << "Make index left right\n";
1604 // configure q-gram index
1605 TIndex swiftIndexL(readSetL, shape);
1606 TIndex swiftIndexR(readSetR, shape);
1607
1608 #ifdef RAZERS_OPENADDRESSING
1609 swiftIndexL.alpha = 2;
1610 swiftIndexR.alpha = 2;
1611 #endif
1612
1613 cargo(swiftIndexL).abundanceCut = options.abundanceCut;
1614 cargo(swiftIndexR).abundanceCut = options.abundanceCut;
1615 cargo(swiftIndexL)._debugLevel = 0;
1616 cargo(swiftIndexR)._debugLevel = options._debugLevel;
1617
1618 // configure Swift
1619 TSwiftPattern swiftPatternL(swiftIndexL);
1620 TSwiftPattern swiftPatternR(swiftIndexR);
1621 swiftPatternL.params.minThreshold = options.threshold;
1622 swiftPatternR.params.minThreshold = options.threshold;
1623 swiftPatternL.params.tabooLength = options.tabooLength;
1624 swiftPatternR.params.tabooLength = options.tabooLength;
1625
1626 // init edit distance verifiers
1627 String<TMyersPattern> forwardPatternsL;
1628 String<TMyersPattern> forwardPatternsR;
1629 options.compMask[4] = (options.matchN)? 15: 0;
1630 if (!options.hammingOnly)
1631 {
1632 resize(forwardPatternsL, readCount, Exact());
1633 resize(forwardPatternsR, readCount, Exact());
1634 for(unsigned i = 0; i < readCount; ++i)
1635 {
1636 setHost(forwardPatternsL[i], indexText(swiftIndexL)[i]);
1637 setHost(forwardPatternsR[i], indexText(swiftIndexR)[i]);
1638 _patternMatchNOfPattern(forwardPatternsL[i], options.matchN);
1639 _patternMatchNOfPattern(forwardPatternsR[i], options.matchN);
1640 _patternMatchNOfFinder(forwardPatternsL[i], options.matchN);
1641 _patternMatchNOfFinder(forwardPatternsR[i], options.matchN);
1642 }
1643 }
1644
1645 if(options._debugLevel > 1)::std::cout << "Patterns created\n";
1646
1647 // clear stats
1648 options.FP = 0;
1649 options.TP = 0;
1650 options.timeMapReads = 0;
1651 options.timeDumpResults = 0;
1652
1653 unsigned filecount = 0;
1654 unsigned numFiles = length(genomeFileNameList);
1655 unsigned gseqNo = 0;
1656
1657 // open genome files, one by one
1658 while (filecount < numFiles)
1659 {
1660 // open genome file
1661 ::std::ifstream file;
1662 file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary);
1663 if (!file.is_open())
1664 return RAZERS_GENOME_FAILED;
1665
1666 // remove the directory prefix of current genome file
1667 ::std::string genomeFile(toCString(genomeFileNameList[filecount]));
1668 size_t lastPos = genomeFile.find_last_of('/') + 1;
1669 if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1;
1670 if (lastPos == genomeFile.npos) lastPos = 0;
1671 ::std::string genomeName = genomeFile.substr(lastPos);
1672
1673 CharString id;
1674 //Dna5String genome;
1675 String<Dna5Q> genome;
1676 unsigned gseqNoWithinFile = 0;
1677 SEQAN_PROTIMESTART(find_time);
1678
1679 // iterate over genome sequences
1680 for(; !_streamEOF(file); ++gseqNo)
1681 {
1682 if (options.genomeNaming == 0)
1683 {
1684 readShortID(file, id, Fasta()); // read Fasta id up to first whitespace
1685 appendValue(genomeNames, id, Generous());
1686 }
1687 read(file, genome, Fasta()); // read Fasta sequence
1688
1689 gnoToFileMap.insert(::std::make_pair(gseqNo,::std::make_pair(genomeName,gseqNoWithinFile)));
1690
1691 if (options.forward)
1692 mapSplicedReads(matches, genome, gseqNo, readSet, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'F', options);
1693
1694 if (options.reverse)
1695 {
1696 reverseComplement(genome);
1697 mapSplicedReads(matches, genome, gseqNo, readSet, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'R', options);
1698 }
1699 ++gseqNoWithinFile;
1700
1701 }
1702 options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
1703 file.close();
1704 ++filecount;
1705 }
1706
1707 compactSplicedMatches(matches, cnts, options, false, swiftPatternL, swiftPatternR);
1708
1709 if (options._debugLevel >= 1)
1710 ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl;
1711 if (options._debugLevel >= 2) {
1712 ::std::cerr << ::std::endl;
1713 ::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
1714 ::std::cerr << "Swift FP: " << options.FP << ::std::endl;
1715 ::std::cerr << "Swift TP: " << options.TP << ::std::endl;
1716 }
1717 return 0;
1718 }
1719
1720
1721 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1722 // Find read matches in one genome sequence
1723 template <
1724 typename TMatches,
1725 typename TGenome,
1726 typename TOriReadSet,
1727 typename TReadIndex,
1728 typename TSwiftSpec,
1729 typename TVerifier,
1730 typename TCounts,
1731 typename TSpec >
mapSplicedReads(TMatches & matches,TGenome & genome,unsigned gseqNo,TOriReadSet & readSet,Pattern<TReadIndex,Swift<TSwiftSpec>> & swiftPatternL,Pattern<TReadIndex,Swift<TSwiftSpec>> & swiftPatternR,TVerifier & forwardPatternsL,TVerifier & forwardPatternsR,TCounts & cnts,char orientation,RazerSOptions<TSpec> & options)1732 void mapSplicedReads(
1733 TMatches &matches, // resulting matches
1734 TGenome &genome, // genome ...
1735 unsigned gseqNo, // ... and its sequence number
1736 TOriReadSet &readSet, // reads
1737 Pattern<TReadIndex, Swift<TSwiftSpec> > &swiftPatternL, // left index
1738 Pattern<TReadIndex, Swift<TSwiftSpec> > &swiftPatternR, // right index
1739 TVerifier &forwardPatternsL,
1740 TVerifier &forwardPatternsR,
1741 TCounts & cnts,
1742 char orientation,
1743 RazerSOptions<TSpec> &options)
1744 {
1745 typedef typename Value<TOriReadSet>::Type TRead;
1746 typedef typename Fibre<TReadIndex, FibreText>::Type TReadSet;
1747 typedef typename Size<TGenome>::Type TSize;
1748 typedef typename Position<TGenome>::Type TGPos;
1749 typedef typename MakeSigned_<TGPos>::Type TSignedGPos;
1750 typedef typename Value<TMatches>::Type TMatch;
1751 typedef typename Infix<TGenome>::Type TGenomeInf;
1752
1753 // Prefix-Suffix filtration
1754 typedef Finder<TGenome, Swift<TSwiftSpec> > TSwiftFinderL;
1755 typedef Finder<TGenomeInf, Swift<TSwiftSpec> > TSwiftFinderR;
1756 typedef Pattern<TReadIndex, Swift<TSwiftSpec> > TSwiftPattern;
1757
1758 typedef Pair<int64_t, TMatch> TDequeueValue;
1759 typedef Dequeue<TDequeueValue> TDequeue;
1760 typedef typename TDequeue::TIter TDequeueIterator;
1761
1762 const unsigned NOT_VERIFIED = 1u << (8*sizeof(unsigned)-1);
1763
1764 // iterate all genomic sequences
1765 if (options._debugLevel >= 1)
1766 {
1767 ::std::cerr << ::std::endl << "Process genome seq #" << gseqNo;
1768 if (orientation == 'F')
1769 ::std::cerr << "[fwd]";
1770 else
1771 ::std::cerr << "[rev]";
1772 }
1773
1774 TReadSet &readSetL = host(host(swiftPatternL));
1775 TReadSet &readSetR = host(host(swiftPatternR));
1776
1777 if (empty(readSetL) || empty(readSetR))
1778 return;
1779
1780
1781 // Check?
1782 TSignedGPos maxDistance = options.maxDistance;// + 2 * (int)options.minMatchLength - (int)length(indexShape(host(swiftPatternL)));
1783 TSignedGPos minDistance = options.minDistance + 2*options.minMatchLen;
1784
1785 // exit if contig is shorter than minDistance
1786 if (length(genome) <= (unsigned) minDistance)
1787 return;
1788
1789 TGenomeInf genomeInf = infix(genome, 0, length(genome));
1790 TSwiftFinderL swiftFinderL(genome, options.repeatLength, 1);
1791 TSwiftFinderR swiftFinderR(genomeInf, options.repeatLength, 1);
1792
1793 TDequeue fifo; // stores left-mate potential matches
1794 String<int64_t> lastPotMatchNo; // last number of a left-mate potential
1795 int64_t lastNo = 0; // last number over all left-mate pot. matches in the queue
1796 int64_t firstNo = 0; // first number over all left-mate pot. match in the queue
1797 Pair<TGPos> gPair;
1798
1799 resize(lastPotMatchNo, length(host(swiftPatternL)), (int64_t)-1, Exact());
1800
1801 String<Pair<TGPos> > lastRightMatch; // begin and end of last verified right match
1802 resize(lastRightMatch, length(host(swiftPatternL)), Pair<TGPos>(0,0), Exact());
1803
1804 TSize gLength = length(genome);
1805 TMatch mR = { // to supress uninitialized warnings
1806 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1807 TDequeueValue fL(-1, mR); // to supress uninitialized warnings
1808 fL.i2.gseqNo = gseqNo;
1809 mR.gseqNo = gseqNo;
1810 fL.i2.orientation = orientation;
1811 mR.orientation = orientation;
1812
1813 double maxErrorRate = options.errorRate;
1814 double prefixErrorRate = (double)floor(options.minMatchLen * options.errorRate)/options.minMatchLen;
1815 if(prefixErrorRate > maxErrorRate) maxErrorRate = prefixErrorRate;
1816 if(!empty(readSet))
1817 {
1818 double extPrefixErrorRate = (double) floor(length(readSet[0]) * options.errorRate)/(length(readSet[0]) - options.minMatchLen);
1819 if(extPrefixErrorRate > maxErrorRate) maxErrorRate = extPrefixErrorRate;
1820 }
1821 Pair<TGPos,TGPos> lastLeftMatch(0,0);
1822
1823 // iterate all verification regions returned by SWIFT
1824 while (find(swiftFinderR, swiftPatternR, maxErrorRate))
1825 {
1826 unsigned rseqNo = swiftPatternR.curSeqNo;
1827 TGPos rEndPos = endPosition(swiftFinderR);
1828 TGPos doubleParWidth = 2 * (*swiftFinderR.curHit).bucketWidth;
1829
1830 // remove out-of-window left mates from fifo
1831 while (!empty(fifo) && front(fifo).i2.gBegin + maxDistance + (TSignedGPos)doubleParWidth < (TSignedGPos)rEndPos)
1832 {
1833 popFront(fifo);
1834 ++firstNo;
1835 }
1836
1837 // add within-window left mates to fifo
1838 while (empty(fifo) || back(fifo).i2.gBegin + minDistance < (TSignedGPos)(rEndPos + doubleParWidth))
1839 {
1840 if (find(swiftFinderL, swiftPatternL, maxErrorRate))
1841 {
1842 gPair = positionRange(swiftFinderL);
1843 if ((TSignedGPos)gPair.i2 + maxDistance + (TSignedGPos)doubleParWidth >= (TSignedGPos)rEndPos)
1844 {
1845 // link in
1846 fL.i1 = lastPotMatchNo[swiftPatternL.curSeqNo]; //link to last previous potential match
1847 lastPotMatchNo[swiftPatternL.curSeqNo] = lastNo++; //++ general counter and remember last pot match of curSeqNo
1848
1849 fL.i2.rseqNo = swiftPatternL.curSeqNo | NOT_VERIFIED; //das erste bit wird gestetzt
1850 fL.i2.gBegin = gPair.i1; //begin und end auf die potential match region
1851 fL.i2.gEnd = gPair.i2;
1852 pushBack(fifo, fL);
1853 }
1854 }
1855 else
1856 break;
1857 }
1858
1859 TDequeueIterator it;
1860 int64_t lastPositive = (int64_t)-1;
1861
1862 TRead const &read = readSet[rseqNo];
1863 TSize counter = 0;
1864 bool noMatchRight = false;
1865 bool notYetVerifiedRight = true;
1866 lastLeftMatch.i1 = 0;
1867 lastLeftMatch.i2 = 0;
1868
1869 // walk through all potential left matches, verify if not verfied already, mark as positive or negative
1870 for (int64_t i = lastPotMatchNo[rseqNo]; firstNo <= i; i = (*it).i1)
1871 {
1872 it = &value(fifo, i - firstNo);
1873
1874 // verify left mate (equal seqNo), if not done already
1875 if ((*it).i2.rseqNo & NOT_VERIFIED)
1876 {
1877 TGPos maxEndPos = (*it).i2.gEnd + length(read)-2*options.minMatchLen + floor(options.errorRate*length(read));
1878 if(maxEndPos > length(genome)) maxEndPos = length(genome);
1879 if (matchVerify( (*it).i2,
1880 infix(genome, (*it).i2.gBegin, maxEndPos),
1881 rseqNo,
1882 readSet, //readSetL
1883 forwardPatternsL,
1884 options,
1885 TSwiftSpec(),
1886 LongestPrefix()) &&
1887 !(lastLeftMatch.i1 == (TGPos)(*it).i2.gBegin && lastLeftMatch.i2 == (TGPos)(*it).i2.gEnd ))
1888 {
1889 (*it).i2.rseqNo &= ~NOT_VERIFIED; // has been verified positively // go back to regular rseqNo
1890 lastLeftMatch.i1 = (*it).i2.gBegin;
1891 lastLeftMatch.i2 = (*it).i2.gEnd;
1892 // short-cut negative matches
1893 if (lastPositive == (int64_t)-1)
1894 lastPotMatchNo[rseqNo] = i;
1895 else
1896 value(fifo, lastPositive - firstNo).i1 = i;
1897 lastPositive = i;
1898 }
1899 else
1900 {
1901 (*it).i2.rseqNo = ~NOT_VERIFIED; // has been verified negatively // 01111111....111
1902 }
1903 }
1904
1905 // verify right mate, if left match was found
1906 if ((*it).i2.rseqNo == rseqNo)
1907 {
1908 lastLeftMatch.i1 = (*it).i2.gBegin;
1909 lastLeftMatch.i2 = (*it).i2.gEnd;
1910
1911 // dont want to shortcut too much
1912 if (lastPositive == (int64_t)-1 || i < lastPositive)
1913 lastPositive = i;
1914
1915 // first left match --> do right verification
1916 if(notYetVerifiedRight)
1917 {
1918 notYetVerifiedRight = false;
1919 TGPos minBeginPos = beginPosition(infix(swiftFinderR, genomeInf));
1920 if((int)minBeginPos - length(read)+2*options.minMatchLen - floor(options.errorRate*length(read)) > 0)
1921 minBeginPos = minBeginPos - length(read)+2*options.minMatchLen - floor(options.errorRate*length(read));
1922 else minBeginPos = 0;
1923 if (!matchVerify(mR,
1924 infix(genome,minBeginPos,endPosition(infix(swiftFinderR, genomeInf))),
1925 rseqNo,
1926 readSet,//readSetR,
1927 forwardPatternsR,
1928 options,
1929 TSwiftSpec(),
1930 LongestSuffix()))
1931 {
1932 noMatchRight = true;
1933 continue;
1934 }
1935 else
1936 {
1937 if (lastRightMatch[rseqNo].i1 == (TGPos)mR.gBegin && lastRightMatch[rseqNo].i2 == (TGPos)mR.gEnd)
1938 noMatchRight = true;
1939 lastRightMatch[rseqNo].i1 = mR.gBegin;
1940 lastRightMatch[rseqNo].i2 = mR.gEnd;
1941 }
1942 }
1943
1944 //else check if left and right match fit together
1945 if(!noMatchRight)
1946 {
1947 #ifdef RAZERS_DEBUG
1948 std::cout << "before\nmL.mScore =" << (*it).i2.mScore << "\n";
1949 std::cout << "mL.gBegin =" << (*it).i2.gBegin << "\n";
1950 std::cout << "mL.gEnd =" << (*it).i2.gEnd << "\n";
1951 std::cout << "mL.editDist =" << (*it).i2.editDist << "\n";
1952
1953 std::cout << "mR.mScore =" << mR.mScore << "\n";
1954 std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1955 std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1956 std::cout << "mR.editDist =" << mR.editDist << "\n";
1957 #endif
1958 int outerDistance = mR.gEnd - (*it).i2.gBegin;
1959 if (outerDistance < (int)(2 * options.minMatchLen))
1960 continue;
1961 // ::std::cout << options.minMatchLen << "<-minMatchLen outerDistance->" << outerDistance << std::endl;
1962 int outerDistanceError = length(readSet[rseqNo]) - (int)(outerDistance);
1963 if (outerDistanceError < 0) outerDistanceError = -outerDistanceError;
1964 if ((outerDistanceError > (int) options.maxDistance) ||
1965 (outerDistanceError < (int) options.minDistance))
1966 continue;
1967
1968 // ::std::cout <<"hier!"<<options.minDistance <<" "<<options.maxDistance<<"\n";
1969
1970 TMatch mRtmp = mR;
1971 TMatch mLtmp = (*it).i2;
1972 if(!combineLeftRight(mRtmp,mLtmp,read,genome,options,orientation,TSwiftSpec()))
1973 {
1974 ++options.FP;
1975 continue;
1976 }
1977 else ++options.TP;
1978
1979 ++counter;
1980
1981 //assign an alignment score
1982 //mRtmp.alignmentScore = (options.matchScore * (sumLength - sumDist)) + (options.mismatchScore * sumDist);
1983 //if(outerDistanceError != 0) mRtmp.alignmentScore += ((outerDistanceError - 1) * options.gapExtend) + options.gapOpen;
1984 //mLtmp.alignmentScore = mRtmp.alignmentScore;
1985
1986 //mRtmp.alignmentScore = (options.matchScore * (sumLength - sumDist)) + (options.mismatchScore * sumDist);
1987 //if(outerDistanceError != 0)
1988 //{
1989 // int maxErrors = (int)(options.errorRate*length(readSet[rseqNo]));
1990 // double gapScoreConvergeTo = ((options.minMatchLen*2 - maxErrors) * options.matchScore) + (maxErrors * options.mismatchScore) ;
1991 // double gapPenalty = gapScoreConvergeTo*(1-exp((double)-options.indelLambda*outerDistanceError));
1992 // mRtmp.alignmentScore -= (int)gapPenalty;
1993 //}
1994 //mLtmp.alignmentScore = mRtmp.alignmentScore;
1995
1996 if (orientation == 'R')
1997 {
1998 TSize temp = mLtmp.gBegin;
1999 mLtmp.gBegin = gLength - mLtmp.gEnd;
2000 mLtmp.gEnd = gLength - temp;
2001 temp = mRtmp.gBegin;
2002 mRtmp.gBegin = gLength - mRtmp.gEnd;
2003 mRtmp.gEnd = gLength - temp;
2004 outerDistance = -outerDistance;
2005 }
2006 // set a unique pair id
2007 mLtmp.pairId = mRtmp.pairId = options.nextMatePairId;
2008 if (++options.nextMatePairId == 0)
2009 options.nextMatePairId = 1;
2010
2011 // score the whole match pair
2012 mLtmp.pairScore = mRtmp.pairScore = 0 - mLtmp.editDist - mRtmp.editDist;
2013
2014 #ifdef RAZERS_DEBUG_LIGHT
2015 bool falsch = false;
2016 if((unsigned)(mRtmp.mScore + mLtmp.mScore) > length(readSet[rseqNo])) falsch = true;
2017 if(mRtmp.mScore> length(readSet[rseqNo])-options.minMatchLen || mRtmp.mScore < options.minMatchLen) falsch = true;
2018 if((int)mRtmp.gEnd - (int)mRtmp.gBegin < 0 || (int)mRtmp.gEnd - (int)mRtmp.gBegin > 500) falsch = true;
2019 if(mRtmp.editDist > 200) falsch = true;
2020
2021 if(mLtmp.mScore> length(readSet[rseqNo])-options.minMatchLen ||mLtmp.mScore < options.minMatchLen) falsch = true;
2022 if((int)mLtmp.gEnd - (int)mLtmp.gBegin < 0 || (int)mLtmp.gEnd - (int)mLtmp.gBegin > 500) falsch = true;
2023 if(mLtmp.editDist > 200) falsch = true;
2024
2025 if(falsch)
2026 {
2027 ::std::cout << "rseqNo="<<rseqNo <<::std::endl;
2028 std::cout << "mL.mScore =" << mLtmp.mScore << "\n";
2029 std::cout << "mL.gBegin =" << mLtmp.gBegin << "\n";
2030 std::cout << "mL.gEnd =" << mLtmp.gEnd << "\n";
2031 std::cout << "mL.editDist =" << mLtmp.editDist << "\n";
2032
2033 std::cout << "mR.mScore =" << mRtmp.mScore << "\n";
2034 std::cout << "mR.gBegin =" << mRtmp.gBegin << "\n";
2035 std::cout << "mR.gEnd =" << mRtmp.gEnd << "\n";
2036 std::cout << "mR.editDist =" << mRtmp.editDist << "\n";
2037
2038
2039
2040 if(-mLtmp.pairScore > (int) (options.errorRate * length(readSet[rseqNo])))
2041 {
2042 ::std::cout << "mLtmp.pairScore = " << mLtmp.pairScore;
2043 ::std::cout << "\treadLen = " << length(readSet[rseqNo]);
2044 ::std::cout << "\trseqNo = " << rseqNo << ::std::endl;
2045 }
2046 std::cout << "mScoreSum =" << mLtmp.mScore + mRtmp.mScore << "\t";
2047 std::cout << "readLength =" << length(readSet[rseqNo]) << "\t";
2048 std::cout << "mL.mScore =" << mLtmp.mScore << "\t";
2049 std::cout << "mL.gBegin =" << mLtmp.gBegin << "\t";
2050 std::cout << "mL.gEnd =" << mLtmp.gEnd << "\t";
2051 std::cout << "mL.editDist =" << mLtmp.editDist << "\n";
2052
2053 std::cout << "mR.mScore =" << mRtmp.mScore << "\t";
2054 std::cout << "mR.gBegin =" << mRtmp.gBegin << "\t";
2055 std::cout << "mR.gEnd =" << mRtmp.gEnd << "\t";
2056 std::cout << "mR.editDist =" << mRtmp.editDist << "\n";
2057
2058 Align<String<Dna5Q>, ArrayGaps> alignL;
2059 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
2060 if (options.hammingOnly)
2061 scoreType.data_mismatch = -1;
2062 resize(rows(alignL), 2);
2063 assignSource(row(alignL, 0), infix(readSet[rseqNo],0,mLtmp.mScore));
2064 assignSource(row(alignL, 1), infix(genome,mLtmp.gBegin, mLtmp.gEnd));
2065 if (orientation == 'R')
2066 reverseComplement(source(row(alignL, 1)));
2067
2068 globalAlignment(alignL, scoreType);
2069 std::cout << alignL;
2070
2071 Align<String<Dna5Q>, ArrayGaps> alignR;
2072 resize(rows(alignR), 2);
2073 assignSource(row(alignR, 0), infix(readSet[rseqNo],length(readSet[rseqNo])-mRtmp.mScore,length(readSet[rseqNo])));
2074 assignSource(row(alignR, 1), infix(genome,mRtmp.gBegin, mRtmp.gEnd));
2075 if (orientation == 'R')
2076 reverseComplement(source(row(alignR, 1)));
2077
2078 globalAlignment(alignR, scoreType);
2079 std::cout << alignR;
2080
2081 std::cout << "Komplettali:\n";
2082 Align<String<Dna5Q>, ArrayGaps> align;
2083 resize(rows(align), 2);
2084 assignSource(row(align, 0), infix(readSet[rseqNo],0,length(readSet[rseqNo])));
2085 assignSource(row(align, 1), infix(genome,_min(mRtmp.gBegin,mLtmp.gBegin), _max(mRtmp.gEnd,mLtmp.gEnd)));
2086 if (orientation == 'R')
2087 reverseComplement(source(row(align, 1)));
2088
2089 globalAlignment(align, scoreType);
2090 std::cout << align;
2091 }
2092 #endif
2093
2094
2095
2096 // relative positions
2097 mLtmp.mateDelta = outerDistance;
2098 mRtmp.mateDelta = -outerDistance;
2099
2100 mLtmp.rseqNo = mRtmp.rseqNo = rseqNo;
2101
2102 if (!options.spec.DONT_DUMP_RESULTS)
2103 {
2104 appendValue(matches, mLtmp, Generous());
2105 appendValue(matches, mRtmp, Generous());
2106 if (length(matches) > options.compactThresh)
2107 {
2108 typename Size<TMatches>::Type oldSize = length(matches);
2109 // maskDuplicates(matches); // overlapping parallelograms cause duplicates //TODO: implement!
2110 compactSplicedMatches(matches, cnts, options, false, swiftPatternL, swiftPatternR);
2111 options.compactThresh += (options.compactThresh >> 1);
2112 if (options._debugLevel >= 2)
2113 ::std::cerr << '(' << oldSize - length(matches) << " matches removed)";
2114 }
2115 }
2116 }
2117 }
2118 }
2119
2120 // short-cut negative matches
2121 if (lastPositive == (int64_t)-1)
2122 lastPotMatchNo[rseqNo] = (int64_t)-1;
2123 else
2124 value(fifo, lastPositive - firstNo).i1 = (int64_t)-1; // the first positive's link to previous is removed
2125
2126
2127 }//swiftFinderR
2128
2129
2130
2131 }//function
2132
2133
2134
2135
2136
2137 }//namespace
2138
2139 #endif
2140