1  /*==========================================================================
2                 RazerS Spliced - Fast Split Read Mapping
3 
4  ============================================================================
5   Copyright (C) 2008 by Anne-Katrin Emde
6 
7   This program is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 3 of the License, or (at your options) any later version.
11 
12   This program is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15   Lesser General Public License for more details.
16 
17   You should have received a copy of the GNU General Public License
18   along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  ==========================================================================*/
20 
21 #ifndef SEQAN_HEADER_RAZERS_SPLICED_H
22 #define SEQAN_HEADER_RAZERS_SPLICED_H
23 
24 //#define RAZERS_DEBUG
25 //#define RAZERS_DEBUG_LIGHT
26 
27 #include <seqan/align.h>
28 #include <seqan/graph_types.h>
29 #include <seqan/graph_algorithms.h>
30 #include <seqan/graph_align.h>
31 #include <seqan/seeds.h>
32 
33 namespace seqan
34 {
35 
36 
37 
38 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
39 // Match statistics stuff
40 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
41 
42 //statistics, expected number of matches
choose(int n,int k)43 double choose(int n, int k)
44 {
45 	double cnk = 1.0;
46 
47 	if (k * 2 > n)
48 	k = n - k;
49 
50 	for (int i = 1; i <= k; n--, i++)
51 	{
52 		cnk /= i;
53 		cnk *= n;
54 	}
55 	return cnk;
56 }
57 
58 
59 //#R  - read length
60 //#T  - total sequence length
61 //#I  - length of insertion
62 //#D  - length of deletion
63 //#M1 - minimal prefix match length
64 //#M2 - minimal suffix match length
65 
66 
67 //## first function denotes the random match to an iid sequence
68 template<typename TSize, typename TValue>
69 double
_probability(TSize R,TSize M1,TSize M2,TValue d,TValue d_m1,TValue d_m2)70 _probability(TSize R, TSize M1, TSize M2, TValue d, TValue d_m1, TValue d_m2)
71 {
72 	double sum_prob = 0;
73 	for (unsigned i1 = 0; i1 < d_m1; ++i1)
74 	{
75 		for (unsigned i2 = 0; i2 < d_m2; ++i2)
76 		{
77 			for (unsigned j = 0; j < d-i1-i2; ++j)
78 			{
79 				sum_prob += (double)(choose(M1,i1)* pow((double)0.25,(double)R-i1-i2-j) * choose(M2,i2) * pow((double)0.75,(double)i1+i2+j) * choose(R-M1-M2,j));
80 			}
81 		}
82 	}
83 	return sum_prob;
84 }
85 
86 
87 //#InsCuts describes the possible number of cuts
88 //#for a read of length R and an insertion of length I
89 template<typename TSize>
90 TSize
_insCuts(TSize R,TSize M1,TSize M2,TSize I)91 _insCuts(TSize R, TSize M1, TSize M2, TSize I){
92 	return (R-I-M1-M2+1);
93 }
94 
95 
96 
97 //#function for calculating expected matches with I insertions
98 template<typename TSize, typename TValue>
99 double
_expMatchesIns(TSize R,TSize M1,TSize M2,TSize I,TValue d,TValue d_m1,TValue d_m2,TSize S,TSize T)100 _expMatchesIns(TSize R, TSize M1, TSize M2, TSize I, TValue d,TValue d_m1,TValue d_m2, TSize S, TSize T)
101 {
102 	return((double)T*((S-R-I-1)*_probability((R-I),M1,M2,d,d_m1,d_m2))*_insCuts(R,M1,M2,I));
103 }
104 
105 
106 //#DelCuts describes the possible number of cuts
107 //#for a read of length R in case of a deletion
108 template<typename TSize>
109 TSize
_delCuts(TSize R,TSize M1,TSize M2)110 _delCuts(TSize R, TSize M1, TSize M2)
111 {
112 	return (R-M1-M2+1);
113 }
114 
115 
116 //#Del describes the possible number of configurations
117 //#for a string of length R
118 template<typename TSize>
119 TSize
_del(TSize R,TSize S)120 _del(TSize R, TSize S)
121 {
122 	return((S-R)*(S-R+1)/2);
123 }
124 
125 
126 //#function for calculating expected matches with deletion of size D
127 template<typename TSize, typename TValue>
128 double
_expMatchesDel(TSize R,TSize M1,TSize M2,TValue d,TValue d_m1,TValue d_m2,TSize S,TSize T)129 _expMatchesDel(TSize R, TSize M1, TSize M2, TValue d, TValue d_m1, TValue d_m2, TSize S, TSize T)
130 {
131 	return ((double)T*(_probability(R,M1,M2,d,d_m1,d_m2))*_delCuts(R,M1,M2)*_del(R,S));
132 }
133 
134 template<typename TReadSet, typename TSize, typename TOptions>
135 void
expNumRandomMatches(TReadSet & readSet,TSize genomeLen,TOptions & options)136 expNumRandomMatches(TReadSet &readSet, TSize genomeLen, TOptions & options)
137 {
138 	TSize M1 = options.minMatchLen;
139 	TSize M2 = options.minMatchLen;
140 	TSize d_m1 = (int) M1 * options.errorRate;
141 	TSize d_m2 = (int) M2 * options.errorRate;
142 	TSize numReads = length(readSet);
143 	TSize readLen = (numReads > 0) ? length(readSet[0]) : 0;
144 	TSize numErrors = (int) readLen * options.errorRate;
145 
146 	//expected number of random deletion matches:
147 	double delMatches = _expMatchesDel(readLen,M1,M2,numErrors,d_m1, d_m2, genomeLen, numReads);
148 
149 	//expected number of random deletion matches:
150 	double insMatches = 0;
151 	for(TSize insLen = 1; insLen <=readLen-M1-M2; ++insLen)
152 		insMatches += _expMatchesIns(readLen,M1,M2,numErrors,insLen,d_m1, d_m2, genomeLen, numReads);
153 
154 	::std::cout << "Expected number of random deletion-indicating matches: " << delMatches << std::endl;
155 	::std::cout << "Expected number of random insertion-indicating matches: " << insMatches << std::endl;
156  }
157 
158 
159 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
160 
161 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
162 
163 
164 // SPLIT MAPPING
165 // We build two q-gram indices, one for the prefix, the other for the suffix of the read
166 
167 struct LongestPrefix{};
168 struct LongestSuffix{};
169 
170 struct OrientationReverse{};
171 struct OrientationForward{};
172 
173 
174 
175 
176 //////////////////////////////////////////////////////////////////////////////
177 // Remove low quality matches # necessary to have an own splicedmatch function?
178 // planned specs: SpliceSite, General, ...
179 template < typename TMatches, typename TCounts, typename TSpec, typename TSwiftL, typename TSwiftR >
compactSplicedMatches(TMatches & matches,TCounts &,RazerSOptions<TSpec> & options,bool compactFinal,TSwiftL & swiftL,TSwiftR & swiftR)180 void compactSplicedMatches(TMatches &matches,
181 			TCounts & /*cnts*/,
182 			RazerSOptions<TSpec> &options,
183 			bool compactFinal,
184 			TSwiftL &swiftL,
185 			TSwiftR &swiftR)
186 {
187 	typedef typename Value<TMatches>::Type				TMatch;
188 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
189 
190 	unsigned readNo = -1;
191 	unsigned hitCount = 0;
192 	unsigned hitCountCutOff = options.maxHits;
193 	int scoreDistCutOff = std::numeric_limits<int>::min();
194 
195 	TIterator it = begin(matches, Standard());
196 	TIterator itEnd = end(matches, Standard());
197 	TIterator dit = it;
198 	TIterator ditBeg = it;
199 
200 	// sort
201 	::std::sort(it, itEnd, LessSplicedErrors<TMatch>());
202 //	::std::sort(it, itEnd, LessSplicedScore<TMatch>());
203 	int counter = 0;
204 	for (; it != itEnd; ++it)
205 	{
206 		++counter;
207 		if ((*it).orientation == '-') { ++it; continue; }
208 		if (readNo == (*it).rseqNo)
209 		{
210 			if ((*it).pairScore <= scoreDistCutOff)
211 			{
212 				++it;
213 				continue;
214 			}
215 			if (++hitCount >= hitCountCutOff)
216 			{
217 #ifdef RAZERS_MASK_READS
218 				if (hitCount == hitCountCutOff)
219 				{
220 					// we have enough, now look for better matches
221 					int maxErrors = - (*it).pairScore - 1;
222 					if (options.purgeAmbiguous && (options.distanceRange == 0 || maxErrors <= (int) options.distanceRange))
223 						maxErrors = -1;
224 
225 					setMaxErrors(swiftL, readNo, maxErrors);
226 					setMaxErrors(swiftR, readNo, maxErrors);
227 
228 					if (maxErrors == -1 && options._debugLevel >= 2)
229 						::std::cerr << "(read #" << readNo << " disabled)";
230 
231 					if(options.purgeAmbiguous)
232 	     				{
233 						if (options.distanceRange == 0 || -(*it).pairScore < (int) options.distanceRange || compactFinal)
234 							dit = ditBeg;
235 						else {
236 							*dit = *it;	++dit; ++it;
237 							*dit = *it;	++dit;
238 						}
239 					}
240 
241 				}
242 #endif
243 				++it;
244 				continue;
245 			}
246 		}
247 		else
248 		{
249 			readNo = (*it).rseqNo;// >> 1;
250 			hitCount = 0;
251 			if (options.distanceRange > 0)
252 				scoreDistCutOff = (*it).pairScore - options.distanceRange;
253 			ditBeg = dit;
254 		}
255 		*dit = *it;	++dit; ++it;
256 		*dit = *it;	++dit;
257 	}
258 	resize(matches, dit - begin(matches, Standard()));
259 }
260 
261 
262 
263 //////////////////////////////////////////////////////////////////////////////
264 // Remove low quality matches # necessary to have an own splicedmatch function?
265 template < typename TMatches, typename TCounts, typename TSpec, typename TSwiftL, typename TSwiftR >
compactSplicedMatchesPurgeAmbiguous(TMatches & matches,TCounts &,RazerSOptions<TSpec> & options,TSwiftL &,TSwiftR &)266 void compactSplicedMatchesPurgeAmbiguous(TMatches &matches, TCounts & /*cnts*/, RazerSOptions<TSpec> &options, TSwiftL &, TSwiftR &)
267 {
268 	typedef typename Value<TMatches>::Type				TMatch;
269 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
270 
271 	unsigned readNo = -1;
272 	unsigned hitCount = 0;
273 	unsigned hitCountCutOff = options.maxHits;
274 	int scoreDistCutOff = std::numeric_limits<int>::min();
275 
276 	TIterator it = begin(matches, Standard());
277 	TIterator itEnd = end(matches, Standard());
278 	TIterator dit = it;
279 	TIterator ditBeg = it;
280 
281 	// sort
282 	::std::sort(it, itEnd, LessSplicedErrors<TMatch>());
283 //	::std::sort(it, itEnd, LessSplicedScore<TMatch>());
284 	int counter = 0;
285 	for (; it != itEnd; ++it)
286 	{
287 		++counter;
288 		if ((*it).orientation == '-') continue;
289 		if (readNo == (*it).rseqNo)
290 		{
291 			if ((*it).pairScore <= scoreDistCutOff)
292 			{
293 				++it;
294 				continue;
295 			}
296 			if (++hitCount >= hitCountCutOff)
297 			{
298 				if (hitCount == hitCountCutOff)
299 					dit = ditBeg;
300 				++it;
301 				continue;
302 			}
303 		}
304 		else
305 		{
306 			readNo = (*it).rseqNo;
307 			hitCount = 0;
308 			if (options.distanceRange > 0)
309 				scoreDistCutOff = (*it).pairScore - options.distanceRange;
310 			ditBeg = dit;
311 		}
312 		*dit = *it;	++dit; ++it;
313 		*dit = *it;	++dit;
314 	}
315 	resize(matches, dit - begin(matches, Standard()));
316 }
317 
318 
319 	template <typename TReadMatch>
320 	struct LessReadNoPairErrors : public ::std::binary_function < TReadMatch, TReadMatch, bool >
321 	{
operatorLessReadNoPairErrors322 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
323 		{
324 			// read number
325 			if (a.rseqNo < b.rseqNo) return true;
326 			if (a.rseqNo > b.rseqNo) return false;
327 
328 			// quality
329 			if (a.pairScore > b.pairScore) return true;
330 			if (a.pairScore < b.pairScore) return false;
331 			if (a.pairId < b.pairId) return true;
332 			if (a.pairId > b.pairId) return false;
333 			return a.editDist < b.editDist;
334 		}
335 	};
336 
337 
338 
339 //////////////////////////////////////////////////////////////////////////////
340 // Count matches for each number of errors
341 template < typename TMatches, typename TCounts >
countSplitMatches(TMatches & matches,TCounts & cnt)342 void countSplitMatches(TMatches &matches, TCounts &cnt)
343 {
344 	typedef typename Value<TMatches>::Type					TMatch;
345 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
346 	typedef typename Value<TCounts>::Type					TRow;
347 	typedef typename Value<TRow>::Type						TValue;
348 
349 
350 	TIterator it = begin(matches, Standard());
351 	TIterator itEnd = end(matches, Standard());
352 
353 	::std::sort(it, itEnd, LessReadNoPairErrors<TMatch>());
354 
355 	unsigned readNo = -1;
356 	short editDist = -1;
357 	int64_t count = 0;
358 	int64_t maxVal = std::numeric_limits<TValue>::max();
359 
360 	for (; it != itEnd; ++it)
361 	{
362 		if ((*it).orientation == '-') continue;
363 		if (readNo == (*it).rseqNo &&
364 			-editDist == (*it).pairScore)
365 			++count;
366 		else
367 		{
368 			if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt))
369 				cnt[editDist][readNo] = (maxVal < count)? maxVal : count;
370 			readNo = (*it).rseqNo;
371 			editDist = -(*it).pairScore;
372 			count = 1;
373 		}
374 	}
375 	if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt))
376 		cnt[editDist][readNo] = count;
377 }
378 
379 
380 template<typename TAlign, typename TPosition>
381 int
countErrorsInAlign(TAlign & align,TPosition end_)382 countErrorsInAlign(TAlign & align, TPosition end_)
383 {
384 
385 	typedef typename Source<TAlign>::Type TSource;
386 	typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
387 
388 	typedef typename Row<TAlign>::Type TRow;
389 	typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
390 
391 	TAlignIterator ali_it0 = iter(row(align,0),0);
392 	TAlignIterator ali_it1 = iter(row(align,1),0);
393 	TAlignIterator ali_it0_stop = iter(row(align,0),end_);
394 	TAlignIterator ali_it1_stop = iter(row(align,1),end_);
395 
396 
397 	int errors = 0;
398 	while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
399 	{
400 		while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
401 		{
402 			if(*ali_it1 != *ali_it0)
403 				++errors;
404 			++ali_it0;
405 			++ali_it1;
406 		}
407 		while(ali_it0!=ali_it0_stop && isGap(ali_it0))
408 		{
409 			++ali_it0;
410 			++ali_it1;
411 			++errors;
412 		}
413 		while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
414 		{
415 			++ali_it0;
416 			++ali_it1;
417 			++errors;
418 		}
419 	}
420 	while(ali_it0!=ali_it0_stop)
421 	{
422 		++ali_it0;
423 		++errors;
424 	}
425 	while(ali_it1 != ali_it1_stop)
426 	{
427 		++ali_it1;
428 		++errors;
429 	}
430 
431 	return errors;
432 }
433 
434 
435 
436 //////////////////////////////////////////////////////////////////////////////
437 // Edit distance verification for longest suffix/prefix
438 template <
439 	typename TMatch,
440 	typename TGenome,
441 	typename TReadSet,
442 	typename TMyersPatterns,
443 	typename TSpec,
444 	typename TSufPrefSpec
445 >
446 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> inf,unsigned rseqNo,TReadSet & readSet,TMyersPatterns & forwardPatterns,RazerSOptions<TSpec> const & options,SwiftSemiGlobal,TSufPrefSpec)447 matchVerify(
448 	TMatch &m,							// resulting match
449 	Segment<TGenome, InfixSegment> inf,	// potential match genome region
450 	unsigned rseqNo,					// read number
451 	TReadSet &readSet,	    			// reads
452 	TMyersPatterns &forwardPatterns,	// MyersBitVector preprocessing data
453 	RazerSOptions<TSpec> const &options,// RazerS options
454 	SwiftSemiGlobal,					// Edit distance
455 	TSufPrefSpec)						// Swift specialization
456 {
457 	typedef Segment<TGenome, InfixSegment>				TGenomeInfix;
458 	typedef typename Value<TReadSet>::Type				TRead;
459 
460 	// find read match end
461 	typedef Finder<TGenomeInfix>					TMyersFinder;
462 	typedef typename Value<TMyersPatterns>::Type			TMyersPattern;
463 
464 	// find read match begin
465 	typedef ModifiedString<TGenomeInfix, ModReverse>		TGenomeInfixRev;
466 	typedef ModifiedString<TRead, ModReverse>			TReadRev;
467 	typedef Finder<TGenomeInfixRev>					TMyersFinderRev;
468 	typedef Pattern<TReadRev, MyersUkkonenGlobal>			TMyersPatternRev;
469 
470 	TMyersFinder myersFinder(inf);
471 	TMyersPattern &myersPattern = forwardPatterns[rseqNo];  //have to make sure this only contains the prefix
472 
473 #ifdef RAZERS_DEBUG
474 	::std::cout << "Verify: " << ::std::endl;
475 	::std::cout << "Genome: " << inf << "\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
476 	::std::cout << "Read:   " << readSet[rseqNo] << ::std::endl;
477 #endif
478 
479 	unsigned ndlLength = _min(sequenceLength(rseqNo, readSet),options.minMatchLen);
480 	int maxScore = std::numeric_limits<int>::min();
481 	int minScore = -(int)(ndlLength * options.errorRate);
482 	TMyersFinder maxPos;
483 
484 	// find end of best semi-global alignment
485 	while (find(myersFinder, myersPattern, minScore))
486 	{
487 		if (maxScore <= getScore(myersPattern))
488 		{
489 			maxScore = getScore(myersPattern);
490 			maxPos = myersFinder;
491 		}
492 	}
493 
494 
495 	if (maxScore < minScore)
496 		return false;
497 
498 	m.editDist	= (unsigned)-maxScore;
499 	TGenomeInfix oriInf = inf;
500 	setEndPosition(inf, m.gEnd = (beginPosition(inf) + position(maxPos) + 1));
501 
502 	// limit the beginning to needle length plus errors (== -maxScore)
503 	if (length(inf) > ndlLength - maxScore)
504 		setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore);
505 
506 	// find beginning of best semi-global alignment
507 	TGenomeInfixRev		infRev(inf);
508 	TMyersFinderRev		myersFinderRev(infRev);
509 	TReadRev			readRev;
510     TRead               readInf;
511 	if(IsSameType<TSufPrefSpec,LongestSuffix>::VALUE)
512 		readInf = infix(readSet[rseqNo],length(readSet[rseqNo])-options.minMatchLen,length(readSet[rseqNo]));
513 	else
514 		readInf = infix(readSet[rseqNo],0,options.minMatchLen);
515     setHost(readRev, readInf);
516 
517 	TMyersPatternRev	myersPatternRev(readRev);
518 
519 	_patternMatchNOfPattern(myersPatternRev, options.matchN);
520 	_patternMatchNOfFinder(myersPatternRev, options.matchN);
521 	while (find(myersFinderRev, myersPatternRev, maxScore))
522 		m.gBegin = m.gEnd - (position(myersFinderRev) + 1);
523 
524 	m.mScore = ndlLength;
525 	m.seedEditDist = m.editDist;
526 	m.gSeedLen = m.gEnd - m.gBegin;
527 
528 #ifdef RAZERS_DEBUG
529 	::std::cout << " before extendMatch " << ::std::endl;
530 	::std::cout << " match: " << ::std::endl;
531 	::std::cout << " mScore= " <<m.mScore << ::std::endl;
532 	::std::cout << " gBegin= " <<m.gBegin << ::std::endl;
533 	::std::cout << " gEnd= " <<m.gEnd << ::std::endl;
534 	::std::cout << " edit= " <<m.editDist << ::std::endl;
535 #endif
536 
537 	//TODO: give only part of read to extension!!!
538 
539 	//now extend the seed
540 	extendMatch(readSet,rseqNo,oriInf,m,options,TSufPrefSpec());
541 
542 #ifdef RAZERS_DEBUG
543 	::std::cout << " match: " << ::std::endl;
544 	::std::cout << " mScore= " <<m.mScore << ::std::endl;
545 	::std::cout << " gBegin= " <<m.gBegin << ::std::endl;
546 	::std::cout << " gEnd= " <<m.gEnd << ::std::endl;
547 	::std::cout << " edit= " <<m.editDist << ::std::endl;
548 #endif
549 
550 	return true;
551 }
552 
553 template<typename TReadSet, typename TSize, typename TInf, typename TMatch, typename TOptions>
554 void
extendMatch(TReadSet & readSet,TSize rseqNo,TInf & inf,TMatch & m,TOptions & options,LongestSuffix)555 extendMatch(TReadSet &readSet, TSize rseqNo, TInf & inf, TMatch &m, TOptions &options, LongestSuffix)
556 {
557 #ifdef RAZERS_DEBUG
558 	::std::cout << " extending match left" << ::std::endl;
559 #endif
560 
561 	unsigned lDim0 = _max(0,(int)length(readSet[rseqNo])-(int)options.minMatchLen);
562 	unsigned lDim1 = m.gBegin - beginPosition(inf);
563 	unsigned rDim0 = length(readSet[rseqNo])-1;
564 	unsigned rDim1 = m.gEnd - beginPosition(inf)-1;
565 	Seed<int,SimpleSeed> seed(lDim0, lDim1, rDim0, rDim1);
566 	Score<int> scoreMatrix(0,-1,-1,-1);
567 	int scoreDropOff = (sequenceLength(rseqNo,readSet) * options.errorRate) - m.editDist;
568 
569 #ifdef RAZERS_DEBUG
570 	::std::cout << "beginPos = " << beginPosition(inf) << std::endl;
571 	::std::cout << "endPos = " << endPosition(inf) << std::endl;
572 	::std::cout << " lDim0: " << lDim0 << ::std::endl;
573 	::std::cout << " lDim1: " << lDim1 << ::std::endl;
574 	::std::cout << " rDim0: " << rDim0 << ::std::endl;
575 	::std::cout << " rDim1: " << rDim1 << ::std::endl;
576 	::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
577 	::std::cout << " readInf: "<< infix(readSet[rseqNo],lDim0,rDim0+1) << ::std::endl;
578 	::std::cout << " gInfInf: "<< infix(inf,lDim1,rDim1+1) << ::std::endl;
579 	::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
580 	::std::cout << " gInf: "<< inf << ::std::endl;
581 #endif
582 
583 	int extScore = 0;
584 	extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, readSet[rseqNo],inf,0,GappedXDrop());
585 	m.gBegin = leftDim1(seed) + beginPosition(inf);
586 	m.mScore = rightDim0(seed) - leftDim0(seed) + 1;
587 	m.editDist -= extScore;
588 
589 #ifdef RAZERS_DEBUG
590 	::std::cout << " lDim0: " << leftDim0(seed) << ::std::endl;
591 	::std::cout << " lDim1: " << leftDim1(seed) << ::std::endl;
592 	::std::cout << " rDim0: " << rightDim0(seed) << ::std::endl;
593 	::std::cout << " rDim1: " << rightDim1(seed) << ::std::endl;
594 	::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
595 	::std::cout << " readInf: "<< infix(readSet[rseqNo],leftDim0(seed),rightDim0(seed)+1) << ::std::endl;
596 	::std::cout << " gInfInf: "<< infix(inf,leftDim1(seed),rightDim1(seed)+1) << ::std::endl;
597 	::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
598 	::std::cout << " gInf: "<< inf << ::std::endl;
599 #endif
600 }
601 
602 template<typename TReadSet, typename TSize, typename TInf, typename TMatch, typename TOptions>
603 void
extendMatch(TReadSet & readSet,TSize rseqNo,TInf & inf,TMatch & m,TOptions & options,LongestPrefix)604 extendMatch(TReadSet &readSet, TSize rseqNo, TInf & inf, TMatch &m, TOptions &options, LongestPrefix)
605 {
606 #ifdef RAZERS_DEBUG
607 	::std::cout << " extending match right" << ::std::endl;
608 #endif
609 
610 	unsigned lDim0 = 0;
611 	unsigned lDim1 = m.gBegin - beginPosition(inf);
612 	unsigned rDim0 = _min(options.minMatchLen,length(readSet[rseqNo])) - 1;
613 	unsigned rDim1 = m.gEnd - beginPosition(inf) - 1;
614 	Seed<int,SimpleSeed> seed(lDim0, lDim1, rDim0, rDim1);
615 	Score<int> scoreMatrix(0,-1,-1,-1);
616 	int scoreDropOff = (sequenceLength(rseqNo,readSet) * options.errorRate) - m.editDist;
617 
618 #ifdef RAZERS_DEBUG
619 	::std::cout << "beginPos = " << beginPosition(inf) << std::endl;
620 	::std::cout << "endPos = " << endPosition(inf) << std::endl;
621 	::std::cout << " lDim0: " << lDim0 << ::std::endl;
622 	::std::cout << " lDim1: " << lDim1 << ::std::endl;
623 	::std::cout << " rDim0: " << rDim0 << ::std::endl;
624 	::std::cout << " rDim1: " << rDim1 << ::std::endl;
625 	::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
626 	::std::cout << " readInf: "<< infix(readSet[rseqNo],lDim0,rDim0+1) << ::std::endl;
627 	::std::cout << " gInfInf: "<< infix(inf,lDim1,rDim1+1) << ::std::endl;
628 	::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
629 	::std::cout << " gInf: "<< inf << ::std::endl;
630 #endif
631 
632 	int extScore = 0;
633 	extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, readSet[rseqNo],inf,1,GappedXDrop());
634 	m.gEnd = rightDim1(seed) + 1 + beginPosition(inf);
635 	m.mScore = rightDim0(seed) - leftDim0(seed) + 1;
636 	m.editDist -= extScore;
637 
638 #ifdef RAZERS_DEBUG
639 	::std::cout << " lDim0: " << leftDim0(seed) << ::std::endl;
640 	::std::cout << " lDim1: " << leftDim1(seed) << ::std::endl;
641 	::std::cout << " rDim0: " << rightDim0(seed) << ::std::endl;
642 	::std::cout << " rDim1: " << rightDim1(seed) << ::std::endl;
643 	::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
644 	::std::cout << " readInf: "<< infix(readSet[rseqNo],leftDim0(seed),rightDim0(seed)+1) << ::std::endl;
645 	::std::cout << " gInfInf: "<< infix(inf,leftDim1(seed),rightDim1(seed)+1) << ::std::endl;
646 	::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
647 	::std::cout << " gInf: "<< inf << ::std::endl;
648 #endif
649 }
650 
651 
652 //template <
653 //	typename TMatch,
654 //	typename TGenomeSegment,
655 //	typename TReadSet,
656 //	typename TDummy,
657 //	typename TSpec >
658 //inline bool
659 //matchVerify(
660 //	TMatch &m,					// resulting match
661 //	TGenomeSegment inf,				// potential match genome region
662 //	unsigned rseqNo,				// read number
663 //	TReadSet &readSet,				// reads
664 //	TDummy&,
665 //	RazerSOptions<TSpec> const &options,		// RazerS options
666 //	LongestSuffix)				// LongestEditPrefix within errorRate
667 //{
668 //	::std::cout << "vcejsfhkjdfhksh11111\n";
669 //	if( matchVerify(m,inf,rseqNo,
670 //		readSet, false,
671 //		options,LongestHammingSuffix()))
672 //	{
673 //	//	unsigned tmp = m.gBegin;
674 //	//	m.gBegin = m.gEnd;
675 //	//	m.gEnd = tmp;
676 //		return true;
677 //	}
678 //	else return false;
679 //
680 //}
681 //
682 //
683 //template <
684 //	typename TMatch,
685 //	typename TGenomeSegment,
686 //	typename TReadSet,
687 //	typename TDummy,
688 //	typename TSpec >
689 //inline bool
690 //matchVerify(
691 //	TMatch &m,					// resulting match
692 //	TGenomeSegment inf,		// potential match genome region
693 //	unsigned rseqNo,				// read number
694 //	TReadSet &readSet,				// reads
695 //	TDummy & ,
696 //	RazerSOptions<TSpec> const &options,		// RazerS options
697 //	LongestPrefix)				// LongestEditPrefix within errorRate
698 //{
699 //	::std::cout << "vcejsfhkjdfhksh222222\n";
700 //	return matchVerify(m,inf,rseqNo,
701 //		readSet, true,
702 //		options,LongestHammingPrefix());
703 //}
704 
705 
706 template <
707 	typename TMatch,
708 	typename TGenome,
709 	typename TReadSet,
710 	typename TPattern,
711 	typename TSpec >
712 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> genomeInf,unsigned rseqNo,TReadSet & readSet,TPattern &,RazerSOptions<TSpec> const & options,SwiftSemiGlobalHamming,LongestPrefix)713 matchVerify(
714 	TMatch &m,									// resulting match
715 	Segment<TGenome,InfixSegment>  genomeInf,	// potential match genome region
716 	unsigned rseqNo,							// read number
717 	TReadSet& readSet,							// original readSet
718 	TPattern&,
719 	RazerSOptions<TSpec> const &options,		// RazerS options
720 	SwiftSemiGlobalHamming,						// HammingDistance
721 	LongestPrefix)								// LongestPrefix
722 {
723 
724 	typedef Segment<TGenome, InfixSegment>                  TGenomeInfix;
725 	typedef typename Size<TGenomeInfix>::Type               TSize;
726 	typedef typename Value<TGenomeInfix>::Type              TDna;
727 	typedef typename Position<TGenomeInfix>::Type           TPosition;
728 	typedef typename Value<TReadSet>::Type 			TRead;
729 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
730 	typedef typename Infix<TRead>::Type 			TReadInf;
731 	typedef typename Iterator<TReadInf, Standard>::Type	TReadIterator;
732 
733 	if (length(genomeInf) < options.minMatchLen) return false;
734 	TReadInf read = infix(readSet[rseqNo],0,length(readSet[rseqNo])-options.minMatchLen);
735 
736 
737 	TReadIterator ritBeg	= begin(read, Standard());
738 	TReadIterator ritEnd	= end(read, Standard());
739 	TGenomeIterator git	= begin(genomeInf, Standard());
740 	TGenomeIterator gitEnd	= end(genomeInf, Standard()) - (length(read) - 1);
741 
742 	// this is max number of errors the seed should have
743 	unsigned maxErrorsSeed = (unsigned)(options.minMatchLen * options.errorRate);
744 	unsigned maxTotalErrors = (unsigned)(length(read) * options.errorRate);
745 	unsigned minSeedErrors = maxErrorsSeed + 1;
746 	unsigned minTotalErrors = maxTotalErrors + 1;
747 	unsigned bestHitLength = 0;
748 
749 	for (; git < gitEnd; ++git)
750 	{
751 		bool hit = true;
752 		unsigned hitLength = 0;
753 		unsigned count = 0;
754 		unsigned seedErrors = 0;
755 		unsigned totalErrors = 0;
756 		TGenomeIterator g = git;
757 		for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
758 		{
759 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
760 			{
761 				if (count < options.minMatchLen)
762 				{
763 					++totalErrors;
764 					if(++seedErrors > maxErrorsSeed)
765 					{
766 						hit = false;
767 						break;
768 					}
769 				}
770 				else
771 				{
772 					if(++totalErrors > maxTotalErrors)
773 					{
774 						// we are excluding this last error position
775 						--totalErrors;
776 						break;
777 					}
778 				}
779 			}
780 			++count;
781 		}
782 		if (hit) hitLength = count;
783 		if (hitLength > bestHitLength ) //simply take the longest hit
784 		{
785 			minSeedErrors = seedErrors;
786 			minTotalErrors = totalErrors;
787 			bestHitLength = hitLength;
788 			m.gBegin = git - begin(host(genomeInf), Standard());
789 		}
790 	}
791 
792 
793 
794 	if(bestHitLength < options.minMatchLen)
795 		return false;
796 
797 	m.gEnd = m.gBegin + bestHitLength;
798 	m.mScore = bestHitLength;
799 	m.editDist = minTotalErrors;
800 
801 #ifdef RAZERS_DEBUG
802 		std::cout << "m.gBeg  =" << m.gBegin << "\n";
803 		std::cout << "m.gEnd  =" << m.gEnd << "\n";
804 		std::cout << "m.edit  =" << m.editDist << "\n";
805 		std::cout << "m.mScore=" << m.mScore << "\n\n";
806 #endif
807 
808 	return true;
809 
810 }
811 
812 
813 template <
814 	typename TMatch,
815 	typename TGenome,
816 	typename TReadSet,
817 	typename TPattern,
818 	typename TSpec >
819 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> genomeInf,unsigned rseqNo,TReadSet & readSet,TPattern &,RazerSOptions<TSpec> const & options,SwiftSemiGlobalHamming,LongestSuffix)820 matchVerify(
821 	TMatch &m,									// resulting match
822 	Segment<TGenome,InfixSegment>  genomeInf,	// potential match genome region
823 	unsigned rseqNo,							// read number
824 	TReadSet & readSet,							// original readSet
825 	TPattern&,
826 	RazerSOptions<TSpec> const &options,		// RazerS options
827 	SwiftSemiGlobalHamming,
828 	LongestSuffix)								// LongestSuffix
829 {
830 
831 	typedef Segment<TGenome, InfixSegment>                  TGenomeInfix;
832 	typedef typename Size<TGenomeInfix>::Type               TSize;
833 	typedef typename Value<TGenomeInfix>::Type              TDna;
834 	typedef typename Position<TGenomeInfix>::Type           TPosition;
835 	typedef typename Value<TReadSet>::Type 			TRead;
836 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
837 	typedef typename Infix<TRead>::Type 			TReadInf;
838 	typedef typename Iterator<TReadInf, Standard>::Type	TReadIterator;
839 
840 	if (length(genomeInf) < options.minMatchLen) return false;
841 	bool debug = false;
842 	TRead read = infix(readSet[rseqNo],options.minMatchLen,length(readSet[rseqNo]));
843 
844 
845 	if(debug)
846 	{
847 		::std::cout<< "suffixmatching\n";
848 		::std::cout << "genome=" << genomeInf << "\nread  =" << read <<"\n";
849 	}
850 
851 	TReadIterator ritEnd	= end(read, Standard())-1;
852 	TReadIterator ritBeg	= begin(read, Standard());
853 	TGenomeIterator git	= end(genomeInf, Standard())-1;
854 	TGenomeIterator gitBeg	= begin(genomeInf, Standard()) + options.minMatchLen;
855 
856 	// this is max number of errors the seed should have
857 	unsigned maxErrorsSeed = (unsigned)(options.minMatchLen * options.errorRate);
858 	unsigned maxTotalErrors = (unsigned)(length(read) * options.errorRate);
859 	unsigned minSeedErrors = maxErrorsSeed + 1;
860 	unsigned minTotalErrors = maxTotalErrors + 1;
861 	unsigned bestHitLength = 0;
862 
863 	for (; git > gitBeg; --git)
864 	{
865 		bool hit = true;
866 		unsigned hitLength = 0;
867 		unsigned count = 0;
868 		unsigned seedErrors = 0;
869 		unsigned totalErrors = 0;
870 		TGenomeIterator g = git;
871 		for(TReadIterator r = ritEnd; r >= ritBeg; --r, --g)
872 		{
873 			if(debug)::std::cout << *r << "\t" << *g << "\n";
874 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
875 			{
876 				if (count < options.minMatchLen)
877 				{
878 					++totalErrors;
879 					if(++seedErrors > maxErrorsSeed)
880 					{
881 					//	if(debug) ::std::cout << "-->no\n";
882 						hit = false;
883 						break;
884 					}
885 				}
886 				else
887 				{
888 					if(++totalErrors > maxTotalErrors)
889 					{
890 						// we are excluding this last error position
891 						--totalErrors;
892 						break;
893 					}
894 				}
895 			}
896 			++count;
897 		}
898 		if (hit) hitLength = count;
899 		if (hitLength > bestHitLength ) //simply take the longest hit
900 		{
901 			minSeedErrors = seedErrors;
902 			minTotalErrors = totalErrors;
903 			bestHitLength = hitLength;
904 			m.gEnd = git - begin(host(genomeInf), Standard()) + 1;
905 			if(debug) ::std::cout << "m.gEnd=" << m.gEnd << ::std::endl;
906 
907 		}
908 	}
909 
910 
911 	if(bestHitLength < options.minMatchLen)
912 		return false;
913 
914 
915 	m.gBegin = m.gEnd - bestHitLength;
916 	m.mScore = bestHitLength;
917 	m.editDist = minTotalErrors;
918 
919 #ifdef RAZERS_DEBUG
920 	std::cout << "m.gBeg  =" << m.gBegin << "\n";
921 	std::cout << "m.gEnd  =" << m.gEnd << "\n";
922 	std::cout << "m.edit  =" << m.editDist << "\n";
923 	std::cout << "m.mScore=" << m.mScore << "\n\n";
924 #endif
925 
926 	return true;
927 
928 }
929 
930 
931 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
932 
933 
934 
935 
936 template <typename TScore>
937 bool
findBestSplitPosition(String<Pair<TScore,int>> & maxColsL,String<Pair<TScore,int>> & maxColsR,int & rowPosL1,int rowPosL2,int & rowPosR1,int rowPosR2,int seq0Len,OrientationForward,SwiftSemiGlobal)938 findBestSplitPosition(String<Pair<TScore,int> > & maxColsL,
939 					  String<Pair<TScore,int> > & maxColsR,
940 					  int & rowPosL1,
941 					  int rowPosL2,
942 					  int & rowPosR1,
943 					  int rowPosR2,
944 					  int seq0Len,
945 					  OrientationForward,
946 					  SwiftSemiGlobal)
947 {
948 
949 #ifdef RAZERS_DEBUG
950 	::std::cout << "findBestSplitEditForward\n";
951 #endif
952 
953 	TScore maxSum = std::numeric_limits<TScore>::min();
954 	int bestL = rowPosL1;
955 	int bestR = rowPosR1;
956 	while (rowPosL1 <= rowPosL2 && rowPosR1 >= rowPosR2)
957 	{
958 		if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 > maxSum
959 			&& (maxColsL[rowPosL1].i2 + maxColsR[rowPosR1].i2 <= seq0Len))  // this is to prevent same bases from being used in both prefix and suffix match
960 		{																	// this works, because we store the FIRST bestScore in each row, i.e.
961 			maxSum = maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1;
962 			bestL = rowPosL1;
963 			bestR = rowPosR1;
964 		}
965 		++rowPosL1;
966 		--rowPosR1;
967 	}
968 	rowPosL1 = bestL;
969 	rowPosR1 = bestR;
970 
971 	return true;
972 }
973 
974 
975 
976 
977 // Edit distance match combination
978 template <typename TScore>
979 bool
findBestSplitPosition(String<Pair<TScore,int>> & maxColsL,String<Pair<TScore,int>> & maxColsR,int & rowPosL1,int rowPosL2,int & rowPosR1,int rowPosR2,int seq0Len,OrientationReverse,SwiftSemiGlobal)980 findBestSplitPosition(String<Pair<TScore,int> > & maxColsL,
981 					  String<Pair<TScore,int> > & maxColsR,
982 					  int & rowPosL1,
983 					  int rowPosL2,
984 					  int & rowPosR1,
985 					  int rowPosR2,
986 					  int seq0Len,
987 					  OrientationReverse,
988 					  SwiftSemiGlobal)
989 {
990 #ifdef RAZERS_DEBUG
991 	::std::cout << "findBestSplitEditReverse\n";
992 #endif
993 
994 	TScore maxSum = std::numeric_limits<TScore>::min();
995 	int bestL = rowPosL2;
996 	int bestR = rowPosR2;
997 
998 	while (rowPosL1 <= rowPosL2 && rowPosR1 >= rowPosR2)
999 	{
1000 		if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 >= maxSum
1001 			&& (maxColsL[rowPosL1].i2 + maxColsR[rowPosR1].i2 <= seq0Len))
1002 		{
1003 			maxSum = maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1;
1004 			bestL = rowPosL1;
1005 			bestR = rowPosR1;
1006 		}
1007 		++rowPosL1;
1008 		--rowPosR1;
1009 	}
1010 	rowPosL1 = bestL;
1011 	rowPosR1 = bestR;
1012 
1013 	return true;
1014 }
1015 
1016 
1017 
1018 
1019 
1020 
1021 template <typename TMatch, typename TRead, typename TGenome, typename TSpec>
1022 bool
combineLeftRight(TMatch & mR,TMatch & mL,TRead & read,TGenome & genome,RazerSOptions<TSpec> & options,char orientation,SwiftSemiGlobal)1023 combineLeftRight(TMatch & mR,
1024 		 TMatch & mL,
1025 		 TRead & read,
1026 		 TGenome & genome,
1027 		 RazerSOptions<TSpec> &options,
1028 		 char orientation,
1029 		 SwiftSemiGlobal)
1030 {
1031 
1032 #ifdef RAZERS_DEBUG
1033 	::std::cout << "combinLeftRightEdit\n";
1034 #endif
1035 	Score<int> scoreType(0,-1,-1,-1);
1036 	typedef typename Infix<TGenome>::Type TGenomeInf;
1037 	typedef ModifiedString<TGenomeInf,ModReverse> TGenomeInfRev;
1038 
1039 	TGenomeInf genomeInfL = infix(genome, mL.gBegin+mL.gSeedLen, mL.gEnd);
1040 	TGenomeInf readInfL = infix(read,options.minMatchLen,mL.mScore);
1041 	TGenomeInfRev genomeInfR(infix(genome, mR.gBegin, mR.gEnd-mR.gSeedLen));
1042 	TGenomeInfRev readInfR(infix(read,length(read)-mR.mScore,length(read)-options.minMatchLen));
1043 
1044 #ifdef RAZERS_DEBUG
1045 	bool debug = true;
1046 	std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1047 	std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1048 	std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1049 	std::cout << "mL.editDist =" << mL.editDist << "\n";
1050 
1051 	std::cout << "mR.mScore =" << mR.mScore << "\n";
1052 	std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1053 	std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1054 	std::cout << "mR.editDist =" << mR.editDist << "\n";
1055 #endif
1056 
1057 	int readLength = length(read);
1058 	unsigned maxErrors = readLength * options.errorRate;
1059 
1060 	// neither insertion nor deletion
1061 	if(mR.gEnd - mL.gBegin == readLength)
1062 	{
1063 #ifdef RAZERS_DEBUG
1064 		::std::cout << "normal\n";
1065 #endif
1066 		if(true)return false;
1067 		unsigned halfReadLen = readLength >> 1;
1068 
1069 		Align<String<Dna5>, ArrayGaps> align;
1070 		assignSource(row(align, 0), infix(read,0,readLength));
1071 		assignSource(row(align, 1), infix(genome, mL.gBegin, mR.gEnd));
1072 		int sc = globalAlignment(align, scoreType, AlignConfig<false,false,false,false>(), NeedlemanWunsch());
1073 		if(-sc > (int)maxErrors) return false;
1074 
1075 		mL.gEnd = mL.gBegin + toSourcePosition(row(align, 1),toViewPosition(row(align, 0), halfReadLen-1));
1076 		mR.gBegin = mL.gEnd;
1077 		mL.mScore = halfReadLen;
1078 		mR.mScore = readLength - mL.mScore;
1079 		mL.editDist = countErrorsInAlign(align,toViewPosition(row(align, 0), halfReadLen));
1080 		mR.editDist = -sc-mL.editDist;
1081 
1082 #ifdef RAZERS_DEBUG
1083 		std::cout << "mL.mScore =" << mL.mScore << "\n";
1084 		std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1085 		std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1086 		std::cout << "mL.editDist =" << mL.editDist << "\n";
1087 
1088 		std::cout << "mR.mScore =" << mR.mScore << "\n";
1089 		std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1090 		std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1091 		std::cout << "mR.editDist =" << mR.editDist << "\n";
1092 #endif
1093 
1094 	}
1095 
1096 	//potential insertion
1097 	if(mR.gEnd - mL.gBegin < readLength)
1098 	{
1099 #ifdef RAZERS_DEBUG
1100 		::std::cout << "insertion\n";
1101 #endif
1102 		if(mR.gEnd - mL.gBegin < 2*options.minMatchLen)  //too close together // actually minus allowed seed errors
1103 			return false;
1104 
1105 		if(mL.gEnd < mR.gBegin)  //prefix and suffix match do not meet
1106 			return false;
1107 
1108 		if(mR.mScore + mL.mScore == mR.gEnd - mL.gBegin)
1109 			if(mL.gEnd == mR.gBegin)
1110 				if(mR.editDist + mL.editDist <= maxErrors ) //prefix and suffix match meet and do not overlap --> perfect
1111 					return true;
1112 
1113 //		if((mR.gEnd - mL.gBegin <= mL.mScore) || (mR.gEnd - mL.gBegin <= mR.mScore))//too close together // heuristic way to kick out repeat matches early on
1114 //			return false;
1115 
1116 		int diag1L = -maxErrors + mL.seedEditDist;
1117 		int diag2L = maxErrors - mL.seedEditDist;
1118 		int diag1R = -maxErrors + mR.seedEditDist;
1119 		int diag2R = maxErrors - mR.seedEditDist;
1120  		int minColNum = 0;
1121 
1122 		// genomeInf is the shorter sequence --> find best split position on genome
1123 		// rows in alignment matrix represent genome position
1124 		StringSet<TGenomeInf,Dependent<> > strSetL;
1125 		appendValue(strSetL,readInfL);
1126 		appendValue(strSetL,genomeInfL);
1127 		Graph<Alignment<StringSet<TGenomeInf,Dependent<> >, void> > alignL(strSetL);
1128 		String<Pair<int,int> > maxColsL;
1129 		_globalAlignment(alignL,strSetL,scoreType,AlignConfig<false,false,false,false>(),diag1L,diag2L,maxColsL,minColNum,BandedNeedlemanWunsch());
1130 
1131 		StringSet<TGenomeInfRev,Dependent<> > strSetR;
1132 		appendValue(strSetR,readInfR);
1133 		appendValue(strSetR,genomeInfR);
1134 		Graph<Alignment<StringSet<TGenomeInfRev,Dependent<> >, void > > alignR(strSetR);
1135 		String<Pair<int,int> > maxColsR;
1136 		_globalAlignment(alignR,strSetR,scoreType,AlignConfig<false,false,false,false>(),diag1R,diag2R,maxColsR,minColNum,BandedNeedlemanWunsch());
1137 
1138 		//::std::cout << alignL;
1139 		//::std::cout << alignR;
1140 
1141 		// our begin and start positions are defined by the read positions
1142 		// go from read source to view position to genome source position
1143 		int rowPosL1 = 0;
1144 		if (mL.gSeedLen < (int)mR.gBegin - mL.gBegin) rowPosL1 = mR.gBegin - mL.gBegin - mL.gSeedLen;//or from first possible overlap pos
1145 		int rowPosR2 = 0;
1146 		if (mR.gSeedLen < (int)mR.gEnd - mL.gEnd) rowPosR2 = mR.gEnd - mL.gEnd - mR.gSeedLen;
1147 
1148 		int rowPosL2 = mR.gEnd - mR.gSeedLen - rowPosR2 - mL.gBegin - mL.gSeedLen;
1149 		int rowPosR1 = mR.gEnd - mR.gSeedLen - rowPosL1 - mL.gBegin - mL.gSeedLen;
1150 
1151 
1152 #ifdef RAZERS_DEBUG
1153 		::std::cout << "vorher\nrowPosL1=" << rowPosL1 << ::std::endl;
1154 		::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1155 		::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1156 		::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1157 #endif
1158 
1159 		// compute best split position
1160 		int seq0Len = readLength;
1161 		if(orientation == 'R')
1162 			findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,OrientationReverse(),SwiftSemiGlobal());
1163 		else
1164 			findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,OrientationForward(),SwiftSemiGlobal());
1165 
1166 		mL.editDist = mR.seedEditDist - maxColsL[rowPosL1].i1;	//scores are negative
1167 		mR.editDist = mL.seedEditDist - maxColsR[rowPosR1].i1;
1168 		if(mR.editDist + mL.editDist > maxErrors )
1169 			return false;
1170 
1171 #ifdef RAZERS_DEBUG
1172 		::std::cout << "nachher\nrowPosL1=" << rowPosL1 << ::std::endl;
1173 		::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1174 		::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1175 		::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1176 #endif
1177 
1178 		// best split position in genome is saved in rowPosL1 (and rowPosR1)
1179 		mL.mScore = options.minMatchLen + maxColsL[rowPosL1].i2;
1180 		mL.gEnd = mL.gBegin + mL.gSeedLen + rowPosL1; //read position of best genomic split
1181 
1182 		mR.mScore = options.minMatchLen + maxColsR[rowPosR1].i2;
1183 		mR.gBegin = mR.gEnd - mR.gSeedLen - rowPosR1; //read position of best genomic split
1184 
1185 #ifdef RAZERS_DEBUG
1186 		if(mL.editDist > 50 || mR.mScore < options.minMatchLen || mL.mScore < options.minMatchLen)
1187 		{
1188 			::std::cout << "-maxColsL[rowPosL1].i1=" << -maxColsL[rowPosL1].i1 << " -maxColsL[rowPosL1].i2=" << -maxColsL[rowPosL1].i2 << " rowPosL1="<<rowPosL1;
1189 			::std::cout << " maxColsLLen="<< length(maxColsL) << ::std::endl;
1190 			::std::cout << "-maxColsR[rowPosR1].i1=" << -maxColsR[rowPosR1].i1 << "-maxColsR[rowPosR1].i2=" << -maxColsR[rowPosR1].i2 << " rowPosR1="<<rowPosR1;
1191 			::std::cout << " maxColsRLen="<< length(maxColsR) << ::std::endl;
1192 		}
1193 #endif
1194 	}
1195 
1196 	//potential deletion
1197 	if(mR.gEnd - mL.gBegin > readLength)
1198 	{
1199 #ifdef RAZERS_DEBUG
1200 		::std::cout << "deletion\n";
1201 #endif
1202 		if(mR.mScore + mL.mScore < readLength)
1203 			return false;
1204 		if(mR.mScore + mL.mScore == readLength && mR.editDist + mL.editDist > maxErrors) //the prefix and suffix match meet, but too many errors
1205 			return false;
1206 
1207 		int diag1L = -maxErrors + mL.seedEditDist;
1208 		int diag2L = maxErrors - mL.seedEditDist;
1209 		int diag1R = -maxErrors + mR.seedEditDist;
1210 		int diag2R = maxErrors - mR.seedEditDist;
1211 		int minColNum = 0;
1212 
1213 		// readInf is the shorter sequence --> find best split position on read
1214 		// rows in alignment matrix represent read position
1215 		StringSet<TGenomeInf> strL;
1216 		appendValue(strL,genomeInfL);
1217 		appendValue(strL,readInfL);
1218 		String<Pair<int,int> > maxColsL;
1219 		_globalAlignment(strL,scoreType,AlignConfig<false,false,false,false>(),diag1L,diag2L,maxColsL,minColNum,BandedNeedlemanWunsch());
1220 
1221 		StringSet<TGenomeInfRev> strR;
1222 		appendValue(strR,genomeInfR);
1223 		appendValue(strR,readInfR);
1224 		String<Pair<int,int> > maxColsR;
1225 		_globalAlignment(strR,scoreType,AlignConfig<false,false,false,false>(),diag1R,diag2R,maxColsR,minColNum,BandedNeedlemanWunsch());
1226 
1227 		int rowPosL1 = ((int)options.minMatchLen > readLength-mR.mScore) ? (int)0 : readLength-mR.mScore-options.minMatchLen;
1228 		int rowPosL2 = (readLength-(int)options.minMatchLen < mL.mScore) ? readLength-(int)2*options.minMatchLen : mL.mScore - options.minMatchLen;
1229 		int rowPosR1 = (int)readLength - 2*options.minMatchLen - rowPosL1;
1230 		int rowPosR2 = (int)readLength - 2*options.minMatchLen - rowPosL2;
1231 
1232 #ifdef RAZERS_DEBUG
1233 		::std::cout << "rowPosL1=" << rowPosL1 << ::std::endl;
1234 		::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1235 		::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1236 		::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1237 
1238 		std::cout << "before split:\nmL.mScore =" << mL.mScore << "\n";
1239 		std::cout << "mL.gBegin =" << length(genome)-mL.gBegin << "\n";
1240 		std::cout << "mL.gEnd =" << length(genome)-mL.gEnd << "\n";
1241 		std::cout << "mL.editDist =" << mL.editDist << "\n";
1242 
1243 		std::cout << "mR.mScore =" << mR.mScore << "\n";
1244 		std::cout << "mR.gBegin =" << length(genome)-mR.gBegin << "\n";
1245 		std::cout << "mR.gEnd =" << length(genome)-mR.gEnd << "\n";
1246 		std::cout << "mR.editDist =" << mR.editDist << "\n";
1247 #endif
1248 
1249 		// compute best split position
1250 		int seq0Len = mR.gEnd - mL.gBegin;
1251 		if(orientation == 'R')
1252 			findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,OrientationReverse(),SwiftSemiGlobal());
1253 		else
1254 			findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,OrientationForward(),SwiftSemiGlobal());
1255 
1256 
1257 		// best split position in read is saved in rowPosL1 (and rowPosR1)
1258 		mL.editDist = mL.seedEditDist - maxColsL[rowPosL1].i1;
1259 		mR.editDist = mR.seedEditDist - maxColsR[rowPosR1].i1;
1260 
1261 
1262 		if(mR.editDist + mL.editDist > maxErrors)
1263 			return false;
1264 
1265 		mR.mScore = options.minMatchLen + rowPosR1;
1266 		mR.gBegin = mR.gEnd - maxColsR[rowPosR1].i2 - mR.gSeedLen; //genomic position of best read split
1267 		mL.mScore = options.minMatchLen + rowPosL1;
1268 		mL.gEnd = mL.gBegin + maxColsL[rowPosL1].i2 + mL.gSeedLen; //genomic position of best read split
1269 
1270 	}
1271 
1272 	return true;
1273 }
1274 
1275 
1276 
1277 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1278 // Hamming distance match combination
1279 
1280 template <typename TMatch, typename TOriRead, typename TGenome, typename TSpec>
1281 bool
combineLeftRight(TMatch & mR,TMatch & mL,TOriRead const & oriRead,TGenome & genome,RazerSOptions<TSpec> & options,char orientation,SwiftSemiGlobalHamming)1282 combineLeftRight(TMatch & mR,
1283 		 TMatch & mL,
1284 		 TOriRead const& oriRead,
1285 		 TGenome & genome,
1286 		 RazerSOptions<TSpec> &options,
1287 		 char orientation,
1288 		 SwiftSemiGlobalHamming)
1289 {
1290 
1291 #ifdef RAZERS_DEBUG
1292 	::std::cout << "combineLeftRightHamming\n";
1293 #endif
1294 
1295 
1296 	typedef typename Infix<TOriRead const>::Type TRead;
1297 	TRead read = infix(oriRead,0,length(oriRead));
1298 
1299 	typedef typename Infix<TGenome>::Type TGenomeInf;
1300 	TGenomeInf genomeInf = infix(genome, mL.gBegin, mR.gEnd);
1301 	int readLength = length(read);
1302 	unsigned maxErrors = readLength * options.errorRate;
1303 
1304 
1305 #ifdef RAZERS_DEBUG
1306 	std::cout << "readLength=" << readLength << "\n";
1307 	std::cout << "sumLen=" << mL.mScore + mR.mScore << "\n";
1308 	std::cout << "gInfLength=" << length(genomeInf) << "\n";
1309 	std::cout << "gInf=" << genomeInf << "\n";
1310 
1311 	std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1312 	std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1313 	std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1314 	std::cout << "mL.editDist =" << mL.editDist << "\n";
1315 	std::cout << "readPref=" << prefix(oriRead,mL.mScore) << "\n";
1316 
1317 	std::cout << "mR.mScore =" << mR.mScore << "\n";
1318 	std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1319 	std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1320 	std::cout << "mR.editDist =" << mR.editDist << "\n";
1321 	std::cout << "readSuff=" << suffix(oriRead,length(oriRead)-mR.mScore) << "\n";
1322 #endif
1323 
1324 
1325 	// neither insertion nor deletion
1326 	if(mR.gEnd - mL.gBegin == readLength)
1327 	{
1328 #ifdef RAZERS_DEBUG
1329 		::std::cout << "normal\n";
1330 #endif
1331 		unsigned halfReadLen = readLength >> 1;
1332 		unsigned prefixErrors = 0;
1333 		unsigned suffixErrors = 0;
1334 		for (unsigned i = 0 ; i < length(genomeInf); ++i)
1335 		{
1336 			if ((options.compMask[ordValue(read[i])] & options.compMask[ordValue(genomeInf[i])]) == 0)
1337 			{
1338 				if(i >= halfReadLen)
1339 					++suffixErrors;
1340 				else
1341 					++prefixErrors;
1342 			}
1343 		}
1344 
1345 		if (suffixErrors+prefixErrors <= maxErrors)
1346 		{
1347 			mL.mScore = halfReadLen;
1348 			mR.mScore = length(read)- halfReadLen;
1349 			mR.gBegin = mR.gEnd - mR.mScore;
1350 			mL.gEnd = mL.gBegin + mL.mScore;
1351 			mL.editDist = prefixErrors;
1352 			mR.editDist = suffixErrors;
1353 		}
1354 		else return false;
1355 
1356 	}
1357 	//potential insertion
1358 	if(mR.gEnd - mL.gBegin < readLength)
1359 	{
1360 #ifdef RAZERS_DEBUG
1361 		::std::cout << "insertion\n";
1362 #endif
1363 
1364 		if(mR.gEnd - mL.gBegin < 2*options.minMatchLen)//too close together
1365 			 return false;
1366 
1367 		if(mR.mScore + mL.mScore < mR.gEnd - mL.gBegin) //prefix and suffix match do not meet
1368 			return false;
1369 
1370 		if((mR.mScore + mL.mScore == mR.gEnd - mL.gBegin) && (mR.editDist + mL.editDist > maxErrors)) //prefix and suffix match meet but too many errors
1371 			return false;
1372 
1373 //		if((mR.gEnd - mL.gBegin <= mL.mScore) || (mR.gEnd - mL.gBegin <= mR.mScore))//too close together
1374 //			 return false;
1375 
1376 		bool result = findBestSplitPosition(read,genomeInf,mL.mScore,mR.mScore,mL.editDist,mR.editDist, options, orientation, SwiftSemiGlobalHamming());
1377 		if(!result || mR.editDist + mL.editDist > maxErrors) return false;
1378 		mR.gBegin = mR.gEnd - mR.mScore;
1379 		mL.gEnd = mL.gBegin + mL.mScore;
1380 
1381 	}
1382 	//potential deletion
1383 	if(mR.gEnd - mL.gBegin > readLength)
1384 	{
1385 #ifdef RAZERS_DEBUG
1386 		::std::cout << "deletion\n";
1387 #endif
1388 
1389 		if(mR.mScore + mL.mScore < readLength)
1390 			return false;
1391 
1392 		if((mR.mScore + mL.mScore == readLength) && (mR.editDist + mL.editDist > maxErrors)) //the prefix and suffix match meet and do not overlap --> perfect
1393 			return false;
1394 
1395 		bool result = findBestSplitPosition(genomeInf,read,mL.mScore,mR.mScore,mL.editDist,mR.editDist, options, orientation,SwiftSemiGlobalHamming());
1396 
1397 		if(!result || mR.editDist + mL.editDist > maxErrors) return false;
1398 
1399 		mR.gBegin = mR.gEnd - mR.mScore;
1400 		mL.gEnd = mL.gBegin + mL.mScore;
1401 
1402 	}
1403 #ifdef RAZERS_DEBUG
1404 	std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1405 	std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1406 	std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1407 	std::cout << "mL.editDist =" << mL.editDist << "\n";
1408 
1409 	std::cout << "mR.mScore =" << mR.mScore << "\n";
1410 	std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1411 	std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1412 	std::cout << "mR.editDist =" << mR.editDist << "\n";
1413 #endif
1414 
1415 
1416 	return true;
1417 }
1418 
1419 
1420 // find the best split position for a split match
1421 // positions are relative to shorter sequence
1422 // (deletion --> read is shorter, insertion --> genomeInf is shorter)
1423 template <typename TSize, typename TLongerSegment, typename TShorterSegment, typename TErrors, typename TOptions>
1424 bool
findBestSplitPosition(TLongerSegment & longSeg,TShorterSegment & shortSeg,TSize & mLmScore,TSize & mRmScore,TErrors & errorsLeft,TErrors & errorsRight,TOptions & options,char orientation,SwiftSemiGlobalHamming)1425 findBestSplitPosition(TLongerSegment &longSeg,
1426 			TShorterSegment &shortSeg,
1427 			TSize & mLmScore,
1428 			TSize & mRmScore,
1429 			TErrors & errorsLeft,
1430 			TErrors & errorsRight,
1431 			TOptions & options,
1432 			char orientation,
1433 			SwiftSemiGlobalHamming)
1434 {
1435 
1436 #ifdef RAZERS_DEBUG
1437 	::std::cout << "findBestSplitHamming"<<orientation<<"\n";
1438 #endif
1439 
1440 	// usually, both types should be the same, but you never know...
1441 	typedef typename Iterator<TLongerSegment const>::Type TLongIterator;
1442 	typedef typename Iterator<TShorterSegment const>::Type TShortIterator;
1443 	typedef typename Size<TShorterSegment const>::Type TShortSize;
1444 	typedef typename Size<TLongerSegment const>::Type TLongSize;
1445 
1446 
1447 	TShortSize shortLen = length(shortSeg);
1448 	TLongSize  longLen  = length(longSeg);
1449 	TLongSize  lenDiff  = longLen - shortLen;
1450 
1451 	TLongSize leftLongBegin = _max((int)options.minMatchLen,(int)shortLen-mRmScore);
1452 	TLongSize leftLongEnd = _min((int)mLmScore,(int)shortLen-(int)options.minMatchLen);
1453 	TLongSize leftLongPos = leftLongBegin;
1454 
1455 	TLongSize rightLongPos = leftLongBegin + lenDiff;
1456 	TShortSize shortPos = leftLongBegin;
1457 
1458 	int bestSumErrors = 0;
1459 	int bestLErrors = 0;
1460 	int bestRErrors = 0;
1461 	int bestPos = shortPos;
1462 	int errorsL = 0;
1463 	int errorsR = 0;
1464 	int errorsPosL = 0;
1465 	int errorsPosR = 0;
1466 
1467 #ifdef RAZERS_DEBUG
1468 	std::cout << "before\nerrorsLeft= " << errorsLeft << std::endl;
1469 	std::cout << "errorsRight= " << errorsRight << std::endl;
1470 	std::cout << "mLmScore= " << mLmScore << std::endl;
1471 	std::cout << "mRmScore= " << mRmScore << std::endl;
1472 
1473 	std::cout << "leftLongPos " << leftLongPos << std::endl;
1474 	std::cout << "leftLongEnd " << leftLongEnd << std::endl;
1475 	std::cout << "rightLongPos " << rightLongPos << std::endl;
1476 	std::cout << "shortPos " << shortPos << std::endl;
1477 
1478 	std::cout << longSeg << "=longSeg\n";
1479 	std::cout << shortSeg << "=shortSeg\n";
1480 #endif
1481 
1482 	//find best split position --> min. sum errors
1483 	while(leftLongPos < leftLongEnd)
1484 	{
1485 		if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[leftLongPos])]) == 0)
1486 		{
1487 			++errorsL;
1488 			++errorsPosL;
1489 		}
1490 		if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[rightLongPos])]) == 0)
1491 		{
1492 			--errorsPosR;
1493 			++errorsR;
1494 		}
1495 		if(errorsPosL+errorsPosR < bestSumErrors
1496 			|| (orientation == 'R' && errorsPosL+errorsPosR == bestSumErrors))
1497 		{
1498 			bestSumErrors = errorsPosL+errorsPosR;
1499 			bestLErrors = errorsPosL;
1500 			bestRErrors = errorsPosR;
1501 			bestPos = shortPos + 1;
1502 		}
1503 		++leftLongPos;
1504 		++rightLongPos;
1505 		++shortPos;
1506 	}
1507 
1508 	//update to new match lengths
1509 	mLmScore = bestPos;
1510 	mRmScore = shortLen - bestPos;
1511 
1512 	//count numErrors for left and right match
1513 	//(have to count completely new, because mScore may be longer than shortLen --> no able to track errors outside segment)
1514 	errorsRight = errorsLeft = 0;
1515 	for(leftLongPos = 0, shortPos = 0; leftLongPos < (unsigned)mLmScore; ++leftLongPos, ++shortPos)
1516 	{
1517 		if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[leftLongPos])]) == (unsigned) 0)
1518 			++errorsLeft;
1519 
1520 	}
1521 	for(rightLongPos = 0, shortPos = 0; rightLongPos < (unsigned)mRmScore; ++rightLongPos, ++shortPos)
1522 	{
1523 		if((options.compMask[ordValue(shortSeg[shortLen-1-shortPos])] & options.compMask[ordValue(longSeg[longLen-1-rightLongPos])]) ==  (unsigned) 0)
1524 			++errorsRight;
1525 
1526 	}
1527 
1528 #ifdef RAZERS_DEBUG
1529 	std::cout << "bestSumErrors= " << bestSumErrors << std::endl;
1530 	std::cout << "errorsPosR= " << errorsPosR << std::endl;
1531 	std::cout << "errorsPosL= " << errorsPosL << std::endl;
1532 	std::cout << "after\nerrorsLeft= " << errorsLeft << std::endl;
1533 	std::cout << "errorsRight= " << errorsRight << std::endl;
1534 	std::cout << "mLmScore= " << mLmScore << std::endl;
1535 	std::cout << "mRmScore= " << mRmScore << std::endl;
1536 #endif
1537 
1538 	return true;
1539 }
1540 
1541 
1542 
1543 
1544 
1545 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1546 // Find read matches in many genome sequences (import from Fasta)
1547 template <
1548 	typename TMatches,
1549 	typename TReadSet_,
1550 	typename TCounts,
1551 	typename TSpec,
1552 	typename TShape,
1553 	typename TSwiftSpec >
mapSplicedReads(TMatches & matches,StringSet<CharString> & genomeFileNameList,StringSet<CharString> & genomeNames,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReadSet_ & readSet,TCounts & cnts,RazerSOptions<TSpec> & options,TShape const & shape,Swift<TSwiftSpec> const)1554 int mapSplicedReads(
1555 	TMatches &		matches,
1556 	StringSet<CharString> &	genomeFileNameList,
1557 	StringSet<CharString> &	genomeNames,	// genome names, taken from the Fasta file
1558 	::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
1559 	TReadSet_ & 		readSet,
1560 	TCounts &		cnts,
1561 	RazerSOptions<TSpec> &	options,
1562 	TShape const &		shape,
1563 	Swift<TSwiftSpec> const)
1564 {
1565 
1566 
1567 	typedef typename Value<TReadSet_>::Type								TRead;
1568 	typedef StringSet<typename Infix<TRead>::Type>						TReadSet;
1569 	typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> >		TIndex;			// q-gram index
1570 	typedef Pattern<TIndex, Swift<TSwiftSpec> >							TSwiftPattern;	// filter
1571 	typedef Pattern<TRead, MyersUkkonen>								TMyersPattern;	// verifier
1572 
1573 
1574 	// split reads over two indices, one for prefixes, the other for suffixes
1575 	TReadSet readSetL, readSetR;
1576 	unsigned readCount = length(readSet);
1577 	resize(readSetL, readCount);
1578 	resize(readSetR, readCount);
1579 
1580 	if(options._debugLevel > 0)
1581 	{
1582 		int64_t genomeLen = 3000000000 * 2;					// ufff make that an option
1583 		expNumRandomMatches(readSet, genomeLen, options);
1584 	}
1585 
1586 	if(options._debugLevel > 0 )
1587 		std::cout << "Performing spliced mapping.\n";
1588 	for (unsigned i = 0; i < readCount; ++i)
1589 	{
1590 		if(length(readSet[i])>=2*options.minMatchLen)
1591 		{
1592 			assign(readSetL[i], infix(readSet[i],0,options.minMatchLen), Exact());
1593 			assign(readSetR[i], infix(readSet[i],length(readSet[i])-options.minMatchLen,length(readSet[i])), Exact());
1594 		}
1595 		else
1596 		{
1597 			assign(readSetL[i], infix(readSet[i],0,0), Exact());
1598 			assign(readSetR[i], infix(readSet[i],0,0), Exact());
1599 		}
1600 	}
1601 
1602 
1603 	if(options._debugLevel > 1)::std::cout << "Make index left right\n";
1604 	// configure q-gram index
1605 	TIndex swiftIndexL(readSetL, shape);
1606 	TIndex swiftIndexR(readSetR, shape);
1607 
1608 #ifdef RAZERS_OPENADDRESSING
1609 	swiftIndexL.alpha = 2;
1610 	swiftIndexR.alpha = 2;
1611 #endif
1612 
1613 	cargo(swiftIndexL).abundanceCut = options.abundanceCut;
1614 	cargo(swiftIndexR).abundanceCut = options.abundanceCut;
1615 	cargo(swiftIndexL)._debugLevel = 0;
1616 	cargo(swiftIndexR)._debugLevel = options._debugLevel;
1617 
1618 	// configure Swift
1619 	TSwiftPattern swiftPatternL(swiftIndexL);
1620 	TSwiftPattern swiftPatternR(swiftIndexR);
1621 	swiftPatternL.params.minThreshold = options.threshold;
1622 	swiftPatternR.params.minThreshold = options.threshold;
1623 	swiftPatternL.params.tabooLength = options.tabooLength;
1624 	swiftPatternR.params.tabooLength = options.tabooLength;
1625 
1626 	// init edit distance verifiers
1627 	String<TMyersPattern> forwardPatternsL;
1628 	String<TMyersPattern> forwardPatternsR;
1629 	options.compMask[4] = (options.matchN)? 15: 0;
1630 	if (!options.hammingOnly)
1631 	{
1632 		resize(forwardPatternsL, readCount, Exact());
1633 		resize(forwardPatternsR, readCount, Exact());
1634 		for(unsigned i = 0; i < readCount; ++i)
1635 		{
1636 			setHost(forwardPatternsL[i], indexText(swiftIndexL)[i]);
1637 			setHost(forwardPatternsR[i], indexText(swiftIndexR)[i]);
1638 			_patternMatchNOfPattern(forwardPatternsL[i], options.matchN);
1639 			_patternMatchNOfPattern(forwardPatternsR[i], options.matchN);
1640 			_patternMatchNOfFinder(forwardPatternsL[i], options.matchN);
1641 			_patternMatchNOfFinder(forwardPatternsR[i], options.matchN);
1642 		}
1643 	}
1644 
1645 	if(options._debugLevel > 1)::std::cout << "Patterns created\n";
1646 
1647 	// clear stats
1648 	options.FP = 0;
1649 	options.TP = 0;
1650 	options.timeMapReads = 0;
1651 	options.timeDumpResults = 0;
1652 
1653 	unsigned filecount = 0;
1654 	unsigned numFiles = length(genomeFileNameList);
1655 	unsigned gseqNo = 0;
1656 
1657 	// open genome files, one by one
1658 	while (filecount < numFiles)
1659 	{
1660 		// open genome file
1661 		::std::ifstream file;
1662 		file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary);
1663 		if (!file.is_open())
1664 			return RAZERS_GENOME_FAILED;
1665 
1666 		// remove the directory prefix of current genome file
1667 		::std::string genomeFile(toCString(genomeFileNameList[filecount]));
1668 		size_t lastPos = genomeFile.find_last_of('/') + 1;
1669 		if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1;
1670 		if (lastPos == genomeFile.npos) lastPos = 0;
1671 		::std::string genomeName = genomeFile.substr(lastPos);
1672 
1673 		CharString	id;
1674 		//Dna5String	genome;
1675 		String<Dna5Q> genome;
1676 		unsigned gseqNoWithinFile = 0;
1677 		SEQAN_PROTIMESTART(find_time);
1678 
1679 		// iterate over genome sequences
1680 		for(; !_streamEOF(file); ++gseqNo)
1681 		{
1682 			if (options.genomeNaming == 0)
1683 			{
1684 				readShortID(file, id, Fasta());			// read Fasta id up to first whitespace
1685 				appendValue(genomeNames, id, Generous());
1686 			}
1687 			read(file, genome, Fasta());			// read Fasta sequence
1688 
1689 			gnoToFileMap.insert(::std::make_pair(gseqNo,::std::make_pair(genomeName,gseqNoWithinFile)));
1690 
1691 			if (options.forward)
1692 				mapSplicedReads(matches, genome, gseqNo, readSet, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'F', options);
1693 
1694 			if (options.reverse)
1695 			{
1696 				reverseComplement(genome);
1697 				mapSplicedReads(matches, genome, gseqNo, readSet, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'R', options);
1698 			}
1699 			++gseqNoWithinFile;
1700 
1701 		}
1702 		options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
1703 		file.close();
1704 		++filecount;
1705 	}
1706 
1707 	compactSplicedMatches(matches, cnts, options, false, swiftPatternL, swiftPatternR);
1708 
1709 	if (options._debugLevel >= 1)
1710 		::std::cerr << ::std::endl << "Finding reads took               \t" << options.timeMapReads << " seconds" << ::std::endl;
1711 	if (options._debugLevel >= 2) {
1712 		::std::cerr << ::std::endl;
1713 		::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
1714 		::std::cerr << "Swift FP: " << options.FP << ::std::endl;
1715 		::std::cerr << "Swift TP: " << options.TP << ::std::endl;
1716 	}
1717 	return 0;
1718 }
1719 
1720 
1721 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1722 // Find read matches in one genome sequence
1723 template <
1724 	typename TMatches,
1725 	typename TGenome,
1726 	typename TOriReadSet,
1727 	typename TReadIndex,
1728 	typename TSwiftSpec,
1729 	typename TVerifier,
1730 	typename TCounts,
1731 	typename TSpec >
mapSplicedReads(TMatches & matches,TGenome & genome,unsigned gseqNo,TOriReadSet & readSet,Pattern<TReadIndex,Swift<TSwiftSpec>> & swiftPatternL,Pattern<TReadIndex,Swift<TSwiftSpec>> & swiftPatternR,TVerifier & forwardPatternsL,TVerifier & forwardPatternsR,TCounts & cnts,char orientation,RazerSOptions<TSpec> & options)1732 void mapSplicedReads(
1733 	TMatches &matches,				// resulting matches
1734 	TGenome &genome,				// genome ...
1735 	unsigned gseqNo,				// ... and its sequence number
1736 	TOriReadSet &readSet,			// reads
1737 	Pattern<TReadIndex, Swift<TSwiftSpec> > &swiftPatternL,	// left index
1738 	Pattern<TReadIndex, Swift<TSwiftSpec> > &swiftPatternR,	// right index
1739 	TVerifier &forwardPatternsL,
1740 	TVerifier &forwardPatternsR,
1741 	TCounts & cnts,
1742 	char orientation,
1743 	RazerSOptions<TSpec> &options)
1744 {
1745 	typedef typename Value<TOriReadSet>::Type TRead;
1746 	typedef typename Fibre<TReadIndex, FibreText>::Type	TReadSet;
1747 	typedef typename Size<TGenome>::Type			TSize;
1748 	typedef typename Position<TGenome>::Type		TGPos;
1749 	typedef typename MakeSigned_<TGPos>::Type		TSignedGPos;
1750 	typedef typename Value<TMatches>::Type			TMatch;
1751 	typedef typename Infix<TGenome>::Type			TGenomeInf;
1752 
1753 	// Prefix-Suffix filtration
1754 	typedef Finder<TGenome, Swift<TSwiftSpec> >		TSwiftFinderL;
1755 	typedef Finder<TGenomeInf, Swift<TSwiftSpec> >	TSwiftFinderR;
1756 	typedef Pattern<TReadIndex, Swift<TSwiftSpec> >	TSwiftPattern;
1757 
1758 	typedef Pair<int64_t, TMatch>				TDequeueValue;
1759 	typedef Dequeue<TDequeueValue>				TDequeue;
1760 	typedef typename TDequeue::TIter			TDequeueIterator;
1761 
1762 	const unsigned NOT_VERIFIED = 1u << (8*sizeof(unsigned)-1);
1763 
1764 	// iterate all genomic sequences
1765 	if (options._debugLevel >= 1)
1766 	{
1767 		::std::cerr << ::std::endl << "Process genome seq #" << gseqNo;
1768 		if (orientation == 'F')
1769 			::std::cerr << "[fwd]";
1770 		else
1771 			::std::cerr << "[rev]";
1772 	}
1773 
1774 	TReadSet &readSetL = host(host(swiftPatternL));
1775 	TReadSet &readSetR = host(host(swiftPatternR));
1776 
1777 	if (empty(readSetL) || empty(readSetR))
1778 		return;
1779 
1780 
1781 	// Check?
1782 	TSignedGPos maxDistance = options.maxDistance;// + 2 * (int)options.minMatchLength - (int)length(indexShape(host(swiftPatternL)));
1783 	TSignedGPos minDistance = options.minDistance + 2*options.minMatchLen;
1784 
1785 	// exit if contig is shorter than minDistance
1786 	if (length(genome) <= (unsigned) minDistance)
1787 		return;
1788 
1789 	TGenomeInf genomeInf = infix(genome, 0, length(genome));
1790 	TSwiftFinderL swiftFinderL(genome, options.repeatLength, 1);
1791 	TSwiftFinderR swiftFinderR(genomeInf, options.repeatLength, 1);
1792 
1793 	TDequeue fifo;				// stores left-mate potential matches
1794 	String<int64_t> lastPotMatchNo;		// last number of a left-mate potential
1795 	int64_t lastNo = 0;			// last number over all left-mate pot. matches in the queue
1796 	int64_t firstNo = 0;			// first number over all left-mate pot. match in the queue
1797 	Pair<TGPos> gPair;
1798 
1799 	resize(lastPotMatchNo, length(host(swiftPatternL)), (int64_t)-1, Exact());
1800 
1801 	String<Pair<TGPos> > lastRightMatch;		// begin and end of last verified right match
1802 	resize(lastRightMatch, length(host(swiftPatternL)), Pair<TGPos>(0,0), Exact());
1803 
1804 	TSize gLength = length(genome);
1805 	TMatch mR = {	// to supress uninitialized warnings
1806 		0, 0, 0, 0,	0, 0, 0, 0, 0, 0, 0, 0};
1807 	TDequeueValue fL(-1, mR);	// to supress uninitialized warnings
1808 	fL.i2.gseqNo = gseqNo;
1809 	mR.gseqNo = gseqNo;
1810 	fL.i2.orientation = orientation;
1811 	mR.orientation = orientation;
1812 
1813 	double maxErrorRate = options.errorRate;
1814 	double prefixErrorRate = (double)floor(options.minMatchLen * options.errorRate)/options.minMatchLen;
1815 	if(prefixErrorRate > maxErrorRate) maxErrorRate = prefixErrorRate;
1816 	if(!empty(readSet))
1817 	{
1818 		double extPrefixErrorRate = (double) floor(length(readSet[0]) * options.errorRate)/(length(readSet[0]) - options.minMatchLen);
1819 		if(extPrefixErrorRate > maxErrorRate) maxErrorRate = extPrefixErrorRate;
1820 	}
1821 	Pair<TGPos,TGPos> lastLeftMatch(0,0);
1822 
1823 	// iterate all verification regions returned by SWIFT
1824 	while (find(swiftFinderR, swiftPatternR, maxErrorRate))
1825 	{
1826 		unsigned rseqNo = swiftPatternR.curSeqNo;
1827 		TGPos rEndPos = endPosition(swiftFinderR);
1828 		TGPos doubleParWidth = 2 * (*swiftFinderR.curHit).bucketWidth;
1829 
1830 		// remove out-of-window left mates from fifo
1831 		while (!empty(fifo) && front(fifo).i2.gBegin + maxDistance + (TSignedGPos)doubleParWidth < (TSignedGPos)rEndPos)
1832 		{
1833 			popFront(fifo);
1834 			++firstNo;
1835 		}
1836 
1837 		// add within-window left mates to fifo
1838 		while (empty(fifo) || back(fifo).i2.gBegin + minDistance < (TSignedGPos)(rEndPos + doubleParWidth))
1839 		{
1840 			if (find(swiftFinderL, swiftPatternL, maxErrorRate))
1841 			{
1842 				gPair = positionRange(swiftFinderL);
1843 				if ((TSignedGPos)gPair.i2 + maxDistance + (TSignedGPos)doubleParWidth >= (TSignedGPos)rEndPos)
1844 				{
1845 					// link in
1846 					fL.i1 = lastPotMatchNo[swiftPatternL.curSeqNo]; //link to last previous potential match
1847 					lastPotMatchNo[swiftPatternL.curSeqNo] = lastNo++; //++ general counter and remember last pot match of curSeqNo
1848 
1849 					fL.i2.rseqNo = swiftPatternL.curSeqNo | NOT_VERIFIED; //das erste bit wird gestetzt
1850 					fL.i2.gBegin = gPair.i1;	//begin und end auf die potential match region
1851 					fL.i2.gEnd = gPair.i2;
1852 					pushBack(fifo, fL);
1853 				}
1854 			}
1855 			else
1856 				break;
1857 		}
1858 
1859 		TDequeueIterator it;
1860 		int64_t lastPositive = (int64_t)-1;
1861 
1862 		TRead const &read = readSet[rseqNo];
1863 		TSize counter = 0;
1864 		bool noMatchRight = false;
1865 		bool notYetVerifiedRight = true;
1866 		lastLeftMatch.i1 = 0;
1867 		lastLeftMatch.i2 = 0;
1868 
1869 		// walk through all potential left matches, verify if not verfied already, mark as positive or negative
1870 		for (int64_t i = lastPotMatchNo[rseqNo]; firstNo <= i; i = (*it).i1)
1871 		{
1872 			it = &value(fifo, i - firstNo);
1873 
1874 			// verify left mate (equal seqNo), if not done already
1875 			if ((*it).i2.rseqNo & NOT_VERIFIED)
1876 			{
1877 				TGPos maxEndPos = (*it).i2.gEnd + length(read)-2*options.minMatchLen + floor(options.errorRate*length(read));
1878 				if(maxEndPos > length(genome)) maxEndPos = length(genome);
1879 				if (matchVerify( (*it).i2,
1880 						infix(genome, (*it).i2.gBegin, maxEndPos),
1881 						rseqNo,
1882 						readSet, //readSetL
1883 						forwardPatternsL,
1884 						options,
1885 						TSwiftSpec(),
1886 						LongestPrefix()) &&
1887 						!(lastLeftMatch.i1 == (TGPos)(*it).i2.gBegin && lastLeftMatch.i2 == (TGPos)(*it).i2.gEnd ))
1888 				{
1889 					(*it).i2.rseqNo &= ~NOT_VERIFIED; // has been verified positively // go back to regular rseqNo
1890 					lastLeftMatch.i1 = (*it).i2.gBegin;
1891 					lastLeftMatch.i2 = (*it).i2.gEnd;
1892 					// short-cut negative matches
1893 					if (lastPositive == (int64_t)-1)
1894 						lastPotMatchNo[rseqNo] = i;
1895 					else
1896 						value(fifo, lastPositive - firstNo).i1 = i;
1897 					lastPositive = i;
1898 				}
1899 				else
1900 				{
1901 					(*it).i2.rseqNo = ~NOT_VERIFIED;		// has been verified negatively // 01111111....111
1902 				}
1903 			}
1904 
1905 			// verify right mate, if left match was found
1906 			if ((*it).i2.rseqNo == rseqNo)
1907 			{
1908 				lastLeftMatch.i1 = (*it).i2.gBegin;
1909 				lastLeftMatch.i2 = (*it).i2.gEnd;
1910 
1911 				// dont want to shortcut too much
1912 				if (lastPositive == (int64_t)-1 || i < lastPositive)
1913 					lastPositive = i;
1914 
1915 				// first left match --> do right verification
1916 				if(notYetVerifiedRight)
1917 				{
1918 					notYetVerifiedRight = false;
1919 					TGPos minBeginPos = beginPosition(infix(swiftFinderR, genomeInf));
1920 					if((int)minBeginPos - length(read)+2*options.minMatchLen - floor(options.errorRate*length(read)) > 0)
1921 						minBeginPos = minBeginPos - length(read)+2*options.minMatchLen - floor(options.errorRate*length(read));
1922 					else minBeginPos = 0;
1923 					if (!matchVerify(mR,
1924 						infix(genome,minBeginPos,endPosition(infix(swiftFinderR, genomeInf))),
1925 						rseqNo,
1926 						readSet,//readSetR,
1927 						forwardPatternsR,
1928 						options,
1929 						TSwiftSpec(),
1930 						LongestSuffix()))
1931 					{
1932 						noMatchRight = true;
1933 						continue;
1934 					}
1935 					else
1936 					{
1937 						if (lastRightMatch[rseqNo].i1 == (TGPos)mR.gBegin && lastRightMatch[rseqNo].i2 == (TGPos)mR.gEnd)
1938 							noMatchRight = true;
1939 						lastRightMatch[rseqNo].i1 = mR.gBegin;
1940 						lastRightMatch[rseqNo].i2 = mR.gEnd;
1941 					}
1942 				}
1943 
1944 				//else check if left and right match fit together
1945 				if(!noMatchRight)
1946 				{
1947 #ifdef RAZERS_DEBUG
1948 					std::cout << "before\nmL.mScore =" << (*it).i2.mScore << "\n";
1949 					std::cout << "mL.gBegin =" << (*it).i2.gBegin << "\n";
1950 					std::cout << "mL.gEnd =" << (*it).i2.gEnd << "\n";
1951 					std::cout << "mL.editDist =" << (*it).i2.editDist << "\n";
1952 
1953 					std::cout << "mR.mScore =" << mR.mScore << "\n";
1954 					std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1955 					std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1956 					std::cout << "mR.editDist =" << mR.editDist << "\n";
1957 #endif
1958 					int outerDistance = mR.gEnd - (*it).i2.gBegin;
1959 					if (outerDistance < (int)(2 * options.minMatchLen))
1960 						continue;
1961 //					::std::cout << options.minMatchLen << "<-minMatchLen  outerDistance->" << outerDistance << std::endl;
1962 					int outerDistanceError = length(readSet[rseqNo]) - (int)(outerDistance);
1963 					if (outerDistanceError < 0) outerDistanceError = -outerDistanceError;
1964 					if ((outerDistanceError > (int) options.maxDistance) ||
1965 						(outerDistanceError < (int) options.minDistance))
1966 						continue;
1967 
1968 //					::std::cout <<"hier!"<<options.minDistance <<" "<<options.maxDistance<<"\n";
1969 
1970 					TMatch mRtmp = mR;
1971 					TMatch mLtmp = (*it).i2;
1972 					if(!combineLeftRight(mRtmp,mLtmp,read,genome,options,orientation,TSwiftSpec()))
1973 					{
1974 						++options.FP;
1975 						continue;
1976 					}
1977 					else ++options.TP;
1978 
1979 					++counter;
1980 
1981 					//assign an alignment score
1982 					//mRtmp.alignmentScore = (options.matchScore * (sumLength - sumDist)) + (options.mismatchScore * sumDist);
1983 					//if(outerDistanceError != 0) mRtmp.alignmentScore += ((outerDistanceError - 1) * options.gapExtend) + options.gapOpen;
1984 					//mLtmp.alignmentScore = mRtmp.alignmentScore;
1985 
1986 					//mRtmp.alignmentScore = (options.matchScore * (sumLength - sumDist)) + (options.mismatchScore * sumDist);
1987 					//if(outerDistanceError != 0)
1988 					//{
1989 					//	int maxErrors = (int)(options.errorRate*length(readSet[rseqNo]));
1990 					//	double gapScoreConvergeTo = ((options.minMatchLen*2 - maxErrors) * options.matchScore) + (maxErrors * options.mismatchScore) ;
1991 					//	double gapPenalty =	gapScoreConvergeTo*(1-exp((double)-options.indelLambda*outerDistanceError));
1992 					//	mRtmp.alignmentScore -= (int)gapPenalty;
1993 					//}
1994 					//mLtmp.alignmentScore = mRtmp.alignmentScore;
1995 
1996 					if (orientation == 'R')
1997 					{
1998 						TSize temp = mLtmp.gBegin;
1999 						mLtmp.gBegin = gLength - mLtmp.gEnd;
2000 						mLtmp.gEnd = gLength - temp;
2001 						temp = mRtmp.gBegin;
2002 						mRtmp.gBegin = gLength - mRtmp.gEnd;
2003 						mRtmp.gEnd = gLength - temp;
2004 						outerDistance = -outerDistance;
2005 					}
2006 					// set a unique pair id
2007 					mLtmp.pairId = mRtmp.pairId = options.nextMatePairId;
2008 					if (++options.nextMatePairId == 0)
2009 						options.nextMatePairId = 1;
2010 
2011 					// score the whole match pair
2012 					mLtmp.pairScore = mRtmp.pairScore = 0 - mLtmp.editDist - mRtmp.editDist;
2013 
2014 #ifdef RAZERS_DEBUG_LIGHT
2015 					bool falsch = false;
2016 					if((unsigned)(mRtmp.mScore + mLtmp.mScore) > length(readSet[rseqNo])) falsch = true;
2017 					if(mRtmp.mScore> length(readSet[rseqNo])-options.minMatchLen || mRtmp.mScore < options.minMatchLen) falsch = true;
2018 					if((int)mRtmp.gEnd - (int)mRtmp.gBegin < 0 || (int)mRtmp.gEnd - (int)mRtmp.gBegin > 500) falsch = true;
2019 					if(mRtmp.editDist > 200) falsch = true;
2020 
2021 					if(mLtmp.mScore> length(readSet[rseqNo])-options.minMatchLen ||mLtmp.mScore < options.minMatchLen) falsch = true;
2022 					if((int)mLtmp.gEnd - (int)mLtmp.gBegin < 0 || (int)mLtmp.gEnd - (int)mLtmp.gBegin > 500) falsch = true;
2023 					if(mLtmp.editDist > 200) falsch = true;
2024 
2025 					if(falsch)
2026 					{
2027 						::std::cout << "rseqNo="<<rseqNo <<::std::endl;
2028 							std::cout << "mL.mScore =" << mLtmp.mScore << "\n";
2029 							std::cout << "mL.gBegin =" << mLtmp.gBegin << "\n";
2030 							std::cout << "mL.gEnd =" << mLtmp.gEnd << "\n";
2031 							std::cout << "mL.editDist =" << mLtmp.editDist << "\n";
2032 
2033 							std::cout << "mR.mScore =" << mRtmp.mScore << "\n";
2034 							std::cout << "mR.gBegin =" << mRtmp.gBegin << "\n";
2035 							std::cout << "mR.gEnd =" << mRtmp.gEnd << "\n";
2036 							std::cout << "mR.editDist =" << mRtmp.editDist << "\n";
2037 
2038 
2039 
2040 					if(-mLtmp.pairScore > (int) (options.errorRate * length(readSet[rseqNo])))
2041 					{
2042 						::std::cout << "mLtmp.pairScore = " << mLtmp.pairScore;
2043 						::std::cout << "\treadLen = " << length(readSet[rseqNo]);
2044 						::std::cout << "\trseqNo = " << rseqNo << ::std::endl;
2045 					}
2046 					std::cout << "mScoreSum =" << mLtmp.mScore + mRtmp.mScore << "\t";
2047 					std::cout << "readLength =" << length(readSet[rseqNo]) << "\t";
2048 					std::cout << "mL.mScore =" << mLtmp.mScore << "\t";
2049 					std::cout << "mL.gBegin =" << mLtmp.gBegin << "\t";
2050 					std::cout << "mL.gEnd =" << mLtmp.gEnd << "\t";
2051 					std::cout << "mL.editDist =" << mLtmp.editDist << "\n";
2052 
2053 					std::cout << "mR.mScore =" << mRtmp.mScore << "\t";
2054 					std::cout << "mR.gBegin =" << mRtmp.gBegin << "\t";
2055 					std::cout << "mR.gEnd =" << mRtmp.gEnd << "\t";
2056 					std::cout << "mR.editDist =" << mRtmp.editDist << "\n";
2057 
2058 					Align<String<Dna5Q>, ArrayGaps> alignL;
2059 					Score<int> scoreType = Score<int>(0, -999, -1001, -1000);	// levenshtein-score (match, mismatch, gapOpen, gapExtend)
2060 					if (options.hammingOnly)
2061 						scoreType.data_mismatch = -1;
2062 					resize(rows(alignL), 2);
2063 					assignSource(row(alignL, 0), infix(readSet[rseqNo],0,mLtmp.mScore));
2064 					assignSource(row(alignL, 1), infix(genome,mLtmp.gBegin, mLtmp.gEnd));
2065 					if (orientation == 'R')
2066 						reverseComplement(source(row(alignL, 1)));
2067 
2068 					globalAlignment(alignL, scoreType);
2069 					std::cout << alignL;
2070 
2071 					Align<String<Dna5Q>, ArrayGaps> alignR;
2072 					resize(rows(alignR), 2);
2073 					assignSource(row(alignR, 0), infix(readSet[rseqNo],length(readSet[rseqNo])-mRtmp.mScore,length(readSet[rseqNo])));
2074 					assignSource(row(alignR, 1), infix(genome,mRtmp.gBegin, mRtmp.gEnd));
2075 					if (orientation == 'R')
2076 						reverseComplement(source(row(alignR, 1)));
2077 
2078 					globalAlignment(alignR, scoreType);
2079 					std::cout << alignR;
2080 
2081 					std::cout << "Komplettali:\n";
2082 					Align<String<Dna5Q>, ArrayGaps> align;
2083 					resize(rows(align), 2);
2084 					assignSource(row(align, 0), infix(readSet[rseqNo],0,length(readSet[rseqNo])));
2085 					assignSource(row(align, 1), infix(genome,_min(mRtmp.gBegin,mLtmp.gBegin), _max(mRtmp.gEnd,mLtmp.gEnd)));
2086 					if (orientation == 'R')
2087 						reverseComplement(source(row(align, 1)));
2088 
2089 					globalAlignment(align, scoreType);
2090 					std::cout << align;
2091 					}
2092 #endif
2093 
2094 
2095 
2096 					// relative positions
2097 					mLtmp.mateDelta = outerDistance;
2098 					mRtmp.mateDelta = -outerDistance;
2099 
2100 					mLtmp.rseqNo = mRtmp.rseqNo = rseqNo;
2101 
2102 					if (!options.spec.DONT_DUMP_RESULTS)
2103 					{
2104 						appendValue(matches, mLtmp, Generous());
2105 						appendValue(matches, mRtmp, Generous());
2106 						if (length(matches) > options.compactThresh)
2107 						{
2108 							typename Size<TMatches>::Type oldSize = length(matches);
2109 //							maskDuplicates(matches);	// overlapping parallelograms cause duplicates //TODO: implement!
2110 							compactSplicedMatches(matches, cnts, options, false, swiftPatternL, swiftPatternR);
2111 							options.compactThresh += (options.compactThresh >> 1);
2112 							if (options._debugLevel >= 2)
2113 								::std::cerr << '(' << oldSize - length(matches) << " matches removed)";
2114 						}
2115 					}
2116 				}
2117 			}
2118 		}
2119 
2120 		// short-cut negative matches
2121 		if (lastPositive == (int64_t)-1)
2122 			lastPotMatchNo[rseqNo] = (int64_t)-1;
2123 		else
2124 			value(fifo, lastPositive - firstNo).i1 = (int64_t)-1; // the first positive's link to previous is removed
2125 
2126 
2127 	}//swiftFinderR
2128 
2129 
2130 
2131 }//function
2132 
2133 
2134 
2135 
2136 
2137 }//namespace
2138 
2139 #endif
2140