1  /*==========================================================================
2                 SplitRazerS - Fast Split Read Mapping
3 
4  ============================================================================
5   Copyright (C) 2008 by Anne-Katrin Emde
6 
7   This program is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 3 of the License, or (at your options) any later version.
11 
12   This program is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15   Lesser General Public License for more details.
16 
17   You should have received a copy of the GNU General Public License
18   along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  ==========================================================================*/
20 
21 
22 #ifndef SEQAN_HEADER_RAZERS_SPLICED_H
23 #define SEQAN_HEADER_RAZERS_SPLICED_H
24 
25 //#define RAZERS_DEBUG
26 //#define RAZERS_DEBUG_LIGHT
27 
28 #include <seqan/store.h>
29 #include <seqan/align.h>
30 #include <seqan/graph_types.h>
31 #include <seqan/graph_algorithms.h>
32 #include <seqan/align.h>
33 #include <seqan/seeds.h>
34 #include <seqan/bam_io.h>
35 
36 namespace seqan
37 {
38 
39 
40 
41 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
42 // Match statistics stuff
43 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
44 
45 //statistics, expected number of matches
choose(int n,int k)46 double choose(int n, int k)
47 {
48 	double cnk = 1.0;
49 
50 	if (k * 2 > n)
51 	k = n - k;
52 
53 	for (int i = 1; i <= k; n--, i++)
54 	{
55 		cnk /= i;
56 		cnk *= n;
57 	}
58 	return cnk;
59 }
60 
61 
62 //#R  - read length
63 //#T  - total sequence length
64 //#I  - length of insertion
65 //#D  - length of deletion
66 //#M1 - minimal prefix match length
67 //#M2 - minimal suffix match length
68 
69 
70 //## first function denotes the random match to an iid sequence
71 template<typename TSize, typename TValue>
72 double
_probability(TSize R,TSize M1,TSize M2,TValue d,TValue d_m1,TValue d_m2)73 _probability(TSize R, TSize M1, TSize M2, TValue d, TValue d_m1, TValue d_m2)
74 {
75 	double sum_prob = 0;
76 	for (unsigned i1 = 0; i1 <= d_m1; ++i1)
77 	{
78 		for (unsigned i2 = 0; i2 <= d_m2; ++i2)
79 		{
80 			for (unsigned j = 0; j <= d-i1-i2; ++j)
81 			{
82 				if(R-M1-M2>=j)
83 					sum_prob +=
84 					(double)(choose(M1,i1)* pow((double)0.25,(double)R-i1-i2-j) *
85 						 choose(M2,i2) * pow((double)0.75,(double)i1+i2+j) *
86 						 choose(R-M1-M2,j));
87 			}
88 		}
89 	}
90 	return sum_prob;
91 }
92 
93 
94 //#InsCuts describes the possible number of cuts
95 //#for a read of length R and an insertion of length I
96 template<typename TSize>
97 TSize
_insCuts(TSize R,TSize M1,TSize M2,TSize I)98 _insCuts(TSize R, TSize M1, TSize M2, TSize I){
99 	return (R-I-M1-M2+1);
100 }
101 
102 
103 
104 //#function for calculating expected matches with I insertions
105 template<typename TSize, typename TValue>
106 double
_expMatchesIns(TSize R,TSize M1,TSize M2,TSize I,TValue d,TValue d_m1,TValue d_m2,TSize S,TSize T)107 _expMatchesIns(TSize R, TSize M1, TSize M2, TSize I, TValue d,TValue d_m1,TValue d_m2, TSize S, TSize T)
108 {
109 	return((double)T*((S-R-I-1)*_probability((R-I),M1,M2,d,d_m1,d_m2))*_insCuts(R,M1,M2,I));
110 }
111 
112 
113 //#DelCuts describes the possible number of cuts
114 //#for a read of length R in case of a deletion
115 template<typename TSize>
116 TSize
_delCuts(TSize R,TSize M1,TSize M2)117 _delCuts(TSize R, TSize M1, TSize M2)
118 {
119 	return (R-M1-M2+1);
120 }
121 
122 
123 //#Del describes the possible number of configurations
124 //#for a string of length R in a sequence of length S
125 template<typename TSize>
126 TSize
_del(TSize R,TSize S,TSize maxD,TSize minD)127 _del(TSize R, TSize S, TSize maxD, TSize minD)
128 {
129 	//return((S-R)*(S-R+1)/2);
130 	return (maxD-minD+1)*(S-R+1) - ((maxD+1)*maxD)/2 + ((minD-1)*minD)/2;
131 }
132 
133 
134 //#function for calculating expected matches with deletion of size D
135 template<typename TSize, typename TValue>
136 double
_expMatchesDel(TSize R,TSize M1,TSize M2,TValue d,TValue d_m1,TValue d_m2,TSize S,TSize T,TSize maxD,TSize minD)137 _expMatchesDel(TSize R, TSize M1, TSize M2, TValue d, TValue d_m1, TValue d_m2, TSize S, TSize T, TSize maxD, TSize minD )
138 {
139 	return ((double)T*(_probability(R,M1,M2,d,d_m1,d_m2))*_delCuts(R,M1,M2)*_del(R,S,maxD,minD));
140 }
141 
142 template<typename TReadSet, typename TSize, typename TOptions>
143 void
expNumRandomMatches(TReadSet & readSet,TSize genomeLen,TOptions & options)144 expNumRandomMatches(TReadSet &readSet, TSize genomeLen, TOptions & options)
145 {
146 	TSize M1 = options.minMatchLen;
147 	TSize M2 = options.minMatchLen;
148 	TSize d_m1 = (int) options.maxPrefixErrors;
149 	TSize d_m2 = (int) options.maxSuffixErrors;
150 	TSize numReads = length(readSet);
151 	genomeLen = 1000000 * options.specifiedGenomeLen; // go from Mb to bp
152 	if(options.anchored) genomeLen = options.maxGap + 2*options.libraryError; // upper bound
153 	TSize readLen = (numReads > 0) ? length(readSet[0]) : 0;
154 	TSize numErrors = static_cast<int>(readLen * options.errorRate);
155 	TSize maxD = options.maxGap;
156 	TSize minD = options.minGap;
157 	//expected number of random deletion matches:
158 	double delMatches = _expMatchesDel(readLen,M1,M2,numErrors,d_m1, d_m2, genomeLen, numReads, maxD, minD);
159 	if (options.reverse) delMatches *= 2;
160 
161 	//expected number of random deletion matches:
162 	double insMatches = 0;
163 	for(TSize insLen = 1; insLen <=readLen-M1-M2; ++insLen)
164 	{
165 		numErrors = static_cast<int>((readLen-insLen) * options.errorRate);
166 		insMatches += _expMatchesIns(readLen,M1,M2,insLen,numErrors,d_m1, d_m2, genomeLen, numReads);
167 	}
168 	if (options.reverse) insMatches *= 2;
169 
170 	::std::cout << "Expected number of random deletion-indicating matches: " << delMatches << std::endl;
171 	::std::cout << "Expected number of random insertion-indicating matches: " << insMatches << std::endl;
172  }
173 
174 
175 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
176 
177 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
178 
179 
180 // SPLIT MAPPING
181 // We build two q-gram indices, one for the prefix, the other for the suffix of the read
182 
183 struct LongestPrefix{};
184 struct LongestSuffix{};
185 
186 struct OrientationReverse{};
187 struct OrientationForward{};
188 
189 //////////////////////////////////////////////////////////////////////////////
190 // Load anchored reads from SAM file
191 template <typename TReadSet, typename TNameSet,typename TReadRegions, typename TRazerSOptions>
loadReadsSam(TReadSet & reads,TNameSet & fastaIDs,TReadRegions & readRegions,const char * fileName,TRazerSOptions & options)192 bool loadReadsSam(
193 	TReadSet &reads,
194 	TNameSet &fastaIDs,
195 	TReadRegions &readRegions,
196 	const char *fileName,
197 	TRazerSOptions &options)
198 {
199 	typedef typename Value<TReadRegions>::Type	TRegion;
200 	//typedef typename Value<TReadSet>::Type		TRead;
201 	typedef typename Value<TNameSet>::Type		TReadName;
202 	//typedef typename Value<TRegion,2>::Type		TContigPos;
203 	typedef typename Value<TRegion,2>::Type		TFlagPos;
204 	typedef typename Value<TFlagPos,2>::Type	TContigPos;
205 
206 	bool countN = !(options.matchN || options.outputFormat == 1 );
207 	if (!empty(CharString(options.outputUnmapped))) countN = false;
208 
209 	BamFileIn file;
210 	if (!open(file, fileName)) return false;
211 
212 	options.maxReadRegionsEnd = 0;
213 	options.minReadRegionsStart = std::numeric_limits<int>::max();
214 	TContigPos regionBegin = options.minReadRegionsStart;
215 	TContigPos regionEnd = options.maxReadRegionsEnd;
216 
217 		//typename std::ifstream::pos_type lineStart = (*file).tellg();
218 		//lineStart = lineStart - (std::ifstream::pos_type) 1;
219 		//	(*file).seekp(lineStart);
220 
221 
222 	int i = 0;
223     //int lastFlag = -1;
224     //TReadName lastQname;
225     //bool lastWasAnchored = false;
226 	int kickoutcount = 0;
227 
228     // Read header.
229     BamHeader header;
230     readHeader(header, file);
231 
232     // Read records.
233     BamAlignmentRecord record;
234     while (!atEnd(file))
235     {
236         readRecord(record, file);
237         if (record.rID == -1)
238             continue;  // Skip if orphan.
239 
240         // Get the query name, remove everything after the first space.
241         TReadName qname = record.qName;
242         cropAfterFirst(qname, IsWhitespace());
243 
244         // Evaluate flag.
245         if (!hasFlagMultiple(record) || !hasFlagUnmapped(record))
246             continue;  // Skip if not unmapped mate.
247         bool reverse = hasFlagRC(record); // if reverse this read is expected to match downstream of its mate
248         int bitFlag = 0;    // has two bits
249         if (reverse) bitFlag += 2;  // first bits says whether reversed (or not ->0)
250         if ((record.flag & (1 << 7)) == (1 << 7)) bitFlag +=1; // second bit says whether second mate (or first ->0)
251 
252         // Read reference name.  Same behaviour as for query name:  Read up to
253         // the first whitespace character and skip to next tab char.
254         String<char> chrname = contigNames(context(file))[record.rID];
255         //need gnameToIdMap !!
256 
257         // Get read begin position.
258         TContigPos beginPos = record.beginPos;
259 
260         // Get CIGAR string.
261         String<CigarElement<> > cigar = record.cigar;
262 
263 		// read in sequence
264         String<Dna5Q> readSeq = record.seq;
265         SEQAN_ASSERT_GT(length(readSeq), 0u);
266 		int readLength = (int)length(readSeq);
267 		if (countN)
268 		{
269 			int count = 0;
270 			int cutoffCount = (int)(options.errorRate * readLength);
271 			int j = 0;
272 			for (; j < readLength; ++j)
273 				if (getValue(readSeq, j) == 'N')
274 					if (++count > cutoffCount)
275 					{
276 						++kickoutcount;
277 						break;
278 					}
279 			if(j<readLength) continue; // read is skipped because of low quality
280 		}
281 
282 		// and associated qualities
283         assignQualities(readSeq, record.qual);
284 
285 
286 		// now store the information
287 		if (options.readNaming == 0
288 #ifdef RAZERS_DIRECT_MAQ_MAPPING
289 			|| options.fastaIdQual
290 #endif
291 			)  //15578976
292 			appendValue(fastaIDs, qname);	// append read name Fasta id
293 
294 
295 //		if (options.trimLength > 0 && readLength > (unsigned)options.trimLength)
296 //			resize(readSeq, options.trimLength);
297 		if((int)length(readSeq) > options.maxReadLength)
298  			options.maxReadLength = length(readSeq);
299 
300       //reverseComplement(readSeq);
301 		appendValue(reads, readSeq, Generous());
302 
303 		appendValue(readRegions,TRegion(0,TFlagPos(0,0)));
304 		readRegions[i].i2.i1 = bitFlag;
305 		readRegions[i].i2.i2 = (signed)beginPos;
306         if (reverse)
307 		{
308 			// libraryLength is outer fragment distance
309 			regionBegin = _max((int)beginPos + readLength,(int)beginPos+options.libraryLength-readLength-options.libraryError);
310 			regionEnd = (signed)beginPos+options.libraryLength+options.libraryError+options.maxGap;
311 			// expected begin position of read
312 	//		readRegions[i].i2 = (signed)beginPos + options.libraryLength - readLength;
313 
314 		}
315 		else  // read should map upstream of mapped mate --> set the vorzeichen bit
316 		{
317 			regionBegin = _max((int)0,(int)beginPos-options.libraryLength+readLength-options.libraryError-options.maxGap);
318 			regionEnd = _min((int)beginPos,(int)beginPos-options.libraryLength+2*readLength+options.libraryError);
319 			// expected end position of read
320 			//readRegions[i].i2 = - _min((int)beginPos,(int)beginPos-options.libraryLength+2*readLength);//+options.maxGap);
321 		}
322 //		readRegions[i].i1 = 0; //currently just ment for one chr at a time... need to add genomeId map
323 		if(regionEnd > (TContigPos) options.maxReadRegionsEnd) options.maxReadRegionsEnd = (unsigned) regionEnd;
324 		if(regionBegin < (TContigPos)options.minReadRegionsStart) options.minReadRegionsStart = (unsigned) regionBegin;
325         //lastFlag = flag;
326         //lastQname = qname;
327 		++i;
328     }
329 	if (options._debugLevel > 1 && kickoutcount > 0)
330 		::std::cerr << "Ignoring " << kickoutcount << " low quality reads.\n";
331 	return (i > 0);
332 
333 }
334 
335 
336 
337 //////////////////////////////////////////////////////////////////////////////
338 // Remove low quality matches # necessary to have an own splicedmatch function?
339 // planned specs: SpliceSite, General, ...
340 template < typename TMatches, typename TCounts, typename TSpec, typename TSwiftL, typename TSwiftR >
compactSplicedMatches(TMatches & matches,TCounts &,RazerSOptions<TSpec> & options,bool,TSwiftL & swiftL,TSwiftR & swiftR)341 void compactSplicedMatches(TMatches &matches,
342 			TCounts & /*cnts*/,
343 			RazerSOptions<TSpec> &options,
344 			bool ,
345 			TSwiftL &swiftL,
346 			TSwiftR &swiftR)
347 {
348 	typedef typename Value<TMatches>::Type				TMatch;
349 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
350 
351 	unsigned readNo = -1;
352 	unsigned hitCount = 0;
353 	unsigned hitCountCutOff = options.maxHits;
354 	int scoreDistCutOff = std::numeric_limits<int>::min();
355 
356 	TIterator it = begin(matches, Standard());
357 	TIterator itEnd = end(matches, Standard());
358 	TIterator dit = it;
359 	TIterator ditBeg = it;
360 
361 
362 	// sort
363 //	::std::sort(it, itEnd, LessSplicedErrors<TMatch>());
364 	::std::sort(it, itEnd, LessSplicedScore<TMatch>());
365 	int counter = 0;
366 	for (; it != itEnd; ++it)
367 	{
368 		++counter;
369 		if ((*it).orientation == '-') { ++it; continue; }
370 		if (readNo == (*it).rseqNo)
371 		{
372 			if ((int)(*it).pairScore <= scoreDistCutOff)
373 			{
374 				++it;
375 				continue;
376 			}
377 			if (++hitCount >= hitCountCutOff)
378 			{
379 #ifdef RAZERS_MASK_READS
380 				if (hitCount == hitCountCutOff)
381 				{
382 					// we have enough, now look for better matches
383 					int minScore = (*it).pairScore - 1;
384 					if (options.purgeAmbiguous &&
385 						(options.distanceRange == 0 || minScore >= options.maxReadLength - (int)options.distanceRange))
386 					{
387 						setMaxErrors(swiftL, readNo, -1);
388 						setMaxErrors(swiftR, readNo, -1);
389 						if (options._debugLevel >= 2) ::std::cerr << "(read #" << readNo << " disabled)";
390 						dit = ditBeg;
391 					}
392 					else
393 					{
394 						// enough optimal read matches found, disable further search but keep maxHits+1 matches
395 						if(minScore == options.maxReadLength - 1)
396 						{
397 							setMaxErrors(swiftL, readNo, -1);
398 							setMaxErrors(swiftR, readNo, -1);
399 							if (options._debugLevel >= 2) ::std::cerr << "(read #" << readNo << " disabled)";
400 						}
401 
402 						*dit = *it;	++dit; ++it;
403 						*dit = *it;	++dit;
404 						continue;
405 					}
406 
407 				}
408 #endif
409 				++it;
410 				continue;
411 			}
412 		}
413 		else
414 		{
415 			readNo = (*it).rseqNo;// >> 1;
416 			hitCount = 0;
417 			if (options.distanceRange > 0)
418 				scoreDistCutOff = (*it).pairScore - options.distanceRange;
419 			ditBeg = dit;
420 		}
421 		*dit = *it;	++dit; ++it;
422 		*dit = *it;	++dit;
423 	}
424 	resize(matches, dit - begin(matches, Standard()));
425 
426 }
427 
428 
429 
430 
431 //////////////////////////////////////////////////////////////////////////////
432 // final compacting of spliced matches
433 template < typename TMatches, typename TCounts, typename TSpec>
compactAndCountSplicedMatches(TMatches & matches,TCounts & states,RazerSOptions<TSpec> & options,bool)434 void compactAndCountSplicedMatches(TMatches &matches,
435 			TCounts & states,
436 			RazerSOptions<TSpec> &options,
437 			bool )
438 {
439 	typedef typename Value<TMatches>::Type				TMatch;
440 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
441 
442 	unsigned readNo = -1;
443 	unsigned hitCount = 0;
444 	unsigned hitCountCutOff = options.maxHits;
445 	int scoreDistCutOff = std::numeric_limits<int>::min();
446 
447 	clear(states);
448 
449 	int bestScore = 0;
450 	int state = -1; // 0 = unique, 1 = multi, 2 = suboptimal
451 	int numSuccesful = 0;
452 
453 	TIterator it = begin(matches, Standard());
454 	TIterator itEnd = end(matches, Standard());
455 	TIterator dit = it;
456 	TIterator ditBeg = it;
457 	// int lastPairId = 0;
458 	// sort
459 	::std::sort(it, itEnd, LessSplicedScoreGPos<TMatch>());
460 //	::std::sort(it, itEnd, LessSplicedErrorsGPos<TMatch>());
461 	int counter = 0;
462 	for (; it != itEnd; ++it)
463 	{
464 		++counter;
465 		if ((*it).orientation == '-') { ++it; continue; }
466 		if (readNo == (*it).rseqNo) // current match is either multi or suboptimal
467 		{
468 			if ((int)(*it).pairScore <= scoreDistCutOff)
469 			{
470 				++it;
471 				continue;
472 			}
473 			if (++hitCount >= hitCountCutOff)
474 			{
475 				if (hitCount == hitCountCutOff)
476 				{
477 					if ((int)(*it).pairScore == bestScore)
478 					{
479 						if(state == 0)
480 							state = 1; // the match before is multi
481 					}
482 
483 					if(options.purgeAmbiguous)
484 	     				{
485 						dit = ditBeg;
486 						resize(states,numSuccesful);
487 						state  = -1;
488 					}
489 
490 				}
491 				++it;
492 				continue;
493 			}
494 			if ((int)(*it).pairScore == bestScore)
495 			{
496 				if(state == 0)
497 				{
498 					appendValue(states, 1); // the match before is multi
499 					state = 1;		// current one is multi too
500 				}
501 				else // state > 0
502 					appendValue(states, state); // either current match is multi or suboptimal, same state
503 				//std::cout << "state = "<< state << " for PairId = " << lastPairId << std::endl;
504 			}
505 			if ((int)(*it).pairScore < bestScore)
506 			{
507 				appendValue(states, state); // either current match is suboptimal, state before is what it was
508 				//std::cout << "state = "<< state << " for PairId = " << lastPairId << std::endl;
509 				state = 2;		// the current one is suboptimal
510 			}
511 			// lastPairId = (*it).pairId;
512 		}
513 		else
514 		{
515 			if(state != -1)
516 			{
517 				appendValue(states, state); // append state of the match before
518 				//std::cout << "state = "<< state << " for PairId = " << lastPairId << std::endl;
519 			}
520 			readNo = (*it).rseqNo;// >> 1;
521 			hitCount = 0;
522 			if (options.distanceRange > 0)
523 				scoreDistCutOff = (*it).pairScore - options.distanceRange;
524 			ditBeg = dit;
525 			numSuccesful = (dit - begin(matches, Standard()))/2;
526 			//std::cout << "numSuccesful=" << numSuccesful << std::endl;
527 			bestScore = (*it).pairScore;
528 			// lastPairId = (*it).pairId;
529 			state = 0;
530 		}
531 		*dit = *it;	++dit; ++it;
532 		*dit = *it;	++dit;
533 
534 	}
535 	if(state != -1)
536 	{
537 		appendValue(states, state);
538 		//std::cout << "state = "<< state << " for PairId = " << lastPairId << std::endl;
539 	}
540 
541 	resize(matches, dit - begin(matches, Standard()));
542 //	std::cout << "lengthmatches = " << length(matches) << " numStates = " << length(states) << std::endl;
543 }
544 
545 
546 template<typename TAlign, typename TPosition>
547 int
countErrorsInAlign(TAlign & align,TPosition end_)548 countErrorsInAlign(TAlign & align, TPosition end_)
549 {
550 
551 	//typedef typename Source<TAlign>::Type TSource;
552 	//typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
553 
554 	typedef typename Row<TAlign>::Type TRow;
555 	typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
556 
557 	TAlignIterator ali_it0 = iter(row(align,0),0);
558 	TAlignIterator ali_it1 = iter(row(align,1),0);
559 	TAlignIterator ali_it0_stop = iter(row(align,0),end_);
560 	TAlignIterator ali_it1_stop = iter(row(align,1),end_);
561 
562 
563 	int errors = 0;
564 	while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
565 	{
566 		while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
567 		{
568 			if(*ali_it1 != *ali_it0)
569 				++errors;
570 			++ali_it0;
571 			++ali_it1;
572 		}
573 		while(ali_it0!=ali_it0_stop && isGap(ali_it0))
574 		{
575 			++ali_it0;
576 			++ali_it1;
577 			++errors;
578 		}
579 		while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
580 		{
581 			++ali_it0;
582 			++ali_it1;
583 			++errors;
584 		}
585 	}
586 	while(ali_it0!=ali_it0_stop)
587 	{
588 		++ali_it0;
589 		++errors;
590 	}
591 	while(ali_it1 != ali_it1_stop)
592 	{
593 		++ali_it1;
594 		++errors;
595 	}
596 
597 	return errors;
598 }
599 
600 
601 
602 //////////////////////////////////////////////////////////////////////////////
603 // Edit distance verification for longest suffix/prefix
604 template <
605 	typename TMatch,
606 	typename TGenome,
607 	typename TReadSet,
608 	typename TMyersPatterns,
609 	typename TSpec,
610 	typename TSufPrefSpec
611 >
612 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> inf,unsigned rseqNo,TReadSet & readSet,TMyersPatterns & forwardPatterns,RazerSOptions<TSpec> const & options,SwiftSemiGlobal,TSufPrefSpec)613 matchVerify(
614 	TMatch &m,							// resulting match
615 	Segment<TGenome, InfixSegment> inf,	// potential match genome region
616 	unsigned rseqNo,					// read number
617 	TReadSet &readSet,	    			// reads
618 	TMyersPatterns &forwardPatterns,	// MyersBitVector preprocessing data
619 	RazerSOptions<TSpec> const &options,// RazerS options
620 	SwiftSemiGlobal,					// Edit distance
621 	TSufPrefSpec)						// Swift specialization
622 {
623 	typedef Segment<TGenome, InfixSegment>              TGenomeInfix;
624 	typedef typename Value<TReadSet>::Type              TRead;
625 
626 	// find read match end
627 	typedef Finder<TGenomeInfix>                        TMyersFinder;
628 	typedef typename Value<TMyersPatterns>::Type        TMyersPattern;
629 
630 	// find read match begin
631 	typedef ModifiedString<TGenomeInfix, ModReverse>    TGenomeInfixRev;
632 	typedef ModifiedString<TRead, ModReverse>           TReadRev;
633 	typedef Finder<TGenomeInfixRev>                     TMyersFinderRev;
634 	typedef Pattern<TReadRev, MyersUkkonenGlobal>       TMyersPatternRev;
635 
636 	TMyersFinder myersFinder(inf);
637 	TMyersPattern &myersPattern = forwardPatterns[rseqNo];  //have to make sure this only contains the prefix
638 
639 #ifdef RAZERS_DEBUG
640 	::std::cout << "Verify: " << ::std::endl;
641 	::std::cout << "Genome: " << inf << "\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
642 	::std::cout << "Read:   " << readSet[rseqNo] << ::std::endl;
643 #endif
644 
645 	unsigned ndlLength = _min(sequenceLength(rseqNo, readSet),options.minMatchLen);
646 	int maxScore = std::numeric_limits<int>::min();
647 	int minScore = - maxNumSeedErrors(options,TSufPrefSpec());
648 
649 	TMyersFinder maxPos;
650 
651 	// find end of best semi-global alignment
652 	while (find(myersFinder, myersPattern, minScore))
653 	{
654 		if (maxScore <= getScore(myersPattern))
655 		{
656 			maxScore = getScore(myersPattern);
657 			maxPos = myersFinder;
658 		}
659 	}
660 
661 
662 	if (maxScore < minScore)
663 		return false;
664 
665 	m.editDist	= (unsigned)-maxScore;
666 	TGenomeInfix oriInf = inf;
667 	setEndPosition(inf, m.gEnd = (beginPosition(inf) + position(maxPos) + 1));
668 
669 	// limit the beginning to needle length plus errors (== -maxScore)
670 	if (length(inf) > ndlLength - maxScore)
671 		setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore);
672 
673 	// find beginning of best semi-global alignment
674 	TGenomeInfixRev		infRev(inf);
675 	TMyersFinderRev		myersFinderRev(infRev);
676     TReadRev            readRev;
677     TRead               readInf;  // Needs to be a global variable, since ModifiedString cannot hold a pointer to a temporary.
678 	if(IsSameType<TSufPrefSpec,LongestSuffix>::VALUE)
679         readInf = infix(readSet[rseqNo],length(readSet[rseqNo])-options.minMatchLen,length(readSet[rseqNo]));
680     else
681 		readInf = infix(readSet[rseqNo],0,options.minMatchLen);
682     setHost(readRev, readInf);
683 
684 	TMyersPatternRev	myersPatternRev(readRev);
685 
686 	_patternMatchNOfPattern(myersPatternRev, options.matchN);
687 	_patternMatchNOfFinder(myersPatternRev, options.matchN);
688 	while (find(myersFinderRev, myersPatternRev, maxScore))
689 		m.gBegin = m.gEnd - (position(myersFinderRev) + 1);
690 
691 	m.mScore = ndlLength;
692 	m.seedEditDist = m.editDist;
693 	m.gSeedLen = m.gEnd - m.gBegin;
694 
695 #ifdef RAZERS_DEBUG
696 	::std::cout << " before extendMatch " << ::std::endl;
697 	::std::cout << " match: " << ::std::endl;
698 	::std::cout << " mScore= " <<m.mScore << ::std::endl;
699 	::std::cout << " gBegin= " <<m.gBegin << ::std::endl;
700 	::std::cout << " gEnd= " <<m.gEnd << ::std::endl;
701 	::std::cout << " edit= " <<m.editDist << ::std::endl;
702 #endif
703 
704 	//TODO: give only part of read to extension!!!
705 
706 	//now extend the seed
707 	extendMatch(readSet,rseqNo,oriInf,m,options,TSufPrefSpec());
708 
709 #ifdef RAZERS_DEBUG
710 	::std::cout << " match: " << ::std::endl;
711 	::std::cout << " mScore= " <<m.mScore << ::std::endl;
712 	::std::cout << " gBegin= " <<m.gBegin << ::std::endl;
713 	::std::cout << " gEnd= " <<m.gEnd << ::std::endl;
714 	::std::cout << " edit= " <<m.editDist << ::std::endl;
715 #endif
716 
717 	return true;
718 }
719 
720 template<typename TOptions>
721 int
maxNumSeedErrors(TOptions & options,LongestSuffix)722 maxNumSeedErrors(TOptions &options, LongestSuffix)
723 {
724 	return (int)options.maxSuffixErrors;
725 }
726 
727 template<typename TOptions>
728 int
maxNumSeedErrors(TOptions & options,LongestPrefix)729 maxNumSeedErrors(TOptions &options, LongestPrefix)
730 {
731 	return (int)options.maxPrefixErrors;
732 }
733 
734 
735 
736 template<typename TReadSet, typename TSize, typename TInf, typename TMatch, typename TOptions>
737 void
extendMatch(TReadSet & readSet,TSize rseqNo,TInf & inf,TMatch & m,TOptions & options,LongestSuffix)738 extendMatch(TReadSet &readSet, TSize rseqNo, TInf & inf, TMatch &m, TOptions &options, LongestSuffix)
739 {
740 #ifdef RAZERS_DEBUG
741 	::std::cout << " extending match left" << ::std::endl;
742 #endif
743 
744 //  XXXIMPROV
745 //	unsigned lDim0 = _max(0,(int)length(readSet[rseqNo])- 2 * (int)options.minMatchLen);
746 	unsigned lDim0 = _max(0,(int)length(readSet[rseqNo])-(int)options.minMatchLen);
747 	unsigned lDim1 = m.gBegin - beginPosition(inf);
748 //  XXXIMPROV
749 //	unsigned rDim0 = length(readSet[rseqNo])-1-options.minMatchLen;
750 	unsigned rDim0 = length(readSet[rseqNo])-1;
751 	unsigned rDim1 = m.gEnd - beginPosition(inf)-1;
752 
753 //    Seed<int,SimpleSeed> seed(lDim0, lDim1, rDim0, rDim1);
754     Seed<Simple> seed(lDim0, lDim1, rDim0+1, rDim1+1);
755 	Score<int> scoreMatrix(0,-1,-1,-1);
756 	int scoreDropOff = static_cast<int>((sequenceLength(rseqNo,readSet) * options.errorRate) - m.editDist);
757 
758 #ifdef RAZERS_DEBUG
759 	::std::cout << "beginPos = " << beginPosition(inf) << std::endl;
760 	::std::cout << "endPos = " << endPosition(inf) << std::endl;
761 	::std::cout << " lDim0: " << lDim0 << ::std::endl;
762 	::std::cout << " lDim1: " << lDim1 << ::std::endl;
763 	::std::cout << " rDim0: " << rDim0 << ::std::endl;
764 	::std::cout << " rDim1: " << rDim1 << ::std::endl;
765 	::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
766 	::std::cout << " readInf: "<< infix(readSet[rseqNo],lDim0,rDim0+1) << ::std::endl;
767 	::std::cout << " gInfInf: "<< infix(inf,lDim1,rDim1+1) << ::std::endl;
768 	::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
769 	::std::cout << " gInf: "<< inf << ::std::endl;
770 #endif
771 
772 	int extScore = 0;
773 	//extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, readSet[rseqNo],inf,0,GappedXDrop());
774 
775     typedef typename Prefix<typename Value<TReadSet>::Type >::Type TQueryPrefix;
776     typedef typename Prefix<TInf>::Type TDatabasePrefix;
777 
778     TQueryPrefix queryPrefix = prefix(readSet[rseqNo], beginPositionH(seed));
779     TDatabasePrefix databasePrefix = prefix(inf, beginPositionV(seed));
780     extScore = _extendSeedGappedXDropOneDirection(seed, databasePrefix, queryPrefix, EXTEND_LEFT, scoreMatrix, scoreDropOff);
781 
782 //	m.gBegin = leftDim1(seed) + beginPosition(inf);
783 //	m.mScore = rightDim0(seed) - leftDim0(seed) + 1;
784 
785 	m.gBegin = beginPositionV(seed) + beginPosition(inf);
786 	m.mScore = endPositionH(seed) - beginPositionH(seed);
787 	m.editDist -= extScore;
788 
789 #ifdef RAZERS_DEBUG
790 	::std::cout << " lDim0: " << beginPositionH(seed) << ::std::endl;
791 	::std::cout << " lDim1: " << beginPositionV(seed) << ::std::endl;
792 	::std::cout << " rDim0: " << endPositionH(seed) << ::std::endl;
793 	::std::cout << " rDim1: " << endPositionV(seed) << ::std::endl;
794 	::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
795 	::std::cout << " readInf: "<< infix(readSet[rseqNo],beginPositionH(seed),endPositionH(seed)+1) << ::std::endl;
796 	::std::cout << " gInfInf: "<< infix(inf,beginPositionV(seed),endPositionV(seed)+1) << ::std::endl;
797 	::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
798 	::std::cout << " gInf: "<< inf << ::std::endl;
799 #endif
800 }
801 
802 template<typename TReadSet, typename TSize, typename TInf, typename TMatch, typename TOptions>
803 void
extendMatch(TReadSet & readSet,TSize rseqNo,TInf & inf,TMatch & m,TOptions & options,LongestPrefix)804 extendMatch(TReadSet &readSet, TSize rseqNo, TInf & inf, TMatch &m, TOptions &options, LongestPrefix)
805 {
806 #ifdef RAZERS_DEBUG
807 	::std::cout << " extending match right" << ::std::endl;
808 #endif
809 
810 	unsigned lDim0 = 0;
811 	unsigned lDim1 = m.gBegin - beginPosition(inf);
812 	unsigned rDim0 = _min(options.minMatchLen,length(readSet[rseqNo])) - 1;
813 	unsigned rDim1 = m.gEnd - beginPosition(inf) - 1;
814 //	Seed<int,SimpleSeed> seed(lDim0, lDim1, rDim0, rDim1);
815 	Seed<Simple> seed(lDim0, lDim1, rDim0+1, rDim1+1);
816 	Score<int> scoreMatrix(0,-1,-1,-1);
817 	int scoreDropOff = static_cast<int>((sequenceLength(rseqNo,readSet) * options.errorRate) - m.editDist);
818 
819 #ifdef RAZERS_DEBUG
820 	::std::cout << "beginPos = " << beginPosition(inf) << std::endl;
821 	::std::cout << "endPos = " << endPosition(inf) << std::endl;
822 	::std::cout << " lDim0: " << lDim0 << ::std::endl;
823 	::std::cout << " lDim1: " << lDim1 << ::std::endl;
824 	::std::cout << " rDim0: " << rDim0 << ::std::endl;
825 	::std::cout << " rDim1: " << rDim1 << ::std::endl;
826 	::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
827 	::std::cout << " readInf: "<< infix(readSet[rseqNo],lDim0,rDim0+1) << ::std::endl;
828 	::std::cout << " gInfInf: "<< infix(inf,lDim1,rDim1+1) << ::std::endl;
829 	::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
830 	::std::cout << " gInf: "<< inf << ::std::endl;
831 #endif
832 
833 	int extScore = 0;
834 //  XXXIMPROV
835 //	extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, prefix(readSet[rseqNo],length(readSet[rseqNo])-options.minMatchLen),inf,1,GappedXDrop());
836 
837     typedef typename Suffix<typename Value<TReadSet>::Type >::Type TQuerySuffix;
838     typedef typename Suffix<TInf>::Type TDatabaseSuffix;
839 
840     TQuerySuffix querySuffix = suffix(readSet[rseqNo], endPositionH(seed));
841     TDatabaseSuffix databaseSuffix = suffix(inf, endPositionV(seed));
842     extScore = _extendSeedGappedXDropOneDirection(seed, databaseSuffix, querySuffix, EXTEND_RIGHT, scoreMatrix, scoreDropOff);
843 
844 
845 	//extendSeedScore(seed,extScore,scoreDropOff,scoreMatrix, readSet[rseqNo],inf,1,GappedXDrop());
846 	//m.gEnd = rightDim1(seed) + 1 + beginPosition(inf);
847 	//m.mScore = rightDim0(seed) - leftDim0(seed) + 1;
848 	m.gEnd = endPositionV(seed)  + beginPosition(inf);
849 	m.mScore = endPositionH(seed) - beginPositionH(seed);
850 	m.editDist -= extScore;
851 
852 #ifdef RAZERS_DEBUG
853 	::std::cout << " lDim0: " << beginPositionH(seed) << ::std::endl;
854 	::std::cout << " lDim1: " << beginPositionV(seed) << ::std::endl;
855 	::std::cout << " rDim0: " << endPositionH(seed) << ::std::endl;
856 	::std::cout << " rDim1: " << endPositionV(seed) << ::std::endl;
857 	::std::cout << " scoreDropOff: "<<scoreDropOff << ::std::endl;
858 	::std::cout << " readInf: "<< infix(readSet[rseqNo],beginPositionH(seed),endPositionH(seed)) << ::std::endl;
859 	::std::cout << " gInfInf: "<< infix(inf,beginPositionV(seed),endPositionV(seed)) << ::std::endl;
860 	::std::cout << " read: "<< readSet[rseqNo] << ::std::endl;
861 	::std::cout << " gInf: "<< inf << ::std::endl;
862 #endif
863 }
864 
865 
866 
867 
868 template <
869 	typename TMatch,
870 	typename TGenome,
871 	typename TReadSet,
872 	typename TPattern,
873 	typename TSpec >
874 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> genomeInf,unsigned rseqNo,TReadSet & readSet,TPattern &,RazerSOptions<TSpec> const & options,SwiftSemiGlobalHamming,LongestPrefix)875 matchVerify(
876 	TMatch &m,									// resulting match
877 	Segment<TGenome,InfixSegment>  genomeInf,	// potential match genome region
878 	unsigned rseqNo,							// read number
879 	TReadSet& readSet,							// original readSet
880 	TPattern&,
881 	RazerSOptions<TSpec> const &options,		// RazerS options
882 	SwiftSemiGlobalHamming,						// HammingDistance
883 	LongestPrefix)								// LongestPrefix
884 {
885 
886 	typedef Segment<TGenome, InfixSegment>                  TGenomeInfix;
887 	//typedef typename Size<TGenomeInfix>::Type               TSize;
888 	//typedef typename Value<TGenomeInfix>::Type              TDna;
889 	//typedef typename Position<TGenomeInfix>::Type           TPosition;
890 	typedef typename Value<TReadSet>::Type 			TRead;
891 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
892 	typedef typename Infix<TRead>::Type 			TReadInf;
893 	typedef typename Iterator<TReadInf, Standard>::Type	TReadIterator;
894 
895 	if (length(genomeInf) < options.minMatchLen) return false;
896 	TReadInf read = infix(readSet[rseqNo],0,length(readSet[rseqNo])-options.minMatchLen);
897 
898 
899 	TReadIterator ritBeg	= begin(read, Standard());
900 	TReadIterator ritEnd	= end(read, Standard());
901 	TGenomeIterator git	= begin(genomeInf, Standard());
902 	TGenomeIterator gitEnd	= end(genomeInf, Standard()) - (length(read) - 1);
903 
904 	// this is max number of errors the seed should have
905 //	unsigned maxErrorsSeed = (unsigned)(options.minMatchLen * options.errorRate);
906 	unsigned maxTotalErrors = (unsigned)(length(readSet[rseqNo]) * options.errorRate);
907 	unsigned maxErrorsSeed = options.maxPrefixErrors;
908 	if(maxErrorsSeed > maxTotalErrors) maxErrorsSeed = maxTotalErrors;
909 	// unsigned minSeedErrors = maxErrorsSeed + 1;
910 	unsigned minTotalErrors = maxTotalErrors + 1;
911 	unsigned bestHitLength = 0;
912 
913 	for (; git < gitEnd; ++git)
914 	{
915 		bool hit = true;
916 		unsigned hitLength = 0;
917 		unsigned count = 0;
918 		unsigned seedErrors = 0;
919 		unsigned totalErrors = 0;
920 		TGenomeIterator g = git;
921 		for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
922 		{
923 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
924 			{
925 				if (count < options.minMatchLen)
926 				{
927 					++totalErrors;
928 					if(++seedErrors > maxErrorsSeed)
929 					{
930 						hit = false;
931 						break;
932 					}
933 				}
934 				else
935 				{
936 					if(++totalErrors > maxTotalErrors)
937 					{
938 						// we exclude this last error position
939 						--totalErrors;
940 						break;
941 					}
942 				}
943 			}
944 			++count;
945 		}
946 		if (hit) hitLength = count;
947 		if (hitLength > bestHitLength ) //simply take the longest hit
948 		{
949 			// minSeedErrors = seedErrors;
950 			minTotalErrors = totalErrors;
951 			bestHitLength = hitLength;
952 			m.gBegin = git - begin(host(genomeInf), Standard());
953 		}
954 	}
955 
956 
957 
958 	if(bestHitLength < options.minMatchLen)
959 		return false;
960 
961 	m.gEnd = m.gBegin + bestHitLength;
962 	m.mScore = bestHitLength;
963 	m.editDist = minTotalErrors;
964 
965 #ifdef RAZERS_DEBUG
966 		std::cout << "m.gBeg  =" << m.gBegin << "\n";
967 		std::cout << "m.gEnd  =" << m.gEnd << "\n";
968 		std::cout << "m.edit  =" << m.editDist << "\n";
969 		std::cout << "m.mScore=" << m.mScore << "\n\n";
970 #endif
971 
972 	return true;
973 
974 }
975 
976 
977 template <
978 	typename TMatch,
979 	typename TGenome,
980 	typename TReadSet,
981 	typename TPattern,
982 	typename TSpec >
983 inline bool
matchVerify(TMatch & m,Segment<TGenome,InfixSegment> genomeInf,unsigned rseqNo,TReadSet & readSet,TPattern &,RazerSOptions<TSpec> const & options,SwiftSemiGlobalHamming,LongestSuffix)984 matchVerify(
985 	TMatch &m,									// resulting match
986 	Segment<TGenome,InfixSegment>  genomeInf,	// potential match genome region
987 	unsigned rseqNo,							// read number
988 	TReadSet & readSet,							// original readSet
989 	TPattern&,
990 	RazerSOptions<TSpec> const &options,		// RazerS options
991 	SwiftSemiGlobalHamming,
992 	LongestSuffix)								// LongestSuffix
993 {
994 
995 	typedef Segment<TGenome, InfixSegment>                  TGenomeInfix;
996 	//typedef typename Size<TGenomeInfix>::Type               TSize;
997 	//typedef typename Value<TGenomeInfix>::Type              TDna;
998 	//typedef typename Position<TGenomeInfix>::Type           TPosition;
999 	typedef typename Value<TReadSet>::Type 			TRead;
1000 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
1001 	typedef typename Infix<TRead>::Type 			TReadInf;
1002 	typedef typename Iterator<TReadInf, Standard>::Type	TReadIterator;
1003 
1004 	if (length(genomeInf) < options.minMatchLen) return false;
1005 	TRead read = infix(readSet[rseqNo],options.minMatchLen,length(readSet[rseqNo]));
1006 
1007 
1008 #ifdef RAZERS_DEBUG
1009 	bool debug = true;
1010 	if(debug)
1011 	{
1012 		::std::cout<< "suffixmatching\n";
1013 		::std::cout << "genome=" << genomeInf << "\nread  =" << read <<"\n";
1014 	}
1015 #endif
1016 
1017 	TReadIterator ritEnd	= end(read, Standard())-1;
1018 	TReadIterator ritBeg	= begin(read, Standard());
1019 	TGenomeIterator git	= end(genomeInf, Standard())-1;
1020 	TGenomeIterator gitBeg	= begin(genomeInf, Standard()) + options.minMatchLen;
1021 
1022 	// this is max number of errors the seed should have
1023 //	unsigned maxErrorsSeed = (unsigned)(options.minMatchLen * options.errorRate);
1024 	unsigned maxTotalErrors = (unsigned)(length(readSet[rseqNo]) * options.errorRate);
1025 	unsigned maxErrorsSeed = options.maxSuffixErrors;
1026 	if(maxErrorsSeed > maxTotalErrors) maxErrorsSeed = maxTotalErrors;
1027 	// unsigned minSeedErrors = maxErrorsSeed + 1;
1028 	unsigned minTotalErrors = maxTotalErrors + 1;
1029 	unsigned bestHitLength = 0;
1030 
1031 	for (; git > gitBeg; --git)
1032 	{
1033 		bool hit = true;
1034 		unsigned hitLength = 0;
1035 		unsigned count = 0;
1036 		unsigned seedErrors = 0;
1037 		unsigned totalErrors = 0;
1038 		TGenomeIterator g = git;
1039 		for(TReadIterator r = ritEnd; r >= ritBeg; --r, --g)
1040 		{
1041 			//if(debug)::std::cout << *r << "\t" << *g << "\n";
1042 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
1043 			{
1044 				if (count < options.minMatchLen)
1045 				{
1046 					++totalErrors;
1047 					if(++seedErrors > maxErrorsSeed)
1048 					{
1049 						hit = false;
1050 						break;
1051 					}
1052 				}
1053 				else
1054 				{
1055 					if(++totalErrors > maxTotalErrors)
1056 					{
1057 						// we exclude this last error position
1058 						--totalErrors;
1059 						break;
1060 					}
1061 				}
1062 			}
1063 			++count;
1064 		}
1065 		if (hit) hitLength = count;
1066 		if (hitLength > bestHitLength ) //simply take the longest hit
1067 		{
1068 			// minSeedErrors = seedErrors;
1069 			minTotalErrors = totalErrors;
1070 			bestHitLength = hitLength;
1071 			m.gEnd = git - begin(host(genomeInf), Standard()) + 1;
1072 
1073 		}
1074 	}
1075 
1076 
1077 	if(bestHitLength < options.minMatchLen)
1078 		return false;
1079 
1080 
1081 	m.gBegin = m.gEnd - bestHitLength;
1082 	m.mScore = bestHitLength;
1083 	m.editDist = minTotalErrors;
1084 
1085 #ifdef RAZERS_DEBUG
1086 	std::cout << "m.gBeg  =" << m.gBegin << "\n";
1087 	std::cout << "m.gEnd  =" << m.gEnd << "\n";
1088 	std::cout << "m.edit  =" << m.editDist << "\n";
1089 	std::cout << "m.mScore=" << m.mScore << "\n\n";
1090 #endif
1091 
1092 	return true;
1093 
1094 }
1095 
1096 
1097 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1098 
1099 
1100 
1101 // Edit distance match combination forward
1102 template <typename TScore>
1103 bool
findBestSplitPosition(String<Pair<TScore,int>> & maxColsL,String<Pair<TScore,int>> & maxColsR,int & rowPosL1,int rowPosL2,int & rowPosR1,int rowPosR2,int seq0Len,int & traceExt,OrientationForward,SwiftSemiGlobal)1104 findBestSplitPosition(String<Pair<TScore,int> > & maxColsL,
1105 					  String<Pair<TScore,int> > & maxColsR,
1106 					  int & rowPosL1,
1107 					  int rowPosL2,
1108 					  int & rowPosR1,
1109 					  int rowPosR2,
1110 					  int seq0Len,
1111 					  int & traceExt,
1112 					  OrientationForward,
1113 					  SwiftSemiGlobal)
1114 {
1115 
1116 #ifdef RAZERS_DEBUG
1117 	::std::cout << "findBestSplitEditForward\n";
1118 #endif
1119 
1120 	TScore maxSum = std::numeric_limits<TScore>::min();
1121 	int bestL = rowPosL1;
1122 	int bestR = rowPosR1;
1123 	int bestTraceExtL = rowPosL1;
1124 	int bestTraceExtR = rowPosL1;
1125 	while (rowPosL1 <= rowPosL2 && rowPosR1 >= rowPosR2)
1126 	{
1127 		// this is to prevent same bases from being used in both prefix and suffix match
1128 		// this works, because we store the FIRST bestScore in each row
1129 		if (!(maxColsL[rowPosL1].i2 + maxColsR[rowPosR1].i2 <= seq0Len))
1130 		{
1131 			++rowPosL1;
1132 			--rowPosR1;
1133 			continue;
1134 		}
1135 		if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 > maxSum)
1136 		{
1137 			maxSum = maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1;
1138 			bestL = rowPosL1;
1139 			bestR = rowPosR1;
1140 			bestTraceExtL = rowPosL1;
1141 		}
1142 		else if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 == maxSum)
1143 			bestTraceExtR = rowPosL1;
1144 
1145 		++rowPosL1;
1146 		--rowPosR1;
1147 	}
1148 	traceExt = bestTraceExtR - bestTraceExtL;
1149 	rowPosL1 = bestL;
1150 	rowPosR1 = bestR;
1151 
1152 	return true;
1153 }
1154 
1155 // ---------------------------------------------------------------------------
1156 // BEGIN BREAKPOINT COMPUTATION CODE FROM ALIGN MODULE
1157 // ---------------------------------------------------------------------------
1158 
1159 // TODO(holtgrew): These functions also has to be copied to msplazers.
1160 // TODO(holtgrew): This breakpoint computation code is basically reusable, but needs some love (i.e. documentation, demos, interfaces)
1161 
1162 template <typename TAlign, typename TStringSet, typename TTrace, typename TValPair, typename TIndexPair, typename TDiagonal>
1163 inline void
_alignBandedNeedlemanWunschTrace(TAlign & align,TStringSet const & str,TTrace const & trace,TValPair const & overallMaxValue,TIndexPair const & overallMaxIndex,TDiagonal const diagL,TDiagonal const diagU)1164 _alignBandedNeedlemanWunschTrace(TAlign& align,
1165 					   TStringSet const& str,
1166 					   TTrace const& trace,
1167 					   TValPair const& overallMaxValue,
1168 					   TIndexPair const& overallMaxIndex,
1169 					   TDiagonal const diagL,
1170 					   TDiagonal const diagU)
1171 {
1172 	typedef typename Value<TStringSet>::Type TString;
1173 	typedef typename Id<TStringSet>::Type TId;
1174 	typedef typename Size<TTrace>::Type TSize;
1175 	typedef typename Value<TTrace>::Type TTraceValue;
1176 
1177 	// Traceback values
1178 	TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2;
1179 
1180 	// Initialization
1181 	TString const& str1 = str[0];
1182 	TString const& str2 = str[1];
1183 	TId id1 = positionToId(const_cast<TStringSet&>(str), 0);
1184 	TId id2 = positionToId(const_cast<TStringSet&>(str), 1);
1185 	TSize len1 = length(str1) + 1;
1186 	TSize len2 = length(str2) + 1;
1187 	TSize lo_row = (diagU <= 0) ? -1 * diagU : 0;
1188 	TSize diagonalWidth = (TSize) (diagU - diagL + 1);
1189 
1190 	//// Debug stuff
1191 	//TColumn originalMat;
1192 	//resize(originalMat, len1 * len2);
1193 	//TSize count = 0;
1194 	//for(TSize i=0; i<len2; ++i) {
1195 	//	for(TSize j=0; j<len1; ++j) {
1196 	//		value(originalMat, i * len1 + j) = count;
1197 	//		std::cout << count << ',';
1198 	//		++count;
1199 	//	}
1200 	//	std::cout << std::endl;
1201 	//}
1202 	//std::cout << std::endl;
1203 
1204 	// Start the trace from the cell with the max value
1205 	TSize row = (overallMaxValue[0] > overallMaxValue[1]) ? overallMaxIndex[0] : overallMaxIndex[2];
1206 	TSize col = (overallMaxValue[0] > overallMaxValue[1]) ? overallMaxIndex[1] : overallMaxIndex[3];
1207 
1208 	// Handle the skipped sequence parts
1209 	TSize actualRow = row + lo_row;
1210 	TSize actualCol = col + diagL + actualRow;
1211 	if (actualCol + 1 < len1) _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, (len1 - (actualCol + 1)),  Horizontal);
1212 	if (actualRow + 1 < len2) _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, (len2 - (actualRow + 1)),  Vertical);
1213 
1214 	if ((actualRow != 0) && (actualCol != 0)) {
1215 		// Find initial direction
1216 		TTraceValue tv = trace[row * diagonalWidth + col];
1217 		if (tv == Horizontal) --col;
1218 		else if (tv == Vertical) {--row; ++col;}
1219 		else --row;
1220 
1221 		// Walk until we hit a border
1222 		TSize seqLen = 1;
1223 		TTraceValue newTv = tv;
1224 		while(true) {
1225 			actualRow = row + lo_row;
1226 			actualCol = col + diagL + actualRow;
1227 			newTv = trace[row * diagonalWidth + col];
1228 
1229 			// Check if we hit a border
1230 			if ((actualRow == 0) || (actualCol == 0)) break;
1231 			else {
1232 				//std::cout << row << ',' << col << ':' << value(originalMat, actualRow * len1 + actualCol) << std::endl;
1233 				if (tv == Diagonal) {
1234 					if (newTv == Horizontal) {
1235 						_alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1236 						--col; seqLen = 1;
1237 					} else if (newTv == Vertical) {
1238 						_alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1239 						--row; ++col; seqLen = 1;
1240 					} else {
1241 						--row; ++seqLen;
1242 					}
1243 				} else {
1244 					if (tv == Horizontal) {
1245 						if (newTv == Diagonal) {
1246 							_alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1247 							--row; seqLen = 1;
1248 						} else if (newTv == Vertical) {
1249 							_alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1250 							--row; ++col; seqLen = 1;
1251 						} else {
1252 							--col; ++seqLen;
1253 						}
1254 					} else {
1255 						if (newTv == Diagonal) {
1256 							_alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1257 							--row; seqLen = 1;
1258 						} else if (newTv == Horizontal) {
1259 							_alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1260 							--col; seqLen = 1;
1261 						} else {
1262 							--row; ++col; ++seqLen;
1263 						}
1264 					}
1265 				}
1266 				tv = newTv;
1267 			}
1268 		}
1269 
1270 		// Align left overs
1271 		if (seqLen) _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
1272 	}
1273 
1274 	// Handle the remaining sequence
1275 	if (actualCol != 0) _alignTracePrint(align, str[0], str[1], (TId) id1, (TSize) 0, (TId) 0, (TSize) 0, (TSize) actualCol,  Horizontal);
1276 	else if (actualRow != 0) _alignTracePrint(align, str[0], str[1], (TId) 0, (TSize) 0, (TId) id2, (TSize) 0, (TSize) actualRow,  Vertical);
1277 }
1278 
1279 template <typename TTrace, typename TStringSet, typename TScore, typename TValPair, typename TIndexPair, typename TDiagonal, typename TAlignConfig>
1280 inline typename Value<TScore>::Type
_alignBandedNeedlemanWunsch(TTrace & trace,TStringSet const & str,TScore const & sc,TValPair & overallMaxValue,TIndexPair & overallMaxIndex,TDiagonal diagL,TDiagonal diagU,TAlignConfig const,String<Pair<typename Value<TScore>::Type,int>> & maxCols,unsigned minColNum)1281 _alignBandedNeedlemanWunsch(TTrace& trace,
1282 			TStringSet const& str,
1283 			TScore const & sc,
1284 			TValPair& overallMaxValue,
1285 			TIndexPair& overallMaxIndex,
1286 			TDiagonal diagL,
1287 			TDiagonal diagU,
1288 			TAlignConfig const,
1289 			String<Pair<typename Value<TScore>::Type,int> > & maxCols,
1290 			unsigned minColNum)
1291 {
1292 	typedef typename Value<TTrace>::Type TTraceValue;
1293 	typedef typename Value<TScore>::Type TScoreValue;
1294 	typedef typename Value<TStringSet>::Type TString;
1295 	typedef typename Size<TTrace>::Type TSize;
1296 
1297 //      ::std::cout << "hierrrrrrrrrr banded ali\n";
1298 	// Initialization
1299 	TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2;
1300 	TString const& str1 = str[0];
1301 	TString const& str2 = str[1];
1302 	TSize len1 = length(str1) + 1;
1303 	TSize len2 = length(str2) + 1;
1304 //      ::std::cout << "len1="<<len1 << " len2=" << len2 << ::std::endl;
1305 	TSize diagonalWidth = (TSize) (diagU - diagL + 1);
1306 	TSize hi_diag = diagonalWidth;
1307 	TSize lo_diag = 0;
1308 	if (diagL > 0) lo_diag = 0;
1309 	else lo_diag = (diagU < 0) ? hi_diag : (TSize) (1 - diagL);
1310 	TSize lo_row = (diagU <= 0) ? -diagU : 0;
1311 	TSize hi_row = len2;
1312 	if (len1 - diagL < hi_row) hi_row = len1 - diagL;
1313 	TSize height = hi_row - lo_row;
1314 
1315 	clear(maxCols);
1316 
1317 	typedef String<TScoreValue> TRow;
1318 	TRow mat;
1319 	resize(mat, diagonalWidth);
1320 	resize(trace, height * diagonalWidth);
1321 //      ::std::cout <<height << "<-hieght\n";
1322 	overallMaxValue[0] = std::numeric_limits<TScoreValue>::min();
1323 	overallMaxValue[1] = std::numeric_limits<TScoreValue>::min();
1324 	overallMaxIndex[0] = diagonalWidth;     overallMaxIndex[1] = height;
1325 	overallMaxIndex[2] = diagonalWidth;     overallMaxIndex[3] = height;
1326 
1327 	//// Debug stuff
1328 	//String<TScoreValue> originalMat;
1329 	//resize(originalMat, len1 * len2);
1330 	//TSize count = 0;
1331 	//for(TSize i=0; i<len2; ++i) {
1332 	//      for(TSize j=0; j<len1; ++j) {
1333 	//              value(originalMat, i * len1 + j) = count;
1334 	//              std::cerr << count << ',';
1335 	//              ++count;
1336 	//      }
1337 	//      std::cerr << std::endl;
1338 	//}
1339 //std::cerr << std::endl;
1340 
1341 	// Classical DP with affine gap costs
1342 	typedef typename Iterator<TRow, Standard>::Type TRowIter;
1343 	typedef typename Iterator<TTrace, Standard>::Type TTraceIter;
1344 	TSize actualCol = 0;
1345 	TSize actualRow = 0;
1346 	TScoreValue verti_val = 0;
1347 	TScoreValue hori_val = 0;
1348 	for(TSize row = 0; row < height; ++row) {
1349 		TScoreValue maxRowVal = std::numeric_limits<TScoreValue>::min();
1350 		unsigned maxRowCol = 0;
1351 		actualRow = row + lo_row;
1352 		if (lo_diag > 0) --lo_diag;
1353 		if (row + lo_row >= len1 - diagU) --hi_diag;
1354 		TTraceIter traceIt = begin(trace, Standard()) + row * diagonalWidth + lo_diag;
1355 		TRowIter matIt = begin(mat, Standard()) + lo_diag;
1356 		hori_val = std::numeric_limits<TScoreValue>::min();
1357 		for(TSize col = lo_diag; col<hi_diag; ++col, ++matIt, ++traceIt) {
1358 			actualCol = col + diagL + actualRow;
1359 			//std::cerr << row << ',' << col << ':' << value(originalMat, actualRow * len1 + actualCol) << std::endl;
1360 
1361 			if ((actualRow != 0) && (actualCol != 0)) {
1362 				// Get the new maximum for mat
1363 				*matIt += score(const_cast<TScore&>(sc), sequenceEntryForScore(const_cast<TScore&>(sc), str1, ((int) actualCol - 1)),
1364 				                sequenceEntryForScore(const_cast<TScore&>(sc), str2, ((int) actualRow - 1)));
1365 				*traceIt = Diagonal;
1366 				if ((verti_val = (col < diagonalWidth - 1) ? *(matIt+1) +
1367 				    scoreGapExtendVertical(sc, sequenceEntryForScore(sc, str1, ((int) actualCol - 1)),
1368 				                           sequenceEntryForScore(sc, str2, ((int) actualRow - 1))) : std::numeric_limits<TScoreValue>::min()) > *matIt)
1369 				{
1370 					*matIt = verti_val;
1371 					*traceIt = Vertical;
1372 				}
1373 				if ((hori_val = (col > 0) ? hori_val +
1374 				    scoreGapExtendHorizontal(sc, sequenceEntryForScore(sc, str1, ((int) actualCol - 1)),
1375 				                             sequenceEntryForScore(sc, str2, ((int) actualRow - 1))) : std::numeric_limits<TScoreValue>::min()) > *matIt)
1376 				{
1377 					*matIt = hori_val;
1378 					*traceIt = Horizontal;
1379 				}
1380 				hori_val = *matIt;
1381 			} else {
1382 				// Usual initialization for first row and column
1383 				if (actualRow == 0)
1384 				    _initFirstRow(TAlignConfig(), *matIt, (TScoreValue) actualCol *
1385 				                  scoreGapExtendHorizontal(sc, sequenceEntryForScore(sc, str1, std::max(0,((int) actualCol - 1))),
1386 				                                           sequenceEntryForScore(sc, str2, 0)));
1387 				else {
1388 					_initFirstColumn(TAlignConfig(), *matIt, (TScoreValue) actualRow *
1389 					                 scoreGapExtendVertical(sc, sequenceEntryForScore(sc, str1, 0),
1390 					                                        sequenceEntryForScore(sc, str2, std::max(0,((int) actualRow - 1)))));
1391 					hori_val = *matIt;
1392 				}
1393 			}
1394 			if(*matIt > maxRowVal && actualCol >= minColNum)
1395 			{
1396 				maxRowVal = *matIt;
1397 				maxRowCol = actualCol;
1398 			}
1399 			// Store the maximum
1400 			if (actualCol == len1 - 1) _lastColumn(TAlignConfig(), overallMaxValue, overallMaxIndex, *matIt, row, col);
1401 			if (actualRow == len2 - 1) _lastRow(TAlignConfig(), overallMaxValue, overallMaxIndex, *matIt, row, col);
1402 			//std::cerr << row << ',' << col << ':' << *matIt << std::endl;
1403 		}
1404 		appendValue(maxCols,Pair<typename Value<TScore>::Type,int>(maxRowVal,maxRowCol));
1405 	//      std::cout << maxRowVal << ","<<maxRowCol << std::endl;
1406 	}
1407 	return (overallMaxValue[0] > overallMaxValue[1]) ? overallMaxValue[0] : overallMaxValue[1];
1408 }
1409 
1410 template<typename TAlign, typename TStringSet, typename TScore, typename TAlignConfig, typename TDiagonal>
1411 inline typename Value<TScore>::Type
_globalAlignment(TAlign & align,TStringSet const & str,TScore const & sc,TAlignConfig const,TDiagonal diag1,TDiagonal diag2,String<Pair<typename Value<TScore>::Type,int>> & maxCols,unsigned minColNum,NeedlemanWunsch)1412 _globalAlignment(TAlign& align,
1413 			TStringSet const& str,
1414 			TScore const& sc,
1415 			TAlignConfig const,
1416 			TDiagonal diag1,
1417 			TDiagonal diag2,
1418 			String<Pair<typename Value<TScore>::Type,int> > &maxCols,
1419 			unsigned minColNum,
1420 			NeedlemanWunsch)
1421 {
1422 	typedef typename Value<TScore>::Type TScoreValue;
1423 	typedef typename Size<TStringSet>::Type TSize;
1424 
1425 	// Maximum value
1426 	TScoreValue overallMaxValue[2];
1427 	TSize overallMaxIndex[4];
1428 
1429 	// Create the trace
1430 	String<TraceBack> trace;
1431 	TScoreValue maxScore = _alignBandedNeedlemanWunsch(trace, str, sc, overallMaxValue, overallMaxIndex, (int) diag1, (int) diag2, TAlignConfig(),maxCols, minColNum);
1432 
1433 	// Follow the trace and create the graph
1434 	_alignBandedNeedlemanWunschTrace(align, str, trace, overallMaxValue, overallMaxIndex, (int) diag1, (int) diag2);
1435 
1436 	return maxScore;
1437 }
1438 
1439 template<typename TStringSet, typename TScore, typename TAlignConfig, typename TDiagonal>
1440 inline typename Value<TScore>::Type
_globalAlignment(TStringSet const & str,TScore const & sc,TAlignConfig const,TDiagonal diag1,TDiagonal diag2,String<Pair<typename Value<TScore>::Type,int>> & maxCols,unsigned minColNum,NeedlemanWunsch)1441 _globalAlignment(TStringSet const& str,
1442 			TScore const& sc,
1443 			TAlignConfig const,
1444 			TDiagonal diag1,
1445 			TDiagonal diag2,
1446 			String<Pair<typename Value<TScore>::Type,int> > &maxCols,
1447 			unsigned minColNum,
1448 			NeedlemanWunsch)
1449 {
1450 	typedef typename Value<TScore>::Type TScoreValue;
1451 	typedef typename Size<TStringSet>::Type TSize;
1452 
1453 	// Maximum value
1454 	TScoreValue overallMaxValue[2];
1455 	TSize overallMaxIndex[4];
1456 
1457 	// Calculate the score
1458 	String<TraceBack> trace;
1459 	return _alignBandedNeedlemanWunsch(trace, str, sc, overallMaxValue, overallMaxIndex, (int) diag1, (int) diag2, TAlignConfig(),maxCols,minColNum);
1460 }
1461 
1462 // ---------------------------------------------------------------------------
1463 // END BREAKPOINT COMPUTATION CODE FROM ALIGN MODULE
1464 // ---------------------------------------------------------------------------
1465 
1466 // Edit distance match combination reverse
1467 template <typename TScore>
1468 bool
findBestSplitPosition(String<Pair<TScore,int>> & maxColsL,String<Pair<TScore,int>> & maxColsR,int & rowPosL1,int rowPosL2,int & rowPosR1,int rowPosR2,int seq0Len,int & traceExt,OrientationReverse,SwiftSemiGlobal)1469 findBestSplitPosition(String<Pair<TScore,int> > & maxColsL,
1470 					  String<Pair<TScore,int> > & maxColsR,
1471 					  int & rowPosL1,
1472 					  int rowPosL2,
1473 					  int & rowPosR1,
1474 					  int rowPosR2,
1475 					  int seq0Len,
1476 					  int & traceExt,
1477 					  OrientationReverse,
1478 					  SwiftSemiGlobal)
1479 {
1480 #ifdef RAZERS_DEBUG
1481 	::std::cout << "findBestSplitEditReverse\n";
1482 #endif
1483 
1484 	TScore maxSum = std::numeric_limits<TScore>::min();
1485 	int bestL = rowPosL2;
1486 	int bestR = rowPosR2;
1487 	int bestTraceExtR = rowPosL1;
1488 	int bestTraceExtL = rowPosL1;
1489 
1490 	while (rowPosL1 <= rowPosL2 && rowPosR1 >= rowPosR2)
1491 	{
1492 		if(!(maxColsL[rowPosL1].i2 + maxColsR[rowPosR1].i2 <= seq0Len))
1493 		{
1494 			++rowPosL1;
1495 			--rowPosR1;
1496 			continue;
1497 		}
1498 		if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 >= maxSum)
1499 		{
1500 			maxSum = maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1;
1501 			bestL = rowPosL1;
1502 			bestR = rowPosR1;
1503 			bestTraceExtR = rowPosL1;
1504 			if(maxColsL[rowPosL1].i1 + maxColsR[rowPosR1].i1 > maxSum) bestTraceExtL = rowPosL1;
1505 		}
1506 		++rowPosL1;
1507 		--rowPosR1;
1508 	}
1509 	rowPosL1 = bestL;
1510 	rowPosR1 = bestR;
1511 	traceExt = bestTraceExtR - bestTraceExtL;
1512 	return true;
1513 }
1514 
1515 
1516 
1517 
1518 
1519 // Edit distance match combination wrapper
1520 template <typename TMatch, typename TRead, typename TGenome, typename TSpec>
1521 bool
combineLeftRight(TMatch & mR,TMatch & mL,TRead & read,TGenome & genome,RazerSOptions<TSpec> & options,char orientation,SwiftSemiGlobal)1522 combineLeftRight(TMatch & mR,
1523 		 TMatch & mL,
1524 		 TRead & read,
1525 		 TGenome & genome,
1526 		 RazerSOptions<TSpec> &options,
1527 		 char orientation,
1528 		 SwiftSemiGlobal)
1529 {
1530 
1531 #ifdef RAZERS_DEBUG
1532 	::std::cout << "combinLeftRightEdit\n";
1533 #endif
1534 	Score<int> scoreType(0,-1,-1,-1);
1535 	typedef typename Infix<TGenome>::Type TGenomeInf;
1536 	typedef ModifiedString<TGenomeInf,ModReverse> TGenomeInfRev;
1537 
1538 	TGenomeInf genomeInfL = infix(genome, mL.gBegin+mL.gSeedLen, mL.gEnd);
1539 	TGenomeInf readInfL = infix(read,options.minMatchLen,mL.mScore);
1540 	TGenomeInfRev genomeInfR(infix(genome, mR.gBegin, mR.gEnd-mR.gSeedLen));
1541 	TGenomeInfRev readInfR(infix(read,length(read)-mR.mScore,length(read)-options.minMatchLen));
1542 
1543 #ifdef RAZERS_DEBUG
1544 	bool debug = true;
1545 	std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1546 	std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1547 	std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1548 	std::cout << "mL.editDist =" << mL.editDist << "\n";
1549 
1550 	std::cout << "mR.mScore =" << mR.mScore << "\n";
1551 	std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1552 	std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1553 	std::cout << "mR.editDist =" << mR.editDist << "\n";
1554 #endif
1555 
1556 	int readLength = length(read);
1557 	unsigned maxErrors = static_cast<unsigned>(readLength * options.errorRate);
1558 
1559 	// neither insertion nor deletion
1560 	if(mR.gEnd - mL.gBegin == readLength)
1561 	{
1562 #ifdef RAZERS_DEBUG
1563 		::std::cout << "normal\n";
1564 #endif
1565 		if(true)return false;
1566 		unsigned halfReadLen = readLength >> 1;
1567 
1568 		Align<String<Dna5>, ArrayGaps> align;
1569 		assignSource(row(align, 0), infix(read,0,readLength));
1570 		assignSource(row(align, 1), infix(genome, mL.gBegin, mR.gEnd));
1571 		int sc = globalAlignment(align, scoreType, AlignConfig<false,false,false,false>(), NeedlemanWunsch());
1572 		if(-sc > (int)maxErrors) return false;
1573 
1574 		mL.gEnd = mL.gBegin + toSourcePosition(row(align, 1),toViewPosition(row(align, 0), halfReadLen-1));
1575 		mR.gBegin = mL.gEnd;
1576 		mL.mScore = halfReadLen;
1577 		mR.mScore = readLength - mL.mScore;
1578 		mL.editDist = countErrorsInAlign(align,toViewPosition(row(align, 0), halfReadLen));
1579 		mR.editDist = -sc-mL.editDist;
1580 
1581 #ifdef RAZERS_DEBUG
1582 		std::cout << "mL.mScore =" << mL.mScore << "\n";
1583 		std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1584 		std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1585 		std::cout << "mL.editDist =" << mL.editDist << "\n";
1586 
1587 		std::cout << "mR.mScore =" << mR.mScore << "\n";
1588 		std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1589 		std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1590 		std::cout << "mR.editDist =" << mR.editDist << "\n";
1591 #endif
1592 
1593 	}
1594 
1595 	//potential insertion // for edit distance this is a heuristic
1596 	if(mR.gEnd - mL.gBegin < readLength)
1597 	{
1598 #ifdef RAZERS_DEBUG
1599 		::std::cout << "insertion\n";
1600 #endif
1601 		if(mR.gEnd - mL.gBegin < static_cast<long int>(2*options.minMatchLen))  //too close together // actually minus allowed seed errors
1602 			return false;
1603 
1604 		if(mL.gEnd < mR.gBegin)  //prefix and suffix match do not meet
1605 			return false;
1606 
1607 		if(mR.mScore + mL.mScore == mR.gEnd - mL.gBegin)
1608 			if(mL.gEnd == mR.gBegin)
1609 				if(mR.editDist + mL.editDist <= maxErrors ) //prefix and suffix match meet and do not overlap --> perfect
1610 					return true;
1611 
1612 		int diag1L = -static_cast<int>(maxErrors) + mL.seedEditDist;
1613 		int diag2L = maxErrors - mL.seedEditDist;
1614 		int diag1R = -static_cast<int>(maxErrors) + mR.seedEditDist;
1615 		int diag2R = maxErrors - mR.seedEditDist;
1616  		int minColNum = 0;
1617 
1618 		// genomeInf is the shorter sequence --> find best split position on genome
1619 		// rows in alignment matrix represent genome position
1620 		StringSet<TGenomeInf,Dependent<> > strSetL;
1621 		appendValue(strSetL,readInfL);
1622 		appendValue(strSetL,genomeInfL);
1623 		Graph<Alignment<StringSet<TGenomeInf,Dependent<> >, void> > alignL(strSetL);
1624 		String<Pair<int,int> > maxColsL;
1625 		_globalAlignment(alignL,strSetL,scoreType,AlignConfig<false,false,false,false>(),diag1L,diag2L,maxColsL,minColNum,NeedlemanWunsch());
1626 
1627 		StringSet<TGenomeInfRev,Dependent<> > strSetR;
1628 		appendValue(strSetR,readInfR);
1629 		appendValue(strSetR,genomeInfR);
1630 		Graph<Alignment<StringSet<TGenomeInfRev,Dependent<> >, void > > alignR(strSetR);
1631 		String<Pair<int,int> > maxColsR;
1632 		_globalAlignment(alignR,strSetR,scoreType,AlignConfig<false,false,false,false>(),diag1R,diag2R,maxColsR,minColNum,NeedlemanWunsch());
1633 
1634 		//::std::cout << alignL;
1635 		//::std::cout << alignR;
1636 
1637 		// our begin and start positions are defined by the read positions
1638 		// go from read source to view position to genome source position
1639 		int rowPosL1 = 0;
1640 		if (mL.gSeedLen < (int)mR.gBegin - mL.gBegin) rowPosL1 = mR.gBegin - mL.gBegin - mL.gSeedLen;//or from first possible overlap pos
1641 		int rowPosR2 = 0;
1642 		if (mR.gSeedLen < (int)mR.gEnd - mL.gEnd) rowPosR2 = mR.gEnd - mL.gEnd - mR.gSeedLen;
1643 
1644 		int rowPosL2 = mR.gEnd - mR.gSeedLen - rowPosR2 - mL.gBegin - mL.gSeedLen;
1645 		int rowPosR1 = mR.gEnd - mR.gSeedLen - rowPosL1 - mL.gBegin - mL.gSeedLen;
1646 
1647 
1648 #ifdef RAZERS_DEBUG
1649 		::std::cout << "vorher\nrowPosL1=" << rowPosL1 << ::std::endl;
1650 		::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1651 		::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1652 		::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1653 #endif
1654 
1655 		// compute best split position
1656 		int seq0Len = readLength - 2*options.minMatchLen;
1657 		int traceExt = 0;
1658 		if(orientation == 'R')
1659 			findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,traceExt,OrientationReverse(),SwiftSemiGlobal());
1660 		else
1661 			findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,traceExt,OrientationForward(),SwiftSemiGlobal());
1662 
1663 #ifdef RAZERS_DEBUG
1664 		::std::cout << "nachher\nrowPosL1=" << rowPosL1 << ::std::endl;
1665 		::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1666 		::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1667 		::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1668 		::std::cout << "mR.editDist=" << mR.editDist << ::std::endl;
1669 		::std::cout << "mL.editDist=" << mL.editDist << ::std::endl;
1670 		::std::cout << "maxErros=" << maxErrors << ::std::endl;
1671 #endif
1672 		mL.editDist = mR.seedEditDist - maxColsL[rowPosL1].i1;	//scores are negative
1673 		mR.editDist = mL.seedEditDist - maxColsR[rowPosR1].i1;
1674 		if(mR.editDist + mL.editDist > maxErrors )
1675 			return false;
1676 
1677 		mR.traceExtension = mL.traceExtension = traceExt;
1678 
1679 		// best split position in genome is saved in rowPosL1 (and rowPosR1)
1680 		mL.mScore = options.minMatchLen + maxColsL[rowPosL1].i2;
1681 		mL.gEnd = mL.gBegin + mL.gSeedLen + rowPosL1; //read position of best genomic split
1682 
1683 		mR.mScore = options.minMatchLen + maxColsR[rowPosR1].i2;
1684 		mR.gBegin = mR.gEnd - mR.gSeedLen - rowPosR1; //read position of best genomic split
1685 
1686 #ifdef RAZERS_DEBUG
1687 		if(mL.editDist > 50 || mR.mScore < options.minMatchLen || mL.mScore < options.minMatchLen)
1688 		{
1689 			::std::cout << "-maxColsL[rowPosL1].i1=" << -maxColsL[rowPosL1].i1 << " -maxColsL[rowPosL1].i2=" << -maxColsL[rowPosL1].i2 << " rowPosL1="<<rowPosL1;
1690 			::std::cout << " maxColsLLen="<< length(maxColsL) << ::std::endl;
1691 			::std::cout << "-maxColsR[rowPosR1].i1=" << -maxColsR[rowPosR1].i1 << "-maxColsR[rowPosR1].i2=" << -maxColsR[rowPosR1].i2 << " rowPosR1="<<rowPosR1;
1692 			::std::cout << " maxColsRLen="<< length(maxColsR) << ::std::endl;
1693 		}
1694 #endif
1695 	}
1696 
1697 	//potential deletion
1698 	if(mR.gEnd - mL.gBegin > readLength)
1699 	{
1700 #ifdef RAZERS_DEBUG
1701 		::std::cout << "deletion\n";
1702 #endif
1703 		if(mR.mScore + mL.mScore < readLength)
1704 			return false;
1705 		if(mR.mScore + mL.mScore == readLength && mR.editDist + mL.editDist > maxErrors) //the prefix and suffix match meet, but too many errors
1706 			return false;
1707 
1708 		int diag1L = -static_cast<int>(maxErrors) + mL.seedEditDist;
1709 		int diag2L = maxErrors - mL.seedEditDist;
1710 		int diag1R = -static_cast<int>(maxErrors) + mR.seedEditDist;
1711 		int diag2R = maxErrors - mR.seedEditDist;
1712 		int minColNum = 0;
1713 
1714 		// readInf is the shorter sequence --> find best split position on read
1715 		// rows in alignment matrix represent read position
1716 		StringSet<TGenomeInf> strL;
1717 		appendValue(strL,genomeInfL);
1718 		appendValue(strL,readInfL);
1719 		String<Pair<int,int> > maxColsL;
1720 		_globalAlignment(strL,scoreType,AlignConfig<false,false,false,false>(),diag1L,diag2L,maxColsL,minColNum,NeedlemanWunsch());
1721 
1722 		StringSet<TGenomeInfRev> strR;
1723 		appendValue(strR,genomeInfR);
1724 		appendValue(strR,readInfR);
1725 		String<Pair<int,int> > maxColsR;
1726 
1727 		_globalAlignment(strR,scoreType,AlignConfig<false,false,false,false>(),diag1R,diag2R,maxColsR,minColNum,NeedlemanWunsch());
1728 
1729 		int rowPosL1 = ((int)options.minMatchLen > readLength-mR.mScore) ? (int)0 : readLength-mR.mScore-options.minMatchLen;
1730 		int rowPosL2 = (readLength-(int)options.minMatchLen < mL.mScore) ? readLength-(int)2*options.minMatchLen : mL.mScore - options.minMatchLen;
1731 		int rowPosR1 = (int)readLength - 2*options.minMatchLen - rowPosL1;
1732 		int rowPosR2 = (int)readLength - 2*options.minMatchLen - rowPosL2;
1733 
1734 #ifdef RAZERS_DEBUG
1735 		::std::cout << "rowPosL1=" << rowPosL1 << ::std::endl;
1736 		::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1737 		::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1738 		::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1739 
1740 		std::cout << "before split:\nmL.mScore =" << mL.mScore << "\n";
1741 		std::cout << "mL.gBegin =" << length(genome)-mL.gBegin << "\n";
1742 		std::cout << "mL.gEnd =" << length(genome)-mL.gEnd << "\n";
1743 		std::cout << "mL.editDist =" << mL.editDist << "\n";
1744 
1745 		std::cout << "mR.mScore =" << mR.mScore << "\n";
1746 		std::cout << "mR.gBegin =" << length(genome)-mR.gBegin << "\n";
1747 		std::cout << "mR.gEnd =" << length(genome)-mR.gEnd << "\n";
1748 		std::cout << "mR.editDist =" << mR.editDist << "\n";
1749 #endif
1750 
1751 		// compute best split position
1752 		int seq0Len = mR.gEnd - mL.gBegin;
1753 		int traceExt = 0;
1754 		if(orientation == 'R')
1755 			findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,traceExt,OrientationReverse(),SwiftSemiGlobal());
1756 		else
1757 			findBestSplitPosition(maxColsL,maxColsR,rowPosL1,rowPosL2,rowPosR1,rowPosR2,seq0Len,traceExt,OrientationForward(),SwiftSemiGlobal());
1758 
1759 
1760 		// best split position in read is saved in rowPosL1 (and rowPosR1)
1761 		mL.editDist = mL.seedEditDist - maxColsL[rowPosL1].i1;
1762 		mR.editDist = mR.seedEditDist - maxColsR[rowPosR1].i1;
1763 
1764 
1765 		if(mR.editDist + mL.editDist > maxErrors)
1766 			return false;
1767 
1768 		mR.mScore = options.minMatchLen + rowPosR1;
1769 		mR.gBegin = mR.gEnd - maxColsR[rowPosR1].i2 - mR.gSeedLen; //genomic position of best read split
1770 		mL.mScore = options.minMatchLen + rowPosL1;
1771 		mR.traceExtension = mL.traceExtension = traceExt;
1772 
1773 		mL.gEnd = mL.gBegin + maxColsL[rowPosL1].i2 + mL.gSeedLen; //genomic position of best read split
1774 #ifdef RAZERS_DEBUG
1775 		::std::cout << "rowPosL1=" << rowPosL1 << ::std::endl;
1776 		::std::cout << "rowPosL2=" << rowPosL2 << ::std::endl;
1777 		::std::cout << "rowPosR1=" << rowPosR1 << ::std::endl;
1778 		::std::cout << "rowPosR2=" << rowPosR2 << ::std::endl;
1779 
1780 		std::cout << "after split:\nmL.mScore =" << mL.mScore << "\n";
1781 		std::cout << "mL.gBegin =" << length(genome)-mL.gBegin << "\n";
1782 		std::cout << "mL.gEnd =" << length(genome)-mL.gEnd << "\n";
1783 		std::cout << "mL.editDist =" << mL.editDist << "\n";
1784 
1785 		std::cout << "mR.mScore =" << mR.mScore << "\n";
1786 		std::cout << "mR.gBegin =" << length(genome)-mR.gBegin << "\n";
1787 		std::cout << "mR.gEnd =" << length(genome)-mR.gEnd << "\n";
1788 		std::cout << "mR.editDist =" << mR.editDist << "\n";
1789 #endif
1790 
1791 	}
1792 	maxErrors = static_cast<unsigned>((mR.mScore + mL.mScore) * options.errorRate);
1793 	if(mR.editDist + mL.editDist > maxErrors) return false; //make sure percent identity critrium is fulfilled on matched part of the read
1794 
1795 	return true;
1796 }
1797 
1798 
1799 
1800 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1801 // Hamming distance match combination
1802 
1803 template <typename TMatch, typename TOriRead, typename TGenome, typename TSpec>
1804 bool
combineLeftRight(TMatch & mR,TMatch & mL,TOriRead const & oriRead,TGenome & genome,RazerSOptions<TSpec> & options,char orientation,SwiftSemiGlobalHamming)1805 combineLeftRight(TMatch & mR,
1806 		 TMatch & mL,
1807 		 TOriRead const& oriRead,
1808 		 TGenome & genome,
1809 		 RazerSOptions<TSpec> &options,
1810 		 char orientation,
1811 		 SwiftSemiGlobalHamming)
1812 {
1813 
1814 #ifdef RAZERS_DEBUG
1815 	::std::cout << "combineLeftRightHamming\n";
1816 #endif
1817 
1818 
1819 	typedef typename Infix<TOriRead const>::Type TRead;
1820 	TRead read = infix(oriRead,0,length(oriRead));
1821 
1822 	typedef typename Infix<TGenome>::Type TGenomeInf;
1823 	TGenomeInf genomeInf = infix(genome, mL.gBegin, mR.gEnd);
1824 	int readLength = length(read);
1825 	unsigned maxErrors = static_cast<unsigned>(readLength * options.errorRate);
1826 
1827 
1828 #ifdef RAZERS_DEBUG
1829 	std::cout << "readLength=" << readLength << "\n";
1830 	std::cout << "sumLen=" << mL.mScore + mR.mScore << "\n";
1831 	std::cout << "gInfLength=" << length(genomeInf) << "\n";
1832 	std::cout << "gInf=" << genomeInf << "\n";
1833 
1834 	std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1835 	std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1836 	std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1837 	std::cout << "mL.editDist =" << mL.editDist << "\n";
1838 	std::cout << "readPref=" << prefix(oriRead,mL.mScore) << "\n";
1839 
1840 	std::cout << "mR.mScore =" << mR.mScore << "\n";
1841 	std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1842 	std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1843 	std::cout << "mR.editDist =" << mR.editDist << "\n";
1844 	std::cout << "readSuff=" << suffix(oriRead,length(oriRead)-mR.mScore) << "\n";
1845 #endif
1846 
1847 
1848 	// neither insertion nor deletion
1849 	if(mR.gEnd - mL.gBegin == readLength)
1850 	{
1851 #ifdef RAZERS_DEBUG
1852 		::std::cout << "normal\n";
1853 #endif
1854 		unsigned halfReadLen = readLength >> 1;
1855 		unsigned prefixErrors = 0;
1856 		unsigned suffixErrors = 0;
1857 		for (unsigned i = 0 ; i < length(genomeInf); ++i)
1858 		{
1859 			if ((options.compMask[ordValue(read[i])] & options.compMask[ordValue(genomeInf[i])]) == 0)
1860 			{
1861 				if(i >= halfReadLen)
1862 					++suffixErrors;
1863 				else
1864 					++prefixErrors;
1865 			}
1866 		}
1867 
1868 		if (suffixErrors+prefixErrors <= maxErrors)
1869 		{
1870 			mL.mScore = halfReadLen;
1871 			mR.mScore = length(read)- halfReadLen;
1872 			mR.gBegin = mR.gEnd - mR.mScore;
1873 			mL.gEnd = mL.gBegin + mL.mScore;
1874 			mL.editDist = prefixErrors;
1875 			mR.editDist = suffixErrors;
1876 		}
1877 		else return false;
1878 
1879 	}
1880 	//potential insertion
1881 	if(mR.gEnd - mL.gBegin < readLength)
1882 	{
1883 #ifdef RAZERS_DEBUG
1884 		::std::cout << "insertion\n";
1885 #endif
1886 
1887 		if(mR.gEnd - mL.gBegin < static_cast<long int>(2*options.minMatchLen))//too close together
1888 			 return false;
1889 
1890 		if(mR.mScore + mL.mScore < mR.gEnd - mL.gBegin) //prefix and suffix match do not meet
1891 			return false;
1892 
1893 		if((mR.mScore + mL.mScore == mR.gEnd - mL.gBegin) && (mR.editDist + mL.editDist > maxErrors)) //prefix and suffix match meet but too many errors
1894 			return false;
1895 
1896 //		if((mR.gEnd - mL.gBegin <= mL.mScore) || (mR.gEnd - mL.gBegin <= mR.mScore))//too close together
1897 //			 return false;
1898 		int traceExt = 0;
1899 		bool result = findBestSplitPosition(read,genomeInf,mL.mScore,mR.mScore,mL.editDist,mR.editDist, traceExt, options, orientation, SwiftSemiGlobalHamming());
1900 		if(!result || mR.editDist + mL.editDist > maxErrors) return false;
1901 		mR.gBegin = mR.gEnd - mR.mScore;
1902 		mL.gEnd = mL.gBegin + mL.mScore;
1903 		mR.traceExtension = mL.traceExtension = traceExt;
1904 
1905 
1906 	}
1907 	//potential deletion
1908 	if(mR.gEnd - mL.gBegin > readLength)
1909 	{
1910 #ifdef RAZERS_DEBUG
1911 		::std::cout << "deletion\n";
1912 #endif
1913 
1914 		if(mR.mScore + mL.mScore < readLength)
1915 			return false;
1916 
1917 		if((mR.mScore + mL.mScore == readLength) && (mR.editDist + mL.editDist > maxErrors)) //the prefix and suffix match meet and do not overlap --> perfect
1918 			return false;
1919 
1920 		int traceExt = 0;
1921 		bool result = findBestSplitPosition(genomeInf,read,mL.mScore,mR.mScore,mL.editDist,mR.editDist, traceExt, options, orientation,SwiftSemiGlobalHamming());
1922 
1923 		if(!result || mR.editDist + mL.editDist > maxErrors) return false;
1924 
1925 		mR.traceExtension = mL.traceExtension = traceExt;
1926 		mR.gBegin = mR.gEnd - mR.mScore;
1927 		mL.gEnd = mL.gBegin + mL.mScore;
1928 
1929 	}
1930 #ifdef RAZERS_DEBUG
1931 	std::cout << "incombineleftright\nmL.mScore =" << mL.mScore << "\n";
1932 	std::cout << "mL.gBegin =" << mL.gBegin << "\n";
1933 	std::cout << "mL.gEnd =" << mL.gEnd << "\n";
1934 	std::cout << "mL.editDist =" << mL.editDist << "\n";
1935 
1936 	std::cout << "mR.mScore =" << mR.mScore << "\n";
1937 	std::cout << "mR.gBegin =" << mR.gBegin << "\n";
1938 	std::cout << "mR.gEnd =" << mR.gEnd << "\n";
1939 	std::cout << "mR.editDist =" << mR.editDist << "\n";
1940 #endif
1941 
1942 	maxErrors = static_cast<unsigned>((mR.mScore + mL.mScore) * options.errorRate);
1943 	if(mR.editDist + mL.editDist > maxErrors) return false; //make sure percent identity critrium is fulfilled on matched part of the read
1944 
1945 	return true;
1946 }
1947 
1948 
1949 // find the best split position for a split match
1950 // positions are relative to shorter sequence
1951 // (deletion --> read is shorter, insertion --> genomeInf is shorter)
1952 template <typename TSize, typename TLongerSegment, typename TShorterSegment, typename TErrors, typename TOptions>
1953 bool
findBestSplitPosition(TLongerSegment & longSeg,TShorterSegment & shortSeg,TSize & mLmScore,TSize & mRmScore,TErrors & errorsLeft,TErrors & errorsRight,int & traceExt,TOptions & options,char orientation,SwiftSemiGlobalHamming)1954 findBestSplitPosition(TLongerSegment &longSeg,
1955 			TShorterSegment &shortSeg,
1956 			TSize & mLmScore,
1957 			TSize & mRmScore,
1958 			TErrors & errorsLeft,
1959 			TErrors & errorsRight,
1960 			int & traceExt,
1961 			TOptions & options,
1962 			char orientation,
1963 			SwiftSemiGlobalHamming)
1964 {
1965 
1966 #ifdef RAZERS_DEBUG
1967 	::std::cout << "findBestSplitHamming"<<orientation<<"\n";
1968 #endif
1969 
1970 	// usually, both types should be the same, but you never know...
1971 	//typedef typename Iterator<TLongerSegment const>::Type TLongIterator;
1972 	//typedef typename Iterator<TShorterSegment const>::Type TShortIterator;
1973 	typedef typename Size<TShorterSegment const>::Type TShortSize;
1974 	typedef typename Size<TLongerSegment const>::Type TLongSize;
1975 
1976 
1977 	TShortSize shortLen = length(shortSeg);
1978 	TLongSize  longLen  = length(longSeg);
1979 	TLongSize  lenDiff  = longLen - shortLen;
1980 
1981 	TLongSize leftLongBegin = _max((int)options.minMatchLen,(int)shortLen-mRmScore);
1982 	TLongSize leftLongEnd = _min((int)mLmScore,(int)shortLen-(int)options.minMatchLen);
1983 	TLongSize leftLongPos = leftLongBegin;
1984 
1985 	TLongSize rightLongPos = leftLongBegin + lenDiff;
1986 	TShortSize shortPos = leftLongBegin;
1987 
1988 	int bestSumErrors = 0;
1989 	// int bestLErrors = 0;
1990 	// int bestRErrors = 0;
1991 	int bestPos = shortPos;
1992 	int errorsL = 0;
1993 	int errorsR = 0;
1994 	int errorsPosL = 0;
1995 	int errorsPosR = 0;
1996 
1997 	// determine trace extensions
1998 	int bestTraceExtL = shortPos;
1999 	int bestTraceExtR = shortPos;
2000 
2001 #ifdef RAZERS_DEBUG
2002 	std::cout << "before\nerrorsLeft= " << errorsLeft << std::endl;
2003 	std::cout << "errorsRight= " << errorsRight << std::endl;
2004 	std::cout << "mLmScore= " << mLmScore << std::endl;
2005 	std::cout << "mRmScore= " << mRmScore << std::endl;
2006 
2007 	std::cout << "leftLongPos " << leftLongPos << std::endl;
2008 	std::cout << "leftLongEnd " << leftLongEnd << std::endl;
2009 	std::cout << "rightLongPos " << rightLongPos << std::endl;
2010 	std::cout << "shortPos " << shortPos << std::endl;
2011 
2012 	std::cout << longSeg << "=longSeg\n";
2013 	std::cout << shortSeg << "=shortSeg\n";
2014 #endif
2015 
2016 	//find best split position --> min. sum errors
2017 	while(leftLongPos < leftLongEnd)
2018 	{
2019 		if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[leftLongPos])]) == 0)
2020 		{
2021 			++errorsL;
2022 			++errorsPosL;
2023 		}
2024 		if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[rightLongPos])]) == 0)
2025 		{
2026 			--errorsPosR;
2027 			++errorsR;
2028 		}
2029 		if(errorsPosL+errorsPosR < bestSumErrors
2030 			|| (orientation == 'R' && errorsPosL+errorsPosR == bestSumErrors))
2031 		{
2032 			bestSumErrors = errorsPosL+errorsPosR;
2033 			// bestLErrors = errorsPosL;
2034 			// bestRErrors = errorsPosR;
2035 			bestPos = shortPos + 1;
2036 			if(errorsPosL+errorsPosR < bestSumErrors) bestTraceExtL = bestPos;
2037 		}
2038 		if(errorsPosL+errorsPosR == bestSumErrors)
2039 		{
2040 			bestTraceExtR = shortPos + 1;
2041 		}
2042 		++leftLongPos;
2043 		++rightLongPos;
2044 		++shortPos;
2045 	}
2046 
2047 	// trace extension:
2048 	traceExt = bestTraceExtR - bestTraceExtL;
2049 
2050 	//update to new match lengths
2051 	mLmScore = bestPos;
2052 	mRmScore = shortLen - bestPos;
2053 
2054 	//count numErrors for left and right match
2055 	//(have to count completely new, because mScore may be longer than shortLen --> no able to track errors outside segment)
2056 	errorsRight = errorsLeft = 0;
2057 	for(leftLongPos = 0, shortPos = 0; leftLongPos < (unsigned)mLmScore; ++leftLongPos, ++shortPos)
2058 	{
2059 		if((options.compMask[ordValue(shortSeg[shortPos])] & options.compMask[ordValue(longSeg[leftLongPos])]) == (unsigned) 0)
2060 			++errorsLeft;
2061 
2062 	}
2063 	for(rightLongPos = 0, shortPos = 0; rightLongPos < (unsigned)mRmScore; ++rightLongPos, ++shortPos)
2064 	{
2065 		if((options.compMask[ordValue(shortSeg[shortLen-1-shortPos])] & options.compMask[ordValue(longSeg[longLen-1-rightLongPos])]) ==  (unsigned) 0)
2066 			++errorsRight;
2067 
2068 	}
2069 
2070 #ifdef RAZERS_DEBUG
2071 	std::cout << "bestSumErrors= " << bestSumErrors << std::endl;
2072 	std::cout << "errorsPosR= " << errorsPosR << std::endl;
2073 	std::cout << "errorsPosL= " << errorsPosL << std::endl;
2074 	std::cout << "after\nerrorsLeft= " << errorsLeft << std::endl;
2075 	std::cout << "errorsRight= " << errorsRight << std::endl;
2076 	std::cout << "mLmScore= " << mLmScore << std::endl;
2077 	std::cout << "mRmScore= " << mRmScore << std::endl;
2078 	std::cout << "traceExt= " << traceExt << std::endl;
2079 #endif
2080 
2081 	return true;
2082 }
2083 
2084 
2085 
2086 
2087 
2088 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2089 // Find read matches in many genome sequences (import from Fasta)
2090 template <
2091 	typename TMatches,
2092 	typename TReadSet_,
2093 	typename TCounts,
2094 	typename TSpec,
2095 	typename TShapeL,
2096 	typename TShapeR,
2097 	typename TSwiftSpec >
mapSplicedReads(TMatches & matches,StringSet<CharString> & genomeFileNameList,StringSet<CharString> & genomeNames,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReadSet_ & readSet,TReadRegions & readRegions,TCounts & cnts,RazerSOptions<TSpec> & options,TShapeL const & shapeL,TShapeR const & shapeR,Swift<TSwiftSpec> const)2098 int mapSplicedReads(
2099 	TMatches &		matches,
2100 	StringSet<CharString> &	genomeFileNameList,
2101 	StringSet<CharString> &	genomeNames,	// genome names, taken from the Fasta file
2102 	::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
2103 	TReadSet_ & 		readSet,
2104 	TReadRegions &		readRegions,
2105 	TCounts &		cnts,
2106 	RazerSOptions<TSpec> &	options,
2107 	TShapeL const &		shapeL,
2108 	TShapeR const &		shapeR,
2109 	Swift<TSwiftSpec> const)
2110 {
2111 
2112 
2113 	typedef typename Value<TReadSet_>::Type								TRead;
2114 	typedef StringSet<typename Infix<TRead>::Type>						TReadSet;
2115 	typedef Index<TReadSet, IndexQGram<TShapeL, TQGramIndexSpec> >		TIndexL;			// q-gram index left
2116 	typedef Index<TReadSet, IndexQGram<TShapeR, TQGramIndexSpec> >		TIndexR;			// q-gram index right
2117 	typedef Pattern<TIndexL, Swift<TSwiftSpec> >							TSwiftPatternL;	// filter	//should be the same thing for left and right
2118 	typedef Pattern<TIndexR, Swift<TSwiftSpec> >							TSwiftPatternR;	// filter
2119 	typedef Pattern<TRead, MyersUkkonen>								TMyersPattern;	// verifier
2120 
2121 
2122 	// split reads over two indices, one for prefixes, the other for suffixes
2123 	TReadSet readSetL, readSetR;
2124 	unsigned readCount = length(readSet);
2125 	resize(readSetL, readCount);
2126 	resize(readSetR, readCount);
2127 
2128 	if(options._debugLevel > 0)
2129 	{
2130 		int64_t genomeLen = static_cast<int64_t>(3000000000lu) * 2;					// ufff make that an option
2131 		expNumRandomMatches(readSet, genomeLen, options);
2132 	}
2133 
2134 	if(options._debugLevel > 0 )
2135 		std::cout << "Performing spliced mapping.\n";
2136 	for (unsigned i = 0; i < readCount; ++i)
2137 	{
2138 		if(length(readSet[i])>=2*options.minMatchLen)
2139 		{
2140 			assign(readSetL[i], infix(readSet[i],0,options.minMatchLen), Exact());
2141 			assign(readSetR[i], infix(readSet[i],length(readSet[i])-options.minMatchLen,length(readSet[i])), Exact());
2142 		}
2143 		else
2144 		{
2145 			assign(readSetL[i], infix(readSet[i],0,0), Exact());
2146 			assign(readSetR[i], infix(readSet[i],0,0), Exact());
2147 		}
2148 	}
2149 
2150 
2151 	if(options._debugLevel > 1)::std::cout << "Make index left right\n";
2152 	// configure q-gram index
2153 	TIndexL swiftIndexL(readSetL, shapeL);
2154 	TIndexR swiftIndexR(readSetR, shapeR);
2155 
2156 #ifdef RAZERS_OPENADDRESSING
2157 	swiftIndexL.alpha = 2;
2158 	swiftIndexR.alpha = 2;
2159 #endif
2160 
2161 	cargo(swiftIndexL).abundanceCut = options.abundanceCut;
2162 	cargo(swiftIndexR).abundanceCut = options.abundanceCut;
2163 	cargo(swiftIndexL)._debugLevel = 0;
2164 	cargo(swiftIndexR)._debugLevel = options._debugLevel;
2165 
2166 	// configure Swift
2167 	TSwiftPatternL swiftPatternL(swiftIndexL);
2168 	TSwiftPatternR swiftPatternR(swiftIndexR);
2169 	swiftPatternL.params.minThreshold = options.threshold;
2170 	swiftPatternR.params.minThreshold = options.thresholdR;
2171 	swiftPatternL.params.tabooLength = options.tabooLength;
2172 	swiftPatternR.params.tabooLength = options.tabooLength;
2173 
2174 	// init edit distance verifiers
2175 	String<TMyersPattern> forwardPatternsL;
2176 	String<TMyersPattern> forwardPatternsR;
2177 	options.compMask[4] = (options.matchN)? 15: 0;
2178 	if (!options.hammingOnly)
2179 	{
2180 		resize(forwardPatternsL, readCount, Exact());
2181 		resize(forwardPatternsR, readCount, Exact());
2182 		for(unsigned i = 0; i < readCount; ++i)
2183 		{
2184 			setHost(forwardPatternsL[i], indexText(swiftIndexL)[i]);
2185 			setHost(forwardPatternsR[i], indexText(swiftIndexR)[i]);
2186 			_patternMatchNOfPattern(forwardPatternsL[i], options.matchN);
2187 			_patternMatchNOfPattern(forwardPatternsR[i], options.matchN);
2188 			_patternMatchNOfFinder(forwardPatternsL[i], options.matchN);
2189 			_patternMatchNOfFinder(forwardPatternsR[i], options.matchN);
2190 		}
2191 	}
2192 
2193 	if(options._debugLevel > 1)::std::cout << "Patterns created\n";
2194 
2195 	// clear stats
2196 	options.FP = 0;
2197 	options.TP = 0;
2198 	options.timeMapReads = 0;
2199 	options.timeDumpResults = 0;
2200 
2201 	unsigned numFiles = length(genomeFileNameList);
2202 	unsigned gseqNo = 0;
2203 
2204 	// open genome files, one by one
2205 	for (unsigned filecount = 0; filecount < numFiles; ++filecount)
2206 	{
2207 		// open genome file
2208 		SeqFileIn file;
2209 		if (!open(file, toCString(genomeFileNameList[filecount])))
2210 			return RAZERS_GENOME_FAILED;
2211 
2212 		// remove the directory prefix of current genome file
2213 		::std::string genomeFile(toCString(genomeFileNameList[filecount]));
2214 		size_t lastPos = genomeFile.find_last_of('/') + 1;
2215 		if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1;
2216 		if (lastPos == genomeFile.npos) lastPos = 0;
2217 		::std::string genomeName = genomeFile.substr(lastPos);
2218 
2219 		CharString	id;
2220 		//Dna5String	genome;
2221 		String<Dna5Q> genome;
2222 		unsigned gseqNoWithinFile = 0;
2223 		SEQAN_PROTIMESTART(find_time);
2224 
2225 		// iterate over genome sequences
2226 		for(; !atEnd(file); ++gseqNo)
2227 		{
2228             readRecord(id, genome, file);               // read Fasta id and sequence
2229 			if (options.genomeNaming == 0)
2230             {
2231                 cropAfterFirst(id, IsWhitespace());     // crop id after the first whitespace
2232 				appendValue(genomeNames, id, Generous());
2233             }
2234 			gnoToFileMap.insert(::std::make_pair(gseqNo,::std::make_pair(genomeName,gseqNoWithinFile)));
2235 
2236 			if (options.forward)
2237 				mapSplicedReads(matches, genome, gseqNo, readSet, readRegions, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'F', options);
2238 
2239 			if (options.reverse)
2240 			{
2241 				reverseComplement(genome);
2242 				mapSplicedReads(matches, genome, gseqNo, readSet, readRegions, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'R', options);
2243 			}
2244 			++gseqNoWithinFile;
2245 
2246 		}
2247 		options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
2248 	}
2249 
2250 	compactSplicedMatches(matches, cnts, options, false, swiftPatternL, swiftPatternR);
2251 
2252 	if (options._debugLevel >= 1)
2253 		::std::cerr << ::std::endl << "Finding reads took               \t" << options.timeMapReads << " seconds" << ::std::endl;
2254 	if (options._debugLevel >= 2) {
2255 		::std::cerr << ::std::endl;
2256 		::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
2257 		::std::cerr << "Swift FP: " << options.FP << ::std::endl;
2258 		::std::cerr << "Swift TP: " << options.TP << ::std::endl;
2259 	}
2260 	return 0;
2261 }
2262 
2263 
2264 // returns start pos furthest to the left
2265 template<typename TValue, typename TRegion, typename TOptions>
2266 inline typename Value<typename Value<TRegion,2>::Type,2>::Type
regionStartPos(TRegion & readRegion,TValue readLength,TOptions & options)2267 regionStartPos(TRegion & readRegion,
2268 			   TValue readLength,
2269 			   TOptions & options)
2270 {
2271 	typedef typename Value<typename Value<TRegion,2>::Type,2>::Type TSignedPos;
2272 
2273 	if(readRegion.i2.i1 < 2)	// i2 stores expected end pos of match
2274 		return _max((TSignedPos)0,(TSignedPos)(readRegion.i2.i2 - options.libraryLength + readLength - options.libraryError - options.maxGap));
2275 	else				 	    // i2 stores expected start pos
2276 		return (readRegion.i2.i2  + options.libraryLength - readLength - options.libraryError);
2277 
2278 }
2279 
2280 // returns end pos furthest to the right
2281 template<typename TValue, typename TRegion, typename TOptions>
2282 inline typename Value<typename Value<TRegion,2>::Type,2>::Type
regionEndPos(TRegion & readRegion,TValue readLength,TOptions & options)2283 regionEndPos(TRegion & readRegion,
2284 			   TValue readLength,
2285 			   TOptions & options)
2286 {
2287 	typedef typename Value<typename Value<TRegion,2>::Type,2>::Type TSignedPos;
2288 
2289 	if(readRegion.i2.i1 < 2)	// i2 stores expected end pos of match
2290 		return (TSignedPos)readRegion.i2.i2-options.libraryLength+2*readLength + options.libraryError;
2291 	else					// i2 stores expected start pos
2292 		return readRegion.i2.i2+ options.libraryLength + options.libraryError + options.maxGap;
2293 
2294 }
2295 
2296 
2297 
2298 template<typename TMatch, typename TValue, typename TRegion, typename TOptions>
2299 inline bool
isValidRegion(TMatch & mL,TMatch & mR,TValue readLength,TRegion & readRegion,TOptions & options)2300 isValidRegion(TMatch & mL,
2301 			   TMatch & mR,
2302 			   TValue readLength,
2303 			   TRegion & readRegion,
2304 			   TOptions & options)
2305 {
2306 	typedef typename Value<typename Value<TRegion,2>::Type,2>::Type TSignedPos;
2307 
2308 	if(readRegion.i2.i1 < 2)	// mR.gEnd needs to lie within a specific region
2309 	{
2310 		TSignedPos expEndPos = readRegion.i2.i2 - options.libraryLength + 2*readLength;
2311 		if (mR.gEnd < expEndPos - options.libraryError ||
2312 			mR.gEnd > expEndPos + options.libraryError )
2313 			return false;
2314 	}
2315 	else					// mL.gBegin needs to lie within a specific region
2316 	{
2317 		TSignedPos expStartPos = readRegion.i2.i2 + options.libraryLength - readLength;
2318 		if (mL.gBegin < expStartPos - options.libraryError ||
2319 			mL.gBegin > expStartPos + options.libraryError )
2320 			return false;
2321 	}
2322 	return true;
2323 }
2324 
2325 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2326 // Find read matches in one genome sequence
2327 template <
2328 	typename TMatches,
2329 	typename TGenome,
2330 	typename TOriReadSet,
2331 	typename TReadIndexL,
2332 	typename TReadIndexR,
2333 	typename TSwiftSpec,
2334 	typename TVerifier,
2335 	typename TCounts,
2336 	typename TSpec >
mapSplicedReads(TMatches & matches,TGenome & genome,unsigned gseqNo,TOriReadSet & readSet,TReadRegions & readRegions,Pattern<TReadIndexL,Swift<TSwiftSpec>> & swiftPatternL,Pattern<TReadIndexR,Swift<TSwiftSpec>> & swiftPatternR,TVerifier & forwardPatternsL,TVerifier & forwardPatternsR,TCounts & cnts,char orientation,RazerSOptions<TSpec> & options)2337 void mapSplicedReads(
2338 	TMatches &matches,				// resulting matches
2339 	TGenome &genome,				// genome ...
2340 	unsigned gseqNo,				// ... and its sequence number
2341 	TOriReadSet &readSet,			// reads
2342 	TReadRegions & readRegions,
2343 	Pattern<TReadIndexL, Swift<TSwiftSpec> > &swiftPatternL,	// left index
2344 	Pattern<TReadIndexR, Swift<TSwiftSpec> > &swiftPatternR,	// right index
2345 	TVerifier &forwardPatternsL,
2346 	TVerifier &forwardPatternsR,
2347 	TCounts & cnts,
2348 	char orientation,
2349 	RazerSOptions<TSpec> &options)
2350 {
2351 	typedef typename Value<TOriReadSet>::Type TRead;
2352 	typedef typename Fibre<TReadIndexL, FibreText>::Type	TReadSetL;
2353 	typedef typename Fibre<TReadIndexR, FibreText>::Type	TReadSetR;
2354 	typedef typename Size<TGenome>::Type			TSize;
2355 	typedef typename Position<TGenome>::Type		TGPos;
2356 	typedef typename MakeSigned_<TGPos>::Type		TSignedGPos;
2357 	typedef typename Value<TMatches>::Type			TMatch;
2358 	typedef typename Infix<TGenome>::Type			TGenomeInf;
2359 
2360 	// Prefix-Suffix filtration
2361 	//typedef Finder<TGenome, Swift<TSwiftSpec> >		TSwiftFinderL;
2362 	typedef Finder<TGenomeInf, Swift<TSwiftSpec> >		TSwiftFinderL;
2363 	typedef Finder<TGenomeInf, Swift<TSwiftSpec> >	TSwiftFinderR;
2364 	//typedef Pattern<TReadIndexL, Swift<TSwiftSpec> >	TSwiftPatternL;
2365 	//typedef Pattern<TReadIndexR, Swift<TSwiftSpec> >	TSwiftPatternR;
2366 
2367 	typedef Pair<int64_t, TMatch>				TDequeueValue;
2368 	typedef Dequeue<TDequeueValue>				TDequeue;
2369 	typedef typename TDequeue::TIter			TDequeueIterator;
2370 
2371 	const unsigned NOT_VERIFIED = 1u << (8*sizeof(unsigned)-1);
2372 
2373 	// iterate all genomic sequences
2374 	if (options._debugLevel >= 1)
2375 	{
2376 		::std::cerr << ::std::endl << "Process genome seq #" << gseqNo;
2377 		if (orientation == 'F')
2378 			::std::cerr << "[fwd]";
2379 		else
2380 			::std::cerr << "[rev]";
2381 	}
2382 
2383 	TReadSetL &readSetL = host(host(swiftPatternL));
2384 	TReadSetR &readSetR = host(host(swiftPatternR));
2385 
2386 	if (empty(readSetL) || empty(readSetR))
2387 		return;
2388 
2389 
2390 	TSignedGPos maxDistance = options.maxGap;
2391 //	raus:TSignedGPos maxDistance = options.maxGap + (int)options.maxReadLength;
2392 	//TSignedGPos minDistance = options.minGap;// + 2*options.minMatchLen ;
2393 	TSignedGPos minDistance = 2*options.minMatchLen ;
2394 	if(!options.hammingOnly)
2395 		minDistance -= (options.maxPrefixErrors + options.maxSuffixErrors);
2396 
2397 	// exit if contig is shorter than minDistance
2398 	if (length(genome) <= (unsigned)2*options.minMatchLen)
2399 		return;
2400 
2401 	TGPos scanBegin = 0;
2402 	TGPos scanEnd = length(genome);
2403 	if(!empty(readRegions))
2404 	{
2405 		if(options._debugLevel > 1)
2406 		{
2407 			std::cout << "MaxRegionEndPos=" << options.maxReadRegionsEnd << std::endl;
2408 			std::cout << "MinRegionStartPos=" << options.minReadRegionsStart << std::endl;
2409 		}
2410 		scanBegin = options.minReadRegionsStart;
2411 		if (scanBegin > length(genome))
2412 			scanBegin = length(genome);
2413 		scanEnd = options.maxReadRegionsEnd;
2414 		if (scanEnd > length(genome))
2415 			scanEnd = length(genome);
2416 	}
2417 	TGenomeInf genomeInf = infix(genome, scanBegin, scanEnd);
2418 	//TSwiftFinderL swiftFinderL(genome, options.repeatLength, 1);
2419 	TSwiftFinderL swiftFinderL(genomeInf, options.repeatLength, 1);
2420 	TSwiftFinderR swiftFinderR(genomeInf, options.repeatLength, 1);
2421 
2422 	TDequeue fifo;						// stores potential prefix matches
2423 	String<int64_t> lastPotMatchNo;		// last number of a potential prefix match
2424 	int64_t lastNo = 0;					// last number over all potential prefix matches in the queue
2425 	int64_t firstNo = 0;				// first number over all potential prefix matches in the queue
2426 	Pair<TGPos> gPair;
2427 
2428 	resize(lastPotMatchNo, length(host(swiftPatternL)), (int64_t)-1, Exact());
2429 
2430 	String<Pair<TGPos> > lastRightMatch;		// begin and end of last verified suffix match
2431 	resize(lastRightMatch, length(host(swiftPatternL)), Pair<TGPos>(0,0), Exact());
2432 
2433 	TSize gLength = length(genome);
2434 	if(orientation == 'R') scanBegin = gLength - scanEnd;
2435 	TMatch mR = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2436 	TMatch temp = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2437 	TDequeueValue fL(-1, mR);
2438 	fL.i2.gseqNo = gseqNo;
2439 	mR.gseqNo = gseqNo;
2440 	fL.i2.orientation = orientation;
2441 	mR.orientation = orientation;
2442 
2443 	double maxErrorRateL = (double)options.maxPrefixErrors/options.minMatchLen;
2444 	double maxErrorRateR = (double)options.maxSuffixErrors/options.minMatchLen;
2445 
2446 	double totalTime = sysTime();
2447 	double totalTimeCombine = 0.0;
2448 	double totalTimeLeftVerify = 0.0;
2449 	double totalTimeLeftFilter = 0.0;
2450 	double totalTimeRightVerify = 0.0;
2451 
2452 	Pair<TGPos,TGPos> lastLeftMatch(0,0);
2453 	// iterate all verification regions returned by SWIFT
2454 	while (find(swiftFinderR, swiftPatternR, maxErrorRateR))
2455 	{
2456 		unsigned rseqNo = swiftPatternR.curSeqNo;
2457 		TGPos rEndPos = endPosition(swiftFinderR) + scanBegin;
2458 		TGPos rBeginPos = beginPosition(swiftFinderR) + scanBegin;
2459 	//	std::cout << "rEnd=" << rEndPos << "\t";
2460 		//TGPos doubleParWidth = 2 * (*swiftFinderR.curHit).bucketWidth;
2461 		TRead const &read = readSet[rseqNo];
2462 		unsigned readLength = length(readSet[rseqNo]);
2463 		TSignedGPos extensionOffset = length(read)-2*options.minMatchLen;
2464 		if(!options.hammingOnly)
2465 			extensionOffset += static_cast<TGPos>(floor(options.errorRate*length(read)));
2466 		// passed all valid regions
2467 		if(!empty(readRegions) && (TSignedGPos)rBeginPos > (TSignedGPos)options.maxReadRegionsEnd)
2468 			break;
2469 		if(!empty(readRegions) &&
2470 			((TSignedGPos)rEndPos < (TSignedGPos)regionStartPos(readRegions[rseqNo],readLength,options) || (TSignedGPos)rBeginPos > (TSignedGPos)regionEndPos(readRegions[rseqNo],readLength,options))) //parallelogram must lie within possible mapping region
2471 			continue;
2472 
2473 		// Check this again...
2474 		// remove out-of-window prefixes from fifo
2475 		while(!empty(fifo) &&  front(fifo).i2.gEnd + extensionOffset + maxDistance < (TSignedGPos) rBeginPos)
2476 		//raus:while (!empty(fifo) && front(fifo).i2.gBegin + maxDistance + (TSignedGPos)doubleParWidth < (TSignedGPos)rEndPos)
2477 		{
2478 			popFront(fifo);
2479 			++firstNo;
2480 		}
2481 
2482 		// Check this again...
2483 		// add within-window prefixes to fifo
2484 		double vTimeBegin1 = sysTime();
2485 		while (empty(fifo) || back(fifo).i2.gBegin + minDistance <= (TSignedGPos)rEndPos )
2486 		{
2487 			if (find(swiftFinderL, swiftPatternL, maxErrorRateL))
2488 			{
2489 				gPair = positionRange(swiftFinderL);
2490 				gPair.i1 += scanBegin;
2491 				gPair.i2 += scanBegin;
2492 			//	std::cout << "lBegin=" << gPair.i1 << "\t";
2493 				if(!empty(readRegions) && (TSignedGPos)gPair.i1 > (TSignedGPos) options.maxReadRegionsEnd)
2494 					break;
2495 				if ((TSignedGPos)gPair.i2 + maxDistance + extensionOffset >= (TSignedGPos)rEndPos
2496 					&& (empty(readRegions) ||
2497 					((TSignedGPos)gPair.i2 > (TSignedGPos)regionStartPos(readRegions[swiftPatternL.curSeqNo],readLength,options)
2498 					&& (TSignedGPos)gPair.i1 < (TSignedGPos)regionEndPos(readRegions[swiftPatternL.curSeqNo],readLength,options)))) //parallelogram must lie within possible mapping region
2499 				//raus:if ((TSignedGPos)gPair.i2 + maxDistance + (TSignedGPos)doubleParWidth >= (TSignedGPos)rEndPos)
2500 				{
2501 					// link in
2502 					fL.i1 = lastPotMatchNo[swiftPatternL.curSeqNo]; //link to last previous potential match
2503 					lastPotMatchNo[swiftPatternL.curSeqNo] = lastNo++; //++ general counter and remember last pot match of curSeqNo
2504 
2505 					fL.i2.rseqNo = swiftPatternL.curSeqNo | NOT_VERIFIED; // set first bit
2506 					fL.i2.gBegin = gPair.i1;
2507 					fL.i2.gEnd = gPair.i2;
2508 					pushBack(fifo, fL);
2509 				}
2510 #ifdef RAZERS_DEBUG
2511 				::std::cout << "Discard potential match, out of window\n";
2512 #endif
2513 			}
2514 			else
2515 				break;
2516 		}
2517 		double vTimeEnd1 = sysTime();
2518 		totalTimeLeftFilter += (vTimeEnd1 - vTimeBegin1);
2519 
2520 
2521 		TDequeueIterator it;
2522 		int64_t lastPositive = (int64_t)-1;
2523 
2524 		TSize counter = 0;
2525 		bool noMatchRight = false;
2526 		bool notYetVerifiedRight = true;
2527 		lastLeftMatch.i1 = 0;
2528 		lastLeftMatch.i2 = 0;
2529 
2530 		// walk through all potential prefix matches
2531 		// if suffix is positive, verify prefixes (if not verfied already), mark as positive or negative
2532 		for (int64_t i = lastPotMatchNo[rseqNo]; firstNo <= i; i = (*it).i1)
2533 		{
2534 			//CHECK HIER raus do suffix match verification only once
2535 			if(notYetVerifiedRight)
2536 			{
2537 				notYetVerifiedRight = false;
2538 				TGPos minBeginPos = beginPosition(infix(swiftFinderR, genomeInf));
2539 				if((int)minBeginPos - extensionOffset > 0)
2540 					minBeginPos = minBeginPos - extensionOffset;
2541 				else minBeginPos = 0;
2542 				double vTimeBegin = sysTime();
2543 				if (!matchVerify(mR,
2544 					infix(genome,minBeginPos,endPosition(infix(swiftFinderR, genomeInf))),
2545 					rseqNo,
2546 					readSet,//readSetR,
2547 					forwardPatternsR,
2548 					options,
2549 					TSwiftSpec(),
2550 					LongestSuffix()))
2551 				{
2552 					double vTimeEnd = sysTime();
2553 					totalTimeRightVerify += (vTimeEnd - vTimeBegin);
2554 					noMatchRight = true;
2555 //					continue;
2556 				}
2557 				else
2558 				{
2559 					double vTimeEnd = sysTime();
2560 					totalTimeRightVerify += (vTimeEnd - vTimeBegin);
2561 					if (lastRightMatch[rseqNo].i1 == (TGPos)mR.gBegin && lastRightMatch[rseqNo].i2 == (TGPos)mR.gEnd)
2562 						noMatchRight = true;
2563 					lastRightMatch[rseqNo].i1 = mR.gBegin;
2564 					lastRightMatch[rseqNo].i2 = mR.gEnd;
2565 
2566 					// check here if the match lies within the candidate region!!!
2567 				}
2568 			}
2569 
2570 
2571 
2572 		//	std::cout << "i=" <<i << "\t";
2573 			it = &value(fifo, i - firstNo);
2574 			//CHECK HIER raus noMatchRight --> \FCberspringen korrekt?
2575 			if (noMatchRight || (*it).i2.gBegin + minDistance > (TSignedGPos)rEndPos)
2576 			{
2577 				if (lastPositive == (int64_t)-1)
2578 					lastPotMatchNo[rseqNo] = i;
2579 				else
2580 					value(fifo, lastPositive - firstNo).i1 = i;
2581 				lastPositive = i;
2582 				continue;
2583 			}
2584 
2585 			// verify left mate (equal seqNo), if not done already
2586 			if ((*it).i2.rseqNo & NOT_VERIFIED)
2587 			{
2588 				TGPos maxEndPos = (*it).i2.gEnd + extensionOffset;
2589 				if(maxEndPos > length(genome)) maxEndPos = length(genome);
2590 				double vTimeBegin = sysTime();
2591 				if (matchVerify( (*it).i2,
2592 						infix(genome, (*it).i2.gBegin, maxEndPos),
2593 						rseqNo,
2594 						readSet, //readSetL
2595 						forwardPatternsL,
2596 						options,
2597 						TSwiftSpec(),
2598 						LongestPrefix()) &&
2599 						!(lastLeftMatch.i1 == (TGPos)(*it).i2.gBegin && lastLeftMatch.i2 == (TGPos)(*it).i2.gEnd ))
2600 				{
2601 					double vTimeEnd = sysTime();
2602 					totalTimeLeftVerify += (vTimeEnd - vTimeBegin);
2603 					(*it).i2.rseqNo &= ~NOT_VERIFIED; // has been verified positively // unset first bit
2604 					lastLeftMatch.i1 = (*it).i2.gBegin;
2605 					lastLeftMatch.i2 = (*it).i2.gEnd;
2606 					// short-cut negative matches
2607 					if (lastPositive == (int64_t)-1)
2608 						lastPotMatchNo[rseqNo] = i;
2609 					else
2610 						value(fifo, lastPositive - firstNo).i1 = i;
2611 					lastPositive = i;
2612 				}
2613 				else
2614 				{
2615 					double vTimeEnd = sysTime();
2616 					totalTimeLeftVerify += (vTimeEnd - vTimeBegin);
2617 					(*it).i2.rseqNo = ~NOT_VERIFIED;		// has been verified negatively
2618 				}
2619 
2620 			}
2621 
2622 			// prefix match was found
2623 			if ((*it).i2.rseqNo == rseqNo)
2624 			{
2625 				lastLeftMatch.i1 = (*it).i2.gBegin;
2626 				lastLeftMatch.i2 = (*it).i2.gEnd;
2627 
2628 				// dont shortcut too much
2629 				if (lastPositive == (int64_t)-1 || i < lastPositive)
2630 					lastPositive = i;
2631 
2632 /*				// CHECK HIER rein
2633 				// do suffix match verification
2634 				if(notYetVerifiedRight)
2635 				{
2636 					notYetVerifiedRight = false;
2637 					TGPos minBeginPos = beginPosition(infix(swiftFinderR, genomeInf));
2638 					if((int)minBeginPos - extensionOffset > 0)
2639 						minBeginPos = minBeginPos - extensionOffset;
2640 					else minBeginPos = 0;
2641 					double vTimeBegin = sysTime();
2642 					if (!matchVerify(mR,
2643 						infix(genome,minBeginPos,endPosition(infix(swiftFinderR, genomeInf))),
2644 						rseqNo,
2645 						readSet,//readSetR,
2646 						forwardPatternsR,
2647 						options,
2648 						TSwiftSpec(),
2649 						LongestSuffix()))
2650 					{
2651 						double vTimeEnd = sysTime();
2652 						totalTimeRightVerify += (vTimeEnd - vTimeBegin);
2653 						noMatchRight = true;
2654 						continue;
2655 					}
2656 					else
2657 					{
2658 						double vTimeEnd = sysTime();
2659 						totalTimeRightVerify += (vTimeEnd - vTimeBegin);
2660 						if (lastRightMatch[rseqNo].i1 == (TGPos)mR.gBegin && lastRightMatch[rseqNo].i2 == (TGPos)mR.gEnd)
2661 							noMatchRight = true;
2662 						lastRightMatch[rseqNo].i1 = mR.gBegin;
2663 						lastRightMatch[rseqNo].i2 = mR.gEnd;
2664 					}
2665 				}
2666 */
2667 				//else check if prefix and suffix match fit together
2668 				if(!noMatchRight)
2669 				{
2670 					int outerDistance = mR.gEnd - (*it).i2.gBegin;
2671 #ifdef RAZERS_DEBUG
2672 					std::cout << "before\nmL.mScore =" << (*it).i2.mScore << "\n";
2673 					std::cout << "mL.gBegin =" << (*it).i2.gBegin << "\n";
2674 					std::cout << "mL.gEnd =" << (*it).i2.gEnd << "\n";
2675 					std::cout << "mL.editDist =" << (*it).i2.editDist << "\n";
2676 
2677 					std::cout << "mR.mScore =" << mR.mScore << "\n";
2678 					std::cout << "mR.gBegin =" << mR.gBegin << "\n";
2679 					std::cout << "mR.gEnd =" << mR.gEnd << "\n";
2680 					std::cout << "mR.editDist =" << mR.editDist << "\n";
2681 					std::cout << "outerDist =" << outerDistance << "\n";
2682 #endif
2683 					if (outerDistance < (int)(2 * options.minMatchLen)) //subtract suffix/prefix-errors in case of edit distance
2684 						continue;
2685 //					::std::cout << "outerDistance->" << outerDistance << std::endl;
2686 					int outerDistanceError = length(readSet[rseqNo]) -(int)(outerDistance);
2687 					if (outerDistanceError < 0) outerDistanceError = -outerDistanceError;
2688 					if ((outerDistanceError > (int) options.maxGap) || // f\FCr edit + static_cast<TGPos>(floor(options.errorRate*length(read))
2689 						(outerDistanceError < (int) options.minGap))   // f\FCr edit - static_cast<TGPos>(floor(options.errorRate*length(read))
2690 						continue;
2691 
2692 					TMatch mRtmp = mR;
2693 					TMatch mLtmp = (*it).i2;
2694 					if(!empty(readRegions) &&
2695 						!isValidRegion(mLtmp,mRtmp,readLength,readRegions[rseqNo],options))
2696 //						((TSignedGPos)mLtmp.gBegin < (TSignedGPos)readRegions[rseqNo].i2 ||
2697 //					 	(TSignedGPos)mRtmp.gEnd > (TSignedGPos)readRegions[rseqNo].i3)) //match must lie within possible mapping region
2698 						continue;
2699 					double t1 = sysTime();
2700 					if(options.spec.DONT_VERIFY ||
2701 					  !combineLeftRight(mRtmp,mLtmp,read,genome,options,orientation,TSwiftSpec()))
2702 					{
2703 						++options.FP;
2704 						double t2 = sysTime();
2705 						totalTimeCombine += (t2 - t1) ;
2706 						continue;
2707 					}
2708 					else {
2709 						double t2 = sysTime();
2710 						totalTimeCombine += (t2 - t1) ;
2711 						++options.TP;
2712 					}
2713 					++counter;
2714 
2715 					//assign an alignment score
2716 					//mRtmp.alignmentScore = (options.matchScore * (sumLength - sumDist)) + (options.mismatchScore * sumDist);
2717 					//if(outerDistanceError != 0) mRtmp.alignmentScore += ((outerDistanceError - 1) * options.gapExtend) + options.gapOpen;
2718 					//mLtmp.alignmentScore = mRtmp.alignmentScore;
2719 
2720 					//mRtmp.alignmentScore = (options.matchScore * (sumLength - sumDist)) + (options.mismatchScore * sumDist);
2721 					//if(outerDistanceError != 0)
2722 					//{
2723 					//	int maxErrors = (int)(options.errorRate*length(readSet[rseqNo]));
2724 					//	double gapScoreConvergeTo = ((options.minMatchLen*2 - maxErrors) * options.matchScore) + (maxErrors * options.mismatchScore) ;
2725 					//	double gapPenalty =	gapScoreConvergeTo*(1-exp((double)-options.indelLambda*outerDistanceError));
2726 					//	mRtmp.alignmentScore -= (int)gapPenalty;
2727 					//}
2728 					//mLtmp.alignmentScore = mRtmp.alignmentScore;
2729 
2730 					if (orientation == 'R')
2731 					{
2732 						TSize temp = mLtmp.gBegin;
2733 						mLtmp.gBegin = gLength - mLtmp.gEnd;
2734 						mLtmp.gEnd = gLength - temp;
2735 						temp = mRtmp.gBegin;
2736 						mRtmp.gBegin = gLength - mRtmp.gEnd;
2737 						mRtmp.gEnd = gLength - temp;
2738 						outerDistance = -outerDistance;
2739 					}
2740 					// set a unique pair id
2741 					mLtmp.pairId = mRtmp.pairId = options.nextMatePairId;
2742 					if (++options.nextMatePairId == 0)
2743 						options.nextMatePairId = 1;
2744 
2745 					// score the whole match pair
2746 					//mLtmp.pairScore = mRtmp.pairScore = 0 - mLtmp.editDist - mRtmp.editDist;
2747 					// score the whole match pair by # of matches bases - # mismatched bases (not entirely correct for edit distance)
2748 #ifdef TRY_SCORES
2749 					double identityScore = ( (100.00 - (100.00* (double)(mLtmp.editDist + mRtmp.editDist)/(mRtmp.mScore + mLtmp.mScore))) - 80.0 ) * 5.0;
2750 					if(options._debugLevel > 1 )std::cout << "identityScore: " << identityScore << std::endl;
2751 
2752 					double aliScore = mRtmp.mScore + mLtmp.mScore - 2* mLtmp.editDist - 2* mRtmp.editDist;
2753 					aliScore = (double)(aliScore*100)/readLength;
2754 					mLtmp.pairScore = mRtmp.pairScore = (int)(identityScore+aliScore)/2.0;
2755 #else
2756 					mLtmp.pairScore = mRtmp.pairScore = mRtmp.mScore + mLtmp.mScore - 2* mLtmp.editDist - 2* mRtmp.editDist;
2757 #endif
2758 
2759 
2760 					if(outerDistanceError != 0)
2761 					{ //subtract one if there is an indel in the middle (such that perfect matches are better than indel matches..)
2762 						mLtmp.pairScore -= (int)((double)options.penaltyC * length(readSet[rseqNo])/100) ;
2763 						mRtmp.pairScore -= (int)((double)options.penaltyC * length(readSet[rseqNo])/100) ;
2764 					}
2765 #ifdef RAZERS_DEBUG_LIGHT
2766 					bool falsch = false;
2767 					if((unsigned)(mRtmp.mScore + mLtmp.mScore) > length(readSet[rseqNo])) falsch = true;
2768 					if(mRtmp.mScore> length(readSet[rseqNo])-options.minMatchLen || mRtmp.mScore < options.minMatchLen) falsch = true;
2769 					if((int)mRtmp.gEnd - (int)mRtmp.gBegin < 0 || (int)mRtmp.gEnd - (int)mRtmp.gBegin > 500) falsch = true;
2770 					if(mRtmp.editDist > 200) falsch = true;
2771 
2772 					if(mLtmp.mScore> length(readSet[rseqNo])-options.minMatchLen ||mLtmp.mScore < options.minMatchLen) falsch = true;
2773 					if((int)mLtmp.gEnd - (int)mLtmp.gBegin < 0 || (int)mLtmp.gEnd - (int)mLtmp.gBegin > 500) falsch = true;
2774 					if(mLtmp.editDist > 200) falsch = true;
2775 
2776 					if(falsch)
2777 					{
2778 						::std::cout << "rseqNo="<<rseqNo <<::std::endl;
2779 							std::cout << "mL.mScore =" << mLtmp.mScore << "\n";
2780 							std::cout << "mL.gBegin =" << mLtmp.gBegin << "\n";
2781 							std::cout << "mL.gEnd =" << mLtmp.gEnd << "\n";
2782 							std::cout << "mL.editDist =" << mLtmp.editDist << "\n";
2783 
2784 							std::cout << "mR.mScore =" << mRtmp.mScore << "\n";
2785 							std::cout << "mR.gBegin =" << mRtmp.gBegin << "\n";
2786 							std::cout << "mR.gEnd =" << mRtmp.gEnd << "\n";
2787 							std::cout << "mR.editDist =" << mRtmp.editDist << "\n";
2788 
2789 
2790 
2791 					if(-mLtmp.pairScore > (int) (options.errorRate * length(readSet[rseqNo])))
2792 					{
2793 						::std::cout << "mLtmp.pairScore = " << mLtmp.pairScore;
2794 						::std::cout << "\treadLen = " << length(readSet[rseqNo]);
2795 						::std::cout << "\trseqNo = " << rseqNo << ::std::endl;
2796 					}
2797 					std::cout << "mScoreSum =" << mLtmp.mScore + mRtmp.mScore << "\t";
2798 					std::cout << "readLength =" << length(readSet[rseqNo]) << "\t";
2799 					std::cout << "mL.mScore =" << mLtmp.mScore << "\t";
2800 					std::cout << "mL.gBegin =" << mLtmp.gBegin << "\t";
2801 					std::cout << "mL.gEnd =" << mLtmp.gEnd << "\t";
2802 					std::cout << "mL.editDist =" << mLtmp.editDist << "\n";
2803 
2804 					std::cout << "mR.mScore =" << mRtmp.mScore << "\t";
2805 					std::cout << "mR.gBegin =" << mRtmp.gBegin << "\t";
2806 					std::cout << "mR.gEnd =" << mRtmp.gEnd << "\t";
2807 					std::cout << "mR.editDist =" << mRtmp.editDist << "\n";
2808 
2809 					Align<String<Dna5Q>, ArrayGaps> alignL;
2810 					Score<int> scoreType = Score<int>(0, -999, -1001, -1000);	// levenshtein-score (match, mismatch, gapOpen, gapExtend)
2811 					if (options.hammingOnly)
2812 						scoreType.data_mismatch = -1;
2813 					resize(rows(alignL), 2);
2814 					assignSource(row(alignL, 0), infix(readSet[rseqNo],0,mLtmp.mScore));
2815 					assignSource(row(alignL, 1), infix(genome,mLtmp.gBegin, mLtmp.gEnd));
2816 					if (orientation == 'R')
2817 						reverseComplement(source(row(alignL, 1)));
2818 
2819 					globalAlignment(alignL, scoreType);
2820 					std::cout << alignL;
2821 
2822 					Align<String<Dna5Q>, ArrayGaps> alignR;
2823 					resize(rows(alignR), 2);
2824 					assignSource(row(alignR, 0), infix(readSet[rseqNo],length(readSet[rseqNo])-mRtmp.mScore,length(readSet[rseqNo])));
2825 					assignSource(row(alignR, 1), infix(genome,mRtmp.gBegin, mRtmp.gEnd));
2826 					if (orientation == 'R')
2827 						reverseComplement(source(row(alignR, 1)));
2828 
2829 					globalAlignment(alignR, scoreType);
2830 					std::cout << alignR;
2831 
2832 					std::cout << "Komplettali:\n";
2833 					Align<String<Dna5Q>, ArrayGaps> align;
2834 					resize(rows(align), 2);
2835 					assignSource(row(align, 0), infix(readSet[rseqNo],0,length(readSet[rseqNo])));
2836 					assignSource(row(align, 1), infix(genome,_min(mRtmp.gBegin,mLtmp.gBegin), _max(mRtmp.gEnd,mLtmp.gEnd)));
2837 					if (orientation == 'R')
2838 						reverseComplement(source(row(align, 1)));
2839 
2840 					globalAlignment(align, scoreType);
2841 					std::cout << align;
2842 					}
2843 #endif
2844 
2845 					// relative positions
2846 					mLtmp.mateDelta = outerDistance;
2847 					mRtmp.mateDelta = -outerDistance;
2848 
2849 					mLtmp.rseqNo = mRtmp.rseqNo = rseqNo;
2850 
2851 					if (!empty(readRegions) && options.anchored && options.outputFormat != 4)
2852 					{
2853 						if(readRegions[rseqNo].i2.i1 > 1) // match actually is on reverse strand
2854 						{
2855 							temp = mLtmp;
2856 							mLtmp = mRtmp;
2857 							mLtmp = temp;
2858 							mLtmp.orientation = mRtmp.orientation = 'R';
2859 						}
2860 					}
2861 
2862 					if (!options.spec.DONT_DUMP_RESULTS)
2863 					{
2864 						appendValue(matches, mLtmp, Generous());
2865 						appendValue(matches, mRtmp, Generous());
2866 					}
2867 				}
2868 			}
2869 		}
2870 		if (length(matches) > options.compactThresh)
2871 		{
2872 			typename Size<TMatches>::Type oldSize = length(matches);
2873 //			maskDuplicates(matches);	// overlapping parallelograms cause duplicates for edit distance //TODO: implement!
2874 			compactSplicedMatches(matches, cnts, options, false, swiftPatternL, swiftPatternR);
2875 			options.compactThresh += (options.compactThresh >> 1);
2876 			if (options._debugLevel >= 2)
2877 				::std::cerr << '(' << oldSize - length(matches) << " matches removed)";
2878 		}
2879 
2880 		// short-cut negative matches
2881 		if (lastPositive == (int64_t)-1)
2882 			lastPotMatchNo[rseqNo] = (int64_t)-1;
2883 		else
2884 			value(fifo, lastPositive - firstNo).i1 = (int64_t)-1; // the first positive's link to previous is removed
2885 
2886 
2887 	}//swiftFinderR
2888 	double totalTimeEnd = sysTime();
2889 	if(options._debugLevel > 1)
2890 	{
2891 		std::cout << "TotalTime:"<< totalTimeEnd - totalTime << std::endl;
2892 		std::cout << "TotalTimeLeftFilter:" <<totalTimeLeftFilter << std::endl;
2893 		std::cout << "TotalTimeLeftVerify:"<< totalTimeLeftVerify << std::endl;
2894 		std::cout << "TotalTimeRightVerify:"<< totalTimeRightVerify << std::endl;
2895 		std::cout << "TotalTimeCombine:"<< totalTimeCombine << std::endl;
2896 	}
2897 
2898 
2899 }//function
2900 
2901 
2902 
2903 
2904 //////////////////////////////////////////////////////////////////////////////
2905 // Find split read matches in many genome sequences (given as StringSet)
2906 template <
2907 	typename TMatches,
2908 	typename TGenomeSet,
2909 	typename TReadSet,
2910 	typename TCounts,
2911 	typename TSpec,
2912 	typename TShapeL,
2913 	typename TShapeR,
2914 	typename TSwiftSpec >
mapSplicedReads(TMatches & matches,TGenomeSet & genomeSet,TReadSet const & readSet,TReadRegions & readRegions,TCounts & cnts,RazerSOptions<TSpec> & options,TShapeL const & shapeL,TShapeR const & shapeR,Swift<TSwiftSpec> const)2915 int mapSplicedReads(
2916 	TMatches &				matches,
2917 	TGenomeSet &			genomeSet,
2918 	TReadSet const &		readSet,
2919 	TReadRegions &			readRegions,
2920 	TCounts &				cnts,
2921 	RazerSOptions<TSpec> &	options,
2922 	TShapeL const &			shapeL,
2923 	TShapeR const &			shapeR,
2924 	Swift<TSwiftSpec> const)
2925 {
2926 	typedef typename Value<TReadSet>::Type							TRead;
2927 	typedef Index<TReadSet, IndexQGram<TShapeL, TQGramIndexSpec> >	TIndexL;			// q-gram index
2928 	typedef Index<TReadSet, IndexQGram<TShapeR, TQGramIndexSpec> >	TIndexR;			// q-gram index
2929 	typedef Pattern<TIndexL, Swift<TSwiftSpec> >						TSwiftPatternL;	// filter
2930 	typedef Pattern<TIndexR, Swift<TSwiftSpec> >						TSwiftPatternR;	// filter
2931 	typedef Pattern<TRead, MyersUkkonen>							TMyersPattern;	// verifier
2932 
2933 	unsigned readCount = length(readSet);
2934 
2935 	// configure q-gram index
2936 	TIndexL swiftIndexL(readSet, shapeL);
2937 	cargo(swiftIndexL).abundanceCut = options.abundanceCut;
2938 	cargo(swiftIndexL)._debugLevel = options._debugLevel;
2939 
2940 	TIndexR swiftIndexR(readSet, shapeR);
2941 	cargo(swiftIndexR).abundanceCut = options.abundanceCut;
2942 	cargo(swiftIndexR)._debugLevel = options._debugLevel;
2943 
2944 	// configure Swift
2945 	TSwiftPatternL swiftPatternL(swiftIndexL);
2946 	swiftPatternL.params.minThreshold = options.threshold;
2947 	swiftPatternL.params.tabooLength = options.tabooLength;
2948 
2949 	TSwiftPatternR swiftPatternR(swiftIndexR);
2950 	swiftPatternL.params.minThreshold = options.thresholdR;
2951 	swiftPatternL.params.tabooLength = options.tabooLength;
2952 
2953 	// init edit distance verifiers
2954 	String<TMyersPattern> forwardPatternsL;
2955 	String<TMyersPattern> forwardPatternsR;
2956 	options.compMask[4] = (options.matchN)? 15: 0;
2957 	if (!options.hammingOnly)
2958 	{
2959 		resize(forwardPatternsL, readCount, Exact());
2960 		resize(forwardPatternsR, readCount, Exact());
2961 		for(unsigned i = 0; i < readCount; ++i)
2962 		{
2963 			setHost(forwardPatternsL[i], indexText(swiftIndexL)[i]);
2964 			setHost(forwardPatternsR[i], indexText(swiftIndexR)[i]);
2965 			_patternMatchNOfPattern(forwardPatternsL[i], options.matchN);
2966 			_patternMatchNOfPattern(forwardPatternsR[i], options.matchN);
2967 			_patternMatchNOfFinder(forwardPatternsL[i], options.matchN);
2968 			_patternMatchNOfFinder(forwardPatternsR[i], options.matchN);
2969 		}
2970 	}
2971 
2972 	// clear stats
2973 	options.FP = 0;
2974 	options.TP = 0;
2975 	options.timeMapReads = 0;
2976 	options.timeDumpResults = 0;
2977 
2978 	CharString	id;
2979 
2980 	// iterate over genome sequences
2981 	SEQAN_PROTIMESTART(find_time);
2982 	for(unsigned gseqNo = 0; gseqNo < length(genomeSet); ++gseqNo)
2983 	{
2984 		if (options.forward)
2985 			mapSplicedReads(matches, genomeSet[gseqNo], gseqNo, readSet, readRegions, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'F', options);
2986 
2987 		if (options.reverse)
2988 		{
2989 			reverseComplement(genomeSet[gseqNo]);
2990 			mapSplicedReads(matches, genomeSet[gseqNo], gseqNo, readSet, readRegions, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'R', options);
2991 			reverseComplement(genomeSet[gseqNo]);
2992 		}
2993 
2994 	}
2995 	options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
2996 
2997 	if (options._debugLevel >= 1)
2998 		::std::cerr << ::std::endl << "Finding reads took               \t" << options.timeMapReads << " seconds" << ::std::endl;
2999 	if (options._debugLevel >= 2) {
3000 		::std::cerr << ::std::endl;
3001 		::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
3002 		::std::cerr << "Swift FP: " << options.FP << ::std::endl;
3003 		::std::cerr << "Swift TP: " << options.TP << ::std::endl;
3004 	}
3005 	return 0;
3006 }
3007 
3008 
3009 //////////////////////////////////////////////////////////////////////////////
3010 // Wrapper for different template specializations
3011 template <typename TMatches, typename TReadSet, typename TCounts, typename TSpec>
mapSplicedReads(TMatches & matches,StringSet<CharString> & genomeFileNameList,StringSet<CharString> & genomeNames,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReadSet & readSet,TReadRegions & readRegions,TCounts & cnts,RazerSOptions<TSpec> & options)3012 int mapSplicedReads(
3013 	TMatches &				matches,
3014 	StringSet<CharString> &	genomeFileNameList,
3015 	StringSet<CharString> &	genomeNames,	// genome names, taken from the Fasta file
3016 	::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
3017 	TReadSet &		readSet,
3018 	TReadRegions &			readRegions,
3019 	TCounts &				cnts,
3020 	RazerSOptions<TSpec> &	options)
3021 {
3022 
3023 	Shape<Dna, SimpleShape>		ungappedL;
3024 	Shape<Dna, OneGappedShape>	onegappedL;
3025 	Shape<Dna, GenericShape>	gappedL;
3026 
3027 	Shape<Dna, SimpleShape>		ungappedR;
3028 	Shape<Dna, OneGappedShape>	onegappedR;
3029 	Shape<Dna, GenericShape>	gappedR;
3030 
3031 	// 2x3x3 SPECIALIZATION
3032 
3033 	if (options.hammingOnly)
3034 	{
3035 		// select best-fitting combination of shape
3036 		if (stringToShape(ungappedL, options.shape))
3037 		{
3038 			if (stringToShape(ungappedR, options.shapeR))
3039 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3040 			if (stringToShape(onegappedR, options.shapeR))
3041 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3042 			if (stringToShape(gappedR, options.shapeR))
3043 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3044 		}
3045 
3046 		if (stringToShape(onegappedL, options.shape))
3047 		{
3048 			if (stringToShape(ungappedR, options.shapeR))
3049 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3050 			if (stringToShape(onegappedR, options.shapeR))
3051 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3052 			if (stringToShape(gappedR, options.shapeR))
3053 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3054 		}
3055 
3056 		if (stringToShape(gappedL, options.shape))
3057 		{
3058 			if (stringToShape(ungappedR, options.shapeR))
3059 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3060 			if (stringToShape(onegappedR, options.shapeR))
3061 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3062 			if (stringToShape(gappedR, options.shapeR))
3063 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3064 		}
3065 	}
3066 	else
3067 	{
3068 		// select best-fitting combination of shape
3069 		if (stringToShape(ungappedL, options.shape))
3070 		{
3071 			if (stringToShape(ungappedR, options.shapeR))
3072 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, ungappedR, Swift<SwiftSemiGlobal>());
3073 			if (stringToShape(onegappedR, options.shapeR))
3074 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, onegappedR, Swift<SwiftSemiGlobal>());
3075 			if (stringToShape(gappedR, options.shapeR))
3076 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, ungappedL, gappedR, Swift<SwiftSemiGlobal>());
3077 		}
3078 
3079 		if (stringToShape(onegappedL, options.shape))
3080 		{
3081 			if (stringToShape(ungappedR, options.shapeR))
3082 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, ungappedR, Swift<SwiftSemiGlobal>());
3083 			if (stringToShape(onegappedR, options.shapeR))
3084 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, onegappedR, Swift<SwiftSemiGlobal>());
3085 			if (stringToShape(gappedR, options.shapeR))
3086 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, onegappedL, gappedR, Swift<SwiftSemiGlobal>());
3087 		}
3088 
3089 		if (stringToShape(gappedL, options.shape))
3090 		{
3091 			if (stringToShape(ungappedR, options.shapeR))
3092 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, ungappedR, Swift<SwiftSemiGlobal>());
3093 			if (stringToShape(onegappedR, options.shapeR))
3094 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, onegappedR, Swift<SwiftSemiGlobal>());
3095 			if (stringToShape(gappedR, options.shapeR))
3096 				return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options, gappedL, gappedR, Swift<SwiftSemiGlobal>());
3097 		}
3098 	}
3099 
3100 	return RAZERS_INVALID_SHAPE;
3101 }
3102 
3103 
3104 template <typename TMatches, typename TGenomeSet, typename TReadSet, typename TCounts, typename TSpec>
mapSplicedReads(TMatches & matches,TGenomeSet & genomeSet,TReadSet & readSet,TReadRegions & readRegions,TCounts & cnts,RazerSOptions<TSpec> & options)3105 int mapSplicedReads(
3106 	TMatches &				matches,
3107 	TGenomeSet &			genomeSet,
3108 	TReadSet &				readSet,
3109 	TReadRegions &			readRegions,
3110 	TCounts &				cnts,
3111 	RazerSOptions<TSpec> &	options)
3112 {
3113 
3114 
3115 	Shape<Dna, SimpleShape>		ungappedL;
3116 	Shape<Dna, OneGappedShape>	onegappedL;
3117 	Shape<Dna, GenericShape>	gappedL;
3118 
3119 
3120 	Shape<Dna, SimpleShape>		ungappedR;
3121 	Shape<Dna, OneGappedShape>	onegappedR;
3122 	Shape<Dna, GenericShape>	gappedR;
3123 
3124 	// 2x3x3 SPECIALIZATION
3125 
3126 	if (options.hammingOnly)
3127 	{
3128 		// select best-fitting combination of shape
3129 		if (stringToShape(ungappedL, options.shape))
3130 		{
3131 			if (stringToShape(ungappedR, options.shapeR))
3132 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3133 			if (stringToShape(onegappedR, options.shapeR))
3134 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3135 			if (stringToShape(gappedR, options.shapeR))
3136 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3137 		}
3138 
3139 		if (stringToShape(onegappedL, options.shape))
3140 		{
3141 			if (stringToShape(ungappedR, options.shapeR))
3142 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3143 			if (stringToShape(onegappedR, options.shapeR))
3144 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3145 			if (stringToShape(gappedR, options.shapeR))
3146 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3147 		}
3148 
3149 		if (stringToShape(gappedL, options.shape))
3150 		{
3151 			if (stringToShape(ungappedR, options.shapeR))
3152 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, ungappedR, Swift<SwiftSemiGlobalHamming>());
3153 			if (stringToShape(onegappedR, options.shapeR))
3154 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, onegappedR, Swift<SwiftSemiGlobalHamming>());
3155 			if (stringToShape(gappedR, options.shapeR))
3156 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, gappedR, Swift<SwiftSemiGlobalHamming>());
3157 		}
3158 	}
3159 	else
3160 	{
3161 		// select best-fitting combination of shape
3162 		if (stringToShape(ungappedL, options.shape))
3163 		{
3164 			if (stringToShape(ungappedR, options.shapeR))
3165 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, ungappedR, Swift<SwiftSemiGlobal>());
3166 			if (stringToShape(onegappedR, options.shapeR))
3167 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, onegappedR, Swift<SwiftSemiGlobal>());
3168 			if (stringToShape(gappedR, options.shapeR))
3169 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, ungappedL, gappedR, Swift<SwiftSemiGlobal>());
3170 		}
3171 
3172 		if (stringToShape(onegappedL, options.shape))
3173 		{
3174 			if (stringToShape(ungappedR, options.shapeR))
3175 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, ungappedR, Swift<SwiftSemiGlobal>());
3176 			if (stringToShape(onegappedR, options.shapeR))
3177 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, onegappedR, Swift<SwiftSemiGlobal>());
3178 			if (stringToShape(gappedR, options.shapeR))
3179 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, onegappedL, gappedR, Swift<SwiftSemiGlobal>());
3180 		}
3181 
3182 		if (stringToShape(gappedL, options.shape))
3183 		{
3184 			if (stringToShape(ungappedR, options.shapeR))
3185 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, ungappedR, Swift<SwiftSemiGlobal>());
3186 			if (stringToShape(onegappedR, options.shapeR))
3187 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, onegappedR, Swift<SwiftSemiGlobal>());
3188 			if (stringToShape(gappedR, options.shapeR))
3189 				return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options, gappedL, gappedR, Swift<SwiftSemiGlobal>());
3190 		}
3191 	}
3192 
3193 	return RAZERS_INVALID_SHAPE;
3194 }
3195 
3196 
3197 
3198 }//namespace
3199 
3200 #endif
3201