1  /*==========================================================================
2              RazerS - Fast Read Mapping with Controlled Loss Rate
3                    http://www.seqan.de/projects/razers.html
4 
5  ============================================================================
6   Copyright (C) 2008 by David Weese
7 
8   This program is free software; you can redistribute it and/or
9   modify it under the terms of the GNU Lesser General Public
10   License as published by the Free Software Foundation; either
11   version 3 of the License, or (at your options) any later version.
12 
13   This program is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  ==========================================================================*/
21 
22 #ifndef SEQAN_HEADER_RAZERS_H
23 #define SEQAN_HEADER_RAZERS_H
24 
25 #include <iostream>
26 #include <fstream>
27 
28 #include <seqan/find.h>
29 #include <seqan/index.h>
30 
31 #ifdef RAZERS_PARALLEL
32 #include "tbb/spin_mutex.h"
33 #endif
34 
35 namespace SEQAN_NAMESPACE_MAIN
36 {
37 
38 //////////////////////////////////////////////////////////////////////////////
39 // Default options
40 
41 #ifdef RAZERS_OPENADDRESSING
42 	typedef OpenAddressing	TQGramIndexSpec;
43 #else
44 	typedef Default	TQGramIndexSpec;
45 #endif
46 
47 	template < bool DONT_VERIFY_ = false, bool DONT_DUMP_RESULTS_ = false >
48 	struct RazerSSpec
49 	{
50 		enum { DONT_VERIFY = DONT_VERIFY_ };				// omit verifying potential matches
51 		enum { DONT_DUMP_RESULTS = DONT_DUMP_RESULTS_ };	// omit dumping results
52 	};
53 
54 	template < typename TSpec = RazerSSpec<> >
55 	struct RazerSOptions
56 	{
57 	// main options
58 		TSpec		spec;
59 		bool		forward;			// compute forward oriented read matches
60 		bool		reverse;			// compute reverse oriented read matches
61 		double		errorRate;			// Criteria 1 threshold
62 		unsigned	maxHits;			// output at most maxHits many matches
63 		unsigned	distanceRange;		// output only the best, second best, ..., distanceRange best matches
64 		bool		purgeAmbiguous;		// true..remove reads with more than maxHits best matches, false..keep them
65 		CharString	output;				// name of result file
66 		int			_debugLevel;		// level of verbosity
67 		bool		printVersion;		// print version number
68 		bool		hammingOnly;		// no indels
69 		int			trimLength;			// if >0, cut reads to #trimLength characters
70 
71 	// output format options
72 		unsigned	outputFormat;		// 0..Razer format
73 										// 1..enhanced Fasta
74 										// 2..ELAND format
75 		bool		dumpAlignment;		// compute and dump the match alignments in the result files
76 		unsigned	genomeNaming;		// 0..use Fasta id
77 										// 1..enumerate reads beginning with 1
78 		unsigned	readNaming;			// 0..use Fasta id
79 										// 1..enumerate reads beginning with 1
80 										// 2..use the read sequence (only for short reads!)
81 		unsigned	sortOrder;			// 0..sort keys: 1. read number, 2. genome position
82 										// 1..           1. genome pos50ition, 2. read number
83 		unsigned	positionFormat;		// 0..gap space
84 										// 1..position space
85 		const char	*runID;				// runID needed for gff output
86 
87 	// filtration parameters
88 		::std::string shape;			// shape (e.g. 11111111111)
89 		int			threshold;			// threshold
90 		int			tabooLength;		// taboo length
91 		int			repeatLength;		// repeat length threshold
92 		double		abundanceCut;		// abundance threshold
93 
94 	// mate-pair parameters
95 		int			libraryLength;		// offset between two mates
96 		int			libraryError;		// offset tolerance
97 		unsigned	nextMatePairId;		// use this id for the next mate-pair
98 
99 	// verification parameters
100 		bool		matchN;				// false..N is always a mismatch, true..N matches with all
101 		unsigned char compMask[5];
102 
103 	// statistics
104 		__int64		FP;					// false positives (threshold reached, no match)
105 		__int64		TP;					// true positives (threshold reached, match)
106 		double		timeLoadFiles;		// time for loading input files
107 		double		timeMapReads;		// time for mapping reads
108 		double		timeDumpResults;	// time for dumping the results
109 
110 #ifdef RAZERS_DIRECT_MAQ_MAPPING
111 		bool		maqMapping;
112 		int		maxMismatchQualSum;
113 		int		mutationRateQual;
114 		int		absMaxQualSumErrors;
115 		unsigned	artSeedLength;
116 		bool		noBelowIdentity;
117 #endif
118 
119 #ifdef RAZERS_MICRO_RNA
120 		bool		microRNA;
121 		unsigned	rnaSeedLength;
122 		bool 		exactSeed;
123 #endif
124 
125 		bool		lowMemory;		// set maximum shape weight to 13 to limit size of q-gram index
126 		bool		fastaIdQual;		// hidden option for special fasta+quality format we use
127 		int			minClippedLen;
128 
129 	// misc
130 		unsigned	compactThresh;		// compact match array if larger than compactThresh
131  		int 		maxReadLength;
132 
133 		unsigned        minMatchLen;
134  		int		maxPrefixErrors;
135  		int		maxSuffixErrors;
136  		int	        maxGap;
137  		int	        minGap;
138  		::std::string 	shapeR;			// shape (e.g. 11111111111)
139  		int		thresholdR;			// threshold
140  		unsigned int	specifiedGenomeLen;
141 
142 
143 	// multi-threading
144 
145 #ifdef RAZERS_PARALLEL
146 		typedef ::tbb::spin_mutex	TMutex;
147 
148 		TMutex		*patternMutex;
149 		TMutex		optionsMutex;
150 		TMutex		matchMutex;
151 #endif
152 
RazerSOptionsRazerSOptions153 		RazerSOptions()
154 		{
155 			forward = true;
156 			reverse = true;
157 			errorRate = 0.08;
158 			maxHits = 100;
159 			distanceRange = 0;	// disabled
160 			purgeAmbiguous = false;
161 			output = "";
162 			_debugLevel = 0;
163 			printVersion = false;
164 			hammingOnly = false;
165 			trimLength = 0;
166 
167 			outputFormat = 0;
168 			dumpAlignment = false;
169 			genomeNaming = 0;
170 			readNaming = 0;
171 			sortOrder = 0;
172 			positionFormat = 0;
173 			runID = "s"; 	//
174 
175 			matchN = false;
176 
177 			shape = "11111111111";
178 			threshold = 1;
179 			tabooLength = 1;
180 			repeatLength = 1000;
181 			abundanceCut = 1;
182 
183 			libraryLength = 220;
184 			libraryError = 50;
185 			nextMatePairId = 1;
186 
187 			for (unsigned i = 0; i < 4; ++i)
188 				compMask[i] = 1 << i;
189 			compMask[4] = 0;
190 
191 			compactThresh = 1024;
192 #ifdef RAZERS_DIRECT_MAQ_MAPPING
193 			maqMapping = false;
194 			maxMismatchQualSum = 70;
195 			mutationRateQual = 30;
196 			artSeedLength = 28;	// the "artificial" seed length that is used for mapping quality assignment
197 						// (28bp is maq default)
198 			absMaxQualSumErrors = 100; // maximum for sum of mism qualities in total readlength
199 			noBelowIdentity = false;
200 #endif
201 
202 #ifdef RAZERS_MICRO_RNA
203 			microRNA = false;
204 			rnaSeedLength = 16;
205 			exactSeed = true;
206 #endif
207 			minMatchLen = 23;
208  			shapeR = "11111111111";
209  			thresholdR = 1;
210  			maxGap = 10000;
211  			minGap = 0;
212  			maxPrefixErrors = -1;
213  			maxSuffixErrors = -1;
214  			specifiedGenomeLen = 3000000000U; //whole human genome default
215 
216  			maxReadLength = 0;
217 			lowMemory = false;		// set maximum shape weight to 13 to limit size of q-gram index
218 			fastaIdQual = false;
219 			minClippedLen = 0;
220 
221 		}
222 	};
223 
224 struct MicroRNA{};
225 
226 #ifdef RAZERS_MICRO_RNA
227 #define RAZERS_EXTENDED_MATCH
228 #endif
229 
230 #ifdef RAZERS_DIRECT_MAQ_MAPPING
231 #define RAZERS_EXTENDED_MATCH
232 #endif
233 
234 #ifdef RAZERS_SPLICED
235 #define RAZERS_EXTENDED_MATCH
236 #endif
237 
238 //////////////////////////////////////////////////////////////////////////////
239 // Typedefs
240 
241 	// definition of a Read match
242 	template <typename TGPos_>
243 	struct ReadMatch
244 	{
245 		typedef typename MakeSigned_<TGPos_>::Type TGPos;
246 
247 		unsigned		gseqNo;			// genome seqNo
248 		unsigned		rseqNo;			// read seqNo
249 		TGPos			gBegin;			// begin position of the match in the genome
250 		TGPos			gEnd;			// end position of the match in the genome
251 #ifdef RAZERS_MATEPAIRS
252 		unsigned		pairId;			// unique id for the two mate-pair matches (0 if unpaired)
253 		int				mateDelta:24;	// outer coordinate delta to the other mate
254 		int				pairScore:8;	// combined score of both mates
255 #endif
256 		unsigned short	editDist;		// Levenshtein distance
257 #ifdef RAZERS_EXTENDED_MATCH
258 		short	 		mScore;
259 		short			seedEditDist:8;
260 #endif
261 #ifdef RAZERS_SPLICED
262 		char			traceExtension;
263 		short			gSeedLen:8;// used as gMinMatchLen to store genomic end position of seed match
264 #endif
265 		char			orientation;	// 'F'..forward strand, 'R'..reverse comp. strand
266 	};
267 
268 	enum RAZERS_ERROR
269 	{
270 		RAZERS_INVALID_OPTIONS = -1,
271 		RAZERS_READS_FAILED    = -2,
272 		RAZERS_GENOME_FAILED   = -3,
273 		RAZERS_INVALID_SHAPE   = -4
274 	};
275 
276 //////////////////////////////////////////////////////////////////////////////
277 // Definitions
278 
279 	typedef Dna5String									TGenome;
280 	typedef StringSet<TGenome>							TGenomeSet;
281 //	typedef Dna5String									TRead;
282 	typedef String<Dna5Q>								TRead;
283 #ifdef RAZERS_CONCATREADS
284 	typedef StringSet<TRead, Owner<ConcatDirect<> > >	TReadSet;
285 #else
286 	typedef StringSet<TRead>							TReadSet;
287 #endif
288 
289 	typedef ReadMatch<Difference<TGenome>::Type>		TMatch;		// a single match
290 	typedef String<TMatch/*, MMap<>*/ >					TMatches;	// array of matches
291 
292 
293 	template <typename TSpec>
294 	struct Cargo< Index<TReadSet, TSpec> > {
295 		typedef struct {
296 			double		abundanceCut;
297 			int			_debugLevel;
298 		} Type;
299 	};
300 
301 	template <typename TAnyReadSet, typename TSpec>
302 	struct Cargo< Index<TAnyReadSet, TSpec> > {
303 		typedef struct {
304 			double		abundanceCut;
305 			int			_debugLevel;
306 		} Type;
307 	};
308 //////////////////////////////////////////////////////////////////////////////
309 // Memory tuning
310 
311 #ifdef RAZERS_MEMOPT
312 
313 	template <typename TSpec>
314 	struct SAValue< Index<TReadSet, TSpec> > {
315 		typedef Pair<
316 			unsigned,
317 			unsigned,
318 			BitCompressed<24, 8>	// max. 16M reads of length < 256
319 		> Type;
320 	};
321 	//454
322 //	template <typename TSpec>
323 //	struct SAValue< Index<TReadSet, TSpec> > {
324 //		typedef Pair<
325 //			unsigned,
326 //			unsigned,
327 //			BitCompressed<22, 10>	// max. 4M reads of length < 1024
328 //		> Type;
329 //	};
330 
331 #else
332 
333 	template <typename TSpec>
334 	struct SAValue< Index<TReadSet, TSpec> > {
335 		typedef Pair<
336 			unsigned,			// many reads
337 			unsigned,			// of arbitrary length
338 			Compressed
339 		> Type;
340 	};
341 
342 #endif
343 
344 	template <>
345 	struct Size<Dna5String>
346 	{
347 		typedef unsigned Type;
348 	};
349 
350 	template <typename TShape, typename TSpec>
351 	struct Size< Index<TReadSet, IndexQGram<TShape, TSpec> > >
352 	{
353 		typedef unsigned Type;
354 	};
355 
356 
357 #ifdef RAZERS_PRUNE_QGRAM_INDEX
358 
359 	//////////////////////////////////////////////////////////////////////////////
360 	// Repeat masker
361 	template <typename TShape, typename TSpec>
362 	inline bool _qgramDisableBuckets(Index<TReadSet, IndexQGram<TShape, TSpec> > &index)
363 	{
364 		typedef Index<TReadSet, IndexQGram<TShape, TSpec>	>	TReadIndex;
365 		typedef typename Fibre<TReadIndex, QGramDir>::Type		TDir;
366 		typedef typename Iterator<TDir, Standard>::Type			TDirIterator;
367 		typedef typename Value<TDir>::Type						TSize;
368 
369 		TDir &dir    = indexDir(index);
370 		bool result  = false;
371 		unsigned counter = 0;
372 		TSize thresh = (TSize)(length(index) * cargo(index).abundanceCut);
373 		if (thresh < 100) thresh = 100;
374 
375 		TDirIterator it = begin(dir, Standard());
376 		TDirIterator itEnd = end(dir, Standard());
377 		for (; it != itEnd; ++it)
378 			if (*it > thresh)
379 			{
380 				*it = (TSize)-1;
381 				result = true;
382 				++counter;
383 			}
384 
385 		if (counter > 0 && cargo(index)._debugLevel >= 1)
386 			::std::cerr << "Removed " << counter << " k-mers" << ::std::endl;
387 
388 		return result;
389 	}
390 
391 #endif
392 
393 
394 template<typename TIdString, typename TSeqString, typename TQString, typename TOptions>
395 bool
396 _clipReads(TIdString & fastaID, TSeqString & seq, TQString & qual, TOptions & options)
397 {
398 	typedef typename Value<TIdString>::Type TChar;
399 
400 	int tagStart = length(fastaID);
401 	int clipFront = -1;
402 	int clipBack = -1;
403 	for(unsigned i = 0; i < length(fastaID); ++i)
404 	{
405 		TChar c = fastaID[i];
406 		if(c == 'c')
407 		{
408 			tagStart = i-1;
409 			if(infix(fastaID,i,i+5)=="clip=")
410 			{
411 				i += 5;
412 				String<TChar> clipStr = "";
413 				while(i < length(fastaID) && _parseIsDigit(fastaID[i]))
414 				{
415 					append(clipStr,fastaID[i]);
416 					++i;
417 				}
418 				std::istringstream istr(toCString(clipStr));
419 				istr >> clipFront;
420 				if(i < length(fastaID) && fastaID[i] == ',') ++i;
421 				clipStr = "";
422 				while(i < length(fastaID) && _parseIsDigit(fastaID[i]))
423 				{
424 					append(clipStr,fastaID[i]);
425 					++i;
426 				}
427 				std::istringstream istr2(toCString(clipStr));
428 				istr2 >> clipBack;
429 				break;
430 			}
431 		}
432 	}
433 	if(clipFront<0 && clipBack<0) return true;	//no clip tag found
434 	if(clipFront<0) clipFront = 0;
435 	if(clipBack<0) clipBack = 0;
436 	resize(fastaID,tagStart);	// only resize fastaID, as it might contain the quality string
437 
438 	if(options.minClippedLen == 0) // clipping tag was found, but clipping option is turned off
439 		return true;
440 
441 	if(((int)length(seq)-clipBack-clipFront) < options.minClippedLen)  // sequence is too short after clipping
442 		return false;
443 
444 //	int newLen = (int)length(seq)-clipBack-clipFront;
445 
446 	//clip
447 	if(length(qual) == 0) // meaning that quality is in fasta header
448 	{
449 		// first adapt the fasta header
450 		TIdString tmp = fastaID;
451 		fastaID = infix(tmp,0,length(tmp)-length(seq));
452 		append(fastaID,infix(tmp,length(tmp)-length(seq)+clipFront,length(tmp)-clipBack));
453 	}
454 	else
455 		qual = infix(qual,clipFront,length(qual)-clipBack);
456 
457 	// now adapt the sequence
458 	seq = infix(seq,clipFront,length(seq)-clipBack);
459 
460 	return true;
461 
462 }
463 
464 
465 //////////////////////////////////////////////////////////////////////////////
466 // Load multi-Fasta sequences with or w/o quality values
467 template <typename TReadSet, typename TNameSet, typename TRazerSOptions>
468 bool loadReads(
469 	TReadSet &reads,
470 	TNameSet &fastaIDs,
471 	const char *fileName,
472 	TRazerSOptions &options)
473 {
474 	bool countN = !(options.matchN || options.outputFormat == 1);
475 #ifdef RAZERS_MICRO_RNA
476 	if(options.microRNA) countN = false;
477 #endif
478 
479 	MultiFasta multiFasta;
480 	if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) return false;
481 
482 	AutoSeqFormat format;
483 	guessFormat(multiFasta.concat, format);
484 	split(multiFasta, format);
485 
486 	unsigned seqCount = length(multiFasta);
487 #ifndef RAZERS_CONCATREADS
488 	resize(reads, seqCount, Exact());
489 #endif
490 	if (options.readNaming == 0)
491 		resize(fastaIDs, seqCount, Exact());
492 
493 	String<Dna5Q> seq;
494 	CharString qual;
495 
496 	unsigned kickoutcount = 0;
497 	for(unsigned i = 0; i < seqCount; ++i)
498 	{
499 		if (options.readNaming == 0
500 #ifdef RAZERS_DIRECT_MAQ_MAPPING
501 			|| options.fastaIdQual
502 #endif
503 			)
504 			assignSeqId(fastaIDs[i], multiFasta[i], format);	// read Fasta id
505 		assignSeq(seq, multiFasta[i], format);					// read Read sequence
506 		assignQual(qual, multiFasta[i], format);				// read ascii quality values
507 #ifdef RAZERS_DIRECT_MAQ_MAPPING
508 		//check if sequence has a clip tag
509 		if(options.minClippedLen > 0)
510 		{
511 			if(!_clipReads(fastaIDs[i],seq,qual,options))
512 			{
513 				clear(seq);
514 				clear(qual);
515 				++kickoutcount;
516 			}
517 		}
518 		if(options.fastaIdQual && !empty(seq))
519 		{
520 			if(options.minClippedLen == 0)_clipReads(fastaIDs[i],seq,qual,options); // if the header wasnt clipped before, then clip now!! necessary for quality in header
521 			qual = suffix(fastaIDs[i],length(fastaIDs[i])-length(seq));
522 			if(options.readNaming == 0)
523 				fastaIDs[i] = prefix(fastaIDs[i],length(fastaIDs[i])-length(seq));
524 			else clear(fastaIDs[i]);
525 		}
526 #endif
527 		if (countN)
528 		{
529 			int count = 0;
530 			int cutoffCount = (int)(options.errorRate * length(seq));
531 			for (unsigned j = 0; j < length(seq); ++j)
532 				if (getValue(seq, j) == 'N')
533 					if (++count > cutoffCount)
534 					{
535 						clear(seq);
536 						clear(qual);
537 						++kickoutcount;
538 						break;
539 					}
540 // low qual. reads are empty to output them and their id later as LQ reads
541 //			if (count > cutoffCount) continue;
542 		}
543 #ifdef RAZERS_MICRO_RNA
544 		if(options.microRNA && length(seq)<options.rnaSeedLength)
545 		{
546 			clear(seq);
547 			clear(qual);
548 		}
549 #endif
550 
551 		// store dna and quality together
552 		for (unsigned j = 0; j < length(qual) && j < length(seq); ++j)
553 			assignQualityValue(seq[j], (int)(ordValue(qual[j]) - 33));
554 /*			if(ordValue(seq[j])>3)
555 				setValue(hybridSeq[j],(unsigned int)164); //N=164 such that &3 == 0
556 			else
557 				setValue(hybridSeq[j],(unsigned int)((ordValue(qual[j]) - 33) * 4 + ordValue(seq[j])));
558 */
559 		if (options.trimLength > 0 && length(seq) > (unsigned)options.trimLength)
560 			resize(seq, options.trimLength);
561 		if((int)length(seq) > options.maxReadLength)
562  			options.maxReadLength = length(seq);
563 
564 #ifdef RAZERS_CONCATREADS
565 		appendValue(reads, seq, Generous());
566 #else
567 		assign(reads[i], seq, Exact());
568 #endif
569 	}
570 #ifdef RAZERS_CONCATREADS
571 	reserve(reads.concat, length(reads.concat), Exact());
572 #endif
573 
574 	if (options._debugLevel > 1 && kickoutcount > 0)
575 		::std::cerr << "Ignoring " << kickoutcount << " low quality reads.\n";
576 	return (seqCount > 0);
577 }
578 
579 //////////////////////////////////////////////////////////////////////////////
580 // Read the first sequence of a multi-sequence file
581 // and return its length
582 inline int estimateReadLength(char const *fileName)
583 {
584 	MultiFasta multiFasta;
585 	if (!open(multiFasta.concat, fileName, OPEN_RDONLY))	// open the whole file
586 		return RAZERS_READS_FAILED;
587 
588 	AutoSeqFormat format;
589 	guessFormat(multiFasta.concat, format);					// guess file format
590 	split(multiFasta, format);								// divide into single sequences
591 
592 	if (length(multiFasta) == 0)
593 		return 0;
594 
595 	Dna5String firstRead;
596 	assignSeq(firstRead, multiFasta[0], format);			// read the first sequence
597 	return length(firstRead);
598 }
599 
600 
601 /*
602 //////////////////////////////////////////////////////////////////////////////
603 // Load multi-Fasta sequences
604 template <typename TGenomeSet>
605 bool loadGenomes(TGenomeSet &genomes, const char *fileName)
606 {
607 	MultiFasta multiFasta;
608 	if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) return false;
609 	split(multiFasta, Fasta());
610 
611 	unsigned seqCount = length(multiFasta);
612 	resize(genomes, seqCount, Exact());
613 	for(unsigned i = 0; i < seqCount; ++i)
614 		assignSeq(genomes[i], multiFasta[i], Fasta());		// read Genome sequence
615 
616 	return (seqCount > 0);
617 }*/
618 
619 //////////////////////////////////////////////////////////////////////////////
620 // Load multi-Fasta sequences from multiple files
621 template <typename TGenomeSet>
622 bool loadGenomes(TGenomeSet &genomes, StringSet<CharString> &fileNameList)
623 {
624 	unsigned gSeqNo = 0;
625 	unsigned filecount = 0;
626 	while(filecount < length(fileNameList))
627 	{
628 		MultiFasta multiFasta;
629 		if (!open(multiFasta.concat, toCString(fileNameList[filecount]), OPEN_RDONLY)) return false;
630 		split(multiFasta, Fasta());
631 
632 		unsigned seqCount = length(multiFasta);
633 		if(length(genomes) < gSeqNo+seqCount)
634 			resize(genomes,gSeqNo+seqCount);
635 		for(unsigned i = 0; i < seqCount; ++i)
636 			assignSeq(genomes[gSeqNo+i], multiFasta[i], Fasta());		// read Genome sequence
637 		gSeqNo += seqCount;
638 		++filecount;
639 	}
640 	resize(genomes,gSeqNo);
641 	return (gSeqNo > 0);
642 }
643 
644 #ifdef RAZERS_MICRO_RNA
645 	template <typename TReadMatch>
646 	struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool >
647 	{
648 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
649 		{
650 			// read number
651 			if (a.rseqNo < b.rseqNo) return true;
652 			if (a.rseqNo > b.rseqNo) return false;
653 
654 			// genome position and orientation
655 			if (a.gseqNo < b.gseqNo) return true;
656 			if (a.gseqNo > b.gseqNo) return false;
657 			if (a.gBegin < b.gBegin) return true;
658 			if (a.gBegin > b.gBegin) return false;
659 			if (a.orientation < b.orientation) return true;
660 			if (a.orientation > b.orientation) return false;
661 
662 
663 			if(a.editDist < b.editDist) return true;
664 			if(a.editDist > b.editDist) return false;
665 			return (a.mScore > b.mScore);
666 		}
667 	};
668 
669 	template <typename TReadMatch>
670 	struct LessRNoEdistHLen : public ::std::binary_function < TReadMatch, TReadMatch, bool >
671 	{
672 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
673 		{
674 			// read number
675 			if (a.rseqNo < b.rseqNo) return true;
676 			if (a.rseqNo > b.rseqNo) return false;
677 
678 			if(a.editDist < b.editDist) return true;
679 			if(a.editDist > b.editDist) return false;
680 			return (a.mScore > b.mScore);
681 
682 		}
683 	};
684 #else
685 
686 
687 	template <typename TReadMatch>
688 	struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool >
689 	{
690 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
691 		{
692 			// read number
693 			if (a.rseqNo < b.rseqNo) return true;
694 			if (a.rseqNo > b.rseqNo) return false;
695 
696 			// genome position and orientation
697 			if (a.gseqNo < b.gseqNo) return true;
698 			if (a.gseqNo > b.gseqNo) return false;
699 			if (a.gBegin < b.gBegin) return true;
700 			if (a.gBegin > b.gBegin) return false;
701 			if (a.orientation < b.orientation) return true;
702 			if (a.orientation > b.orientation) return false;
703 
704 			// quality
705 #ifdef RAZERS_MATEPAIRS
706 			return a.pairScore > b.pairScore;
707 #else
708 			return a.editDist < b.editDist;
709 #endif
710 		}
711 	};
712 #endif
713 
714 	// ... to sort matches and remove duplicates with equal gEnd
715 	template <typename TReadMatch>
716 	struct LessRNoGEndPos : public ::std::binary_function < TReadMatch, TReadMatch, bool >
717 	{
718 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
719 		{
720 			// read number
721 			if (a.rseqNo < b.rseqNo) return true;
722 			if (a.rseqNo > b.rseqNo) return false;
723 
724 			// genome position and orientation
725 			if (a.gseqNo < b.gseqNo) return true;
726 			if (a.gseqNo > b.gseqNo) return false;
727 			if (a.gEnd   < b.gEnd) return true;
728 			if (a.gEnd   > b.gEnd) return false;
729 			if (a.orientation < b.orientation) return true;
730 			if (a.orientation > b.orientation) return false;
731 
732 			// quality
733 #ifdef RAZERS_MATEPAIRS
734 			return a.pairScore > b.pairScore;
735 #else
736 			return a.editDist < b.editDist;
737 #endif
738 		}
739 	};
740 
741 	template <typename TReadMatch>
742 	struct LessErrors : public ::std::binary_function < TReadMatch, TReadMatch, bool >
743 	{
744 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
745 		{
746 			// read number
747 			if (a.rseqNo < b.rseqNo) return true;
748 			if (a.rseqNo > b.rseqNo) return false;
749 
750 			// quality
751 #ifdef RAZERS_MATEPAIRS
752 			return a.pairScore > b.pairScore;
753 #else
754 			return a.editDist < b.editDist;
755 #endif
756 		}
757 	};
758 
759 template <typename TReadMatch>
760 struct LessSplicedScore : public ::std::binary_function < TReadMatch, TReadMatch, bool >
761 {
762 	inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
763 	{
764 		// read number
765 		if (a.rseqNo < b.rseqNo ) return true;
766 		if (a.rseqNo > b.rseqNo ) return false;
767 		//if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true;
768 		//if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false;
769 
770 		// quality
771 		if (a.pairScore > b.pairScore) return true;
772 		if (a.pairScore < b.pairScore) return false;
773 		return a.pairId < b.pairId;
774 	}
775 };
776 
777 
778 template <typename TReadMatch>
779 struct LessSplicedScoreGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool >
780 {
781 	inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
782 	{
783 		// read number
784 		if (a.rseqNo < b.rseqNo ) return true;
785 		if (a.rseqNo > b.rseqNo ) return false;
786 		//if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true;
787 		//if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false;
788 
789 		// quality
790 		if (a.pairScore > b.pairScore) return true;
791 		if (a.pairScore < b.pairScore) return false;
792 		if (a.pairId < b.pairId) return true;
793 		if (a.pairId > b.pairId) return false;
794 		if (a.gBegin < b.gBegin) return true;
795 		if (a.gBegin > b.gBegin) return false;
796 		return a.editDist < b.editDist;
797 	}
798 };
799 
800 
801 
802 #ifdef RAZERS_DIRECT_MAQ_MAPPING
803 
804 	struct QualityBasedScoring{};
805 
806 	template <typename TReadMatch>
807 	struct LessRNoMQ : public ::std::binary_function < TReadMatch, TReadMatch, bool >
808 	{
809 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
810 		{
811 			// read number
812 			if (a.rseqNo < b.rseqNo) return true;
813 			if (a.rseqNo > b.rseqNo) return false;
814 
815 			// quality
816 			if (a.mScore < b.mScore) return true; // sum of quality values of mismatches (the smaller the better)
817 			if (a.mScore > b.mScore) return false;
818 
819 			return (a.editDist < b.editDist); // seedEditDist?
820 			// genome position and orientation
821 	/*		if (a.gseqNo < b.gseqNo) return true;
822 			if (a.gseqNo > b.gseqNo) return false;
823 			if (a.gBegin < b.gBegin) return true;
824 			if (a.gBegin > b.gBegin) return false;
825 			if (a.orientation < b.orientation) return true;
826 			if (a.orientation > b.orientation) return false;
827 	*/
828 		}
829 	};
830 #endif
831 
832 //////////////////////////////////////////////////////////////////////////////
833 // Remove duplicate matches and leave at most maxHits many distanceRange
834 // best matches per read
835 template < typename TMatches >
836 void maskDuplicates(TMatches &matches)
837 {
838 	typedef typename Value<TMatches>::Type					TMatch;
839 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
840 
841 	//////////////////////////////////////////////////////////////////////////////
842 	// remove matches with equal ends
843 
844 	::std::sort(
845 		begin(matches, Standard()),
846 		end(matches, Standard()),
847 		LessRNoGEndPos<TMatch>());
848 
849 	typename	TMatch::TGPos gBegin = -1;
850 	typename	TMatch::TGPos gEnd = -1;
851 	unsigned	gseqNo = -1;
852 	unsigned	readNo = -1;
853 	char		orientation = '-';
854 
855 	TIterator it = begin(matches, Standard());
856 	TIterator itEnd = end(matches, Standard());
857 
858 	for (; it != itEnd; ++it)
859 	{
860 #ifdef RAZERS_MATEPAIRS
861 		if ((*it).pairId != 0) continue;
862 #endif
863 		if (gEnd == (*it).gEnd && orientation == (*it).orientation &&
864 			gseqNo == (*it).gseqNo && readNo == (*it).rseqNo)
865 		{
866 			(*it).orientation = '-';
867 			continue;
868 		}
869 		readNo = (*it).rseqNo;
870 		gseqNo = (*it).gseqNo;
871 		gEnd = (*it).gEnd;
872 		orientation = (*it).orientation;
873 	}
874 
875 	//////////////////////////////////////////////////////////////////////////////
876 	// remove matches with equal begins
877 
878 	::std::sort(
879 		begin(matches, Standard()),
880 		end(matches, Standard()),
881 		LessRNoGPos<TMatch>());
882 
883 	orientation = '-';
884 
885 	it = begin(matches, Standard());
886 	itEnd = end(matches, Standard());
887 
888 	for (; it != itEnd; ++it)
889 	{
890 		if ((*it).orientation == '-'
891 #ifdef RAZERS_MATEPAIRS
892 			|| ((*it).pairId != 0)
893 #endif
894 			) continue;
895 		if (gBegin == (*it).gBegin && readNo == (*it).rseqNo &&
896 			gseqNo == (*it).gseqNo && orientation == (*it).orientation)
897 		{
898 			(*it).orientation = '-';
899 			continue;
900 		}
901 		readNo = (*it).rseqNo;
902 		gseqNo = (*it).gseqNo;
903 		gBegin = (*it).gBegin;
904 		orientation = (*it).orientation;
905 	}
906 
907 	::std::sort(
908 		begin(matches, Standard()),
909 		end(matches, Standard()),
910 		LessErrors<TMatch>());
911 }
912 
913 //////////////////////////////////////////////////////////////////////////////
914 // Count matches for each number of errors
915 template < typename TMatches, typename TCounts >
916 void countMatches(TMatches &matches, TCounts &cnt)
917 {
918 	typedef typename Value<TMatches>::Type					TMatch;
919 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
920 	typedef typename Value<TCounts>::Type					TRow;
921 	typedef typename Value<TRow>::Type						TValue;
922 
923 	TIterator it = begin(matches, Standard());
924 	TIterator itEnd = end(matches, Standard());
925 
926 	unsigned readNo = -1;
927 	short editDist = -1;
928 	__int64 count = 0;
929 	__int64 maxVal = MaxValue<TValue>::VALUE;
930 
931 	for (; it != itEnd; ++it)
932 	{
933 		if ((*it).orientation == '-') continue;
934 		if (readNo == (*it).rseqNo && editDist == (*it).editDist)
935 			++count;
936 		else
937 		{
938 			if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt))
939 				cnt[editDist][readNo] = (maxVal < count)? maxVal : count;
940 			readNo = (*it).rseqNo;
941 			editDist = (*it).editDist;
942 			count = 1;
943 		}
944 	}
945 	if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt))
946 		cnt[editDist][readNo] = count;
947 
948 
949 }
950 
951 //////////////////////////////////////////////////////////////////////////////
952 
953 template < typename TReadNo, typename TMaxErrors >
954 inline void
955 setMaxErrors(Nothing &, TReadNo, TMaxErrors)
956 {
957 }
958 
959 template < typename TSwift, typename TReadNo, typename TMaxErrors >
960 inline void
961 setMaxErrors(TSwift &swift, TReadNo readNo, TMaxErrors maxErrors)
962 {
963 	int minT = _qgramLemma(swift, readNo, maxErrors);
964 	if (minT > 1)
965 	{
966 		if (maxErrors < 0) minT = MaxValue<int>::VALUE;
967 //		::std::cout<<" read:"<<readNo<<" newThresh:"<<minT;
968 		setMinThreshold(swift, readNo, (unsigned)minT);
969 	}
970 }
971 
972 //////////////////////////////////////////////////////////////////////////////
973 // Remove low quality matches
974 template < typename TMatches, typename TCounts, typename TSpec, typename TSwift >
975 void compactMatches(TMatches &matches, TCounts &
976 #ifdef RAZERS_DIRECT_MAQ_MAPPING
977 		cnts
978 #endif
979 	, RazerSOptions<TSpec> &options
980 	, bool compactFinal ,
981 	TSwift &
982 #if defined RAZERS_DIRECT_MAQ_MAPPING || defined RAZERS_MASK_READS
983 		swift
984 #endif
985 	)
986 {
987     // Get rid of "unused variable" warnings.  This is hard to read
988     // and should not be done anywhere else. Better not use ifdefs.
989     (void)compactFinal;
990 #ifdef RAZERS_DIRECT_MAQ_MAPPING
991 	if(options.maqMapping) compactMatches(matches, cnts,options,compactFinal,swift,true);
992 #endif
993 	typedef typename Value<TMatches>::Type					TMatch;
994 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
995 
996 #ifdef RAZERS_MICRO_RNA
997 	if(options.microRNA)
998 		::std::sort(
999 			begin(matches, Standard()),
1000 			end(matches, Standard()),
1001 			LessRNoEdistHLen<TMatch>());
1002 	int bestMScore = 0;
1003 #endif
1004 
1005 	unsigned readNo = -1;
1006 	unsigned hitCount = 0;
1007 	unsigned hitCountCutOff = options.maxHits;
1008 	__int64 disabled = 0;
1009 #ifdef RAZERS_MICRO_RNA
1010 	if(options.microRNA && options.purgeAmbiguous)
1011 		++hitCountCutOff;	// we keep one more match than we actually want, so we can later decide
1012 							// whether the read mapped more than maxhits times
1013 #endif
1014 	int editDistCutOff = MaxValue<int>::VALUE;
1015 
1016 	TIterator it = begin(matches, Standard());
1017 	TIterator itEnd = end(matches, Standard());
1018 	TIterator dit = it;
1019 	TIterator ditBeg = it;
1020 
1021 	for (; it != itEnd; ++it)
1022 	{
1023 		if ((*it).orientation == '-') continue;
1024 		if (readNo == (*it).rseqNo
1025 #ifdef RAZERS_MATEPAIRS
1026 			&& (*it).pairId == 0
1027 #endif
1028 			)
1029 		{
1030 			if ((*it).editDist >= editDistCutOff) continue;
1031 #ifdef RAZERS_MICRO_RNA
1032 			if ( (*it).mScore < bestMScore) continue;
1033 #endif
1034 			if (++hitCount >= hitCountCutOff)
1035 			{
1036 #ifdef RAZERS_MASK_READS
1037 				if (hitCount == hitCountCutOff)
1038 				{
1039 					// we have enough, now look for better matches
1040 					int maxErrors = (*it).editDist - 1;
1041 					if (options.purgeAmbiguous && (options.distanceRange == 0 || (*it).editDist < options.distanceRange))
1042 						maxErrors = -1;
1043 
1044 					setMaxErrors(swift, readNo, maxErrors);
1045 
1046 					if (maxErrors == -1)
1047 						++disabled;
1048 //						::std::cerr << "(read #" << readNo << " disabled)";
1049 
1050 					if (options.purgeAmbiguous)
1051 					{
1052 						if (options.distanceRange == 0 || (*it).editDist < options.distanceRange || compactFinal)
1053 							dit = ditBeg;
1054 						else {
1055 							*dit = *it;
1056 							++dit;
1057 						}
1058 					}
1059 				}
1060 #endif
1061 				continue;
1062 			}
1063 		}
1064 		else
1065 		{
1066 			readNo = (*it).rseqNo;
1067 			hitCount = 0;
1068 			if (options.distanceRange > 0)
1069 				editDistCutOff = (*it).editDist + options.distanceRange;
1070 #ifdef RAZERS_MICRO_RNA
1071 			bestMScore = (*it).mScore;
1072 #endif
1073 			ditBeg = dit;
1074 		}
1075 		*dit = *it;
1076 		++dit;
1077 	}
1078 	if (options._debugLevel >= 2)
1079 	{
1080 		std::cerr << " [" << length(matches) - (dit - begin(matches, Standard())) << " matches removed]";
1081 		std::cerr << " [" << disabled << " reads disabled]";
1082 	}
1083 
1084 	resize(matches, dit - begin(matches, Standard()));
1085 }
1086 
1087 
1088 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1089 //////////////////////////////////////////////////////////////////////////////
1090 // Remove low quality matches
1091 template < typename TMatches, typename TCounts, typename TSpec, typename TSwift >
1092 void compactMatches(TMatches &matches, TCounts &cnts, RazerSOptions<TSpec> &, bool, TSwift &, bool dontCountFirstTwo)
1093 {
1094 	typedef typename Value<TMatches>::Type				TMatch;
1095 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
1096 
1097 	::std::sort(
1098 		begin(matches, Standard()),
1099 		end(matches, Standard()),
1100 		LessRNoMQ<TMatch>());
1101 
1102 	unsigned readNo = -1;
1103 
1104 	TIterator it = begin(matches, Standard());
1105 	TIterator itEnd = end(matches, Standard());
1106 	TIterator dit = it;
1107 
1108 	//number of errors may not exceed 31!
1109 	bool second = true;
1110 	for (; it != itEnd; ++it)
1111 	{
1112 		if ((*it).orientation == '-') continue;
1113 		if (readNo == (*it).rseqNo)
1114 		{
1115 			//second best match
1116 			if (second)
1117 			{
1118 				second = false;
1119 				if((cnts[1][(*it).rseqNo] & 31)  > (*it).editDist)
1120 				{
1121 					//this second best match is better than any second best match before
1122 					cnts[1][(*it).rseqNo] = (*it).editDist; // second best dist is this editDist
1123 										// count is 0 (will be updated if countFirstTwo)
1124 				}
1125 				if(!dontCountFirstTwo)
1126 					if((cnts[1][(*it).rseqNo]>>5) != 2047) cnts[1][(*it).rseqNo] += 32;
1127 			}
1128 			else
1129 			{
1130 				if ((*it).editDist <= (cnts[0][(*it).rseqNo] & 31) )
1131 					if(cnts[0][(*it).rseqNo]>>5 != 2047)
1132 						cnts[0][(*it).rseqNo] +=32;
1133 				if ((*it).editDist <= (cnts[1][(*it).rseqNo] & 31) )
1134 					if((cnts[1][(*it).rseqNo]>>5) != 2047)
1135 						cnts[1][(*it).rseqNo] +=32;
1136 				continue;
1137 			}
1138 		} else
1139 		{	//best match
1140 			second = true;
1141 			readNo = (*it).rseqNo;
1142 			//cnts has 16bits, 11:5 for count:dist
1143 			if((cnts[0][(*it).rseqNo] & 31)  > (*it).editDist)
1144 			{
1145 				//this match is better than any match before
1146 				cnts[1][(*it).rseqNo] = cnts[0][(*it).rseqNo]; // best before is now second best
1147 									       // (count will be updated when match is removed)
1148 				cnts[0][(*it).rseqNo] = (*it).editDist; // best dist is this editDist
1149 									// count is 0 (will be updated if countFirstTwo)
1150 			}
1151 			if(!dontCountFirstTwo)
1152 				if((cnts[0][(*it).rseqNo]>>5) != 2047) cnts[0][(*it).rseqNo] += 32;	// shift 5 to the right, add 1, shift 5 to the left, and keep count
1153 		}
1154 		*dit = *it;
1155 		++dit;
1156 	}
1157 
1158 	resize(matches, dit - begin(matches, Standard()));
1159 }
1160 #endif
1161 
1162 
1163 #ifdef RAZERS_MICRO_RNA
1164 
1165 template < typename TMatches, typename TSpec >
1166 void purgeAmbiguousRnaMatches(TMatches &matches, RazerSOptions<TSpec> &options)
1167 {
1168 	typedef typename Value<TMatches>::Type                          TMatch;
1169 	typedef typename Iterator<TMatches, Standard>::Type             TIterator;
1170 
1171 	::std::sort(begin(matches, Standard()),end(matches, Standard()),LessRNoEdistHLen<TMatch>());
1172 	int bestMScore = 0;
1173 
1174 	unsigned readNo = -1;
1175 	unsigned hitCount = 0;
1176 	unsigned hitCountCutOff = options.maxHits;
1177 	int editDistCutOff = MaxValue<int>::VALUE;
1178 
1179 	TIterator it = begin(matches, Standard());
1180 	TIterator itEnd = end(matches, Standard());
1181 	TIterator dit = it;
1182 	TIterator ditBeg = it;
1183 
1184 	for (; it != itEnd; ++it)
1185 	{
1186 		if ((*it).orientation == '-') continue;
1187 		if (readNo == (*it).rseqNo)
1188 		{
1189 			if ((*it).editDist >= editDistCutOff) continue;
1190 			if ( (*it).mScore < bestMScore) continue;
1191 			if (++hitCount >= hitCountCutOff)
1192 			{
1193 				if (hitCount == hitCountCutOff)
1194 					dit = ditBeg;
1195 				continue;
1196 			}
1197 		}
1198 		else
1199 		{
1200 			readNo = (*it).rseqNo;
1201 			hitCount = 0;
1202 			if (options.distanceRange > 0)
1203 				editDistCutOff = (*it).editDist + options.distanceRange;
1204 			bestMScore = (*it).mScore;
1205 			ditBeg = dit;
1206 		}
1207 		*dit = *it;
1208 		++dit;
1209 	}
1210 	resize(matches, dit - begin(matches, Standard()));
1211 }
1212 
1213 
1214 
1215 //////////////////////////////////////////////////////////////////////////////
1216 // Hamming verification recording sum of mismatching qualities in m.mScore
1217 template <
1218 	typename TMatch,
1219 	typename TGenome,
1220 	typename TReadSet,
1221 	typename TSpec >
1222 inline bool
1223 matchVerify(
1224 	TMatch &m,					// resulting match
1225 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1226 	unsigned rseqNo,				// read number
1227 	TReadSet &readSet,				// reads
1228 	RazerSOptions<TSpec> const &options,		// RazerS options
1229 	MicroRNA)					// MaqMapping
1230 {
1231 
1232 	typedef Segment<TGenome, InfixSegment>					TGenomeInfix;
1233 	typedef typename Value<TReadSet>::Type const			TRead;
1234 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
1235 	typedef typename Iterator<TRead, Standard>::Type		TReadIterator;
1236 
1237 	unsigned ndlLength = sequenceLength(rseqNo, readSet);
1238 	if (length(inf) < ndlLength) return false;
1239 
1240 	// verify
1241 	TRead &read		= readSet[rseqNo];
1242 	TReadIterator ritBeg	= begin(read, Standard());
1243 	TReadIterator ritEnd	= end(read, Standard());
1244 	TGenomeIterator git	= begin(inf, Standard());
1245 	TGenomeIterator gitEnd	= end(inf, Standard()) - (ndlLength - 1);
1246 
1247 	// this is max number of errors the seed should have
1248 	unsigned maxErrorsSeed = (unsigned)(options.rnaSeedLength * options.errorRate);
1249 	unsigned minSeedErrors = maxErrorsSeed + 1;
1250 	unsigned bestHitLength = 0;
1251 
1252 	for (; git < gitEnd; ++git)
1253 	{
1254 		bool hit = true;
1255 		unsigned hitLength = 0;
1256 		unsigned count = 0;
1257 		unsigned seedErrors = 0;
1258 		TGenomeIterator g = git;	//maq would count errors in the first 28bp only (at least initially. not for output)
1259 		for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
1260 		{
1261 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
1262 			{
1263 				if (count < options.rnaSeedLength)		// the maq (28bp-)seed
1264 				{
1265 					if(++seedErrors > maxErrorsSeed)
1266 					{
1267 						hit = false;
1268 						break;
1269 					}
1270 				}
1271 				else
1272 					break;
1273 			}
1274 			++count;
1275 		}
1276 		if (hit) hitLength = count;
1277 		if (hitLength > bestHitLength ) //simply take the longest hit
1278 		{
1279 			minSeedErrors = seedErrors;
1280 			bestHitLength = hitLength;
1281 			m.gBegin = git - begin(host(inf), Standard());
1282 		}
1283 	}
1284 
1285 //	std::cout  << "options.absMaxQualSumErrors" << options.absMaxQualSumErrors << std::endl;
1286 //	std::cout  << "maxSeedErrors" << maxErrorsSeed << std::endl;
1287 //	std::cout  << "minErrors" << minSeedErrors << std::endl;
1288 //	if(derBesgte) ::std::cout << minErrors <<"minErrors\n";
1289 	if (minSeedErrors > maxErrorsSeed) return false;
1290 
1291 	m.gEnd = m.gBegin + bestHitLength;
1292 	m.editDist = minSeedErrors;			// errors in seed or total number of errors?
1293 	m.mScore = bestHitLength;
1294 	m.seedEditDist = minSeedErrors;
1295 	return true;
1296 }
1297 
1298 #endif
1299 
1300 
1301 
1302 //////////////////////////////////////////////////////////////////////////////
1303 // Hamming verification
1304 template <
1305 	typename TMatch,
1306 	typename TGenome,
1307 	typename TReadSet,
1308 	typename TMyersPatterns,
1309 	typename TSpec >
1310 inline bool
1311 matchVerify(
1312 	TMatch &m,								// resulting match
1313 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1314 	unsigned rseqNo,						// read number
1315 	TReadSet &readSet,						// reads
1316 	TMyersPatterns const &,					// MyersBitVector preprocessing data
1317 	RazerSOptions<TSpec> const &options,	// RazerS options
1318 	SwiftSemiGlobalHamming)					// Hamming only
1319 {
1320 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1321 	if(options.maqMapping)
1322 		return matchVerify(m,inf,rseqNo,readSet,options,QualityBasedScoring());
1323 #endif
1324 
1325 #ifdef RAZERS_MICRO_RNA
1326 	if(options.microRNA)
1327 		return matchVerify(m,inf,rseqNo,readSet,options,MicroRNA());
1328 #endif
1329 
1330 	typedef Segment<TGenome, InfixSegment>					TGenomeInfix;
1331 	typedef typename Value<TReadSet>::Type const			TRead;
1332 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
1333 	typedef typename Iterator<TRead, Standard>::Type		TReadIterator;
1334 
1335 #ifdef RAZERS_DEBUG
1336 	::std::cout<<"Verify: "<<::std::endl;
1337 	::std::cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
1338 	::std::cout<<"Read:   "<<readSet[rseqNo] << ::std::endl;
1339 #endif
1340 
1341 	unsigned ndlLength = sequenceLength(rseqNo, readSet);
1342 	if (length(inf) < ndlLength) return false;
1343 
1344 	// verify
1345 	TRead &read				= readSet[rseqNo];
1346 	TReadIterator ritBeg	= begin(read, Standard());
1347 	TReadIterator ritEnd	= end(read, Standard());
1348 	TGenomeIterator git		= begin(inf, Standard());
1349 	TGenomeIterator gitEnd	= end(inf, Standard()) - (ndlLength - 1);
1350 
1351 	unsigned maxErrors = (unsigned)(ndlLength * options.errorRate);
1352 	unsigned minErrors = maxErrors + 1;
1353 
1354 	for (; git < gitEnd; ++git)
1355 	{
1356 		unsigned errors = 0;
1357 		TGenomeIterator g = git;
1358 		for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
1359 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
1360 				if (++errors > maxErrors)
1361 					break;
1362 		if (minErrors > errors)
1363 		{
1364 			minErrors = errors;
1365 			m.gBegin = git - begin(host(inf), Standard());
1366 		}
1367 	}
1368 
1369 	if (minErrors > maxErrors) return false;
1370 
1371 	m.gEnd = m.gBegin + ndlLength;
1372 	m.editDist = minErrors;
1373 	return true;
1374 }
1375 
1376 
1377 //////////////////////////////////////////////////////////////////////////////
1378 // Edit distance verification
1379 template <
1380 	typename TMatch,
1381 	typename TGenome,
1382 	typename TReadSet,
1383 	typename TMyersPatterns,
1384 	typename TSpec >
1385 inline bool
1386 matchVerify(
1387 	TMatch &m,								// resulting match
1388 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1389 	unsigned rseqNo,						// read number
1390 	TReadSet &readSet,	    				// reads
1391 	TMyersPatterns &forwardPatterns,		// MyersBitVector preprocessing data
1392 	RazerSOptions<TSpec> const &options,	// RazerS options
1393 	SwiftSemiGlobal)						// Swift specialization
1394 {
1395 	typedef Segment<TGenome, InfixSegment>					TGenomeInfix;
1396 	typedef typename Value<TReadSet>::Type					TRead;
1397 
1398 	// find read match end
1399 	typedef Finder<TGenomeInfix>							TMyersFinder;
1400 	typedef typename Value<TMyersPatterns>::Type			TMyersPattern;
1401 
1402 	// find read match begin
1403 	typedef ModifiedString<TGenomeInfix, ModReverse>		TGenomeInfixRev;
1404 	typedef ModifiedString<TRead, ModReverse>				TReadRev;
1405 	typedef Finder<TGenomeInfixRev>							TMyersFinderRev;
1406 	typedef Pattern<TReadRev, MyersUkkonenGlobal>			TMyersPatternRev;
1407 
1408 	TMyersFinder myersFinder(inf);
1409 	TMyersPattern &myersPattern = forwardPatterns[rseqNo];
1410 
1411 #ifdef RAZERS_DEBUG
1412 	::std::cout<<"Verify: "<<::std::endl;
1413 	::std::cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
1414 	::std::cout<<"Read:   "<<readSet[rseqNo]<<::std::endl;
1415 #endif
1416 
1417     unsigned ndlLength = sequenceLength(rseqNo, readSet);
1418 	int maxScore = MinValue<int>::VALUE;
1419 	int minScore = -(int)(ndlLength * options.errorRate);
1420 	TMyersFinder maxPos;
1421 
1422 	// find end of best semi-global alignment
1423 	while (find(myersFinder, myersPattern, minScore))
1424 		if (maxScore <= getScore(myersPattern))
1425 		{
1426 			maxScore = getScore(myersPattern);
1427 			maxPos = myersFinder;
1428 		}
1429 
1430 	if (maxScore < minScore) return false;
1431 	m.editDist	= (unsigned)-maxScore;
1432 	setEndPosition(inf, m.gEnd = (beginPosition(inf) + position(maxPos) + 1));
1433 
1434 	// limit the beginning to needle length plus errors (== -maxScore)
1435 	if (length(inf) > ndlLength - maxScore)
1436 		setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore);
1437 
1438 	// find beginning of best semi-global alignment
1439 	TGenomeInfixRev		infRev(inf);
1440 	TReadRev			readRev(readSet[rseqNo]);
1441 	TMyersFinderRev		myersFinderRev(infRev);
1442 	TMyersPatternRev	myersPatternRev(readRev);
1443 
1444 	_patternMatchNOfPattern(myersPatternRev, options.matchN);
1445 	_patternMatchNOfFinder(myersPatternRev, options.matchN);
1446 	while (find(myersFinderRev, myersPatternRev, maxScore))
1447 		m.gBegin = m.gEnd - (position(myersFinderRev) + 1);
1448 
1449 	return true;
1450 }
1451 
1452 
1453 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1454 //////////////////////////////////////////////////////////////////////////////
1455 // Hamming verification recording sum of mismatching qualities in m.mScore
1456 template <
1457 	typename TMatch,
1458 	typename TGenome,
1459 	typename TReadSet,
1460 	typename TSpec >
1461 inline bool
1462 matchVerify(
1463 	TMatch &m,					// resulting match
1464 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1465 	unsigned rseqNo,				// read number
1466 	TReadSet &readSet,				// reads
1467 	RazerSOptions<TSpec> const &options,		// RazerS options
1468 	QualityBasedScoring)					// MaqMapping
1469 {
1470 
1471 	typedef Segment<TGenome, InfixSegment>				TGenomeInfix;
1472 	typedef typename Value<TReadSet>::Type const			TRead;
1473 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
1474 	typedef typename Iterator<TRead, Standard>::Type		TReadIterator;
1475 
1476 // #ifdef RAZERS_DEBUG
1477 // 	cout<<"Verify: "<<::std::endl;
1478 // 	cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
1479 // 	cout<<"Read:   "<<host(myersPattern)<<::std::endl;
1480 // #endif
1481 
1482 //	bool derBesgte = false;
1483 	//if(rseqNo == 2) derBesgte = true;
1484 //	if(derBesgte) ::std::cout << "der besagte\n";
1485 	unsigned ndlLength = sequenceLength(rseqNo, readSet);
1486 	if (length(inf) < ndlLength) return false;
1487 
1488 	// verify
1489 	TRead &read		= readSet[rseqNo];
1490 	TReadIterator ritBeg	= begin(read, Standard());
1491 	TReadIterator ritEnd	= end(read, Standard());
1492 	TGenomeIterator git	= begin(inf, Standard());
1493 	TGenomeIterator gitEnd	= end(inf, Standard()) - (ndlLength - 1);
1494 	TGenomeIterator bestIt	= begin(inf, Standard());
1495 
1496 	// this is max number of errors the 28bp 'seed' should have
1497 	//assuming that maxErrors-1 matches can be found with 100% SN
1498 	unsigned maxErrorsSeed = (unsigned)(options.artSeedLength * options.errorRate) + 1;
1499 	unsigned maxErrorsTotal = (unsigned)(ndlLength * 0.25); //options.maxErrorRate);
1500 	unsigned minErrors = maxErrorsTotal + 1;
1501 	int minQualSumErrors = options.absMaxQualSumErrors + 10;
1502 	unsigned minSeedErrors = maxErrorsSeed + 1;
1503 
1504 	for (; git < gitEnd; ++git)
1505 	{
1506 		bool hit = true;
1507 		unsigned errors = 0;
1508 		unsigned count = 0;
1509 		unsigned seedErrors = 0;
1510 		int qualSumErrors = 0;
1511 		TGenomeIterator g = git;	//maq would count errors in the first 28bp only (at least initially. not for output)
1512 		for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
1513 		{
1514 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
1515 			{
1516 			//	::std::cout << count << "<-";
1517 				if (++errors > maxErrorsTotal)
1518 				{
1519 					hit = false;
1520 					break;
1521 				}
1522 				//int qualityValue = (int)((unsigned char)*r >> 3);
1523 				//qualSumErrors += (qualityValue < options.mutationRateQual) ? qualityValue : options.mutationRateQual;
1524 				qualSumErrors += (getQualityValue(*r) < options.mutationRateQual) ? getQualityValue(*r) : options.mutationRateQual;
1525 				if(qualSumErrors > options.absMaxQualSumErrors || qualSumErrors > minQualSumErrors)
1526 				{
1527 					hit = false;
1528 					break;
1529 				}
1530 				if (count < options.artSeedLength)		// the maq (28bp-)seed
1531 				{
1532 					if(++seedErrors > maxErrorsSeed)
1533 					{
1534 						hit = false;
1535 						break;
1536 					}
1537 					if(qualSumErrors > options.maxMismatchQualSum)
1538 					{
1539 						hit = false;
1540 						break;
1541 					}// discard match, if 'seed' is bad (later calculations are done with the quality sum over the whole read)
1542 				}
1543 			}
1544 			++count;
1545 		}
1546 		if (hit && (qualSumErrors < minQualSumErrors /*&& seedErrors <=maxErrorsSeed*/) ) //oder (seedErrors < minSeedErrors)
1547 		{
1548 			minErrors = errors;
1549 			minSeedErrors = seedErrors;
1550 			minQualSumErrors = qualSumErrors;
1551 			m.gBegin = git - begin(host(inf), Standard());
1552 			bestIt = git;
1553 		}
1554 	}
1555 	if (minQualSumErrors > options.absMaxQualSumErrors || minSeedErrors > maxErrorsSeed || minErrors > maxErrorsTotal) return false;
1556 
1557 	m.gEnd = m.gBegin + ndlLength;
1558 	m.editDist = minErrors;			// errors in seed or total number of errors?
1559 	m.mScore = minQualSumErrors;
1560 	m.seedEditDist = minSeedErrors;
1561 	return true;
1562 }
1563 
1564 
1565 //////////////////////////////////////////////////////////////////////////////
1566 // Edit distance verification
1567 template <
1568 	typename TMatch,
1569 	typename TGenome,
1570 	typename TReadSet,
1571 	typename TMyersPatterns,
1572 	typename TSpec >
1573 inline bool
1574 matchVerify(
1575 	TMatch &m,								// resulting match
1576 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1577 	unsigned rseqNo,						// read number
1578 	TReadSet &readSet,	    				// reads
1579 	TMyersPatterns const & pat,				// MyersBitVector preprocessing data
1580 	RazerSOptions<TSpec> const &options,		// RazerS options
1581 	SwiftSemiGlobal const &swiftsemi,				// Hamming only
1582 	QualityBasedScoring)						// Swift specialization
1583 {
1584 	//if(!options.maqMapping)
1585 		return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi);
1586 	//else
1587 	//	return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi); // todo!
1588 }
1589 #endif
1590 
1591 
1592 
1593 
1594 #ifndef RAZERS_PARALLEL
1595 //////////////////////////////////////////////////////////////////////////////
1596 // Find read matches in one genome sequence
1597 template <
1598 	typename TMatches,
1599 	typename TGenome,
1600 	typename TReadIndex,
1601 	typename TSwiftSpec,
1602 	typename TVerifier,
1603 	typename TCounts,
1604 	typename TSpec >
1605 void mapSingleReads(
1606 	TMatches &matches,				// resulting matches
1607 	TGenome &genome,				// genome ...
1608 	unsigned gseqNo,				// ... and its sequence number
1609 	Pattern<TReadIndex, Swift<TSwiftSpec> > &swiftPattern,
1610 	TVerifier &forwardPatterns,
1611 	TCounts & cnts,
1612 	char orientation,				// q-gram index of reads
1613 	RazerSOptions<TSpec> &options)
1614 {
1615 	typedef typename Fibre<TReadIndex, FibreText>::Type	TReadSet;
1616 	typedef typename Size<TGenome>::Type					TSize;
1617 	typedef typename Value<TMatches>::Type					TMatch;
1618 
1619 
1620 	// FILTRATION
1621 	typedef Finder<TGenome, Swift<TSwiftSpec> >				TSwiftFinder;
1622 	typedef Pattern<TReadIndex, Swift<TSwiftSpec> >			TSwiftPattern;
1623 
1624 	// iterate all genomic sequences
1625 	if (options._debugLevel >= 1)
1626 	{
1627 		::std::cerr << ::std::endl << "Process genome seq #" << gseqNo;
1628 		if (orientation == 'F')
1629 			::std::cerr << "[fwd]";
1630 		else
1631 			::std::cerr << "[rev]";
1632 	}
1633 
1634 	TReadSet &readSet = host(host(swiftPattern));
1635 	TSwiftFinder swiftFinder(genome, options.repeatLength, 1);
1636 
1637 	TMatch m = {	// to supress uninitialized warnings
1638 		0, 0, 0, 0,
1639 #ifdef RAZERS_MATEPAIRS
1640 		0, 0, 0,
1641 #endif
1642 		0,
1643 #ifdef RAZERS_EXTENDED_MATCH
1644 		0, 0,
1645 #endif
1646 #ifdef RAZERS_SPLICED
1647 		0, 0,
1648 #endif
1649 		0
1650 	};
1651 
1652 	SEQAN_PROTIMESTART(map_contig_time);
1653 
1654 	// iterate all verification regions returned by SWIFT
1655 	TSize gLength = length(genome);
1656 	__int64 localTP = 0;
1657 	__int64 localFP = 0;
1658 
1659 #ifdef RAZERS_MICRO_RNA
1660 	while (find(swiftFinder, swiftPattern, 0.2))
1661 #else
1662 	while (find(swiftFinder, swiftPattern, options.errorRate))
1663 #endif
1664 	{
1665 		unsigned rseqNo = (*swiftFinder.curHit).ndlSeqNo;
1666 		if (!options.spec.DONT_VERIFY &&
1667 			matchVerify(m, infix(swiftFinder), rseqNo, readSet, forwardPatterns, options, TSwiftSpec()))
1668 		{
1669 			// transform coordinates to the forward strand
1670 			if (orientation == 'R')
1671 			{
1672 				TSize temp = m.gBegin;
1673 				m.gBegin = gLength - m.gEnd;
1674 				m.gEnd = gLength - temp;
1675 			}
1676 			m.gseqNo = gseqNo;
1677 			m.rseqNo = rseqNo;
1678 			m.orientation = orientation;
1679 #ifdef RAZERS_MATEPAIRS
1680 			m.pairId = 0;
1681 			m.pairScore = 0 - m.editDist;
1682 #endif
1683 
1684 			if (!options.spec.DONT_DUMP_RESULTS)
1685 			{
1686 				appendValue(matches, m, Generous());
1687 				if (length(matches) > options.compactThresh)
1688 				{
1689 #ifndef RAZERS_MICRO_RNA
1690 					typename Size<TMatches>::Type oldSize = length(matches);
1691 #endif
1692 					maskDuplicates(matches);	// overlapping parallelograms cause duplicates
1693 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1694 					if(options.maqMapping)
1695 						compactMatches(matches, cnts, options, false, swiftPattern, true);
1696 					else
1697 #endif
1698 						compactMatches(matches, cnts, options, false, swiftPattern);
1699 #ifdef RAZERS_MICRO_RNA
1700 					options.compactThresh = 2 * length(matches);
1701 #else
1702 					if (length(matches) * 4 > oldSize)			// the threshold should not be raised
1703 						options.compactThresh += (options.compactThresh >> 1);	// if too many matches were removed
1704 #endif
1705 				}
1706 			}
1707 
1708 			++localTP;
1709 //			::std::cerr << "\"" << infix(swiftFinder) << "\"  ";
1710 //			::std::cerr << hstkPos << " + ";
1711 //			::std::cerr << ::std::endl;
1712 		} else {
1713 			++localFP;
1714 //			::std::cerr << "\"" << infx(swiftFinder) << "\"   \"" << infix(swiftPattern) << "\"  ";
1715 //			::std::cerr << rseqNo << " : ";
1716 //			::std::cerr << hstkPos << " + ";
1717 //			::std::cerr << bucketWidth << "  " << TP << ::std::endl;
1718 		}
1719 	}
1720 	options.TP += localTP;
1721 	options.FP += localFP;
1722 	if (options._debugLevel >= 2)
1723 	{
1724 		double spec = 100;
1725 		double time = SEQAN_PROTIMEDIFF(map_contig_time);
1726 		if (localFP+localTP != 0)
1727 			spec = (1000*localTP/(localFP+localTP))/10.0;
1728 		::std::cerr << " [" << (int)((gLength / time)/100.0)/10.0 << "kbp/s " << time << "s] [" << spec << "%SP " << localFP+localTP << "V]";
1729 	}
1730 }
1731 #endif
1732 
1733 
1734 
1735 #ifdef RAZERS_MICRO_RNA
1736 
1737 	// multiple sequences
1738 	template <
1739 		typename TSA,
1740 		typename TStringSet,
1741 		typename TShape,
1742 		typename TDir,
1743 		typename TValue,
1744 		typename TWithConstraints
1745 	>
1746 	inline void
1747 	_qgramFillSuffixArray(
1748 		TSA &sa,
1749 		TStringSet &stringSet,
1750 		TShape &shape,
1751 		TDir &dir,
1752 		Nothing,
1753 		TWithConstraints const,
1754 		TValue prefixLen,
1755 		MicroRNA)
1756 	{
1757 	SEQAN_CHECKPOINT
1758 		typedef typename Value<TStringSet>::Type					TString;
1759 		typedef typename Iterator<TString const, Standard>::Type	TIterator;
1760 		typedef typename Value<TDir>::Type				TSize;
1761 		typedef typename Value<TShape>::Type				THash;
1762 
1763 		for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo)
1764 		{
1765 			TString const &sequence = value(stringSet, seqNo);
1766 			if (length(sequence) < length(shape)) continue;
1767 			int num_qgrams = prefixLen - (int)length(shape) + 1;
1768 
1769 			typename Value<TSA>::Type localPos;
1770 			assignValueI1(localPos, seqNo);
1771 			assignValueI2(localPos, 0);
1772 
1773 			TIterator itText = begin(sequence, Standard());
1774 			if (TWithConstraints::VALUE) {
1775 				THash h = hash(shape, itText) + 1;						// first hash
1776 				if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos;		// if bucket is enabled
1777 			} else
1778 				sa[dir[hash(shape, itText) + 1]++] = localPos;			// first hash
1779 
1780 			for(int i = 1; i < num_qgrams; ++i)
1781 			{
1782 				++itText;
1783 				assignValueI2(localPos, i);
1784 				if (TWithConstraints::VALUE) {
1785 					THash h = hashNext(shape, itText) + 1;				// next hash
1786 					if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos;	// if bucket is enabled
1787 				} else
1788 					sa[dir[hashNext(shape, itText) + 1]++] = localPos;	// next hash
1789 			}
1790 		}
1791 	}
1792 
1793 	template < typename TDir, typename TStringSet, typename TShape, typename TValue >
1794 	inline void
1795 	_qgramCountQGrams(TDir &dir, TStringSet &stringSet, TShape &shape, TValue prefixLen, MicroRNA)
1796 	{
1797 	SEQAN_CHECKPOINT
1798 		typedef typename Value<TStringSet>::Type					TString;
1799 		typedef typename Iterator<TString const, Standard>::Type	TIterator;
1800 		typedef typename Value<TDir>::Type							TSize;
1801 
1802 		for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo)
1803 		{
1804 			TString const &sequence = value(stringSet, seqNo);
1805 			if (length(sequence) < length(shape)) continue;
1806 			TSize num_qgrams = prefixLen - length(shape) + 1;
1807 
1808 			TIterator itText = begin(sequence, Standard());
1809 			++dir[hash(shape, itText)];
1810 			for(TSize i = 1; i < num_qgrams; ++i)
1811 			{
1812 				++itText;
1813 				++dir[hashNext(shape, itText)];
1814 			}
1815 		}
1816 	}
1817 
1818 
1819 	template < typename TIndex, typename TValue>
1820 	void createQGramIndex(TIndex &index, TValue prefixLen, MicroRNA)
1821 	{
1822 	SEQAN_CHECKPOINT
1823 		typename Fibre<TIndex, QGramText>::Type	   &text  = indexText(index);
1824 		typename Fibre<TIndex, QGramSA>::Type         &sa    = indexSA(index);
1825 		typename Fibre<TIndex, QGramDir>::Type        &dir   = indexDir(index);
1826 		typename Fibre<TIndex, QGramShape>::Type      &shape = indexShape(index);
1827 
1828 		Nothing nothing;
1829 
1830 		// 1. clear counters
1831 		arrayFill(begin(dir, Standard()), end(dir, Standard()), 0);
1832 
1833 		// 2. count q-grams
1834 		_qgramCountQGrams(dir, text, shape, prefixLen,MicroRNA());
1835 
1836 		if (_qgramDisableBuckets(index))
1837 		{
1838 			// 3. cumulative sum
1839 			_qgramCummulativeSum(dir, True());
1840 
1841 			// 4. fill suffix array
1842 			_qgramFillSuffixArray(sa, text, shape, dir, nothing, True(), prefixLen, MicroRNA());
1843 
1844 			// 5. correct disabled buckets
1845 			_qgramPostprocessBuckets(dir);
1846 		}
1847 		else
1848 		{
1849 			// 3. cumulative sum
1850 			_qgramCummulativeSum(dir, False());
1851 
1852 			// 4. fill suffix array
1853 			_qgramFillSuffixArray(sa, text, shape, dir, nothing, False(),prefixLen, MicroRNA());
1854 		}
1855 	}
1856 
1857 
1858 #endif
1859 
1860 
1861 
1862 //////////////////////////////////////////////////////////////////////////////
1863 // Find read matches in many genome sequences (import from Fasta)
1864 template <
1865 	typename TMatches,
1866 	typename TReadSet,
1867 	typename TCounts,
1868 	typename TSpec,
1869 	typename TShape,
1870 	typename TSwiftSpec >
1871 int mapSingleReads(
1872 	TMatches &				matches,
1873 	StringSet<CharString> &	genomeFileNameList,
1874 	StringSet<CharString> &	genomeNames,	// genome names, taken from the Fasta file
1875 	::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
1876 	TReadSet const &		readSet,
1877 	TCounts & cnts,
1878 	RazerSOptions<TSpec> &	options,
1879 	TShape const &			shape,
1880 	Swift<TSwiftSpec> const)
1881 {
1882 	typedef typename Value<TReadSet>::Type							TRead;
1883 	typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> >	TIndex;			// q-gram index
1884 	typedef Pattern<TIndex, Swift<TSwiftSpec> >						TSwiftPattern;	// filter
1885 	typedef Pattern<TRead, MyersUkkonen>							TMyersPattern;	// verifier
1886 
1887 /*	// try opening each genome file once before running the whole mapping procedure
1888 	int filecount = 0;
1889 	int numFiles = length(genomeFileNameList);
1890 	while(filecount < numFiles)
1891 	{
1892 		::std::ifstream file;
1893 		file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary);
1894 		if (!file.is_open())
1895 			return RAZERS_GENOME_FAILED;
1896 		file.close();
1897 		++filecount;
1898 	}
1899 	*/
1900 
1901 	// configure q-gram index
1902 	TIndex swiftIndex(readSet, shape);
1903 #ifdef RAZERS_OPENADDRESSING
1904 	swiftIndex.alpha = 2;
1905 #endif
1906 	cargo(swiftIndex).abundanceCut = options.abundanceCut;
1907 	cargo(swiftIndex)._debugLevel = options._debugLevel;
1908 
1909 	// configure Swift
1910 	TSwiftPattern swiftPattern(swiftIndex);
1911 	swiftPattern.params.minThreshold = options.threshold;
1912 	swiftPattern.params.tabooLength = options.tabooLength;
1913 	swiftPattern.params.printDots = options._debugLevel > 0;
1914 
1915 	// init edit distance verifiers
1916 	unsigned readCount = countSequences(swiftIndex);
1917 	String<TMyersPattern> forwardPatterns;
1918 	options.compMask[4] = (options.matchN)? 15: 0;
1919 	if (!options.hammingOnly)
1920 	{
1921 		resize(forwardPatterns, readCount, Exact());
1922 		for(unsigned i = 0; i < readCount; ++i)
1923 		{
1924 			setHost(forwardPatterns[i], indexText(swiftIndex)[i]);
1925 			_patternMatchNOfPattern(forwardPatterns[i], options.matchN);
1926 			_patternMatchNOfFinder(forwardPatterns[i], options.matchN);
1927 		}
1928 	}
1929 #ifdef RAZERS_MICRO_RNA
1930 	typename Size<TIndex>::Type qgram_count = 0;
1931 	if(options.microRNA)
1932 	{
1933 		for(unsigned i = 0; i < countSequences(swiftIndex); ++i)
1934 			if (sequenceLength(i, swiftIndex) >= options.rnaSeedLength)
1935 				qgram_count += options.rnaSeedLength - (length(shape) - 1);
1936 		resize(indexSA(swiftIndex), qgram_count, Exact());
1937 		resize(indexDir(swiftIndex), _fullDirLength(swiftIndex), Exact());
1938 		createQGramIndex(swiftIndex,options.rnaSeedLength,MicroRNA());
1939 	}
1940 #endif
1941 
1942 
1943 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1944 	if(options.maqMapping)
1945 	{
1946 		resize(cnts, 2);
1947 		for (unsigned i = 0; i < length(cnts); ++i)
1948 			resize(cnts[i], readCount, 31); //initialize with maxeditDist, 11:5 for count:dist
1949 	}
1950 #endif
1951 
1952 	// clear stats
1953 	options.FP = 0;
1954 	options.TP = 0;
1955 	options.timeMapReads = 0;
1956 	options.timeDumpResults = 0;
1957 
1958 	unsigned filecount = 0;
1959 	unsigned numFiles = length(genomeFileNameList);
1960 	unsigned gseqNo = 0;
1961 
1962 	// open genome files, one by one
1963 	while (filecount < numFiles)
1964 	{
1965 		// open genome file
1966 		::std::ifstream file;
1967 		file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary);
1968 		if (!file.is_open())
1969 			return RAZERS_GENOME_FAILED;
1970 
1971 		// remove the directory prefix of current genome file
1972 		::std::string genomeFile(toCString(genomeFileNameList[filecount]));
1973 		size_t lastPos = genomeFile.find_last_of('/') + 1;
1974 		if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1;
1975 		if (lastPos == genomeFile.npos) lastPos = 0;
1976 		::std::string genomeName = genomeFile.substr(lastPos);
1977 
1978 
1979 		CharString	id;
1980 		Dna5String	genome;
1981 		unsigned gseqNoWithinFile = 0;
1982 		// iterate over genome sequences
1983 		SEQAN_PROTIMESTART(find_time);
1984 		for(; !_streamEOF(file); ++gseqNo)
1985 		{
1986 			if (options.genomeNaming == 0)
1987 			{
1988 				//readID(file, id, Fasta());			// read Fasta id
1989 				readShortID(file, id, Fasta());			// read Fasta id up to first whitespace
1990 				appendValue(genomeNames, id, Generous());
1991 			}
1992 			read(file, genome, Fasta());			// read Fasta sequence
1993 
1994 			gnoToFileMap.insert(::std::make_pair<unsigned,::std::pair< ::std::string,unsigned> >(gseqNo,::std::make_pair< ::std::string,unsigned>(genomeName,gseqNoWithinFile)));
1995 
1996 			if (options.forward)
1997 				mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options);
1998 
1999 			if (options.reverse)
2000 			{
2001 				reverseComplement(genome);
2002 				mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options);
2003 			}
2004 			++gseqNoWithinFile;
2005 
2006 		}
2007 		options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
2008 		file.close();
2009 		++filecount;
2010 	}
2011 
2012 	if (options._debugLevel >= 1)
2013 		::std::cerr << ::std::endl << "Finding reads took               \t" << options.timeMapReads << " seconds" << ::std::endl;
2014 	if (options._debugLevel >= 2) {
2015 		::std::cerr << ::std::endl;
2016 		::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
2017 		::std::cerr << "Swift FP: " << options.FP << ::std::endl;
2018 		::std::cerr << "Swift TP: " << options.TP << ::std::endl;
2019 	}
2020 	return 0;
2021 }
2022 
2023 
2024 //////////////////////////////////////////////////////////////////////////////
2025 // Find read matches in many genome sequences (given as StringSet)
2026 template <
2027 	typename TMatches,
2028 	typename TGenomeSet,
2029 	typename TReadSet,
2030 	typename TCounts,
2031 	typename TSpec,
2032 	typename TShape,
2033 	typename TSwiftSpec >
2034 int mapSingleReads(
2035 	TMatches &				matches,
2036 	TGenomeSet &			genomeSet,
2037 	TReadSet const &		readSet,
2038 	TCounts &				cnts,
2039 	RazerSOptions<TSpec> &	options,
2040 	TShape const &			shape,
2041 	Swift<TSwiftSpec> const)
2042 {
2043 	typedef typename Value<TReadSet>::Type							TRead;
2044 	typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> >	TIndex;			// q-gram index
2045 	typedef Pattern<TIndex, Swift<TSwiftSpec> >						TSwiftPattern;	// filter
2046 	typedef Pattern<TRead, MyersUkkonen>							TMyersPattern;	// verifier
2047 
2048 	// configure q-gram index
2049 	TIndex swiftIndex(readSet, shape);
2050 	cargo(swiftIndex).abundanceCut = options.abundanceCut;
2051 	cargo(swiftIndex)._debugLevel = options._debugLevel;
2052 
2053 	// configure Swift
2054 	TSwiftPattern swiftPattern(swiftIndex);
2055 	swiftPattern.params.minThreshold = options.threshold;
2056 	swiftPattern.params.tabooLength = options.tabooLength;
2057 
2058 	// init edit distance verifiers
2059 	String<TMyersPattern> forwardPatterns;
2060 	options.compMask[4] = (options.matchN)? 15: 0;
2061 	if (!options.hammingOnly)
2062 	{
2063 		unsigned readCount = countSequences(swiftIndex);
2064 		resize(forwardPatterns, readCount, Exact());
2065 		for(unsigned i = 0; i < readCount; ++i)
2066 		{
2067 			setHost(forwardPatterns[i], indexText(swiftIndex)[i]);
2068 			_patternMatchNOfPattern(forwardPatterns[i], options.matchN);
2069 			_patternMatchNOfFinder(forwardPatterns[i], options.matchN);
2070 		}
2071 	}
2072 
2073 	// clear stats
2074 	options.FP = 0;
2075 	options.TP = 0;
2076 	options.timeMapReads = 0;
2077 	options.timeDumpResults = 0;
2078 
2079 	CharString	id;
2080 #ifdef RAZERS_DIRECT_MAQ_MAPPING
2081  	if(options.maqMapping)
2082  	{
2083 		resize(cnts, 2);
2084 		for (unsigned i = 0; i < length(cnts); ++i)
2085 			resize(cnts[i], length(readSet), 31);
2086 	}
2087 #endif
2088 
2089 
2090 
2091 	// iterate over genome sequences
2092 	SEQAN_PROTIMESTART(find_time);
2093 	for(unsigned gseqNo = 0; gseqNo < length(genomeSet); ++gseqNo)
2094 	{
2095 		if (options.forward)
2096 			mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options);
2097 
2098 		if (options.reverse)
2099 		{
2100 			reverseComplement(genomeSet[gseqNo]);
2101 			mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options);
2102 			reverseComplement(genomeSet[gseqNo]);
2103 		}
2104 
2105 	}
2106 	options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
2107 
2108 	if (options._debugLevel >= 1)
2109 		::std::cerr << ::std::endl << "Finding reads took               \t" << options.timeMapReads << " seconds" << ::std::endl;
2110 	if (options._debugLevel >= 2) {
2111 		::std::cerr << ::std::endl;
2112 		::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
2113 		::std::cerr << "Swift FP: " << options.FP << ::std::endl;
2114 		::std::cerr << "Swift TP: " << options.TP << ::std::endl;
2115 	}
2116 	return 0;
2117 }
2118 
2119 //////////////////////////////////////////////////////////////////////////////
2120 // Wrapper for single/mate-pair mapping
2121 template <
2122 	typename TMatches,
2123 	typename TReadSet,
2124 	typename TCounts,
2125 	typename TSpec,
2126 	typename TShape,
2127 	typename TSwiftSpec >
2128 int mapReads(
2129 	TMatches &				matches,
2130 	StringSet<CharString> &	genomeFileNameList,
2131 	StringSet<CharString> &	genomeNames,	// genome names, taken from the Fasta file
2132 	::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
2133 	TReadSet &		readSet,
2134 	TCounts &				cnts,
2135 	RazerSOptions<TSpec> &	options,
2136 	TShape const &			shape,
2137 	Swift<TSwiftSpec> const)
2138 {
2139 
2140 #ifdef RAZERS_MATEPAIRS
2141 	if (options.libraryLength >= 0)
2142 		return mapMatePairReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift<TSwiftSpec>());
2143 	else
2144 #endif
2145 		return mapSingleReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift<TSwiftSpec>());
2146 }
2147 
2148 template <
2149 	typename TMatches,
2150 	typename TGenomeSet,
2151 	typename TReadSet,
2152 	typename TCounts,
2153 	typename TSpec,
2154 	typename TShape,
2155 	typename TSwiftSpec >
2156 int mapReads(
2157 	TMatches &				matches,
2158 	TGenomeSet &			genomeSet,
2159 	TReadSet &		readSet,
2160 	TCounts &				cnts,
2161 	RazerSOptions<TSpec> &	options,
2162 	TShape const &			shape,
2163 	Swift<TSwiftSpec> const)
2164 {
2165 #ifdef RAZERS_MATEPAIRS
2166 	if (options.libraryLength >= 0)
2167 		return mapMatePairReads(matches, genomeSet, readSet, cnts, options, shape, Swift<TSwiftSpec>());
2168 	else
2169 #endif
2170 		return mapSingleReads(matches, genomeSet, readSet, cnts, options, shape, Swift<TSwiftSpec>());
2171 }
2172 
2173 //////////////////////////////////////////////////////////////////////////////
2174 // Wrapper for different template specializations
2175 template <typename TMatches, typename TReadSet, typename TCounts, typename TSpec>
2176 int mapReads(
2177 	TMatches &				matches,
2178 	StringSet<CharString> &	genomeFileNameList,
2179 	StringSet<CharString> &	genomeNames,	// genome names, taken from the Fasta file
2180 	::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
2181 	TReadSet &		readSet,
2182 	TCounts &				cnts,
2183 	RazerSOptions<TSpec> &	options)
2184 {
2185         // first check whether split mapping (--> two shapes) or normal mapping
2186 #ifdef RAZERS_SPLICED
2187         if (options.minMatchLen > 0)
2188                 return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options);
2189 #endif
2190 
2191 	Shape<Dna, SimpleShape>		ungapped;
2192 	Shape<Dna, OneGappedShape>	onegapped;
2193 	Shape<Dna, GenericShape>	gapped;
2194 
2195 	// 2x3 SPECIALIZATION
2196 
2197 	if (options.hammingOnly)
2198 	{
2199 		// select best-fitting shape
2200 		if (stringToShape(ungapped, options.shape))
2201 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobalHamming>());
2202 
2203 		if (stringToShape(onegapped, options.shape))
2204 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobalHamming>());
2205 
2206 		if (stringToShape(gapped, options.shape))
2207 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift<SwiftSemiGlobalHamming>());
2208 	}
2209 	else
2210 	{
2211 		if (stringToShape(ungapped, options.shape))
2212 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobal>());
2213 
2214 		if (stringToShape(onegapped, options.shape))
2215 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobal>());
2216 
2217 		if (stringToShape(gapped, options.shape))
2218 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift<SwiftSemiGlobal>());
2219 	}
2220 
2221 	return RAZERS_INVALID_SHAPE;
2222 }
2223 
2224 template <typename TMatches, typename TGenomeSet, typename TReadSet, typename TCounts, typename TSpec>
2225 int mapReads(
2226 	TMatches &				matches,
2227 	TGenomeSet &			genomeSet,
2228 	TReadSet &		readSet,
2229 	TCounts &				cnts,
2230 	RazerSOptions<TSpec> &	options)
2231 {
2232 
2233 
2234         // first check whether split mapping (--> two shapes) or normal mapping
2235 #ifdef RAZERS_SPLICED
2236         if (options.minMatchLen > 0)
2237                 return mapSplicedReads(matches, genomeSet, readSet, cnts, options);
2238 #endif
2239 
2240 
2241 	Shape<Dna, SimpleShape>		ungapped;
2242 	Shape<Dna, OneGappedShape>	onegapped;
2243 	Shape<Dna, GenericShape>	gapped;
2244 
2245 	// 2x3 SPECIALIZATION
2246 
2247 	if (options.hammingOnly)
2248 	{
2249 		// select best-fitting shape
2250 		if (stringToShape(ungapped, options.shape))
2251 			return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobalHamming>());
2252 
2253 		if (stringToShape(onegapped, options.shape))
2254 			return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobalHamming>());
2255 
2256 		if (stringToShape(gapped, options.shape))
2257 			return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift<SwiftSemiGlobalHamming>());
2258 	}
2259 	else
2260 	{
2261 		if (stringToShape(ungapped, options.shape))
2262 			return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobal>());
2263 
2264 		if (stringToShape(onegapped, options.shape))
2265 			return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobal>());
2266 
2267 		if (stringToShape(gapped, options.shape))
2268 			return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift<SwiftSemiGlobal>());
2269 	}
2270 
2271 	return RAZERS_INVALID_SHAPE;
2272 }
2273 
2274 }
2275 
2276 #endif
2277