1  /*==========================================================================
2              RazerS - Fast Read Mapping with Controlled Loss Rate
3                    http://www.seqan.de/projects/razers.html
4 
5  ============================================================================
6   Copyright (C) 2008 by David Weese
7 
8   This program is free software; you can redistribute it and/or
9   modify it under the terms of the GNU Lesser General Public
10   License as published by the Free Software Foundation; either
11   version 3 of the License, or (at your options) any later version.
12 
13   This program is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  ==========================================================================*/
21 
22 #ifndef SEQAN_HEADER_RAZERS_H
23 #define SEQAN_HEADER_RAZERS_H
24 
25 #include <iostream>
26 #include <fstream>
27 
28 #include <seqan/find.h>
29 #include <seqan/seq_io.h>
30 #include <seqan/index.h>
31 #include <seqan/seq_io.h>
32 
33 #ifdef RAZERS_PARALLEL
34 #include "tbb/spin_mutex.h"
35 #endif
36 
37 namespace seqan
38 {
39 
40 //////////////////////////////////////////////////////////////////////////////
41 // Default options
42 
43 #ifdef RAZERS_OPENADDRESSING
44 	typedef OpenAddressing	TQGramIndexSpec;
45 #else
46 	typedef Default	TQGramIndexSpec;
47 #endif
48 
49 	template < bool DONT_VERIFY_ = false, bool DONT_DUMP_RESULTS_ = false >
50 	struct RazerSSpec
51 	{
52 		enum { DONT_VERIFY = DONT_VERIFY_ };				// omit verifying potential matches
53 		enum { DONT_DUMP_RESULTS = DONT_DUMP_RESULTS_ };	// omit dumping results
54 	};
55 
56 	template < typename TSpec = RazerSSpec<> >
57 	struct RazerSOptions
58 	{
59 	// main options
60 		TSpec		spec;
61 		bool		forward;			// compute forward oriented read matches
62 		bool		reverse;			// compute reverse oriented read matches
63 		double		errorRate;			// Criteria 1 threshold
64 		unsigned	maxHits;			// output at most maxHits many matches
65 		unsigned	distanceRange;		// output only the best, second best, ..., distanceRange best matches
66 		bool		purgeAmbiguous;		// true..remove reads with more than maxHits best matches, false..keep them
67 		CharString	output;				// name of result file
68 		int			_debugLevel;		// level of verbosity
69 		bool		printVersion;		// print version number
70 		bool		hammingOnly;		// no indels
71 		int			trimLength;			// if >0, cut reads to #trimLength characters
72 		CharString	outputUnmapped;		// name of file storing unmapped reads
73 
74 
75 	// output format options
76 		unsigned	outputFormat;		// 0..Razer format
77 										// 1..enhanced Fasta
78 										// 2..ELAND format
79 		bool		dumpAlignment;		// compute and dump the match alignments in the result files
80 		unsigned	genomeNaming;		// 0..use Fasta id
81 										// 1..enumerate reads beginning with 1
82 		unsigned	readNaming;			// 0..use Fasta id
83 										// 1..enumerate reads beginning with 1
84 										// 2..use the read sequence (only for short reads!)
85 		unsigned	sortOrder;			// 0..sort keys: 1. read number, 2. genome position
86 										// 1..           1. genome pos50ition, 2. read number
87 		unsigned	positionFormat;		// 0..gap space
88 										// 1..position space
89 		const char	*runID;				// runID needed for gff output
90 
91 	// filtration parameters
92 		::std::string shape;			// shape (e.g. 11111111111)
93 		int			threshold;			// threshold
94 		int			tabooLength;		// taboo length
95 		int			repeatLength;		// repeat length threshold
96 		double		abundanceCut;		// abundance threshold
97 
98 	// mate-pair parameters
99 		int			libraryLength;		// offset between two mates
100 		int			libraryError;		// offset tolerance
101 		unsigned	nextMatePairId;		// use this id for the next mate-pair
102 
103 	// verification parameters
104 		bool		matchN;				// false..N is always a mismatch, true..N matches with all
105 		unsigned char compMask[5];
106 
107 	// statistics
108 		int64_t		FP;					// false positives (threshold reached, no match)
109 		int64_t		TP;					// true positives (threshold reached, match)
110 		double		timeLoadFiles;		// time for loading input files
111 		double		timeMapReads;		// time for mapping reads
112 		double		timeDumpResults;	// time for dumping the results
113 
114 #ifdef RAZERS_DIRECT_MAQ_MAPPING
115 		bool		maqMapping;
116 		int		maxMismatchQualSum;
117 		int		mutationRateQual;
118 		int		absMaxQualSumErrors;
119 		unsigned	artSeedLength;
120 		bool		noBelowIdentity;
121 #endif
122 		bool	readsWithQualities;
123 
124 #ifdef RAZERS_MICRO_RNA
125 		bool		microRNA;
126 		unsigned	rnaSeedLength;
127 		bool 		exactSeed;
128 #endif
129 
130 		bool		lowMemory;		// set maximum shape weight to 13 to limit size of q-gram index
131 		bool		fastaIdQual;		// hidden option for special fasta+quality format we use
132 		int			minClippedLen;
133 
134 	// misc
135 		unsigned	compactThresh;		// compact match array if larger than compactThresh
136  		int 		maxReadLength;
137 
138 		int			penaltyC;			// puts penalty on the existence on a middle gap in split mapping mode
139 		unsigned        minMatchLen;	// minimum prefix/suffix match length in split mapping mode
140  		int		maxPrefixErrors;		//
141  		int		maxSuffixErrors;
142  		int	        maxGap;
143  		int	        minGap;
144  		::std::string 	shapeR;			// shape (e.g. 11111111111)
145  		int		thresholdR;			// threshold
146  		unsigned int	specifiedGenomeLen;
147 		bool		anchored;
148 		int	    maxReadRegionsEnd;
149 		int     minReadRegionsStart;
150 
151 
152 	// multi-threading
153 
154 #ifdef RAZERS_PARALLEL
155 		typedef ::tbb::spin_mutex	TMutex;
156 
157 		TMutex		*patternMutex;
158 		TMutex		optionsMutex;
159 		TMutex		matchMutex;
160 #endif
161 
RazerSOptionsRazerSOptions162 		RazerSOptions()
163 		{
164 			forward = true;
165 			reverse = true;
166 			errorRate = 0.08;
167 			maxHits = 100;
168 			distanceRange = 0;	// disabled
169 			purgeAmbiguous = false;
170 			output = "";
171 			_debugLevel = 0;
172 			printVersion = false;
173 			hammingOnly = false;
174 			outputUnmapped = "";
175 			trimLength = 0;
176 
177 			outputFormat = 0;
178 			dumpAlignment = false;
179 			genomeNaming = 0;
180 			readNaming = 0;
181 			sortOrder = 0;
182 			positionFormat = 0;
183 			runID = "s"; 	//
184 
185 			matchN = false;
186 
187 			shape = "11111111111";
188 			threshold = 1;
189 			tabooLength = 1;
190 			repeatLength = 1000;
191 			abundanceCut = 1;
192 
193 			libraryLength = 220;
194 			libraryError = 50;
195 			nextMatePairId = 1;
196 
197 			for (unsigned i = 0; i < 4; ++i)
198 				compMask[i] = 1 << i;
199 			compMask[4] = 0;
200 
201 			compactThresh = 1024;
202 #ifdef RAZERS_DIRECT_MAQ_MAPPING
203 			maqMapping = false;
204 			maxMismatchQualSum = 70;
205 			mutationRateQual = 30;
206 			artSeedLength = 28;	// the "artificial" seed length that is used for mapping quality assignment
207 						// (28bp is maq default)
208 			absMaxQualSumErrors = 100; // maximum for sum of mism qualities in total readlength
209 			noBelowIdentity = false;
210 #endif
211 			readsWithQualities = false;
212 
213 #ifdef RAZERS_MICRO_RNA
214 			microRNA = false;
215 			rnaSeedLength = 16;
216 			exactSeed = true;
217 #endif
218 			penaltyC = 2;
219 			minMatchLen = 18;
220  			shapeR = "11111111111";
221  			thresholdR = 1;
222  			maxGap = 10000;
223  			minGap = 0;
224  			maxPrefixErrors = 1;
225  			maxSuffixErrors = 1;
226  			specifiedGenomeLen = 3000; //in Mb, whole human genome default
227 			anchored = false;
228 			maxReadRegionsEnd = 0;
229 			minReadRegionsStart = 0; //
230 
231  			maxReadLength = 0;
232 			lowMemory = false;		// set maximum shape weight to 13 to limit size of q-gram index
233 			fastaIdQual = false;
234 			minClippedLen = 0;
235 
236 			FP = 0;
237 			TP = 0;
238 			timeLoadFiles = 0.0;
239 			timeMapReads = 0.0;
240 			timeDumpResults = 0;
241 		}
242 	};
243 
244 struct MicroRNA{};
245 
246 #ifdef RAZERS_MICRO_RNA
247 #define RAZERS_EXTENDED_MATCH
248 #endif
249 
250 #ifdef RAZERS_DIRECT_MAQ_MAPPING
251 #define RAZERS_EXTENDED_MATCH
252 #endif
253 
254 #ifdef RAZERS_SPLICED
255 #define RAZERS_EXTENDED_MATCH
256 #endif
257 
258 //////////////////////////////////////////////////////////////////////////////
259 // Typedefs
260 
261 	// definition of a Read match
262 	template <typename TGPos_>
263 	struct ReadMatch
264 	{
265 		typedef typename MakeSigned_<TGPos_>::Type TGPos;
266 
267 		unsigned		gseqNo;			// genome seqNo
268 		unsigned		rseqNo;			// read seqNo
269 		TGPos			gBegin;			// begin position of the match in the genome
270 		TGPos			gEnd;			// end position of the match in the genome
271 #ifdef RAZERS_MATEPAIRS
272 		unsigned		pairId;			// unique id for the two mate-pair matches (0 if unpaired)
273 		int				mateDelta:23;	// outer coordinate delta to the other mate
274 		unsigned		pairScore:9;	// combined score of both mates max 256
275 #endif
276 		unsigned short	editDist;		// Levenshtein distance
277 #ifdef RAZERS_EXTENDED_MATCH
278 		short	 		mScore;
279 		short			seedEditDist:8;
280 #endif
281 #ifdef RAZERS_SPLICED
282 		char			traceExtension;
283 		short			gSeedLen:8;// used as gMinMatchLen to store genomic end position of seed match
284 #endif
285 		char			orientation;	// 'F'..forward strand, 'R'..reverse comp. strand
286 	};
287 
288 	enum RAZERS_ERROR
289 	{
290 		RAZERS_INVALID_OPTIONS = -1,
291 		RAZERS_READS_FAILED    = -2,
292 		RAZERS_GENOME_FAILED   = -3,
293 		RAZERS_INVALID_SHAPE   = -4
294 	};
295 
296 //////////////////////////////////////////////////////////////////////////////
297 // Definitions
298 
299 	typedef Dna5String									TGenome;
300 	typedef StringSet<TGenome>							TGenomeSet;
301 //	typedef Dna5String									TRead;
302 	typedef String<Dna5Q>								TRead;
303 #ifdef RAZERS_CONCATREADS
304 	typedef StringSet<TRead, Owner<ConcatDirect<> > >	TReadSet;
305 #else
306 	typedef StringSet<TRead>							TReadSet;
307 #endif
308 
309 	typedef ReadMatch<Difference<TGenome>::Type>		TMatch;		// a single match
310 	typedef String<TMatch/*, MMap<>*/ >					TMatches;	// array of matches
311 	//chrNo, startPos, endPos
312 //	typedef String<Pair<unsigned,MakeSigned_<Difference<TGenome>::Type>::Type > >	TReadRegions;
313 
314 	typedef MakeSigned_<Difference<TGenome>::Type>::Type TSignedPos;
315 	typedef Pair<unsigned,TSignedPos,BitPacked<2,BitsPerValue<TSignedPos>::VALUE> > TFlagPos;
316 
317 	//chrNo, flag to remember flag, startPos of mate
318 	typedef String<Pair<unsigned,TFlagPos > >	TReadRegions;
319 
320 
321 	template <typename TSpec>
322 	struct Cargo< Index<TReadSet, TSpec> > {
323 		typedef struct {
324 			double		abundanceCut;
325 			int			_debugLevel;
326 		} Type;
327 	};
328 
329 	template <typename TAnyReadSet, typename TSpec>
330 	struct Cargo< Index<TAnyReadSet, TSpec> > {
331 		typedef struct {
332 			double		abundanceCut;
333 			int			_debugLevel;
334 		} Type;
335 	};
336 //////////////////////////////////////////////////////////////////////////////
337 // Memory tuning
338 
339 #ifdef RAZERS_MEMOPT
340 
341 	template <typename TSpec>
342 	struct SAValue< Index<TReadSet, TSpec> > {
343 		typedef Pair<
344 			unsigned,
345 			unsigned,
346 			BitPacked<24, 8>	// max. 16M reads of length < 256
347 		> Type;
348 	};
349 	//454
350 //	template <typename TSpec>
351 //	struct SAValue< Index<TReadSet, TSpec> > {
352 //		typedef Pair<
353 //			unsigned,
354 //			unsigned,
355 //			BitPacked<22, 10>	// max. 4M reads of length < 1024
356 //		> Type;
357 //	};
358 
359 #else
360 
361 	template <typename TSpec>
362 	struct SAValue< Index<TReadSet, TSpec> > {
363 		typedef Pair<
364 			unsigned,			// many reads
365 			unsigned,			// of arbitrary length
366 			Pack
367 		> Type;
368 	};
369 
370 #endif
371 
372 	template <>
373 	struct Size<Dna5String>
374 	{
375 		typedef unsigned Type;
376 	};
377 
378 	template <typename TShape, typename TSpec>
379 	struct Size< Index<TReadSet, IndexQGram<TShape, TSpec> > >
380 	{
381 		typedef unsigned Type;
382 	};
383 
384 
385 #ifdef RAZERS_PRUNE_QGRAM_INDEX
386 
387 	//////////////////////////////////////////////////////////////////////////////
388 	// Repeat masker
389 	template <typename TShape, typename TSpec>
390 	inline bool _qgramDisableBuckets(Index<TReadSet, IndexQGram<TShape, TSpec> > &index)
391 	{
392 		typedef Index<TReadSet, IndexQGram<TShape, TSpec>	>	TReadIndex;
393 		typedef typename Fibre<TReadIndex, QGramDir>::Type		TDir;
394 		typedef typename Iterator<TDir, Standard>::Type			TDirIterator;
395 		typedef typename Value<TDir>::Type						TSize;
396 
397 		TDir &dir    = indexDir(index);
398 		bool result  = false;
399 		unsigned counter = 0;
400 		TSize thresh = (TSize)(length(index) * cargo(index).abundanceCut);
401 		if (thresh < 100) thresh = 100;
402 
403 		TDirIterator it = begin(dir, Standard());
404 		TDirIterator itEnd = end(dir, Standard());
405 		for (; it != itEnd; ++it)
406 			if (*it > thresh)
407 			{
408 				*it = (TSize)-1;
409 				result = true;
410 				++counter;
411 			}
412 
413 		if (counter > 0 && cargo(index)._debugLevel >= 1)
414 			::std::cerr << "Removed " << counter << " k-mers" << ::std::endl;
415 
416 		return result;
417 	}
418 
419 #endif
420 
421 
422 template<typename TIdString, typename TSeqString, typename TQString, typename TOptions>
423 bool
424 _clipReads(TIdString & fastaID, TSeqString & seq, TQString & qual, TOptions & options)
425 {
426 	typedef typename Value<TIdString>::Type TChar;
427 
428 	int tagStart = length(fastaID);
429 	int clipFront = -1;
430 	int clipBack = -1;
431 	for(unsigned i = 0; i < length(fastaID); ++i)
432 	{
433 		TChar c = fastaID[i];
434 		if(c == 'c')
435 		{
436 			tagStart = i-1;
437 			if(infix(fastaID,i,i+5)=="clip=")
438 			{
439 				i += 5;
440 				String<TChar> clipStr = "";
441 				while(i < length(fastaID) && _parseIsDigit(fastaID[i]))
442 				{
443 					append(clipStr,fastaID[i]);
444 					++i;
445 				}
446 				std::istringstream istr(toCString(clipStr));
447 				istr >> clipFront;
448 				if(i < length(fastaID) && fastaID[i] == ',') ++i;
449 				clipStr = "";
450 				while(i < length(fastaID) && _parseIsDigit(fastaID[i]))
451 				{
452 					append(clipStr,fastaID[i]);
453 					++i;
454 				}
455 				std::istringstream istr2(toCString(clipStr));
456 				istr2 >> clipBack;
457 				break;
458 			}
459 		}
460 	}
461 	if(clipFront<0 && clipBack<0) return true;	//no clip tag found
462 	if(clipFront<0) clipFront = 0;
463 	if(clipBack<0) clipBack = 0;
464 	resize(fastaID,tagStart);	// only resize fastaID, as it might contain the quality string
465 
466 	if(options.minClippedLen == 0) // clipping tag was found, but clipping option is turned off
467 		return true;
468 
469 	if(((int)length(seq)-clipBack-clipFront) < options.minClippedLen)  // sequence is too short after clipping
470 		return false;
471 
472 //	int newLen = (int)length(seq)-clipBack-clipFront;
473 
474 	//clip
475 	if(length(qual) == 0) // meaning that quality is in fasta header
476 	{
477 		// first adapt the fasta header
478 		TIdString tmp = fastaID;
479 		fastaID = infix(tmp,0,length(tmp)-length(seq));
480 		append(fastaID,infix(tmp,length(tmp)-length(seq)+clipFront,length(tmp)-clipBack));
481 	}
482 	else
483 		qual = infix(qual,clipFront,length(qual)-clipBack);
484 
485 	// now adapt the sequence
486 	seq = infix(seq,clipFront,length(seq)-clipBack);
487 
488 	return true;
489 
490 }
491 
492 
493 //////////////////////////////////////////////////////////////////////////////
494 // Load multi-Fasta sequences with or w/o quality values
495 template <typename TReadSet, typename TNameSet, typename TRazerSOptions>
496 bool loadReads(
497 	TReadSet &reads,
498 	TNameSet &fastaIDs,
499 	char const * fileName,
500 	TRazerSOptions &options)
501 {
502 	bool countN = !(options.matchN || options.outputFormat == 1);
503 	if (!empty(options.outputUnmapped)) countN = false;
504 #ifdef RAZERS_MICRO_RNA
505 	if(options.microRNA) countN = false;
506 #endif
507 
508     SeqFileIn seqFile;
509 
510     bool success;
511     if (!isEqual(fileName, "-"))
512         success = open(seqFile, fileName);
513     else
514         success = open(seqFile, std::cin);
515     if (!success)
516         return false;
517 
518     CharString fastaId;
519 	String<Dna5Q> seq;
520 	CharString qual;
521 
522 	unsigned kickoutcount = 0;
523 	while (!atEnd(seqFile))
524 	{
525         readRecord(fastaId, seq, qual, seqFile);
526 		if (options.readNaming == 0
527 #ifdef RAZERS_DIRECT_MAQ_MAPPING
528 			|| options.fastaIdQual
529 #endif
530 			)
531 			appendValue(fastaIDs, fastaId); // append Fasta id
532 		if(!empty(qual)) options.readsWithQualities = true;
533 #ifdef RAZERS_DIRECT_MAQ_MAPPING
534 		//check if sequence has a clip tag
535 		if(options.minClippedLen > 0)
536 		{
537 			if(!_clipReads(back(fastaIDs),seq,qual,options))
538 			{
539 				clear(seq);
540 				clear(qual);
541 				++kickoutcount;
542 			}
543 		}
544 		if(options.fastaIdQual && !empty(seq))
545 		{
546 			if(options.minClippedLen == 0)_clipReads(back(fastaIDs),seq,qual,options); // if the header wasnt clipped before, then clip now!! necessary for quality in header
547 			qual = suffix(back(fastaIDs),length(back(fastaIDs))-length(seq));
548 			if(options.readNaming == 0)
549 				back(fastaIDs) = prefix(back(fastaIDs),length(back(fastaIDs))-length(seq));
550 			else clear(back(fastaIDs));
551 		}
552 #endif
553 		if (countN)
554 		{
555 			int count = 0;
556 			int cutoffCount = (int)(options.errorRate * length(seq));
557 			for (unsigned j = 0; j < length(seq); ++j)
558 				if (getValue(seq, j) == 'N')
559 					if (++count > cutoffCount)
560 					{
561 						clear(seq);
562 						clear(qual);
563 						++kickoutcount;
564 						break;
565 					}
566 // low qual. reads are empty to output them and their id later as LQ reads
567 //			if (count > cutoffCount) continue;
568 		}
569 #ifdef RAZERS_MICRO_RNA
570 		if(options.microRNA && length(seq)<options.rnaSeedLength)
571 		{
572 			clear(seq);
573 			clear(qual);
574 		}
575 #endif
576 
577 		// store dna and quality together
578 		for (unsigned j = 0; j < length(qual) && j < length(seq); ++j)
579 			assignQualityValue(seq[j], (int)(ordValue(qual[j]) - 33));
580 /*			if(ordValue(seq[j])>3)
581 				setValue(hybridSeq[j],(unsigned int)164); //N=164 such that &3 == 0
582 			else
583 				setValue(hybridSeq[j],(unsigned int)((ordValue(qual[j]) - 33) * 4 + ordValue(seq[j])));
584 */
585 		if (options.trimLength > 0 && length(seq) > (unsigned)options.trimLength)
586 			resize(seq, options.trimLength);
587 		if((int)length(seq) > options.maxReadLength)
588  			options.maxReadLength = length(seq);
589 
590 		appendValue(reads, seq, Generous());
591 	}
592 
593 	if (options._debugLevel > 1 && kickoutcount > 0)
594 		::std::cerr << "Ignoring " << kickoutcount << " low quality reads.\n";
595 	return !empty(reads);
596 }
597 
598 //////////////////////////////////////////////////////////////////////////////
599 // Read the first sequence of a multi-sequence file
600 // and return its length
601 inline int estimateReadLength(char const *fname)
602 {
603     SeqFileIn seqFile;
604 	if (!open(seqFile, fname) || atEnd(seqFile))
605 		return RAZERS_READS_FAILED;
606 
607     // parse record from file
608     CharString fastaId, seq;
609     readRecord(fastaId, seq, seqFile);
610     return length(seq);
611 }
612 
613 //////////////////////////////////////////////////////////////////////////////
614 // Load multi-Fasta sequences from multiple files
615 template <typename TGenomeSet>
616 bool loadGenomes(TGenomeSet &genomes, StringSet<CharString> &fileNameList)
617 {
618     SeqFileIn seqFile;
619     StringSet<CharString> ids;
620 	for (size_t i = 0; i != length(fileNameList); ++i)
621 	{
622 		if (!open(seqFile, toCString(fileNameList[i])))
623             return false;
624 
625         readRecords(ids, genomes, seqFile);
626         clear(ids);
627         close(seqFile);
628 	}
629 	return length(genomes);
630 }
631 
632 #ifdef RAZERS_MICRO_RNA
633 	template <typename TReadMatch>
634 	struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool >
635 	{
636 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
637 		{
638 			// read number
639 			if (a.rseqNo < b.rseqNo) return true;
640 			if (a.rseqNo > b.rseqNo) return false;
641 
642 			// genome position and orientation
643 			if (a.gseqNo < b.gseqNo) return true;
644 			if (a.gseqNo > b.gseqNo) return false;
645 			if (a.gBegin < b.gBegin) return true;
646 			if (a.gBegin > b.gBegin) return false;
647 			if (a.orientation < b.orientation) return true;
648 			if (a.orientation > b.orientation) return false;
649 
650 
651 			if(a.editDist < b.editDist) return true;
652 			if(a.editDist > b.editDist) return false;
653 			return (a.mScore > b.mScore);
654 		}
655 	};
656 
657 	template <typename TReadMatch>
658 	struct LessRNoEdistHLen : public ::std::binary_function < TReadMatch, TReadMatch, bool >
659 	{
660 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
661 		{
662 			// read number
663 			if (a.rseqNo < b.rseqNo) return true;
664 			if (a.rseqNo > b.rseqNo) return false;
665 
666 			if(a.editDist < b.editDist) return true;
667 			if(a.editDist > b.editDist) return false;
668 			return (a.mScore > b.mScore);
669 
670 		}
671 	};
672 #else
673 
674 
675 	template <typename TReadMatch>
676 	struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool >
677 	{
678 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
679 		{
680 			// read number
681 			if (a.rseqNo < b.rseqNo) return true;
682 			if (a.rseqNo > b.rseqNo) return false;
683 
684 			// genome position and orientation
685 			if (a.gseqNo < b.gseqNo) return true;
686 			if (a.gseqNo > b.gseqNo) return false;
687 			if (a.gBegin < b.gBegin) return true;
688 			if (a.gBegin > b.gBegin) return false;
689 			if (a.orientation < b.orientation) return true;
690 			if (a.orientation > b.orientation) return false;
691 
692 			// quality
693 #ifdef RAZERS_MATEPAIRS
694 			return a.pairScore > b.pairScore;
695 #else
696 			return a.editDist < b.editDist;
697 #endif
698 		}
699 	};
700 #endif
701 
702 	// ... to sort matches and remove duplicates with equal gEnd
703 	template <typename TReadMatch>
704 	struct LessRNoGEndPos : public ::std::binary_function < TReadMatch, TReadMatch, bool >
705 	{
706 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
707 		{
708 			// read number
709 			if (a.rseqNo < b.rseqNo) return true;
710 			if (a.rseqNo > b.rseqNo) return false;
711 
712 			// genome position and orientation
713 			if (a.gseqNo < b.gseqNo) return true;
714 			if (a.gseqNo > b.gseqNo) return false;
715 			if (a.gEnd   < b.gEnd) return true;
716 			if (a.gEnd   > b.gEnd) return false;
717 			if (a.orientation < b.orientation) return true;
718 			if (a.orientation > b.orientation) return false;
719 
720 			// quality
721 #ifdef RAZERS_MATEPAIRS
722 			return a.pairScore > b.pairScore;
723 #else
724 			return a.editDist < b.editDist;
725 #endif
726 		}
727 	};
728 
729 	template <typename TReadMatch>
730 	struct LessErrors : public ::std::binary_function < TReadMatch, TReadMatch, bool >
731 	{
732 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
733 		{
734 			// read number
735 			if (a.rseqNo < b.rseqNo) return true;
736 			if (a.rseqNo > b.rseqNo) return false;
737 
738 			// quality
739 #ifdef RAZERS_MATEPAIRS
740 			return a.pairScore > b.pairScore;
741 #else
742 			return a.editDist < b.editDist;
743 #endif
744 		}
745 	};
746 
747 template <typename TReadMatch>
748 struct LessSplicedScore : public ::std::binary_function < TReadMatch, TReadMatch, bool >
749 {
750 	inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
751 	{
752 		// read number
753 		if (a.rseqNo < b.rseqNo ) return true;
754 		if (a.rseqNo > b.rseqNo ) return false;
755 		//if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true;
756 		//if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false;
757 
758 		// quality
759 		if (a.pairScore > b.pairScore) return true;
760 		if (a.pairScore < b.pairScore) return false;
761 //CHECK:		if (abs(a.mateDelta) < abs(b.mateDelta)) return true;
762 //CHECK:		if (abs(a.mateDelta) > abs(b.mateDelta)) return falsee;
763 
764 		return a.pairId < b.pairId;
765 	}
766 };
767 
768 
769 template <typename TReadMatch>
770 struct LessSplicedScoreGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool >
771 {
772 	inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
773 	{
774 		// read number
775 		if (a.rseqNo < b.rseqNo ) return true;
776 		if (a.rseqNo > b.rseqNo ) return false;
777 		//if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true;
778 		//if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false;
779 
780 		// quality
781 		if (a.pairScore > b.pairScore) return true;
782 		if (a.pairScore < b.pairScore) return false;
783 		if (a.pairId < b.pairId) return true;
784 		if (a.pairId > b.pairId) return false;
785 		if (a.gBegin < b.gBegin) return true;
786 		if (a.gBegin > b.gBegin) return false;
787 		return a.editDist < b.editDist;
788 	}
789 };
790 
791 
792 
793 #ifdef RAZERS_DIRECT_MAQ_MAPPING
794 
795 	struct QualityBasedScoring{};
796 
797 	template <typename TReadMatch>
798 	struct LessRNoMQ : public ::std::binary_function < TReadMatch, TReadMatch, bool >
799 	{
800 		inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
801 		{
802 			// read number
803 			if (a.rseqNo < b.rseqNo) return true;
804 			if (a.rseqNo > b.rseqNo) return false;
805 
806 			// quality
807 			if (a.mScore < b.mScore) return true; // sum of quality values of mismatches (the smaller the better)
808 			if (a.mScore > b.mScore) return false;
809 
810 			return (a.editDist < b.editDist); // seedEditDist?
811 			// genome position and orientation
812 	/*		if (a.gseqNo < b.gseqNo) return true;
813 			if (a.gseqNo > b.gseqNo) return false;
814 			if (a.gBegin < b.gBegin) return true;
815 			if (a.gBegin > b.gBegin) return false;
816 			if (a.orientation < b.orientation) return true;
817 			if (a.orientation > b.orientation) return false;
818 	*/
819 		}
820 	};
821 #endif
822 
823 //////////////////////////////////////////////////////////////////////////////
824 // Remove duplicate matches and leave at most maxHits many distanceRange
825 // best matches per read
826 template < typename TMatches >
827 void maskDuplicates(TMatches &matches)
828 {
829 	typedef typename Value<TMatches>::Type					TMatch;
830 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
831 
832 	//////////////////////////////////////////////////////////////////////////////
833 	// remove matches with equal ends
834 
835 	::std::sort(
836 		begin(matches, Standard()),
837 		end(matches, Standard()),
838 		LessRNoGEndPos<TMatch>());
839 
840 	typename	TMatch::TGPos gBegin = -1;
841 	typename	TMatch::TGPos gEnd = -1;
842 	unsigned	gseqNo = -1;
843 	unsigned	readNo = -1;
844 	char		orientation = '-';
845 
846 	TIterator it = begin(matches, Standard());
847 	TIterator itEnd = end(matches, Standard());
848 
849 	for (; it != itEnd; ++it)
850 	{
851 #ifdef RAZERS_MATEPAIRS
852 		if ((*it).pairId != 0) continue;
853 #endif
854 		if (gEnd == (*it).gEnd && orientation == (*it).orientation &&
855 			gseqNo == (*it).gseqNo && readNo == (*it).rseqNo)
856 		{
857 			(*it).orientation = '-';
858 			continue;
859 		}
860 		readNo = (*it).rseqNo;
861 		gseqNo = (*it).gseqNo;
862 		gEnd = (*it).gEnd;
863 		orientation = (*it).orientation;
864 	}
865 
866 	//////////////////////////////////////////////////////////////////////////////
867 	// remove matches with equal begins
868 
869 	::std::sort(
870 		begin(matches, Standard()),
871 		end(matches, Standard()),
872 		LessRNoGPos<TMatch>());
873 
874 	orientation = '-';
875 
876 	it = begin(matches, Standard());
877 	itEnd = end(matches, Standard());
878 
879 	for (; it != itEnd; ++it)
880 	{
881 		if ((*it).orientation == '-'
882 #ifdef RAZERS_MATEPAIRS
883 			|| ((*it).pairId != 0)
884 #endif
885 			) continue;
886 		if (gBegin == (*it).gBegin && readNo == (*it).rseqNo &&
887 			gseqNo == (*it).gseqNo && orientation == (*it).orientation)
888 		{
889 			(*it).orientation = '-';
890 			continue;
891 		}
892 		readNo = (*it).rseqNo;
893 		gseqNo = (*it).gseqNo;
894 		gBegin = (*it).gBegin;
895 		orientation = (*it).orientation;
896 	}
897 
898 	::std::sort(
899 		begin(matches, Standard()),
900 		end(matches, Standard()),
901 		LessErrors<TMatch>());
902 }
903 
904 //////////////////////////////////////////////////////////////////////////////
905 // Count matches for each number of errors
906 template < typename TMatches, typename TCounts >
907 void countMatches(TMatches &matches, TCounts &cnt)
908 {
909 	//typedef typename Value<TMatches>::Type					TMatch;
910 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
911 	typedef typename Value<TCounts>::Type					TRow;
912 	typedef typename Value<TRow>::Type						TValue;
913 
914 	TIterator it = begin(matches, Standard());
915 	TIterator itEnd = end(matches, Standard());
916 
917 	unsigned readNo = -1;
918 	short editDist = -1;
919 	int64_t count = 0;
920 	int64_t maxVal = std::numeric_limits<TValue>::max();
921 
922 	for (; it != itEnd; ++it)
923 	{
924 		if ((*it).orientation == '-') continue;
925 		if (readNo == (*it).rseqNo && editDist == (*it).editDist)
926 			++count;
927 		else
928 		{
929 			if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt))
930 				cnt[editDist][readNo] = (maxVal < count)? maxVal : count;
931 			readNo = (*it).rseqNo;
932 			editDist = (*it).editDist;
933 			count = 1;
934 		}
935 	}
936 	if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt))
937 		cnt[editDist][readNo] = count;
938 
939 
940 }
941 
942 //////////////////////////////////////////////////////////////////////////////
943 
944 template < typename TReadNo, typename TMaxErrors >
945 inline void
946 setMaxErrors(Nothing &, TReadNo, TMaxErrors)
947 {
948 }
949 
950 template < typename TSwift, typename TReadNo, typename TMaxErrors >
951 inline void
952 setMaxErrors(TSwift &swift, TReadNo readNo, TMaxErrors maxErrors)
953 {
954 	int minT = _qgramLemma(swift, readNo, maxErrors);
955 	if (minT > 1)
956 	{
957 		if (maxErrors < 0) minT = std::numeric_limits<int>::max();
958 //		::std::cout<<" read:"<<readNo<<" newThresh:"<<minT;
959 		setMinThreshold(swift, readNo, (unsigned)minT);
960 	}
961 }
962 
963 //////////////////////////////////////////////////////////////////////////////
964 // Remove low quality matches
965 template < typename TMatches, typename TCounts, typename TSpec, typename TSwift >
966 void compactMatches(TMatches &matches, TCounts &
967 #ifdef RAZERS_DIRECT_MAQ_MAPPING
968 		cnts
969 #endif
970 	, RazerSOptions<TSpec> &options
971 	, bool compactFinal ,
972 	TSwift &
973 #if defined RAZERS_DIRECT_MAQ_MAPPING || defined RAZERS_MASK_READS
974 		swift
975 #endif
976 	)
977 {
978     // Get rid of "unused variable" warnings.  This is hard to read
979     // and should not be done anywhere else. Better not use ifdefs.
980     (void)compactFinal;
981 #ifdef RAZERS_DIRECT_MAQ_MAPPING
982 	if(options.maqMapping) compactMatches(matches, cnts,options,compactFinal,swift,true);
983 #endif
984 	//typedef typename Value<TMatches>::Type					TMatch;
985 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
986 
987 #ifdef RAZERS_MICRO_RNA
988 	if(options.microRNA)
989 		::std::sort(
990 			begin(matches, Standard()),
991 			end(matches, Standard()),
992 			LessRNoEdistHLen<TMatch>());
993 	int bestMScore = 0;
994 #endif
995 
996 	unsigned readNo = -1;
997 	unsigned hitCount = 0;
998 	unsigned hitCountCutOff = options.maxHits;
999 	int64_t disabled = 0;
1000 #ifdef RAZERS_MICRO_RNA
1001 	if(options.microRNA && options.purgeAmbiguous)
1002 		++hitCountCutOff;	// we keep one more match than we actually want, so we can later decide
1003 							// whether the read mapped more than maxhits times
1004 #endif
1005 	int editDistCutOff = std::numeric_limits<int>::max();
1006 
1007 	TIterator it = begin(matches, Standard());
1008 	TIterator itEnd = end(matches, Standard());
1009 	TIterator dit = it;
1010 	TIterator ditBeg = it;
1011 
1012 	for (; it != itEnd; ++it)
1013 	{
1014 		if ((*it).orientation == '-') continue;
1015 		if (readNo == (*it).rseqNo
1016 #ifdef RAZERS_MATEPAIRS
1017 			&& (*it).pairId == 0
1018 #endif
1019 			)
1020 		{
1021 			if ((*it).editDist >= editDistCutOff) continue;
1022 #ifdef RAZERS_MICRO_RNA
1023 			if ( (*it).mScore < bestMScore) continue;
1024 #endif
1025 			if (++hitCount >= hitCountCutOff)
1026 			{
1027 #ifdef RAZERS_MASK_READS
1028 				if (hitCount == hitCountCutOff)
1029 				{
1030 					// we have enough, now look for better matches
1031 					int maxErrors = (*it).editDist - 1;
1032 					if (options.purgeAmbiguous && (options.distanceRange == 0 || (*it).editDist < options.distanceRange))
1033 						maxErrors = -1;
1034 
1035 					setMaxErrors(swift, readNo, maxErrors);
1036 
1037 					if (maxErrors == -1)
1038 						++disabled;
1039 //						::std::cerr << "(read #" << readNo << " disabled)";
1040 
1041 					if (options.purgeAmbiguous)
1042 					{
1043 						if (options.distanceRange == 0 || (*it).editDist < options.distanceRange || compactFinal)
1044 							dit = ditBeg;
1045 						else {
1046 							*dit = *it;
1047 							++dit;
1048 						}
1049 					}
1050 				}
1051 #endif
1052 				continue;
1053 			}
1054 		}
1055 		else
1056 		{
1057 			readNo = (*it).rseqNo;
1058 			hitCount = 0;
1059 			if (options.distanceRange > 0)
1060 				editDistCutOff = (*it).editDist + options.distanceRange;
1061 #ifdef RAZERS_MICRO_RNA
1062 			bestMScore = (*it).mScore;
1063 #endif
1064 			ditBeg = dit;
1065 		}
1066 		*dit = *it;
1067 		++dit;
1068 	}
1069 	if (options._debugLevel >= 2)
1070 	{
1071 		std::cerr << " [" << length(matches) - (dit - begin(matches, Standard())) << " matches removed]";
1072 		std::cerr << " [" << disabled << " reads disabled]";
1073 	}
1074 
1075 	resize(matches, dit - begin(matches, Standard()));
1076 }
1077 
1078 
1079 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1080 //////////////////////////////////////////////////////////////////////////////
1081 // Remove low quality matches
1082 template < typename TMatches, typename TCounts, typename TSpec, typename TSwift >
1083 void compactMatches(TMatches &matches, TCounts &cnts, RazerSOptions<TSpec> &, bool, TSwift &, bool dontCountFirstTwo)
1084 {
1085 	typedef typename Value<TMatches>::Type				TMatch;
1086 	typedef typename Iterator<TMatches, Standard>::Type		TIterator;
1087 
1088 	::std::sort(
1089 		begin(matches, Standard()),
1090 		end(matches, Standard()),
1091 		LessRNoMQ<TMatch>());
1092 
1093 	unsigned readNo = -1;
1094 
1095 	TIterator it = begin(matches, Standard());
1096 	TIterator itEnd = end(matches, Standard());
1097 	TIterator dit = it;
1098 
1099 	//number of errors may not exceed 31!
1100 	bool second = true;
1101 	for (; it != itEnd; ++it)
1102 	{
1103 		if ((*it).orientation == '-') continue;
1104 		if (readNo == (*it).rseqNo)
1105 		{
1106 			//second best match
1107 			if (second)
1108 			{
1109 				second = false;
1110 				if((cnts[1][(*it).rseqNo] & 31)  > (*it).editDist)
1111 				{
1112 					//this second best match is better than any second best match before
1113 					cnts[1][(*it).rseqNo] = (*it).editDist; // second best dist is this editDist
1114 										// count is 0 (will be updated if countFirstTwo)
1115 				}
1116 				if(!dontCountFirstTwo)
1117 					if((cnts[1][(*it).rseqNo]>>5) != 2047) cnts[1][(*it).rseqNo] += 32;
1118 			}
1119 			else
1120 			{
1121 				if ((*it).editDist <= (cnts[0][(*it).rseqNo] & 31) )
1122 					if(cnts[0][(*it).rseqNo]>>5 != 2047)
1123 						cnts[0][(*it).rseqNo] +=32;
1124 				if ((*it).editDist <= (cnts[1][(*it).rseqNo] & 31) )
1125 					if((cnts[1][(*it).rseqNo]>>5) != 2047)
1126 						cnts[1][(*it).rseqNo] +=32;
1127 				continue;
1128 			}
1129 		} else
1130 		{	//best match
1131 			second = true;
1132 			readNo = (*it).rseqNo;
1133 			//cnts has 16bits, 11:5 for count:dist
1134 			if((cnts[0][(*it).rseqNo] & 31)  > (*it).editDist)
1135 			{
1136 				//this match is better than any match before
1137 				cnts[1][(*it).rseqNo] = cnts[0][(*it).rseqNo]; // best before is now second best
1138 									       // (count will be updated when match is removed)
1139 				cnts[0][(*it).rseqNo] = (*it).editDist; // best dist is this editDist
1140 									// count is 0 (will be updated if countFirstTwo)
1141 			}
1142 			if(!dontCountFirstTwo)
1143 				if((cnts[0][(*it).rseqNo]>>5) != 2047) cnts[0][(*it).rseqNo] += 32;	// shift 5 to the right, add 1, shift 5 to the left, and keep count
1144 		}
1145 		*dit = *it;
1146 		++dit;
1147 	}
1148 
1149 	resize(matches, dit - begin(matches, Standard()));
1150 }
1151 #endif
1152 
1153 
1154 #ifdef RAZERS_MICRO_RNA
1155 
1156 template < typename TMatches, typename TSpec >
1157 void purgeAmbiguousRnaMatches(TMatches &matches, RazerSOptions<TSpec> &options)
1158 {
1159 	typedef typename Value<TMatches>::Type                          TMatch;
1160 	typedef typename Iterator<TMatches, Standard>::Type             TIterator;
1161 
1162 	::std::sort(begin(matches, Standard()),end(matches, Standard()),LessRNoEdistHLen<TMatch>());
1163 	int bestMScore = 0;
1164 
1165 	unsigned readNo = -1;
1166 	unsigned hitCount = 0;
1167 	unsigned hitCountCutOff = options.maxHits;
1168 	int editDistCutOff = std::numeric_limits<int>::max();
1169 
1170 	TIterator it = begin(matches, Standard());
1171 	TIterator itEnd = end(matches, Standard());
1172 	TIterator dit = it;
1173 	TIterator ditBeg = it;
1174 
1175 	for (; it != itEnd; ++it)
1176 	{
1177 		if ((*it).orientation == '-') continue;
1178 		if (readNo == (*it).rseqNo)
1179 		{
1180 			if ((*it).editDist >= editDistCutOff) continue;
1181 			if ( (*it).mScore < bestMScore) continue;
1182 			if (++hitCount >= hitCountCutOff)
1183 			{
1184 				if (hitCount == hitCountCutOff)
1185 					dit = ditBeg;
1186 				continue;
1187 			}
1188 		}
1189 		else
1190 		{
1191 			readNo = (*it).rseqNo;
1192 			hitCount = 0;
1193 			if (options.distanceRange > 0)
1194 				editDistCutOff = (*it).editDist + options.distanceRange;
1195 			bestMScore = (*it).mScore;
1196 			ditBeg = dit;
1197 		}
1198 		*dit = *it;
1199 		++dit;
1200 	}
1201 	resize(matches, dit - begin(matches, Standard()));
1202 }
1203 
1204 
1205 
1206 //////////////////////////////////////////////////////////////////////////////
1207 // Hamming verification recording sum of mismatching qualities in m.mScore
1208 template <
1209 	typename TMatch,
1210 	typename TGenome,
1211 	typename TReadSet,
1212 	typename TSpec >
1213 inline bool
1214 matchVerify(
1215 	TMatch &m,					// resulting match
1216 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1217 	unsigned rseqNo,				// read number
1218 	TReadSet &readSet,				// reads
1219 	RazerSOptions<TSpec> const &options,		// RazerS options
1220 	MicroRNA)					// MaqMapping
1221 {
1222 
1223 	typedef Segment<TGenome, InfixSegment>					TGenomeInfix;
1224 	typedef typename Value<TReadSet>::Type const			TRead;
1225 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
1226 	typedef typename Iterator<TRead, Standard>::Type		TReadIterator;
1227 
1228 	unsigned ndlLength = sequenceLength(rseqNo, readSet);
1229 	if (length(inf) < ndlLength) return false;
1230 
1231 	// verify
1232 	TRead &read		= readSet[rseqNo];
1233 	TReadIterator ritBeg	= begin(read, Standard());
1234 	TReadIterator ritEnd	= end(read, Standard());
1235 	TGenomeIterator git	= begin(inf, Standard());
1236 	TGenomeIterator gitEnd	= end(inf, Standard()) - (ndlLength - 1);
1237 
1238 	// this is max number of errors the seed should have
1239 	unsigned maxErrorsSeed = (unsigned)(options.rnaSeedLength * options.errorRate);
1240 	unsigned minSeedErrors = maxErrorsSeed + 1;
1241 	unsigned bestHitLength = 0;
1242 
1243 	for (; git < gitEnd; ++git)
1244 	{
1245 		bool hit = true;
1246 		unsigned hitLength = 0;
1247 		unsigned count = 0;
1248 		unsigned seedErrors = 0;
1249 		TGenomeIterator g = git;	//maq would count errors in the first 28bp only (at least initially. not for output)
1250 		for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
1251 		{
1252 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
1253 			{
1254 				if (count < options.rnaSeedLength)		// the maq (28bp-)seed
1255 				{
1256 					if(++seedErrors > maxErrorsSeed)
1257 					{
1258 						hit = false;
1259 						break;
1260 					}
1261 				}
1262 				else
1263 					break;
1264 			}
1265 			++count;
1266 		}
1267 		if (hit) hitLength = count;
1268 		if (hitLength > bestHitLength ) //simply take the longest hit
1269 		{
1270 			minSeedErrors = seedErrors;
1271 			bestHitLength = hitLength;
1272 			m.gBegin = git - begin(host(inf), Standard());
1273 		}
1274 	}
1275 
1276 //	std::cout  << "options.absMaxQualSumErrors" << options.absMaxQualSumErrors << std::endl;
1277 //	std::cout  << "maxSeedErrors" << maxErrorsSeed << std::endl;
1278 //	std::cout  << "minErrors" << minSeedErrors << std::endl;
1279 //	if(derBesgte) ::std::cout << minErrors <<"minErrors\n";
1280 	if (minSeedErrors > maxErrorsSeed) return false;
1281 
1282 	m.gEnd = m.gBegin + bestHitLength;
1283 	m.editDist = minSeedErrors;			// errors in seed or total number of errors?
1284 	m.mScore = bestHitLength;
1285 	m.seedEditDist = minSeedErrors;
1286 	return true;
1287 }
1288 
1289 #endif
1290 
1291 
1292 
1293 //////////////////////////////////////////////////////////////////////////////
1294 // Hamming verification
1295 template <
1296 	typename TMatch,
1297 	typename TGenome,
1298 	typename TReadSet,
1299 	typename TMyersPatterns,
1300 	typename TSpec >
1301 inline bool
1302 matchVerify(
1303 	TMatch &m,								// resulting match
1304 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1305 	unsigned rseqNo,						// read number
1306 	TReadSet &readSet,						// reads
1307 	TMyersPatterns const &,					// MyersBitVector preprocessing data
1308 	RazerSOptions<TSpec> const &options,	// RazerS options
1309 	SwiftSemiGlobalHamming)					// Hamming only
1310 {
1311 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1312 	if(options.maqMapping)
1313 		return matchVerify(m,inf,rseqNo,readSet,options,QualityBasedScoring());
1314 #endif
1315 
1316 #ifdef RAZERS_MICRO_RNA
1317 	if(options.microRNA)
1318 		return matchVerify(m,inf,rseqNo,readSet,options,MicroRNA());
1319 #endif
1320 
1321 	typedef Segment<TGenome, InfixSegment>					TGenomeInfix;
1322 	typedef typename Value<TReadSet>::Type const			TRead;
1323 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
1324 	typedef typename Iterator<TRead, Standard>::Type		TReadIterator;
1325 
1326 #ifdef RAZERS_DEBUG
1327 	::std::cout<<"Verify: "<<::std::endl;
1328 	::std::cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
1329 	::std::cout<<"Read:   "<<readSet[rseqNo] << ::std::endl;
1330 #endif
1331 
1332 	unsigned ndlLength = sequenceLength(rseqNo, readSet);
1333 	if (length(inf) < ndlLength) return false;
1334 
1335 	// verify
1336 	TRead &read				= readSet[rseqNo];
1337 	TReadIterator ritBeg	= begin(read, Standard());
1338 	TReadIterator ritEnd	= end(read, Standard());
1339 	TGenomeIterator git		= begin(inf, Standard());
1340 	TGenomeIterator gitEnd	= end(inf, Standard()) - (ndlLength - 1);
1341 
1342 	unsigned maxErrors = (unsigned)(ndlLength * options.errorRate);
1343 	unsigned minErrors = maxErrors + 1;
1344 
1345 	for (; git < gitEnd; ++git)
1346 	{
1347 		unsigned errors = 0;
1348 		TGenomeIterator g = git;
1349 		for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
1350 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
1351 				if (++errors > maxErrors)
1352 					break;
1353 		if (minErrors > errors)
1354 		{
1355 			minErrors = errors;
1356 			m.gBegin = git - begin(host(inf), Standard());
1357 		}
1358 	}
1359 
1360 	if (minErrors > maxErrors) return false;
1361 
1362 	m.gEnd = m.gBegin + ndlLength;
1363 	m.editDist = minErrors;
1364 	return true;
1365 }
1366 
1367 
1368 //////////////////////////////////////////////////////////////////////////////
1369 // Edit distance verification
1370 template <
1371 	typename TMatch,
1372 	typename TGenome,
1373 	typename TReadSet,
1374 	typename TMyersPatterns,
1375 	typename TSpec >
1376 inline bool
1377 matchVerify(
1378 	TMatch &m,								// resulting match
1379 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1380 	unsigned rseqNo,						// read number
1381 	TReadSet &readSet,	    				// reads
1382 	TMyersPatterns &forwardPatterns,		// MyersBitVector preprocessing data
1383 	RazerSOptions<TSpec> const &options,	// RazerS options
1384 	SwiftSemiGlobal)						// Swift specialization
1385 {
1386 	typedef Segment<TGenome, InfixSegment>					TGenomeInfix;
1387 	typedef typename Value<TReadSet>::Type					TRead;
1388 
1389 	// find read match end
1390 	typedef Finder<TGenomeInfix>							TMyersFinder;
1391 	typedef typename Value<TMyersPatterns>::Type			TMyersPattern;
1392 
1393 	// find read match begin
1394 	typedef ModifiedString<TGenomeInfix, ModReverse>		TGenomeInfixRev;
1395 	typedef ModifiedString<TRead, ModReverse>				TReadRev;
1396 	typedef Finder<TGenomeInfixRev>							TMyersFinderRev;
1397 	typedef Pattern<TReadRev, MyersUkkonenGlobal>			TMyersPatternRev;
1398 
1399 	TMyersFinder myersFinder(inf);
1400 	TMyersPattern &myersPattern = forwardPatterns[rseqNo];
1401 
1402 #ifdef RAZERS_DEBUG
1403 	::std::cout<<"Verify: "<<::std::endl;
1404 	::std::cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
1405 	::std::cout<<"Read:   "<<readSet[rseqNo]<<::std::endl;
1406 #endif
1407 
1408     unsigned ndlLength = sequenceLength(rseqNo, readSet);
1409 	int maxScore = std::numeric_limits<int>::min();
1410 	int minScore = -(int)(ndlLength * options.errorRate);
1411 	TMyersFinder maxPos;
1412 
1413 	// find end of best semi-global alignment
1414 	while (find(myersFinder, myersPattern, minScore))
1415 		if (maxScore <= getScore(myersPattern))
1416 		{
1417 			maxScore = getScore(myersPattern);
1418 			maxPos = myersFinder;
1419 		}
1420 
1421 	if (maxScore < minScore) return false;
1422 	m.editDist	= (unsigned)-maxScore;
1423 	setEndPosition(inf, m.gEnd = (beginPosition(inf) + position(maxPos) + 1));
1424 
1425 	// limit the beginning to needle length plus errors (== -maxScore)
1426 	if (length(inf) > ndlLength - maxScore)
1427 		setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore);
1428 
1429 	// find beginning of best semi-global alignment
1430 	TGenomeInfixRev		infRev(inf);
1431 	TReadRev			readRev(readSet[rseqNo]);
1432 	TMyersFinderRev		myersFinderRev(infRev);
1433 	TMyersPatternRev	myersPatternRev(readRev);
1434 
1435 	_patternMatchNOfPattern(myersPatternRev, options.matchN);
1436 	_patternMatchNOfFinder(myersPatternRev, options.matchN);
1437 	while (find(myersFinderRev, myersPatternRev, maxScore))
1438 		m.gBegin = m.gEnd - (position(myersFinderRev) + 1);
1439 
1440 	return true;
1441 }
1442 
1443 
1444 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1445 //////////////////////////////////////////////////////////////////////////////
1446 // Hamming verification recording sum of mismatching qualities in m.mScore
1447 template <
1448 	typename TMatch,
1449 	typename TGenome,
1450 	typename TReadSet,
1451 	typename TSpec >
1452 inline bool
1453 matchVerify(
1454 	TMatch &m,					// resulting match
1455 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1456 	unsigned rseqNo,				// read number
1457 	TReadSet &readSet,				// reads
1458 	RazerSOptions<TSpec> const &options,		// RazerS options
1459 	QualityBasedScoring)					// MaqMapping
1460 {
1461 
1462 	typedef Segment<TGenome, InfixSegment>				TGenomeInfix;
1463 	typedef typename Value<TReadSet>::Type const			TRead;
1464 	typedef typename Iterator<TGenomeInfix, Standard>::Type	TGenomeIterator;
1465 	typedef typename Iterator<TRead, Standard>::Type		TReadIterator;
1466 
1467 // #ifdef RAZERS_DEBUG
1468 // 	cout<<"Verify: "<<::std::endl;
1469 // 	cout<<"Genome: "<<inf<<"\t" << beginPosition(inf) << "," << endPosition(inf) << ::std::endl;
1470 // 	cout<<"Read:   "<<host(myersPattern)<<::std::endl;
1471 // #endif
1472 
1473 //	bool derBesgte = false;
1474 	//if(rseqNo == 2) derBesgte = true;
1475 //	if(derBesgte) ::std::cout << "der besagte\n";
1476 	unsigned ndlLength = sequenceLength(rseqNo, readSet);
1477 	if (length(inf) < ndlLength) return false;
1478 
1479 	// verify
1480 	TRead &read		= readSet[rseqNo];
1481 	TReadIterator ritBeg	= begin(read, Standard());
1482 	TReadIterator ritEnd	= end(read, Standard());
1483 	TGenomeIterator git	= begin(inf, Standard());
1484 	TGenomeIterator gitEnd	= end(inf, Standard()) - (ndlLength - 1);
1485 	TGenomeIterator bestIt	= begin(inf, Standard());
1486 
1487 	// this is max number of errors the 28bp 'seed' should have
1488 	//assuming that maxErrors-1 matches can be found with 100% SN
1489 	unsigned maxErrorsSeed = (unsigned)(options.artSeedLength * options.errorRate) + 1;
1490 	unsigned maxErrorsTotal = (unsigned)(ndlLength * 0.25); //options.maxErrorRate);
1491 	unsigned minErrors = maxErrorsTotal + 1;
1492 	int minQualSumErrors = options.absMaxQualSumErrors + 10;
1493 	unsigned minSeedErrors = maxErrorsSeed + 1;
1494 
1495 	for (; git < gitEnd; ++git)
1496 	{
1497 		bool hit = true;
1498 		unsigned errors = 0;
1499 		unsigned count = 0;
1500 		unsigned seedErrors = 0;
1501 		int qualSumErrors = 0;
1502 		TGenomeIterator g = git;	//maq would count errors in the first 28bp only (at least initially. not for output)
1503 		for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g)
1504 		{
1505 			if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0)
1506 			{
1507 			//	::std::cout << count << "<-";
1508 				if (++errors > maxErrorsTotal)
1509 				{
1510 					hit = false;
1511 					break;
1512 				}
1513 				//int qualityValue = (int)((unsigned char)*r >> 3);
1514 				//qualSumErrors += (qualityValue < options.mutationRateQual) ? qualityValue : options.mutationRateQual;
1515 				qualSumErrors += (getQualityValue(*r) < options.mutationRateQual) ? getQualityValue(*r) : options.mutationRateQual;
1516 				if(qualSumErrors > options.absMaxQualSumErrors || qualSumErrors > minQualSumErrors)
1517 				{
1518 					hit = false;
1519 					break;
1520 				}
1521 				if (count < options.artSeedLength)		// the maq (28bp-)seed
1522 				{
1523 					if(++seedErrors > maxErrorsSeed)
1524 					{
1525 						hit = false;
1526 						break;
1527 					}
1528 					if(qualSumErrors > options.maxMismatchQualSum)
1529 					{
1530 						hit = false;
1531 						break;
1532 					}// discard match, if 'seed' is bad (later calculations are done with the quality sum over the whole read)
1533 				}
1534 			}
1535 			++count;
1536 		}
1537 		if (hit && (qualSumErrors < minQualSumErrors /*&& seedErrors <=maxErrorsSeed*/) ) //oder (seedErrors < minSeedErrors)
1538 		{
1539 			minErrors = errors;
1540 			minSeedErrors = seedErrors;
1541 			minQualSumErrors = qualSumErrors;
1542 			m.gBegin = git - begin(host(inf), Standard());
1543 			bestIt = git;
1544 		}
1545 	}
1546 	if (minQualSumErrors > options.absMaxQualSumErrors || minSeedErrors > maxErrorsSeed || minErrors > maxErrorsTotal) return false;
1547 
1548 	m.gEnd = m.gBegin + ndlLength;
1549 	m.editDist = minErrors;			// errors in seed or total number of errors?
1550 	m.mScore = minQualSumErrors;
1551 	m.seedEditDist = minSeedErrors;
1552 	return true;
1553 }
1554 
1555 
1556 //////////////////////////////////////////////////////////////////////////////
1557 // Edit distance verification
1558 template <
1559 	typename TMatch,
1560 	typename TGenome,
1561 	typename TReadSet,
1562 	typename TMyersPatterns,
1563 	typename TSpec >
1564 inline bool
1565 matchVerify(
1566 	TMatch &m,								// resulting match
1567 	Segment<TGenome, InfixSegment> inf,		// potential match genome region
1568 	unsigned rseqNo,						// read number
1569 	TReadSet &readSet,	    				// reads
1570 	TMyersPatterns const & pat,				// MyersBitVector preprocessing data
1571 	RazerSOptions<TSpec> const &options,		// RazerS options
1572 	SwiftSemiGlobal const &swiftsemi,				// Hamming only
1573 	QualityBasedScoring)						// Swift specialization
1574 {
1575 	//if(!options.maqMapping)
1576 		return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi);
1577 	//else
1578 	//	return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi); // todo!
1579 }
1580 #endif
1581 
1582 
1583 
1584 
1585 #ifndef RAZERS_PARALLEL
1586 //////////////////////////////////////////////////////////////////////////////
1587 // Find read matches in one genome sequence
1588 template <
1589 	typename TMatches,
1590 	typename TGenome,
1591 	typename TReadIndex,
1592 	typename TSwiftSpec,
1593 	typename TVerifier,
1594 	typename TCounts,
1595 	typename TSpec >
1596 void mapSingleReads(
1597 	TMatches &matches,				// resulting matches
1598 	TGenome &genome,				// genome ...
1599 	unsigned gseqNo,				// ... and its sequence number
1600 	Pattern<TReadIndex, Swift<TSwiftSpec> > &swiftPattern,
1601 	TVerifier &forwardPatterns,
1602 	TCounts & cnts,
1603 	char orientation,				// q-gram index of reads
1604 	RazerSOptions<TSpec> &options)
1605 {
1606 	typedef typename Fibre<TReadIndex, FibreText>::Type	TReadSet;
1607 	typedef typename Size<TGenome>::Type					TSize;
1608 	typedef typename Value<TMatches>::Type					TMatch;
1609 
1610 
1611 	// FILTRATION
1612 	typedef Finder<TGenome, Swift<TSwiftSpec> >				TSwiftFinder;
1613 	//typedef Pattern<TReadIndex, Swift<TSwiftSpec> >			TSwiftPattern;
1614 
1615 	// iterate all genomic sequences
1616 	if (options._debugLevel >= 1)
1617 	{
1618 		::std::cerr << ::std::endl << "Process genome seq #" << gseqNo;
1619 		if (orientation == 'F')
1620 			::std::cerr << "[fwd]";
1621 		else
1622 			::std::cerr << "[rev]";
1623 	}
1624 
1625 	TReadSet &readSet = host(host(swiftPattern));
1626 	TSwiftFinder swiftFinder(genome, options.repeatLength, 1);
1627 
1628 	TMatch m = {	// to supress uninitialized warnings
1629 		0, 0, 0, 0,
1630 #ifdef RAZERS_MATEPAIRS
1631 		0, 0, 0,
1632 #endif
1633 		0,
1634 #ifdef RAZERS_EXTENDED_MATCH
1635 		0, 0,
1636 #endif
1637 #ifdef RAZERS_SPLICED
1638 		0, 0,
1639 #endif
1640 		0
1641 	};
1642 
1643 	SEQAN_PROTIMESTART(map_contig_time);
1644 
1645 	// iterate all verification regions returned by SWIFT
1646 	TSize gLength = length(genome);
1647 	int64_t localTP = 0;
1648 	int64_t localFP = 0;
1649 
1650 #ifdef RAZERS_MICRO_RNA
1651 	while (find(swiftFinder, swiftPattern, 0.2))
1652 #else
1653 	while (find(swiftFinder, swiftPattern, options.errorRate))
1654 #endif
1655 	{
1656 		unsigned rseqNo = (*swiftFinder.curHit).ndlSeqNo;
1657 		if (!options.spec.DONT_VERIFY &&
1658 			matchVerify(m, infix(swiftFinder), rseqNo, readSet, forwardPatterns, options, TSwiftSpec()))
1659 		{
1660 			// transform coordinates to the forward strand
1661 			if (orientation == 'R')
1662 			{
1663 				TSize temp = m.gBegin;
1664 				m.gBegin = gLength - m.gEnd;
1665 				m.gEnd = gLength - temp;
1666 			}
1667 			m.gseqNo = gseqNo;
1668 			m.rseqNo = rseqNo;
1669 			m.orientation = orientation;
1670 #ifdef RAZERS_MATEPAIRS
1671 			m.pairId = 0;
1672 			m.pairScore = 0 - m.editDist;
1673 #endif
1674 
1675 			if (!options.spec.DONT_DUMP_RESULTS)
1676 			{
1677 				appendValue(matches, m, Generous());
1678 				if (length(matches) > options.compactThresh)
1679 				{
1680 #ifndef RAZERS_MICRO_RNA
1681 					typename Size<TMatches>::Type oldSize = length(matches);
1682 #endif
1683 					maskDuplicates(matches);	// overlapping parallelograms cause duplicates
1684 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1685 					if(options.maqMapping)
1686 						compactMatches(matches, cnts, options, false, swiftPattern, true);
1687 					else
1688 #endif
1689 						compactMatches(matches, cnts, options, false, swiftPattern);
1690 #ifdef RAZERS_MICRO_RNA
1691 					options.compactThresh = 2 * length(matches);
1692 #else
1693 					if (length(matches) * 4 > oldSize)			// the threshold should not be raised
1694 						options.compactThresh += (options.compactThresh >> 1);	// if too many matches were removed
1695 #endif
1696 				}
1697 			}
1698 
1699 			++localTP;
1700 //			::std::cerr << "\"" << infix(swiftFinder) << "\"  ";
1701 //			::std::cerr << hstkPos << " + ";
1702 //			::std::cerr << ::std::endl;
1703 		} else {
1704 			++localFP;
1705 //			::std::cerr << "\"" << infx(swiftFinder) << "\"   \"" << infix(swiftPattern) << "\"  ";
1706 //			::std::cerr << rseqNo << " : ";
1707 //			::std::cerr << hstkPos << " + ";
1708 //			::std::cerr << bucketWidth << "  " << TP << ::std::endl;
1709 		}
1710 	}
1711 	options.TP += localTP;
1712 	options.FP += localFP;
1713 	if (options._debugLevel >= 2)
1714 	{
1715 		double spec = 100;
1716 		double time = SEQAN_PROTIMEDIFF(map_contig_time);
1717 		if (localFP+localTP != 0)
1718 			spec = (1000*localTP/(localFP+localTP))/10.0;
1719 		::std::cerr << " [" << (int)((gLength / time)/100.0)/10.0 << "kbp/s " << time << "s] [" << spec << "%SP " << localFP+localTP << "V]";
1720 	}
1721 }
1722 #endif
1723 
1724 
1725 
1726 #ifdef RAZERS_MICRO_RNA
1727 
1728 	// multiple sequences
1729 	template <
1730 		typename TSA,
1731 		typename TStringSet,
1732 		typename TShape,
1733 		typename TDir,
1734 		typename TValue,
1735 		typename TWithConstraints
1736 	>
1737 	inline void
1738 	_qgramFillSuffixArray(
1739 		TSA &sa,
1740 		TStringSet &stringSet,
1741 		TShape &shape,
1742 		TDir &dir,
1743 		Nothing,
1744 		TWithConstraints const,
1745 		TValue prefixLen,
1746 		MicroRNA)
1747 	{
1748 		typedef typename Value<TStringSet>::Type					TString;
1749 		typedef typename Iterator<TString const, Standard>::Type	TIterator;
1750 		typedef typename Value<TDir>::Type				TSize;
1751 		typedef typename Value<TShape>::Type				THash;
1752 
1753 		for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo)
1754 		{
1755 			TString const &sequence = value(stringSet, seqNo);
1756 			if (length(sequence) < length(shape)) continue;
1757 			int num_qgrams = prefixLen - (int)length(shape) + 1;
1758 
1759 			typename Value<TSA>::Type localPos;
1760 			assignValueI1(localPos, seqNo);
1761 			assignValueI2(localPos, 0);
1762 
1763 			TIterator itText = begin(sequence, Standard());
1764 			if (TWithConstraints::VALUE) {
1765 				THash h = hash(shape, itText) + 1;						// first hash
1766 				if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos;		// if bucket is enabled
1767 			} else
1768 				sa[dir[hash(shape, itText) + 1]++] = localPos;			// first hash
1769 
1770 			for(int i = 1; i < num_qgrams; ++i)
1771 			{
1772 				++itText;
1773 				assignValueI2(localPos, i);
1774 				if (TWithConstraints::VALUE) {
1775 					THash h = hashNext(shape, itText) + 1;				// next hash
1776 					if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos;	// if bucket is enabled
1777 				} else
1778 					sa[dir[hashNext(shape, itText) + 1]++] = localPos;	// next hash
1779 			}
1780 		}
1781 	}
1782 
1783 	template < typename TDir, typename TStringSet, typename TShape, typename TValue >
1784 	inline void
1785 	_qgramCountQGrams(TDir &dir, TStringSet &stringSet, TShape &shape, TValue prefixLen, MicroRNA)
1786 	{
1787 		typedef typename Value<TStringSet>::Type					TString;
1788 		typedef typename Iterator<TString const, Standard>::Type	TIterator;
1789 		typedef typename Value<TDir>::Type							TSize;
1790 
1791 		for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo)
1792 		{
1793 			TString const &sequence = value(stringSet, seqNo);
1794 			if (length(sequence) < length(shape)) continue;
1795 			TSize num_qgrams = prefixLen - length(shape) + 1;
1796 
1797 			TIterator itText = begin(sequence, Standard());
1798 			++dir[hash(shape, itText)];
1799 			for(TSize i = 1; i < num_qgrams; ++i)
1800 			{
1801 				++itText;
1802 				++dir[hashNext(shape, itText)];
1803 			}
1804 		}
1805 	}
1806 
1807 
1808 	template < typename TIndex, typename TValue>
1809 	void createQGramIndex(TIndex &index, TValue prefixLen, MicroRNA)
1810 	{
1811 		typename Fibre<TIndex, QGramText>::Type	   &text  = indexText(index);
1812 		typename Fibre<TIndex, QGramSA>::Type         &sa    = indexSA(index);
1813 		typename Fibre<TIndex, QGramDir>::Type        &dir   = indexDir(index);
1814 		typename Fibre<TIndex, QGramShape>::Type      &shape = indexShape(index);
1815 
1816 		Nothing nothing;
1817 
1818 		// 1. clear counters
1819 		arrayFill(begin(dir, Standard()), end(dir, Standard()), 0);
1820 
1821 		// 2. count q-grams
1822 		_qgramCountQGrams(dir, text, shape, prefixLen,MicroRNA());
1823 
1824 		if (_qgramDisableBuckets(index))
1825 		{
1826 			// 3. cumulative sum
1827 			_qgramCummulativeSum(dir, True());
1828 
1829 			// 4. fill suffix array
1830 			_qgramFillSuffixArray(sa, text, shape, dir, nothing, True(), prefixLen, MicroRNA());
1831 
1832 			// 5. correct disabled buckets
1833 			_qgramPostprocessBuckets(dir);
1834 		}
1835 		else
1836 		{
1837 			// 3. cumulative sum
1838 			_qgramCummulativeSum(dir, False());
1839 
1840 			// 4. fill suffix array
1841 			_qgramFillSuffixArray(sa, text, shape, dir, nothing, False(),prefixLen, MicroRNA());
1842 		}
1843 	}
1844 
1845 
1846 #endif
1847 
1848 //////////////////////////////////////////////////////////////////////////////
1849 // Find read matches in many genome sequences (import from Fasta)
1850 template <
1851 	typename TMatches,
1852 	typename TReadSet,
1853 	typename TCounts,
1854 	typename TSpec,
1855 	typename TShape,
1856 	typename TSwiftSpec >
1857 int mapSingleReads(
1858 	TMatches &				matches,
1859 	StringSet<CharString> &	genomeFileNameList,
1860 	StringSet<CharString> &	genomeNames,	// genome names, taken from the Fasta file
1861 	::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
1862 	TReadSet const &		readSet,
1863 	TCounts & cnts,
1864 	RazerSOptions<TSpec> &	options,
1865 	TShape const &			shape,
1866 	Swift<TSwiftSpec> const)
1867 {
1868 	typedef typename Value<TReadSet>::Type							TRead;
1869 	typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> >	TIndex;			// q-gram index
1870 	typedef Pattern<TIndex, Swift<TSwiftSpec> >						TSwiftPattern;	// filter
1871 	typedef Pattern<TRead, MyersUkkonen>							TMyersPattern;	// verifier
1872 
1873 /*	// try opening each genome file once before running the whole mapping procedure
1874 	int filecount = 0;
1875 	int numFiles = length(genomeFileNameList);
1876 	while(filecount < numFiles)
1877 	{
1878 		::std::ifstream file;
1879 		file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary);
1880 		if (!file.is_open())
1881 			return RAZERS_GENOME_FAILED;
1882 		file.close();
1883 		++filecount;
1884 	}
1885 	*/
1886 
1887 	// configure q-gram index
1888 	TIndex swiftIndex(readSet, shape);
1889 #ifdef RAZERS_OPENADDRESSING
1890 	swiftIndex.alpha = 2;
1891 #endif
1892 	cargo(swiftIndex).abundanceCut = options.abundanceCut;
1893 	cargo(swiftIndex)._debugLevel = options._debugLevel;
1894 
1895 	// configure Swift
1896 	TSwiftPattern swiftPattern(swiftIndex);
1897 	swiftPattern.params.minThreshold = options.threshold;
1898 	swiftPattern.params.tabooLength = options.tabooLength;
1899 	swiftPattern.params.printDots = options._debugLevel > 0;
1900 
1901 	// init edit distance verifiers
1902 	unsigned readCount = countSequences(swiftIndex);
1903 	String<TMyersPattern> forwardPatterns;
1904 	options.compMask[4] = (options.matchN)? 15: 0;
1905 	if (!options.hammingOnly)
1906 	{
1907 		resize(forwardPatterns, readCount, Exact());
1908 		for(unsigned i = 0; i < readCount; ++i)
1909 		{
1910 			setHost(forwardPatterns[i], indexText(swiftIndex)[i]);
1911 			_patternMatchNOfPattern(forwardPatterns[i], options.matchN);
1912 			_patternMatchNOfFinder(forwardPatterns[i], options.matchN);
1913 		}
1914 	}
1915 #ifdef RAZERS_MICRO_RNA
1916 	typename Size<TIndex>::Type qgram_count = 0;
1917 	if(options.microRNA)
1918 	{
1919 		for(unsigned i = 0; i < countSequences(swiftIndex); ++i)
1920 			if (sequenceLength(i, swiftIndex) >= options.rnaSeedLength)
1921 				qgram_count += options.rnaSeedLength - (length(shape) - 1);
1922 		resize(indexSA(swiftIndex), qgram_count, Exact());
1923 		resize(indexDir(swiftIndex), _fullDirLength(swiftIndex), Exact());
1924 		createQGramIndex(swiftIndex,options.rnaSeedLength,MicroRNA());
1925 	}
1926 #endif
1927 
1928 
1929 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1930 	if(options.maqMapping)
1931 	{
1932 		resize(cnts, 2);
1933 		for (unsigned i = 0; i < length(cnts); ++i)
1934 			resize(cnts[i], readCount, 31); //initialize with maxeditDist, 11:5 for count:dist
1935 	}
1936 #endif
1937 
1938 	// clear stats
1939 	options.FP = 0;
1940 	options.TP = 0;
1941 	options.timeMapReads = 0;
1942 	options.timeDumpResults = 0;
1943 
1944 	unsigned numFiles = length(genomeFileNameList);
1945 	unsigned gseqNo = 0;
1946 
1947 	// open genome files, one by one
1948 	for (unsigned filecount = 0; filecount < numFiles; ++filecount)
1949 	{
1950 		// open genome file
1951 		SeqFileIn file;
1952 		if (!open(file, toCString(genomeFileNameList[filecount])))
1953 			return RAZERS_GENOME_FAILED;
1954 
1955 		// remove the directory prefix of current genome file
1956 		::std::string genomeFile(toCString(genomeFileNameList[filecount]));
1957 		size_t lastPos = genomeFile.find_last_of('/') + 1;
1958 		if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1;
1959 		if (lastPos == genomeFile.npos) lastPos = 0;
1960 		::std::string genomeName = genomeFile.substr(lastPos);
1961 
1962 		CharString	id;
1963 		Dna5String	genome;
1964 		unsigned gseqNoWithinFile = 0;
1965 		// iterate over genome sequences
1966 		SEQAN_PROTIMESTART(find_time);
1967 		for(; !atEnd(file); ++gseqNo)
1968 		{
1969             readRecord(id, genome, file);			// read Fasta id and sequence
1970 			if (options.genomeNaming == 0)
1971 			{
1972                 cropAfterFirst(id, IsWhitespace());
1973 				appendValue(genomeNames, id, Generous());
1974 			}
1975 
1976 			gnoToFileMap.insert(::std::make_pair(gseqNo,::std::make_pair(genomeName,gseqNoWithinFile)));
1977 
1978 			if (options.forward)
1979 				mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options);
1980 
1981 			if (options.reverse)
1982 			{
1983 				reverseComplement(genome);
1984 				mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options);
1985 			}
1986 			++gseqNoWithinFile;
1987 
1988 		}
1989 		options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
1990 	}
1991 
1992 	if (options._debugLevel >= 1)
1993 		::std::cerr << ::std::endl << "Finding reads took               \t" << options.timeMapReads << " seconds" << ::std::endl;
1994 	if (options._debugLevel >= 2) {
1995 		::std::cerr << ::std::endl;
1996 		::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
1997 		::std::cerr << "Swift FP: " << options.FP << ::std::endl;
1998 		::std::cerr << "Swift TP: " << options.TP << ::std::endl;
1999 	}
2000 	return 0;
2001 }
2002 
2003 
2004 //////////////////////////////////////////////////////////////////////////////
2005 // Find read matches in many genome sequences (given as StringSet)
2006 template <
2007 	typename TMatches,
2008 	typename TGenomeSet,
2009 	typename TReadSet,
2010 	typename TCounts,
2011 	typename TSpec,
2012 	typename TShape,
2013 	typename TSwiftSpec >
2014 int mapSingleReads(
2015 	TMatches &				matches,
2016 	TGenomeSet &			genomeSet,
2017 	TReadSet const &		readSet,
2018 	TCounts &				cnts,
2019 	RazerSOptions<TSpec> &	options,
2020 	TShape const &			shape,
2021 	Swift<TSwiftSpec> const)
2022 {
2023 	typedef typename Value<TReadSet>::Type							TRead;
2024 	typedef Index<TReadSet, IndexQGram<TShape, TQGramIndexSpec> >	TIndex;			// q-gram index
2025 	typedef Pattern<TIndex, Swift<TSwiftSpec> >						TSwiftPattern;	// filter
2026 	typedef Pattern<TRead, MyersUkkonen>							TMyersPattern;	// verifier
2027 
2028 	// configure q-gram index
2029 	TIndex swiftIndex(readSet, shape);
2030 	cargo(swiftIndex).abundanceCut = options.abundanceCut;
2031 	cargo(swiftIndex)._debugLevel = options._debugLevel;
2032 
2033 	// configure Swift
2034 	TSwiftPattern swiftPattern(swiftIndex);
2035 	swiftPattern.params.minThreshold = options.threshold;
2036 	swiftPattern.params.tabooLength = options.tabooLength;
2037 
2038 	// init edit distance verifiers
2039 	String<TMyersPattern> forwardPatterns;
2040 	options.compMask[4] = (options.matchN)? 15: 0;
2041 	if (!options.hammingOnly)
2042 	{
2043 		unsigned readCount = countSequences(swiftIndex);
2044 		resize(forwardPatterns, readCount, Exact());
2045 		for(unsigned i = 0; i < readCount; ++i)
2046 		{
2047 			setHost(forwardPatterns[i], indexText(swiftIndex)[i]);
2048 			_patternMatchNOfPattern(forwardPatterns[i], options.matchN);
2049 			_patternMatchNOfFinder(forwardPatterns[i], options.matchN);
2050 		}
2051 	}
2052 
2053 	// clear stats
2054 	options.FP = 0;
2055 	options.TP = 0;
2056 	options.timeMapReads = 0;
2057 	options.timeDumpResults = 0;
2058 
2059 	CharString	id;
2060 #ifdef RAZERS_DIRECT_MAQ_MAPPING
2061  	if(options.maqMapping)
2062  	{
2063 		resize(cnts, 2);
2064 		for (unsigned i = 0; i < length(cnts); ++i)
2065 			resize(cnts[i], length(readSet), 31);
2066 	}
2067 #endif
2068 
2069 
2070 
2071 	// iterate over genome sequences
2072 	SEQAN_PROTIMESTART(find_time);
2073 	for(unsigned gseqNo = 0; gseqNo < length(genomeSet); ++gseqNo)
2074 	{
2075 		if (options.forward)
2076 			mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options);
2077 
2078 		if (options.reverse)
2079 		{
2080 			reverseComplement(genomeSet[gseqNo]);
2081 			mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options);
2082 			reverseComplement(genomeSet[gseqNo]);
2083 		}
2084 
2085 	}
2086 	options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
2087 
2088 	if (options._debugLevel >= 1)
2089 		::std::cerr << ::std::endl << "Finding reads took               \t" << options.timeMapReads << " seconds" << ::std::endl;
2090 	if (options._debugLevel >= 2) {
2091 		::std::cerr << ::std::endl;
2092 		::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
2093 		::std::cerr << "Swift FP: " << options.FP << ::std::endl;
2094 		::std::cerr << "Swift TP: " << options.TP << ::std::endl;
2095 	}
2096 	return 0;
2097 }
2098 
2099 //////////////////////////////////////////////////////////////////////////////
2100 // Wrapper for single/mate-pair mapping
2101 template <
2102 	typename TMatches,
2103 	typename TReadSet,
2104 	typename TCounts,
2105 	typename TSpec,
2106 	typename TShape,
2107 	typename TSwiftSpec >
2108 int mapReads(
2109 	TMatches &				matches,
2110 	StringSet<CharString> &	genomeFileNameList,
2111 	StringSet<CharString> &	genomeNames,	// genome names, taken from the Fasta file
2112 	::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
2113 	TReadSet &		readSet,
2114 	TCounts &				cnts,
2115 	RazerSOptions<TSpec> &	options,
2116 	TShape const &			shape,
2117 	Swift<TSwiftSpec> const)
2118 {
2119 
2120 #ifdef RAZERS_MATEPAIRS
2121 	if (options.libraryLength >= 0)
2122 		return mapMatePairReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift<TSwiftSpec>());
2123 	else
2124 #endif
2125 		return mapSingleReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift<TSwiftSpec>());
2126 }
2127 
2128 template <
2129 	typename TMatches,
2130 	typename TGenomeSet,
2131 	typename TReadSet,
2132 	typename TCounts,
2133 	typename TSpec,
2134 	typename TShape,
2135 	typename TSwiftSpec >
2136 int mapReads(
2137 	TMatches &				matches,
2138 	TGenomeSet &			genomeSet,
2139 	TReadSet &		readSet,
2140 	TCounts &				cnts,
2141 	RazerSOptions<TSpec> &	options,
2142 	TShape const &			shape,
2143 	Swift<TSwiftSpec> const)
2144 {
2145 #ifdef RAZERS_MATEPAIRS
2146 	if (options.libraryLength >= 0)
2147 		return mapMatePairReads(matches, genomeSet, readSet, cnts, options, shape, Swift<TSwiftSpec>());
2148 	else
2149 #endif
2150 		return mapSingleReads(matches, genomeSet, readSet, cnts, options, shape, Swift<TSwiftSpec>());
2151 }
2152 
2153 //////////////////////////////////////////////////////////////////////////////
2154 // Wrapper for different template specializations
2155 template <typename TMatches, typename TReadSet, typename TCounts, typename TSpec>
2156 int mapReads(
2157 	TMatches &				matches,
2158 	StringSet<CharString> &	genomeFileNameList,
2159 	StringSet<CharString> &	genomeNames,	// genome names, taken from the Fasta file
2160 	::std::map<unsigned,::std::pair< ::std::string,unsigned> > & gnoToFileMap,
2161 	TReadSet &				readSet,
2162 	TReadRegions	&		readRegions,
2163 	TCounts &				cnts,
2164 	RazerSOptions<TSpec> &	options)
2165 {
2166     // first check whether split mapping (--> two shapes) or normal mapping
2167 #ifdef RAZERS_SPLICED
2168     if (options.minMatchLen > 0)
2169         return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options);
2170 #else
2171     (void)readRegions;
2172 #endif
2173 
2174     Shape<Dna, SimpleShape>		ungapped;
2175     Shape<Dna, OneGappedShape>	onegapped;
2176     Shape<Dna, GenericShape>	gapped;
2177 
2178 	// 2x3 SPECIALIZATION
2179 
2180 	if (options.hammingOnly)
2181 	{
2182 		// select best-fitting shape
2183 		if (stringToShape(ungapped, options.shape))
2184 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobalHamming>());
2185 
2186 		if (stringToShape(onegapped, options.shape))
2187 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobalHamming>());
2188 
2189 		if (stringToShape(gapped, options.shape))
2190 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift<SwiftSemiGlobalHamming>());
2191 	}
2192 	else
2193 	{
2194 		if (stringToShape(ungapped, options.shape))
2195 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobal>());
2196 
2197 		if (stringToShape(onegapped, options.shape))
2198 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobal>());
2199 
2200 		if (stringToShape(gapped, options.shape))
2201 			return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift<SwiftSemiGlobal>());
2202 	}
2203 
2204 	return RAZERS_INVALID_SHAPE;
2205 }
2206 
2207 template <typename TMatches, typename TGenomeSet, typename TReadSet, typename TReadRegions, typename TCounts, typename TSpec>
2208 int mapReads(
2209 	TMatches &				matches,
2210 	TGenomeSet &			genomeSet,
2211 	TReadSet &				readSet,
2212 	TReadRegions &
2213 #ifdef RAZERS_SPLICED
2214 	readRegions
2215 #endif
2216 	,
2217 	TCounts &				cnts,
2218 	RazerSOptions<TSpec> &	options)
2219 {
2220 
2221 
2222         // first check whether split mapping (--> two shapes) or normal mapping
2223 #ifdef RAZERS_SPLICED
2224         if (options.minMatchLen > 0)
2225                 return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options);
2226 #endif
2227 
2228 
2229 	Shape<Dna, SimpleShape>		ungapped;
2230 	Shape<Dna, OneGappedShape>	onegapped;
2231 	Shape<Dna, GenericShape>	gapped;
2232 
2233 	// 2x3 SPECIALIZATION
2234 
2235 	if (options.hammingOnly)
2236 	{
2237 		// select best-fitting shape
2238 		if (stringToShape(ungapped, options.shape))
2239 			return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobalHamming>());
2240 
2241 		if (stringToShape(onegapped, options.shape))
2242 			return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobalHamming>());
2243 
2244 		if (stringToShape(gapped, options.shape))
2245 			return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift<SwiftSemiGlobalHamming>());
2246 	}
2247 	else
2248 	{
2249 		if (stringToShape(ungapped, options.shape))
2250 			return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift<SwiftSemiGlobal>());
2251 
2252 		if (stringToShape(onegapped, options.shape))
2253 			return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift<SwiftSemiGlobal>());
2254 
2255 		if (stringToShape(gapped, options.shape))
2256 			return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift<SwiftSemiGlobal>());
2257 	}
2258 
2259 	return RAZERS_INVALID_SHAPE;
2260 }
2261 
2262 }
2263 
2264 #endif
2265