1  /*==========================================================================
2              RazerS - Fast Read Mapping with Controlled Loss Rate
3                    http://www.seqan.de/projects/razers.html
4 
5  ============================================================================
6   Copyright (C) 2008 by David Weese
7 
8   This program is free software; you can redistribute it and/or
9   modify it under the terms of the GNU Lesser General Public
10   License as published by the Free Software Foundation; either
11   version 3 of the License, or (at your options) any later version.
12 
13   This program is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  ==========================================================================*/
21 
22 #ifndef SEQAN_HEADER_RAZERS_MATEPAIRS_H
23 #define SEQAN_HEADER_RAZERS_MATEPAIRS_H
24 
25 #include <seqan/misc/misc_dequeue.h>
26 
27 namespace SEQAN_NAMESPACE_MAIN
28 {
29 
30 // We require mate-pairs to be stored together in one read string.
31 // Pair i has mates at positions 2*i and 2*i+1 in the read string.
32 
33 
34 //////////////////////////////////////////////////////////////////////////////
35 // Definitions
36 
37 typedef StringSet<TRead const, Dependent<> >	TMPReadSet;
38 
39 #ifdef RAZERS_MEMOPT
40 
41 	template <typename TShape>
42 	struct SAValue< Index<TMPReadSet, IndexQGram<TShape> > > {
43 		typedef Pair<
44 			unsigned,
45 			unsigned,
46 			BitCompressed<24, 8>	// max. 16M reads of length < 256
47 		> Type;
48 	};
49 
50 #else
51 
52 	template <typename TShape>
53 	struct SAValue< Index<TMPReadSet, IndexQGram<TShape> > > {
54 		typedef Pair<
55 			unsigned,			// many reads
56 			unsigned,			// of arbitrary length
57 			Compressed
58 		> Type;
59 	};
60 
61 #endif
62 
63 
64 template <typename TShape, typename TSpec>
65 struct Cargo< Index<TMPReadSet, IndexQGram<TShape, TSpec> > > {
66 	typedef struct {
67 		double		abundanceCut;
68 		int			_debugLevel;
69 	} Type;
70 };
71 
72 #ifdef RAZERS_PRUNE_QGRAM_INDEX
73 
74 //////////////////////////////////////////////////////////////////////////////
75 // Repeat masker
76 template <typename TShape>
77 inline bool _qgramDisableBuckets(Index<TMPReadSet, IndexQGram<TShape> > &index)
78 {
79 	typedef Index<TMPReadSet, IndexQGram<TShape>	>	TReadIndex;
80 	typedef typename Fibre<TReadIndex, QGramDir>::Type	TDir;
81 	typedef typename Iterator<TDir, Standard>::Type		TDirIterator;
82 	typedef typename Value<TDir>::Type					TSize;
83 
84 	TDir &dir    = indexDir(index);
85 	bool result  = false;
86 	unsigned counter = 0;
87 	TSize thresh = (TSize)(length(index) * cargo(index).abundanceCut);
88 	if (thresh < 100) thresh = 100;
89 
90 	TDirIterator it = begin(dir, Standard());
91 	TDirIterator itEnd = end(dir, Standard());
92 	for (; it != itEnd; ++it)
93 		if (*it > thresh)
94 		{
95 			*it = (TSize)-1;
96 			result = true;
97 			++counter;
98 		}
99 
100 	if (counter > 0 && cargo(index)._debugLevel >= 1)
101 		::std::cerr << "Removed " << counter << " k-mers" << ::std::endl;
102 
103 	return result;
104 }
105 
106 #endif
107 
108 //////////////////////////////////////////////////////////////////////////////
109 // Load multi-Fasta sequences
110 template <typename TFSSpec, typename TFSConfig, typename TRazerSOptions>
111 bool loadReads(
112 	FragmentStore<TFSSpec, TFSConfig>	& store,
113 	const char							* fileNameL,		// left mates file
114 	const char							* fileNameR,		// right mates file
115 	TRazerSOptions						& options)
116 {
117 	bool countN = !(options.matchN || options.outputFormat == 1);
118 
119 	MultiFasta leftMates;
120 	MultiFasta rightMates;
121 
122 	if (!open(leftMates.concat, fileNameL, OPEN_RDONLY)) return false;
123 	if (!open(rightMates.concat, fileNameR, OPEN_RDONLY)) return false;
124 
125 	AutoSeqFormat formatL;
126 	guessFormat(leftMates.concat, formatL);
127 	split(leftMates, formatL);
128 
129 	AutoSeqFormat formatR;
130 	guessFormat(rightMates.concat, formatR);
131 	split(rightMates, formatR);
132 
133 	unsigned seqCount = length(leftMates);
134 	if (seqCount != length(rightMates))
135 	if (options._debugLevel > 1)
136 	{
137 		::std::cerr << "Numbers of mates differ: " << seqCount << "(left) != " << length(rightMates) << "(right).\n";
138 		return false;
139 	}
140 
141 	String<Dna5Q>	seq[2];
142 	CharString		qual[2];
143 	CharString		id[2];
144 
145 	unsigned kickoutcount = 0;
146 	unsigned maxReadLength = 0;
147 	for(unsigned i = 0; i < seqCount; ++i)
148 	{
149 		if (options.readNaming == 0 || options.readNaming == 3)
150 		{
151 			assignSeqId(id[0], leftMates[i], formatL);				// read left Fasta id
152 			assignSeqId(id[1], rightMates[i], formatR);				// read right Fasta id
153 			if (options.readNaming == 0)
154 			{
155 				append(id[0], "/L");
156 				append(id[1], "/R");
157 			}
158 		}
159 
160 		assignSeq(seq[0], leftMates[i], formatL);					// read left Read sequence
161 		assignSeq(seq[1], rightMates[i], formatR);					// read right Read sequence
162 		assignQual(qual[0], leftMates[i], formatL);					// read left ascii quality values
163 		assignQual(qual[1], rightMates[i], formatR);				// read right ascii quality values
164 
165 		if (countN)
166 		{
167 			for (int j = 0; j < 2; ++j)
168 			{
169 				int maxBase = (int)(0.8 * length(seq[j]));
170 				int allowed[5] =
171 					{ maxBase, maxBase, maxBase, maxBase, (int)(options.errorRate * length(seq[j]))};
172 				for (unsigned k = 0; k < length(seq[j]); ++k)
173 					if (--allowed[ordValue(getValue(seq[j], k))] == 0)
174 					{
175 //						std::cout << "Ignoring mate-pair: " << seq[0] << " " << seq[1] << std::endl;
176 						clear(seq[0]);
177 						clear(seq[1]);
178 						clear(id[0]);
179 						clear(id[1]);
180 						++kickoutcount;
181 						break;
182 					}
183 			}
184 		}
185 
186 		for (int j = 0; j < 2; ++j)
187 		{
188 			// store dna and quality together
189 			assignQualities(seq[j], qual[j]);
190 
191 			if (options.trimLength > 0 && length(seq[j]) > (unsigned)options.trimLength)
192 				resize(seq[j], options.trimLength);
193 		}
194 		appendMatePair(store, seq[0], seq[1], id[0], id[1]);
195 		if (maxReadLength < length(seq[0]))
196 			maxReadLength = length(seq[0]);
197 		if (maxReadLength < length(seq[1]))
198 			maxReadLength = length(seq[1]);
199 	}
200 	// memory optimization
201 	reserve(store.readSeqStore.concat, length(store.readSeqStore.concat), Exact());
202 //	reserve(store.readNameStore.concat, length(store.readNameStore.concat), Exact());
203 
204 	typedef Shape<Dna, SimpleShape> TShape;
205 	typedef typename SAValue< Index<TMPReadSet, IndexQGram<TShape, OpenAddressing> > >::Type TSAValue;
206 	TSAValue sa(0, 0);
207 	sa.i1 = ~sa.i1;
208 	sa.i2 = ~sa.i2;
209 
210 	if ((unsigned)sa.i1 < length(store.readSeqStore) - 1)
211 	{
212 		::std::cerr << "Maximal read number of " << (unsigned)sa.i1 + 1 << " exceeded. Please remove \"#define RAZERS_MEMOPT\" in razers.cpp and recompile." << ::std::endl;
213 		seqCount = 0;
214 	}
215 	if ((unsigned)sa.i2 < maxReadLength - 1)
216 	{
217 		::std::cerr << "Maximal read length of " << (unsigned)sa.i2 + 1 << " bps exceeded. Please remove \"#define RAZERS_MEMOPT\" in razers.cpp and recompile." << ::std::endl;
218 		seqCount = 0;
219 	}
220 
221 	if (options._debugLevel > 1 && kickoutcount > 0)
222 		::std::cerr << "Ignoring " << kickoutcount << " low quality mate-pairs.\n";
223 	return (seqCount > 0);
224 }
225 
226 	template <typename TFragmentStore>
227 	struct LessPairScore :
228 		public ::std::binary_function <
229 			typename Value<typename TFragmentStore::TAlignedReadStore>::Type,
230 			typename Value<typename TFragmentStore::TAlignedReadStore>::Type,
231 			bool >
232 	{
233 		TFragmentStore &store;
234 
235 		LessPairScore(TFragmentStore &_store):
236 			store(_store) {}
237 
238 		inline bool operator() (
239 			typename Value<typename TFragmentStore::TAlignedReadStore>::Type const &a,
240 			typename Value<typename TFragmentStore::TAlignedReadStore>::Type const &b) const
241 		{
242 			typedef typename TFragmentStore::TReadStore			TReadStore;
243 			typedef typename TFragmentStore::TAlignedReadStore	TAlignedReadStore;
244 			typedef typename TFragmentStore::TAlignQualityStore	TAlignQualityStore;
245 			typedef typename Value<TReadStore>::Type			TRead;
246 			typedef typename Value<TAlignedReadStore>::Type		TAlignedRead;
247 			typedef typename Value<TAlignQualityStore>::Type	TQual;
248 			typedef typename Id<TRead>::Type					TId;
249 
250 			// pair number
251 			if (b.readId == TAlignedRead::INVALID_ID) return false;
252 			if (a.readId == TAlignedRead::INVALID_ID) return true;
253 			TRead const &ra = store.readStore[a.readId];
254 			TRead const &rb = store.readStore[b.readId];
255 			if (ra.matePairId < rb.matePairId) return true;
256 			if (ra.matePairId > rb.matePairId) return false;
257 
258 			// quality
259 			if (a.id == TAlignedRead::INVALID_ID) return false;
260 			if (b.id == TAlignedRead::INVALID_ID) return true;
261 			TQual const &qa = store.alignQualityStore[a.id];
262 			TQual const &qb = store.alignQualityStore[b.id];
263 			if (qa.pairScore > qb.pairScore) return true;
264 			if (qa.pairScore < qb.pairScore) return false;
265 
266 			return a.pairMatchId < b.pairMatchId;
267 		}
268 	};
269 
270 
271 //////////////////////////////////////////////////////////////////////////////
272 // Remove low quality matches
273 template < typename TFragmentStore, typename TCounts, typename TSpec, typename TSwiftL, typename TSwiftR >
274 void compactPairMatches(
275 	TFragmentStore			& store,
276 	TCounts					&,
277 	RazerSOptions<TSpec>	& options,
278 	TSwiftL					& swiftL,
279 	TSwiftR					& swiftR)
280 {
281 	typedef typename TFragmentStore::TReadStore						TReadStore;
282 	typedef typename TFragmentStore::TAlignedReadStore				TAlignedReadStore;
283 	typedef typename TFragmentStore::TAlignQualityStore				TAlignQualityStore;
284 	typedef typename Value<TReadStore>::Type						TRead;
285 	typedef typename Value<TAlignedReadStore>::Type					TAlignedRead;
286 	typedef typename Value<TAlignQualityStore>::Type				TQual;
287 	typedef typename Iterator<TAlignedReadStore, Standard>::Type	TIterator;
288 
289 	unsigned matePairId = -2;
290 	unsigned hitCount = 0;
291 	unsigned hitCountCutOff = options.maxHits;
292 	int scoreDistCutOff = MinValue<int>::VALUE;
293 
294 	TIterator it = begin(store.alignedReadStore, Standard());
295 	TIterator itEnd = end(store.alignedReadStore, Standard());
296 	TIterator dit = it;
297 	TIterator ditBeg = it;
298 
299 	// sort
300 	sortAlignedReads(store.alignedReadStore, LessPairScore<TFragmentStore>(store));
301 
302 	for (; it != itEnd; ++it)
303 	{
304 		if ((*it).id == TAlignedRead::INVALID_ID || (*it).readId == TAlignedRead::INVALID_ID) continue;
305 		TRead &r = store.readStore[(*it).readId];
306 		TQual &q = store.alignQualityStore[(*it).id];
307 		if (matePairId == r.matePairId)
308 		{
309 			if (q.pairScore <= scoreDistCutOff) continue;
310 			if (++hitCount >= hitCountCutOff)
311 			{
312 #ifdef RAZERS_MASK_READS
313 				if (hitCount == hitCountCutOff)
314 				{
315 					// we have enough, now look for better matches
316 					int maxErrors = -1 - q.pairScore;
317 					if (options.purgeAmbiguous)
318 						maxErrors = -1;
319 
320 					setMaxErrors(swiftL, matePairId, maxErrors);
321 					setMaxErrors(swiftR, matePairId, maxErrors);
322 
323 					if (maxErrors == -1 && options._debugLevel >= 2)
324 						::std::cerr << "(pair #" << matePairId << " disabled)";
325 
326 					if (options.purgeAmbiguous)
327 						dit = ditBeg;
328 				}
329 #endif
330 				continue;
331 			}
332 		}
333 		else
334 		{
335 			matePairId = r.matePairId;
336 			hitCount = 0;
337 			if (options.distanceRange > 0)
338 				scoreDistCutOff = q.pairScore - options.distanceRange;
339 			ditBeg = dit;
340 		}
341 		*dit = *it;	++dit; ++it;
342 		*dit = *it;	++dit;
343 	}
344 	resize(store.alignedReadStore, dit - begin(store.alignedReadStore, Standard()));
345 	compactAlignedReads(store);
346 }
347 
348 
349 
350 #ifndef RAZERS_PARALLEL
351 //////////////////////////////////////////////////////////////////////////////
352 // Find read matches in one genome sequence
353 template <
354 	typename TFSSpec,
355 	typename TFSConfig,
356 	typename TGenome,
357 	typename TReadIndex,
358 	typename TSwiftSpec,
359 	typename TPreprocessing,
360 	typename TCounts,
361 	typename TRazerSOptions >
362 void mapMatePairReads(
363 	FragmentStore<TFSSpec, TFSConfig>		& store,
364 	TGenome									& genome,				// genome ...
365 	unsigned								  contigId,				// ... and its sequence number
366 	Pattern<TReadIndex, Swift<TSwiftSpec> >	& swiftPatternL,
367 	Pattern<TReadIndex, Swift<TSwiftSpec> >	& swiftPatternR,
368 	TPreprocessing							& preprocessingL,
369 	TPreprocessing							& preprocessingR,
370 	TCounts									& cnts,
371 	char									  orientation,			// q-gram index of reads
372 	TRazerSOptions							& options)
373 {
374 	typedef FragmentStore<TFSSpec, TFSConfig>				TFragmentStore;
375 	typedef typename TFragmentStore::TMatePairStore			TMatePairStore;
376 	typedef typename TFragmentStore::TAlignedReadStore		TAlignedReadStore;
377 	typedef typename TFragmentStore::TAlignQualityStore		TAlignQualityStore;
378 	typedef typename Value<TMatePairStore>::Type			TMatePair;
379 	typedef typename Value<TAlignedReadStore>::Type			TAlignedRead;
380 	typedef typename Value<TAlignQualityStore>::Type		TAlignQuality;
381 	typedef typename Fibre<TReadIndex, FibreText>::Type	TReadSet;
382 	typedef typename Id<TAlignedRead>::Type					TId;
383 
384 	typedef typename Size<TGenome>::Type					TSize;
385 	typedef typename Position<TGenome>::Type				TGPos;
386 	typedef typename MakeSigned_<TGPos>::Type				TSignedGPos;
387 	typedef typename Infix<TGenome>::Type					TGenomeInf;
388 
389 	// FILTRATION
390 	typedef Finder<TGenome, Swift<TSwiftSpec> >				TSwiftFinderL;
391 	typedef Finder<TGenomeInf, Swift<TSwiftSpec> >			TSwiftFinderR;
392 	typedef Pattern<TReadIndex, Swift<TSwiftSpec> >			TSwiftPattern;
393 
394 	// MATE-PAIR FILTRATION
395 	typedef Triple<__int64, TAlignedRead, TAlignQuality>	TDequeueValue;
396 	typedef Dequeue<TDequeueValue>							TDequeue;
397 	typedef typename TDequeue::TIter						TDequeueIterator;
398 
399 	// VERIFICATION
400 	typedef MatchVerifier <
401 		TFragmentStore,
402 		TRazerSOptions,
403 		TPreprocessing,
404 		TSwiftPattern,
405 		TCounts >											TVerifier;
406 
407 	const unsigned NOT_VERIFIED = 1u << (8*sizeof(unsigned)-1);
408 
409 	// iterate all genomic sequences
410 	if (options._debugLevel >= 1)
411 	{
412 		::std::cerr << ::std::endl << "Process genome seq #" << contigId;
413 		if (orientation == 'F')
414 			::std::cerr << "[fwd]";
415 		else
416 			::std::cerr << "[rev]";
417 	}
418 
419 	TReadSet	&readSetL = host(host(swiftPatternL));
420 	TReadSet	&readSetR = host(host(swiftPatternR));
421 	TVerifier	verifierL(store, options, preprocessingL, swiftPatternL, cnts);
422 	TVerifier	verifierR(store, options, preprocessingR, swiftPatternR, cnts);
423 
424 	verifierL.oneMatchPerBucket = true;
425 	verifierR.oneMatchPerBucket = true;
426 	verifierL.m.contigId = contigId;
427 	verifierR.m.contigId = contigId;
428 
429 	if (empty(readSetL))
430 		return;
431 
432 	// distance <= libLen + libErr + 2*(parWidth-readLen) - shapeLen
433 	// distance >= libLen - libErr - 2*parWidth + shapeLen
434 	TSize readLength = length(readSetL[0]);
435 	TSignedGPos maxDistance = options.libraryLength + options.libraryError - 2 * (int)readLength - (int)length(indexShape(host(swiftPatternL)));
436 	TSignedGPos minDistance = options.libraryLength - options.libraryError + (int)length(indexShape(host(swiftPatternL)));
437 	TGPos scanShift = (minDistance < 0)? 0: minDistance;
438 
439 	// exit if contig is shorter than library size
440 	if (length(genome) <= scanShift)
441 		return;
442 
443 	TGenomeInf genomeInf = infix(genome, scanShift, length(genome));
444 	TSwiftFinderL swiftFinderL(genome, options.repeatLength, 1);
445 	TSwiftFinderR swiftFinderR(genomeInf, options.repeatLength, 1);
446 
447 	TDequeue fifo;						// stores left-mate potential matches
448 	String<__int64> lastPotMatchNo;		// last number of a left-mate potential
449 	__int64 lastNo = 0;					// last number over all left-mate pot. matches in the queue
450 	__int64 firstNo = 0;				// first number over all left-mate pot. match in the queue
451 	Pair<TGPos> gPair;
452 
453 	resize(lastPotMatchNo, length(host(swiftPatternL)), (__int64)-1, Exact());
454 
455 	TSize gLength = length(genome);
456 
457 	TAlignedRead mR;
458 	TAlignQuality qR;
459 	TDequeueValue fL(-1, mR, qR);	// to supress uninitialized warnings
460 
461 //	unsigned const preFetchMatches = 2048;
462 
463 	// iterate all verification regions returned by SWIFT
464 	while (find(swiftFinderR, swiftPatternR, options.errorRate))
465 	{
466 		unsigned matePairId = swiftPatternR.curSeqNo;
467 		TGPos rEndPos = endPosition(swiftFinderR) + scanShift;
468 		TGPos doubleParWidth = 2 * (*swiftFinderR.curHit).bucketWidth;
469 
470 		// remove out-of-window left mates from fifo
471 		while (!empty(fifo) && (TSignedGPos)front(fifo).i2.endPos + maxDistance + (TSignedGPos)doubleParWidth < (TSignedGPos)rEndPos)
472 		{
473 			popFront(fifo);
474 			++firstNo;
475 		}
476 /*
477 		if (empty(fifo) || back(fifo).endPos + minDistance < (TSignedGPos)(rEndPos + doubleParWidth))
478 			for (unsigned i = 0; i < preFetchMatches; ++i)
479 				if (find(swiftFinderL, swiftPatternL, options.errorRate, false))
480 					pushBack(fifo, mL);
481 				else
482 					break;
483 */
484 		// add within-window left mates to fifo
485 		while (empty(fifo) || (TSignedGPos)back(fifo).i2.endPos + minDistance < (TSignedGPos)(rEndPos + doubleParWidth))
486 		{
487 			if (find(swiftFinderL, swiftPatternL, options.errorRate))
488 			{
489 				gPair = positionRange(swiftFinderL);
490 				if ((TSignedGPos)gPair.i2 + maxDistance + (TSignedGPos)doubleParWidth >= (TSignedGPos)rEndPos)
491 				{
492 					// link in
493 					fL.i1 = lastPotMatchNo[swiftPatternL.curSeqNo];
494 					lastPotMatchNo[swiftPatternL.curSeqNo] = lastNo++;
495 
496 					fL.i2.readId = store.matePairStore[swiftPatternL.curSeqNo].readId[0] | NOT_VERIFIED;
497 					fL.i2.beginPos = gPair.i1;
498 					fL.i2.endPos = gPair.i2;
499 
500 					pushBack(fifo, fL);
501 				}
502 			} else
503 				break;
504 		}
505 
506 		int	bestLeftScore = MinValue<int>::VALUE;
507 		int bestLibSizeError = MaxValue<int>::VALUE;
508 		TDequeueIterator bestLeft = TDequeueIterator();
509 
510 		TDequeueIterator it;
511 		unsigned leftReadId = store.matePairStore[matePairId].readId[0];
512 		__int64 lastPositive = (__int64)-1;
513 		for (__int64 i = lastPotMatchNo[matePairId]; firstNo <= i; i = (*it).i1)
514 		{
515 			it = &value(fifo, i - firstNo);
516 
517 			// search left mate
518 //			if (((*it).i2.readId & ~NOT_VERIFIED) == leftReadId)
519 			{
520 				// verify left mate (equal seqNo), if not done already
521 				if ((*it).i2.readId & NOT_VERIFIED)
522 				{
523 
524 //					if (matchVerify(
525 //							(*it).i2, (*it).i3, infix(genome, (TSignedGPos)(*it).i2.beginPos, (TSignedGPos)(*it).i2.endPos),
526 //							matePairId, readSetL, forwardPatternsL,
527 //							options, TSwiftSpec()))
528 					if (matchVerify(verifierL, infix(genome, (TSignedGPos)(*it).i2.beginPos, (TSignedGPos)(*it).i2.endPos),
529 							matePairId, readSetL, TSwiftSpec()))
530 					{
531 						verifierL.m.readId = (*it).i2.readId & ~NOT_VERIFIED;		// has been verified positively
532 						(*it).i2 = verifierL.m;
533 						(*it).i3 = verifierL.q;
534 
535 						// short-cut negative matches
536 						if (lastPositive == (__int64)-1)
537 							lastPotMatchNo[matePairId] = i;
538 						else
539 							value(fifo, lastPositive - firstNo).i1 = i;
540 						lastPositive = i;
541 					} else
542 						(*it).i2.readId = ~NOT_VERIFIED;				// has been verified negatively
543 				}
544 /*
545 				if ((*it).i2.readId == leftReadId)
546 				{
547 					bestLeft = it;
548 					bestLeftScore = (*it).i3.score;
549 					break;
550 				}
551 */
552 				if ((*it).i2.readId == leftReadId)
553 				{
554 					int score = (*it).i3.score;
555 					if (bestLeftScore <= score)
556 					{
557 						int libSizeError = options.libraryLength - (int)((__int64)mR.endPos - (__int64)(*it).i2.beginPos);
558 						if (libSizeError < 0) libSizeError = -libSizeError;
559 						if (bestLeftScore == score)
560 						{
561 							if (bestLibSizeError > libSizeError)
562 							{
563 								bestLibSizeError = libSizeError;
564 								bestLeft = it;
565 							}
566 						}
567 						else
568 						{
569 							bestLeftScore = score;
570 							bestLibSizeError = libSizeError;
571 							bestLeft = it;
572 							if (bestLeftScore == 0) break;	// TODO: replace if we have real qualities
573 						}
574 					}
575 				}
576 			}
577 //			else
578 //				std::cout << "HUH?" << std::endl;
579 		}
580 
581 		// short-cut negative matches
582 		if (lastPositive == (__int64)-1)
583 			lastPotMatchNo[matePairId] = (__int64)-1;
584 		else
585 			value(fifo, lastPositive - firstNo).i1 = (__int64)-1;
586 
587 		// verify right mate, if left mate matches
588 		if (bestLeftScore != MinValue<int>::VALUE)
589 		{
590 //			if (matchVerify(
591 //					mR, qR, infix(swiftFinderR, genomeInf),
592 //					matePairId, readSetR, forwardPatternsR,
593 //					options, TSwiftSpec()))
594 			if (matchVerify(verifierR, infix(swiftFinderR, genomeInf),
595 					matePairId, readSetR, TSwiftSpec()))
596 			{
597 				// distance between left mate beginning and right mate end
598 				__int64 dist = (__int64)verifierR.m.endPos - (__int64)(*bestLeft).i2.beginPos;
599 				if (dist <= options.libraryLength + options.libraryError &&
600 					options.libraryLength <= dist + options.libraryError)
601 				{
602 					mR = verifierR.m;
603 					qR = verifierR.q;
604 
605 					fL.i2 = (*bestLeft).i2;
606 					fL.i3 = (*bestLeft).i3;
607 
608 					// transform mate readNo to global readNo
609 					TMatePair &mp = store.matePairStore[matePairId];
610 					fL.i2.readId = mp.readId[0];
611 					mR.readId    = mp.readId[1];
612 
613 					// transform coordinates to the forward strand
614 					if (orientation == 'F')
615 					{
616 						TSize temp = mR.beginPos;
617 						mR.beginPos = mR.endPos;
618 						mR.endPos = temp;
619 					} else
620 					{
621 						fL.i2.beginPos = gLength - fL.i2.beginPos;
622 						fL.i2.endPos = gLength - fL.i2.endPos;
623 						TSize temp = mR.beginPos;
624 						mR.beginPos = gLength - mR.endPos;
625 						mR.endPos = gLength - temp;
626 						dist = -dist;
627 					}
628 
629 					// set a unique pair id
630 					fL.i2.pairMatchId = mR.pairMatchId = options.nextPairMatchId;
631 					if (++options.nextPairMatchId == TAlignedRead::INVALID_ID)
632 						options.nextPairMatchId = 0;
633 
634 					// score the whole match pair
635 					fL.i3.pairScore = qR.pairScore = fL.i3.score + qR.score;
636 
637 					// both mates match with correct library size
638 /*								std::cout << "found " << matePairId << " on " << orientation << contigId;
639 					std::cout << " dist:" << dist;
640 					if (orientation=='F')
641 						std::cout << " \t_" << fL.i2.beginPos+1 << "_" << mR.endPos;
642 					else
643 						std::cout << " \t_" << mR.beginPos+1 << "_" << mL.endPos;
644 //							std::cout << " L_" << (*bestLeft).beginPos << "_" << (*bestLeft).endPos << "_" << (*bestLeft).editDist;
645 //							std::cout << " R_" << mR.beginPos << "_" << mR.endPos << "_" << mR.editDist;
646 					std::cout << std::endl;
647 */
648 					if (!options.spec.DONT_DUMP_RESULTS)
649 					{
650 						fL.i2.id = length(store.alignedReadStore);
651 						appendValue(store.alignedReadStore, fL.i2, Generous());
652 						appendValue(store.alignQualityStore, fL.i3, Generous());
653 						mR.id = length(store.alignedReadStore);
654 						appendValue(store.alignedReadStore, mR, Generous());
655 						appendValue(store.alignQualityStore, qR, Generous());
656 
657 						if (length(store.alignedReadStore) > options.compactThresh)
658 						{
659 							typename Size<TAlignedReadStore>::Type oldSize = length(store.alignedReadStore);
660 //									maskDuplicates(matches);	// overlapping parallelograms cause duplicates
661 							compactPairMatches(store, cnts, options, swiftPatternL, swiftPatternR);
662 
663 							if (length(store.alignedReadStore) * 4 > oldSize)			// the threshold should not be raised
664 								options.compactThresh += (options.compactThresh >> 1);	// if too many matches were removed
665 
666 							if (options._debugLevel >= 2)
667 								::std::cerr << '(' << oldSize - length(store.alignedReadStore) << " matches removed)";
668 						}
669 					}
670 					++options.countVerification;
671 				}
672 				++options.countFiltration;
673 			}
674 		}
675 	}
676 }
677 #endif
678 
679 
680 //////////////////////////////////////////////////////////////////////////////
681 // Find read matches in many genome sequences (import from Fasta)
682 template <
683 	typename TFSSpec,
684 	typename TFSConfig,
685 	typename TGNoToFile,
686 	typename TCounts,
687 	typename TSpec,
688 	typename TShape,
689 	typename TSwiftSpec >
690 int mapMatePairReads(
691 	FragmentStore<TFSSpec, TFSConfig>	& store,
692 	StringSet<CharString>				& genomeFileNameList,
693 	String<TGNoToFile>					& gnoToFileMap,
694 	TCounts								& cnts,
695 	RazerSOptions<TSpec>				& options,
696 	TShape const						& shape,
697 	Swift<TSwiftSpec> const)
698 {
699 	typedef FragmentStore<TFSSpec, TFSConfig>			TFragmentStore;
700 	typedef typename TFragmentStore::TReadSeqStore		TReadSeqStore;
701 
702 	typedef typename Value<TReadSeqStore>::Type			TRead;
703 	typedef StringSet<TRead>							TReadSet;
704 	typedef Index<TReadSet, IndexQGram<TShape> >		TIndex;			// q-gram index
705 	typedef Pattern<TIndex, Swift<TSwiftSpec> >			TSwiftPattern;	// filter
706 	typedef Pattern<TRead, MyersUkkonen>				TMyersPattern;	// verifier
707 
708 //	std::cout << "SA-TYPE:" <<sizeof(typename SAValue<TIndex>::Type)<<std::endl;
709 
710 	// split mate-pairs over two indices
711 	TReadSet readSetL, readSetR;
712 	unsigned pairCount = length(store.matePairStore);
713 	resize(readSetL, pairCount, Exact());
714 	resize(readSetR, pairCount, Exact());
715 
716 	for (unsigned i = 0; i < pairCount; ++i)
717 	{
718 		assign(readSetL[i], store.readSeqStore[store.matePairStore[i].readId[0]]);
719 		assign(readSetR[i], store.readSeqStore[store.matePairStore[i].readId[1]]);
720 	}
721 	reverseComplement(readSetR);
722 
723 	// configure q-gram index
724 	TIndex swiftIndexL(readSetL, shape);
725 	TIndex swiftIndexR(readSetR, shape);
726 	reverse(indexShape(swiftIndexR));		// right mate qualities are reversed -> reverse right shape
727 
728 	cargo(swiftIndexL).abundanceCut = options.abundanceCut;
729 	cargo(swiftIndexR).abundanceCut = options.abundanceCut;
730 	cargo(swiftIndexL)._debugLevel = options._debugLevel;
731 	cargo(swiftIndexR)._debugLevel = options._debugLevel;
732 
733 	// configure Swift
734 	TSwiftPattern swiftPatternL(swiftIndexL);
735 	TSwiftPattern swiftPatternR(swiftIndexR);
736 	swiftPatternL.params.minThreshold = options.threshold;
737 	swiftPatternR.params.minThreshold = options.threshold;
738 	swiftPatternL.params.tabooLength = options.tabooLength;
739 	swiftPatternR.params.tabooLength = options.tabooLength;
740 	swiftPatternL.params.printDots = false;
741 	swiftPatternR.params.printDots = options._debugLevel > 0;
742 
743 	// init edit distance verifiers
744 	String<TMyersPattern> forwardPatternsL;
745 	String<TMyersPattern> forwardPatternsR;
746 	options.compMask[4] = (options.matchN)? 15: 0;
747 	if (!options.hammingOnly)
748 	{
749 		resize(forwardPatternsL, pairCount, Exact());
750 		resize(forwardPatternsR, pairCount, Exact());
751 		for(unsigned i = 0; i < pairCount; ++i)
752 		{
753 #ifdef RAZERS_NOOUTERREADGAPS
754 			if (!empty(readSetL[i]) && !empty(readSetR[i])) {
755 				setHost(forwardPatternsL[i], prefix(readSetL[i], length(readSetL[i]) - 1));
756 				setHost(forwardPatternsR[i], prefix(readSetR[i], length(readSetR[i]) - 1));
757 			}
758 #else
759 			setHost(forwardPatternsL[i], readSetL[i]);
760 			setHost(forwardPatternsR[i], readSetR[i]);
761 #endif
762 			_patternMatchNOfPattern(forwardPatternsL[i], options.matchN);
763 			_patternMatchNOfPattern(forwardPatternsR[i], options.matchN);
764 			_patternMatchNOfFinder(forwardPatternsL[i], options.matchN);
765 			_patternMatchNOfFinder(forwardPatternsR[i], options.matchN);
766 		}
767 	}
768 
769 	// clear stats
770 	options.countFiltration = 0;
771 	options.countVerification = 0;
772 	options.timeMapReads = 0;
773 	options.timeDumpResults = 0;
774 
775 	unsigned filecount = 0;
776 	unsigned numFiles = length(genomeFileNameList);
777 	unsigned contigId = 0;
778 
779 	// open genome files, one by one
780 	while (filecount < numFiles)
781 	{
782 		// open genome file
783 		::std::ifstream file;
784 		file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary);
785 		if (!file.is_open())
786 			return RAZERS_GENOME_FAILED;
787 
788 		// remove the directory prefix of current genome file
789 		::std::string genomeFile(toCString(genomeFileNameList[filecount]));
790 		size_t lastPos = genomeFile.find_last_of('/') + 1;
791 		if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1;
792 		if (lastPos == genomeFile.npos) lastPos = 0;
793 		::std::string genomeName = genomeFile.substr(lastPos);
794 
795 		CharString	id;
796 		Dna5String	genome;
797 		unsigned gseqNoWithinFile = 0;
798 		// iterate over genome sequences
799 		SEQAN_PROTIMESTART(find_time);
800 		for(; !_streamEOF(file); ++contigId)
801 		{
802 			if (options.genomeNaming == 0)
803 			{
804 				//readID(file, id, Fasta());			// read Fasta id
805 				readShortID(file, id, Fasta());			// read Fasta id up to first whitespace
806 				appendValue(store.contigNameStore, id, Generous());
807 			}
808 			read(file, genome, Fasta());			// read Fasta sequence
809 
810 			appendValue(gnoToFileMap, TGNoToFile(genomeName, gseqNoWithinFile));
811 
812 			if (options.forward)
813 				mapMatePairReads(store, genome, contigId, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'F', options);
814 
815 			if (options.reverse)
816 			{
817 				reverseComplement(genome);
818 				mapMatePairReads(store, genome, contigId, swiftPatternL, swiftPatternR, forwardPatternsL, forwardPatternsR, cnts, 'R', options);
819 			}
820 			++gseqNoWithinFile;
821 
822 		}
823 		options.timeMapReads += SEQAN_PROTIMEDIFF(find_time);
824 		file.close();
825 		++filecount;
826 	}
827 
828 	compactPairMatches(store, cnts, options, swiftPatternL, swiftPatternR);
829 	reverseComplement(readSetR);
830 
831 	if (options._debugLevel >= 1)
832 		::std::cerr << ::std::endl << "Finding reads took               \t" << options.timeMapReads << " seconds" << ::std::endl;
833 	if (options._debugLevel >= 2) {
834 		::std::cerr << ::std::endl;
835 		::std::cerr << "___FILTRATION_STATS____" << ::std::endl;
836 		::std::cerr << "Filtration counter:  " << options.countFiltration << ::std::endl;
837 		::std::cerr << "Verfication counter: " << options.countVerification << ::std::endl;
838 	}
839 	return 0;
840 }
841 
842 
843 }
844 
845 #endif
846