1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34 
35 #ifndef SEQAN_HEADER_SHAPE_THRESHOLD_H
36 #define SEQAN_HEADER_SHAPE_THRESHOLD_H
37 
38 namespace SEQAN_NAMESPACE_MAIN
39 {
40 
41 struct ThreshQGramLemma_;
42 struct ThreshExact_;
43 struct ThreshHeuristic_;
44 
45 typedef Tag<ThreshQGramLemma_> const	ThreshQGramLemma;
46 typedef Tag<ThreshHeuristic_> const		ThreshHeuristic;
47 typedef Tag<ThreshExact_> const			ThreshExact;
48 
49 
50 //////////////////////////////////////////////////////////////////////////////
51 // q-gram lemma
52 //
53 // - exact for ungapped shapes or errors <= 1
54 // - lower bound gapped shapes
55 //////////////////////////////////////////////////////////////////////////////
56 
57 template <typename TShape, typename TPatternLength, typename TErrors, typename TDistance>
qgramThreshold(TShape const & shape,TPatternLength patternLength,TErrors errors,TDistance const,ThreshQGramLemma const)58 inline int qgramThreshold(TShape const & shape, TPatternLength patternLength, TErrors errors, TDistance const, ThreshQGramLemma const)
59 {
60 	int t = (int)patternLength - (int)length(shape) + 1 - (int)errors * (int)weight(shape);
61 	return (t > 0)? t: 0;
62 }
63 
64 
65 //////////////////////////////////////////////////////////////////////////////
66 // q-gram heuristic
67 //
68 // - exact for errors <= 1
69 // - upper bound
70 //////////////////////////////////////////////////////////////////////////////
71 
72 template <typename TShape, typename TPatternSize, typename TErrors, typename TDistance>
qgramThreshold(TShape const & shape,TPatternSize patternLength,TErrors errors,TDistance const,ThreshHeuristic const)73 int qgramThreshold(TShape const & shape, TPatternSize patternLength, TErrors errors, TDistance const, ThreshHeuristic const)
74 {
75 	String<unsigned char> coverage;
76 	String<bool> preserved;
77 	String<unsigned> ones;
78 	CharString bitString;
79 
80 	// initialize coverage map and bitmap of preserved q-grams
81 	resize(preserved, patternLength - length(shape) + 1, true);
82 	resize(coverage, patternLength, 0);
83 
84 	shapeToString(bitString, shape);
85 	for (unsigned i = 0; i < length(bitString); ++i)
86 		if (bitString[i] == '1')
87 		{
88 			appendValue(ones, i);
89 			for (unsigned j = 0; j < length(preserved); ++j)
90 				++coverage[i + j];
91 		}
92 
93 	// greedily destroy a maximum number of q-grams
94 	for (; errors > 0; --errors)
95 	{
96 		// find position that destroys a maximum number of q-grams
97 		unsigned maxCoverage = 0;
98 		unsigned maxCoveragePos = 0;
99 		for (unsigned i = 0; i < length(coverage); ++i)
100 			if (maxCoverage < coverage[i])
101 			{
102 				maxCoverage = coverage[i];
103 				maxCoveragePos = i;
104 			}
105 
106 		// destroy q-grams
107 		for (unsigned k = 0; k < length(ones); ++k)
108 			if (ones[k] <= maxCoveragePos)
109 			{
110 				unsigned startPos = maxCoveragePos - ones[k];
111 				if (startPos < length(preserved) && preserved[startPos])
112 				{
113 					preserved[startPos] = false;
114 					for (unsigned l = 0; l < length(ones); ++l)
115 						--coverage[startPos + ones[l]];
116 				}
117 			}
118 	}
119 
120 	unsigned thresh = 0;
121 	for (unsigned i = 0; i < length(preserved); ++i)
122 		if (preserved[i])
123 			++thresh;
124 
125 	return thresh;
126 }
127 
128 
129 
130 
131 
132 //____________________________________________________________________________
133 // Extensions to SeqAn
134 
135 	struct ErrorAlphabet_ {};
136 	typedef SimpleType<unsigned char, ErrorAlphabet_> ErrorAlphabet;
137 
138 	template <> struct ValueSize< ErrorAlphabet >    { enum { VALUE = 4 }; };
139 	template <> struct BitsPerValue< ErrorAlphabet > { enum { VALUE = 2 }; };
140 
141 	template <typename T = void>
142 	struct TranslateTableErrorToAscii_
143 	{
144 		static char const VALUE[4];
145 	};
146 	template <typename T>
147 	char const TranslateTableErrorToAscii_<T>::VALUE[4] = {'.', 'M', 'I', 'D'};
148 
149 	inline void assign(Ascii & c_target,
150 					   ErrorAlphabet const & source)
151 	{
152 	SEQAN_CHECKPOINT
153 		c_target = TranslateTableErrorToAscii_<>::VALUE[source.value];
154 	}
155 
156 
157 	struct ErrorPackedString;
158 
159 	template <typename TValue>
160 	struct Host<String<TValue, Packed<ErrorPackedString> > >
161 	{
162 		typedef String<__int64, Array<1> > Type;
163 	};
164 
165 	template <typename TValue>
166 	struct Host<String<TValue, Packed<ErrorPackedString> > const >
167 	{
168 		typedef String<__int64, Array<1> > const Type;
169 	};
170 
171 
172 //____________________________________________________________________________
173 
174 	enum ErrorType {
175 		SEQAN_MATCH    = 0,
176 		SEQAN_MISMATCH = 1,
177 		SEQAN_INSERT   = 2,
178 		SEQAN_DELETE   = 3
179 	};
180 
181 	template <typename TDistance>
182 	struct ErrorTypes {
183 		enum { VALUE = 4 };
184 	};
185 
186 	template <>
187 	struct ErrorTypes<HammingDistance> {
188 		enum { VALUE = 2 };
189 	};
190 
191 	// descriptor of the modification pattern
192 	// in the recursion it modifies the last q-gram of a read sequence
193 	template <typename TDistance, typename TFloat>
194 	struct SensitivityDPState_
195 	{
196 		enum { TRANSITIONS = ErrorTypes<TDistance>::VALUE };
197 		TFloat prob;				// probability of this state
198 		int transition[ErrorTypes<TDistance>::VALUE];	// returns previous state
199 		unsigned char len;			// length of this pattern (shapeSpan-errors <= this value <= shapeSpan+errors)
200 		unsigned char errors:4;		// errors in this state
201 		bool skipFirst:1;			// skip this pattern if it is the first
202 		bool skipLast:1;			// skip this pattern if it is the last
203 		bool intermediate:1;		// this is an intermediate result (beginning with INSERT)
204 		bool qgramHit:1;			// is this a q-gram hit? (result of the former delta function)
205 	}
206 #ifndef PLATFORM_WINDOWS
207 	__attribute__((packed))
208 #endif
209 	;
210 
211 	// descriptor of the modification pattern
212 	// in the recursion it modifies the last q-gram of a read sequence
213 	template <typename TDistance>
214 	struct ThreshDPState_
215 	{
216 		enum { TRANSITIONS = ErrorTypes<TDistance>::VALUE };
217 		int transition[ErrorTypes<TDistance>::VALUE];	// returns previous state
218 		unsigned char len;			// length of this pattern (shapeSpan-errors <= this value <= shapeSpan+errors)
219 		unsigned char errors:4;		// errors in this state
220 		bool skipFirst:1;			// skip this pattern if it is the first
221 		bool skipLast:1;			// skip this pattern if it is the last
222 		bool intermediate:1;		// this is an intermediate result (beginning with INSERT)
223 		bool qgramHit:1;			// is this a q-gram hit? (result of the former delta function)
224 	}
225 #ifndef PLATFORM_WINDOWS
226 	__attribute__((packed))
227 #endif
228 	;
229 
230 #ifdef PLATFORM_WINDOWS
231 
232 	template<typename TValue>
233 	inline bool isnan(TValue value)
234 	{
235 		return value != value;
236 	}
237 
238 	template<typename TValue>
239 	inline bool isinf(TValue value)
240 	{
241 		return value == log(0.0);
242 	}
243 
244 #else
245 
246 	template<typename TValue>
247 	inline bool isnan(TValue value)
248 	{
249 		return std::isnan(value);
250 	}
251 
252 	template<typename TValue>
253 	inline bool isinf(TValue value)
254 	{
255 		return std::isinf(value);
256 	}
257 
258 #endif
259 
260 
261 	template <typename TValue>
262 	inline long double
263 	_transform(TValue a)
264 	{
265 #ifdef USE_LOGVALUES
266 		return log(a);
267 #else
268 		return a;
269 #endif
270 	}
271 
272 	template <typename TValue>
273 	inline long double
274 	_transformBack(TValue a)
275 	{
276 #ifdef USE_LOGVALUES
277 		return exp(a);
278 #else
279 		return a;
280 #endif
281 	}
282 
283 	//////////////////////////////////////////////////////////////////////////////
284 	// Returns the sum of two probability values in log space
285 	template <typename TValue>
286 	inline void
287 	_probAdd(TValue &a, TValue b)
288 	{
289 #ifdef USE_LOGVALUES
290 		if (isinf(a)) {
291 			a = b;
292 			return;
293 		}
294 		if (isinf(b)) return;
295 		if (isnan(a + log(1 + exp(b - a)))) return;
296 		a += log(1 + exp(b - a));
297 #else
298 		a += b;
299 #endif
300 	}
301 
302 	template <typename TValue>
303 	inline TValue
304 	_probMul(TValue a, TValue b)
305 	{
306 #ifdef USE_LOGVALUES
307 		return a + b;
308 #else
309 		return a * b;
310 #endif
311 	}
312 
313 	template <typename TValue>
314 	inline TValue
315 	_probDiv(TValue a, TValue b)
316 	{
317 #ifdef USE_LOGVALUES
318 		return a - b;
319 #else
320 		return a / b;
321 #endif
322 	}
323 
324 
325 struct ErrorPatternLess
326 {
327 	template <typename TPattern>
328 	bool operator() (TPattern const &a, TPattern const &b) const
329 	{
330 		typedef typename Iterator<TPattern const>::Type TIter;
331 		TIter itA = end(a, Standard());
332 		TIter itB = end(b, Standard());
333 		TIter itEnd;
334 		if (length(a) <= length(b))
335 		{
336 			itEnd = begin(a, Standard());
337 			for (; itA != itEnd;)
338 			{
339 				--itA;
340 				--itB;
341 				if (*itA < *itB) return true;
342 				if (*itA > *itB) return false;
343 			}
344 			return false;
345 		} else
346 		{
347 			itEnd = begin(b, Standard());
348 			for (; itB != itEnd;)
349 			{
350 				--itA;
351 				--itB;
352 				if (*itA < *itB) return true;
353 				if (*itA > *itB) return false;
354 			}
355 			return true;
356 		}
357 	}
358 };
359 
360 template <typename TPatternStore, typename TPattern>
361 inline int
362 _getErrorPatternIndex(TPatternStore const &patternStore, TPattern const &pattern)
363 {
364 	typedef typename Iterator<TPatternStore const>::Type TIter;
365 	TIter lb = std::lower_bound(begin(patternStore, Standard()), end(patternStore, Standard()), pattern, ErrorPatternLess());
366 	TIter invalid = end(patternStore, Standard());
367 	if (lb != invalid && *lb == pattern) {
368 //		std::cout << pattern;
369 		return lb - begin(patternStore, Standard());
370 	} else {
371 /*		std::cerr << "  !Pattern Not Found! " << pattern;
372 		if (lb != invalid) std::cerr << "\tnext is " << *lb;
373 		std::cerr << std::endl;
374 */		return -1;
375 	}
376 }
377 
378 // Cut 1 read character and trailing INSERTs of the pattern
379 template <typename TPattern>
380 inline int
381 _cutErrorPattern(TPattern &_pattern)
382 {
383 	typedef typename Iterator<TPattern const, Standard>::Type TIter;
384 	TPattern const & pattern = const_cast<TPattern const&>(_pattern);
385 	TIter it = end(pattern, Standard());
386 	int cuttedErrors = -2;
387 
388 	// cut trailing INSERTs
389 	do {
390 		--it;
391 		++cuttedErrors;
392 	} while ((int)getValue(it) == SEQAN_INSERT);
393 
394 	// cut non INSERT
395 	if ((int)getValue(it) != SEQAN_MATCH)
396 		++cuttedErrors;
397 
398 	//  and all adjacent INSERTs
399 	do {
400 		--it;
401 		++cuttedErrors;
402 	} while ((int)getValue(it) == SEQAN_INSERT);
403 
404 	resize(_pattern, 1 + (it - begin(pattern, Standard())));
405 	return cuttedErrors;
406 }
407 
408 template < typename TLogErrorDistr >
409 typename Value<TLogErrorDistr>::Type
410 _getProb(TLogErrorDistr const &logError, int errorType, int readPos)
411 {
412 	int maxN = length(logError) / 4;
413 	SEQAN_ASSERT(readPos >= 0 && readPos < maxN);
414 	return logError[maxN * (int)errorType + readPos];
415 }
416 
417 //////////////////////////////////////////////////////////////////////////////
418 // Returns log probability of q-gram-configuration q ending at position pos in sequence
419 template < typename TState, typename TLogErrorDistr, typename TPattern >
420 inline void
421 _getLastPatternProb(TState &state, TLogErrorDistr const &logError, TPattern const &pattern, int span)
422 {
423 	int maxN = length(logError) / 4;
424 	typename Value<TLogErrorDistr>::Type prob = _transform(1.0);
425 	for (int i = 0, j = 0; j < (int)length(pattern); ++j)
426 	{
427 		prob = _probMul(prob, _getProb(logError, getValue(pattern, j), maxN - span + i));
428 		if ((int)getValue(pattern, j) != SEQAN_INSERT)
429 			++i;
430 	}
431 	state.prob = prob;
432 }
433 
434 template < typename TState, typename TPattern >
435 inline void
436 _getLastPatternProb(TState &, Nothing const &, TPattern const &, int)
437 {
438 }
439 
440 
441 //////////////////////////////////////////////////////////////////////////////
442 // Initialize states-string for edit/hamming-distance filters
443 template <
444 	typename TStateString,
445 	typename TShape,
446 	typename TLogErrorDistr,
447 	typename TDistance >
448 void initPatterns(
449 	TStateString &states,				// resulting states-string
450 	TShape const &bitShape,				// bit-string of the shape
451 	int maxErrors,						// allowed errors per pattern
452 	TLogErrorDistr const &logError,		// error distribution (Nothing or string of 4*patternLen floats)
453 	TDistance,							// enumerate hamming or edit distance patterns
454 	bool optionMinOutput)				// omit output
455 {
456 #ifndef DEBUG_RECOG_DP
457 //	typedef String<ErrorAlphabet, Packed<ErrorPackedString> >	TPattern;
458 	typedef String<ErrorAlphabet>								TPattern;
459 #endif
460 
461 	typedef typename Iterator<TPattern, Standard>::Type			TIter;
462 	typedef typename Value<TStateString>::Type					TState;
463 
464 	ErrorType lastErrorType = (IsSameType<TDistance, HammingDistance>::VALUE)? SEQAN_MISMATCH: SEQAN_DELETE;
465 
466 	SEQAN_ASSERT(SEQAN_MATCH == 0);
467 	SEQAN_ASSERT((length(logError) % 4) == 0);
468 
469 #ifndef DEBUG_RECOG_DP
470 	String<TPattern> patternStore;
471 #endif
472 
473 	// a modifier is a pair of position and error type
474 	String<Pair<int, ErrorType> > mods;
475 	resize(mods, maxErrors, Pair<int, ErrorType> (0, SEQAN_MATCH));
476 
477 	TPattern pattern;
478 	int span = length(bitShape);
479 
480 	//////////////////////////////////////////////////////////////////////////////
481 	// Enumerate all edit-modification patterns with up to k errors
482 	if (maxErrors == 0)
483 	{
484 		resize(pattern, span, (ErrorAlphabet)SEQAN_MATCH);
485 		appendValue(patternStore, pattern, Generous());
486 	}
487 	else
488 	do
489 	{
490 		clear(pattern);
491 		resize(pattern, span, (ErrorAlphabet)SEQAN_MATCH);
492 
493 		// place errors in the pattern
494 		bool skip = false;
495 		for (int i = 0; (i < maxErrors) && !skip; ++i)
496 		{
497 //			std::cout << mods[i].i1 << " " << (ErrorAlphabet)mods[i].i2 << "\t";
498 			switch (mods[i].i2)
499 			{
500 			case SEQAN_MISMATCH:
501 			case SEQAN_DELETE:
502 				if (pattern[mods[i].i1] != (ErrorAlphabet)SEQAN_MATCH)
503 				{
504 					skip = true;
505 					break;
506 				}
507 				pattern[mods[i].i1] = (ErrorAlphabet)mods[i].i2;
508 				break;
509 
510 			case SEQAN_INSERT:
511 				insertValue(pattern, mods[i].i1, (ErrorAlphabet)SEQAN_INSERT);
512 				break;
513 
514 			case SEQAN_MATCH:
515 				break;
516 			}
517 		}
518 
519 		// remove redundant patterns
520 		if (!skip)
521 		{
522 			TIter it = begin(pattern, Standard());
523 			TIter itEnd = end(pattern, Standard());
524 			int left = getValue(it);
525 			int right;
526 			for (++it; (it != itEnd) && !skip; ++it, left = right)
527 			{
528 				right = getValue(it);
529 
530 #ifdef NON_REDUNDANT
531 				if (left == SEQAN_MISMATCH && right == SEQAN_DELETE)
532 					skip = true;	// MISMATCH before DELETE is DELETE before MISMATCH (already enumerated)
533 
534 				if (left == SEQAN_MISMATCH && right == SEQAN_INSERT)
535 					skip = true;	// MISMATCH before INSERT is INSERT before MISMATCH (already enumerated)
536 
537 				if (left == SEQAN_INSERT && right == SEQAN_DELETE)
538 					skip = true;	// INSERT before DELETE is one MISMATCH (already enumerated)
539 
540 				if (left == SEQAN_DELETE && right == SEQAN_INSERT)
541 					skip = true;	// DELETE before INSERT is one MISMATCH (already enumerated)
542 #endif
543 			}
544 			if (left == SEQAN_INSERT)
545 				skip = true;		// no trailing INSERT allowed
546 		}
547 
548 		if (!skip)
549 		{
550 			appendValue(patternStore, pattern, Generous());
551 //			std::cout << pattern << std::endl;
552 		}
553 
554 		// reposition modifiers
555 		int i = 0;
556 		for (; i < maxErrors; ++i)
557 		{
558 			if (mods[i].i2 == SEQAN_MATCH) continue;
559 			int endPos = (mods[i].i2 == SEQAN_INSERT)? span + 1: span;
560 			if (++mods[i].i1 < endPos)
561 			{
562 				for(--i; i >= 0; --i)
563 					mods[i].i1 = mods[i + 1].i1;
564 				break;
565 			}
566 		}
567 
568 		if (i < maxErrors) continue;
569 
570 		for (i = 0; i < maxErrors; ++i)
571 			mods[i].i1 = 0;
572 
573 		// next state combination
574 		for (i = 0; i < maxErrors; ++i)
575 		{
576 			if (mods[i].i2 == lastErrorType) continue;
577 			mods[i].i2 = (ErrorType)(mods[i].i2 + 1);
578 			for(--i; i >= 0; --i)
579 				mods[i].i2 = SEQAN_MISMATCH;
580 			break;
581 		}
582 
583 		if (i == maxErrors) break;
584 
585 	} while (true);
586 
587 	if (!optionMinOutput)
588 		std::cout << "Stored " << length(patternStore) << " modification patterns" << std::flush;
589 
590 	reserve(patternStore, length(patternStore), Exact());
591 	std::sort(begin(patternStore, Standard()), end(patternStore, Standard()), ErrorPatternLess());
592 	for (int p = 1; p < (int)length(patternStore); ++p)
593 	{
594 		if (patternStore[p-1] == patternStore[p])
595 			std::cerr << "  !Found duplicate! " << patternStore[p] << std::endl;
596 	}
597 
598 	if (!optionMinOutput)
599 		std::cout << " and sorted them." << std::endl;
600 
601 	//////////////////////////////////////////////////////////////////////////////
602 	// Calculate transitions
603 	resize(states, length(patternStore));
604 	for (int p = 0; p < (int)length(patternStore); ++p)
605 	{
606 		pattern = patternStore[p];
607 		TState &state = states[p];
608 
609 //		std::cout << pattern << "\t";
610 
611 		// count errors of current pattern
612 		int errors = 0;
613 		for (int i = 0; i < (int)length(pattern); ++i)
614 			if ((int)getValue(pattern, i) != SEQAN_MATCH)
615 				++errors;
616 
617 		state.len = length(pattern);
618 		state.errors = errors;
619 		state.intermediate = (int)getValue(pattern, 0) == SEQAN_INSERT;
620 		_getLastPatternProb(state, logError, pattern, span);
621 //		std::cout << pattern << "\t";
622 
623 		state.skipFirst = false;
624 		state.skipLast = false;
625 
626 #ifdef NON_REDUNDANT
627 		int err = 0, del = 0;
628 		for (int j = 0; j < (int)length(pattern); ++j)
629 		{
630 			switch ((int)getValue(pattern, j)) {
631 				case SEQAN_MATCH:
632 					++del;
633 					break;
634 
635 				case SEQAN_DELETE:
636 					++del;
637 
638 				case SEQAN_INSERT:
639 					++err;
640 					break;
641 
642 				default:;
643 			}
644 			if (del > 0 && del <= err)
645 				state.skipFirst = true;
646 		}
647 		err = del = 0;
648 		for (int j = (int)length(pattern) - 1; j >= 0; --j)
649 		{
650 			switch ((int)getValue(pattern, j)) {
651 				case SEQAN_MATCH:
652 					++del;
653 					break;
654 
655 				case SEQAN_DELETE:
656 					++del;
657 
658 				case SEQAN_INSERT:
659 					++err;
660 					break;
661 
662 				default:;
663 			}
664 			if (del > 0 && del <= err)
665 				state.skipLast = true;
666 		}
667 #else
668 		state.skipFirst = (int)getValue(pattern, 0) == SEQAN_INSERT;
669 #endif
670 		// apply pattern to read q-gram
671 		// and check if shape is recognized in the genome
672 		state.qgramHit = false;
673 		int delta = 0;
674 		for (int j = 0, readPos = 0, genomePos = 0; j < (int)length(pattern); ++j)
675 		{
676 			switch ((int)getValue(pattern, j))
677 			{
678 				case SEQAN_MATCH:
679 					if (readPos == 0) {
680 						// assert(bitShape[0] == '1')
681 						delta = genomePos;
682 						state.qgramHit = true;
683 					} else
684 						if (bitShape[readPos] == '1')
685 							state.qgramHit &= (readPos + delta == genomePos);
686 //					std::cout << readPos;
687 					++readPos; ++genomePos;
688 					break;
689 				case SEQAN_MISMATCH:
690 					// was it a relevant read position?
691 					if (bitShape[readPos] == '1')
692 						state.qgramHit = false;
693 //					std::cout << 'x';
694 					++readPos; ++genomePos;
695 					break;
696 				case SEQAN_DELETE:
697 					// was it a relevant read position?
698 					if (bitShape[readPos] == '1')
699 						state.qgramHit = false;
700 					++readPos;
701 					break;
702 				case SEQAN_INSERT:
703 					++genomePos;
704 //					std::cout << 'x';
705 			}
706 		}
707 //		std::cout << std::endl;
708 
709 		// prepend INSERT
710 		++errors;
711 		insertValue(pattern, 0, SEQAN_INSERT);
712 		if ((int)SEQAN_INSERT < (int)state.TRANSITIONS)
713 		{
714 			if (errors <= maxErrors)
715 				state.transition[SEQAN_INSERT] = _getErrorPatternIndex(patternStore, pattern);
716 			else
717 				state.transition[SEQAN_INSERT] = -1;
718 		}
719 
720 		// prepend MISMATCH and cut INSERTS
721 		errors -= _cutErrorPattern(pattern);
722 		if ((int)SEQAN_MISMATCH < (int)state.TRANSITIONS)
723 		{
724 			pattern[0] = SEQAN_MISMATCH;
725 			if (errors <= maxErrors)
726 				state.transition[SEQAN_MISMATCH] = _getErrorPatternIndex(patternStore, pattern);
727 			else
728 				state.transition[SEQAN_MISMATCH] = -1;
729 		}
730 
731 		// prepend DELETE
732 		if ((int)SEQAN_DELETE < (int)state.TRANSITIONS)
733 		{
734 			pattern[0] = SEQAN_DELETE;
735 			if (errors <= maxErrors)
736 				state.transition[SEQAN_DELETE] = _getErrorPatternIndex(patternStore, pattern);
737 			else
738 				state.transition[SEQAN_DELETE] = -1;
739 		}
740 
741 		// prepend MATCH
742 		if ((int)SEQAN_MATCH < (int)state.TRANSITIONS)
743 		{
744 			--errors;
745 			pattern[0] = SEQAN_MATCH;
746 			if (errors <= maxErrors)
747 				state.transition[SEQAN_MATCH] = _getErrorPatternIndex(patternStore, pattern);
748 			else
749 				state.transition[SEQAN_MATCH] = -1;
750 		}
751 /*
752 		std::cout << "\t" << state.errors;
753 		std::cout << "\t" << state.qgramHit;
754 		std::cout << "\t" << state.leftError;
755 		std::cout << "\t" << state.rightError;
756 		std::cout << "\t" << state.transition[0];
757 		std::cout << "\t" << state.transition[1];
758 		std::cout << "\t" << state.transition[2];
759 		std::cout << "\t" << state.transition[3];
760 		std::cout << std::endl;
761 */	}
762 	if (!optionMinOutput)
763 		std::cout << "Preprocessing finished." << std::endl;
764 }
765 
766 //////////////////////////////////////////////////////////////////////////////
767 // Compute filtering loss of any q-gram filter (given a states-string)
768 template <
769 	typename TThreshString,
770 	typename TStateString >
771 void computeExactQGramThreshold(
772 	TThreshString &treshPerError,
773 	TStateString const &states,
774 	int span,
775 	int maxErrors,
776 	int maxN,
777 	bool optionMinOutput)
778 {
779 	typedef typename Value<TStateString>::Type		TState;
780 	typedef unsigned								TThresh;
781 	typedef String<TThresh>							TMatrixCol;
782 
783 	int statesCount = length(states);
784 //	int span = length(bitShape);
785 
786 	// columns n-1 and n for recursion
787 	TMatrixCol col0;	// addressing is colx[errors * statesCount + state]
788 	TMatrixCol col1;
789 	const TThresh infty = MaxValue<TThresh>::VALUE >> 1;
790 
791 	resize(col0, maxErrors * statesCount, infty);
792 	resize(col1, maxErrors * statesCount);
793 
794 	// RECURSION BEGIN
795 	for (int s = 0; s < statesCount; ++s)
796 	{
797 		TState const &state = states[s];
798 		if (state.skipFirst) continue;
799 
800 		// threshold is 1 iff we have a q-gram hit at the end
801 		col0[s] = (state.qgramHit)? 1: 0;
802 	}
803 
804 	// iterate over sequence length n
805 	TMatrixCol *col = &col1;
806 	TMatrixCol *colPrev = &col0;
807 
808 #ifdef DEBUG_RECOG_DP
809 	std::cout << span << ":0";
810 	dump(col0, 0,statesCount);
811 	std::cout << " :1";
812 	dump(col0, 1,statesCount);
813 #endif
814 
815 
816 	// RECURSION
817 	//
818 	// thresh(n,q,e) = min(thresh(n-1,0|(q>>1),e),              delta=1/0 <-> q hat 0/>0 error
819 	//                     thresh(n-1,1|(q>>1),e-1)) + delta
820 
821 	for (int n = span; n < maxN; ++n)
822 	{
823 		for (int e = 0; e < maxErrors * statesCount; e += statesCount)
824 		{
825 			for (int s = 0; s < statesCount; ++s)
826 			{
827 				TState const &state = states[s];
828 
829 				// MATCH
830 				TThresh t = (*colPrev)[e + state.transition[SEQAN_MATCH]];
831 
832 				// MISMATCH, INSERT, DELETE
833 				if (e > 0)
834 					for (int m = SEQAN_MISMATCH; m < TState::TRANSITIONS; ++m)
835 					{
836 						int prevState = state.transition[m];
837 						if (prevState >= 0)
838 						{
839 							if (m == SEQAN_INSERT)
840 								t = _min(t, (*col)[(e - statesCount) + prevState]);
841 							else
842 								t = _min(t, (*colPrev)[(e - statesCount) + prevState]);
843 						}
844 					}
845 
846 				(*col)[e + s] = t + state.qgramHit;
847 			}
848 			if (!optionMinOutput)
849 				std::cout << '.' << std::flush;
850 		}
851 
852 		TMatrixCol *tmp = col;
853 		col = colPrev;
854 		colPrev = tmp;
855 
856 #ifdef DEBUG_RECOG_DP
857 		std::cout << n+1 << ":0";
858 		dump(*colPrev, 0,statesCount);
859 		std::cout << " :1";
860 		dump(*colPrev, 1,statesCount);
861 		std::cout << " :2";
862 		dump(*colPrev, 2,statesCount);
863 #endif
864 	}
865 
866 	if (!optionMinOutput)
867 		std::cout << std::endl;
868 
869 	resize(treshPerError, maxErrors);
870 
871 	// RECURSION END
872 	for (int eSum = 0; eSum < maxErrors; ++eSum)
873 	{
874 		TThresh t = infty;
875 		for (int s = 0; s < statesCount; ++s)
876 		{
877 			TState const &state = states[s];
878 
879 			// skip intermediate results
880 			if (state.intermediate || state.skipLast) continue;
881 			if (state.errors <= eSum)
882 			{
883 				int e = eSum - state.errors;
884 				// multiply probability for the trailing pattern
885 				t = _min(t, (*colPrev)[e * statesCount + s]);
886 			}
887 		}
888 
889 		if (t >= infty) t = 0;
890 		treshPerError[eSum] = t;
891 	}
892 }
893 
894 
895 //////////////////////////////////////////////////////////////////////////////
896 // Compute filtering loss of any q-gram filter (given a states-string)
897 template <
898 	typename TLossString,
899 	typename TLogErrorDistr,
900 	typename TStateString >
901 void computeQGramFilteringSensitivity(
902 	TLossString &found,
903 	TStateString const &states,
904 	int span,
905 	int maxT,
906 	int maxErrors,
907 	TLogErrorDistr const &logError,
908 //	bool optionAbsolute = false,
909 	bool optionMinOutput)
910 {
911 	typedef typename Value<TLossString>::Type		TFloat;
912 	typedef typename Value<TLogErrorDistr>::Type	TProbValue;
913 	typedef typename Value<TStateString>::Type		TState;
914 
915 	typedef String<TFloat>							TMatrixCol;
916 	typedef String<int>								TIntCol;
917 
918 	SEQAN_ASSERT((length(logError) % 4) == 0);
919 
920 	int maxN = length(logError) / 4;
921 	int statesCount = length(states);
922 	const bool optionAbsolute = false;
923 //	int span = length(bitShape);
924 
925 	// columns n-1 and n for recursion
926 	TMatrixCol col0;
927 	TMatrixCol col1;
928 	resize(col0, maxErrors * statesCount * maxT, (TFloat)_transform(0.0));
929 	resize(col1, maxErrors * statesCount * maxT);
930 
931 #ifdef COUNT_LOSSES
932 	TFloat positive = _transform(0.0);
933 	TFloat negative = _transform(1.0);
934 #else
935 	TFloat positive = _transform(1.0);
936 	TFloat negative = _transform(0.0);
937 #endif
938 
939 	// RECURSION BEGIN
940 	for (int s = 0; s < statesCount; ++s)
941 	{
942 		TState const &state = states[s];
943 
944 		if (state.skipFirst) continue;
945 
946 		// we miss no match if threshold t is 0
947 		col0[s*maxT] = positive;
948 
949 		// for n==0
950 		if (state.qgramHit)
951 		{
952 			// we miss no match if read q-gram is recognized
953 			// --> probability of finding this MMP is 1, if t=1
954 			col0[s*maxT+1] = positive;
955 			// --> probability of finding this MMP is 0, if t>1
956 			for (int t = 2; t < maxT; ++t)
957 				col0[s*maxT+t] = negative;
958 		} else
959 		{
960 			// we miss 1 match if t>0 and read q-gram is not recognized
961 			// --> probability of finding this MMP is 0, if t>=1
962 			for (int t = 1; t < maxT; ++t)
963 				col0[s*maxT+t] = negative;
964 		}
965 	}
966 
967 	// iterate over sequence length n
968 	TMatrixCol *col = &col1;
969 	TMatrixCol *colPrev = &col0;
970 
971 #ifdef DEBUG_RECOG_DP
972 	::std::cout << span << ":0";
973 	dump(col0, 0,statesCount);
974 	::std::cout << " :1";
975 	dump(col0, 1,statesCount);
976 #endif
977 
978 
979 	// RECURSION
980 	//
981 	// found(n,q,t,e) = (1-errorProb[n-span]) * found(n-1,0|(q>>1),t-delta,e) delta=1/0 <-> q hat 0/>0 fehler
982 	//               + errorProb[n-span] * found(n-1,1|(q>>1),t-delta,e-1)
983 
984 	// rekursion (fuer q-gram matches <=1 fehler)
985 	// found(n,q,t,e) = (1-errorProb[n-span]) * found(n-1,0|(q>>1),t-delta,e) delta=1/0 <-> q hat <=1/>1 fehler
986 	//               + errorProb[n-span] * found(n-1,1|(q>>1),t-delta,e-1)
987 
988 	for (int n = span; n < maxN; ++n)
989 	{
990 		for (int e = 0; e < maxErrors * statesCount; e += statesCount)
991 		{
992 			for (int s = 0; s < statesCount; ++s)
993 			{
994 				TState const &state = states[s];
995 				for (int t = 0; t < maxT; ++t)
996 				{
997 					int _t = t;
998 					if (_t > 0 && state.qgramHit) --_t;
999 
1000 					// MATCH
1001 					TFloat recovered = _probMul(
1002 						_getProb(logError, SEQAN_MATCH, n-span),
1003 						(*colPrev)[(e+state.transition[SEQAN_MATCH])*maxT+_t]);
1004 
1005 					// MISMATCH, INSERT, DELETE
1006 					for (int m = SEQAN_MISMATCH; m < 4; ++m)
1007 						if (e > 0)
1008 						{
1009 							int prevState = state.transition[m];
1010 							if (prevState >= 0)
1011 							{
1012 								if (m == SEQAN_INSERT)
1013 									_probAdd(recovered, _probMul(_getProb(logError,m,n-span), (*col)[((e-statesCount)+prevState)*maxT+t]));
1014 								else
1015 									_probAdd(recovered, _probMul(_getProb(logError,m,n-span), (*colPrev)[((e-statesCount)+prevState)*maxT+_t]));
1016 							}
1017 						}
1018 					(*col)[(e+s)*maxT+t] = recovered;
1019 				}
1020 			}
1021 			if (!optionMinOutput)
1022 				::std::cout << '.' << ::std::flush;
1023 		}
1024 
1025 		TMatrixCol *tmp = col;
1026 		col = colPrev;
1027 		colPrev = tmp;
1028 
1029 #ifdef DEBUG_RECOG_DP
1030 		::std::cout << n+1 << ":0";
1031 		dump(*colPrev, 0,statesCount);
1032 		::std::cout << " :1";
1033 		dump(*colPrev, 1,statesCount);
1034 		::std::cout << " :2";
1035 		dump(*colPrev, 2,statesCount);
1036 #endif
1037 	}
1038 
1039 	if (!optionMinOutput)
1040 		::std::cout << ::std::endl;
1041 
1042 	// RECURSION END
1043 	for (int eSum = 0; eSum < maxErrors; ++eSum)
1044 		for (int t = 0; t < maxT; ++t)
1045 		{
1046 			TFloat recovered = _transform(0.0);
1047 			for (int s = 0; s < statesCount; ++s)
1048 			{
1049 				TState const &state = states[s];
1050 
1051 				// skip intermediate results
1052 				if (state.intermediate || state.skipLast) continue;
1053 				if (state.errors <= eSum)
1054 				{
1055 					int e = eSum - state.errors;
1056 					// multiply probability for the trailing pattern
1057 					_probAdd(recovered, _probMul(state.prob, (*colPrev)[(e*statesCount+s)*maxT+t]));
1058 				}
1059 			}
1060 
1061 #ifndef COUNT_LOSSES
1062 			// we can only normalize probs if t==0 contains all k-pattern probs
1063 			if (t > 0 && !optionAbsolute)
1064 				recovered = _probDiv(recovered, found[eSum*maxT]);
1065 #endif
1066 
1067 			found[eSum*maxT+t] = recovered;
1068 		}
1069 }
1070 
1071 
1072 //////////////////////////////////////////////////////////////////////////////
1073 // q-gram threshold DP algorithm
1074 //
1075 // - exact threshold
1076 //////////////////////////////////////////////////////////////////////////////
1077 
1078 template <typename TShape, typename TPatternSize, typename TErrors, typename TDistance>
1079 int qgramThreshold(TShape const & shape, TPatternSize patternLength, TErrors errors, TDistance const dist, ThreshExact const)
1080 {
1081 	String<ThreshDPState_<TDistance> > states;
1082 	String<unsigned> thresh;
1083 	String<char> bitString;
1084 
1085 	shapeToString(bitString, shape);
1086 	initPatterns(states, bitString, errors, Nothing(), dist, true);
1087 	computeExactQGramThreshold(thresh, states, length(bitString), errors + 1, patternLength, true);
1088 
1089 	return thresh[errors];
1090 }
1091 
1092 //////////////////////////////////////////////////////////////////////////////
1093 // q-gram filter sensitivity DP algorithm
1094 //
1095 // - exact threshold
1096 //////////////////////////////////////////////////////////////////////////////
1097 
1098 template <typename TSensitivityMatrix, typename TShape, typename TPatternSize, typename TErrors, typename TThresh, typename TDistance, typename TErrorDist>
1099 void qgramFilteringSensitivity(
1100 	TSensitivityMatrix & sensMat,
1101 	TShape const & shape,
1102 	TPatternSize patternLength,
1103 	TErrors errors,
1104 	TThresh maxThresh,
1105 	TDistance const dist,
1106 	ThreshExact const,
1107 	TErrorDist const & logErrorDistribution)
1108 {
1109 	typedef typename Value<TSensitivityMatrix>::Type TFloat;
1110 	String<SensitivityDPState_<TDistance, TFloat> > states;
1111 	String<unsigned> thresh;
1112 	String<char> bitString;
1113 
1114 	maxThresh = _min(maxThresh, patternLength - length(shape) + 1);
1115 	resize(sensMat, (maxThresh + 1) * (errors + 1));
1116 	shapeToString(bitString, shape);
1117 
1118 	initPatterns(states, bitString, errors, logErrorDistribution, dist, true);
1119 	computeQGramFilteringSensitivity(sensMat, states, length(bitString), maxThresh + 1, errors + 1, logErrorDistribution, true);
1120 }
1121 
1122 }	// namespace seqan
1123 
1124 #endif
1125