1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34
35 #ifndef SEQAN_HEADER_SHAPE_THRESHOLD_H
36 #define SEQAN_HEADER_SHAPE_THRESHOLD_H
37
38 namespace SEQAN_NAMESPACE_MAIN
39 {
40
41 struct ThreshQGramLemma_;
42 struct ThreshExact_;
43 struct ThreshHeuristic_;
44
45 typedef Tag<ThreshQGramLemma_> const ThreshQGramLemma;
46 typedef Tag<ThreshHeuristic_> const ThreshHeuristic;
47 typedef Tag<ThreshExact_> const ThreshExact;
48
49
50 //////////////////////////////////////////////////////////////////////////////
51 // q-gram lemma
52 //
53 // - exact for ungapped shapes or errors <= 1
54 // - lower bound gapped shapes
55 //////////////////////////////////////////////////////////////////////////////
56
57 template <typename TShape, typename TPatternLength, typename TErrors, typename TDistance>
qgramThreshold(TShape const & shape,TPatternLength patternLength,TErrors errors,TDistance const,ThreshQGramLemma const)58 inline int qgramThreshold(TShape const & shape, TPatternLength patternLength, TErrors errors, TDistance const, ThreshQGramLemma const)
59 {
60 int t = (int)patternLength - (int)length(shape) + 1 - (int)errors * (int)weight(shape);
61 return (t > 0)? t: 0;
62 }
63
64
65 //////////////////////////////////////////////////////////////////////////////
66 // q-gram heuristic
67 //
68 // - exact for errors <= 1
69 // - upper bound
70 //////////////////////////////////////////////////////////////////////////////
71
72 template <typename TShape, typename TPatternSize, typename TErrors, typename TDistance>
qgramThreshold(TShape const & shape,TPatternSize patternLength,TErrors errors,TDistance const,ThreshHeuristic const)73 int qgramThreshold(TShape const & shape, TPatternSize patternLength, TErrors errors, TDistance const, ThreshHeuristic const)
74 {
75 String<unsigned char> coverage;
76 String<bool> preserved;
77 String<unsigned> ones;
78 CharString bitString;
79
80 // initialize coverage map and bitmap of preserved q-grams
81 resize(preserved, patternLength - length(shape) + 1, true);
82 resize(coverage, patternLength, 0);
83
84 shapeToString(bitString, shape);
85 for (unsigned i = 0; i < length(bitString); ++i)
86 if (bitString[i] == '1')
87 {
88 appendValue(ones, i);
89 for (unsigned j = 0; j < length(preserved); ++j)
90 ++coverage[i + j];
91 }
92
93 // greedily destroy a maximum number of q-grams
94 for (; errors > 0; --errors)
95 {
96 // find position that destroys a maximum number of q-grams
97 unsigned maxCoverage = 0;
98 unsigned maxCoveragePos = 0;
99 for (unsigned i = 0; i < length(coverage); ++i)
100 if (maxCoverage < coverage[i])
101 {
102 maxCoverage = coverage[i];
103 maxCoveragePos = i;
104 }
105
106 // destroy q-grams
107 for (unsigned k = 0; k < length(ones); ++k)
108 if (ones[k] <= maxCoveragePos)
109 {
110 unsigned startPos = maxCoveragePos - ones[k];
111 if (startPos < length(preserved) && preserved[startPos])
112 {
113 preserved[startPos] = false;
114 for (unsigned l = 0; l < length(ones); ++l)
115 --coverage[startPos + ones[l]];
116 }
117 }
118 }
119
120 unsigned thresh = 0;
121 for (unsigned i = 0; i < length(preserved); ++i)
122 if (preserved[i])
123 ++thresh;
124
125 return thresh;
126 }
127
128
129
130
131
132 //____________________________________________________________________________
133 // Extensions to SeqAn
134
135 struct ErrorAlphabet_ {};
136 typedef SimpleType<unsigned char, ErrorAlphabet_> ErrorAlphabet;
137
138 template <> struct ValueSize< ErrorAlphabet > { enum { VALUE = 4 }; };
139 template <> struct BitsPerValue< ErrorAlphabet > { enum { VALUE = 2 }; };
140
141 template <typename T = void>
142 struct TranslateTableErrorToAscii_
143 {
144 static char const VALUE[4];
145 };
146 template <typename T>
147 char const TranslateTableErrorToAscii_<T>::VALUE[4] = {'.', 'M', 'I', 'D'};
148
149 inline void assign(Ascii & c_target,
150 ErrorAlphabet const & source)
151 {
152 SEQAN_CHECKPOINT
153 c_target = TranslateTableErrorToAscii_<>::VALUE[source.value];
154 }
155
156
157 struct ErrorPackedString;
158
159 template <typename TValue>
160 struct Host<String<TValue, Packed<ErrorPackedString> > >
161 {
162 typedef String<__int64, Array<1> > Type;
163 };
164
165 template <typename TValue>
166 struct Host<String<TValue, Packed<ErrorPackedString> > const >
167 {
168 typedef String<__int64, Array<1> > const Type;
169 };
170
171
172 //____________________________________________________________________________
173
174 enum ErrorType {
175 SEQAN_MATCH = 0,
176 SEQAN_MISMATCH = 1,
177 SEQAN_INSERT = 2,
178 SEQAN_DELETE = 3
179 };
180
181 template <typename TDistance>
182 struct ErrorTypes {
183 enum { VALUE = 4 };
184 };
185
186 template <>
187 struct ErrorTypes<HammingDistance> {
188 enum { VALUE = 2 };
189 };
190
191 // descriptor of the modification pattern
192 // in the recursion it modifies the last q-gram of a read sequence
193 template <typename TDistance, typename TFloat>
194 struct SensitivityDPState_
195 {
196 enum { TRANSITIONS = ErrorTypes<TDistance>::VALUE };
197 TFloat prob; // probability of this state
198 int transition[ErrorTypes<TDistance>::VALUE]; // returns previous state
199 unsigned char len; // length of this pattern (shapeSpan-errors <= this value <= shapeSpan+errors)
200 unsigned char errors:4; // errors in this state
201 bool skipFirst:1; // skip this pattern if it is the first
202 bool skipLast:1; // skip this pattern if it is the last
203 bool intermediate:1; // this is an intermediate result (beginning with INSERT)
204 bool qgramHit:1; // is this a q-gram hit? (result of the former delta function)
205 }
206 #ifndef PLATFORM_WINDOWS
207 __attribute__((packed))
208 #endif
209 ;
210
211 // descriptor of the modification pattern
212 // in the recursion it modifies the last q-gram of a read sequence
213 template <typename TDistance>
214 struct ThreshDPState_
215 {
216 enum { TRANSITIONS = ErrorTypes<TDistance>::VALUE };
217 int transition[ErrorTypes<TDistance>::VALUE]; // returns previous state
218 unsigned char len; // length of this pattern (shapeSpan-errors <= this value <= shapeSpan+errors)
219 unsigned char errors:4; // errors in this state
220 bool skipFirst:1; // skip this pattern if it is the first
221 bool skipLast:1; // skip this pattern if it is the last
222 bool intermediate:1; // this is an intermediate result (beginning with INSERT)
223 bool qgramHit:1; // is this a q-gram hit? (result of the former delta function)
224 }
225 #ifndef PLATFORM_WINDOWS
226 __attribute__((packed))
227 #endif
228 ;
229
230 #ifdef PLATFORM_WINDOWS
231
232 template<typename TValue>
233 inline bool isnan(TValue value)
234 {
235 return value != value;
236 }
237
238 template<typename TValue>
239 inline bool isinf(TValue value)
240 {
241 return value == log(0.0);
242 }
243
244 #else
245
246 template<typename TValue>
247 inline bool isnan(TValue value)
248 {
249 return std::isnan(value);
250 }
251
252 template<typename TValue>
253 inline bool isinf(TValue value)
254 {
255 return std::isinf(value);
256 }
257
258 #endif
259
260
261 template <typename TValue>
262 inline long double
263 _transform(TValue a)
264 {
265 #ifdef USE_LOGVALUES
266 return log(a);
267 #else
268 return a;
269 #endif
270 }
271
272 template <typename TValue>
273 inline long double
274 _transformBack(TValue a)
275 {
276 #ifdef USE_LOGVALUES
277 return exp(a);
278 #else
279 return a;
280 #endif
281 }
282
283 //////////////////////////////////////////////////////////////////////////////
284 // Returns the sum of two probability values in log space
285 template <typename TValue>
286 inline void
287 _probAdd(TValue &a, TValue b)
288 {
289 #ifdef USE_LOGVALUES
290 if (isinf(a)) {
291 a = b;
292 return;
293 }
294 if (isinf(b)) return;
295 if (isnan(a + log(1 + exp(b - a)))) return;
296 a += log(1 + exp(b - a));
297 #else
298 a += b;
299 #endif
300 }
301
302 template <typename TValue>
303 inline TValue
304 _probMul(TValue a, TValue b)
305 {
306 #ifdef USE_LOGVALUES
307 return a + b;
308 #else
309 return a * b;
310 #endif
311 }
312
313 template <typename TValue>
314 inline TValue
315 _probDiv(TValue a, TValue b)
316 {
317 #ifdef USE_LOGVALUES
318 return a - b;
319 #else
320 return a / b;
321 #endif
322 }
323
324
325 struct ErrorPatternLess
326 {
327 template <typename TPattern>
328 bool operator() (TPattern const &a, TPattern const &b) const
329 {
330 typedef typename Iterator<TPattern const>::Type TIter;
331 TIter itA = end(a, Standard());
332 TIter itB = end(b, Standard());
333 TIter itEnd;
334 if (length(a) <= length(b))
335 {
336 itEnd = begin(a, Standard());
337 for (; itA != itEnd;)
338 {
339 --itA;
340 --itB;
341 if (*itA < *itB) return true;
342 if (*itA > *itB) return false;
343 }
344 return false;
345 } else
346 {
347 itEnd = begin(b, Standard());
348 for (; itB != itEnd;)
349 {
350 --itA;
351 --itB;
352 if (*itA < *itB) return true;
353 if (*itA > *itB) return false;
354 }
355 return true;
356 }
357 }
358 };
359
360 template <typename TPatternStore, typename TPattern>
361 inline int
362 _getErrorPatternIndex(TPatternStore const &patternStore, TPattern const &pattern)
363 {
364 typedef typename Iterator<TPatternStore const>::Type TIter;
365 TIter lb = std::lower_bound(begin(patternStore, Standard()), end(patternStore, Standard()), pattern, ErrorPatternLess());
366 TIter invalid = end(patternStore, Standard());
367 if (lb != invalid && *lb == pattern) {
368 // std::cout << pattern;
369 return lb - begin(patternStore, Standard());
370 } else {
371 /* std::cerr << " !Pattern Not Found! " << pattern;
372 if (lb != invalid) std::cerr << "\tnext is " << *lb;
373 std::cerr << std::endl;
374 */ return -1;
375 }
376 }
377
378 // Cut 1 read character and trailing INSERTs of the pattern
379 template <typename TPattern>
380 inline int
381 _cutErrorPattern(TPattern &_pattern)
382 {
383 typedef typename Iterator<TPattern const, Standard>::Type TIter;
384 TPattern const & pattern = const_cast<TPattern const&>(_pattern);
385 TIter it = end(pattern, Standard());
386 int cuttedErrors = -2;
387
388 // cut trailing INSERTs
389 do {
390 --it;
391 ++cuttedErrors;
392 } while ((int)getValue(it) == SEQAN_INSERT);
393
394 // cut non INSERT
395 if ((int)getValue(it) != SEQAN_MATCH)
396 ++cuttedErrors;
397
398 // and all adjacent INSERTs
399 do {
400 --it;
401 ++cuttedErrors;
402 } while ((int)getValue(it) == SEQAN_INSERT);
403
404 resize(_pattern, 1 + (it - begin(pattern, Standard())));
405 return cuttedErrors;
406 }
407
408 template < typename TLogErrorDistr >
409 typename Value<TLogErrorDistr>::Type
410 _getProb(TLogErrorDistr const &logError, int errorType, int readPos)
411 {
412 int maxN = length(logError) / 4;
413 SEQAN_ASSERT(readPos >= 0 && readPos < maxN);
414 return logError[maxN * (int)errorType + readPos];
415 }
416
417 //////////////////////////////////////////////////////////////////////////////
418 // Returns log probability of q-gram-configuration q ending at position pos in sequence
419 template < typename TState, typename TLogErrorDistr, typename TPattern >
420 inline void
421 _getLastPatternProb(TState &state, TLogErrorDistr const &logError, TPattern const &pattern, int span)
422 {
423 int maxN = length(logError) / 4;
424 typename Value<TLogErrorDistr>::Type prob = _transform(1.0);
425 for (int i = 0, j = 0; j < (int)length(pattern); ++j)
426 {
427 prob = _probMul(prob, _getProb(logError, getValue(pattern, j), maxN - span + i));
428 if ((int)getValue(pattern, j) != SEQAN_INSERT)
429 ++i;
430 }
431 state.prob = prob;
432 }
433
434 template < typename TState, typename TPattern >
435 inline void
436 _getLastPatternProb(TState &, Nothing const &, TPattern const &, int)
437 {
438 }
439
440
441 //////////////////////////////////////////////////////////////////////////////
442 // Initialize states-string for edit/hamming-distance filters
443 template <
444 typename TStateString,
445 typename TShape,
446 typename TLogErrorDistr,
447 typename TDistance >
448 void initPatterns(
449 TStateString &states, // resulting states-string
450 TShape const &bitShape, // bit-string of the shape
451 int maxErrors, // allowed errors per pattern
452 TLogErrorDistr const &logError, // error distribution (Nothing or string of 4*patternLen floats)
453 TDistance, // enumerate hamming or edit distance patterns
454 bool optionMinOutput) // omit output
455 {
456 #ifndef DEBUG_RECOG_DP
457 // typedef String<ErrorAlphabet, Packed<ErrorPackedString> > TPattern;
458 typedef String<ErrorAlphabet> TPattern;
459 #endif
460
461 typedef typename Iterator<TPattern, Standard>::Type TIter;
462 typedef typename Value<TStateString>::Type TState;
463
464 ErrorType lastErrorType = (IsSameType<TDistance, HammingDistance>::VALUE)? SEQAN_MISMATCH: SEQAN_DELETE;
465
466 SEQAN_ASSERT(SEQAN_MATCH == 0);
467 SEQAN_ASSERT((length(logError) % 4) == 0);
468
469 #ifndef DEBUG_RECOG_DP
470 String<TPattern> patternStore;
471 #endif
472
473 // a modifier is a pair of position and error type
474 String<Pair<int, ErrorType> > mods;
475 resize(mods, maxErrors, Pair<int, ErrorType> (0, SEQAN_MATCH));
476
477 TPattern pattern;
478 int span = length(bitShape);
479
480 //////////////////////////////////////////////////////////////////////////////
481 // Enumerate all edit-modification patterns with up to k errors
482 if (maxErrors == 0)
483 {
484 resize(pattern, span, (ErrorAlphabet)SEQAN_MATCH);
485 appendValue(patternStore, pattern, Generous());
486 }
487 else
488 do
489 {
490 clear(pattern);
491 resize(pattern, span, (ErrorAlphabet)SEQAN_MATCH);
492
493 // place errors in the pattern
494 bool skip = false;
495 for (int i = 0; (i < maxErrors) && !skip; ++i)
496 {
497 // std::cout << mods[i].i1 << " " << (ErrorAlphabet)mods[i].i2 << "\t";
498 switch (mods[i].i2)
499 {
500 case SEQAN_MISMATCH:
501 case SEQAN_DELETE:
502 if (pattern[mods[i].i1] != (ErrorAlphabet)SEQAN_MATCH)
503 {
504 skip = true;
505 break;
506 }
507 pattern[mods[i].i1] = (ErrorAlphabet)mods[i].i2;
508 break;
509
510 case SEQAN_INSERT:
511 insertValue(pattern, mods[i].i1, (ErrorAlphabet)SEQAN_INSERT);
512 break;
513
514 case SEQAN_MATCH:
515 break;
516 }
517 }
518
519 // remove redundant patterns
520 if (!skip)
521 {
522 TIter it = begin(pattern, Standard());
523 TIter itEnd = end(pattern, Standard());
524 int left = getValue(it);
525 int right;
526 for (++it; (it != itEnd) && !skip; ++it, left = right)
527 {
528 right = getValue(it);
529
530 #ifdef NON_REDUNDANT
531 if (left == SEQAN_MISMATCH && right == SEQAN_DELETE)
532 skip = true; // MISMATCH before DELETE is DELETE before MISMATCH (already enumerated)
533
534 if (left == SEQAN_MISMATCH && right == SEQAN_INSERT)
535 skip = true; // MISMATCH before INSERT is INSERT before MISMATCH (already enumerated)
536
537 if (left == SEQAN_INSERT && right == SEQAN_DELETE)
538 skip = true; // INSERT before DELETE is one MISMATCH (already enumerated)
539
540 if (left == SEQAN_DELETE && right == SEQAN_INSERT)
541 skip = true; // DELETE before INSERT is one MISMATCH (already enumerated)
542 #endif
543 }
544 if (left == SEQAN_INSERT)
545 skip = true; // no trailing INSERT allowed
546 }
547
548 if (!skip)
549 {
550 appendValue(patternStore, pattern, Generous());
551 // std::cout << pattern << std::endl;
552 }
553
554 // reposition modifiers
555 int i = 0;
556 for (; i < maxErrors; ++i)
557 {
558 if (mods[i].i2 == SEQAN_MATCH) continue;
559 int endPos = (mods[i].i2 == SEQAN_INSERT)? span + 1: span;
560 if (++mods[i].i1 < endPos)
561 {
562 for(--i; i >= 0; --i)
563 mods[i].i1 = mods[i + 1].i1;
564 break;
565 }
566 }
567
568 if (i < maxErrors) continue;
569
570 for (i = 0; i < maxErrors; ++i)
571 mods[i].i1 = 0;
572
573 // next state combination
574 for (i = 0; i < maxErrors; ++i)
575 {
576 if (mods[i].i2 == lastErrorType) continue;
577 mods[i].i2 = (ErrorType)(mods[i].i2 + 1);
578 for(--i; i >= 0; --i)
579 mods[i].i2 = SEQAN_MISMATCH;
580 break;
581 }
582
583 if (i == maxErrors) break;
584
585 } while (true);
586
587 if (!optionMinOutput)
588 std::cout << "Stored " << length(patternStore) << " modification patterns" << std::flush;
589
590 reserve(patternStore, length(patternStore), Exact());
591 std::sort(begin(patternStore, Standard()), end(patternStore, Standard()), ErrorPatternLess());
592 for (int p = 1; p < (int)length(patternStore); ++p)
593 {
594 if (patternStore[p-1] == patternStore[p])
595 std::cerr << " !Found duplicate! " << patternStore[p] << std::endl;
596 }
597
598 if (!optionMinOutput)
599 std::cout << " and sorted them." << std::endl;
600
601 //////////////////////////////////////////////////////////////////////////////
602 // Calculate transitions
603 resize(states, length(patternStore));
604 for (int p = 0; p < (int)length(patternStore); ++p)
605 {
606 pattern = patternStore[p];
607 TState &state = states[p];
608
609 // std::cout << pattern << "\t";
610
611 // count errors of current pattern
612 int errors = 0;
613 for (int i = 0; i < (int)length(pattern); ++i)
614 if ((int)getValue(pattern, i) != SEQAN_MATCH)
615 ++errors;
616
617 state.len = length(pattern);
618 state.errors = errors;
619 state.intermediate = (int)getValue(pattern, 0) == SEQAN_INSERT;
620 _getLastPatternProb(state, logError, pattern, span);
621 // std::cout << pattern << "\t";
622
623 state.skipFirst = false;
624 state.skipLast = false;
625
626 #ifdef NON_REDUNDANT
627 int err = 0, del = 0;
628 for (int j = 0; j < (int)length(pattern); ++j)
629 {
630 switch ((int)getValue(pattern, j)) {
631 case SEQAN_MATCH:
632 ++del;
633 break;
634
635 case SEQAN_DELETE:
636 ++del;
637
638 case SEQAN_INSERT:
639 ++err;
640 break;
641
642 default:;
643 }
644 if (del > 0 && del <= err)
645 state.skipFirst = true;
646 }
647 err = del = 0;
648 for (int j = (int)length(pattern) - 1; j >= 0; --j)
649 {
650 switch ((int)getValue(pattern, j)) {
651 case SEQAN_MATCH:
652 ++del;
653 break;
654
655 case SEQAN_DELETE:
656 ++del;
657
658 case SEQAN_INSERT:
659 ++err;
660 break;
661
662 default:;
663 }
664 if (del > 0 && del <= err)
665 state.skipLast = true;
666 }
667 #else
668 state.skipFirst = (int)getValue(pattern, 0) == SEQAN_INSERT;
669 #endif
670 // apply pattern to read q-gram
671 // and check if shape is recognized in the genome
672 state.qgramHit = false;
673 int delta = 0;
674 for (int j = 0, readPos = 0, genomePos = 0; j < (int)length(pattern); ++j)
675 {
676 switch ((int)getValue(pattern, j))
677 {
678 case SEQAN_MATCH:
679 if (readPos == 0) {
680 // assert(bitShape[0] == '1')
681 delta = genomePos;
682 state.qgramHit = true;
683 } else
684 if (bitShape[readPos] == '1')
685 state.qgramHit &= (readPos + delta == genomePos);
686 // std::cout << readPos;
687 ++readPos; ++genomePos;
688 break;
689 case SEQAN_MISMATCH:
690 // was it a relevant read position?
691 if (bitShape[readPos] == '1')
692 state.qgramHit = false;
693 // std::cout << 'x';
694 ++readPos; ++genomePos;
695 break;
696 case SEQAN_DELETE:
697 // was it a relevant read position?
698 if (bitShape[readPos] == '1')
699 state.qgramHit = false;
700 ++readPos;
701 break;
702 case SEQAN_INSERT:
703 ++genomePos;
704 // std::cout << 'x';
705 }
706 }
707 // std::cout << std::endl;
708
709 // prepend INSERT
710 ++errors;
711 insertValue(pattern, 0, SEQAN_INSERT);
712 if ((int)SEQAN_INSERT < (int)state.TRANSITIONS)
713 {
714 if (errors <= maxErrors)
715 state.transition[SEQAN_INSERT] = _getErrorPatternIndex(patternStore, pattern);
716 else
717 state.transition[SEQAN_INSERT] = -1;
718 }
719
720 // prepend MISMATCH and cut INSERTS
721 errors -= _cutErrorPattern(pattern);
722 if ((int)SEQAN_MISMATCH < (int)state.TRANSITIONS)
723 {
724 pattern[0] = SEQAN_MISMATCH;
725 if (errors <= maxErrors)
726 state.transition[SEQAN_MISMATCH] = _getErrorPatternIndex(patternStore, pattern);
727 else
728 state.transition[SEQAN_MISMATCH] = -1;
729 }
730
731 // prepend DELETE
732 if ((int)SEQAN_DELETE < (int)state.TRANSITIONS)
733 {
734 pattern[0] = SEQAN_DELETE;
735 if (errors <= maxErrors)
736 state.transition[SEQAN_DELETE] = _getErrorPatternIndex(patternStore, pattern);
737 else
738 state.transition[SEQAN_DELETE] = -1;
739 }
740
741 // prepend MATCH
742 if ((int)SEQAN_MATCH < (int)state.TRANSITIONS)
743 {
744 --errors;
745 pattern[0] = SEQAN_MATCH;
746 if (errors <= maxErrors)
747 state.transition[SEQAN_MATCH] = _getErrorPatternIndex(patternStore, pattern);
748 else
749 state.transition[SEQAN_MATCH] = -1;
750 }
751 /*
752 std::cout << "\t" << state.errors;
753 std::cout << "\t" << state.qgramHit;
754 std::cout << "\t" << state.leftError;
755 std::cout << "\t" << state.rightError;
756 std::cout << "\t" << state.transition[0];
757 std::cout << "\t" << state.transition[1];
758 std::cout << "\t" << state.transition[2];
759 std::cout << "\t" << state.transition[3];
760 std::cout << std::endl;
761 */ }
762 if (!optionMinOutput)
763 std::cout << "Preprocessing finished." << std::endl;
764 }
765
766 //////////////////////////////////////////////////////////////////////////////
767 // Compute filtering loss of any q-gram filter (given a states-string)
768 template <
769 typename TThreshString,
770 typename TStateString >
771 void computeExactQGramThreshold(
772 TThreshString &treshPerError,
773 TStateString const &states,
774 int span,
775 int maxErrors,
776 int maxN,
777 bool optionMinOutput)
778 {
779 typedef typename Value<TStateString>::Type TState;
780 typedef unsigned TThresh;
781 typedef String<TThresh> TMatrixCol;
782
783 int statesCount = length(states);
784 // int span = length(bitShape);
785
786 // columns n-1 and n for recursion
787 TMatrixCol col0; // addressing is colx[errors * statesCount + state]
788 TMatrixCol col1;
789 const TThresh infty = MaxValue<TThresh>::VALUE >> 1;
790
791 resize(col0, maxErrors * statesCount, infty);
792 resize(col1, maxErrors * statesCount);
793
794 // RECURSION BEGIN
795 for (int s = 0; s < statesCount; ++s)
796 {
797 TState const &state = states[s];
798 if (state.skipFirst) continue;
799
800 // threshold is 1 iff we have a q-gram hit at the end
801 col0[s] = (state.qgramHit)? 1: 0;
802 }
803
804 // iterate over sequence length n
805 TMatrixCol *col = &col1;
806 TMatrixCol *colPrev = &col0;
807
808 #ifdef DEBUG_RECOG_DP
809 std::cout << span << ":0";
810 dump(col0, 0,statesCount);
811 std::cout << " :1";
812 dump(col0, 1,statesCount);
813 #endif
814
815
816 // RECURSION
817 //
818 // thresh(n,q,e) = min(thresh(n-1,0|(q>>1),e), delta=1/0 <-> q hat 0/>0 error
819 // thresh(n-1,1|(q>>1),e-1)) + delta
820
821 for (int n = span; n < maxN; ++n)
822 {
823 for (int e = 0; e < maxErrors * statesCount; e += statesCount)
824 {
825 for (int s = 0; s < statesCount; ++s)
826 {
827 TState const &state = states[s];
828
829 // MATCH
830 TThresh t = (*colPrev)[e + state.transition[SEQAN_MATCH]];
831
832 // MISMATCH, INSERT, DELETE
833 if (e > 0)
834 for (int m = SEQAN_MISMATCH; m < TState::TRANSITIONS; ++m)
835 {
836 int prevState = state.transition[m];
837 if (prevState >= 0)
838 {
839 if (m == SEQAN_INSERT)
840 t = _min(t, (*col)[(e - statesCount) + prevState]);
841 else
842 t = _min(t, (*colPrev)[(e - statesCount) + prevState]);
843 }
844 }
845
846 (*col)[e + s] = t + state.qgramHit;
847 }
848 if (!optionMinOutput)
849 std::cout << '.' << std::flush;
850 }
851
852 TMatrixCol *tmp = col;
853 col = colPrev;
854 colPrev = tmp;
855
856 #ifdef DEBUG_RECOG_DP
857 std::cout << n+1 << ":0";
858 dump(*colPrev, 0,statesCount);
859 std::cout << " :1";
860 dump(*colPrev, 1,statesCount);
861 std::cout << " :2";
862 dump(*colPrev, 2,statesCount);
863 #endif
864 }
865
866 if (!optionMinOutput)
867 std::cout << std::endl;
868
869 resize(treshPerError, maxErrors);
870
871 // RECURSION END
872 for (int eSum = 0; eSum < maxErrors; ++eSum)
873 {
874 TThresh t = infty;
875 for (int s = 0; s < statesCount; ++s)
876 {
877 TState const &state = states[s];
878
879 // skip intermediate results
880 if (state.intermediate || state.skipLast) continue;
881 if (state.errors <= eSum)
882 {
883 int e = eSum - state.errors;
884 // multiply probability for the trailing pattern
885 t = _min(t, (*colPrev)[e * statesCount + s]);
886 }
887 }
888
889 if (t >= infty) t = 0;
890 treshPerError[eSum] = t;
891 }
892 }
893
894
895 //////////////////////////////////////////////////////////////////////////////
896 // Compute filtering loss of any q-gram filter (given a states-string)
897 template <
898 typename TLossString,
899 typename TLogErrorDistr,
900 typename TStateString >
901 void computeQGramFilteringSensitivity(
902 TLossString &found,
903 TStateString const &states,
904 int span,
905 int maxT,
906 int maxErrors,
907 TLogErrorDistr const &logError,
908 // bool optionAbsolute = false,
909 bool optionMinOutput)
910 {
911 typedef typename Value<TLossString>::Type TFloat;
912 typedef typename Value<TLogErrorDistr>::Type TProbValue;
913 typedef typename Value<TStateString>::Type TState;
914
915 typedef String<TFloat> TMatrixCol;
916 typedef String<int> TIntCol;
917
918 SEQAN_ASSERT((length(logError) % 4) == 0);
919
920 int maxN = length(logError) / 4;
921 int statesCount = length(states);
922 const bool optionAbsolute = false;
923 // int span = length(bitShape);
924
925 // columns n-1 and n for recursion
926 TMatrixCol col0;
927 TMatrixCol col1;
928 resize(col0, maxErrors * statesCount * maxT, (TFloat)_transform(0.0));
929 resize(col1, maxErrors * statesCount * maxT);
930
931 #ifdef COUNT_LOSSES
932 TFloat positive = _transform(0.0);
933 TFloat negative = _transform(1.0);
934 #else
935 TFloat positive = _transform(1.0);
936 TFloat negative = _transform(0.0);
937 #endif
938
939 // RECURSION BEGIN
940 for (int s = 0; s < statesCount; ++s)
941 {
942 TState const &state = states[s];
943
944 if (state.skipFirst) continue;
945
946 // we miss no match if threshold t is 0
947 col0[s*maxT] = positive;
948
949 // for n==0
950 if (state.qgramHit)
951 {
952 // we miss no match if read q-gram is recognized
953 // --> probability of finding this MMP is 1, if t=1
954 col0[s*maxT+1] = positive;
955 // --> probability of finding this MMP is 0, if t>1
956 for (int t = 2; t < maxT; ++t)
957 col0[s*maxT+t] = negative;
958 } else
959 {
960 // we miss 1 match if t>0 and read q-gram is not recognized
961 // --> probability of finding this MMP is 0, if t>=1
962 for (int t = 1; t < maxT; ++t)
963 col0[s*maxT+t] = negative;
964 }
965 }
966
967 // iterate over sequence length n
968 TMatrixCol *col = &col1;
969 TMatrixCol *colPrev = &col0;
970
971 #ifdef DEBUG_RECOG_DP
972 ::std::cout << span << ":0";
973 dump(col0, 0,statesCount);
974 ::std::cout << " :1";
975 dump(col0, 1,statesCount);
976 #endif
977
978
979 // RECURSION
980 //
981 // found(n,q,t,e) = (1-errorProb[n-span]) * found(n-1,0|(q>>1),t-delta,e) delta=1/0 <-> q hat 0/>0 fehler
982 // + errorProb[n-span] * found(n-1,1|(q>>1),t-delta,e-1)
983
984 // rekursion (fuer q-gram matches <=1 fehler)
985 // found(n,q,t,e) = (1-errorProb[n-span]) * found(n-1,0|(q>>1),t-delta,e) delta=1/0 <-> q hat <=1/>1 fehler
986 // + errorProb[n-span] * found(n-1,1|(q>>1),t-delta,e-1)
987
988 for (int n = span; n < maxN; ++n)
989 {
990 for (int e = 0; e < maxErrors * statesCount; e += statesCount)
991 {
992 for (int s = 0; s < statesCount; ++s)
993 {
994 TState const &state = states[s];
995 for (int t = 0; t < maxT; ++t)
996 {
997 int _t = t;
998 if (_t > 0 && state.qgramHit) --_t;
999
1000 // MATCH
1001 TFloat recovered = _probMul(
1002 _getProb(logError, SEQAN_MATCH, n-span),
1003 (*colPrev)[(e+state.transition[SEQAN_MATCH])*maxT+_t]);
1004
1005 // MISMATCH, INSERT, DELETE
1006 for (int m = SEQAN_MISMATCH; m < 4; ++m)
1007 if (e > 0)
1008 {
1009 int prevState = state.transition[m];
1010 if (prevState >= 0)
1011 {
1012 if (m == SEQAN_INSERT)
1013 _probAdd(recovered, _probMul(_getProb(logError,m,n-span), (*col)[((e-statesCount)+prevState)*maxT+t]));
1014 else
1015 _probAdd(recovered, _probMul(_getProb(logError,m,n-span), (*colPrev)[((e-statesCount)+prevState)*maxT+_t]));
1016 }
1017 }
1018 (*col)[(e+s)*maxT+t] = recovered;
1019 }
1020 }
1021 if (!optionMinOutput)
1022 ::std::cout << '.' << ::std::flush;
1023 }
1024
1025 TMatrixCol *tmp = col;
1026 col = colPrev;
1027 colPrev = tmp;
1028
1029 #ifdef DEBUG_RECOG_DP
1030 ::std::cout << n+1 << ":0";
1031 dump(*colPrev, 0,statesCount);
1032 ::std::cout << " :1";
1033 dump(*colPrev, 1,statesCount);
1034 ::std::cout << " :2";
1035 dump(*colPrev, 2,statesCount);
1036 #endif
1037 }
1038
1039 if (!optionMinOutput)
1040 ::std::cout << ::std::endl;
1041
1042 // RECURSION END
1043 for (int eSum = 0; eSum < maxErrors; ++eSum)
1044 for (int t = 0; t < maxT; ++t)
1045 {
1046 TFloat recovered = _transform(0.0);
1047 for (int s = 0; s < statesCount; ++s)
1048 {
1049 TState const &state = states[s];
1050
1051 // skip intermediate results
1052 if (state.intermediate || state.skipLast) continue;
1053 if (state.errors <= eSum)
1054 {
1055 int e = eSum - state.errors;
1056 // multiply probability for the trailing pattern
1057 _probAdd(recovered, _probMul(state.prob, (*colPrev)[(e*statesCount+s)*maxT+t]));
1058 }
1059 }
1060
1061 #ifndef COUNT_LOSSES
1062 // we can only normalize probs if t==0 contains all k-pattern probs
1063 if (t > 0 && !optionAbsolute)
1064 recovered = _probDiv(recovered, found[eSum*maxT]);
1065 #endif
1066
1067 found[eSum*maxT+t] = recovered;
1068 }
1069 }
1070
1071
1072 //////////////////////////////////////////////////////////////////////////////
1073 // q-gram threshold DP algorithm
1074 //
1075 // - exact threshold
1076 //////////////////////////////////////////////////////////////////////////////
1077
1078 template <typename TShape, typename TPatternSize, typename TErrors, typename TDistance>
1079 int qgramThreshold(TShape const & shape, TPatternSize patternLength, TErrors errors, TDistance const dist, ThreshExact const)
1080 {
1081 String<ThreshDPState_<TDistance> > states;
1082 String<unsigned> thresh;
1083 String<char> bitString;
1084
1085 shapeToString(bitString, shape);
1086 initPatterns(states, bitString, errors, Nothing(), dist, true);
1087 computeExactQGramThreshold(thresh, states, length(bitString), errors + 1, patternLength, true);
1088
1089 return thresh[errors];
1090 }
1091
1092 //////////////////////////////////////////////////////////////////////////////
1093 // q-gram filter sensitivity DP algorithm
1094 //
1095 // - exact threshold
1096 //////////////////////////////////////////////////////////////////////////////
1097
1098 template <typename TSensitivityMatrix, typename TShape, typename TPatternSize, typename TErrors, typename TThresh, typename TDistance, typename TErrorDist>
1099 void qgramFilteringSensitivity(
1100 TSensitivityMatrix & sensMat,
1101 TShape const & shape,
1102 TPatternSize patternLength,
1103 TErrors errors,
1104 TThresh maxThresh,
1105 TDistance const dist,
1106 ThreshExact const,
1107 TErrorDist const & logErrorDistribution)
1108 {
1109 typedef typename Value<TSensitivityMatrix>::Type TFloat;
1110 String<SensitivityDPState_<TDistance, TFloat> > states;
1111 String<unsigned> thresh;
1112 String<char> bitString;
1113
1114 maxThresh = _min(maxThresh, patternLength - length(shape) + 1);
1115 resize(sensMat, (maxThresh + 1) * (errors + 1));
1116 shapeToString(bitString, shape);
1117
1118 initPatterns(states, bitString, errors, logErrorDistribution, dist, true);
1119 computeQGramFilteringSensitivity(sensMat, states, length(bitString), maxThresh + 1, errors + 1, logErrorDistribution, true);
1120 }
1121
1122 } // namespace seqan
1123
1124 #endif
1125