1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 //
34 // Based on the code by Carsten Kemena <carsten.kemena@crg.es>, debugged
35 // by Birte Kehr <birte.kehr@fu-berlin.de>.
36 // ==========================================================================
37 // Seed extension algorithms.
38 //
39 // The approach for gapped X-drop extension is based on the algorithm in
40 // Figure 2 from (Zhang et al., 2000).
41 //
42 // Zhang Z, Schwartz S, Wagner L, Miller W. A greedy algorithm for aligning
43 // DNA sequences. Journal of computational biologyA a journal of
44 // computational molecular cell biology. 2000;7(1-2):203-14.
45 // doi:10.1089/10665270050081478
46 // ==========================================================================
47
48 #ifndef SEQAN_SEEDS_SEEDS_EXTENSION_H_
49 #define SEQAN_SEEDS_SEEDS_EXTENSION_H_
50
51 namespace seqan {
52
53 // ===========================================================================
54 // Enums, Tags, Classes, Specializations
55 // ===========================================================================
56
57 // ---------------------------------------------------------------------------
58 // Tags Seed Extension
59 // ---------------------------------------------------------------------------
60
61 /*!
62 * @defgroup SeedExtensionTags
63 * @brief Tags for selecting seed extension algorithm.
64 *
65 * @see Seed#extendSeed
66 *
67 * @tag SeedExtensionTags#MatchExtend
68 * @headerfile <seqan/seeds.h>
69 * @brief Extends a seed until a mismatch occurs.
70 *
71 * @signature typedef Tag<MatchExtend_> const MatchExtend;
72 *
73 * @tag SeedExtensionTags#UnGappedXDrop
74 * @headerfile <seqan/seeds.h>
75 * @brief Ungapped extension of a seed until score drops below a given value.
76 *
77 * @signature typedef Tag<UngappedXDrop_> const UnGappedXDrop;
78 *
79 * @tag SeedExtensionTags#GappedXDrop
80 * @headerfile <seqan/seeds.h>
81 * @brief Gapped extension of a seed until score drops below a given value; only works for SimpleSeed.
82 *
83 * @signature typedef Tag<GappedXDrop_> const GappedXDrop;
84 */
85
86 struct MatchExtend_;
87 typedef Tag<MatchExtend_> const MatchExtend;
88
89 struct UngappedXDrop_;
90 typedef Tag<UngappedXDrop_> const UnGappedXDrop;
91
92 struct GappedXDrop_;
93 typedef Tag<GappedXDrop_> const GappedXDrop;
94
95 // ---------------------------------------------------------------------------
96 // Enum ExtensionDirection
97 // ---------------------------------------------------------------------------
98
99 // TODO(holtgrew): Put into class?
100
101 /*!
102 * @enum ExtensionDirection
103 * @headerfile <seqan/seeds.h>
104 * @brief Direction for seed extension.
105 *
106 * @signature enum ExtensionDirection;
107 *
108 * @see Seed#extendSeed
109 *
110 * @val ExtensionDirection EXTEND_NONE = 0;
111 * @brief Perform no extension.
112 *
113 * @val ExtensionDirection EXTEND_LEFT = 1;
114 * @brief Perform extension towards the left.
115 *
116 * @val ExtensionDirection EXTEND_RIGHT = 2;
117 * @brief Perform extension towards the right.
118 *
119 * @val ExtensionDirection EXTEND_BOTH = 3;
120 * @brief Perform extension to both directions.
121 */
122
123 enum ExtensionDirection
124 {
125 EXTEND_NONE = 0,
126 EXTEND_LEFT = 1,
127 EXTEND_RIGHT = 2,
128 EXTEND_BOTH = 3
129 };
130
131 // ===========================================================================
132 // Metafunctions
133 // ===========================================================================
134
135 // ===========================================================================
136 // Functions
137 // ===========================================================================
138
139 // ---------------------------------------------------------------------------
140 // Function extendSeed [MatchExtend]
141 // ---------------------------------------------------------------------------
142
143 /*!
144 * @fn Seed#extendSeed
145 * @headerfile <seqan/seeds.h>
146 * @brief Extends a seed.
147 *
148 * @signature void extendSeed(seed, database, query, direction, MatchExtend);
149 * @signature void extendSeed(seed, database, query, direction, scoringScheme, scoreDropOff, xDropTag);
150 *
151 * @param[in,out] seed The Seed to extend.
152 * @param[in] database The database (horizontal) @link ContainerConcept sequence @endlink.
153 * @param[in] query The query (vertical) @link ContainerConcept sequence @endlink.
154 * @param[in] direction The extension direction. Type: @link ExtensionDirection @endlink.
155 * @param[in] scoringScheme The @link Score @endlink object to use for scoring alignments and gaps.
156 * @param[in] scoreDropOff The score drop after which the extension should stop. The extension stops if this
157 * value is exceeded. Only given for when using an x-drop algorithm.
158 * @param[in] xDropTag Tag for selecting x-drop method, one of <tt>UnGappedXDrop</tt> and
159 * <tt>GappedXDrop</tt>.
160 *
161 * You can use the tags, <tt>MatchExtend</tt>, <tt>UnGappedXDrop</tt>, and <tt>GappedXDrop</tt>.
162 *
163 * Note that the diagonals updated in <tt>seed</tt> do not necessarily reflect the diagonals for the optimal extension
164 * but the diagonals used in all traces of the extension. However, they are guaranteed to include the optimal
165 * extension's trace.
166 *
167 * @section Examples
168 *
169 * The documentation of the class @link Seed @endlink contains an example for
170 * seed extension.
171 *
172 * @see ExtensionDirection
173 * @see SeedExtensionTags
174 */
175
176 // We need one specialization for each combination of the extension variants and seeds. It is not worth to extract the
177 // common parts for simple and chained seeds.
178
179 template <typename TConfig, typename TDatabase, typename TQuery>
180 inline void
extendSeed(Seed<Simple,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,MatchExtend const &)181 extendSeed(Seed<Simple, TConfig> & seed,
182 TDatabase const & database,
183 TQuery const & query,
184 ExtensionDirection direction,
185 MatchExtend const &)
186 {
187 // For match extension of Simple Seeds, we can simply update the begin and end values in each dimension.
188 typedef Seed<Simple, TConfig> TSeed;
189 typedef typename Position<TSeed>::Type TPosition;
190 typedef typename Size<TSeed>::Type TSize;
191
192 // Extension to the left
193 if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
194 {
195 TPosition posH = beginPositionH(seed);
196 TPosition posV = beginPositionV(seed);
197 while (posH >= 1 && posV >= 1 && database[posH - 1] == query[posV - 1])
198 {
199 --posH;
200 --posV;
201 }
202 setBeginPositionH(seed, posH);
203 setBeginPositionV(seed, posV);
204 }
205
206 // Extension to the right
207 if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
208 {
209 TSize lengthH = length(database);
210 TSize lengthV = length(query);
211 TPosition posH = endPositionH(seed);
212 TPosition posV = endPositionV(seed);
213 while (posH < lengthH && posV < lengthV && database[posH] == query[posV])
214 {
215 ++posH;
216 ++posV;
217 }
218 setEndPositionH(seed, posH);
219 setEndPositionV(seed, posV);
220 }
221 }
222
223
224 template <typename TConfig, typename TDatabase, typename TQuery>
225 inline void
extendSeed(Seed<ChainedSeed,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,MatchExtend const &)226 extendSeed(Seed<ChainedSeed, TConfig> & seed,
227 TDatabase const & database,
228 TQuery const & query,
229 ExtensionDirection direction,
230 MatchExtend const &)
231 {
232 // For match extension of Chained Seeds, we extend the first and the last Seed Diagonal.
233 SEQAN_ASSERT_GT(length(seed), 0u);
234
235 typedef Seed<ChainedSeed, TConfig> TSeed;
236 typedef typename Value<TSeed>::Type TSeedDiagonal;
237 typedef typename Position<TSeedDiagonal>::Type TPosition;
238 typedef typename Size<TSeedDiagonal>::Type TSize;
239
240 // Extension to the left
241 if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
242 {
243 TSeedDiagonal & diag = front(seed);
244 TPosition posH = diag.beginPositionH;
245 TPosition posV = diag.beginPositionV;
246 TSize diagonalLength = diag.length;
247 while (posH >= 1 && posV >= 1 && database[posH - 1] == query[posV - 1])
248 {
249 --posH;
250 --posV;
251 ++diagonalLength;
252 }
253 diag.beginPositionH = posH;
254 diag.beginPositionV = posV;
255 diag.length = diagonalLength;
256 }
257
258 // Extension to the right
259 if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
260 {
261 TSize lengthH = length(database);
262 TSize lengthV = length(query);
263 TSeedDiagonal & diag = back(seed);
264 TPosition posH = diag.beginPositionH + diag.length;
265 TPosition posV = diag.beginPositionV + diag.length;
266 TSize diagonalLength = diag.length;
267 while (posH < lengthH && posV < lengthV && database[posH] == query[posV])
268 {
269 ++posH;
270 ++posV;
271 ++diagonalLength;
272 }
273 diag.length = diagonalLength;
274 }
275 }
276
277 // ---------------------------------------------------------------------------
278 // Function extendSeed [UnGappedXDrop]
279 // ---------------------------------------------------------------------------
280
281 template <typename TConfig, typename TDatabase, typename TQuery, typename TScoreValue, typename TScoreSpec>
282 inline void
extendSeed(Seed<Simple,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,Score<TScoreValue,TScoreSpec> const & scoringScheme,TScoreValue scoreDropOff,UnGappedXDrop const &)283 extendSeed(Seed<Simple, TConfig> & seed,
284 TDatabase const & database,
285 TQuery const & query,
286 ExtensionDirection direction,
287 Score<TScoreValue, TScoreSpec> const & scoringScheme,
288 TScoreValue scoreDropOff,
289 UnGappedXDrop const &)
290 {
291 // For ungapped X-drop extension of Simple Seeds, we can simply
292 // update the begin and end values in each dimension.
293 scoreDropOff = -scoreDropOff;
294
295 typedef Seed<ChainedSeed, TConfig> TSeed;
296 typedef typename Position<TSeed>::Type TPosition;
297 typedef typename Size<TSeed>::Type TSize;
298
299 // Extension to the left
300 if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
301 {
302 TScoreValue tmpScore = 0;
303 TPosition posH = beginPositionH(seed);
304 TPosition posV = beginPositionV(seed);
305 TPosition mismatchingSuffixLength = 0;
306 while (posH >= 1 && posV >= 1 && tmpScore > scoreDropOff)
307 {
308 tmpScore += score(scoringScheme, sequenceEntryForScore(scoringScheme, database, posH),
309 sequenceEntryForScore(scoringScheme, query, posV));
310 if (database[posH - 1] == query[posV - 1])
311 {
312 mismatchingSuffixLength = 0;
313 if (tmpScore > static_cast<TScoreValue>(0))
314 tmpScore = 0;
315 }
316 else
317 {
318 mismatchingSuffixLength += 1;
319 }
320 --posH;
321 --posV;
322 }
323 setBeginPositionH(seed, posH + mismatchingSuffixLength);
324 setBeginPositionV(seed, posV + mismatchingSuffixLength);
325 }
326
327 // Extension to the right
328 if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
329 {
330 TScoreValue tmpScore = 0;
331 TSize lengthH = length(database);
332 TSize lengthV = length(query);
333 TPosition posH = endPositionH(seed);
334 TPosition posV = endPositionV(seed);
335 TPosition mismatchingSuffixLength = 0;
336 while (posH < lengthH && posV < lengthV && tmpScore > scoreDropOff)
337 {
338 tmpScore += score(scoringScheme, sequenceEntryForScore(scoringScheme, database, posH),
339 sequenceEntryForScore(scoringScheme, query, posV));
340 if (database[posH] == query[posV])
341 {
342 mismatchingSuffixLength = 0;
343 if (tmpScore > static_cast<TScoreValue>(0))
344 tmpScore = 0;
345 }
346 else
347 {
348 mismatchingSuffixLength += 1;
349 }
350 ++posH;
351 ++posV;
352 }
353 setEndPositionH(seed, posH - mismatchingSuffixLength);
354 setEndPositionV(seed, posV - mismatchingSuffixLength);
355 }
356
357 // TODO(holtgrew): Update score?!
358 }
359
360
361 template <typename TConfig, typename TDatabase, typename TQuery, typename TScoreValue, typename TScoreSpec>
362 inline void
extendSeed(Seed<ChainedSeed,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,Score<TScoreValue,TScoreSpec> const & scoringScheme,TScoreValue scoreDropOff,UnGappedXDrop const &)363 extendSeed(Seed<ChainedSeed, TConfig> & seed,
364 TDatabase const & database,
365 TQuery const & query,
366 ExtensionDirection direction,
367 Score<TScoreValue, TScoreSpec> const & scoringScheme,
368 TScoreValue scoreDropOff,
369 UnGappedXDrop const &)
370 {
371 // For ungapped X-drop extension of Chained Seeds, we extend the
372 // first and the last Seed Diagonal.
373 scoreDropOff = -scoreDropOff;
374
375 typedef Seed<ChainedSeed, TConfig> TSeed;
376 typedef typename Value<TSeed>::Type TSeedDiagonal;
377 typedef typename Position<TSeedDiagonal>::Type TPosition;
378 typedef typename Size<TSeedDiagonal>::Type TSize;
379
380 // Extension to the left
381 if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
382 {
383 TScoreValue tmpScore = 0;
384 TPosition mismatchingSuffixLength = 0;
385 TSeedDiagonal & diag = front(seed);
386 TPosition posH = beginPositionH(seed);
387 TPosition posV = beginPositionV(seed);
388 TSize diagonalLength = diag.length;
389 while (posH >= 1 && posV >= 1 && tmpScore > scoreDropOff)
390 {
391 tmpScore += score(scoringScheme, sequenceEntryForScore(scoringScheme, database, posH),
392 sequenceEntryForScore(scoringScheme, query, posV));
393 if (database[posH - 1] == query[posV - 1])
394 {
395 mismatchingSuffixLength = 0;
396 if (tmpScore > static_cast<TScoreValue>(0))
397 tmpScore = 0;
398 }
399 else
400 {
401 mismatchingSuffixLength += 1;
402 }
403 --posH;
404 --posV;
405 ++diagonalLength;
406 }
407 diag.beginPositionH = posH + mismatchingSuffixLength;
408 diag.beginPositionV = posV + mismatchingSuffixLength;
409 diag.length = diagonalLength - mismatchingSuffixLength;
410 }
411
412 // Extension to the right
413 if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
414 {
415 TScoreValue tmpScore = 0;
416 TPosition mismatchingSuffixLength = 0;
417 TSize lengthH = length(query);
418 TSize lengthV = length(database);
419 TSeedDiagonal & diag = back(seed);
420 TPosition posH = diag.beginPositionH + diag.length;
421 TPosition posV = diag.beginPositionV + diag.length;
422 TSize diagonalLength = diag.length;
423 while (posH < lengthH && posV < lengthV && tmpScore > scoreDropOff)
424 {
425 tmpScore += score(scoringScheme, sequenceEntryForScore(scoringScheme, database, posH),
426 sequenceEntryForScore(scoringScheme, query, posV));
427 if (database[posH] == query[posV])
428 {
429 mismatchingSuffixLength = 0;
430 if (tmpScore > static_cast<TScoreValue>(0))
431 tmpScore = 0;
432 }
433 else
434 {
435 mismatchingSuffixLength += 1;
436 }
437 ++posH;
438 ++posV;
439 ++diagonalLength;
440 }
441 diag.length = diagonalLength - mismatchingSuffixLength;
442 }
443
444 // TODO(holtgrew): Update score?!
445 }
446
447 // ---------------------------------------------------------------------------
448 // Function extendSeed [GappedXDrop]
449 // ---------------------------------------------------------------------------
450
451 template<typename TAntiDiag, typename TDropOff, typename TScoreValue>
452 inline void
_initAntiDiags(TAntiDiag &,TAntiDiag & antiDiag2,TAntiDiag & antiDiag3,TDropOff dropOff,TScoreValue gapCost,TScoreValue undefined)453 _initAntiDiags(TAntiDiag & ,
454 TAntiDiag & antiDiag2,
455 TAntiDiag & antiDiag3,
456 TDropOff dropOff,
457 TScoreValue gapCost,
458 TScoreValue undefined)
459 {
460 // antiDiagonals will be swaped in while loop BEFORE computation of antiDiag3 entries
461 // -> no initialization of antiDiag1 necessary
462
463 resize(antiDiag2, 1);
464 antiDiag2[0] = 0;
465
466 resize(antiDiag3, 2);
467 if (-gapCost > dropOff)
468 {
469 antiDiag3[0] = undefined;
470 antiDiag3[1] = undefined;
471 }
472 else
473 {
474 antiDiag3[0] = gapCost;
475 antiDiag3[1] = gapCost;
476 }
477 }
478
479 template<typename TAntiDiag>
480 inline void
_swapAntiDiags(TAntiDiag & antiDiag1,TAntiDiag & antiDiag2,TAntiDiag & antiDiag3)481 _swapAntiDiags(TAntiDiag & antiDiag1,
482 TAntiDiag & antiDiag2,
483 TAntiDiag & antiDiag3)
484 {
485 TAntiDiag temp;
486 move(temp, antiDiag1);
487 move(antiDiag1, antiDiag2);
488 move(antiDiag2, antiDiag3);
489 move(antiDiag3, temp);
490 }
491
492 template<typename TAntiDiag, typename TSize, typename TScoreValue>
493 inline TSize
_initAntiDiag3(TAntiDiag & antiDiag3,TSize offset,TSize maxCol,TSize antiDiagNo,TScoreValue minScore,TScoreValue gapCost,TScoreValue undefined)494 _initAntiDiag3(TAntiDiag & antiDiag3,
495 TSize offset,
496 TSize maxCol,
497 TSize antiDiagNo,
498 TScoreValue minScore,
499 TScoreValue gapCost,
500 TScoreValue undefined)
501 {
502 resize(antiDiag3, maxCol + 1 - offset);
503
504 antiDiag3[0] = undefined;
505 antiDiag3[maxCol - offset] = undefined;
506
507 if ((int)antiDiagNo * gapCost > minScore)
508 {
509 if (offset == 0) // init first column
510 antiDiag3[0] = antiDiagNo * gapCost;
511 if (antiDiagNo - maxCol == 0) // init first row
512 antiDiag3[maxCol - offset] = antiDiagNo * gapCost;
513 }
514 return offset;
515 }
516
517 template<typename TDiagonal, typename TSize>
518 inline void
_calcExtendedLowerDiag(TDiagonal & lowerDiag,TSize minCol,TSize antiDiagNo)519 _calcExtendedLowerDiag(TDiagonal & lowerDiag,
520 TSize minCol,
521 TSize antiDiagNo)
522 {
523 TSize minRow = antiDiagNo - minCol;
524 if ((TDiagonal)minCol - (TDiagonal)minRow < lowerDiag)
525 lowerDiag = (TDiagonal)minCol - (TDiagonal)minRow;
526 }
527
528 template<typename TDiagonal, typename TSize>
529 inline void
_calcExtendedUpperDiag(TDiagonal & upperDiag,TSize maxCol,TSize antiDiagNo)530 _calcExtendedUpperDiag(TDiagonal & upperDiag,
531 TSize maxCol,
532 TSize antiDiagNo)
533 {
534 TSize maxRow = antiDiagNo + 1 - maxCol;
535 if ((TDiagonal)maxCol - 1 - (TDiagonal)maxRow > upperDiag)
536 upperDiag = maxCol - 1 - maxRow;
537 }
538
539 template<typename TSeed, typename TSize, typename TDiagonal>
540 inline void
_updateExtendedSeed(TSeed & seed,ExtensionDirection direction,TSize cols,TSize rows,TDiagonal lowerDiag,TDiagonal upperDiag)541 _updateExtendedSeed(TSeed & seed,
542 ExtensionDirection direction,
543 TSize cols,
544 TSize rows,
545 TDiagonal lowerDiag,
546 TDiagonal upperDiag)
547 {
548 if (direction == EXTEND_LEFT)
549 {
550 // Set lower and upper diagonals.
551 TDiagonal beginDiag = beginDiagonal(seed);
552 if (lowerDiagonal(seed) > beginDiag + lowerDiag)
553 setLowerDiagonal(seed, beginDiag + lowerDiag);
554 if (upperDiagonal(seed) < beginDiag + upperDiag)
555 setUpperDiagonal(seed, beginDiag + upperDiag);
556
557 // Set new start position of seed.
558 setBeginPositionH(seed, beginPositionH(seed) - rows);
559 setBeginPositionV(seed, beginPositionV(seed) - cols);
560 } else { // direction == EXTEND_RIGHT
561 // Set new lower and upper diagonals.
562 TDiagonal endDiag = endDiagonal(seed);
563 if (upperDiagonal(seed) < endDiag - lowerDiag)
564 setUpperDiagonal(seed, endDiag - lowerDiag);
565 if (lowerDiagonal(seed) > endDiag - upperDiag)
566 setLowerDiagonal(seed, endDiag - upperDiag);
567
568 // Set new end position of seed.
569 setEndPositionH(seed, endPositionH(seed) + rows);
570 setEndPositionV(seed, endPositionV(seed) + cols);
571 }
572 SEQAN_ASSERT_GEQ(upperDiagonal(seed), lowerDiagonal(seed));
573 SEQAN_ASSERT_GEQ(upperDiagonal(seed), beginDiagonal(seed));
574 SEQAN_ASSERT_GEQ(upperDiagonal(seed), endDiagonal(seed));
575 SEQAN_ASSERT_GEQ(beginDiagonal(seed), lowerDiagonal(seed));
576 SEQAN_ASSERT_GEQ(endDiagonal(seed), lowerDiagonal(seed));
577 }
578
579 // Limit score; In the general case we cannot do this so we simply perform a check on the score mismatch values.
580 template <typename TScoreValue, typename TScoreSpec, typename TAlphabet>
581 void
_extendSeedGappedXDropOneDirectionLimitScoreMismatch(Score<TScoreValue,TScoreSpec> & scoringScheme,TScoreValue minErrScore,TAlphabet *)582 _extendSeedGappedXDropOneDirectionLimitScoreMismatch(Score<TScoreValue, TScoreSpec> & scoringScheme,
583 TScoreValue minErrScore,
584 TAlphabet * /*tag*/)
585 {
586 // We cannot set a lower limit for the mismatch score since the score might be a scoring matrix such as Blosum62.
587 // Instead, we perform a check on the matrix scores.
588 #if SEQAN_ENABLE_DEBUG
589 {
590 for (unsigned i = 0; i < valueSize<TAlphabet>(); ++i)
591 for (unsigned j = 0; j <= i; ++j)
592 SEQAN_ASSERT_GEQ_MSG(score(scoringScheme, TAlphabet(i), TAlphabet(j)), minErrScore,
593 "Mismatch score too small!, i = %u, j = %u");
594 }
595 #else
596 (void)scoringScheme;
597 (void)minErrScore;
598 #endif // #if SEQAN_ENABLE_DEBUG
599 }
600
601 // In the case of a SimpleScore, however, we can set this.
602 template <typename TScoreValue, typename TAlphabet>
603 void
_extendSeedGappedXDropOneDirectionLimitScoreMismatch(Score<TScoreValue,Simple> & scoringScheme,TScoreValue minErrScore,TAlphabet *)604 _extendSeedGappedXDropOneDirectionLimitScoreMismatch(Score<TScoreValue, Simple> & scoringScheme,
605 TScoreValue minErrScore,
606 TAlphabet * /*tag*/)
607 {
608 setScoreMismatch(scoringScheme, std::max(scoreMismatch(scoringScheme), minErrScore));
609 }
610
611 template<typename TConfig, typename TQuerySegment, typename TDatabaseSegment, typename TScoreValue, typename TScoreSpec>
612 TScoreValue
_extendSeedGappedXDropOneDirection(Seed<Simple,TConfig> & seed,TQuerySegment const & querySeg,TDatabaseSegment const & databaseSeg,ExtensionDirection direction,Score<TScoreValue,TScoreSpec> scoringScheme,TScoreValue scoreDropOff)613 _extendSeedGappedXDropOneDirection(
614 Seed<Simple, TConfig> & seed,
615 TQuerySegment const & querySeg,
616 TDatabaseSegment const & databaseSeg,
617 ExtensionDirection direction,
618 Score<TScoreValue, TScoreSpec> scoringScheme,
619 TScoreValue scoreDropOff)
620 {
621 typedef typename Size<TQuerySegment>::Type TSize;
622 typedef typename Seed<Simple,TConfig>::TDiagonal TDiagonal;
623
624 TSize cols = length(querySeg)+1;
625 TSize rows = length(databaseSeg)+1;
626 if (rows == 1 || cols == 1)
627 return 0;
628
629 TScoreValue len = 2 * _max(cols, rows); // number of antidiagonals
630 TScoreValue const minErrScore = minValue<TScoreValue>() / len; // minimal allowed error penalty
631 setScoreGap(scoringScheme, _max(scoreGap(scoringScheme), minErrScore));
632 typename Value<TQuerySegment>::Type * tag = 0;
633 (void)tag;
634 _extendSeedGappedXDropOneDirectionLimitScoreMismatch(scoringScheme, minErrScore, tag);
635
636 TScoreValue gapCost = scoreGap(scoringScheme);
637 TScoreValue undefined = minValue<TScoreValue>() - gapCost;
638
639 // DP matrix is calculated by anti-diagonals
640 String<TScoreValue> antiDiag1; //smallest anti-diagonal
641 String<TScoreValue> antiDiag2;
642 String<TScoreValue> antiDiag3; //current anti-diagonal
643
644 // Indices on anti-diagonals include gap column/gap row:
645 // - decrease indices by 1 for position in query/database segment
646 // - first calculated entry is on anti-diagonal n\B0 2
647
648 TSize minCol = 1;
649 TSize maxCol = 2;
650
651 TSize offset1 = 0; // number of leading columns that need not be calculated in antiDiag1
652 TSize offset2 = 0; // in antiDiag2
653 TSize offset3 = 0; // in antiDiag3
654
655 _initAntiDiags(antiDiag1, antiDiag2, antiDiag3, scoreDropOff, gapCost, undefined);
656 TSize antiDiagNo = 1; // the currently calculated anti-diagonal
657
658 TScoreValue best = 0; // maximal score value in the DP matrix (for drop-off calculation)
659
660 TDiagonal lowerDiag = 0;
661 TDiagonal upperDiag = 0;
662
663 while (minCol < maxCol)
664 {
665 ++antiDiagNo;
666 _swapAntiDiags(antiDiag1, antiDiag2, antiDiag3);
667 offset1 = offset2;
668 offset2 = offset3;
669 offset3 = minCol-1;
670 _initAntiDiag3(antiDiag3, offset3, maxCol, antiDiagNo, best - scoreDropOff, gapCost, undefined);
671
672 TScoreValue antiDiagBest = antiDiagNo * gapCost;
673 for (TSize col = minCol; col < maxCol; ++col) {
674 // indices on anti-diagonals
675 TSize i3 = col - offset3;
676 TSize i2 = col - offset2;
677 TSize i1 = col - offset1;
678
679 // indices in query and database segments
680 TSize queryPos, dbPos;
681 if (direction == EXTEND_RIGHT)
682 {
683 queryPos = col - 1;
684 dbPos = antiDiagNo - col - 1;
685 }
686 else // direction == EXTEND_LEFT
687 {
688 queryPos = cols - 1 - col;
689 dbPos = rows - 1 + col - antiDiagNo;
690 }
691
692 // Calculate matrix entry (-> antiDiag3[col])
693 TScoreValue tmp = _max(antiDiag2[i2-1], antiDiag2[i2]) + gapCost;
694 tmp = _max(tmp, antiDiag1[i1 - 1] + score(scoringScheme, sequenceEntryForScore(scoringScheme, querySeg, queryPos),
695 sequenceEntryForScore(scoringScheme, databaseSeg, dbPos)));
696 if (tmp < best - scoreDropOff)
697 {
698 antiDiag3[i3] = undefined;
699 }
700 else
701 {
702 antiDiag3[i3] = tmp;
703 antiDiagBest = _max(antiDiagBest, tmp);
704 }
705 }
706 best = _max(best, antiDiagBest);
707
708 // Calculate new minCol and minCol
709 while (minCol - offset3 < length(antiDiag3) && antiDiag3[minCol - offset3] == undefined &&
710 minCol - offset2 - 1 < length(antiDiag2) && antiDiag2[minCol - offset2 - 1] == undefined)
711 {
712 ++minCol;
713 }
714
715 // Calculate new maxCol
716 while (maxCol - offset3 > 0 && (antiDiag3[maxCol - offset3 - 1] == undefined) &&
717 (antiDiag2[maxCol - offset2 - 1] == undefined))
718 {
719 --maxCol;
720 }
721 ++maxCol;
722
723 // Calculate new lowerDiag and upperDiag of extended seed
724 _calcExtendedLowerDiag(lowerDiag, minCol, antiDiagNo);
725 _calcExtendedUpperDiag(upperDiag, maxCol - 1, antiDiagNo);
726
727 // end of databaseSeg reached?
728 minCol = _max((int)minCol, (int)antiDiagNo + 2 - (int)rows);
729 // end of querySeg reached?
730 maxCol = _min(maxCol, cols);
731 }
732
733 // find positions of longest extension
734
735 // reached ends of both segments
736 TSize longestExtensionCol = length(antiDiag3) + offset3 - 2;
737 TSize longestExtensionRow = antiDiagNo - longestExtensionCol;
738 TScoreValue longestExtensionScore = antiDiag3[longestExtensionCol - offset3];
739
740 if (longestExtensionScore == undefined)
741 {
742 if (antiDiag2[length(antiDiag2)-2] != undefined)
743 {
744 // reached end of query segment
745 longestExtensionCol = length(antiDiag2) + offset2 - 2;
746 longestExtensionRow = antiDiagNo - 1 - longestExtensionCol;
747 longestExtensionScore = antiDiag2[longestExtensionCol - offset2];
748 }
749 else if (length(antiDiag2) > 2 && antiDiag2[length(antiDiag2)-3] != undefined)
750 {
751 // reached end of database segment
752 longestExtensionCol = length(antiDiag2) + offset2 - 3;
753 longestExtensionRow = antiDiagNo - 1 - longestExtensionCol;
754 longestExtensionScore = antiDiag2[longestExtensionCol - offset2];
755 }
756 }
757
758 if (longestExtensionScore == undefined)
759 {
760 // general case
761 for (TSize i = 0; i < length(antiDiag1); ++i)
762 {
763 if (antiDiag1[i] > longestExtensionScore)
764 {
765 longestExtensionScore = antiDiag1[i];
766 longestExtensionCol = i + offset1;
767 longestExtensionRow = antiDiagNo - 2 - longestExtensionCol;
768 }
769 }
770 }
771
772 // update seed
773 if (longestExtensionScore != undefined)
774 _updateExtendedSeed(seed, direction, longestExtensionCol, longestExtensionRow, lowerDiag, upperDiag);
775 return longestExtensionScore;
776 }
777
778 template <typename TConfig, typename TDatabase, typename TQuery, typename TScoreValue, typename TScoreSpec>
779 inline void
extendSeed(Seed<Simple,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,Score<TScoreValue,TScoreSpec> const & scoringScheme,TScoreValue scoreDropOff,GappedXDrop const &)780 extendSeed(Seed<Simple, TConfig> & seed,
781 TDatabase const & database,
782 TQuery const & query,
783 ExtensionDirection direction,
784 Score<TScoreValue, TScoreSpec> const & scoringScheme,
785 TScoreValue scoreDropOff,
786 GappedXDrop const &)
787 {
788 // For gapped X-drop extension of Simple Seeds, we can simply
789 // update the begin and end values in each dimension as well as the diagonals.
790
791 // The algorithm only works for linear gap scores < 0, mismatch scores < 0
792 // and match scores > 0.
793 // TODO(holtgrew): We could introduce such check functions for score matrices.
794 // TODO(holtgrew): Originally, this function only worked for simple scoring schemes, does the algorithm also work correctly for BLOSUM62? This matrix contains zeroes. Also see [10729].
795 // SEQAN_ASSERT_GT(scoreMatch(scoringScheme), 0);
796 // SEQAN_ASSERT_LT(scoreMismatch(scoringScheme), 0);
797 SEQAN_ASSERT_LT(scoreGapOpen(scoringScheme), 0);
798 SEQAN_ASSERT_LT(scoreGapExtend(scoringScheme), 0);
799 SEQAN_ASSERT_EQ(scoreGapExtend(scoringScheme), scoreGapOpen(scoringScheme));
800
801 if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
802 {
803 // Do not extend to the left if we are already at the beginning of an
804 // infix or the sequence itself.
805
806 typedef typename Prefix<TDatabase const>::Type TDatabasePrefix;
807 typedef typename Prefix<TQuery const>::Type TQueryPrefix;
808
809 TDatabasePrefix databasePrefix = prefix(database, beginPositionH(seed));
810 TQueryPrefix queryPrefix = prefix(query, beginPositionV(seed));
811 // TODO(holtgrew): Update _extendSeedGappedXDropOneDirection and switch query/database order.
812 _extendSeedGappedXDropOneDirection(seed, queryPrefix, databasePrefix, EXTEND_LEFT, scoringScheme, scoreDropOff);
813 }
814
815 if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
816 {
817 // Do not extend to the right if we are already at the beginning of an
818 // infix or the sequence itself.
819
820 typedef typename Suffix<TDatabase const>::Type TDatabaseSuffix;
821 typedef typename Suffix<TQuery const>::Type TQuerySuffix;
822
823 TDatabaseSuffix databaseSuffix = suffix(database, endPositionH(seed));
824 TQuerySuffix querySuffix = suffix(query, endPositionV(seed));
825 // std::cout << "database = " << database << std::endl;
826 // std::cout << "database Suffix = " << databaseSuffix << std::endl;
827 // std::cout << "query = " << query << std::endl;
828 // std::cout << "query Suffix = " << querySuffix << std::endl;
829 // TODO(holtgrew): Update _extendSeedGappedXDropOneDirection and switch query/database order.
830 _extendSeedGappedXDropOneDirection(seed, querySuffix, databaseSuffix, EXTEND_RIGHT, scoringScheme, scoreDropOff);
831 }
832
833 // TODO(holtgrew): Update seed's score?!
834 }
835
836
837 template <typename TConfig, typename TDatabase, typename TQuery, typename TScoreValue, typename TScoreSpec>
838 inline void
extendSeed(Seed<ChainedSeed,TConfig> &,TDatabase const &,TQuery const &,ExtensionDirection,Score<TScoreValue,TScoreSpec> const &,TScoreValue,GappedXDrop const &)839 extendSeed(Seed<ChainedSeed, TConfig> & /*seed*/,
840 TDatabase const & /*database*/,
841 TQuery const & /*query*/,
842 ExtensionDirection /*direction*/,
843 Score<TScoreValue, TScoreSpec> const & /*scoringScheme*/,
844 TScoreValue /*scoreDropOff*/,
845 GappedXDrop const &)
846 {
847 // For ungapped X-drop extension of Chained Seeds, we have to append
848 // diagonals to the front and end of the list of seed diagonals and modify
849 // the first and last one of the current set of seed diagonals.
850 SEQAN_CHECKPOINT;
851
852 SEQAN_ASSERT_FAIL("Write me! Look into the function where this assertion fails for instructions on how to do this.");
853 // TODO(holtgrew): Implement gapped X-drop extension with Chained seeds. As follows:
854 //
855 // Create a simple seed, copy over from chained seed. Then,
856 // performed gapped x-drop extension on the simple seed. Perform
857 // banded alignment on the left and right extended parts. Use the
858 // internal functions for this instead of the user-level functions
859 // to initialize, fill the matrix, and compute the traceback
860 // object. Construct the correct SeedDiagonal objects from the
861 // traceback objects and add them to the list of diagonals for the
862 // diagonal seed.
863 //
864 // An alternative implementation with storing the banded extension
865 // matrix would be too much work and it is questionable if this
866 // was faster. The banded seed alignment code from Tobias Rausch
867 // is very optimized.
868
869 // TODO(holtgrew): Update seed's score?!
870 }
871
872 } // namespace seqan
873
874 #endif // SEQAN_SEEDS_SEEDS_EXTENSION_H_
875