1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 //
34 // Based on the code by Carsten Kemena <carsten.kemena@crg.es>, debugged
35 // by Birte Kehr <birte.kehr@fu-berlin.de>.
36 // ==========================================================================
37 // Seed extension algorithms.
38 //
39 // The approach for gapped X-drop extension is based on the algorithm in
40 // Figure 2 from (Zhang et al., 2000).
41 //
42 //  Zhang Z, Schwartz S, Wagner L, Miller W.  A greedy algorithm for aligning
43 //  DNA sequences.  Journal of computational biologyA a journal of
44 //  computational molecular cell biology.  2000;7(1-2):203-14.
45 //  doi:10.1089/10665270050081478
46 // ==========================================================================
47 
48 #ifndef SEQAN_SEEDS_SEEDS_EXTENSION_H_
49 #define SEQAN_SEEDS_SEEDS_EXTENSION_H_
50 
51 namespace seqan {
52 
53 // ===========================================================================
54 // Enums, Tags, Classes, Specializations
55 // ===========================================================================
56 
57 // ---------------------------------------------------------------------------
58 // Tags Seed Extension
59 // ---------------------------------------------------------------------------
60 
61 /*!
62  * @defgroup SeedExtensionTags
63  * @brief Tags for selecting seed extension algorithm.
64  *
65  * @see Seed#extendSeed
66  *
67  * @tag SeedExtensionTags#MatchExtend
68  * @headerfile <seqan/seeds.h>
69  * @brief Extends a seed until a mismatch occurs.
70  *
71  * @signature typedef Tag<MatchExtend_> const MatchExtend;
72  *
73  * @tag SeedExtensionTags#UnGappedXDrop
74  * @headerfile <seqan/seeds.h>
75  * @brief Ungapped extension of a seed until score drops below a given value.
76  *
77  * @signature typedef Tag<UngappedXDrop_> const UnGappedXDrop;
78  *
79  * @tag SeedExtensionTags#GappedXDrop
80  * @headerfile <seqan/seeds.h>
81  * @brief Gapped extension of a seed until score drops below a given value; only works for SimpleSeed.
82  *
83  * @signature typedef Tag<GappedXDrop_> const GappedXDrop;
84  */
85 
86 struct MatchExtend_;
87 typedef Tag<MatchExtend_> const MatchExtend;
88 
89 struct UngappedXDrop_;
90 typedef Tag<UngappedXDrop_> const UnGappedXDrop;
91 
92 struct GappedXDrop_;
93 typedef Tag<GappedXDrop_> const GappedXDrop;
94 
95 // ---------------------------------------------------------------------------
96 // Enum ExtensionDirection
97 // ---------------------------------------------------------------------------
98 
99 // TODO(holtgrew): Put into class?
100 
101 /*!
102  * @enum ExtensionDirection
103  * @headerfile <seqan/seeds.h>
104  * @brief Direction for seed extension.
105  *
106  * @signature enum ExtensionDirection;
107  *
108  * @see Seed#extendSeed
109  *
110  * @val ExtensionDirection EXTEND_NONE = 0;
111  * @brief Perform no extension.
112  *
113  * @val ExtensionDirection EXTEND_LEFT = 1;
114  * @brief Perform extension towards the left.
115  *
116  * @val ExtensionDirection EXTEND_RIGHT = 2;
117  * @brief Perform extension towards the right.
118  *
119  * @val ExtensionDirection EXTEND_BOTH = 3;
120  * @brief Perform extension to both directions.
121  */
122 
123 enum ExtensionDirection
124 {
125     EXTEND_NONE  = 0,
126     EXTEND_LEFT  = 1,
127     EXTEND_RIGHT = 2,
128     EXTEND_BOTH  = 3
129 };
130 
131 // ===========================================================================
132 // Metafunctions
133 // ===========================================================================
134 
135 // ===========================================================================
136 // Functions
137 // ===========================================================================
138 
139 // ---------------------------------------------------------------------------
140 // Function extendSeed                                           [MatchExtend]
141 // ---------------------------------------------------------------------------
142 
143 /*!
144  * @fn Seed#extendSeed
145  * @headerfile <seqan/seeds.h>
146  * @brief Extends a seed.
147  *
148  * @signature void extendSeed(seed, database, query, direction, MatchExtend);
149  * @signature void extendSeed(seed, database, query, direction, scoringScheme, scoreDropOff, xDropTag);
150  *
151  * @param[in,out] seed          The Seed to extend.
152  * @param[in]     database      The database (horizontal) @link ContainerConcept sequence @endlink.
153  * @param[in]     query         The query (vertical) @link ContainerConcept sequence @endlink.
154  * @param[in]     direction     The extension direction.  Type: @link ExtensionDirection @endlink.
155  * @param[in]     scoringScheme The @link Score @endlink object to use for scoring alignments and gaps.
156  * @param[in]     scoreDropOff  The score drop after which the extension should stop.  The extension stops if this
157  *                              value is exceeded.  Only given for when using an x-drop algorithm.
158  * @param[in]     xDropTag      Tag for selecting x-drop method, one of <tt>UnGappedXDrop</tt> and
159  *                              <tt>GappedXDrop</tt>.
160  *
161  * You can use the tags, <tt>MatchExtend</tt>, <tt>UnGappedXDrop</tt>, and <tt>GappedXDrop</tt>.
162  *
163  * Note that the diagonals updated in <tt>seed</tt> do not necessarily reflect the diagonals for the optimal extension
164  * but the diagonals used in all traces of the extension.  However, they are guaranteed to include the optimal
165  * extension's trace.
166  *
167  * @section Examples
168  *
169  * The documentation of the class @link Seed @endlink contains an example for
170  * seed extension.
171  *
172  * @see ExtensionDirection
173  * @see SeedExtensionTags
174  */
175 
176 // We need one specialization for each combination of the extension variants and seeds.  It is not worth to extract the
177 // common parts for simple and chained seeds.
178 
179 template <typename TConfig, typename TDatabase, typename TQuery>
180 inline void
extendSeed(Seed<Simple,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,MatchExtend const &)181 extendSeed(Seed<Simple, TConfig> & seed,
182            TDatabase const & database,
183            TQuery const & query,
184            ExtensionDirection direction,
185            MatchExtend const &)
186 {
187     // For match extension of Simple Seeds, we can simply update the begin and end values in each dimension.
188     typedef Seed<Simple, TConfig> TSeed;
189     typedef typename Position<TSeed>::Type TPosition;
190     typedef typename Size<TSeed>::Type TSize;
191 
192     // Extension to the left
193     if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
194     {
195         TPosition posH = beginPositionH(seed);
196         TPosition posV = beginPositionV(seed);
197         while (posH >= 1 && posV >= 1 && database[posH - 1] == query[posV - 1])
198         {
199             --posH;
200             --posV;
201         }
202         setBeginPositionH(seed, posH);
203         setBeginPositionV(seed, posV);
204     }
205 
206     // Extension to the right
207     if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
208     {
209         TSize lengthH = length(database);
210         TSize lengthV = length(query);
211         TPosition posH = endPositionH(seed);
212         TPosition posV = endPositionV(seed);
213         while (posH < lengthH && posV < lengthV && database[posH] == query[posV])
214         {
215             ++posH;
216             ++posV;
217         }
218         setEndPositionH(seed, posH);
219         setEndPositionV(seed, posV);
220     }
221 }
222 
223 
224 template <typename TConfig, typename TDatabase, typename TQuery>
225 inline void
extendSeed(Seed<ChainedSeed,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,MatchExtend const &)226 extendSeed(Seed<ChainedSeed, TConfig> & seed,
227            TDatabase const & database,
228            TQuery const & query,
229            ExtensionDirection direction,
230            MatchExtend const &)
231 {
232     // For match extension of Chained Seeds, we extend the first and the last Seed Diagonal.
233     SEQAN_ASSERT_GT(length(seed), 0u);
234 
235     typedef Seed<ChainedSeed, TConfig> TSeed;
236     typedef typename Value<TSeed>::Type TSeedDiagonal;
237     typedef typename Position<TSeedDiagonal>::Type TPosition;
238     typedef typename Size<TSeedDiagonal>::Type TSize;
239 
240     // Extension to the left
241     if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
242     {
243         TSeedDiagonal & diag = front(seed);
244         TPosition posH = diag.beginPositionH;
245         TPosition posV = diag.beginPositionV;
246         TSize diagonalLength = diag.length;
247         while (posH >= 1 && posV >= 1 && database[posH - 1] == query[posV - 1])
248         {
249             --posH;
250             --posV;
251             ++diagonalLength;
252         }
253         diag.beginPositionH = posH;
254         diag.beginPositionV = posV;
255         diag.length = diagonalLength;
256     }
257 
258     // Extension to the right
259     if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
260     {
261         TSize lengthH = length(database);
262         TSize lengthV = length(query);
263         TSeedDiagonal & diag = back(seed);
264         TPosition posH = diag.beginPositionH + diag.length;
265         TPosition posV = diag.beginPositionV + diag.length;
266         TSize diagonalLength = diag.length;
267         while (posH < lengthH && posV < lengthV && database[posH] == query[posV])
268         {
269             ++posH;
270             ++posV;
271             ++diagonalLength;
272         }
273         diag.length = diagonalLength;
274     }
275 }
276 
277 // ---------------------------------------------------------------------------
278 // Function extendSeed                                         [UnGappedXDrop]
279 // ---------------------------------------------------------------------------
280 
281 template <typename TConfig, typename TDatabase, typename TQuery, typename TScoreValue, typename TScoreSpec>
282 inline void
extendSeed(Seed<Simple,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,Score<TScoreValue,TScoreSpec> const & scoringScheme,TScoreValue scoreDropOff,UnGappedXDrop const &)283 extendSeed(Seed<Simple, TConfig> & seed,
284            TDatabase const & database,
285            TQuery const & query,
286            ExtensionDirection direction,
287            Score<TScoreValue, TScoreSpec> const & scoringScheme,
288            TScoreValue scoreDropOff,
289            UnGappedXDrop const &)
290 {
291     // For ungapped X-drop extension of Simple Seeds, we can simply
292     // update the begin and end values in each dimension.
293     scoreDropOff = -scoreDropOff;
294 
295     typedef Seed<ChainedSeed, TConfig> TSeed;
296     typedef typename Position<TSeed>::Type TPosition;
297     typedef typename Size<TSeed>::Type TSize;
298 
299     // Extension to the left
300     if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
301     {
302         TScoreValue tmpScore = 0;
303         TPosition posH = beginPositionH(seed);
304         TPosition posV = beginPositionV(seed);
305         TPosition mismatchingSuffixLength = 0;
306         while (posH >= 1 && posV >= 1 && tmpScore > scoreDropOff)
307         {
308             tmpScore += score(scoringScheme, sequenceEntryForScore(scoringScheme, database, posH),
309                               sequenceEntryForScore(scoringScheme, query, posV));
310             if (database[posH - 1] == query[posV - 1])
311             {
312                 mismatchingSuffixLength = 0;
313                 if (tmpScore > static_cast<TScoreValue>(0))
314                     tmpScore = 0;
315             }
316             else
317             {
318                 mismatchingSuffixLength += 1;
319             }
320             --posH;
321             --posV;
322         }
323         setBeginPositionH(seed, posH + mismatchingSuffixLength);
324         setBeginPositionV(seed, posV + mismatchingSuffixLength);
325     }
326 
327     // Extension to the right
328     if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
329     {
330         TScoreValue tmpScore = 0;
331         TSize lengthH = length(database);
332         TSize lengthV = length(query);
333         TPosition posH = endPositionH(seed);
334         TPosition posV = endPositionV(seed);
335         TPosition mismatchingSuffixLength = 0;
336         while (posH < lengthH && posV < lengthV && tmpScore > scoreDropOff)
337         {
338             tmpScore += score(scoringScheme, sequenceEntryForScore(scoringScheme, database, posH),
339                               sequenceEntryForScore(scoringScheme, query, posV));
340             if (database[posH] == query[posV])
341             {
342                 mismatchingSuffixLength = 0;
343                 if (tmpScore > static_cast<TScoreValue>(0))
344                     tmpScore = 0;
345             }
346             else
347             {
348                 mismatchingSuffixLength += 1;
349             }
350             ++posH;
351             ++posV;
352         }
353         setEndPositionH(seed, posH - mismatchingSuffixLength);
354         setEndPositionV(seed, posV - mismatchingSuffixLength);
355     }
356 
357     // TODO(holtgrew): Update score?!
358 }
359 
360 
361 template <typename TConfig, typename TDatabase, typename TQuery, typename TScoreValue, typename TScoreSpec>
362 inline void
extendSeed(Seed<ChainedSeed,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,Score<TScoreValue,TScoreSpec> const & scoringScheme,TScoreValue scoreDropOff,UnGappedXDrop const &)363 extendSeed(Seed<ChainedSeed, TConfig> & seed,
364            TDatabase const & database,
365            TQuery const & query,
366            ExtensionDirection direction,
367            Score<TScoreValue, TScoreSpec> const & scoringScheme,
368            TScoreValue scoreDropOff,
369            UnGappedXDrop const &)
370 {
371     // For ungapped X-drop extension of Chained Seeds, we extend the
372     // first and the last Seed Diagonal.
373     scoreDropOff = -scoreDropOff;
374 
375     typedef Seed<ChainedSeed, TConfig> TSeed;
376     typedef typename Value<TSeed>::Type TSeedDiagonal;
377     typedef typename Position<TSeedDiagonal>::Type TPosition;
378     typedef typename Size<TSeedDiagonal>::Type TSize;
379 
380     // Extension to the left
381     if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
382     {
383         TScoreValue tmpScore = 0;
384         TPosition mismatchingSuffixLength = 0;
385         TSeedDiagonal & diag = front(seed);
386         TPosition posH = beginPositionH(seed);
387         TPosition posV = beginPositionV(seed);
388         TSize diagonalLength = diag.length;
389         while (posH >= 1 && posV >= 1 && tmpScore > scoreDropOff)
390         {
391             tmpScore += score(scoringScheme, sequenceEntryForScore(scoringScheme, database, posH),
392                               sequenceEntryForScore(scoringScheme, query, posV));
393             if (database[posH - 1] == query[posV - 1])
394             {
395                 mismatchingSuffixLength = 0;
396                 if (tmpScore > static_cast<TScoreValue>(0))
397                     tmpScore = 0;
398             }
399             else
400             {
401                 mismatchingSuffixLength += 1;
402             }
403             --posH;
404             --posV;
405             ++diagonalLength;
406         }
407         diag.beginPositionH = posH + mismatchingSuffixLength;
408         diag.beginPositionV = posV + mismatchingSuffixLength;
409         diag.length = diagonalLength - mismatchingSuffixLength;
410     }
411 
412     // Extension to the right
413     if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
414     {
415         TScoreValue tmpScore = 0;
416         TPosition mismatchingSuffixLength = 0;
417         TSize lengthH = length(query);
418         TSize lengthV = length(database);
419         TSeedDiagonal & diag = back(seed);
420         TPosition posH = diag.beginPositionH + diag.length;
421         TPosition posV = diag.beginPositionV + diag.length;
422         TSize diagonalLength = diag.length;
423         while (posH < lengthH && posV < lengthV && tmpScore > scoreDropOff)
424         {
425             tmpScore += score(scoringScheme, sequenceEntryForScore(scoringScheme, database, posH),
426                               sequenceEntryForScore(scoringScheme, query, posV));
427             if (database[posH] == query[posV])
428             {
429                 mismatchingSuffixLength = 0;
430                 if (tmpScore > static_cast<TScoreValue>(0))
431                     tmpScore = 0;
432             }
433             else
434             {
435                 mismatchingSuffixLength += 1;
436             }
437             ++posH;
438             ++posV;
439             ++diagonalLength;
440         }
441         diag.length = diagonalLength - mismatchingSuffixLength;
442     }
443 
444     // TODO(holtgrew): Update score?!
445 }
446 
447 // ---------------------------------------------------------------------------
448 // Function extendSeed                                           [GappedXDrop]
449 // ---------------------------------------------------------------------------
450 
451 template<typename TAntiDiag, typename TDropOff, typename TScoreValue>
452 inline void
_initAntiDiags(TAntiDiag &,TAntiDiag & antiDiag2,TAntiDiag & antiDiag3,TDropOff dropOff,TScoreValue gapCost,TScoreValue undefined)453 _initAntiDiags(TAntiDiag & ,
454                TAntiDiag & antiDiag2,
455                TAntiDiag & antiDiag3,
456                TDropOff dropOff,
457                TScoreValue gapCost,
458                TScoreValue undefined)
459 {
460     // antiDiagonals will be swaped in while loop BEFORE computation of antiDiag3 entries
461     //  -> no initialization of antiDiag1 necessary
462 
463     resize(antiDiag2, 1);
464     antiDiag2[0] = 0;
465 
466     resize(antiDiag3, 2);
467     if (-gapCost > dropOff)
468     {
469         antiDiag3[0] = undefined;
470         antiDiag3[1] = undefined;
471     }
472     else
473     {
474         antiDiag3[0] = gapCost;
475         antiDiag3[1] = gapCost;
476     }
477 }
478 
479 template<typename TAntiDiag>
480 inline void
_swapAntiDiags(TAntiDiag & antiDiag1,TAntiDiag & antiDiag2,TAntiDiag & antiDiag3)481 _swapAntiDiags(TAntiDiag & antiDiag1,
482                TAntiDiag & antiDiag2,
483                TAntiDiag & antiDiag3)
484 {
485     TAntiDiag temp;
486     move(temp, antiDiag1);
487     move(antiDiag1, antiDiag2);
488     move(antiDiag2, antiDiag3);
489     move(antiDiag3, temp);
490 }
491 
492 template<typename TAntiDiag, typename TSize, typename TScoreValue>
493 inline TSize
_initAntiDiag3(TAntiDiag & antiDiag3,TSize offset,TSize maxCol,TSize antiDiagNo,TScoreValue minScore,TScoreValue gapCost,TScoreValue undefined)494 _initAntiDiag3(TAntiDiag & antiDiag3,
495                TSize offset,
496                TSize maxCol,
497                TSize antiDiagNo,
498                TScoreValue minScore,
499                TScoreValue gapCost,
500                TScoreValue undefined)
501 {
502     resize(antiDiag3, maxCol + 1 - offset);
503 
504     antiDiag3[0] = undefined;
505     antiDiag3[maxCol - offset] = undefined;
506 
507     if ((int)antiDiagNo * gapCost > minScore)
508     {
509         if (offset == 0) // init first column
510             antiDiag3[0] = antiDiagNo * gapCost;
511         if (antiDiagNo - maxCol == 0) // init first row
512             antiDiag3[maxCol - offset] = antiDiagNo * gapCost;
513     }
514     return offset;
515 }
516 
517 template<typename TDiagonal, typename TSize>
518 inline void
_calcExtendedLowerDiag(TDiagonal & lowerDiag,TSize minCol,TSize antiDiagNo)519 _calcExtendedLowerDiag(TDiagonal & lowerDiag,
520                        TSize minCol,
521                        TSize antiDiagNo)
522 {
523     TSize minRow = antiDiagNo - minCol;
524     if ((TDiagonal)minCol - (TDiagonal)minRow < lowerDiag)
525         lowerDiag = (TDiagonal)minCol - (TDiagonal)minRow;
526 }
527 
528 template<typename TDiagonal, typename TSize>
529 inline void
_calcExtendedUpperDiag(TDiagonal & upperDiag,TSize maxCol,TSize antiDiagNo)530 _calcExtendedUpperDiag(TDiagonal & upperDiag,
531                        TSize maxCol,
532                        TSize antiDiagNo)
533 {
534     TSize maxRow = antiDiagNo + 1 - maxCol;
535     if ((TDiagonal)maxCol - 1 - (TDiagonal)maxRow > upperDiag)
536         upperDiag = maxCol - 1 - maxRow;
537 }
538 
539 template<typename TSeed, typename TSize, typename TDiagonal>
540 inline void
_updateExtendedSeed(TSeed & seed,ExtensionDirection direction,TSize cols,TSize rows,TDiagonal lowerDiag,TDiagonal upperDiag)541 _updateExtendedSeed(TSeed & seed,
542                     ExtensionDirection direction,
543                     TSize cols,
544                     TSize rows,
545                     TDiagonal lowerDiag,
546                     TDiagonal upperDiag)
547 {
548     if (direction == EXTEND_LEFT)
549     {
550         // Set lower and upper diagonals.
551         TDiagonal beginDiag = beginDiagonal(seed);
552         if (lowerDiagonal(seed) > beginDiag + lowerDiag)
553             setLowerDiagonal(seed, beginDiag + lowerDiag);
554         if (upperDiagonal(seed) < beginDiag + upperDiag)
555             setUpperDiagonal(seed, beginDiag + upperDiag);
556 
557         // Set new start position of seed.
558         setBeginPositionH(seed, beginPositionH(seed) - rows);
559         setBeginPositionV(seed, beginPositionV(seed) - cols);
560     } else {  // direction == EXTEND_RIGHT
561         // Set new lower and upper diagonals.
562         TDiagonal endDiag = endDiagonal(seed);
563         if (upperDiagonal(seed) < endDiag - lowerDiag)
564             setUpperDiagonal(seed, endDiag - lowerDiag);
565         if (lowerDiagonal(seed) > endDiag - upperDiag)
566             setLowerDiagonal(seed, endDiag - upperDiag);
567 
568         // Set new end position of seed.
569         setEndPositionH(seed, endPositionH(seed) + rows);
570         setEndPositionV(seed, endPositionV(seed) + cols);
571     }
572     SEQAN_ASSERT_GEQ(upperDiagonal(seed), lowerDiagonal(seed));
573     SEQAN_ASSERT_GEQ(upperDiagonal(seed), beginDiagonal(seed));
574     SEQAN_ASSERT_GEQ(upperDiagonal(seed), endDiagonal(seed));
575     SEQAN_ASSERT_GEQ(beginDiagonal(seed), lowerDiagonal(seed));
576     SEQAN_ASSERT_GEQ(endDiagonal(seed), lowerDiagonal(seed));
577 }
578 
579 // Limit score;  In the general case we cannot do this so we simply perform a check on the score mismatch values.
580 template <typename TScoreValue, typename TScoreSpec, typename TAlphabet>
581 void
_extendSeedGappedXDropOneDirectionLimitScoreMismatch(Score<TScoreValue,TScoreSpec> & scoringScheme,TScoreValue minErrScore,TAlphabet *)582 _extendSeedGappedXDropOneDirectionLimitScoreMismatch(Score<TScoreValue, TScoreSpec> & scoringScheme,
583                                                      TScoreValue minErrScore,
584                                                      TAlphabet * /*tag*/)
585 {
586     // We cannot set a lower limit for the mismatch score since the score might be a scoring matrix such as Blosum62.
587     // Instead, we perform a check on the matrix scores.
588 #if SEQAN_ENABLE_DEBUG
589     {
590         for (unsigned i = 0; i < valueSize<TAlphabet>(); ++i)
591             for (unsigned j = 0; j <= i; ++j)
592                 SEQAN_ASSERT_GEQ_MSG(score(scoringScheme, TAlphabet(i), TAlphabet(j)), minErrScore,
593                                      "Mismatch score too small!, i = %u, j = %u");
594     }
595 #else
596     (void)scoringScheme;
597     (void)minErrScore;
598 #endif  // #if SEQAN_ENABLE_DEBUG
599 }
600 
601 // In the case of a SimpleScore, however, we can set this.
602 template <typename TScoreValue, typename TAlphabet>
603 void
_extendSeedGappedXDropOneDirectionLimitScoreMismatch(Score<TScoreValue,Simple> & scoringScheme,TScoreValue minErrScore,TAlphabet *)604 _extendSeedGappedXDropOneDirectionLimitScoreMismatch(Score<TScoreValue, Simple> & scoringScheme,
605                                                      TScoreValue minErrScore,
606                                                      TAlphabet * /*tag*/)
607 {
608     setScoreMismatch(scoringScheme, std::max(scoreMismatch(scoringScheme), minErrScore));
609 }
610 
611 template<typename TConfig, typename TQuerySegment, typename TDatabaseSegment, typename TScoreValue, typename TScoreSpec>
612 TScoreValue
_extendSeedGappedXDropOneDirection(Seed<Simple,TConfig> & seed,TQuerySegment const & querySeg,TDatabaseSegment const & databaseSeg,ExtensionDirection direction,Score<TScoreValue,TScoreSpec> scoringScheme,TScoreValue scoreDropOff)613 _extendSeedGappedXDropOneDirection(
614         Seed<Simple, TConfig> & seed,
615         TQuerySegment const & querySeg,
616         TDatabaseSegment const & databaseSeg,
617         ExtensionDirection direction,
618         Score<TScoreValue, TScoreSpec> scoringScheme,
619         TScoreValue scoreDropOff)
620 {
621     typedef typename Size<TQuerySegment>::Type TSize;
622     typedef typename Seed<Simple,TConfig>::TDiagonal TDiagonal;
623 
624     TSize cols = length(querySeg)+1;
625     TSize rows = length(databaseSeg)+1;
626     if (rows == 1 || cols == 1)
627         return 0;
628 
629     TScoreValue len = 2 * _max(cols, rows); // number of antidiagonals
630     TScoreValue const minErrScore = minValue<TScoreValue>() / len; // minimal allowed error penalty
631     setScoreGap(scoringScheme, _max(scoreGap(scoringScheme), minErrScore));
632     typename Value<TQuerySegment>::Type * tag = 0;
633     (void)tag;
634     _extendSeedGappedXDropOneDirectionLimitScoreMismatch(scoringScheme, minErrScore, tag);
635 
636     TScoreValue gapCost = scoreGap(scoringScheme);
637     TScoreValue undefined = minValue<TScoreValue>() - gapCost;
638 
639     // DP matrix is calculated by anti-diagonals
640     String<TScoreValue> antiDiag1;    //smallest anti-diagonal
641     String<TScoreValue> antiDiag2;
642     String<TScoreValue> antiDiag3;    //current anti-diagonal
643 
644     // Indices on anti-diagonals include gap column/gap row:
645     //   - decrease indices by 1 for position in query/database segment
646     //   - first calculated entry is on anti-diagonal n\B0 2
647 
648     TSize minCol = 1;
649     TSize maxCol = 2;
650 
651     TSize offset1 = 0; // number of leading columns that need not be calculated in antiDiag1
652     TSize offset2 = 0; //                                                       in antiDiag2
653     TSize offset3 = 0; //                                                       in antiDiag3
654 
655     _initAntiDiags(antiDiag1, antiDiag2, antiDiag3, scoreDropOff, gapCost, undefined);
656     TSize antiDiagNo = 1; // the currently calculated anti-diagonal
657 
658     TScoreValue best = 0; // maximal score value in the DP matrix (for drop-off calculation)
659 
660     TDiagonal lowerDiag = 0;
661     TDiagonal upperDiag = 0;
662 
663     while (minCol < maxCol)
664     {
665         ++antiDiagNo;
666         _swapAntiDiags(antiDiag1, antiDiag2, antiDiag3);
667         offset1 = offset2;
668         offset2 = offset3;
669         offset3 = minCol-1;
670         _initAntiDiag3(antiDiag3, offset3, maxCol, antiDiagNo, best - scoreDropOff, gapCost, undefined);
671 
672         TScoreValue antiDiagBest = antiDiagNo * gapCost;
673         for (TSize col = minCol; col < maxCol; ++col) {
674             // indices on anti-diagonals
675             TSize i3 = col - offset3;
676             TSize i2 = col - offset2;
677             TSize i1 = col - offset1;
678 
679             // indices in query and database segments
680             TSize queryPos, dbPos;
681             if (direction == EXTEND_RIGHT)
682             {
683                 queryPos = col - 1;
684                 dbPos = antiDiagNo - col - 1;
685             }
686             else // direction == EXTEND_LEFT
687             {
688                 queryPos = cols - 1 - col;
689                 dbPos = rows - 1 + col - antiDiagNo;
690             }
691 
692             // Calculate matrix entry (-> antiDiag3[col])
693             TScoreValue tmp = _max(antiDiag2[i2-1], antiDiag2[i2]) + gapCost;
694             tmp = _max(tmp, antiDiag1[i1 - 1] + score(scoringScheme, sequenceEntryForScore(scoringScheme, querySeg, queryPos),
695                                                       sequenceEntryForScore(scoringScheme, databaseSeg, dbPos)));
696             if (tmp < best - scoreDropOff)
697             {
698                 antiDiag3[i3] = undefined;
699             }
700             else
701             {
702                 antiDiag3[i3] = tmp;
703                 antiDiagBest = _max(antiDiagBest, tmp);
704             }
705         }
706         best = _max(best, antiDiagBest);
707 
708         // Calculate new minCol and minCol
709         while (minCol - offset3 < length(antiDiag3) && antiDiag3[minCol - offset3] == undefined &&
710                minCol - offset2 - 1 < length(antiDiag2) && antiDiag2[minCol - offset2 - 1] == undefined)
711         {
712             ++minCol;
713         }
714 
715         // Calculate new maxCol
716         while (maxCol - offset3 > 0 && (antiDiag3[maxCol - offset3 - 1] == undefined) &&
717                                        (antiDiag2[maxCol - offset2 - 1] == undefined))
718         {
719             --maxCol;
720         }
721         ++maxCol;
722 
723         // Calculate new lowerDiag and upperDiag of extended seed
724         _calcExtendedLowerDiag(lowerDiag, minCol, antiDiagNo);
725         _calcExtendedUpperDiag(upperDiag, maxCol - 1, antiDiagNo);
726 
727         // end of databaseSeg reached?
728         minCol = _max((int)minCol, (int)antiDiagNo + 2 - (int)rows);
729         // end of querySeg reached?
730         maxCol = _min(maxCol, cols);
731     }
732 
733     // find positions of longest extension
734 
735     // reached ends of both segments
736     TSize longestExtensionCol = length(antiDiag3) + offset3 - 2;
737     TSize longestExtensionRow = antiDiagNo - longestExtensionCol;
738     TScoreValue longestExtensionScore = antiDiag3[longestExtensionCol - offset3];
739 
740     if (longestExtensionScore == undefined)
741     {
742         if (antiDiag2[length(antiDiag2)-2] != undefined)
743         {
744             // reached end of query segment
745             longestExtensionCol = length(antiDiag2) + offset2 - 2;
746             longestExtensionRow = antiDiagNo - 1 - longestExtensionCol;
747             longestExtensionScore = antiDiag2[longestExtensionCol - offset2];
748         }
749         else if (length(antiDiag2) > 2 && antiDiag2[length(antiDiag2)-3] != undefined)
750         {
751             // reached end of database segment
752             longestExtensionCol = length(antiDiag2) + offset2 - 3;
753             longestExtensionRow = antiDiagNo - 1 - longestExtensionCol;
754             longestExtensionScore = antiDiag2[longestExtensionCol - offset2];
755         }
756     }
757 
758     if (longestExtensionScore == undefined)
759     {
760         // general case
761         for (TSize i = 0; i < length(antiDiag1); ++i)
762         {
763             if (antiDiag1[i] > longestExtensionScore)
764             {
765                 longestExtensionScore = antiDiag1[i];
766                 longestExtensionCol = i + offset1;
767                 longestExtensionRow = antiDiagNo - 2 - longestExtensionCol;
768             }
769         }
770     }
771 
772     // update seed
773     if (longestExtensionScore != undefined)
774         _updateExtendedSeed(seed, direction, longestExtensionCol, longestExtensionRow, lowerDiag, upperDiag);
775     return longestExtensionScore;
776 }
777 
778 template <typename TConfig, typename TDatabase, typename TQuery, typename TScoreValue, typename TScoreSpec>
779 inline void
extendSeed(Seed<Simple,TConfig> & seed,TDatabase const & database,TQuery const & query,ExtensionDirection direction,Score<TScoreValue,TScoreSpec> const & scoringScheme,TScoreValue scoreDropOff,GappedXDrop const &)780 extendSeed(Seed<Simple, TConfig> & seed,
781            TDatabase const & database,
782            TQuery const & query,
783            ExtensionDirection direction,
784            Score<TScoreValue, TScoreSpec> const & scoringScheme,
785            TScoreValue scoreDropOff,
786            GappedXDrop const &)
787 {
788     // For gapped X-drop extension of Simple Seeds, we can simply
789     // update the begin and end values in each dimension as well as the diagonals.
790 
791     // The algorithm only works for linear gap scores < 0, mismatch scores < 0
792     // and match scores > 0.
793     // TODO(holtgrew): We could introduce such check functions for score matrices.
794     // TODO(holtgrew): Originally, this function only worked for simple scoring schemes, does the algorithm also work correctly for BLOSUM62? This matrix contains zeroes. Also see [10729].
795     // SEQAN_ASSERT_GT(scoreMatch(scoringScheme), 0);
796     // SEQAN_ASSERT_LT(scoreMismatch(scoringScheme), 0);
797     SEQAN_ASSERT_LT(scoreGapOpen(scoringScheme), 0);
798     SEQAN_ASSERT_LT(scoreGapExtend(scoringScheme), 0);
799     SEQAN_ASSERT_EQ(scoreGapExtend(scoringScheme), scoreGapOpen(scoringScheme));
800 
801     if (direction == EXTEND_LEFT || direction == EXTEND_BOTH)
802     {
803         // Do not extend to the left if we are already at the beginning of an
804         // infix or the sequence itself.
805 
806         typedef typename Prefix<TDatabase const>::Type TDatabasePrefix;
807         typedef typename Prefix<TQuery const>::Type TQueryPrefix;
808 
809         TDatabasePrefix databasePrefix = prefix(database, beginPositionH(seed));
810         TQueryPrefix queryPrefix = prefix(query, beginPositionV(seed));
811         // TODO(holtgrew): Update _extendSeedGappedXDropOneDirection and switch query/database order.
812         _extendSeedGappedXDropOneDirection(seed, queryPrefix, databasePrefix, EXTEND_LEFT, scoringScheme, scoreDropOff);
813     }
814 
815     if (direction == EXTEND_RIGHT || direction == EXTEND_BOTH)
816     {
817         // Do not extend to the right if we are already at the beginning of an
818         // infix or the sequence itself.
819 
820         typedef typename Suffix<TDatabase const>::Type TDatabaseSuffix;
821         typedef typename Suffix<TQuery const>::Type TQuerySuffix;
822 
823         TDatabaseSuffix databaseSuffix = suffix(database, endPositionH(seed));
824         TQuerySuffix querySuffix = suffix(query, endPositionV(seed));
825         // std::cout << "database = " << database << std::endl;
826         // std::cout << "database Suffix = " << databaseSuffix << std::endl;
827         // std::cout << "query = " << query << std::endl;
828         // std::cout << "query Suffix = " << querySuffix << std::endl;
829         // TODO(holtgrew): Update _extendSeedGappedXDropOneDirection and switch query/database order.
830         _extendSeedGappedXDropOneDirection(seed, querySuffix, databaseSuffix, EXTEND_RIGHT, scoringScheme, scoreDropOff);
831     }
832 
833     // TODO(holtgrew): Update seed's score?!
834 }
835 
836 
837 template <typename TConfig, typename TDatabase, typename TQuery, typename TScoreValue, typename TScoreSpec>
838 inline void
extendSeed(Seed<ChainedSeed,TConfig> &,TDatabase const &,TQuery const &,ExtensionDirection,Score<TScoreValue,TScoreSpec> const &,TScoreValue,GappedXDrop const &)839 extendSeed(Seed<ChainedSeed, TConfig> & /*seed*/,
840            TDatabase const & /*database*/,
841            TQuery const & /*query*/,
842            ExtensionDirection /*direction*/,
843            Score<TScoreValue, TScoreSpec> const & /*scoringScheme*/,
844            TScoreValue /*scoreDropOff*/,
845            GappedXDrop const &)
846 {
847     // For ungapped X-drop extension of Chained Seeds, we have to append
848     // diagonals to the front and end of the list of seed diagonals and modify
849     // the first and last one of the current set of seed diagonals.
850     SEQAN_CHECKPOINT;
851 
852     SEQAN_ASSERT_FAIL("Write me! Look into the function where this assertion fails for instructions on how to do this.");
853     // TODO(holtgrew): Implement gapped X-drop extension with Chained seeds. As follows:
854     //
855     // Create a simple seed, copy over from chained seed.  Then,
856     // performed gapped x-drop extension on the simple seed.  Perform
857     // banded alignment on the left and right extended parts.  Use the
858     // internal functions for this instead of the user-level functions
859     // to initialize, fill the matrix, and compute the traceback
860     // object.  Construct the correct SeedDiagonal objects from the
861     // traceback objects and add them to the list of diagonals for the
862     // diagonal seed.
863     //
864     // An alternative implementation with storing the banded extension
865     // matrix would be too much work and it is questionable if this
866     // was faster.  The banded seed alignment code from Tobias Rausch
867     // is very optimized.
868 
869     // TODO(holtgrew): Update seed's score?!
870 }
871 
872 }  // namespace seqan
873 
874 #endif  // SEQAN_SEEDS_SEEDS_EXTENSION_H_
875