1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 // Algorithms for combining (i.e. merging and chaining) seeds.
35 // ==========================================================================
36
37 // TODO(holtgrew): All the Nothing()'s should not be part of the public interface.
38
39 #ifndef SEQAN_SEEDS_SEEDS_COMBINATION_H_
40 #define SEQAN_SEEDS_SEEDS_COMBINATION_H_
41
42 namespace seqan {
43
44 // ===========================================================================
45 // Enums, Tags, Classes, Specializations
46 // ===========================================================================
47
48 /*!
49 * @defgroup LocalChainingTags
50 * @brief Tags for selecting local chaining algorithms.
51 *
52 * @tag LocalChainingTags#Merge
53 * @headerfile <seqan/seeds.h>
54 * @brief Try to merge with existing seed.
55 *
56 * @signature typedef Tag<Merge_> Merge;
57 *
58 * @tag LocalChainingTags#Chaos
59 * @headerfile <seqan/seeds.h>
60 * @brief Chain to other seed using CHAOS chaining condition.
61 *
62 * @signature typedef Tag<Chaos_> Chaos;
63 *
64 * @tag LocalChainingTags#SimpleChain
65 * @headerfile <seqan/seeds.h>
66 * @brief Chain to other seed by simple chaining.
67 *
68 * @signature typedef Tag<SimpleChain_> SimpleChain;
69 *
70 * @tag LocalChainingTags#Single
71 * @headerfile <seqan/seeds.h>
72 * @brief Force adding without chaining.
73 *
74 * @signature typedef Tag<Single_> Single;
75 */
76
77 // TODO(holtgrew): Stream-line tags to Merge, ChaosChain, SimpleChain?
78 struct Merge_;
79 typedef Tag<Merge_> Merge;
80
81 struct Chaos_;
82 typedef Tag<Chaos_> Chaos;
83
84 struct SimpleChain_;
85 typedef Tag<SimpleChain_> SimpleChain;
86
87 struct Single_;
88 typedef Tag<Single_> Single;
89
90 // ===========================================================================
91 // Metafunctions
92 // ===========================================================================
93
94 // ===========================================================================
95 // Functions
96 // ===========================================================================
97
98 // Returns true iff b can be merged into a where a is the one to the
99 // upper left, b the one to the lower right.
100 template <typename TSeedSpec, typename TSeedConfig, typename TThreshold, typename TBandwidth>
101 inline bool
_seedsCombineable(Seed<TSeedSpec,TSeedConfig> const & seedA,Seed<TSeedSpec,TSeedConfig> const & seedB,TThreshold const & maxDiagonalDistance,TBandwidth const &,Merge const &)102 _seedsCombineable(Seed<TSeedSpec, TSeedConfig> const & seedA,
103 Seed<TSeedSpec, TSeedConfig> const & seedB,
104 TThreshold const & maxDiagonalDistance,
105 TBandwidth const & /*maxBandwidth*/,
106 Merge const &)
107 {
108 // TODO(holtgrew): TThreshold could be Position<TSeed>::Type.
109 // b has to be right of a for the two seeds to be mergeable.
110 if (beginPositionH(seedB) < beginPositionH(seedA) || beginPositionV(seedB) < beginPositionV(seedA))
111 return false;
112 // If the two seeds do not overlap, they cannot be merged.
113 if (beginPositionH(seedB) > endPositionH(seedA) || beginPositionV(seedB) > endPositionV(seedA))
114 return false;
115 // If the distance between the diagonals exceeds the threshold
116 // then the seeds cannot be merged.
117 typedef typename MakeUnsigned_<TThreshold>::Type TUnsignedThreshold;
118 if (static_cast<TUnsignedThreshold>(_abs(endDiagonal(seedA) - beginDiagonal(seedB))) > static_cast<TUnsignedThreshold>(maxDiagonalDistance))
119 return false;
120 // Otherwise, the seeds can be merged.
121 return true;
122 }
123
124
125 // Returns true iff b can be simple-chained to a where a is the one to
126 // the upper left, b the one to the lower right.
127 template <typename TSeedSpec, typename TSeedConfig, typename TThreshold, typename TBandwidth>
128 inline bool
_seedsCombineable(Seed<TSeedSpec,TSeedConfig> const & seedA,Seed<TSeedSpec,TSeedConfig> const & seedB,TThreshold const & maxGapSize,TBandwidth const &,SimpleChain const &)129 _seedsCombineable(Seed<TSeedSpec, TSeedConfig> const & seedA,
130 Seed<TSeedSpec, TSeedConfig> const & seedB,
131 TThreshold const & maxGapSize,
132 TBandwidth const & /*maxBandwidth*/,
133 SimpleChain const &)
134 {
135 // TODO(holtgrew): We should be able to configure whether we want to have Manhattan, euclidean, minimal edit distance, for seeds.
136 // TODO(holtgrew): TThreshold could be Position<TSeed>::Type.
137
138 // b has to be right of a for the two seeds to be chainable.
139 if (beginPositionH(seedB) < endPositionH(seedA) || beginPositionV(seedB) < endPositionV(seedA))
140 return false;
141
142 // Distance is maximal distance, this corresponds to going the
143 // distacen in the smaller distance with matches/mismatches and
144 // the rest with indels.
145 TThreshold distance = _max(beginPositionH(seedB) - endPositionH(seedA), beginPositionV(seedB) - endPositionV(seedA));
146 // Compare distance with threshold.
147 return distance <= maxGapSize;
148 }
149
150
151 // Returns true iff b can be Chaos chained to a where a is the one to
152 // the upper left, b the one to the lower right.
153 //
154 // TODO(holtgrew): Replace bandwidth with diagonalDistance.
155 template <typename TSeedSpec, typename TSeedConfig, typename TDistanceThreshold, typename TBandwidthThreshold>
156 inline bool
_seedsCombineable(Seed<TSeedSpec,TSeedConfig> const & seedA,Seed<TSeedSpec,TSeedConfig> const & seedB,TDistanceThreshold const & maxGapSize,TBandwidthThreshold const & bandwidth,Chaos const &)157 _seedsCombineable(Seed<TSeedSpec, TSeedConfig> const & seedA,
158 Seed<TSeedSpec, TSeedConfig> const & seedB,
159 TDistanceThreshold const & maxGapSize,
160 TBandwidthThreshold const & bandwidth,
161 Chaos const &)
162 {
163 // b has to be right of a for the two seeds to be chainable.
164 if (beginPositionH(seedB) < endPositionH(seedA) || beginPositionV(seedB) < endPositionV(seedA))
165 return false;
166
167 // The diagonal distance has to be smaller than the bandwidth.
168 // TODO(holtgrew): s/beginDiagonal/getBeginDiagonal/
169 TBandwidthThreshold diagonalDistance = _abs(endDiagonal(seedB) - beginDiagonal(seedA));
170 if (diagonalDistance > bandwidth)
171 return false;
172
173 // Distance is maximal distance, this corresponds to going the
174 // distance in the smaller distance with matches/mismatches and
175 // the rest with indels.
176 TDistanceThreshold distance = _max(beginPositionH(seedB) - endPositionH(seedA), beginPositionV(seedB) - endPositionV(seedA));
177 // Compare distance with threshold.
178 return distance <= maxGapSize;
179 }
180
181
182 // Updating the coordinates of seeds is the same for merging and
183 // simple chaining. Only the score computation differs.
184 template <typename TSeedConfig>
185 inline void
_updateSeedsCoordinatesMergeOrSimpleChain(Seed<Simple,TSeedConfig> & seed,Seed<Simple,TSeedConfig> const & other)186 _updateSeedsCoordinatesMergeOrSimpleChain(
187 Seed<Simple, TSeedConfig> & seed,
188 Seed<Simple, TSeedConfig> const & other)
189 {
190 setBeginPositionH(seed, std::min(beginPositionH(seed), beginPositionH(other)));
191 setBeginPositionV(seed, std::min(beginPositionV(seed), beginPositionV(other)));
192 setEndPositionH(seed, std::max(endPositionH(seed), endPositionH(other)));
193 setEndPositionV(seed, std::max(endPositionV(seed), endPositionV(other)));
194 setLowerDiagonal(seed, std::min(lowerDiagonal(seed), lowerDiagonal(other)));
195 setUpperDiagonal(seed, std::max(upperDiagonal(seed), upperDiagonal(other)));
196 }
197
198
199 template <typename TSeedConfig, typename TScoreValue, typename TSequence0, typename TSequence1>
200 inline void
_combineSeeds(Seed<Simple,TSeedConfig> & seed,Seed<Simple,TSeedConfig> const & other,Score<TScoreValue,Simple> const &,TSequence0 const &,TSequence1 const &,Merge const &)201 _combineSeeds(Seed<Simple, TSeedConfig> & seed,
202 Seed<Simple, TSeedConfig> const & other,
203 Score<TScoreValue, Simple> const & /*scoringScheme*/,
204 TSequence0 const & /*sequence0*/,
205 TSequence1 const & /*sequence1*/,
206 Merge const &)
207 {
208 _updateSeedsScoreMerge(seed, other);
209 _updateSeedsCoordinatesMergeOrSimpleChain(seed, other);
210 }
211
212
213 template <typename TSeedConfig, typename TScoreValue, typename TSequence0, typename TSequence1>
214 inline void
_combineSeeds(Seed<Simple,TSeedConfig> & seed,Seed<Simple,TSeedConfig> const & other,Score<TScoreValue,Simple> const & scoringScheme,TSequence0 const &,TSequence1 const &,SimpleChain const &)215 _combineSeeds(Seed<Simple, TSeedConfig> & seed,
216 Seed<Simple, TSeedConfig> const & other,
217 Score<TScoreValue, Simple> const & scoringScheme,
218 TSequence0 const & /*sequence0*/,
219 TSequence1 const & /*sequence1*/,
220 SimpleChain const &)
221 {
222 _updateSeedsScoreSimpleChain(seed, other, scoringScheme);
223 _updateSeedsCoordinatesMergeOrSimpleChain(seed, other);
224 }
225
226
227 template <typename TSeedConfig, typename TScoreValue, typename TSequence0, typename TSequence1>
228 inline void
_combineSeeds(Seed<Simple,TSeedConfig> & seed,Seed<Simple,TSeedConfig> const & other,Score<TScoreValue,Simple> const & scoringScheme,TSequence0 const & sequence0,TSequence1 const & sequence1,Chaos const &)229 _combineSeeds(Seed<Simple, TSeedConfig> & seed,
230 Seed<Simple, TSeedConfig> const & other,
231 Score<TScoreValue, Simple> const & scoringScheme,
232 TSequence0 const & sequence0,
233 TSequence1 const & sequence1,
234 Chaos const &)
235 {
236 typedef Seed<Simple, TSeedConfig> TSeed;
237 typedef typename Position<TSeed>::Type TPosition;
238
239 // TODO(holtgrew): Assert seed left of other.
240
241 // Compute gaps in both dimensions, the remaining gap is the
242 // vertical/horizontal distance we will not fill with CHAOS
243 // chaining.
244 //
245 // TODO(holtgrew): We need + 1 here, do we need it anywhere else?
246 TPosition gapDim0 = beginPositionH(other) - endPositionH(seed);
247 TPosition gapDim1 = beginPositionV(other) - endPositionV(seed);
248 TPosition minGap = _min(gapDim0, gapDim1);
249 TPosition maxGap = _max(gapDim0, gapDim1);
250 TPosition remainingGap = maxGap - minGap;
251
252 // Compute new score using the CHAOS method.
253 //
254 // First, compute the score when force-aligning from seed.
255 TPosition posLeft0 = endPositionH(seed);
256 TPosition posLeft1 = endPositionV(seed);
257 TScoreValue tmpScore = 0;
258 // TODO(holtgrew): Probably better use iterators on sequences!
259 for (TPosition i = 0; i < minGap; ++i)
260 tmpScore += score(scoringScheme, sequence0[posLeft0 + i], sequence1[posLeft1 + i]);
261
262 SEQAN_ASSERT_GT(beginPositionH(other), static_cast<TPosition>(0));
263 SEQAN_ASSERT_GT(beginPositionV(other), static_cast<TPosition>(0));
264 TPosition posRight0 = beginPositionH(other);
265 TPosition posRight1 = beginPositionV(other);
266
267 // Now, try to put the gap at each position and get the position
268 // with the highest score. If there are two such positions, the
269 // first one found is returned which is the one that is furthest
270 // away from seed.
271 // TPosition bestGapPos = 0; // delta to lowermost position // TODO(holtgrew): Set but not used; Remove?
272 TScoreValue bestScore = tmpScore;
273 for (TPosition i = 1; i < minGap; ++i) {
274 tmpScore -= score(scoringScheme, sequence0[posLeft0 + minGap - i], sequence1[posLeft1 + minGap - i]);
275 tmpScore += score(scoringScheme, sequence0[posRight0 - i], sequence1[posRight1 - i]);
276 if (tmpScore > bestScore) {
277 // Found a better score.
278 bestScore = tmpScore;
279 // bestGapPos = i; // TODO(holtgrew): Set but not used; Remove?
280 }
281 }
282
283 // Now, the best gap is when extending the lower right seed
284 // (other) by bestGapPos to the upper right. However, this is
285 // ignored for simple seeds: We simply update the score and are
286 // done.
287 _updateSeedsScoreChaos(seed, other, bestScore + remainingGap * scoreGap(scoringScheme));
288
289 // For simple seeds, the coordinate computation is the same as for
290 // merge/simple chain.
291 //
292 // TODO(holtgrew): Adjust the name of updateSeedsCoordinatesMergeOrSimpleChain to reflect this.
293 _updateSeedsCoordinatesMergeOrSimpleChain(seed, other);
294 }
295
296
297 template <typename TSeedConfig, typename TScoreValue, typename TSequence0, typename TSequence1>
298 inline void
_combineSeeds(Seed<ChainedSeed,TSeedConfig> & seed,Seed<ChainedSeed,TSeedConfig> const & other,Score<TScoreValue,Simple> const &,TSequence0 const &,TSequence1 const &,Merge const &)299 _combineSeeds(Seed<ChainedSeed, TSeedConfig> & seed,
300 Seed<ChainedSeed, TSeedConfig> const & other,
301 Score<TScoreValue, Simple> const & /*scoringScheme*/,
302 TSequence0 const & /*sequence0*/,
303 TSequence1 const & /*sequence1*/,
304 Merge const &)
305 {
306 // For chained seeds, we first remove all diagonals from seed
307 // until the last diagonal of seed starts truly before other.
308 // Then, we possibly shorten the last diagonal. Finally, we copy
309 // over all diagonals from other.
310
311 // std::cout << "Merging chained seeds " << seed << " and " << other << std::endl;
312 SEQAN_ASSERT_LEQ_MSG(beginPositionH(seed), beginPositionH(other), "Monotony in both dimensions required for merging.");
313 SEQAN_ASSERT_LEQ_MSG(beginPositionV(seed), beginPositionV(other), "Monotony in both dimensions required for merging.");
314
315 _updateSeedsScoreMerge(seed, other);
316
317 // Remove diagonals.
318 typedef Seed<ChainedSeed, TSeedConfig> TSeed;
319 typedef typename Iterator<TSeed, Standard>::Type TIterator;
320 TIterator it;
321 // TODO(holtgrew): Could use back() instead of lastKept.
322 TIterator lastKept = begin(seed);
323 for (it = begin(seed); it != end(seed); ++it) {
324 if (it->beginPositionH >= beginPositionH(other) && it->beginPositionV >= beginPositionV(other))
325 break;
326 lastKept = it;
327 }
328 if (it != end(seed))
329 truncateDiagonals(seed, it);
330 // std::cout << "Seed after truncating diagonals: " << seed << std::endl;
331
332 // Shorten last diagonal if necessary.
333 if (lastKept->beginPositionH + lastKept->length > beginPositionH(other) && lastKept->beginPositionV + lastKept->length > beginPositionV(other)) {
334 lastKept->length = _min(beginPositionH(other) - lastKept->beginPositionH, beginPositionV(other) - lastKept->beginPositionV);
335 } else if (lastKept->beginPositionH + lastKept->length > beginPositionH(other)) {
336 lastKept->length = beginPositionH(other) - lastKept->beginPositionH;
337 } else if (lastKept->beginPositionV + lastKept->length > beginPositionV(other)) {
338 lastKept->length = beginPositionV(other) - lastKept->beginPositionV;
339 }
340
341 // Maybe remove shortened diagonal if its length is 0.
342 if (back(seed).length == 0) {
343 // TODO(holtgrew): Do not use dot method.
344 seed._seedDiagonals.pop_back();
345 }
346
347 // Copy over other diagonals.
348 typedef typename Iterator<TSeed const, Standard>::Type TConstIterator;
349 for (TConstIterator it = begin(other, Standard()); it != end(other, Standard()); ++it)
350 appendDiagonal(seed, *it);
351
352 // std::cout << "Chained seed after merging: " << seed << std::endl;
353
354 // TODO(holtgrew): Update lower and upper diagonals!
355 }
356
357
358 template <typename TSeedConfig, typename TScoreValue, typename TSequence0, typename TSequence1>
359 inline void
_combineSeeds(Seed<ChainedSeed,TSeedConfig> & seed,Seed<ChainedSeed,TSeedConfig> const & other,Score<TScoreValue,Simple> const & scoringScheme,TSequence0 const &,TSequence1 const &,SimpleChain const &)360 _combineSeeds(Seed<ChainedSeed, TSeedConfig> & seed,
361 Seed<ChainedSeed, TSeedConfig> const & other,
362 Score<TScoreValue, Simple> const & scoringScheme,
363 TSequence0 const & /*sequence0*/,
364 TSequence1 const & /*sequence1*/,
365 SimpleChain const &)
366 {
367 // Simply copy over the diagonals of the seed (other) into the
368 // left one (seed) after updating the score.
369
370 _updateSeedsScoreSimpleChain(seed, other, scoringScheme);
371
372 // Copy over other diagonals.
373 typedef Seed<ChainedSeed, TSeedConfig> TSeed;
374 typedef typename Iterator<TSeed const, Standard>::Type TConstIterator;
375 for (TConstIterator it = begin(other, Standard()); it != end(other, Standard()); ++it)
376 appendDiagonal(seed, *it);
377 }
378
379
380 template <typename TSeedConfig, typename TScoreValue, typename TSequence0, typename TSequence1>
381 inline void
_combineSeeds(Seed<ChainedSeed,TSeedConfig> & seed,Seed<ChainedSeed,TSeedConfig> const & other,Score<TScoreValue,Simple> const & scoringScheme,TSequence0 const & sequence0,TSequence1 const & sequence1,Chaos const &)382 _combineSeeds(Seed<ChainedSeed, TSeedConfig> & seed,
383 Seed<ChainedSeed, TSeedConfig> const & other,
384 Score<TScoreValue, Simple> const & scoringScheme,
385 TSequence0 const & sequence0,
386 TSequence1 const & sequence1,
387 Chaos const &)
388 {
389 typedef Seed<ChainedSeed, TSeedConfig> TSeed;
390 typedef typename Position<TSeed>::Type TPosition;
391 typedef typename Iterator<TSeed const, Standard>::Type TConstIterator;
392
393 // TODO(holtgrew): Assert seed left of other.
394
395 // Compute gaps in both dimensions, the remaining gap is the
396 // vertical/horizontal distance we will not fill with CHAOS
397 // chaining.
398 //
399 // TODO(holtgrew): We need + 1 here, do we need it anywhere else?
400 TPosition gapDim0 = beginPositionH(other) - endPositionH(seed);
401 TPosition gapDim1 = beginPositionV(other) - endPositionV(seed);
402 TPosition minGap = _min(gapDim0, gapDim1);
403 TPosition maxGap = _max(gapDim0, gapDim1);
404 TPosition remainingGap = maxGap - minGap;
405
406 // Compute new score using the CHAOS method.
407 //
408 // First, compute the score when force-aligning from seed.
409 TPosition posLeft0 = endPositionH(seed);
410 TPosition posLeft1 = endPositionV(seed);
411 TScoreValue tmpScore = 0;
412 // TODO(holtgrew): Probably better use iterators on sequences!
413 for (TPosition i = 0; i < minGap; ++i)
414 tmpScore += score(scoringScheme, sequence0[posLeft0 + i], sequence1[posLeft1 + i]);
415
416 SEQAN_ASSERT_GT(beginPositionH(other), static_cast<TPosition>(0));
417 SEQAN_ASSERT_GT(beginPositionV(other), static_cast<TPosition>(0));
418 TPosition posRight0 = beginPositionH(other);
419 TPosition posRight1 = beginPositionV(other);
420
421 // Now, try to put the gap at each position and get the position
422 // with the highest score. If there are two such positions, the
423 // first one found is returned which is the one that is furthest
424 // away from seed.
425 TPosition bestGapPos = 0; // delta to lowermost position
426 TScoreValue bestScore = tmpScore;
427 for (TPosition i = 1; i < minGap; ++i) {
428 tmpScore -= score(scoringScheme, sequence0[posLeft0 + minGap - i], sequence1[posLeft1 + minGap - i]);
429 tmpScore += score(scoringScheme, sequence0[posRight0 - i], sequence1[posRight1 - i]);
430 if (tmpScore > bestScore) {
431 // Found a better score.
432 bestScore = tmpScore;
433 bestGapPos = i;
434 }
435 }
436
437 // Now, the best gap is when extending the lower right seed
438 // (other) by bestGapPos to the upper right. The upper left seed
439 // is extended by (minGap - bestGapPos).
440 //
441 // Adjust last diagonal of seed.
442 back(seed).length += minGap - bestGapPos;
443 // Copy over the first diagonal of other and adjust diagonal.
444 appendDiagonal(seed, front(other));
445 back(seed).beginPositionH -= bestGapPos;
446 back(seed).beginPositionV -= bestGapPos;
447 back(seed).length += bestGapPos;
448 // Copy over all other diagonals.
449 TConstIterator it = begin(other, Standard());
450 TConstIterator itEnd = end(other, Standard());
451 // TODO(holtgrew): value(it) does not work here, the adaption around std::list needs more work!
452 for (++it; it != itEnd; ++it)
453 appendDiagonal(seed, *it);
454
455 // Finally, we update the score and are done.
456 _updateSeedsScoreChaos(seed, other, bestScore + remainingGap * scoreGap(scoringScheme));
457 }
458
459 } // namespace seqan
460
461 #endif // #ifndef SEQAN_SEEDS_SEEDS_COMBINATION_UNORDERED_H_
462