1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Global chaining algorithms.
33 // ==========================================================================
34
35 #ifndef SEQAN_SEEDS_SEEDS_GLOBAL_CHAINING_H_
36 #define SEQAN_SEEDS_SEEDS_GLOBAL_CHAINING_H_
37
38 #include <map>
39
40 namespace seqan {
41
42 // ===========================================================================
43 // Enums, Tags, Classes, Specializations
44 // ===========================================================================
45
46 /*!
47 * @defgroup GlobalChainingTags Global Chaining Tags
48 * @brief Tags for specifying global chaining method.
49 *
50 * @tag GlobalChainingTags#SparseChaining
51 * @brief Tag for global sparse chaining.
52 */
53
54 struct SparseChaining_;
55 typedef Tag<SparseChaining_> SparseChaining;
56
57 // ===========================================================================
58 // Metafunctions
59 // ===========================================================================
60
61 // ===========================================================================
62 // Functions
63 // ===========================================================================
64
65 /*!
66 * @fn chainSeedsGlobally
67 * @headerfile <seqan/seeds.h>
68 * @brief Global chaining of seeds.
69 *
70 * @signature void chainSeedsGlobally(target, seedSet, tag);
71 *
72 * @param[out] target A @link ContainerConcept container @endlink to append the seeds to.
73 * @param[in] seedSet The @link SeedSet @endlink object to get the seeds from.
74 * @param[in] tag The tag to select the algorithm with (currently only @link GlobalChainingTags#SparseChaining
75 * SparseChaining @endlink is supported).
76 *
77 * Chaining of seeds between two sequences can be performed using sparse chaining as defined in (Gusfield, 1997).
78 *
79 * @section Example
80 *
81 * The following example demonstrates how to use the <tt>chainSeedsGlobally()</tt> function. First, a @link SeedSet
82 * @endlink is built and filled with @link SimpleSeed @endlink object. Then, a @link String @endlink of @link
83 * SimpleSeed @endlink objects is defined and filled using the <tt>chainSeedsGlobally()</tt> function.
84 *
85 * @include demos/seeds/seeds_chaining.cpp
86 *
87 * The output is as follows. Only the first and last seeds are written to <tt>std::cout</tt>.
88 *
89 * @include demos/seeds/seeds_chaining.cpp.stdout
90 *
91 * @section References
92 *
93 * <ul>
94 * <li>Dan Gusfield. Algorithms on Strings, Trees, and Sequences: Computer Science and Computational Biology.
95 * Cambridge University Press, January 1997.</li>
96 * </ul>
97 */
98
99 // TODO(holtgrew): Implement scored!
100 template <typename TTargetContainer, typename TSeed, typename TSeedSetSpec>
101 void
chainSeedsGlobally(TTargetContainer & target,SeedSet<TSeed,TSeedSetSpec> const & seedSet,SparseChaining const &)102 chainSeedsGlobally(
103 TTargetContainer & target,
104 SeedSet<TSeed, TSeedSetSpec> const & seedSet,
105 SparseChaining const &)
106 {
107 typedef typename Position<TSeed>::Type TPosition;
108 typedef typename Size<TSeed>::Type TSize;
109
110 // -----------------------------------------------------------------------
111 // Preparation
112 // -----------------------------------------------------------------------
113
114 // We copy over the seeds from the seed set into an array of seeds. We can then directly reference seed by their
115 // index in this array which is simpler than handling iterators into the std::set<> of the seed set.
116 String<TSeed> seeds;
117 resize(seeds, length(seedSet));
118 std::copy(seedSet._seeds.begin(), seedSet._seeds.end(), begin(seeds, Standard()));
119
120 // -----------------------------------------------------------------------
121 // Step 1: Generate the sorted list of interval points.
122 // -----------------------------------------------------------------------
123
124 // This list is I in Gusfield's description. An interval point is a triple of (dimension 0 border value, is start
125 // point, pointer to seed it belongs to).
126 typedef Triple<TPosition, bool, unsigned> TIntervalPoint;
127 typedef String<TIntervalPoint> TIntervalPoints;
128 typedef typename Iterator<TIntervalPoints, Standard>::Type TIntervalPointsIterator;
129
130 TIntervalPoints intervalPoints;
131 // std::cout << ",--- high quality seeds" << std::endl;
132 // TODO(holtgrew): Using something dense here for qualities and predecessors should be faster but requires seeds to be available by consecutive ids.
133 std::map<unsigned, TSize> qualityOfChainEndingIn;
134 std::map<unsigned, unsigned> predecessor;
135 for (unsigned i = 0; i < length(seeds); ++i)
136 {
137 // Since we use gap space, we have to use "false" for end points so the lexical ordering gives us what the
138 // sparse chaining algorithm expects.
139 // std::cout << "| " << **it << std::endl;
140 qualityOfChainEndingIn[i] = seedSize(seeds[i]);
141 predecessor[i] = maxValue<unsigned>();
142 appendValue(intervalPoints, TIntervalPoint(beginPositionH(seeds[i]), true, i));
143 appendValue(intervalPoints, TIntervalPoint(endPositionH(seeds[i]), false, i));
144 }
145 // std::cout << "`--" << std::endl;
146 std::sort(begin(intervalPoints, Standard()), end(intervalPoints, Standard()));
147 // std::cout << ",--- interval points" << std::endl;
148 // for (unsigned i = 0; i < length(intervalPoints); ++i) {
149 // std::cout << "| (" << intervalPoints[i].i1 << ", " << intervalPoints[i].i2 << ", " << intervalPoints[i].i3 << ")" << std::endl;
150 // }
151 // std::cout << "`---" << std::endl;
152
153 // -----------------------------------------------------------------------
154 // Step 2: Build the chain.
155 // -----------------------------------------------------------------------
156 // We build a list of "intermediate solutions". Each such
157 // solution is represented by the triple (end position in dim1,
158 // value of best chain so far, last seed of the chain).
159 typedef Triple<TPosition, TSize, unsigned> TIntermediateSolution;
160 typedef std::multiset<TIntermediateSolution> TIntermediateSolutions;
161 typedef typename TIntermediateSolutions::iterator TIntermediateSolutionsIterator;
162
163 // For all interval points...
164 TIntermediateSolutions intermediateSolutions;
165 for (TIntervalPointsIterator it = begin(intervalPoints), itEnd = end(intervalPoints); it != itEnd; ++it) {
166 // The seed belonging ot the interval point is seed k.
167 TSeed const & seedK = seeds[it->i3];
168
169 // std::cout << "Processing interval point (" << it->i1 << ", " << it->i2 << ", " << it->i3 << ")" << std::endl;
170 if (it->i2) { // Is is begin point.
171 // Find the closest seed (in dimension 1) to seed k with an
172 // entry in intermediateSolutions whose end coordinate in
173 // dimension 1 is <= the begin coordinate in dimension 1
174 // of seedK.
175 //
176 // STL gives us upper_bound which returns a pointer to the
177 // *first* one that compares greater than the reference
178 // one. Searching for the this one and decrementing the
179 // result iterator gives the desired result.
180 TIntermediateSolution referenceSolution(beginPositionV(seedK), maxValue<TSize>(), maxValue<unsigned>());
181 // std::cout << " intermediateSolutions.upper_bound(" << beginPositionV(seedK) << ")" << std::endl;
182 TIntermediateSolutionsIterator itJ = intermediateSolutions.upper_bound(referenceSolution);
183 if (itJ == intermediateSolutions.begin()) {
184 if (intermediateSolutions.size() > 0 &&
185 intermediateSolutions.rbegin()->i1 <= beginPositionV(seedK)) {
186 itJ = intermediateSolutions.end();
187 --itJ;
188 } else {
189 continue;
190 }
191 } else {
192 SEQAN_ASSERT_GT(intermediateSolutions.size(), 0u); // TODO(holtgrew): Remove this assertion?
193 --itJ;
194 }
195 // std::cout << " --> " << seeds[itJ->i3] << std::endl;
196 // Now, we have found such a seed j.
197 SEQAN_ASSERT_LEQ(endPositionV(seeds[itJ->i3]), endPositionV(seedK));
198 // Update the intermediate solution value for k and set predecessor.
199 qualityOfChainEndingIn[it->i3] += itJ->i2;
200 // std::cout << " UPDATE qualityOfChainEndingIn[" << it->i3 << "] == " << qualityOfChainEndingIn[it->i3] << std::endl;
201 predecessor[it->i3] = itJ->i3;
202 // std::cout << " predecessor[" << it->i3 << "] == " << itJ->i3 << std::endl;
203 } else { // Is end point.
204 // Search for the first triple in intermediateSolutions
205 // where the end coordinate in dimension 1 is >= end
206 // coordinate in dimension 1 for seed k. The corresponding
207 // seed is seed j.
208 //
209 // We work with upper_bound here which gives us the first
210 // value that is > so we have to work around this to get
211 // >= again...
212 SEQAN_ASSERT_GT(endPositionV(seedK), 0u);
213 TIntermediateSolution referenceSolution(endPositionV(seedK), 0, maxValue<unsigned>());
214 TIntermediateSolutionsIterator itSol = intermediateSolutions.upper_bound(referenceSolution);
215 if (itSol == intermediateSolutions.end()) {
216 // None found. Insert a new triple for seed k.
217 TIntermediateSolution sol(endPositionV(seedK), qualityOfChainEndingIn[it->i3], it->i3);
218 // std::cout << " INSERT (" << sol.i1 << ", " << sol.i2 << ", " << sol.i3 << ") " << __LINE__ << std::endl;
219 intermediateSolutions.insert(sol);
220 } else {
221 // Found this intermediate solution.
222 SEQAN_ASSERT_GEQ(itSol->i1, endPositionV(seedK));
223 TSeed const & seedJ = seeds[itSol->i3];
224 // Possibly start a new chain at k if the end1 is
225 // before the end1 of the chain ending in j or they
226 // end at the same coordinate in dim1 but k already
227 // has a higher quality than the whole chaing ending
228 // at j.
229 if (endPositionV(seedJ) > endPositionV(seedK) ||
230 (endPositionV(seedJ) == endPositionV(seedK) && qualityOfChainEndingIn[it->i3] > itSol->i2)) {
231 TIntermediateSolution sol(endPositionV(seedK), qualityOfChainEndingIn[it->i3], it->i3);
232 // std::cout << " INSERT (" << sol.i1 << ", " << sol.i2 << ", " << sol.i3 << ")" << __LINE__ << std::endl;
233 intermediateSolutions.insert(sol);
234 }
235 }
236
237 // Delete all intermediate solutions where end1 >= end1 of k and have a lower quality than k.
238 TIntermediateSolutionsIterator itDel = intermediateSolutions.upper_bound(referenceSolution);
239 TIntermediateSolutionsIterator itDelEnd = intermediateSolutions.end();
240 while (itDel != itDelEnd) {
241 TIntermediateSolutionsIterator ptr = itDel;
242 ++itDel;
243 if (qualityOfChainEndingIn[it->i3] > ptr->i2) {
244 // std::cout << " ERASE (" << ptr->i1 << ", " << ptr->i2 << ", " << ptr->i3 << ")" << std::endl;
245 intermediateSolutions.erase(ptr);
246 }
247 }
248 }
249 }
250
251 // std::cout << "Maximal quality: " << intermediateSolutions.rbegin()->i2 << std::endl;
252
253 // -----------------------------------------------------------------------
254 // Step 3: Write out the resulting chain.
255 // -----------------------------------------------------------------------
256 // TODO(holtgrew): We could use two different algorithms for target containers that are strings and those that are lists.
257 clear(target);
258 unsigned next = intermediateSolutions.rbegin()->i3;
259 while (next != maxValue<unsigned>())
260 {
261 appendValue(target, seeds[next]);
262 next = predecessor[next];
263 }
264 reverse(target);
265
266 // Assert that the resulting chain is non-overlapping.
267 #if SEQAN_ENABLE_DEBUG
268 if (length(target) > 0u) {
269 typedef typename Iterator<TTargetContainer, Standard>::Type TIterator;
270 // std::cerr << ".-- Chain (" << __FILE__ << ":" << __LINE__ << "):" << std::endl;
271 // for (TIterator it = begin(target, Standard()); it != end(target, Standard()); ++it)
272 // std::cerr << "| " << *it << std::endl;
273 // std::cerr << "`--" << std::endl;
274 TIterator itPrevious = begin(target, Standard());
275 TIterator it = itPrevious;
276 TIterator itEnd = end(target, Standard());
277 // std::cout << *it << std::endl;
278 ++it;
279 for (; it != itEnd; ++it) {
280 // std::cout << *it << std::endl;
281 SEQAN_ASSERT_LEQ(endPositionH(*itPrevious), beginPositionH(*it));
282 SEQAN_ASSERT_LEQ(endPositionV(*itPrevious), beginPositionV(*it));
283 itPrevious = it;
284 }
285 }
286 #endif // #if SEQAN_ENABLE_DEBUG
287 }
288
289 } // namespace seqan
290
291 #endif // #ifndef SEQAN_SEEDS_SEEDS_GLOBAL_CHAINING_H_
292