1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Global chaining algorithms.
33 // ==========================================================================
34 
35 #ifndef SEQAN_SEEDS_SEEDS_GLOBAL_CHAINING_H_
36 #define SEQAN_SEEDS_SEEDS_GLOBAL_CHAINING_H_
37 
38 #include <map>
39 
40 namespace seqan {
41 
42 // ===========================================================================
43 // Enums, Tags, Classes, Specializations
44 // ===========================================================================
45 
46 /*!
47  * @defgroup GlobalChainingTags Global Chaining Tags
48  * @brief Tags for specifying global chaining method.
49  *
50  * @tag GlobalChainingTags#SparseChaining
51  * @brief Tag for global sparse chaining.
52  */
53 
54 struct SparseChaining_;
55 typedef Tag<SparseChaining_> SparseChaining;
56 
57 // ===========================================================================
58 // Metafunctions
59 // ===========================================================================
60 
61 // ===========================================================================
62 // Functions
63 // ===========================================================================
64 
65 /*!
66  * @fn chainSeedsGlobally
67  * @headerfile <seqan/seeds.h>
68  * @brief Global chaining of seeds.
69  *
70  * @signature void chainSeedsGlobally(target, seedSet, tag);
71  *
72  * @param[out] target  A @link ContainerConcept container @endlink to append the seeds to.
73  * @param[in]  seedSet The @link SeedSet @endlink object to get the seeds from.
74  * @param[in]  tag     The tag to select the algorithm with (currently only @link GlobalChainingTags#SparseChaining
75  *                     SparseChaining @endlink is supported).
76  *
77  * Chaining of seeds between two sequences can be performed using sparse chaining as defined in (Gusfield, 1997).
78  *
79  * @section Example
80  *
81  * The following example demonstrates how to use the <tt>chainSeedsGlobally()</tt> function.  First, a @link SeedSet
82  * @endlink is built and filled with @link SimpleSeed @endlink object.  Then, a @link String @endlink of @link
83  * SimpleSeed @endlink objects is defined and filled using the <tt>chainSeedsGlobally()</tt> function.
84  *
85  * @include demos/seeds/seeds_chaining.cpp
86  *
87  * The output is as follows.  Only the first and last seeds are written to <tt>std::cout</tt>.
88  *
89  * @include demos/seeds/seeds_chaining.cpp.stdout
90  *
91  * @section References
92  *
93  * <ul>
94  * <li>Dan Gusfield. Algorithms on Strings, Trees, and Sequences:  Computer Science and Computational Biology.
95  *     Cambridge University Press, January 1997.</li>
96  * </ul>
97  */
98 
99 // TODO(holtgrew): Implement scored!
100 template <typename TTargetContainer, typename TSeed, typename TSeedSetSpec>
101 void
chainSeedsGlobally(TTargetContainer & target,SeedSet<TSeed,TSeedSetSpec> const & seedSet,SparseChaining const &)102 chainSeedsGlobally(
103         TTargetContainer & target,
104         SeedSet<TSeed, TSeedSetSpec> const & seedSet,
105         SparseChaining const &)
106 {
107     typedef typename Position<TSeed>::Type TPosition;
108     typedef typename Size<TSeed>::Type TSize;
109 
110     // -----------------------------------------------------------------------
111     // Preparation
112     // -----------------------------------------------------------------------
113 
114     // We copy over the seeds from the seed set into an array of seeds.  We can then directly reference seed by their
115     // index in this array which is simpler than handling iterators into the std::set<> of the seed set.
116     String<TSeed> seeds;
117     resize(seeds, length(seedSet));
118     std::copy(seedSet._seeds.begin(), seedSet._seeds.end(), begin(seeds, Standard()));
119 
120     // -----------------------------------------------------------------------
121     // Step 1: Generate the sorted list of interval points.
122     // -----------------------------------------------------------------------
123 
124     // This list is I in Gusfield's description.  An interval point is a triple of (dimension 0 border value, is start
125     // point, pointer to seed it belongs to).
126     typedef Triple<TPosition, bool, unsigned> TIntervalPoint;
127     typedef String<TIntervalPoint> TIntervalPoints;
128     typedef typename Iterator<TIntervalPoints, Standard>::Type TIntervalPointsIterator;
129 
130     TIntervalPoints intervalPoints;
131     // std::cout << ",--- high quality seeds" << std::endl;
132     // TODO(holtgrew): Using something dense here for qualities and predecessors should be faster but requires seeds to be available by consecutive ids.
133     std::map<unsigned, TSize> qualityOfChainEndingIn;
134     std::map<unsigned, unsigned> predecessor;
135     for (unsigned i = 0; i < length(seeds); ++i)
136     {
137         // Since we use gap space, we have to use "false" for end points so the lexical ordering gives us what the
138         // sparse chaining algorithm expects.
139         // std::cout << "| " << **it << std::endl;
140         qualityOfChainEndingIn[i] = seedSize(seeds[i]);
141         predecessor[i] = maxValue<unsigned>();
142         appendValue(intervalPoints, TIntervalPoint(beginPositionH(seeds[i]), true, i));
143         appendValue(intervalPoints, TIntervalPoint(endPositionH(seeds[i]), false, i));
144     }
145     // std::cout << "`--" << std::endl;
146     std::sort(begin(intervalPoints, Standard()), end(intervalPoints, Standard()));
147     // std::cout << ",--- interval points" << std::endl;
148     // for (unsigned i = 0; i < length(intervalPoints); ++i) {
149     //     std::cout << "| (" << intervalPoints[i].i1 << ", " << intervalPoints[i].i2 << ", " << intervalPoints[i].i3 << ")" << std::endl;
150     // }
151     // std::cout << "`---" << std::endl;
152 
153     // -----------------------------------------------------------------------
154     // Step 2: Build the chain.
155     // -----------------------------------------------------------------------
156     // We build a list of "intermediate solutions".  Each such
157     // solution is represented by the triple (end position in dim1,
158     // value of best chain so far, last seed of the chain).
159     typedef Triple<TPosition, TSize, unsigned> TIntermediateSolution;
160     typedef std::multiset<TIntermediateSolution> TIntermediateSolutions;
161     typedef typename TIntermediateSolutions::iterator TIntermediateSolutionsIterator;
162 
163     // For all interval points...
164     TIntermediateSolutions intermediateSolutions;
165     for (TIntervalPointsIterator it = begin(intervalPoints), itEnd = end(intervalPoints); it != itEnd; ++it) {
166         // The seed belonging ot the interval point is seed k.
167         TSeed const & seedK = seeds[it->i3];
168 
169         // std::cout << "Processing interval point (" << it->i1 << ", " << it->i2 << ", " << it->i3 << ")" << std::endl;
170         if (it->i2) {  // Is is begin point.
171             // Find the closest seed (in dimension 1) to seed k with an
172             // entry in intermediateSolutions whose end coordinate in
173             // dimension 1 is <= the begin coordinate in dimension 1
174             // of seedK.
175             //
176             // STL gives us upper_bound which returns a pointer to the
177             // *first* one that compares greater than the reference
178             // one.  Searching for the this one and decrementing the
179             // result iterator gives the desired result.
180             TIntermediateSolution referenceSolution(beginPositionV(seedK), maxValue<TSize>(), maxValue<unsigned>());
181             // std::cout << "    intermediateSolutions.upper_bound(" << beginPositionV(seedK) << ")" << std::endl;
182             TIntermediateSolutionsIterator itJ = intermediateSolutions.upper_bound(referenceSolution);
183             if (itJ == intermediateSolutions.begin()) {
184                 if (intermediateSolutions.size() > 0 &&
185                     intermediateSolutions.rbegin()->i1 <= beginPositionV(seedK)) {
186                     itJ = intermediateSolutions.end();
187                     --itJ;
188                 } else {
189                     continue;
190                 }
191             } else {
192                 SEQAN_ASSERT_GT(intermediateSolutions.size(), 0u);  // TODO(holtgrew): Remove this assertion?
193                 --itJ;
194             }
195             // std::cout << "     --> " << seeds[itJ->i3] << std::endl;
196             // Now, we have found such a seed j.
197             SEQAN_ASSERT_LEQ(endPositionV(seeds[itJ->i3]), endPositionV(seedK));
198             // Update the intermediate solution value for k and set predecessor.
199             qualityOfChainEndingIn[it->i3] += itJ->i2;
200             // std::cout << "  UPDATE qualityOfChainEndingIn[" << it->i3 << "] == " << qualityOfChainEndingIn[it->i3] << std::endl;
201             predecessor[it->i3] = itJ->i3;
202             // std::cout << "         predecessor[" << it->i3 << "] == " << itJ->i3 << std::endl;
203         } else {  // Is end point.
204             // Search for the first triple in intermediateSolutions
205             // where the end coordinate in dimension 1 is >= end
206             // coordinate in dimension 1 for seed k.  The corresponding
207             // seed is seed j.
208             //
209             // We work with upper_bound here which gives us the first
210             // value that is > so we have to work around this to get
211             // >= again...
212             SEQAN_ASSERT_GT(endPositionV(seedK), 0u);
213             TIntermediateSolution referenceSolution(endPositionV(seedK), 0, maxValue<unsigned>());
214             TIntermediateSolutionsIterator itSol = intermediateSolutions.upper_bound(referenceSolution);
215             if (itSol == intermediateSolutions.end()) {
216                 // None found.  Insert a new triple for seed k.
217                 TIntermediateSolution sol(endPositionV(seedK), qualityOfChainEndingIn[it->i3], it->i3);
218                 // std::cout << "  INSERT (" << sol.i1 << ", " << sol.i2 << ", " << sol.i3 << ") " << __LINE__ << std::endl;
219                 intermediateSolutions.insert(sol);
220             } else {
221                 // Found this intermediate solution.
222                 SEQAN_ASSERT_GEQ(itSol->i1, endPositionV(seedK));
223                 TSeed const & seedJ = seeds[itSol->i3];
224                 // Possibly start a new chain at k if the end1 is
225                 // before the end1 of the chain ending in j or they
226                 // end at the same coordinate in dim1 but k already
227                 // has a higher quality than the whole chaing ending
228                 // at j.
229                 if (endPositionV(seedJ) > endPositionV(seedK) ||
230                     (endPositionV(seedJ) == endPositionV(seedK) && qualityOfChainEndingIn[it->i3] > itSol->i2)) {
231                     TIntermediateSolution sol(endPositionV(seedK), qualityOfChainEndingIn[it->i3], it->i3);
232                     // std::cout << "  INSERT (" << sol.i1 << ", " << sol.i2 << ", " << sol.i3 << ")" << __LINE__  << std::endl;
233                     intermediateSolutions.insert(sol);
234                 }
235             }
236 
237             // Delete all intermediate solutions where end1 >= end1 of k and have a lower quality than k.
238             TIntermediateSolutionsIterator itDel = intermediateSolutions.upper_bound(referenceSolution);
239             TIntermediateSolutionsIterator itDelEnd = intermediateSolutions.end();
240             while (itDel != itDelEnd) {
241                 TIntermediateSolutionsIterator ptr = itDel;
242                 ++itDel;
243                 if (qualityOfChainEndingIn[it->i3] > ptr->i2) {
244                     // std::cout << "  ERASE (" << ptr->i1 << ", " << ptr->i2 << ", " << ptr->i3 << ")" << std::endl;
245                     intermediateSolutions.erase(ptr);
246                 }
247             }
248         }
249     }
250 
251     // std::cout << "Maximal quality: " << intermediateSolutions.rbegin()->i2 << std::endl;
252 
253     // -----------------------------------------------------------------------
254     // Step 3: Write out the resulting chain.
255     // -----------------------------------------------------------------------
256     // TODO(holtgrew): We could use two different algorithms for target containers that are strings and those that are lists.
257     clear(target);
258     unsigned next = intermediateSolutions.rbegin()->i3;
259     while (next != maxValue<unsigned>())
260     {
261         appendValue(target, seeds[next]);
262         next = predecessor[next];
263     }
264     reverse(target);
265 
266     // Assert that the resulting chain is non-overlapping.
267     #if SEQAN_ENABLE_DEBUG
268     if (length(target) > 0u) {
269         typedef typename Iterator<TTargetContainer, Standard>::Type TIterator;
270         // std::cerr << ".-- Chain (" << __FILE__ << ":" << __LINE__ << "):" << std::endl;
271         // for (TIterator it = begin(target, Standard()); it != end(target, Standard()); ++it)
272         //     std::cerr << "| " << *it << std::endl;
273         // std::cerr << "`--" << std::endl;
274         TIterator itPrevious = begin(target, Standard());
275         TIterator it = itPrevious;
276         TIterator itEnd = end(target, Standard());
277         // std::cout << *it << std::endl;
278         ++it;
279         for (; it != itEnd; ++it) {
280             // std::cout << *it << std::endl;
281             SEQAN_ASSERT_LEQ(endPositionH(*itPrevious), beginPositionH(*it));
282             SEQAN_ASSERT_LEQ(endPositionV(*itPrevious), beginPositionV(*it));
283             itPrevious = it;
284         }
285     }
286     #endif  // #if SEQAN_ENABLE_DEBUG
287 }
288 
289 }  // namespace seqan
290 
291 #endif  // #ifndef SEQAN_SEEDS_SEEDS_GLOBAL_CHAINING_H_
292