1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34 
35 #ifndef SEQAN_HEADER_FIND_INDEX_APPROX_H
36 #define SEQAN_HEADER_FIND_INDEX_APPROX_H
37 
38 namespace seqan
39 {
40 
41 template <typename TSAValue1, typename TSAValue2>
42 struct AddResultsFunctor
43 {
44     String<Pair<TSAValue1, String<TSAValue2> > > results;
45 
46     template <typename TIter1, typename TIter2>
operatorAddResultsFunctor47     void operator() (TIter1 &iter1, TIter2 &iter2)
48     {
49         unsigned ofs = length(results);
50         resize(results, ofs + countOccurrences(iter1));
51         for (unsigned i = 0; i < countOccurrences(iter1); ++i)
52         {
53             results[ofs + i].i1 = getOccurrences(iter1)[i];
54             results[ofs + i].i2 = getOccurrences(iter2);
55         }
56     }
57 };
58 
59 template <
60     bool enumerateA,
61     bool enumerateB,
62     typename TOnFoundFunctor,
63     typename TTreeIteratorA,
64     typename TIterPosA,
65     typename TTreeIteratorB,
66     typename TIterPosB,
67     typename TErrors >
68 inline void
_exactTreeSearch(TOnFoundFunctor & onFoundFunctor,TTreeIteratorA iterA,TIterPosA iterPosA,TTreeIteratorB iterB,TIterPosB iterPosB)69 _exactTreeSearch(
70     TOnFoundFunctor    &onFoundFunctor,    // functor to store matches
71     TTreeIteratorA    iterA,              // pattern tree iterator
72     TIterPosA        iterPosA,           // position in representative
73     TTreeIteratorB    iterB,              // text suffix tree iterator
74     TIterPosB        iterPosB)           // position in representative
75 {
76     while (true)
77     {
78         TIterPosA repLenA = repLength(iterA);
79         TIterPosB repLenB = repLength(iterB);
80         if (iterPosA < repLenA)
81         {
82             // search patterns in the suffix tree
83             if (!goDown(iterB, suffix(representative(iterA), iterPosA)))
84                 return;
85             // end of pattern found?
86             if (isLeaf(iterA))
87             {
88                 onFoundFunctor(iterA, iterB);
89                 return;
90             }
91             iterPosB += repLenA - iterPosA;
92             iterPosA = repLenA;
93         }
94         else if (iterPosB < repLenB)
95         {
96             // search text in the pattern tree
97             TIterPosA lcp = 0;
98             if (!_goDownString(iterA, suffix(representative(iterB), iterPosB), lcp))
99             {
100                 // couldn't we proceed to go down because of a leaf in the pattern tree?
101                 if (iterPosA + lcp == repLength(iterA) && isLeaf(iterA))
102                     onFoundFunctor(iterA, iterB);
103                 return;
104             }
105             // end of pattern found?
106             if (isLeaf(iterA))
107             {
108                 onFoundFunctor(iterA, iterB);
109                 return;
110             }
111             iterPosA += repLenB - iterPosB;
112             iterPosB = repLenB;
113         }
114         else
115         {
116             if (!goDown(iterA))
117             {
118                 onFoundFunctor(iterA, iterB);
119                 return;
120             }
121             while (emptyParentEdge(iterA))
122             {
123                 onFoundFunctor(iterA, iterB);
124                 if (!goRight(iterA))
125                     return;
126             }
127 
128             if (!goDown(iterB))
129                 return;
130             while (emptyParentEdge(iterB))
131             {
132                 if (!goRight(iterA))
133                     return;
134             }
135 
136             // search pairs of edges with the same character
137             while (true)
138             {
139                 if (parentEdgeFirstChar(iterA) < parentEdgeFirstChar(iterB))
140                 {
141                     if (!goRight(iterA))
142                         return;
143                 }
144                 else if (parentEdgeFirstChar(iterB) < parentEdgeFirstChar(iterA))
145                 {
146                     if (!goRight(iterB))
147                         return;
148                 }
149                 else
150                 {
151                     _exactTreeSearch(onFoundFunctor, iterA, iterPosA, iterB, iterPosB);
152                     if (!goRight(iterA) || !goRight(iterB))
153                         return;
154                 }
155             }
156         }
157     }
158 }
159 
160 template <
161     bool enumerateA,
162     bool enumerateB,
163     typename TOnFoundFunctor,
164     typename TTreeIteratorA,
165     typename TIterPosA,
166     typename TTreeIteratorB,
167     typename TIterPosB,
168     typename TErrors >
169 inline void
_approximateTreeSearch(TOnFoundFunctor & onFoundFunctor,TTreeIteratorA iterA,TIterPosA iterPosA,TTreeIteratorB iterB_,TIterPosB iterPosB,TErrors errorsLeft)170 _approximateTreeSearch(
171     TOnFoundFunctor    &onFoundFunctor,    // functor to store matches
172     TTreeIteratorA    iterA,              // pattern tree iterator
173     TIterPosA        iterPosA,           // position in representative
174     TTreeIteratorB    iterB_,             // text suffix tree iterator
175     TIterPosB        iterPosB,           // position in representative
176     TErrors            errorsLeft)         // tolerated errors left
177 {
178     if (enumerateA && !goDown(iterA))
179     {
180         onFoundFunctor(iterA, iterB_);
181         return;
182     }
183     if (enumerateB && !goDown(iterB_)) return;
184 
185     do
186     {
187         TTreeIteratorB iterB = iterB_;
188         do
189         {
190             TErrors e = errorsLeft;
191             TIterPosA ipA = iterPosA;
192             TIterPosB ipB = iterPosB;
193 
194             if (ipB == repLength(iterB)) continue;
195 
196             while (true)
197             {
198                 if (ipA == repLength(iterA))
199                 {
200                     if (ipB == repLength(iterB))
201                         _approximateTreeSearch<true,true>(onFoundFunctor, iterA, ipA, iterB, ipB, e);
202                     else
203                         _approximateTreeSearch<true,false>(onFoundFunctor, iterA, ipA, iterB, ipB, e);
204                     break;
205                 }
206                 else if (ipB == repLength(iterB))
207                 {
208                     _approximateTreeSearch<false,true>(onFoundFunctor, iterA, ipA, iterB, ipB, e);
209                     break;
210                 }
211 
212                 if (representative(iterA)[ipA] != representative(iterB)[ipB])
213                     if (e-- == 0) break;
214 
215                 ++ipA;
216                 ++ipB;
217             }
218         } while (enumerateB && goRight(iterB));
219     } while (enumerateA && goRight(iterA));
220 }
221 
222 
223 template <typename TSAValue>
224 struct AddSingleResultsFunctor
225 {
226     String<TSAValue> results;
227 
228     template <typename TPattern, typename TIter>
operatorAddSingleResultsFunctor229     void operator() (TPattern & /*pattern*/, TIter &iter)
230     {
231         append(results, getOccurrences(iter));
232     }
233 };
234 
235 
236 template <
237     typename TOnFoundFunctor,
238     typename TString,
239     typename TStringPos,
240     typename TTreeIterator,
241     typename TIterPos,
242     typename TErrors >
243 inline void
_approximateStringSearch(TOnFoundFunctor & onFoundFunctor,TString const & string,TStringPos stringPos,TTreeIterator iter,TIterPos iterPos,TErrors errorsLeft)244 _approximateStringSearch(
245     TOnFoundFunctor    &onFoundFunctor,    // functor to store matches
246     TString const &string,              // search pattern
247     TStringPos stringPos,               // position in pattern
248     TTreeIterator iter,                 // text suffix tree iterator
249     TIterPos iterPos,                   // position in representative
250     TErrors errorsLeft)                 // tolerated errors left
251 {
252     if (errorsLeft == 0)
253     {
254         if (goDown(iter, suffix(string, stringPos)))
255             onFoundFunctor(string, iter);
256         return;
257     }
258 
259     if (!goDown(iter)) return;
260     do
261     {
262         TErrors e = errorsLeft;
263         TStringPos sp = stringPos;
264         TIterPos ip = iterPos;
265 
266         if (ip == repLength(iter)) continue;
267 
268         while (true)
269         {
270             if (representative(iter)[ip] != string[sp])
271                 if (e-- == 0) break;
272 
273             if (++sp == length(string))
274             {
275                 onFoundFunctor(string, iter);
276                 break;
277             }
278 
279             if (++ip == repLength(iter))
280             {
281                 _approximateStringSearch(onFoundFunctor, string, sp, iter, ip, e);
282                 break;
283             }
284         }
285     } while (goRight(iter));
286 }
287 
288 template <
289     typename TOnFoundFunctor,
290     typename TString,
291     typename TTreeIterator,
292     typename TErrors>
293 inline void
approximateStringSearch(TOnFoundFunctor & onFoundFunctor,TString const & string,TTreeIterator & iter,TErrors errorsLeft)294 approximateStringSearch(
295     TOnFoundFunctor &onFoundFunctor,
296     TString const &string,
297     TTreeIterator &iter,
298     TErrors errorsLeft)
299 {
300     if (length(string) <= errorsLeft)
301     {
302         onFoundFunctor(string, iter);
303         return;
304     }
305     _approximateStringSearch(onFoundFunctor, string, 0u, iter, repLength(iter), errorsLeft);
306 }
307 
308 template <
309     typename TOnFoundFunctor,
310     typename TTreeIteratorA,
311     typename TTreeIteratorB,
312     typename TErrors>
313 inline void
approximateTreeSearch(TOnFoundFunctor & onFoundFunctor,TTreeIteratorA const & iterA,TTreeIteratorB const & iterB,TErrors errorsLeft)314 approximateTreeSearch(
315     TOnFoundFunctor &onFoundFunctor,
316     TTreeIteratorA const &iterA,
317     TTreeIteratorB const &iterB,
318     TErrors errorsLeft)
319 {
320     _approximateTreeSearch<true,true>(
321         onFoundFunctor,
322         iterA,
323         repLength(iterA),
324         iterB,
325         repLength(iterB),
326         errorsLeft);
327 }
328 
329 }
330 #endif
331 
332