1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34
35 #ifndef SEQAN_HEADER_FIND_INDEX_APPROX_H
36 #define SEQAN_HEADER_FIND_INDEX_APPROX_H
37
38 namespace seqan
39 {
40
41 template <typename TSAValue1, typename TSAValue2>
42 struct AddResultsFunctor
43 {
44 String<Pair<TSAValue1, String<TSAValue2> > > results;
45
46 template <typename TIter1, typename TIter2>
operatorAddResultsFunctor47 void operator() (TIter1 &iter1, TIter2 &iter2)
48 {
49 unsigned ofs = length(results);
50 resize(results, ofs + countOccurrences(iter1));
51 for (unsigned i = 0; i < countOccurrences(iter1); ++i)
52 {
53 results[ofs + i].i1 = getOccurrences(iter1)[i];
54 results[ofs + i].i2 = getOccurrences(iter2);
55 }
56 }
57 };
58
59 template <
60 bool enumerateA,
61 bool enumerateB,
62 typename TOnFoundFunctor,
63 typename TTreeIteratorA,
64 typename TIterPosA,
65 typename TTreeIteratorB,
66 typename TIterPosB,
67 typename TErrors >
68 inline void
_exactTreeSearch(TOnFoundFunctor & onFoundFunctor,TTreeIteratorA iterA,TIterPosA iterPosA,TTreeIteratorB iterB,TIterPosB iterPosB)69 _exactTreeSearch(
70 TOnFoundFunctor &onFoundFunctor, // functor to store matches
71 TTreeIteratorA iterA, // pattern tree iterator
72 TIterPosA iterPosA, // position in representative
73 TTreeIteratorB iterB, // text suffix tree iterator
74 TIterPosB iterPosB) // position in representative
75 {
76 while (true)
77 {
78 TIterPosA repLenA = repLength(iterA);
79 TIterPosB repLenB = repLength(iterB);
80 if (iterPosA < repLenA)
81 {
82 // search patterns in the suffix tree
83 if (!goDown(iterB, suffix(representative(iterA), iterPosA)))
84 return;
85 // end of pattern found?
86 if (isLeaf(iterA))
87 {
88 onFoundFunctor(iterA, iterB);
89 return;
90 }
91 iterPosB += repLenA - iterPosA;
92 iterPosA = repLenA;
93 }
94 else if (iterPosB < repLenB)
95 {
96 // search text in the pattern tree
97 TIterPosA lcp = 0;
98 if (!_goDownString(iterA, suffix(representative(iterB), iterPosB), lcp))
99 {
100 // couldn't we proceed to go down because of a leaf in the pattern tree?
101 if (iterPosA + lcp == repLength(iterA) && isLeaf(iterA))
102 onFoundFunctor(iterA, iterB);
103 return;
104 }
105 // end of pattern found?
106 if (isLeaf(iterA))
107 {
108 onFoundFunctor(iterA, iterB);
109 return;
110 }
111 iterPosA += repLenB - iterPosB;
112 iterPosB = repLenB;
113 }
114 else
115 {
116 if (!goDown(iterA))
117 {
118 onFoundFunctor(iterA, iterB);
119 return;
120 }
121 while (emptyParentEdge(iterA))
122 {
123 onFoundFunctor(iterA, iterB);
124 if (!goRight(iterA))
125 return;
126 }
127
128 if (!goDown(iterB))
129 return;
130 while (emptyParentEdge(iterB))
131 {
132 if (!goRight(iterA))
133 return;
134 }
135
136 // search pairs of edges with the same character
137 while (true)
138 {
139 if (parentEdgeFirstChar(iterA) < parentEdgeFirstChar(iterB))
140 {
141 if (!goRight(iterA))
142 return;
143 }
144 else if (parentEdgeFirstChar(iterB) < parentEdgeFirstChar(iterA))
145 {
146 if (!goRight(iterB))
147 return;
148 }
149 else
150 {
151 _exactTreeSearch(onFoundFunctor, iterA, iterPosA, iterB, iterPosB);
152 if (!goRight(iterA) || !goRight(iterB))
153 return;
154 }
155 }
156 }
157 }
158 }
159
160 template <
161 bool enumerateA,
162 bool enumerateB,
163 typename TOnFoundFunctor,
164 typename TTreeIteratorA,
165 typename TIterPosA,
166 typename TTreeIteratorB,
167 typename TIterPosB,
168 typename TErrors >
169 inline void
_approximateTreeSearch(TOnFoundFunctor & onFoundFunctor,TTreeIteratorA iterA,TIterPosA iterPosA,TTreeIteratorB iterB_,TIterPosB iterPosB,TErrors errorsLeft)170 _approximateTreeSearch(
171 TOnFoundFunctor &onFoundFunctor, // functor to store matches
172 TTreeIteratorA iterA, // pattern tree iterator
173 TIterPosA iterPosA, // position in representative
174 TTreeIteratorB iterB_, // text suffix tree iterator
175 TIterPosB iterPosB, // position in representative
176 TErrors errorsLeft) // tolerated errors left
177 {
178 if (enumerateA && !goDown(iterA))
179 {
180 onFoundFunctor(iterA, iterB_);
181 return;
182 }
183 if (enumerateB && !goDown(iterB_)) return;
184
185 do
186 {
187 TTreeIteratorB iterB = iterB_;
188 do
189 {
190 TErrors e = errorsLeft;
191 TIterPosA ipA = iterPosA;
192 TIterPosB ipB = iterPosB;
193
194 if (ipB == repLength(iterB)) continue;
195
196 while (true)
197 {
198 if (ipA == repLength(iterA))
199 {
200 if (ipB == repLength(iterB))
201 _approximateTreeSearch<true,true>(onFoundFunctor, iterA, ipA, iterB, ipB, e);
202 else
203 _approximateTreeSearch<true,false>(onFoundFunctor, iterA, ipA, iterB, ipB, e);
204 break;
205 }
206 else if (ipB == repLength(iterB))
207 {
208 _approximateTreeSearch<false,true>(onFoundFunctor, iterA, ipA, iterB, ipB, e);
209 break;
210 }
211
212 if (representative(iterA)[ipA] != representative(iterB)[ipB])
213 if (e-- == 0) break;
214
215 ++ipA;
216 ++ipB;
217 }
218 } while (enumerateB && goRight(iterB));
219 } while (enumerateA && goRight(iterA));
220 }
221
222
223 template <typename TSAValue>
224 struct AddSingleResultsFunctor
225 {
226 String<TSAValue> results;
227
228 template <typename TPattern, typename TIter>
operatorAddSingleResultsFunctor229 void operator() (TPattern & /*pattern*/, TIter &iter)
230 {
231 append(results, getOccurrences(iter));
232 }
233 };
234
235
236 template <
237 typename TOnFoundFunctor,
238 typename TString,
239 typename TStringPos,
240 typename TTreeIterator,
241 typename TIterPos,
242 typename TErrors >
243 inline void
_approximateStringSearch(TOnFoundFunctor & onFoundFunctor,TString const & string,TStringPos stringPos,TTreeIterator iter,TIterPos iterPos,TErrors errorsLeft)244 _approximateStringSearch(
245 TOnFoundFunctor &onFoundFunctor, // functor to store matches
246 TString const &string, // search pattern
247 TStringPos stringPos, // position in pattern
248 TTreeIterator iter, // text suffix tree iterator
249 TIterPos iterPos, // position in representative
250 TErrors errorsLeft) // tolerated errors left
251 {
252 if (errorsLeft == 0)
253 {
254 if (goDown(iter, suffix(string, stringPos)))
255 onFoundFunctor(string, iter);
256 return;
257 }
258
259 if (!goDown(iter)) return;
260 do
261 {
262 TErrors e = errorsLeft;
263 TStringPos sp = stringPos;
264 TIterPos ip = iterPos;
265
266 if (ip == repLength(iter)) continue;
267
268 while (true)
269 {
270 if (representative(iter)[ip] != string[sp])
271 if (e-- == 0) break;
272
273 if (++sp == length(string))
274 {
275 onFoundFunctor(string, iter);
276 break;
277 }
278
279 if (++ip == repLength(iter))
280 {
281 _approximateStringSearch(onFoundFunctor, string, sp, iter, ip, e);
282 break;
283 }
284 }
285 } while (goRight(iter));
286 }
287
288 template <
289 typename TOnFoundFunctor,
290 typename TString,
291 typename TTreeIterator,
292 typename TErrors>
293 inline void
approximateStringSearch(TOnFoundFunctor & onFoundFunctor,TString const & string,TTreeIterator & iter,TErrors errorsLeft)294 approximateStringSearch(
295 TOnFoundFunctor &onFoundFunctor,
296 TString const &string,
297 TTreeIterator &iter,
298 TErrors errorsLeft)
299 {
300 if (length(string) <= errorsLeft)
301 {
302 onFoundFunctor(string, iter);
303 return;
304 }
305 _approximateStringSearch(onFoundFunctor, string, 0u, iter, repLength(iter), errorsLeft);
306 }
307
308 template <
309 typename TOnFoundFunctor,
310 typename TTreeIteratorA,
311 typename TTreeIteratorB,
312 typename TErrors>
313 inline void
approximateTreeSearch(TOnFoundFunctor & onFoundFunctor,TTreeIteratorA const & iterA,TTreeIteratorB const & iterB,TErrors errorsLeft)314 approximateTreeSearch(
315 TOnFoundFunctor &onFoundFunctor,
316 TTreeIteratorA const &iterA,
317 TTreeIteratorB const &iterB,
318 TErrors errorsLeft)
319 {
320 _approximateTreeSearch<true,true>(
321 onFoundFunctor,
322 iterA,
323 repLength(iterA),
324 iterB,
325 repLength(iterB),
326 errorsLeft);
327 }
328
329 }
330 #endif
331
332