1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Tobias Rausch <rausch@embl.de>
33 // ==========================================================================
34 
35 #ifndef SEQAN_HEADER_FIND_SETHORSPOOL_H
36 #define SEQAN_HEADER_FIND_SETHORSPOOL_H
37 
38 namespace SEQAN_NAMESPACE_MAIN
39 {
40 
41 //////////////////////////////////////////////////////////////////////////////
42 // Set Horspool Algorithm
43 //////////////////////////////////////////////////////////////////////////////
44 
45 /*!
46  * @class SetHorspoolPattern
47  * @extends Pattern
48  * @headerfile <seqan/find.h>
49  *
50  * @brief Multiple exact string matching using set horspool algorithm.
51  *
52  * @signature template <typename TNeedle>
53  *            class Pattern<TNeedle, SetHorspool>;
54  *
55  * @tparam TNeedle The needle type, a string of keywords.  Types: @link ContainerConcept @endlink.
56  *
57  * The types of all keywords in the needle and the haystack have to match.
58  */
59 
60 struct SetHorspool_;
61 typedef Tag<SetHorspool_> SetHorspool;
62 
63 //////////////////////////////////////////////////////////////////////////////
64 
65 template <typename TNeedle>
66 class Pattern<TNeedle, SetHorspool> {
67 //____________________________________________________________________________
68 private:
69     Pattern(Pattern const& other);
70     Pattern const& operator=(Pattern const & other);
71 
72 //____________________________________________________________________________
73 public:
74     typedef typename Size<TNeedle>::Type TSize;
75     typedef typename Value<TNeedle>::Type TValue;
76     typedef typename Value<TValue>::Type TAlphabet;
77     typedef Graph<Automaton<TAlphabet> > TGraph;
78     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
79 
80     Holder<TNeedle> data_host;
81     Graph<Automaton<TAlphabet> > data_reverseTrie;  // Search trie
82     String<String<TSize> > data_terminalStateMap;
83     String<TSize> data_dMap;    // Jump table
84     TSize data_lmin;
85     String<TSize> data_endPositions;    // All remaining keyword indices
86     TSize data_keywordIndex;            // Current keyword that produced a hit
87     TSize data_needleLength;            // Last length of needle to reposition finder
88     TVertexDescriptor data_lastState;   // Last state in the trie
89 
90 //____________________________________________________________________________
91 
Pattern()92     Pattern() {
93     }
94 
95     template <typename TNeedle2>
Pattern(TNeedle2 const & ndl)96     Pattern(TNeedle2 const & ndl)
97     {
98         setHost(*this, ndl);
99     }
100 
~Pattern()101     ~Pattern() {
102         SEQAN_CHECKPOINT
103     }
104 //____________________________________________________________________________
105 };
106 
107 //////////////////////////////////////////////////////////////////////////////
108 // Host Metafunctions
109 //////////////////////////////////////////////////////////////////////////////
110 
111 template <typename TNeedle>
112 struct Host< Pattern<TNeedle, SetHorspool> >
113 {
114     typedef TNeedle Type;
115 };
116 
117 template <typename TNeedle>
118 struct Host< Pattern<TNeedle, SetHorspool> const>
119 {
120     typedef TNeedle const Type;
121 };
122 
123 //////////////////////////////////////////////////////////////////////////////
124 // Functions
125 //////////////////////////////////////////////////////////////////////////////
126 
127 template <typename TNeedle, typename TNeedle2>
128 void setHost (Pattern<TNeedle, SetHorspool> & me, TNeedle2 const & needle) {
129     SEQAN_CHECKPOINT
130     typedef typename Value<TNeedle>::Type TKeyword;
131     typedef typename Size<TKeyword>::Type TSize;
132     typedef typename Value<TKeyword>::Type TAlphabet;
133 
134     // clean-up
135     clear(me.data_reverseTrie);
136     clear(me.data_terminalStateMap);
137     clear(me.data_endPositions);
138     clear(me.data_dMap);
139     me.data_lmin=0;
140 
141     // Create Trie
142     createTrieOnReverse(me.data_reverseTrie,me.data_terminalStateMap,needle);
143     assignRoot(me.data_reverseTrie,0);
144     setValue(me.data_host, needle);
145 
146     // Create jump map
147     TSize alphabet_size = ValueSize<TAlphabet>::VALUE;
148     resize(me.data_dMap, alphabet_size);
149     me.data_lmin = _getInfinity<TSize>();
150     typename Iterator<TNeedle2 const, Rooted>::Type it = begin(needle);
151     for(;!atEnd(it);goNext(it)) {
152         TSize tmp = length(*it);
153         if (tmp<me.data_lmin) me.data_lmin = tmp;
154     }
155     for(TSize i=0;i<alphabet_size;++i) {
156         me.data_dMap[i]=me.data_lmin;
157     }
158     goBegin(it);
159     for(;!atEnd(it);goNext(it)) {
160         for(TSize pos = 0;pos < length(*it) - 1; ++pos) {
161             TSize ind = ordValue((TAlphabet)(*it)[pos]);
162             if ((length(*it)- 1 - pos) < me.data_dMap[ind]) {
163                 me.data_dMap[ind] = (length(*it) - 1 - pos);
164             }
165         }
166     }
167 
168     /*
169     fstream strm;
170     strm.open(TEST_PATH "my_trie.dot", ios_base::out | ios_base::trunc);
171     String<String<char> > nodeMap;
172     _createTrieNodeNames(me.data_reverseTrie, me.data_terminalStateMap, nodeMap);
173     String<String<char> > edgeMap;
174     _createEdgeNames(me.data_reverseTrie,edgeMap);
175     writeRecords(strm,me.data_reverseTrie,nodeMap,edgeMap,DotDrawing());
176     strm.close();
177     */
178 }
179 
180 template <typename TNeedle, typename TNeedle2>
181 void setHost (Pattern<TNeedle, SetHorspool> & me, TNeedle2 & needle)
182 {
183     setHost(me, reinterpret_cast<TNeedle2 const &>(needle));
184 }
185 
186 //____________________________________________________________________________
187 
188 
189 template <typename TNeedle>
190 inline void _patternInit (Pattern<TNeedle, SetHorspool> & me)
191 {
192 SEQAN_CHECKPOINT
193     clear(me.data_endPositions);
194     me.data_keywordIndex = 0;
195     me.data_lastState = getRoot(me.data_reverseTrie);
196 }
197 
198 
199 //____________________________________________________________________________
200 
201 template <typename TNeedle>
202 inline typename Host<Pattern<TNeedle, SetHorspool>const>::Type &
203 host(Pattern<TNeedle, SetHorspool> & me)
204 {
205 SEQAN_CHECKPOINT
206     return value(me.data_host);
207 }
208 
209 template <typename TNeedle>
210 inline typename Host<Pattern<TNeedle, SetHorspool>const>::Type &
211 host(Pattern<TNeedle, SetHorspool> const & me)
212 {
213 SEQAN_CHECKPOINT
214     return value(me.data_host);
215 }
216 
217 //____________________________________________________________________________
218 
219 
220 template <typename TNeedle>
221 inline typename Size<TNeedle>::Type
222 position(Pattern<TNeedle, SetHorspool> & me)
223 {
224     return me.data_keywordIndex;
225 }
226 
227 
228 template <typename TFinder, typename TNeedle>
229 inline bool find(TFinder & finder, Pattern<TNeedle, SetHorspool> & me) {
230     SEQAN_CHECKPOINT
231     typedef typename Value<TNeedle>::Type TKeyword;
232     typedef typename Size<TKeyword>::Type TSize;
233     typedef typename Value<TKeyword>::Type TAlphabet;
234     typedef Graph<Automaton<TAlphabet> > TGraph;
235     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
236 
237     TVertexDescriptor current = getRoot(me.data_reverseTrie);
238 
239     // Process left-over hits
240     if ((!empty(finder)) &&
241         (!empty(me.data_endPositions))) {
242         finder += me.data_needleLength;
243         current = me.data_lastState;
244         me.data_keywordIndex = me.data_endPositions[length(me.data_endPositions)-1];
245         me.data_needleLength = length(getValue(host(me), me.data_keywordIndex))-1;
246         if (length(me.data_endPositions) > 1) resize(me.data_endPositions, (length(me.data_endPositions)-1));
247         else clear(me.data_endPositions);
248         me.data_lastState = current;
249         finder -= me.data_needleLength;
250         return true;
251     }
252 
253     TVertexDescriptor nilVal = getNil<TVertexDescriptor>();
254     TSize j = 0;
255     if (empty(finder)) {
256         _patternInit(me);
257         _finderSetNonEmpty(finder);
258         finder += me.data_lmin - 1;
259     } else {
260         finder += me.data_needleLength;
261         j = me.data_needleLength + 1;
262         current = me.data_lastState;
263     }
264 
265     TSize haystackLength = length(container(finder));
266     bool oldMatch = true;
267     // Do not change to !atEnd(finder) because of jump map!
268     while(position(finder) < haystackLength) {
269         while ((position(finder)>=j) &&
270                 (getSuccessor(me.data_reverseTrie, current, *(finder-j))!= nilVal))
271         {
272             me.data_endPositions = getProperty(me.data_terminalStateMap,current);
273             if ((!oldMatch) && (!empty(me.data_endPositions))) break;
274             current = getSuccessor(me.data_reverseTrie, current, *(finder-j));
275             if (current == nilVal) break;
276             ++j;
277             oldMatch = false;
278         }
279         me.data_endPositions = getProperty(me.data_terminalStateMap,current);
280         if ((!oldMatch) &&
281             (!empty(me.data_endPositions)))
282         {
283             me.data_keywordIndex = me.data_endPositions[length(me.data_endPositions)-1];
284             me.data_needleLength = length(getValue(host(me), me.data_keywordIndex))-1;
285             if (length(me.data_endPositions) > 1) resize(me.data_endPositions, length(me.data_endPositions)-1);
286             else clear(me.data_endPositions);
287             me.data_lastState = current;
288             finder -= me.data_needleLength;
289             _setFinderLength(finder, me.data_needleLength+1);
290             _setFinderEnd(finder, position(finder)+length(finder));
291             return true;
292         }
293         oldMatch = false;
294         TSize ind = ordValue(*finder);
295         setPosition(finder, position(finder) + getValue(me.data_dMap, ind));
296         j = 0;
297         current = getRoot(me.data_reverseTrie);
298     }
299     return false;
300 }
301 
302 }// namespace SEQAN_NAMESPACE_MAIN
303 
304 #endif //#ifndef SEQAN_HEADER_FIND_SETHORSPOOL_H
305