1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: Tobias Rausch <rausch@embl.de> 33 // ========================================================================== 34 35 #ifndef SEQAN_HEADER_FIND_SETHORSPOOL_H 36 #define SEQAN_HEADER_FIND_SETHORSPOOL_H 37 38 namespace SEQAN_NAMESPACE_MAIN 39 { 40 41 ////////////////////////////////////////////////////////////////////////////// 42 // Set Horspool Algorithm 43 ////////////////////////////////////////////////////////////////////////////// 44 45 /*! 46 * @class SetHorspoolPattern 47 * @extends Pattern 48 * @headerfile <seqan/find.h> 49 * 50 * @brief Multiple exact string matching using set horspool algorithm. 51 * 52 * @signature template <typename TNeedle> 53 * class Pattern<TNeedle, SetHorspool>; 54 * 55 * @tparam TNeedle The needle type, a string of keywords. Types: @link ContainerConcept @endlink. 56 * 57 * The types of all keywords in the needle and the haystack have to match. 58 */ 59 60 struct SetHorspool_; 61 typedef Tag<SetHorspool_> SetHorspool; 62 63 ////////////////////////////////////////////////////////////////////////////// 64 65 template <typename TNeedle> 66 class Pattern<TNeedle, SetHorspool> { 67 //____________________________________________________________________________ 68 private: 69 Pattern(Pattern const& other); 70 Pattern const& operator=(Pattern const & other); 71 72 //____________________________________________________________________________ 73 public: 74 typedef typename Size<TNeedle>::Type TSize; 75 typedef typename Value<TNeedle>::Type TValue; 76 typedef typename Value<TValue>::Type TAlphabet; 77 typedef Graph<Automaton<TAlphabet> > TGraph; 78 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor; 79 80 Holder<TNeedle> data_host; 81 Graph<Automaton<TAlphabet> > data_reverseTrie; // Search trie 82 String<String<TSize> > data_terminalStateMap; 83 String<TSize> data_dMap; // Jump table 84 TSize data_lmin; 85 String<TSize> data_endPositions; // All remaining keyword indices 86 TSize data_keywordIndex; // Current keyword that produced a hit 87 TSize data_needleLength; // Last length of needle to reposition finder 88 TVertexDescriptor data_lastState; // Last state in the trie 89 90 //____________________________________________________________________________ 91 Pattern()92 Pattern() { 93 } 94 95 template <typename TNeedle2> Pattern(TNeedle2 const & ndl)96 Pattern(TNeedle2 const & ndl) 97 { 98 setHost(*this, ndl); 99 } 100 ~Pattern()101 ~Pattern() { 102 SEQAN_CHECKPOINT 103 } 104 //____________________________________________________________________________ 105 }; 106 107 ////////////////////////////////////////////////////////////////////////////// 108 // Host Metafunctions 109 ////////////////////////////////////////////////////////////////////////////// 110 111 template <typename TNeedle> 112 struct Host< Pattern<TNeedle, SetHorspool> > 113 { 114 typedef TNeedle Type; 115 }; 116 117 template <typename TNeedle> 118 struct Host< Pattern<TNeedle, SetHorspool> const> 119 { 120 typedef TNeedle const Type; 121 }; 122 123 ////////////////////////////////////////////////////////////////////////////// 124 // Functions 125 ////////////////////////////////////////////////////////////////////////////// 126 127 template <typename TNeedle, typename TNeedle2> 128 void setHost (Pattern<TNeedle, SetHorspool> & me, TNeedle2 const & needle) { 129 SEQAN_CHECKPOINT 130 typedef typename Value<TNeedle>::Type TKeyword; 131 typedef typename Size<TKeyword>::Type TSize; 132 typedef typename Value<TKeyword>::Type TAlphabet; 133 134 // clean-up 135 clear(me.data_reverseTrie); 136 clear(me.data_terminalStateMap); 137 clear(me.data_endPositions); 138 clear(me.data_dMap); 139 me.data_lmin=0; 140 141 // Create Trie 142 createTrieOnReverse(me.data_reverseTrie,me.data_terminalStateMap,needle); 143 assignRoot(me.data_reverseTrie,0); 144 setValue(me.data_host, needle); 145 146 // Create jump map 147 TSize alphabet_size = ValueSize<TAlphabet>::VALUE; 148 resize(me.data_dMap, alphabet_size); 149 me.data_lmin = _getInfinity<TSize>(); 150 typename Iterator<TNeedle2 const, Rooted>::Type it = begin(needle); 151 for(;!atEnd(it);goNext(it)) { 152 TSize tmp = length(*it); 153 if (tmp<me.data_lmin) me.data_lmin = tmp; 154 } 155 for(TSize i=0;i<alphabet_size;++i) { 156 me.data_dMap[i]=me.data_lmin; 157 } 158 goBegin(it); 159 for(;!atEnd(it);goNext(it)) { 160 for(TSize pos = 0;pos < length(*it) - 1; ++pos) { 161 TSize ind = ordValue((TAlphabet)(*it)[pos]); 162 if ((length(*it)- 1 - pos) < me.data_dMap[ind]) { 163 me.data_dMap[ind] = (length(*it) - 1 - pos); 164 } 165 } 166 } 167 168 /* 169 fstream strm; 170 strm.open(TEST_PATH "my_trie.dot", ios_base::out | ios_base::trunc); 171 String<String<char> > nodeMap; 172 _createTrieNodeNames(me.data_reverseTrie, me.data_terminalStateMap, nodeMap); 173 String<String<char> > edgeMap; 174 _createEdgeNames(me.data_reverseTrie,edgeMap); 175 writeRecords(strm,me.data_reverseTrie,nodeMap,edgeMap,DotDrawing()); 176 strm.close(); 177 */ 178 } 179 180 template <typename TNeedle, typename TNeedle2> 181 void setHost (Pattern<TNeedle, SetHorspool> & me, TNeedle2 & needle) 182 { 183 setHost(me, reinterpret_cast<TNeedle2 const &>(needle)); 184 } 185 186 //____________________________________________________________________________ 187 188 189 template <typename TNeedle> 190 inline void _patternInit (Pattern<TNeedle, SetHorspool> & me) 191 { 192 SEQAN_CHECKPOINT 193 clear(me.data_endPositions); 194 me.data_keywordIndex = 0; 195 me.data_lastState = getRoot(me.data_reverseTrie); 196 } 197 198 199 //____________________________________________________________________________ 200 201 template <typename TNeedle> 202 inline typename Host<Pattern<TNeedle, SetHorspool>const>::Type & 203 host(Pattern<TNeedle, SetHorspool> & me) 204 { 205 SEQAN_CHECKPOINT 206 return value(me.data_host); 207 } 208 209 template <typename TNeedle> 210 inline typename Host<Pattern<TNeedle, SetHorspool>const>::Type & 211 host(Pattern<TNeedle, SetHorspool> const & me) 212 { 213 SEQAN_CHECKPOINT 214 return value(me.data_host); 215 } 216 217 //____________________________________________________________________________ 218 219 220 template <typename TNeedle> 221 inline typename Size<TNeedle>::Type 222 position(Pattern<TNeedle, SetHorspool> & me) 223 { 224 return me.data_keywordIndex; 225 } 226 227 228 template <typename TFinder, typename TNeedle> 229 inline bool find(TFinder & finder, Pattern<TNeedle, SetHorspool> & me) { 230 SEQAN_CHECKPOINT 231 typedef typename Value<TNeedle>::Type TKeyword; 232 typedef typename Size<TKeyword>::Type TSize; 233 typedef typename Value<TKeyword>::Type TAlphabet; 234 typedef Graph<Automaton<TAlphabet> > TGraph; 235 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor; 236 237 TVertexDescriptor current = getRoot(me.data_reverseTrie); 238 239 // Process left-over hits 240 if ((!empty(finder)) && 241 (!empty(me.data_endPositions))) { 242 finder += me.data_needleLength; 243 current = me.data_lastState; 244 me.data_keywordIndex = me.data_endPositions[length(me.data_endPositions)-1]; 245 me.data_needleLength = length(getValue(host(me), me.data_keywordIndex))-1; 246 if (length(me.data_endPositions) > 1) resize(me.data_endPositions, (length(me.data_endPositions)-1)); 247 else clear(me.data_endPositions); 248 me.data_lastState = current; 249 finder -= me.data_needleLength; 250 return true; 251 } 252 253 TVertexDescriptor nilVal = getNil<TVertexDescriptor>(); 254 TSize j = 0; 255 if (empty(finder)) { 256 _patternInit(me); 257 _finderSetNonEmpty(finder); 258 finder += me.data_lmin - 1; 259 } else { 260 finder += me.data_needleLength; 261 j = me.data_needleLength + 1; 262 current = me.data_lastState; 263 } 264 265 TSize haystackLength = length(container(finder)); 266 bool oldMatch = true; 267 // Do not change to !atEnd(finder) because of jump map! 268 while(position(finder) < haystackLength) { 269 while ((position(finder)>=j) && 270 (getSuccessor(me.data_reverseTrie, current, *(finder-j))!= nilVal)) 271 { 272 me.data_endPositions = getProperty(me.data_terminalStateMap,current); 273 if ((!oldMatch) && (!empty(me.data_endPositions))) break; 274 current = getSuccessor(me.data_reverseTrie, current, *(finder-j)); 275 if (current == nilVal) break; 276 ++j; 277 oldMatch = false; 278 } 279 me.data_endPositions = getProperty(me.data_terminalStateMap,current); 280 if ((!oldMatch) && 281 (!empty(me.data_endPositions))) 282 { 283 me.data_keywordIndex = me.data_endPositions[length(me.data_endPositions)-1]; 284 me.data_needleLength = length(getValue(host(me), me.data_keywordIndex))-1; 285 if (length(me.data_endPositions) > 1) resize(me.data_endPositions, length(me.data_endPositions)-1); 286 else clear(me.data_endPositions); 287 me.data_lastState = current; 288 finder -= me.data_needleLength; 289 _setFinderLength(finder, me.data_needleLength+1); 290 _setFinderEnd(finder, position(finder)+length(finder)); 291 return true; 292 } 293 oldMatch = false; 294 TSize ind = ordValue(*finder); 295 setPosition(finder, position(finder) + getValue(me.data_dMap, ind)); 296 j = 0; 297 current = getRoot(me.data_reverseTrie); 298 } 299 return false; 300 } 301 302 }// namespace SEQAN_NAMESPACE_MAIN 303 304 #endif //#ifndef SEQAN_HEADER_FIND_SETHORSPOOL_H 305