1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: David Weese <david.weese@fu-berlin.de> 33 // ========================================================================== 34 35 #ifndef SEQAN_HEADER_INDEX_WOTD_H 36 #define SEQAN_HEADER_INDEX_WOTD_H 37 38 namespace SEQAN_NAMESPACE_MAIN 39 { 40 41 42 ////////////////////////////////////////////////////////////////////////////// 43 // wotd tree index fibres 44 45 /*! 46 * @defgroup WOTDIndexFibres WOTD Index Fibres 47 * @brief Tag to select a specific fibre (e.g. table, object, ...) of an @link 48 * IndexWotd @endlink index. 49 * 50 * These tags can be used to get @link Fibre @endlink of an @link IndexWotd @endlink. 51 * 52 * @see Fibre 53 * @see Index#getFibre 54 * @see IndexWotd 55 * 56 * @tag WOTDIndexFibres#WotdDir 57 * @brief The child table. 58 * 59 * @tag WOTDIndexFibres#WotdRawSA 60 * @brief The raw suffix array. 61 * 62 * @tag WOTDIndexFibres#WotdText 63 * @brief The original text the index should be based on. 64 * 65 * @tag WOTDIndexFibres#WotdRawText 66 * @brief The raw text the index is really based on. 67 * 68 * @tag WOTDIndexFibres#WotdSA 69 * @brief The suffix array. 70 */ 71 72 ////////////////////////////////////////////////////////////////////////////// 73 /*! 74 * @fn IndexWotd#indexSA 75 * @headerfile <seqan/index.h> 76 * @brief Shortcut for <tt>getFibre(.., WotdSA)</tt>. 77 * 78 * @signature TSa indexSA(index); 79 * 80 * @param[in] index The @link IndexWotd @endlink object holding the fibre. 81 * 82 * @return TSa A reference to the @link WOTDIndexFibres#WotdSA @endlink fibre (partially sorted suffix array). 83 */ 84 85 /*! 86 * @fn IndexWotd#indexDir 87 * @headerfile <seqan/index.h> 88 * @brief Shortcut for <tt>getFibre(.., WotdDir())</tt>. 89 * @signature TFibre indexDir(index); 90 * 91 * @param[in] index The @link IndexWotd @endlink object holding the fibre. 92 * 93 * @return TFibre A reference to the @link WOTDIndexFibres#WotdDir @endlink fibre (tree structure). 94 */ 95 96 /*! 97 * @fn IndexWotd#saAt 98 * @headerfile <seqan/index.h> 99 * @note Advanced functionality, not commonly used. 100 * @brief Shortcut for <tt>value(indexSA(..), ..)</tt>. 101 * 102 * @signature TValue saAt(position, index); 103 * 104 * @param[in] index The @link IndexWotd @endlink object holding the fibre. 105 * @param[in] position A position in the array on which the value should be accessed. 106 * 107 * @return TValue A reference or proxy to the value in the @link WOTDIndexFibres#WotdSA @endlink fibre. 108 * To be more precise, a reference to a position containing a value of type 109 * @link SAValue @endlink is returned (or a proxy). 110 */ 111 112 /*! 113 * @fn IndexWotd#dirAt 114 * @headerfile <seqan/index.h> 115 * @brief Shortcut for <tt>value(indexDir(index), position)</tt>. 116 * 117 * @signature TFibre dirAt(position, index); 118 * 119 * @param[in] index The @link IndexWotd @endlink object holding the fibre. 120 * @param[in] position A position in the array on which the value should be accessed. 121 * 122 * @return TFibre A reference to the @link WOTDIndexFibres#WotdDir @endlink fibre. 123 */ 124 125 typedef FibreText WotdText; 126 typedef FibreRawText WotdRawText; 127 typedef FibreSA WotdSA; 128 typedef FibreRawSA WotdRawSA; 129 typedef FibreDir WotdDir; 130 131 ////////////////////////////////////////////////////////////////////////////// 132 // wotd tree index 133 134 /*! 135 * @class IndexWotd 136 * @extends Index 137 * @implements StringTreeConcept 138 * @headerfile <seqan/index.h> 139 * @brief An index based on a lazy suffix tree (see Giegerich et al., "Efficient implementation of lazy suffix 140 * trees"). 141 * 142 * @signature template <typename TText, typename TSpec> 143 * class Index<TText, IndexWotd<TSpec> >; 144 * 145 * @tparam TText The @link TextConcept @endlink text type. 146 * @tparam TSpec The type for further specialization of the Index type. 147 * 148 * The fibres (see @link Index @endlink and @link Fibre @endlink) of this index are a partially sorted suffix array 149 * (see @link WOTDIndexFibres#WotdSA @endlink) and the wotd tree (see @link WOTDIndexFibres#WotdDir @endlink). 150 * 151 * Demo: Demo.Constraint Iterator 152 * 153 * @see WOTDIndexFibres 154 */ 155 156 struct WotdOriginal_; 157 typedef Tag<WotdOriginal_> const WotdOriginal; 158 159 template < typename TSpec = void > 160 struct IndexWotd {}; 161 162 /* 163 template < typename TObject, typename TSpec > 164 struct Fibre< Index<TObject, IndexWotd<TSpec> >, FibreDir> 165 { 166 typedef Index<TObject, IndexWotd<TSpec> > TIndex; 167 typedef String< 168 typename typename Size<TIndex>::Type, 169 Alloc<> 170 > Type; 171 }; 172 */ 173 174 template < typename TObject, typename TSpec > 175 class Index<TObject, IndexWotd<TSpec> > { 176 public: 177 typedef typename Fibre<Index, WotdText>::Type TText; 178 typedef typename Fibre<Index, WotdSA>::Type TSA; 179 typedef typename Fibre<Index, WotdDir>::Type TDir; 180 181 typedef typename Value<Index>::Type TValue; 182 typedef typename Value<TDir>::Type TDirValue; 183 typedef typename Size<Index>::Type TSize; 184 typedef String<TSize, Alloc<> > TCounter; 185 typedef String<typename Value<TSA>::Type, Alloc<> > TTempSA; 186 typedef typename Cargo<Index>::Type TCargo; 187 188 // 1st word flags 189 static TDirValue const LEAF = (TDirValue)1 << (BitsPerValue<TDirValue>::VALUE - 1); // this node is a leaf 190 static TDirValue const LAST_CHILD = (TDirValue)1 << (BitsPerValue<TDirValue>::VALUE - 2); // this node is the last child 191 // 2nd word flag 192 static TDirValue const UNEVALUATED = (TDirValue)1 << (BitsPerValue<TDirValue>::VALUE - 1); // this node is partially evalutated and has no evaluated children 193 static TDirValue const SENTINELS = (TDirValue)1 << (BitsPerValue<TDirValue>::VALUE - 2); // the children of this node have solely $-edges 194 195 static TDirValue const BITMASK0 = ~(LEAF | LAST_CHILD); 196 static TDirValue const BITMASK1 = ~(UNEVALUATED | SENTINELS); 197 198 199 Holder<TText> text; // underlying text 200 TSA sa; // suffix array sorted by the first q chars 201 TDir dir; // bucket directory 202 TCargo cargo; // user-defined cargo 203 204 205 TTempSA tempSA; 206 TCounter tempOcc; 207 TCounter tempBound; 208 209 TSize sentinelOcc; 210 TSize sentinelBound; 211 bool interSentinelNodes; // should virtually one (true) $-sign or many (false) $_i-signs be appended to the strings in text 212 Index()213 Index(): 214 interSentinelNodes(false) {} 215 Index(Index & other)216 Index(Index &other) : 217 text(other.text), 218 sa(other.sa), 219 dir(other.dir), 220 cargo(other.cargo), 221 tempSA(other.tempSA), 222 tempBound(other.tempBound), 223 sentinelOcc(other.sentinelOcc), 224 sentinelBound(other.sentinelBound), 225 interSentinelNodes(other.interSentinelNodes) {} 226 Index(Index const & other)227 Index(Index const &other) : 228 text(other.text), 229 sa(other.sa), 230 dir(other.dir), 231 cargo(other.cargo), 232 tempSA(other.tempSA), 233 tempBound(other.tempBound), 234 sentinelOcc(other.sentinelOcc), 235 sentinelBound(other.sentinelBound), 236 interSentinelNodes(other.interSentinelNodes) {} 237 238 template <typename TText_> Index(TText_ & _text)239 Index(TText_ &_text) : 240 text(_text), 241 sentinelOcc(0), 242 sentinelBound(0), 243 interSentinelNodes(false) {} 244 245 template <typename TText_> Index(TText_ const & _text)246 Index(TText_ const &_text): 247 text(_text), 248 sentinelOcc(0), 249 sentinelBound(0), 250 interSentinelNodes(false) {} 251 }; 252 /* 253 template < typename TText, typename TSpec > 254 struct Value< Index<TText, IndexWotd<TSpec> > > { 255 typedef typename Value< typename Fibre< Index<TText, IndexWotd<TSpec> >, WotdRawText >::Type >::Type Type; 256 }; 257 258 template < typename TText, typename TSpec > 259 struct Size< Index<TText, IndexWotd<TSpec> > > { 260 typedef typename Size< typename Fibre< Index<TText, IndexWotd<TSpec> >, WotdRawText >::Type >::Type Type; 261 }; 262 */ 263 264 template <typename TText, typename TSpec> 265 SEQAN_CONCEPT_IMPL((Index<TText, IndexWotd<TSpec> >), (StringTreeConcept)); 266 267 template <typename TText, typename TSpec> 268 SEQAN_CONCEPT_IMPL((Index<TText, IndexWotd<TSpec> > const), (StringTreeConcept)); 269 270 ////////////////////////////////////////////////////////////////////////////// 271 // default fibre creators 272 273 template < typename TText, typename TSpec > 274 struct DefaultIndexCreator<Index<TText, IndexWotd<TSpec> >, FibreDir> { 275 typedef Default Type; 276 }; 277 278 template < typename TText, typename TSSetSpec, typename TSpec > 279 struct DefaultIndexCreator<Index<StringSet<TText, TSSetSpec>, IndexWotd<TSpec> >, FibreDir> { 280 typedef Default Type; 281 }; 282 283 ////////////////////////////////////////////////////////////////////////////// 284 // default finder 285 286 template < typename TText, typename TSpec > 287 struct DefaultFinder< Index<TText, IndexWotd<TSpec> > > 288 { 289 typedef FinderSTree Type; // standard wotd finder is tree based search 290 }; 291 292 ////////////////////////////////////////////////////////////////////////////// 293 294 295 template <typename TSize> 296 struct VertexWotdOriginal_ { 297 TSize node; // position of current node entry in directory 298 TSize parentRepLen; // representative length of parent node 299 TSize edgeLen; // length of edge above current node 300 301 VertexWotdOriginal_() : node(0), parentRepLen(0), edgeLen(0) {} 302 VertexWotdOriginal_(MinimalCtor) : node(0), parentRepLen(0), edgeLen(0) 303 { 304 _setSizeInval(node); 305 } 306 }; 307 308 template <typename TSize> 309 struct VertexWotdModified_ { 310 TSize node; // position of current node entry in directory 311 TSize parentRepLen; // representative length of parent node 312 TSize edgeLen; // length of edge above current node 313 Pair<TSize> range; // current SA interval of hits 314 TSize parentRight; // right boundary of parent node's range (allows to go right) 315 316 VertexWotdModified_() : 317 node(0), 318 parentRepLen(0), 319 edgeLen(0), 320 range(0,0), 321 parentRight(0) 322 {} 323 VertexWotdModified_(MinimalCtor) : 324 node(0), 325 parentRepLen(0), 326 edgeLen(0), 327 range(0,0), 328 parentRight(0) 329 {} 330 VertexWotdModified_(Pair<TSize> const &otherRange, TSize otherParentRight) : 331 node(0), 332 parentRepLen(0), 333 edgeLen(0), 334 range(otherRange), 335 parentRight(otherParentRight) 336 {} 337 }; 338 339 ////////////////////////////////////////////////////////////////////////////// 340 template < typename TText > 341 struct VertexDescriptor< Index<TText, IndexWotd<WotdOriginal> > > { 342 typedef typename Size< Index<TText, IndexWotd<WotdOriginal> > >::Type TSize; 343 typedef VertexWotdOriginal_<TSize> Type; 344 }; 345 346 template < typename TText, typename TSpec > 347 struct VertexDescriptor< Index<TText, IndexWotd<TSpec> > > { 348 typedef typename Size< Index<TText, IndexWotd<TSpec> > >::Type TSize; 349 typedef VertexWotdModified_<TSize> Type; 350 }; 351 352 template < typename TText, typename TSpec > 353 void _indexRequireTopDownIteration(Index<TText, IndexWotd<TSpec> > &index) 354 { 355 indexRequire(index, WotdDir()); 356 } 357 358 ////////////////////////////////////////////////////////////////////////////// 359 // history stack functions 360 361 template <typename TSize> 362 struct HistoryStackWotdOriginal_ 363 { 364 TSize node; // position of current node entry in directory 365 TSize edgeLen; // length of edge above current node 366 }; 367 368 template <typename TSize> 369 struct HistoryStackWotdModified_ 370 { 371 TSize node; // position of current node entry in directory 372 TSize edgeLen; // length of edge above current node 373 Pair<TSize> range; // current SA interval of hits 374 }; 375 376 template < typename TText, typename TSpec > 377 struct HistoryStackEntry_< Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown< ParentLinks<TSpec> > > > > 378 { 379 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 380 typedef typename Size<TIndex>::Type TSize; 381 typedef HistoryStackWotdOriginal_<TSize> Type; 382 }; 383 384 template < typename TText, typename TIndexSpec, typename TSpec > 385 struct HistoryStackEntry_< Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown< ParentLinks<TSpec> > > > > 386 { 387 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 388 typedef typename Size<TIndex>::Type TSize; 389 typedef HistoryStackWotdModified_<TSize> Type; 390 }; 391 392 393 template < typename TText, typename TSpec > 394 inline void 395 _historyPush(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &it) 396 { 397 it._parentDesc = value(it); 398 value(it).parentRepLen += parentEdgeLength(it); 399 } 400 401 template < typename TText, typename TIndexSpec, typename TSpec > 402 inline void 403 _historyPush(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it) 404 { 405 it._parentDesc = value(it); 406 value(it).parentRepLen += parentEdgeLength(it); 407 value(it).parentRight = value(it).range.i2; 408 } 409 410 template < typename TText, typename TSpec > 411 inline void 412 _historyPush(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown< ParentLinks<TSpec> > > > &it) 413 { 414 typedef typename Size< Index<TText, IndexWotd<WotdOriginal> > >::Type TSize; 415 TSize edgeLen = parentEdgeLength(it); 416 HistoryStackWotdOriginal_<TSize> entry = { value(it).node, edgeLen }; 417 appendValue(it.history, entry); 418 value(it).parentRepLen += edgeLen; 419 } 420 421 template < typename TText, typename TIndexSpec, typename TSpec > 422 inline void 423 _historyPush(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown< ParentLinks<TSpec> > > > &it) 424 { 425 typedef typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type TSize; 426 TSize edgeLen = parentEdgeLength(it); 427 HistoryStackWotdModified_<TSize> entry = { value(it).node, edgeLen, value(it).range }; 428 appendValue(it.history, entry); 429 value(it).parentRepLen += edgeLen; 430 value(it).parentRight = value(it).range.i2; 431 } 432 433 ////////////////////////////////////////////////////////////////////////////// 434 template < typename TText, typename TIndexSpec, typename TPropertyMap > 435 inline void 436 resizeVertexMap( 437 TPropertyMap & pm, 438 Index<TText, IndexWotd<TIndexSpec> > const& index) 439 { 440 resize(pm, length(indexDir(index)), Generous()); 441 } 442 443 /* // different interface compared to resizeVertexMap(graph, ...) 444 template < typename TText, typename TIndexSpec, typename TPropertyMap, typename TProperty > 445 inline void 446 resizeVertexMap( 447 Index<TText, IndexWotd<TIndexSpec> > const& index, 448 TPropertyMap & pm, 449 TProperty const & prop) 450 { 451 resize(pm, length(indexDir(index)), prop, Generous()); 452 } 453 */ 454 template < typename TSize > 455 inline typename Id< VertexWotdOriginal_<TSize> const >::Type 456 _getId(VertexWotdOriginal_<TSize> const &desc) 457 { 458 return desc.node; 459 } 460 461 template < typename TSize > 462 inline typename Id< VertexWotdOriginal_<TSize> >::Type 463 _getId(VertexWotdOriginal_<TSize> &desc) 464 { 465 return _getId(const_cast<VertexWotdOriginal_<TSize> const &>(desc)); 466 } 467 468 template < typename TSize > 469 inline typename Id< VertexWotdModified_<TSize> const >::Type 470 _getId(VertexWotdModified_<TSize> const &desc) 471 { 472 return desc.node; 473 } 474 475 template < typename TSize > 476 inline typename Id< VertexWotdModified_<TSize> >::Type 477 _getId(VertexWotdModified_<TSize> &desc) 478 { 479 return _getId(const_cast<VertexWotdModified_<TSize> const &>(desc)); 480 } 481 482 ////////////////////////////////////////////////////////////////////////////// 483 484 template < typename TSize > 485 inline bool _isRoot(VertexWotdOriginal_<TSize> const &value) 486 { 487 return value.node == 0; 488 } 489 490 template < typename TSize > 491 inline bool _isRoot(VertexWotdModified_<TSize> const &value) 492 { 493 return value.node == 0; 494 } 495 496 // is this a leaf? (including empty $-edges) 497 template < typename TText, typename TIndexSpec, typename TSpec, typename TDfsOrder > 498 inline bool _isLeaf( 499 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TSpec> > const &it, 500 VSTreeIteratorTraits<TDfsOrder, False> const) 501 { 502 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 503 TIndex const &index = container(it); 504 return (dirAt(value(it).node, index) & index.LEAF) != 0; 505 } 506 507 // is this a leaf? (excluding empty $-edges) 508 template < typename TText, typename TIndexSpec, typename TSpec, typename TDfsOrder > 509 inline bool _isLeaf( 510 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TSpec> > const &it, 511 VSTreeIteratorTraits<TDfsOrder, True> const) 512 { 513 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 514 515 TIndex const &index = container(it); 516 if (dirAt(value(it).node, index) & index.LEAF) 517 return true; 518 519 // ensure node evaluation and test for sentinel child edges? 520 return _wotdEvaluate(it) & index.SENTINELS; 521 } 522 523 524 // parentEdgeLength - ORIGINAL VERSION 525 template < typename TIndex, typename TSize > 526 inline typename Size<TIndex>::Type 527 parentEdgeLength(TIndex const &index, VertexWotdOriginal_<TSize> &vDesc) 528 { 529 TSize edgeLen = vDesc.edgeLen; 530 if (edgeLen != (TSize)-1) 531 return edgeLen; 532 533 TSize pos = vDesc.node; 534 TSize w0 = dirAt(pos, index); 535 if (w0 & index.LEAF) 536 return vDesc.edgeLen = suffixLength(w0 & index.BITMASK0, index); 537 538 TSize w1 = dirAt(pos + 1, index); 539 if (w1 & index.UNEVALUATED) 540 return vDesc.edgeLen = _bucketLcp( 541 infix(indexSA(index), w0 & index.BITMASK0, w1 & index.BITMASK1), 542 indexText(index)); 543 else 544 return vDesc.edgeLen = _getNodeLP(index, w1) - (w0 & index.BITMASK0); 545 } 546 547 // parentEdgeLength - MODIFIED VERSION 548 template < typename TIndex, typename TSize > 549 inline typename Size<TIndex>::Type 550 parentEdgeLength(TIndex const &index, VertexWotdModified_<TSize> &vDesc) 551 { 552 TSize edgeLen = vDesc.edgeLen; 553 if (edgeLen != (TSize)-1) 554 return edgeLen; 555 556 TSize pos = vDesc.node; 557 TSize w0 = dirAt(pos, index); 558 if (w0 & index.LEAF) 559 return vDesc.edgeLen = 560 suffixLength(saAt(vDesc.range.i1, index), index) - vDesc.parentRepLen; 561 562 TSize w1 = dirAt(pos + 1, index); 563 if (w1 & index.UNEVALUATED) 564 if (_isSizeInval(vDesc.range.i2)) 565 return vDesc.edgeLen = _bucketLcp( 566 suffix(indexSA(index), vDesc.range.i1), 567 indexText(index), 568 vDesc.parentRepLen) - vDesc.parentRepLen; 569 else 570 return vDesc.edgeLen = _bucketLcp( 571 infix(indexSA(index), vDesc.range.i1, vDesc.range.i2), 572 indexText(index), 573 vDesc.parentRepLen) - vDesc.parentRepLen; 574 else 575 return (dirAt(w1 & index.BITMASK1, index) & index.BITMASK0) - vDesc.parentRepLen; 576 } 577 578 template < typename TText, typename TIndexSpec, typename TSpec > 579 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 580 parentEdgeLength(Iter< 581 Index<TText, IndexWotd<TIndexSpec> >, 582 VSTree< TopDown<TSpec> > > const &it) 583 { 584 typedef Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > TIter; 585 return parentEdgeLength(container(it), value(const_cast<TIter&>(it))); 586 } 587 588 template < typename TText, typename TIndexSpec, typename TSpec > 589 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 590 parentRepLength(Iter< 591 Index<TText, IndexWotd<TIndexSpec> >, 592 VSTree< TopDown<TSpec> > > const &it) 593 { 594 return value(it).parentRepLen; 595 } 596 597 template < typename TText, typename TIndexSpec, typename TSpec > 598 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 599 parentRepLength(Iter< 600 Index<TText, IndexWotd<TIndexSpec> >, 601 VSTree< TopDown< ParentLinks<TSpec> > > > const &it) 602 { 603 return value(it).parentRepLen; 604 } 605 606 template < typename TText, typename TIndexSpec, typename TSpec > 607 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 608 repLength(Iter< 609 Index<TText, IndexWotd<TIndexSpec> >, 610 VSTree< TopDown<TSpec> > > const &it) 611 { 612 return parentRepLength(it) + parentEdgeLength(it); 613 } 614 615 616 // parentEdgeLabel - ORIGINAL VERSION (doesn't work if TText is a StringSet) 617 template < typename TText, typename TSpec > 618 inline typename Infix< typename Fibre<Index<TText, IndexWotd<WotdOriginal> >, EsaText>::Type const >::Type 619 parentEdgeLabel(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > const &it) 620 { 621 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 622 typedef typename Size<TIndex>::Type TSize; 623 624 TIndex const &index = container(it); 625 626 if (isRoot(it)) 627 return infix(indexText(index), 0, 0); 628 else { 629 TSize occ = _getNodeLP(index, value(it).node); 630 return infix(indexText(index), occ, occ + parentEdgeLength(it)); 631 } 632 } 633 634 // getOccurrence - ORIGINAL VERSION 635 template < typename TText, typename TSpec > 636 inline typename SAValue<Index<TText, IndexWotd<WotdOriginal> > >::Type 637 getOccurrence(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree<TSpec> > const &it) { 638 return _getNodeLP(container(it), value(it).node) - value(it).parentRepLen; 639 } 640 641 template < typename TText, typename TIndexSpec, typename TSpec > 642 inline bool 643 emptyParentEdge(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TopDown<TSpec> > > const &it) 644 { 645 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 646 647 TIndex const &index = container(it); 648 typename SAValue<TIndex>::Type pos = getOccurrence(it); 649 return getSeqOffset(pos, stringSetLimits(index)) + value(it).parentRepLen 650 == sequenceLength(getSeqNo(pos, stringSetLimits(index)), index); 651 } 652 653 // to avoid ambiguity 654 template < typename TText, typename TIndexSpec, typename TSpec > 655 inline bool 656 emptyParentEdge(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TopDown<ParentLinks<TSpec> > > > const &it) 657 { 658 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 659 660 TIndex const &index = container(it); 661 typename SAValue<TIndex>::Type pos = getOccurrence(it); 662 return getSeqOffset(pos, stringSetLimits(index)) + value(it).parentRepLen 663 == sequenceLength(getSeqNo(pos, stringSetLimits(index)), index); 664 } 665 666 667 668 template < typename TText, typename TSpec > 669 inline void 670 goRoot(Iter< 671 Index<TText, IndexWotd<WotdOriginal> >, 672 VSTree<TSpec> > &it) 673 { 674 _historyClear(it); 675 value(it).node = 0; // start in root node (first entry in dir) 676 value(it).parentRepLen = 0; // parent prefix length is 0 677 value(it).edgeLen = 0; // edge length is 0 678 } 679 680 template < typename TText, typename TIndexSpec, typename TSpec > 681 inline void 682 goRoot(Iter< 683 Index<TText, IndexWotd<TIndexSpec> >, 684 VSTree<TSpec> > &it) 685 { 686 _historyClear(it); 687 value(it).range.i1 = 0; // start in root node with range (0,infty) 688 _setSizeInval(value(it).range.i2); // infty is equivalent to length(index) and faster to compare 689 value(it).node = 0; // start in root node (first entry in dir) 690 value(it).parentRepLen = 0; // parent prefix length is 0 691 value(it).edgeLen = 0; // edge length is 0 692 } 693 694 template < typename TText, typename TSpec > 695 inline bool atEnd(Iter<Index<TText, IndexWotd<WotdOriginal> >, VSTree<TSpec> > &it) 696 { 697 return _isSizeInval(value(it).node); 698 } 699 700 template < typename TText, typename TSpec > 701 inline bool atEnd(Iter<Index<TText, IndexWotd<WotdOriginal> >, VSTree<TSpec> > const &it) 702 { 703 return _isSizeInval(value(it).node); 704 } 705 706 707 // adjust iterator's right border of SA range 708 template < typename TText, typename TSpec > 709 inline void 710 _adjustRightBorder( 711 Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &) 712 {} 713 714 template < typename TText, typename TIndexSpec, typename TSpec > 715 inline void 716 _adjustRightBorder( 717 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it) 718 { 719 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 720 typedef typename Size<TIndex>::Type TSize; 721 722 TIndex const &index = container(it); 723 TSize pos = value(it).node; 724 725 TSize w0 = dirAt(pos, index); 726 if (w0 & index.LEAF) 727 value(it).range.i2 = value(it).range.i1 + 1; 728 else 729 if (w0 & index.LAST_CHILD) 730 value(it).range.i2 = value(it).parentRight; 731 else { 732 w0 = dirAt(pos + 2, index); 733 value(it).range.i2 = w0 & index.BITMASK0; 734 } 735 } 736 737 // go down the leftmost edge (including empty $-edges) 738 template < typename TText, typename TSpec, typename TDfsOrder, typename THideEmptyEdges > 739 inline bool 740 _goDown( 741 Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &it, 742 VSTreeIteratorTraits<TDfsOrder, THideEmptyEdges> const) 743 { 744 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 745 typedef typename Size<TIndex>::Type TSize; 746 747 if (_isLeaf(it, EmptyEdges())) return false; 748 749 TIndex &index = container(it); 750 _historyPush(it); 751 752 // ensure node evaluation 753 TSize childNode = _wotdEvaluate(it); 754 755 if (THideEmptyEdges::VALUE && (childNode & index.SENTINELS) != 0) 756 return false; 757 758 // go down 759 value(it).node = childNode & index.BITMASK1; 760 value(it).edgeLen = -1; 761 762 // go right if parent edge is empty 763 // or hull predicate is false 764 if ((THideEmptyEdges::VALUE && emptyParentEdge(it)) || !nodeHullPredicate(it)) 765 if (!goRight(it)) { 766 _goUp(it); 767 return false; 768 } 769 770 return true; 771 } 772 773 // go down the leftmost edge (excluding empty $-edges) 774 template < typename TText, typename TIndexSpec, typename TSpec, typename TDfsOrder, typename THideEmptyEdges > 775 inline bool 776 _goDown( 777 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it, 778 VSTreeIteratorTraits<TDfsOrder, THideEmptyEdges> const) 779 { 780 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 781 typedef typename Size<TIndex>::Type TSize; 782 783 if (_isLeaf(it, EmptyEdges())) return false; 784 TIndex const &index = container(it); 785 786 // ensure node evaluation 787 TSize childNode = _wotdEvaluate(it); 788 789 if (THideEmptyEdges::VALUE && (childNode & index.SENTINELS) != 0) 790 return false; 791 792 // go down 793 _historyPush(it); 794 value(it).node = childNode & index.BITMASK1; 795 value(it).edgeLen = -1; 796 _adjustRightBorder(it); 797 798 // go right if parent edge is empty 799 // or hull predicate is false 800 if ((THideEmptyEdges::VALUE && emptyParentEdge(it)) || !nodeHullPredicate(it)) 801 if (!goRight(it)) { 802 _goUp(it); 803 return false; 804 } 805 806 return true; 807 } 808 809 // go right to the lexic. next sibling 810 template < typename TText, typename TSpec, typename TDfsOrder, typename THideEmptyEdges > 811 inline bool 812 _goRight( 813 Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &it, 814 VSTreeIteratorTraits<TDfsOrder, THideEmptyEdges> const) 815 { 816 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 817 typedef typename Size<TIndex>::Type TSize; 818 819 TIndex const &index = container(it); 820 821 do { 822 TSize w0 = dirAt(value(it).node, index); 823 if (w0 & index.LAST_CHILD) 824 return false; 825 826 value(it).node += (w0 & index.LEAF)? 1: 2; 827 value(it).edgeLen = -1; 828 829 _adjustRightBorder(it); 830 } while ((THideEmptyEdges::VALUE && emptyParentEdge(it)) || !nodeHullPredicate(it)); 831 return true; 832 } 833 834 template < typename TText, typename TIndexSpec, typename TSpec, typename TDfsOrder, typename THideEmptyEdges > 835 inline bool 836 _goRight( 837 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it, 838 VSTreeIteratorTraits<TDfsOrder, THideEmptyEdges> const) 839 { 840 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 841 typedef typename Size<TIndex>::Type TSize; 842 843 TIndex const &index = container(it); 844 845 do { 846 TSize w0 = dirAt(value(it).node, index); 847 if (w0 & index.LAST_CHILD) 848 return false; 849 850 value(it).node += (w0 & index.LEAF)? 1: 2; 851 value(it).edgeLen = -1; 852 853 value(it).range.i1 = value(it).range.i2; 854 _adjustRightBorder(it); 855 } while ((THideEmptyEdges::VALUE && emptyParentEdge(it)) || !nodeHullPredicate(it)); 856 return true; 857 } 858 859 // go up one edge (returns false if in root node) 860 // can be used at most once, as no history stack is available 861 template < typename TText, typename TWotdSpec, typename TSpec > 862 inline bool 863 _goUp(Iter< Index<TText, IndexWotd<TWotdSpec> >, VSTree< TopDown<TSpec> > > &it) 864 { 865 if (!isRoot(it)) { 866 value(it) = it._parentDesc; 867 return true; 868 } 869 return false; 870 } 871 872 // go up one edge (returns false if in root node) 873 template < typename TText, typename TSpec > 874 inline bool 875 _goUp(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown< ParentLinks<TSpec> > > > &it) 876 { 877 typedef typename Size< Index<TText, IndexWotd<WotdOriginal> > >::Type TSize; 878 879 if (!empty(it.history)) { 880 HistoryStackWotdOriginal_<TSize> const &entry = back(it.history); 881 value(it).node = entry.node; 882 value(it).parentRepLen -= entry.edgeLen; 883 value(it).edgeLen = entry.edgeLen; 884 eraseBack(it.history); 885 return true; 886 } 887 return false; 888 } 889 890 template < typename TText, typename TIndexSpec, typename TSpec > 891 inline bool 892 _goUp(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown< ParentLinks<TSpec> > > > &it) 893 { 894 typedef typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type TSize; 895 896 if (!empty(it.history)) 897 { 898 HistoryStackWotdModified_<TSize> const &entry = back(it.history); 899 value(it).node = entry.node; 900 value(it).parentRepLen -= entry.edgeLen; 901 value(it).edgeLen = entry.edgeLen; 902 value(it).range = entry.range; 903 eraseBack(it.history); 904 if (!empty(it.history)) 905 value(it).parentRight = back(it.history).range.i2; // copy right boundary of parent's range 906 return true; 907 } 908 return false; 909 } 910 911 // return vertex descriptor of parent's node 912 template < typename TText, typename TSpec > 913 inline typename VertexDescriptor< Index<TText, IndexWotd<WotdOriginal> > >::Type 914 nodeUp(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown< ParentLinks<TSpec> > > > const &it) 915 { 916 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 917 typedef typename Size<TIndex>::Type TSize; 918 919 if (!empty(it.history)) 920 { 921 HistoryStackWotdOriginal_<TSize> const &entry = back(it.history); 922 typename VertexDescriptor<TIndex>::Type desc; 923 924 desc.node = entry.node; 925 desc.parentRepLen = value(it).parentRepLen - entry.edgeLen; 926 desc.edgeLen = entry.edgeLen; 927 TSize h = length(it.history) - 1; 928 if (h != 0) --h; 929 return desc; 930 } else 931 return value(it); 932 } 933 934 // return vertex descriptor of parent's node 935 template < typename TText, typename TIndexSpec, typename TSpec > 936 inline typename VertexDescriptor< Index<TText, IndexWotd<TIndexSpec> > >::Type 937 nodeUp(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown< ParentLinks<TSpec> > > > const &it) 938 { 939 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 940 typedef typename Size<TIndex>::Type TSize; 941 942 if (!empty(it.history)) 943 { 944 HistoryStackWotdModified_<TSize> const &entry = back(it.history); 945 typename VertexDescriptor<TIndex>::Type desc; 946 947 desc.node = entry.node; 948 desc.parentRepLen = value(it).parentRepLen - entry.edgeLen; 949 desc.edgeLen = entry.edgeLen; 950 return desc; 951 } else 952 return value(it); 953 } 954 955 ////////////////////////////////////////////////////////////////////////////// 956 957 958 // Counting sort - Step 2a: Count characters 959 template < typename TBuckets, typename TText > 960 inline void 961 _wotdCountChars(TBuckets &buckets, TText const &text) 962 { 963 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 964 965 TTextIterator itText = begin(text, Standard()); 966 TTextIterator itTextEnd = end(text, Standard()); 967 for (; itText != itTextEnd; ++itText) 968 ++buckets[ordValue(getValue(itText))]; 969 } 970 971 972 template < typename TBuckets, typename TText, typename TSpec > 973 inline void 974 _wotdCountChars(TBuckets &buckets, StringSet<TText, TSpec> const &stringSet) 975 { 976 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 977 978 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 979 { 980 TText const &text = value(stringSet, seqNo); 981 TTextIterator itText = begin(text, Standard()); 982 TTextIterator itTextEnd = end(text, Standard()); 983 for (; itText != itTextEnd; ++itText) 984 ++buckets[ordValue(getValue(itText))]; 985 } 986 } 987 988 // Counting sort - Step 2b: Count the prefixLen'th characters of suffices 989 template < typename TBuckets, typename TText, typename TSA, typename TSize > 990 inline typename Size<TText>::Type 991 _wotdCountCharsWotdOriginal( 992 TBuckets &buckets, 993 TText const &text, 994 TSA &sa, 995 TSize prefixLen) 996 { 997 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 998 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 999 typedef typename Size<TText>::Type TTextSize; 1000 1001 TTextIterator itText = begin(text, Standard()); 1002 TSAIterator itSA = begin(sa, Standard()); 1003 TSAIterator itSAEnd = end(sa, Standard()); 1004 1005 TTextSize sentinels = 0; 1006 TTextSize textLength = length(text); 1007 for (; itSA != itSAEnd; ++itSA) 1008 { 1009 // add prefix on entries in sa 1010 TTextSize saValue = (*itSA += prefixLen); 1011 if (textLength > saValue) 1012 ++buckets[ordValue(*(itText + saValue))]; 1013 else 1014 if (textLength == saValue) ++sentinels; 1015 } 1016 return sentinels; 1017 } 1018 1019 template < typename TBuckets, typename TText, typename TSA, typename TSize > 1020 inline typename Size<TText>::Type 1021 _wotdCountChars( 1022 TBuckets &buckets, 1023 TText const &text, 1024 TSA const &sa, 1025 TSize prefixLen) 1026 { 1027 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1028 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1029 typedef typename Size<TText>::Type TTextSize; 1030 1031 TTextIterator itText = begin(text, Standard()) + prefixLen; 1032 TSAIterator itSA = begin(sa, Standard()); 1033 TSAIterator itSAEnd = end(sa, Standard()); 1034 1035 TTextSize sentinels = 0; 1036 TTextSize textLength = length(text) - prefixLen; 1037 for (; itSA != itSAEnd; ++itSA) 1038 { 1039 TTextSize saValue = *itSA; 1040 if (textLength > saValue) 1041 ++buckets[ordValue(*(itText + saValue))]; 1042 else 1043 if (textLength == saValue) ++sentinels; 1044 } 1045 return sentinels; 1046 } 1047 1048 template < 1049 typename TBuckets, 1050 typename TText, 1051 typename TSpec, 1052 typename TSA, 1053 typename TSize 1054 > 1055 inline typename Size<TText>::Type 1056 _wotdCountChars( 1057 TBuckets &buckets, 1058 StringSet<TText, TSpec> const &stringSet, 1059 TSA const &sa, 1060 TSize prefixLen) 1061 { 1062 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1063 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1064 typedef typename Size<TText>::Type TTextSize; 1065 1066 if (empty(stringSet)) 1067 return 0; 1068 1069 TTextIterator itText = begin(front(stringSet), Standard()); 1070 TSAIterator itSA = begin(sa, Standard()); 1071 TSAIterator itSAEnd = end(sa, Standard()); 1072 1073 TTextSize sentinels = 0; 1074 TTextSize textLength = 0; 1075 unsigned lastSeqSeen = (unsigned)-1; 1076 Pair<unsigned, TTextSize> lPos; 1077 for (; itSA != itSAEnd; ++itSA) 1078 { 1079 posLocalize(lPos, *itSA, stringSetLimits(stringSet)); 1080 if (lastSeqSeen != getSeqNo(lPos)) 1081 { 1082 lastSeqSeen = getSeqNo(lPos); 1083 1084 // shift textBegin and textLength by prefixLen 1085 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 1086 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 1087 } 1088 if (textLength > getSeqOffset(lPos)) 1089 ++buckets[ordValue(*(itText + getSeqOffset(lPos)))]; 1090 else 1091 if (textLength == getSeqOffset(lPos)) ++sentinels; 1092 } 1093 return sentinels; 1094 } 1095 1096 1097 ////////////////////////////////////////////////////////////////////////////// 1098 1099 1100 // Counting sort - Step 3: Cumulative sum 1101 template < typename TBounds, typename TBuckets, typename TSize > 1102 inline typename Size<TBuckets>::Type 1103 _wotdCummulativeSum(TBounds &bounds, TBuckets const &buckets, TSize offset) 1104 { 1105 typedef typename Iterator<TBounds, Standard>::Type TBoundIterator; 1106 typedef typename Iterator<TBuckets const, Standard>::Type TBucketsIterator; 1107 1108 TBucketsIterator it = begin(buckets, Standard()); 1109 TBucketsIterator itEnd = end(buckets, Standard()); 1110 TBoundIterator bit = begin(bounds, Standard()); 1111 1112 typename Value<TBounds>::Type sum = offset; 1113 typename Size<TBuckets>::Type requiredSize = 0; 1114 typename Value<TBuckets>::Type diff; 1115 1116 for (; it != itEnd; ++it, ++bit) 1117 if ((diff = *it) != 0) { 1118 requiredSize += (diff > 1)? 2: 1; 1119 *bit = sum; 1120 sum += diff; 1121 } 1122 1123 return requiredSize; 1124 } 1125 1126 template < typename TBounds, typename TBuckets > 1127 inline typename Size<TBuckets>::Type 1128 _wotdCummulativeSum(TBounds &bounds, TBuckets const &buckets) 1129 { 1130 return _wotdCummulativeSum(bounds, buckets, 0); 1131 } 1132 1133 ////////////////////////////////////////////////////////////////////////////// 1134 //TODO(singer): The function createWotdIndex in never defined! 1135 /*! 1136 * @fn IndexWotd#createWotdIndex 1137 * @headerfile <seqan/index.h> 1138 * @brief Builds a the WOTD index. 1139 * 1140 * @signature void createWotdIndex(sa, dir, text); 1141 * 1142 * @param[out] sa The resulting list in which all <i>q</i>-grams are sorted alphabetically. 1143 * @param[out] dir The resulting array that indicates at which position in index the corresponding <i>q</i>-grams 1144 * can be found. 1145 * @param[in] text The sequence. Types: @link ContainerConcept @endlink 1146 * 1147 * The resulting <tt>index</tt> contains the sorted list of qgrams. For each possible <i>q</i>-gram pos contains 1148 * the first position in index that corresponds to this <i>q</i>-gram. 1149 */ 1150 1151 // single sequence 1152 template < typename TIndex > 1153 typename Size<TIndex>::Type 1154 _sortFirstWotdBucket(TIndex &index) 1155 { 1156 typedef typename Fibre<TIndex, WotdText >::Type TText; 1157 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1158 typedef typename TIndex::TCounter TCounter; 1159 1160 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1161 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1162 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1163 typedef typename Size<TText>::Type TSize; 1164 1165 TText const &text = indexText(index); 1166 TCounter &occ = index.tempOcc; 1167 TCounter &bound = index.tempBound; 1168 1169 // 1. clear counters and copy SA to temporary SA 1170 arrayFill(begin(occ, Standard()), end(occ, Standard()), 0); 1171 1172 // 2. count characters 1173 _wotdCountChars(occ, text); 1174 1175 // 3. cumulative sum 1176 TSize requiredSize = _wotdCummulativeSum(bound, occ); 1177 1178 // 4. fill suffix array 1179 { 1180 TSA &sa = indexSA(index); 1181 TSAIterator saBeg = begin(sa, Standard()); 1182 TCntIterator boundBeg = begin(bound, Standard()); 1183 1184 TTextIterator itText = begin(text, Standard()); 1185 TTextIterator itTextEnd = end(text, Standard()); 1186 for(TSize i = 0; itText != itTextEnd; ++itText, ++i) 1187 *(saBeg + (*(boundBeg + ordValue(getValue(itText))))++) = i; 1188 } 1189 index.sentinelOcc = 0; 1190 index.sentinelBound = 0; 1191 1192 return requiredSize; 1193 } 1194 1195 // multiple sequences 1196 template < typename TText, typename TSpec, typename TIndexSpec > 1197 typename Size< Index<StringSet<TText, TSpec>, TIndexSpec> >::Type 1198 _sortFirstWotdBucket(Index<StringSet<TText, TSpec>, TIndexSpec> &index) 1199 { 1200 typedef Index<StringSet<TText, TSpec>, TIndexSpec> TIndex; 1201 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1202 typedef typename TIndex::TCounter TCounter; 1203 1204 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1205 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1206 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1207 typedef typename Size<TText>::Type TSize; 1208 1209 StringSet<TText, TSpec> const &stringSet = indexText(index); 1210 TCounter &occ = index.tempOcc; 1211 TCounter &bound = index.tempBound; 1212 1213 // 1. clear counters and copy SA to temporary SA 1214 arrayFill(begin(occ, Standard()), end(occ, Standard()), 0); 1215 1216 // 2. count characters 1217 _wotdCountChars(occ, stringSet); 1218 1219 // 3. cummulative sum 1220 TSize requiredSize = _wotdCummulativeSum(bound, occ); 1221 1222 // 4. fill suffix array 1223 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1224 { 1225 TSA &sa = indexSA(index); 1226 TSAIterator saBeg = begin(sa, Standard()); 1227 TCntIterator boundBeg = begin(bound, Standard()); 1228 1229 typename Value<TSA>::Type localPos; 1230 assignValueI1(localPos, seqNo); 1231 assignValueI2(localPos, 0); 1232 1233 TText const &text = value(stringSet, seqNo); 1234 TTextIterator itText = begin(text, Standard()); 1235 TTextIterator itTextEnd = end(text, Standard()); 1236 for(; itText != itTextEnd; ++itText) 1237 { 1238 *(saBeg + (*(boundBeg + ordValue(getValue(itText))))++) = localPos; 1239 assignValueI2(localPos, getValueI2(localPos) + 1); 1240 } 1241 } 1242 index.sentinelOcc = 0; 1243 index.sentinelBound = 0; 1244 1245 return requiredSize; 1246 } 1247 1248 1249 1250 // sort bucket using counting sort 1251 // (nearly) ORIGINAL VERSION: 1252 // - brings the bucket with the longest suffix (lpBucket) to front 1253 // - all other buckets are in lexicographical order 1254 // - adds prefixLen to all SA entries in SA[left,right) 1255 template < typename TText, typename TBeginPos, typename TEndPos, typename TSize > 1256 TSize _sortWotdBucket( 1257 Index<TText, IndexWotd<WotdOriginal> > &index, 1258 TBeginPos left, 1259 TEndPos right, 1260 TSize prefixLen) 1261 { 1262 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 1263 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1264 typedef typename TIndex::TCounter TCounter; 1265 1266 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1267 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1268 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1269 typedef typename Iterator<TCounter const, Standard>::Type TConstCntIterator; 1270 typedef typename Size<TText>::Type TTextSize; 1271 1272 TText const &text = indexText(index); 1273 TCounter const &tempSA = index.tempSA; 1274 TCounter &occ = index.tempOcc; 1275 TCounter &bound = index.tempBound; 1276 1277 // 1. clear counters and copy SA to temporary SA 1278 arrayFill(begin(occ, Standard()), end(occ, Standard()), 0); 1279 1280 // 2. count characters 1281 index.tempSA = infix(indexSA(index), left, right); 1282 index.sentinelBound = 0; 1283 index.sentinelOcc = 1284 _wotdCountCharsWotdOriginal(occ, text, index.tempSA, prefixLen); 1285 1286 // 3. cumulative sum 1287 TSize requiredSize = 0; 1288 1289 // actually, here sentinelOcc<=1 holds (this is the original wotd) 1290 if (index.interSentinelNodes) { 1291 if (index.sentinelOcc != 0) 1292 requiredSize = (index.sentinelOcc > 1)? 2: 1; // insert *one* $-edge for all $_i suffices 1293 } else 1294 requiredSize = index.sentinelOcc; // insert each $_i suffix one-by-one 1295 1296 requiredSize += _wotdCummulativeSum(bound, occ, left + index.sentinelOcc); 1297 index.sentinelBound = left; 1298 1299 // 4. fill suffix array 1300 { 1301 TSA &sa = indexSA(index); 1302 TSAIterator saBeg = begin(sa, Standard()); 1303 TCntIterator boundBeg = begin(bound, Standard()); 1304 1305 TTextIterator itText = begin(text, Standard()); 1306 TConstCntIterator itSA = begin(tempSA, Standard()); 1307 TConstCntIterator itSAEnd = end(tempSA, Standard()); 1308 TTextSize textLength = length(text); 1309 for(; itSA != itSAEnd; ++itSA) 1310 { 1311 TTextSize saValue = *itSA; 1312 if (textLength > saValue) 1313 *(saBeg + (*(boundBeg + ordValue(*(itText + saValue))))++) = saValue; 1314 else 1315 if (textLength == saValue) 1316 *(saBeg + index.sentinelBound++) = saValue; 1317 } 1318 } 1319 1320 // 5. move lpBucket to front and add saOffset to hash directory entries 1321 { 1322 TSize lpBucket = ordValue(text[tempSA[0]]); 1323 if (lpBucket != 0) { 1324 TSize lpBucketOcc = occ[lpBucket]; 1325 TSize lpBucketBound = bound[lpBucket]; 1326 1327 TCntIterator itOcc = begin(occ, Standard()) + lpBucket; 1328 TCntIterator itBound = begin(bound, Standard()) + lpBucket; 1329 TCntIterator itBeg = begin(bound, Standard()); 1330 for(; itBound != itBeg; --itBound, --itOcc) { 1331 *itOcc = *(itOcc - 1); 1332 *itBound = *(itBound - 1); 1333 } 1334 if (index.sentinelOcc != 0) { 1335 // bring first bucket before sentinel bucket 1336 *itOcc = index.sentinelOcc; 1337 *itBound = index.sentinelBound; 1338 index.sentinelOcc = lpBucketOcc; 1339 index.sentinelBound = lpBucketBound; 1340 } else { 1341 *itOcc = lpBucketOcc; 1342 *itBound = lpBucketBound; 1343 } 1344 } else 1345 if (index.sentinelOcc != 0) { 1346 // bring first bucket before sentinel bucket 1347 TSize swap = index.sentinelOcc; 1348 index.sentinelOcc = occ[0]; 1349 occ[0] = swap; 1350 swap = index.sentinelBound; 1351 index.sentinelBound = bound[0]; 1352 bound[0] = swap; 1353 } 1354 } 1355 1356 return requiredSize; 1357 } 1358 1359 1360 1361 1362 1363 // sort bucket using counting sort 1364 // MODIFIED VERSION: 1365 // - all buckets are in lexicographical order 1366 // - SA[left,right) contains real SA entries (the beginning positions of the suffices) 1367 1368 // single sequence 1369 template < typename TIndex, typename TBeginPos, typename TEndPos, typename TSize > 1370 TSize _sortWotdBucket( 1371 TIndex &index, 1372 TBeginPos left, 1373 TEndPos right, 1374 TSize prefixLen) 1375 { 1376 typedef typename Fibre<TIndex, WotdText >::Type TText; 1377 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1378 typedef typename TIndex::TCounter TCounter; 1379 1380 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1381 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1382 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1383 typedef typename Iterator<TCounter const, Standard>::Type TConstCntIterator; 1384 typedef typename Size<TText>::Type TTextSize; 1385 1386 TText const &text = indexText(index); 1387 TCounter const &tempSA = index.tempSA; 1388 TCounter &occ = index.tempOcc; 1389 TCounter &bound = index.tempBound; 1390 1391 // 1. clear counters and copy SA to temporary SA 1392 arrayFill(begin(occ, Standard()), end(occ, Standard()), 0); 1393 index.tempSA = infix(indexSA(index), left, right); 1394 1395 // 2. count characters 1396 index.sentinelBound = 0; 1397 index.sentinelOcc = _wotdCountChars(occ, text, tempSA, prefixLen); 1398 1399 // 3. cumulative sum 1400 TSize requiredSize = 0; 1401 if (index.interSentinelNodes) { 1402 if (index.sentinelOcc != 0) 1403 requiredSize = (index.sentinelOcc > 1)? 2: 1; // insert *one* $-edge for all $_i suffices 1404 } else 1405 requiredSize = index.sentinelOcc; // insert each $_i suffix one-by-one 1406 1407 requiredSize += _wotdCummulativeSum(bound, occ, left + index.sentinelOcc); 1408 index.sentinelBound = left; 1409 1410 // 4. fill suffix array 1411 { 1412 TSA &sa = indexSA(index); 1413 TSAIterator saBeg = begin(sa, Standard()); 1414 TCntIterator boundBeg = begin(bound, Standard()); 1415 1416 TTextIterator itText = begin(text, Standard()) + prefixLen; 1417 TConstCntIterator itSA = begin(tempSA, Standard()); 1418 TConstCntIterator itSAEnd = end(tempSA, Standard()); 1419 TTextSize textLength = length(text) - prefixLen; 1420 for(; itSA != itSAEnd; ++itSA) 1421 { 1422 TTextSize saValue = *itSA; 1423 if (textLength > saValue) 1424 *(saBeg + (*(boundBeg + ordValue(*(itText + saValue))))++) = saValue; 1425 else 1426 if (textLength == saValue) 1427 *(saBeg + index.sentinelBound++) = saValue; 1428 } 1429 } 1430 1431 return requiredSize; 1432 } 1433 1434 // multiple sequences 1435 template < typename TText, typename TSpec, typename TIndexSpec, typename TBeginPos, typename TEndPos, typename TSize > 1436 TSize _sortWotdBucket( 1437 Index<StringSet<TText, TSpec>, TIndexSpec> &index, 1438 TBeginPos left, 1439 TEndPos right, 1440 TSize prefixLen) 1441 { 1442 typedef Index<StringSet<TText, TSpec>, TIndexSpec> TIndex; 1443 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1444 typedef typename TIndex::TCounter TCounter; 1445 typedef typename TIndex::TTempSA TTempSA; 1446 1447 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1448 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1449 typedef typename Iterator<TTempSA const, Standard>::Type TTempSAIterator; 1450 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1451 typedef typename Size<TText>::Type TTextSize; 1452 1453 StringSet<TText, TSpec> const &stringSet = indexText(index); 1454 TTempSA const &tempSA = index.tempSA; 1455 TCounter &occ = index.tempOcc; 1456 TCounter &bound = index.tempBound; 1457 1458 // 1. clear counters and copy SA to temporary SA 1459 TCntIterator occBeg = begin(occ, Standard()); 1460 1461 arrayFill(occBeg, end(occ, Standard()), 0); 1462 index.tempSA = infix(indexSA(index), left, right); 1463 1464 // 2. count characters 1465 index.sentinelBound = 0; 1466 index.sentinelOcc = _wotdCountChars(occ, stringSet, tempSA, prefixLen); 1467 1468 // 3. cumulative sum 1469 TSize requiredSize = 0; 1470 if (index.interSentinelNodes) { 1471 if (index.sentinelOcc != 0) 1472 requiredSize = (index.sentinelOcc > 1)? 2: 1; // insert *one* $-edge for all $_i suffices 1473 } else 1474 requiredSize = index.sentinelOcc; // insert each $_i suffix one-by-one 1475 1476 requiredSize += _wotdCummulativeSum(bound, occ, left + index.sentinelOcc); 1477 index.sentinelBound = left; 1478 1479 // 4. fill suffix array 1480 { 1481 if (empty(stringSet)) 1482 return requiredSize; 1483 1484 TSA &sa = indexSA(index); 1485 TSAIterator saBeg = begin(sa, Standard()); 1486 TCntIterator boundBeg = begin(bound, Standard()); 1487 1488 TTextIterator itText = begin(front(stringSet), Standard()); 1489 TTempSAIterator itSA = begin(tempSA, Standard()); 1490 TTempSAIterator itSAEnd = end(tempSA, Standard()); 1491 TTextSize textLength = 0; 1492 unsigned lastSeqSeen = (unsigned)-1; 1493 Pair<unsigned, TTextSize> lPos; 1494 for(; itSA != itSAEnd; ++itSA) 1495 { 1496 posLocalize(lPos, *itSA, stringSetLimits(index)); 1497 if (lastSeqSeen != getSeqNo(lPos)) 1498 { 1499 lastSeqSeen = getSeqNo(lPos); 1500 1501 // shift textBegin and textLength by prefixLen 1502 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 1503 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 1504 } 1505 if (textLength > getSeqOffset(lPos)) 1506 *(saBeg + (*(boundBeg + ordValue(*(itText + getSeqOffset(lPos)))))++) = *itSA; 1507 else 1508 if (textLength == getSeqOffset(lPos)) 1509 *(saBeg + index.sentinelBound++) = *itSA; 1510 } 1511 } 1512 1513 return requiredSize; 1514 } 1515 1516 1517 1518 1519 1520 template < typename TSA, typename TText > 1521 typename Size<TText>::Type 1522 _bucketLcp(TSA const &sa, TText const &text) 1523 { 1524 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1525 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1526 typedef typename Value<TText>::Type TValue; 1527 typedef typename Size<TText>::Type TTextSize; 1528 1529 TTextSize prefixLen = 0; 1530 1531 if (length(sa) < 2) return prefixLen; 1532 1533 TTextIterator itText = begin(text, Standard()); 1534 TSAIterator itSAEnd = end(sa, Standard()); 1535 TTextSize textLength = length(text); 1536 1537 do { 1538 TSAIterator itSA = begin(sa, Standard()); 1539 TTextSize sa = *itSA; 1540 if (textLength == sa) return prefixLen; 1541 1542 TValue c = *(itText + sa); 1543 for(++itSA; itSA != itSAEnd; ++itSA) { 1544 sa = *itSA; 1545 if (textLength == sa || c != *(itText + sa)) 1546 return prefixLen; 1547 } 1548 ++prefixLen; --textLength; 1549 ++itText; 1550 } while (true); 1551 } 1552 1553 template < typename TSA, typename TText, typename TSize > 1554 typename Size<TText>::Type 1555 _bucketLcp(TSA const &sa, TText const &text, TSize prefixLen) 1556 { 1557 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1558 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1559 typedef typename Value<TText>::Type TValue; 1560 typedef typename Size<TText>::Type TTextSize; 1561 1562 if (length(sa) < 2) return prefixLen; 1563 1564 TTextIterator itText = begin(text, Standard()) + prefixLen; 1565 TSAIterator itSAEnd = end(sa, Standard()); 1566 TTextSize textLength = length(text) - prefixLen; 1567 1568 do { 1569 TSAIterator itSA = begin(sa, Standard()); 1570 TTextSize sa = *itSA; 1571 if (textLength == sa) return prefixLen; 1572 1573 TValue c = *(itText + sa); 1574 for(++itSA; itSA != itSAEnd; ++itSA) { 1575 sa = *itSA; 1576 if (textLength == sa || *(itText + sa) != c) 1577 return prefixLen; 1578 } 1579 ++prefixLen; --textLength; 1580 ++itText; 1581 } while (true); 1582 } 1583 1584 template < typename TSA, typename TText, typename TSpec, typename TSize > 1585 typename Size<TText>::Type 1586 _bucketLcp(TSA const &sa, StringSet<TText, TSpec> const &stringSet, TSize prefixLen) 1587 { 1588 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1589 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1590 typedef typename Value<TText>::Type TValue; 1591 typedef typename Size<TText>::Type TTextSize; 1592 1593 if (length(sa) < 2) return prefixLen; 1594 1595 TSAIterator itSAEnd = end(sa, Standard()); 1596 TTextIterator itText; 1597 TTextSize textLength; 1598 1599 Pair<unsigned, TTextSize> lPos; 1600 do { 1601 TSAIterator itSA = begin(sa, Standard()); 1602 posLocalize(lPos, *itSA, stringSetLimits(stringSet)); 1603 1604 unsigned lastSeqSeen = getSeqNo(*itSA); 1605 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 1606 if (textLength == getSeqOffset(lPos)) return prefixLen; 1607 1608 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 1609 TValue c = *(itText + getSeqOffset(*itSA)); 1610 for(++itSA; itSA != itSAEnd; ++itSA) 1611 { 1612 posLocalize(lPos, *itSA, stringSetLimits(stringSet)); 1613 1614 if (lastSeqSeen != getSeqNo(lPos)) 1615 { 1616 lastSeqSeen = getSeqNo(lPos); 1617 1618 // shift textBegin and textLength by prefixLen 1619 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 1620 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 1621 } 1622 1623 if (textLength == getSeqOffset(lPos) || c != *(itText + getSeqOffset(lPos))) 1624 return prefixLen; 1625 } 1626 ++prefixLen; --textLength; 1627 ++itText; 1628 } while (true); 1629 } 1630 1631 1632 template <typename TText, typename TSpec, typename TPos> 1633 inline TPos 1634 _getNodeLP( 1635 Index<TText, IndexWotd<TSpec> > const &index, 1636 TPos pos) 1637 { 1638 TPos w0 = dirAt(pos, index); 1639 if (w0 & index.LEAF) 1640 return w0 & index.BITMASK0; 1641 1642 TPos w1 = dirAt(pos + 1, index); 1643 if (w1 & index.UNEVALUATED) 1644 return saAt(w0 & index.BITMASK0, index); 1645 else 1646 return w0 & index.BITMASK0; 1647 } 1648 1649 // store buckets into directory 1650 // ORIGINAL VERSION: storing SA entries and topology links in Dir 1651 template <typename TText, typename TPos> 1652 inline void 1653 _storeWotdChildren( 1654 Index<TText, IndexWotd<WotdOriginal> > &index, 1655 TPos dirOfs) 1656 { 1657 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 1658 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 1659 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 1660 typedef typename Size<TDir>::Type TDirSize; 1661 typedef typename TIndex::TCounter TCounter; 1662 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1663 1664 typedef typename Value<TCounter>::Type TValue; 1665 1666 TDirIterator itDir = begin(indexDir(index), Standard()) + dirOfs; 1667 TDirIterator itDirEnd = end(indexDir(index), Standard()); 1668 TDirIterator itPrev = itDirEnd; 1669 1670 TCntIterator it = begin(index.tempOcc, Standard()); 1671 TCntIterator bit = begin(index.tempBound, Standard()); 1672 TCntIterator itEnd = end(index.tempOcc, Standard()); 1673 1674 TValue occ; 1675 if (index.sentinelOcc != 0) 1676 { 1677 if (index.sentinelOcc > 1 && index.interSentinelNodes) // occurs on multiseqs 1678 { 1679 itPrev = itDir; 1680 *itDir = index.sentinelBound - index.sentinelOcc; ++itDir; 1681 *itDir = index.sentinelBound | index.UNEVALUATED; ++itDir; 1682 } else 1683 for (TDirSize d = index.sentinelBound - index.sentinelOcc; d != index.sentinelBound; ++d) 1684 { 1685 itPrev = itDir; 1686 *itDir = saAt(d, index) | index.LEAF; ++itDir; 1687 } 1688 } 1689 for (; it != itEnd; ++it, ++bit) 1690 { 1691 if ((occ = *it) == 0) continue; 1692 if (occ > 1) { 1693 itPrev = itDir; 1694 *itDir = *bit - occ; ++itDir; 1695 *itDir = *bit | index.UNEVALUATED; ++itDir; 1696 } else { 1697 itPrev = itDir; 1698 *itDir = saAt(*bit - occ, index) | index.LEAF; ++itDir; 1699 } 1700 } 1701 1702 // mark the last child 1703 if (itPrev != itDirEnd) 1704 *itPrev |= index.LAST_CHILD; 1705 } 1706 1707 // store buckets into directory 1708 // MODIFIED VERSION: storing SA links and topology links in Dir 1709 template <typename TText, typename TSpec, typename TSize> 1710 inline void 1711 _storeWotdChildren( 1712 Index<TText, IndexWotd<TSpec> > &index, 1713 TSize dirOfs, 1714 TSize lcp) 1715 { 1716 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1717 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 1718 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 1719 typedef typename Size<TDir>::Type TDirSize; 1720 typedef typename TIndex::TCounter TCounter; 1721 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1722 1723 typedef typename Value<TCounter>::Type TValue; 1724 1725 TDirIterator itDirBegin = begin(indexDir(index), Standard()) + dirOfs; 1726 TDirIterator itDirEnd = end(indexDir(index), Standard()); 1727 TDirIterator itDir = itDirBegin; 1728 TDirIterator itPrev = itDirEnd; 1729 1730 TCntIterator it = begin(index.tempOcc, Standard()); 1731 TCntIterator bit = begin(index.tempBound, Standard()); 1732 TCntIterator itEnd = end(index.tempOcc, Standard()); 1733 1734 TValue occ; 1735 if (index.sentinelOcc != 0) 1736 { 1737 if (index.sentinelOcc > 1 && index.interSentinelNodes) // occurs on multiseqs 1738 { 1739 itPrev = itDir; 1740 *itDir = index.sentinelBound - index.sentinelOcc; ++itDir; 1741 *itDir = index.sentinelBound | index.UNEVALUATED; ++itDir; 1742 } else 1743 for (TDirSize d = index.sentinelBound - index.sentinelOcc; d != index.sentinelBound; ++d) 1744 { 1745 itPrev = itDir; 1746 *itDir = d | index.LEAF; ++itDir; 1747 } 1748 } 1749 for (; it != itEnd; ++it, ++bit) 1750 { 1751 if ((occ = *it) == 0) continue; 1752 if (occ > 1) { 1753 itPrev = itDir; 1754 *itDir = *bit - occ; ++itDir; 1755 *itDir = *bit | index.UNEVALUATED; ++itDir; 1756 } else { 1757 itPrev = itDir; 1758 *itDir = (*bit - occ) | index.LEAF; ++itDir; 1759 } 1760 } 1761 1762 // first child gets the mutual lcp value of the children (== parent repLength) 1763 *itDirBegin = ((*itDirBegin) & ~index.BITMASK0) | lcp; 1764 1765 // mark the last child 1766 if (itPrev != itDirEnd) 1767 *itPrev |= index.LAST_CHILD; 1768 } 1769 1770 1771 template < typename TText, typename TSpec > 1772 inline typename Size< Index<TText, IndexWotd<WotdOriginal> > >::Type 1773 _wotdEvaluate(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree<TSpec> > const &it) 1774 { 1775 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 1776 typedef typename Size<TIndex>::Type TSize; 1777 1778 TIndex &index = const_cast<TIndex&>(container(it)); 1779 TSize pos = value(it).node; 1780 TSize w1 = dirAt(pos + 1, index); 1781 1782 // test for evaluation 1783 if (w1 & index.UNEVALUATED) 1784 { 1785 TSize w0 = dirAt(pos, index); 1786 TSize lp = saAt(w0 & index.BITMASK0, index); 1787 TSize dst = length(indexDir(index)); 1788 1789 TSize size = _sortWotdBucket( 1790 index, 1791 w0 & index.BITMASK0, 1792 w1 & index.BITMASK1, 1793 parentEdgeLength(it)); 1794 1795 resize(indexDir(index), dst + size, Generous()); 1796 _storeWotdChildren(index, dst); 1797 1798 // mark nodes with solely empty child edges 1799 w1 = dst; 1800 if (index.sentinelOcc > 0) 1801 { 1802 TSize sentinelSize = index.sentinelOcc; 1803 if (index.interSentinelNodes && sentinelSize > 2) 1804 sentinelSize = 2; 1805 if (size == sentinelSize) w1 |= index.SENTINELS; 1806 } 1807 1808 assert(!(index.sentinelOcc == 1 && size == 1)); 1809 1810 dirAt(pos, index) = (w0 & ~index.BITMASK0) | lp; 1811 dirAt(pos + 1, index) = w1; 1812 } 1813 1814 return w1; 1815 } 1816 1817 template < typename TText, typename TIndexSpec, typename TSpec > 1818 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 1819 _wotdEvaluate(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TSpec> > const &it) 1820 { 1821 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 1822 typedef typename Size<TIndex>::Type TSize; 1823 1824 TIndex &index = const_cast<TIndex&>(container(it)); 1825 TSize pos = value(it).node; 1826 TSize w1 = dirAt(pos + 1, index); 1827 1828 // test for evaluation 1829 if (w1 & index.UNEVALUATED) 1830 { 1831 TSize dst = length(indexDir(index)); 1832 1833 TSize size = _sortWotdBucket( 1834 index, 1835 value(it).range.i1, 1836 w1 & index.BITMASK1, 1837 repLength(it)); 1838 /* 1839 if (globalDumpFlag) 1840 { 1841 std::cerr << '"' << representative(it) << '"' << std::endl; 1842 for (int i=0;i<length(getOccurrences(it));++i) 1843 std::cerr << getOccurrences(it)[i]<<'\t'<<suffix(indexText(index),getOccurrences(it)[i])<<std::endl; 1844 // _dumpFreq(index); 1845 } 1846 */ 1847 resize(indexDir(index), dst + size, Generous()); 1848 _storeWotdChildren(index, dst, repLength(it)); 1849 1850 // mark nodes with solely empty child edges 1851 w1 = dst; 1852 if (index.sentinelOcc > 0) 1853 { 1854 TSize sentinelSize = index.sentinelOcc; 1855 if (index.interSentinelNodes && sentinelSize > 2) 1856 sentinelSize = 2; 1857 if (size == sentinelSize) w1 |= index.SENTINELS; 1858 } 1859 1860 dirAt(pos + 1, index) = w1; 1861 } 1862 1863 return w1; 1864 } 1865 1866 1867 template <typename TText, typename TSpec> 1868 inline void 1869 _dump(Index<TText, IndexWotd<TSpec> > &index) 1870 { 1871 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1872 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 1873 typedef typename Value<TDir>::Type TDirValue; 1874 1875 std::cout << " Dir (wotd)" << std::endl; 1876 for(unsigned i=0; i < length(indexDir(index)); ++i) { 1877 TDirValue d = indexDir(index)[i]; 1878 std::cout << i << ": " << (d & index.BITMASK0); 1879 if (d & index.LEAF) std::cout << " (Leaf/Uneval)"; 1880 if (d & index.LAST_CHILD) std::cout << " (LastChild/SENTINELS)"; 1881 std::cout << std::endl; 1882 } 1883 1884 std::cout << std::endl << " SA" << std::endl; 1885 for(unsigned i=0; i < length(indexSA(index)); ++i) 1886 std::cout << i << ": " << indexSA(index)[i] << " " << suffix(indexText(index), indexSA(index)[i]) << std::endl; 1887 1888 std::cout << std::endl; 1889 1890 } 1891 1892 ////////////////////////////////////////////////////////////////////////////// 1893 // _goDownChar 1894 1895 template < typename TText, class TSpec, typename TValue > 1896 inline bool _goDownChar( 1897 Iter<Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &it, 1898 TValue c) 1899 { 1900 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1901 typedef typename Value<TIndex>::Type TIndexValue; 1902 1903 bool sorted = false; 1904 if (!goDown(it)) return false; 1905 do { 1906 if (parentEdgeLength(it) != 0) { 1907 TIndexValue edgeChar = parentEdgeLabel(it)[0]; 1908 if (edgeChar == c) return true; // the edge is found 1909 if (sorted && edgeChar > c) break; // too far (except the first one, 1910 } // child edges are sorted) 1911 sorted = true; 1912 } while (goRight(it)); 1913 _goUp(it); 1914 return false; 1915 } 1916 1917 template < typename TText, class TIndexSpec, class TSpec, typename TValue > 1918 inline bool _goDownChar( 1919 Iter<Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it, 1920 TValue c) 1921 { 1922 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1923 typedef typename Value<TIndex>::Type TIndexValue; 1924 1925 if (!goDown(it)) return false; 1926 do { 1927 if (parentEdgeLength(it) != 0) { 1928 TIndexValue edgeChar = parentEdgeLabel(it)[0]; 1929 if (edgeChar == c) return true; // the edge is found 1930 if (edgeChar > c) break; // too far (child edges are sorted) 1931 } 1932 } while (goRight(it)); 1933 _goUp(it); 1934 return false; 1935 } 1936 1937 /* 1938 template < typename TText, typename TSpec, typename TValue > 1939 inline bool 1940 _getNodeByChar( 1941 Iter< Index<TText, IndexWotd<TSpec> >, VSTree<TSpec> > const &it, 1942 TValue c, 1943 typename VertexDescriptor< Index<TText, IndexWotd<TSpec> > >::Type &childDesc) 1944 { 1945 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1946 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 1947 typedef typename Fibre<TIndex, WotdSA>::Type TSA; 1948 1949 typedef typename Value<TSA>::Type TSAValue; 1950 typedef typename Value<TDir>::Type TDirValue; 1951 1952 typename Size<TIndex>::Type len = length(index); 1953 typename VertexDescriptor<TIndex>::Type desc; 1954 1955 TSAValue pos = _firstSuffixOfBucket(index, value(it).node); 1956 while (pos == len || value < (c = textAt(index, pos + value.i2))) { 1957 value.node += (dirAt(value.node, index) & index.LEAF)? 1: 2; 1958 pos = _firstSuffixOfBucket(index, value.node); 1959 } 1960 assert(pos != len); 1961 1962 return c == value; 1963 } 1964 */ 1965 1966 ////////////////////////////////////////////////////////////////////////////// 1967 // interface for automatic index creation 1968 1969 template <typename TText, typename TSpec> 1970 inline void _wotdCreateFirstLevel(Index<TText, IndexWotd<TSpec> > &index) 1971 { 1972 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1973 typedef typename Value<TIndex>::Type TValue; 1974 typedef typename Size<TIndex>::Type TSize; 1975 1976 resize(index.tempOcc, ValueSize<TValue>::VALUE + 1, Exact()); 1977 resize(index.tempBound, ValueSize<TValue>::VALUE + 1, Exact()); 1978 1979 TSize size; 1980 if (empty(indexSA(index))) 1981 { 1982 resize(indexSA(index), length(indexRawText(index)), Exact()); 1983 size = _sortFirstWotdBucket(index); 1984 } else 1985 { 1986 size = _sortWotdBucket(index, 0, length(indexSA(index)), 0); 1987 } 1988 1989 if (size > 0) 1990 { 1991 resize(indexDir(index), size + 2, Generous()); 1992 _storeWotdChildren(index, 2, 0); 1993 1994 // mark nodes with solely empty child edges 1995 TSize w1 = 2; 1996 if (index.sentinelOcc > 0) 1997 { 1998 TSize sentinelSize = index.sentinelOcc; 1999 if (index.interSentinelNodes && sentinelSize > 2) 2000 sentinelSize = 2; 2001 if (size == sentinelSize) w1 |= index.SENTINELS; 2002 } 2003 2004 2005 dirAt(0, index) = 0 | index.LAST_CHILD; 2006 dirAt(1, index) = w1; 2007 2008 } else { 2009 resize(indexDir(index), 1); 2010 dirAt(0, index) = 0 | index.LAST_CHILD | index.LEAF; 2011 } 2012 } 2013 2014 template <typename TText, typename TSpec> 2015 inline bool indexCreate(Index<TText, IndexWotd<TSpec> > &index, WotdDir const, Default const) 2016 { 2017 _wotdCreateFirstLevel(index); 2018 return true; 2019 } 2020 2021 ////////////////////////////////////////////////////////////////////////////// 2022 // clear 2023 2024 template < typename TText, typename TSpec > 2025 inline void clear(Index<TText, IndexWotd<TSpec> > &index) 2026 { 2027 clear(getFibre(index, WotdSA())); 2028 clear(getFibre(index, WotdDir())); 2029 } 2030 2031 ////////////////////////////////////////////////////////////////////////////// 2032 // open 2033 2034 template < typename TText, typename TSpec > 2035 inline bool open( 2036 Index< TText, IndexWotd<TSpec> > &index, 2037 const char *fileName, 2038 int openMode) 2039 { 2040 typedef Index<TText, IndexWotd<TSpec> > TIndex; 2041 typedef typename Value<TIndex>::Type TValue; 2042 2043 String<char> name; 2044 2045 name = fileName; append(name, ".txt"); 2046 if ((!open(getFibre(index, WotdText()), toCString(name), openMode)) && 2047 (!open(getFibre(index, WotdText()), fileName, openMode))) return false; 2048 2049 name = fileName; append(name, ".sa"); 2050 if (!open(getFibre(index, WotdSA()), toCString(name), openMode)) return false; 2051 name = fileName; append(name, ".dir"); 2052 2053 if (!open(getFibre(index, WotdDir()), toCString(name), openMode)) return false; 2054 2055 if (!empty(getFibre(index, WotdDir()))) 2056 { 2057 resize(index.tempOcc, ValueSize<TValue>::VALUE + 1); 2058 resize(index.tempBound, ValueSize<TValue>::VALUE + 1); 2059 } 2060 2061 return true; 2062 } 2063 template < typename TText, typename TSpec > 2064 inline bool open( 2065 Index< TText, IndexWotd<TSpec> > &index, 2066 const char *fileName) 2067 { 2068 return open(index, fileName, OPEN_RDONLY); 2069 } 2070 2071 2072 ////////////////////////////////////////////////////////////////////////////// 2073 // save 2074 2075 template < typename TText, typename TSpec > 2076 inline bool save( 2077 Index< TText, IndexWotd<TSpec> > &index, 2078 const char *fileName, 2079 int openMode) 2080 { 2081 String<char> name; 2082 2083 name = fileName; append(name, ".txt"); 2084 if ((!save(getFibre(index, WotdText()), toCString(name), openMode)) && 2085 (!save(getFibre(index, WotdText()), fileName, openMode))) return false; 2086 2087 name = fileName; append(name, ".sa"); 2088 if (!save(getFibre(index, WotdSA()), toCString(name), openMode)) return false; 2089 name = fileName; append(name, ".dir"); 2090 2091 if (!save(getFibre(index, WotdDir()), toCString(name), openMode)) return false; 2092 2093 return true; 2094 } 2095 template < typename TText, typename TSpec > 2096 inline bool save( 2097 Index< TText, IndexWotd<TSpec> > &index, 2098 const char *fileName) 2099 { 2100 return save(index, fileName, OPEN_WRONLY | OPEN_CREATE); 2101 } 2102 } 2103 2104 #endif //#ifndef SEQAN_HEADER_... 2105