1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: David Weese <david.weese@fu-berlin.de> 33 // ========================================================================== 34 35 #ifndef SEQAN_HEADER_INDEX_WOTD_H 36 #define SEQAN_HEADER_INDEX_WOTD_H 37 38 namespace seqan 39 { 40 41 42 ////////////////////////////////////////////////////////////////////////////// 43 // wotd tree index fibres 44 45 /*! 46 * @defgroup WOTDIndexFibres WOTD Index Fibres 47 * @brief Tag to select a specific fibre (e.g. table, object, ...) of an @link 48 * IndexWotd @endlink index. 49 * 50 * These tags can be used to get @link Fibre @endlink of an @link IndexWotd @endlink. 51 * 52 * @see Fibre 53 * @see Index#getFibre 54 * @see IndexWotd 55 * 56 * @tag WOTDIndexFibres#WotdDir 57 * @brief The child table. 58 * 59 * @tag WOTDIndexFibres#WotdRawSA 60 * @brief The raw suffix array. 61 * 62 * @tag WOTDIndexFibres#WotdText 63 * @brief The original text the index should be based on. 64 * 65 * @tag WOTDIndexFibres#WotdRawText 66 * @brief The raw text the index is really based on. 67 * 68 * @tag WOTDIndexFibres#WotdSA 69 * @brief The suffix array. 70 */ 71 72 ////////////////////////////////////////////////////////////////////////////// 73 /*! 74 * @fn IndexWotd#indexSA 75 * @headerfile <seqan/index.h> 76 * @brief Shortcut for <tt>getFibre(.., WotdSA)</tt>. 77 * 78 * @signature TSa indexSA(index); 79 * 80 * @param[in] index The @link IndexWotd @endlink object holding the fibre. 81 * 82 * @return TSa A reference to the @link WOTDIndexFibres#WotdSA @endlink fibre (partially sorted suffix array). 83 */ 84 85 /*! 86 * @fn IndexWotd#indexDir 87 * @headerfile <seqan/index.h> 88 * @brief Shortcut for <tt>getFibre(.., WotdDir())</tt>. 89 * @signature TFibre indexDir(index); 90 * 91 * @param[in] index The @link IndexWotd @endlink object holding the fibre. 92 * 93 * @return TFibre A reference to the @link WOTDIndexFibres#WotdDir @endlink fibre (tree structure). 94 */ 95 96 /*! 97 * @fn IndexWotd#saAt 98 * @headerfile <seqan/index.h> 99 * @note Advanced functionality, not commonly used. 100 * @brief Shortcut for <tt>value(indexSA(..), ..)</tt>. 101 * 102 * @signature TValue saAt(position, index); 103 * 104 * @param[in] index The @link IndexWotd @endlink object holding the fibre. 105 * @param[in] position A position in the array on which the value should be accessed. 106 * 107 * @return TValue A reference or proxy to the value in the @link WOTDIndexFibres#WotdSA @endlink fibre. 108 * To be more precise, a reference to a position containing a value of type 109 * @link SAValue @endlink is returned (or a proxy). 110 */ 111 112 /*! 113 * @fn IndexWotd#dirAt 114 * @headerfile <seqan/index.h> 115 * @brief Shortcut for <tt>value(indexDir(index), position)</tt>. 116 * 117 * @signature TFibre dirAt(position, index); 118 * 119 * @param[in] index The @link IndexWotd @endlink object holding the fibre. 120 * @param[in] position A position in the array on which the value should be accessed. 121 * 122 * @return TFibre A reference to the @link WOTDIndexFibres#WotdDir @endlink fibre. 123 */ 124 125 typedef FibreText WotdText; 126 typedef FibreRawText WotdRawText; 127 typedef FibreSA WotdSA; 128 typedef FibreRawSA WotdRawSA; 129 typedef FibreDir WotdDir; 130 131 ////////////////////////////////////////////////////////////////////////////// 132 // wotd tree index 133 134 /*! 135 * @class IndexWotd 136 * @extends Index 137 * @implements StringTreeConcept 138 * @headerfile <seqan/index.h> 139 * @brief An index based on a lazy suffix tree (see Giegerich et al., "Efficient implementation of lazy suffix 140 * trees"). 141 * 142 * @signature template <typename TText, typename TSpec> 143 * class Index<TText, IndexWotd<TSpec> >; 144 * 145 * @tparam TText The @link TextConcept @endlink text type. 146 * @tparam TSpec The type for further specialization of the Index type. 147 * 148 * The fibres (see @link Index @endlink and @link Fibre @endlink) of this index are a partially sorted suffix array 149 * (see @link WOTDIndexFibres#WotdSA @endlink) and the wotd tree (see @link WOTDIndexFibres#WotdDir @endlink). 150 * 151 * Demo: Demo.Constraint Iterator 152 * 153 * @see WOTDIndexFibres 154 */ 155 156 struct WotdOriginal_; 157 typedef Tag<WotdOriginal_> const WotdOriginal; 158 159 template < typename TSpec = void > 160 struct IndexWotd {}; 161 162 /* 163 template < typename TObject, typename TSpec > 164 struct Fibre< Index<TObject, IndexWotd<TSpec> >, FibreDir> 165 { 166 typedef Index<TObject, IndexWotd<TSpec> > TIndex; 167 typedef String< 168 typename typename Size<TIndex>::Type, 169 Alloc<> 170 > Type; 171 }; 172 */ 173 174 template < typename TObject, typename TSpec > 175 class Index<TObject, IndexWotd<TSpec> > { 176 public: 177 typedef typename Fibre<Index, WotdText>::Type TText; 178 typedef typename Fibre<Index, WotdSA>::Type TSA; 179 typedef typename Fibre<Index, WotdDir>::Type TDir; 180 181 typedef typename Value<Index>::Type TValue; 182 typedef typename Value<TDir>::Type TDirValue; 183 typedef typename Size<Index>::Type TSize; 184 typedef String<TSize, Alloc<> > TCounter; 185 typedef String<typename Value<TSA>::Type, Alloc<> > TTempSA; 186 typedef typename Cargo<Index>::Type TCargo; 187 188 // 1st word flags 189 static TDirValue const LEAF = (TDirValue)1 << (BitsPerValue<TDirValue>::VALUE - 1); // this node is a leaf 190 static TDirValue const LAST_CHILD = (TDirValue)1 << (BitsPerValue<TDirValue>::VALUE - 2); // this node is the last child 191 // 2nd word flag 192 static TDirValue const UNEVALUATED = (TDirValue)1 << (BitsPerValue<TDirValue>::VALUE - 1); // this node is partially evalutated and has no evaluated children 193 static TDirValue const SENTINELS = (TDirValue)1 << (BitsPerValue<TDirValue>::VALUE - 2); // the children of this node have solely $-edges 194 195 static TDirValue const BITMASK0 = ~(LEAF | LAST_CHILD); 196 static TDirValue const BITMASK1 = ~(UNEVALUATED | SENTINELS); 197 198 199 Holder<TText> text; // underlying text 200 TSA sa; // suffix array sorted by the first q chars 201 TDir dir; // bucket directory 202 TCargo cargo; // user-defined cargo 203 204 205 TTempSA tempSA; 206 TCounter tempOcc; 207 TCounter tempBound; 208 209 TSize sentinelOcc; 210 TSize sentinelBound; 211 bool interSentinelNodes; // should virtually one (true) $-sign or many (false) $_i-signs be appended to the strings in text 212 213 /*! 214 * @fn IndexWotd::Index 215 * @brief Constructor 216 * 217 * @signature Index::Index(); 218 * @signature Index::Index(index); 219 * @signature Index::Index(text); 220 * 221 * @param[in] index Other Index object to copy from. 222 * @param[in] text The text to be indexed. 223 */ 224 Index()225 Index(): 226 interSentinelNodes(false) {} 227 Index(Index & other)228 Index(Index &other) : 229 text(other.text), 230 sa(other.sa), 231 dir(other.dir), 232 cargo(other.cargo), 233 tempSA(other.tempSA), 234 tempBound(other.tempBound), 235 sentinelOcc(other.sentinelOcc), 236 sentinelBound(other.sentinelBound), 237 interSentinelNodes(other.interSentinelNodes) {} 238 Index(Index const & other)239 Index(Index const &other) : 240 text(other.text), 241 sa(other.sa), 242 dir(other.dir), 243 cargo(other.cargo), 244 tempSA(other.tempSA), 245 tempBound(other.tempBound), 246 sentinelOcc(other.sentinelOcc), 247 sentinelBound(other.sentinelBound), 248 interSentinelNodes(other.interSentinelNodes) {} 249 250 template <typename TText_> Index(TText_ & _text)251 Index(TText_ &_text) : 252 text(_text), 253 sentinelOcc(0), 254 sentinelBound(0), 255 interSentinelNodes(false) {} 256 257 template <typename TText_> Index(TText_ const & _text)258 Index(TText_ const &_text): 259 text(_text), 260 sentinelOcc(0), 261 sentinelBound(0), 262 interSentinelNodes(false) {} 263 }; 264 /* 265 template < typename TText, typename TSpec > 266 struct Value< Index<TText, IndexWotd<TSpec> > > { 267 typedef typename Value< typename Fibre< Index<TText, IndexWotd<TSpec> >, WotdRawText >::Type >::Type Type; 268 }; 269 270 template < typename TText, typename TSpec > 271 struct Size< Index<TText, IndexWotd<TSpec> > > { 272 typedef typename Size< typename Fibre< Index<TText, IndexWotd<TSpec> >, WotdRawText >::Type >::Type Type; 273 }; 274 */ 275 276 template <typename TText, typename TSpec> 277 SEQAN_CONCEPT_IMPL((Index<TText, IndexWotd<TSpec> >), (StringTreeConcept)); 278 279 template <typename TText, typename TSpec> 280 SEQAN_CONCEPT_IMPL((Index<TText, IndexWotd<TSpec> > const), (StringTreeConcept)); 281 282 ////////////////////////////////////////////////////////////////////////////// 283 // default fibre creators 284 285 template < typename TText, typename TSpec > 286 struct DefaultIndexCreator<Index<TText, IndexWotd<TSpec> >, FibreDir> { 287 typedef Default Type; 288 }; 289 290 template < typename TText, typename TSSetSpec, typename TSpec > 291 struct DefaultIndexCreator<Index<StringSet<TText, TSSetSpec>, IndexWotd<TSpec> >, FibreDir> { 292 typedef Default Type; 293 }; 294 295 ////////////////////////////////////////////////////////////////////////////// 296 // default finder 297 298 template < typename TText, typename TSpec > 299 struct DefaultFinder< Index<TText, IndexWotd<TSpec> > > 300 { 301 typedef FinderSTree Type; // standard wotd finder is tree based search 302 }; 303 304 ////////////////////////////////////////////////////////////////////////////// 305 306 307 template <typename TSize> 308 struct VertexWotdOriginal_ { 309 TSize node; // position of current node entry in directory 310 TSize parentRepLen; // representative length of parent node 311 TSize edgeLen; // length of edge above current node 312 313 VertexWotdOriginal_() : node(0), parentRepLen(0), edgeLen(0) {} 314 VertexWotdOriginal_(MinimalCtor) : node(0), parentRepLen(0), edgeLen(0) 315 { 316 _setSizeInval(node); 317 } 318 }; 319 320 template <typename TSize> 321 struct VertexWotdModified_ { 322 TSize node; // position of current node entry in directory 323 TSize parentRepLen; // representative length of parent node 324 TSize edgeLen; // length of edge above current node 325 Pair<TSize> range; // current SA interval of hits 326 TSize parentRight; // right boundary of parent node's range (allows one to go right) 327 328 VertexWotdModified_() : 329 node(0), 330 parentRepLen(0), 331 edgeLen(0), 332 range(0,0), 333 parentRight(0) 334 {} 335 VertexWotdModified_(MinimalCtor) : 336 node(0), 337 parentRepLen(0), 338 edgeLen(0), 339 range(0,0), 340 parentRight(0) 341 {} 342 VertexWotdModified_(Pair<TSize> const &otherRange, TSize otherParentRight) : 343 node(0), 344 parentRepLen(0), 345 edgeLen(0), 346 range(otherRange), 347 parentRight(otherParentRight) 348 {} 349 }; 350 351 ////////////////////////////////////////////////////////////////////////////// 352 template < typename TText > 353 struct VertexDescriptor< Index<TText, IndexWotd<WotdOriginal> > > { 354 typedef typename Size< Index<TText, IndexWotd<WotdOriginal> > >::Type TSize; 355 typedef VertexWotdOriginal_<TSize> Type; 356 }; 357 358 template < typename TText, typename TSpec > 359 struct VertexDescriptor< Index<TText, IndexWotd<TSpec> > > { 360 typedef typename Size< Index<TText, IndexWotd<TSpec> > >::Type TSize; 361 typedef VertexWotdModified_<TSize> Type; 362 }; 363 364 template < typename TText, typename TSpec > 365 void _indexRequireTopDownIteration(Index<TText, IndexWotd<TSpec> > &index) 366 { 367 indexRequire(index, WotdDir()); 368 } 369 370 ////////////////////////////////////////////////////////////////////////////// 371 // history stack functions 372 373 template <typename TSize> 374 struct HistoryStackWotdOriginal_ 375 { 376 TSize node; // position of current node entry in directory 377 TSize edgeLen; // length of edge above current node 378 }; 379 380 template <typename TSize> 381 struct HistoryStackWotdModified_ 382 { 383 TSize node; // position of current node entry in directory 384 TSize edgeLen; // length of edge above current node 385 Pair<TSize> range; // current SA interval of hits 386 }; 387 388 template < typename TText, typename TSpec > 389 struct HistoryStackEntry_< Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown< ParentLinks<TSpec> > > > > 390 { 391 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 392 typedef typename Size<TIndex>::Type TSize; 393 typedef HistoryStackWotdOriginal_<TSize> Type; 394 }; 395 396 template < typename TText, typename TIndexSpec, typename TSpec > 397 struct HistoryStackEntry_< Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown< ParentLinks<TSpec> > > > > 398 { 399 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 400 typedef typename Size<TIndex>::Type TSize; 401 typedef HistoryStackWotdModified_<TSize> Type; 402 }; 403 404 405 template < typename TText, typename TSpec > 406 inline void 407 _historyPush(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &it) 408 { 409 it._parentDesc = value(it); 410 value(it).parentRepLen += parentEdgeLength(it); 411 } 412 413 template < typename TText, typename TIndexSpec, typename TSpec > 414 inline void 415 _historyPush(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it) 416 { 417 it._parentDesc = value(it); 418 value(it).parentRepLen += parentEdgeLength(it); 419 value(it).parentRight = value(it).range.i2; 420 } 421 422 template < typename TText, typename TSpec > 423 inline void 424 _historyPush(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown< ParentLinks<TSpec> > > > &it) 425 { 426 typedef typename Size< Index<TText, IndexWotd<WotdOriginal> > >::Type TSize; 427 TSize edgeLen = parentEdgeLength(it); 428 HistoryStackWotdOriginal_<TSize> entry = { value(it).node, edgeLen }; 429 appendValue(it.history, entry); 430 value(it).parentRepLen += edgeLen; 431 } 432 433 template < typename TText, typename TIndexSpec, typename TSpec > 434 inline void 435 _historyPush(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown< ParentLinks<TSpec> > > > &it) 436 { 437 typedef typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type TSize; 438 TSize edgeLen = parentEdgeLength(it); 439 HistoryStackWotdModified_<TSize> entry = { value(it).node, edgeLen, value(it).range }; 440 appendValue(it.history, entry); 441 value(it).parentRepLen += edgeLen; 442 value(it).parentRight = value(it).range.i2; 443 } 444 445 ////////////////////////////////////////////////////////////////////////////// 446 template < typename TText, typename TIndexSpec, typename TPropertyMap > 447 inline void 448 resizeVertexMap( 449 TPropertyMap & pm, 450 Index<TText, IndexWotd<TIndexSpec> > const& index) 451 { 452 resize(pm, length(indexDir(index)), Generous()); 453 } 454 455 /* // different interface compared to resizeVertexMap(graph, ...) 456 template < typename TText, typename TIndexSpec, typename TPropertyMap, typename TProperty > 457 inline void 458 resizeVertexMap( 459 Index<TText, IndexWotd<TIndexSpec> > const& index, 460 TPropertyMap & pm, 461 TProperty const & prop) 462 { 463 resize(pm, length(indexDir(index)), prop, Generous()); 464 } 465 */ 466 template < typename TSize > 467 inline typename Id< VertexWotdOriginal_<TSize> const >::Type 468 _getId(VertexWotdOriginal_<TSize> const &desc) 469 { 470 return desc.node; 471 } 472 473 template < typename TSize > 474 inline typename Id< VertexWotdOriginal_<TSize> >::Type 475 _getId(VertexWotdOriginal_<TSize> &desc) 476 { 477 return _getId(const_cast<VertexWotdOriginal_<TSize> const &>(desc)); 478 } 479 480 template < typename TSize > 481 inline typename Id< VertexWotdModified_<TSize> const >::Type 482 _getId(VertexWotdModified_<TSize> const &desc) 483 { 484 return desc.node; 485 } 486 487 template < typename TSize > 488 inline typename Id< VertexWotdModified_<TSize> >::Type 489 _getId(VertexWotdModified_<TSize> &desc) 490 { 491 return _getId(const_cast<VertexWotdModified_<TSize> const &>(desc)); 492 } 493 494 ////////////////////////////////////////////////////////////////////////////// 495 496 template < typename TSize > 497 inline bool _isRoot(VertexWotdOriginal_<TSize> const &value) 498 { 499 return value.node == 0; 500 } 501 502 template < typename TSize > 503 inline bool _isRoot(VertexWotdModified_<TSize> const &value) 504 { 505 return value.node == 0; 506 } 507 508 // is this a leaf? (including empty $-edges) 509 template < typename TText, typename TIndexSpec, typename TSpec, typename TDfsOrder > 510 inline bool _isLeaf( 511 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TSpec> > const &it, 512 VSTreeIteratorTraits<TDfsOrder, False> const) 513 { 514 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 515 TIndex const &index = container(it); 516 return (dirAt(value(it).node, index) & index.LEAF) != 0; 517 } 518 519 // is this a leaf? (excluding empty $-edges) 520 template < typename TText, typename TIndexSpec, typename TSpec, typename TDfsOrder > 521 inline bool _isLeaf( 522 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TSpec> > const &it, 523 VSTreeIteratorTraits<TDfsOrder, True> const) 524 { 525 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 526 527 TIndex const &index = container(it); 528 if (dirAt(value(it).node, index) & index.LEAF) 529 return true; 530 531 // ensure node evaluation and test for sentinel child edges? 532 return _wotdEvaluate(it) & index.SENTINELS; 533 } 534 535 536 // parentEdgeLength - ORIGINAL VERSION 537 template < typename TIndex, typename TSize > 538 inline typename Size<TIndex>::Type 539 parentEdgeLength(TIndex const &index, VertexWotdOriginal_<TSize> &vDesc) 540 { 541 TSize edgeLen = vDesc.edgeLen; 542 if (edgeLen != (TSize)-1) 543 return edgeLen; 544 545 TSize pos = vDesc.node; 546 TSize w0 = dirAt(pos, index); 547 if (w0 & index.LEAF) 548 return vDesc.edgeLen = suffixLength(w0 & index.BITMASK0, index); 549 550 TSize w1 = dirAt(pos + 1, index); 551 if (w1 & index.UNEVALUATED) 552 return vDesc.edgeLen = _bucketLcp( 553 infix(indexSA(index), w0 & index.BITMASK0, w1 & index.BITMASK1), 554 indexText(index)); 555 else 556 return vDesc.edgeLen = _getNodeLP(index, w1) - (w0 & index.BITMASK0); 557 } 558 559 // parentEdgeLength - MODIFIED VERSION 560 template < typename TIndex, typename TSize > 561 inline typename Size<TIndex>::Type 562 parentEdgeLength(TIndex const &index, VertexWotdModified_<TSize> &vDesc) 563 { 564 TSize edgeLen = vDesc.edgeLen; 565 if (edgeLen != (TSize)-1) 566 return edgeLen; 567 568 TSize pos = vDesc.node; 569 TSize w0 = dirAt(pos, index); 570 if (w0 & index.LEAF) 571 return vDesc.edgeLen = 572 suffixLength(saAt(vDesc.range.i1, index), index) - vDesc.parentRepLen; 573 574 TSize w1 = dirAt(pos + 1, index); 575 if (w1 & index.UNEVALUATED) 576 if (_isSizeInval(vDesc.range.i2)) 577 return vDesc.edgeLen = _bucketLcp( 578 suffix(indexSA(index), vDesc.range.i1), 579 indexText(index), 580 vDesc.parentRepLen) - vDesc.parentRepLen; 581 else 582 return vDesc.edgeLen = _bucketLcp( 583 infix(indexSA(index), vDesc.range.i1, vDesc.range.i2), 584 indexText(index), 585 vDesc.parentRepLen) - vDesc.parentRepLen; 586 else 587 return (dirAt(w1 & index.BITMASK1, index) & index.BITMASK0) - vDesc.parentRepLen; 588 } 589 590 template < typename TText, typename TIndexSpec, typename TSpec > 591 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 592 parentEdgeLength(Iter< 593 Index<TText, IndexWotd<TIndexSpec> >, 594 VSTree< TopDown<TSpec> > > const &it) 595 { 596 typedef Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > TIter; 597 return parentEdgeLength(container(it), value(const_cast<TIter&>(it))); 598 } 599 600 template < typename TText, typename TIndexSpec, typename TSpec > 601 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 602 parentRepLength(Iter< 603 Index<TText, IndexWotd<TIndexSpec> >, 604 VSTree< TopDown<TSpec> > > const &it) 605 { 606 return value(it).parentRepLen; 607 } 608 609 template < typename TText, typename TIndexSpec, typename TSpec > 610 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 611 parentRepLength(Iter< 612 Index<TText, IndexWotd<TIndexSpec> >, 613 VSTree< TopDown< ParentLinks<TSpec> > > > const &it) 614 { 615 return value(it).parentRepLen; 616 } 617 618 template < typename TText, typename TIndexSpec, typename TSpec > 619 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 620 repLength(Iter< 621 Index<TText, IndexWotd<TIndexSpec> >, 622 VSTree< TopDown<TSpec> > > const &it) 623 { 624 return parentRepLength(it) + parentEdgeLength(it); 625 } 626 627 628 // parentEdgeLabel - ORIGINAL VERSION (doesn't work if TText is a StringSet) 629 template < typename TText, typename TSpec > 630 inline typename Infix< typename Fibre<Index<TText, IndexWotd<WotdOriginal> >, EsaText>::Type const >::Type 631 parentEdgeLabel(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > const &it) 632 { 633 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 634 typedef typename Size<TIndex>::Type TSize; 635 636 TIndex const &index = container(it); 637 638 if (isRoot(it)) 639 return infix(indexText(index), 0, 0); 640 else { 641 TSize occ = _getNodeLP(index, value(it).node); 642 return infix(indexText(index), occ, occ + parentEdgeLength(it)); 643 } 644 } 645 646 // getOccurrence - ORIGINAL VERSION 647 template < typename TText, typename TSpec > 648 inline typename SAValue<Index<TText, IndexWotd<WotdOriginal> > >::Type 649 getOccurrence(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree<TSpec> > const &it) { 650 return _getNodeLP(container(it), value(it).node) - value(it).parentRepLen; 651 } 652 653 template < typename TText, typename TIndexSpec, typename TSpec > 654 inline bool 655 emptyParentEdge(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TopDown<TSpec> > > const &it) 656 { 657 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 658 659 TIndex const &index = container(it); 660 typename SAValue<TIndex>::Type pos = getOccurrence(it); 661 return getSeqOffset(pos, stringSetLimits(index)) + value(it).parentRepLen 662 == sequenceLength(getSeqNo(pos, stringSetLimits(index)), index); 663 } 664 665 // to avoid ambiguity 666 template < typename TText, typename TIndexSpec, typename TSpec > 667 inline bool 668 emptyParentEdge(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TopDown<ParentLinks<TSpec> > > > const &it) 669 { 670 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 671 672 TIndex const &index = container(it); 673 typename SAValue<TIndex>::Type pos = getOccurrence(it); 674 return getSeqOffset(pos, stringSetLimits(index)) + value(it).parentRepLen 675 == sequenceLength(getSeqNo(pos, stringSetLimits(index)), index); 676 } 677 678 679 680 template < typename TText, typename TSpec > 681 inline void 682 goRoot(Iter< 683 Index<TText, IndexWotd<WotdOriginal> >, 684 VSTree<TSpec> > &it) 685 { 686 _historyClear(it); 687 value(it).node = 0; // start in root node (first entry in dir) 688 value(it).parentRepLen = 0; // parent prefix length is 0 689 value(it).edgeLen = 0; // edge length is 0 690 } 691 692 template < typename TText, typename TIndexSpec, typename TSpec > 693 inline void 694 goRoot(Iter< 695 Index<TText, IndexWotd<TIndexSpec> >, 696 VSTree<TSpec> > &it) 697 { 698 _historyClear(it); 699 value(it).range.i1 = 0; // start in root node with range (0,infty) 700 _setSizeInval(value(it).range.i2); // infty is equivalent to length(index) and faster to compare 701 value(it).node = 0; // start in root node (first entry in dir) 702 value(it).parentRepLen = 0; // parent prefix length is 0 703 value(it).edgeLen = 0; // edge length is 0 704 } 705 706 template < typename TText, typename TSpec > 707 inline bool atEnd(Iter<Index<TText, IndexWotd<WotdOriginal> >, VSTree<TSpec> > &it) 708 { 709 return _isSizeInval(value(it).node); 710 } 711 712 template < typename TText, typename TSpec > 713 inline bool atEnd(Iter<Index<TText, IndexWotd<WotdOriginal> >, VSTree<TSpec> > const &it) 714 { 715 return _isSizeInval(value(it).node); 716 } 717 718 719 // adjust iterator's right border of SA range 720 template < typename TText, typename TSpec > 721 inline void 722 _adjustRightBorder( 723 Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &) 724 {} 725 726 template < typename TText, typename TIndexSpec, typename TSpec > 727 inline void 728 _adjustRightBorder( 729 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it) 730 { 731 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 732 typedef typename Size<TIndex>::Type TSize; 733 734 TIndex const &index = container(it); 735 TSize pos = value(it).node; 736 737 TSize w0 = dirAt(pos, index); 738 if (w0 & index.LEAF) 739 value(it).range.i2 = value(it).range.i1 + 1; 740 else 741 if (w0 & index.LAST_CHILD) 742 value(it).range.i2 = value(it).parentRight; 743 else { 744 w0 = dirAt(pos + 2, index); 745 value(it).range.i2 = w0 & index.BITMASK0; 746 } 747 } 748 749 // go down the leftmost edge (including empty $-edges) 750 template < typename TText, typename TSpec, typename TDfsOrder, typename THideEmptyEdges > 751 inline bool 752 _goDown( 753 Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &it, 754 VSTreeIteratorTraits<TDfsOrder, THideEmptyEdges> const) 755 { 756 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 757 typedef typename Size<TIndex>::Type TSize; 758 759 if (_isLeaf(it, EmptyEdges())) return false; 760 761 TIndex &index = container(it); 762 _historyPush(it); 763 764 // ensure node evaluation 765 TSize childNode = _wotdEvaluate(it); 766 767 if (THideEmptyEdges::VALUE && (childNode & index.SENTINELS) != 0) 768 return false; 769 770 // go down 771 value(it).node = childNode & index.BITMASK1; 772 value(it).edgeLen = -1; 773 774 // go right if parent edge is empty 775 // or hull predicate is false 776 if ((THideEmptyEdges::VALUE && emptyParentEdge(it)) || !nodeHullPredicate(it)) 777 if (!goRight(it)) { 778 _goUp(it); 779 return false; 780 } 781 782 return true; 783 } 784 785 // go down the leftmost edge (excluding empty $-edges) 786 template < typename TText, typename TIndexSpec, typename TSpec, typename TDfsOrder, typename THideEmptyEdges > 787 inline bool 788 _goDown( 789 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it, 790 VSTreeIteratorTraits<TDfsOrder, THideEmptyEdges> const) 791 { 792 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 793 typedef typename Size<TIndex>::Type TSize; 794 795 if (_isLeaf(it, EmptyEdges())) return false; 796 TIndex const &index = container(it); 797 798 // ensure node evaluation 799 TSize childNode = _wotdEvaluate(it); 800 801 if (THideEmptyEdges::VALUE && (childNode & index.SENTINELS) != 0) 802 return false; 803 804 // go down 805 _historyPush(it); 806 value(it).node = childNode & index.BITMASK1; 807 value(it).edgeLen = -1; 808 _adjustRightBorder(it); 809 810 // go right if parent edge is empty 811 // or hull predicate is false 812 if ((THideEmptyEdges::VALUE && emptyParentEdge(it)) || !nodeHullPredicate(it)) 813 if (!goRight(it)) { 814 _goUp(it); 815 return false; 816 } 817 818 return true; 819 } 820 821 // go right to the lexic. next sibling 822 template < typename TText, typename TSpec, typename TDfsOrder, typename THideEmptyEdges > 823 inline bool 824 _goRight( 825 Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &it, 826 VSTreeIteratorTraits<TDfsOrder, THideEmptyEdges> const) 827 { 828 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 829 typedef typename Size<TIndex>::Type TSize; 830 831 TIndex const &index = container(it); 832 833 do { 834 TSize w0 = dirAt(value(it).node, index); 835 if (w0 & index.LAST_CHILD) 836 return false; 837 838 value(it).node += (w0 & index.LEAF)? 1: 2; 839 value(it).edgeLen = -1; 840 841 _adjustRightBorder(it); 842 } while ((THideEmptyEdges::VALUE && emptyParentEdge(it)) || !nodeHullPredicate(it)); 843 return true; 844 } 845 846 template < typename TText, typename TIndexSpec, typename TSpec, typename TDfsOrder, typename THideEmptyEdges > 847 inline bool 848 _goRight( 849 Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it, 850 VSTreeIteratorTraits<TDfsOrder, THideEmptyEdges> const) 851 { 852 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 853 typedef typename Size<TIndex>::Type TSize; 854 855 TIndex const &index = container(it); 856 857 do { 858 TSize w0 = dirAt(value(it).node, index); 859 if (w0 & index.LAST_CHILD) 860 return false; 861 862 value(it).node += (w0 & index.LEAF)? 1: 2; 863 value(it).edgeLen = -1; 864 865 value(it).range.i1 = value(it).range.i2; 866 _adjustRightBorder(it); 867 } while ((THideEmptyEdges::VALUE && emptyParentEdge(it)) || !nodeHullPredicate(it)); 868 return true; 869 } 870 871 // go up one edge (returns false if in root node) 872 // can be used at most once, as no history stack is available 873 template < typename TText, typename TWotdSpec, typename TSpec > 874 inline bool 875 _goUp(Iter< Index<TText, IndexWotd<TWotdSpec> >, VSTree< TopDown<TSpec> > > &it) 876 { 877 if (!isRoot(it)) { 878 value(it) = it._parentDesc; 879 return true; 880 } 881 return false; 882 } 883 884 // go up one edge (returns false if in root node) 885 template < typename TText, typename TSpec > 886 inline bool 887 _goUp(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown< ParentLinks<TSpec> > > > &it) 888 { 889 typedef typename Size< Index<TText, IndexWotd<WotdOriginal> > >::Type TSize; 890 891 if (!empty(it.history)) { 892 HistoryStackWotdOriginal_<TSize> const &entry = back(it.history); 893 value(it).node = entry.node; 894 value(it).parentRepLen -= entry.edgeLen; 895 value(it).edgeLen = entry.edgeLen; 896 eraseBack(it.history); 897 return true; 898 } 899 return false; 900 } 901 902 template < typename TText, typename TIndexSpec, typename TSpec > 903 inline bool 904 _goUp(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown< ParentLinks<TSpec> > > > &it) 905 { 906 typedef typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type TSize; 907 908 if (!empty(it.history)) 909 { 910 HistoryStackWotdModified_<TSize> const &entry = back(it.history); 911 value(it).node = entry.node; 912 value(it).parentRepLen -= entry.edgeLen; 913 value(it).edgeLen = entry.edgeLen; 914 value(it).range = entry.range; 915 eraseBack(it.history); 916 if (!empty(it.history)) 917 value(it).parentRight = back(it.history).range.i2; // copy right boundary of parent's range 918 return true; 919 } 920 return false; 921 } 922 923 // return vertex descriptor of parent's node 924 template < typename TText, typename TSpec > 925 inline typename VertexDescriptor< Index<TText, IndexWotd<WotdOriginal> > >::Type 926 nodeUp(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown< ParentLinks<TSpec> > > > const &it) 927 { 928 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 929 typedef typename Size<TIndex>::Type TSize; 930 931 if (!empty(it.history)) 932 { 933 HistoryStackWotdOriginal_<TSize> const &entry = back(it.history); 934 typename VertexDescriptor<TIndex>::Type desc; 935 936 desc.node = entry.node; 937 desc.parentRepLen = value(it).parentRepLen - entry.edgeLen; 938 desc.edgeLen = entry.edgeLen; 939 TSize h = length(it.history) - 1; 940 if (h != 0) --h; 941 return desc; 942 } else 943 return value(it); 944 } 945 946 // return vertex descriptor of parent's node 947 template < typename TText, typename TIndexSpec, typename TSpec > 948 inline typename VertexDescriptor< Index<TText, IndexWotd<TIndexSpec> > >::Type 949 nodeUp(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown< ParentLinks<TSpec> > > > const &it) 950 { 951 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 952 typedef typename Size<TIndex>::Type TSize; 953 954 if (!empty(it.history)) 955 { 956 HistoryStackWotdModified_<TSize> const &entry = back(it.history); 957 typename VertexDescriptor<TIndex>::Type desc; 958 959 desc.node = entry.node; 960 desc.parentRepLen = value(it).parentRepLen - entry.edgeLen; 961 desc.edgeLen = entry.edgeLen; 962 return desc; 963 } else 964 return value(it); 965 } 966 967 ////////////////////////////////////////////////////////////////////////////// 968 969 970 // Counting sort - Step 2a: Count characters 971 template < typename TBuckets, typename TText > 972 inline void 973 _wotdCountChars(TBuckets &buckets, TText const &text) 974 { 975 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 976 977 TTextIterator itText = begin(text, Standard()); 978 TTextIterator itTextEnd = end(text, Standard()); 979 for (; itText != itTextEnd; ++itText) 980 ++buckets[ordValue(*itText)]; 981 } 982 983 984 template < typename TBuckets, typename TText, typename TSpec > 985 inline void 986 _wotdCountChars(TBuckets &buckets, StringSet<TText, TSpec> const &stringSet) 987 { 988 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 989 990 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 991 { 992 TText const &text = value(stringSet, seqNo); 993 TTextIterator itText = begin(text, Standard()); 994 TTextIterator itTextEnd = end(text, Standard()); 995 for (; itText != itTextEnd; ++itText) 996 ++buckets[ordValue(*itText)]; 997 } 998 } 999 1000 // Counting sort - Step 2b: Count the prefixLen'th characters of suffices 1001 template < typename TBuckets, typename TText, typename TSA, typename TSize > 1002 inline typename Size<TText>::Type 1003 _wotdCountCharsWotdOriginal( 1004 TBuckets &buckets, 1005 TText const &text, 1006 TSA &sa, 1007 TSize prefixLen) 1008 { 1009 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1010 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1011 typedef typename Size<TText>::Type TTextSize; 1012 1013 TTextIterator itText = begin(text, Standard()); 1014 TSAIterator itSA = begin(sa, Standard()); 1015 TSAIterator itSAEnd = end(sa, Standard()); 1016 1017 TTextSize sentinels = 0; 1018 TTextSize textLength = length(text); 1019 for (; itSA != itSAEnd; ++itSA) 1020 { 1021 // add prefix on entries in sa 1022 TTextSize saValue = (*itSA += prefixLen); 1023 if (textLength > saValue) 1024 ++buckets[ordValue(*(itText + saValue))]; 1025 else 1026 if (textLength == saValue) ++sentinels; 1027 } 1028 return sentinels; 1029 } 1030 1031 template < typename TBuckets, typename TText, typename TSA, typename TSize > 1032 inline typename Size<TText>::Type 1033 _wotdCountChars( 1034 TBuckets &buckets, 1035 TText const &text, 1036 TSA const &sa, 1037 TSize prefixLen) 1038 { 1039 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1040 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1041 typedef typename Size<TText>::Type TTextSize; 1042 1043 TTextIterator itText = begin(text, Standard()) + prefixLen; 1044 TSAIterator itSA = begin(sa, Standard()); 1045 TSAIterator itSAEnd = end(sa, Standard()); 1046 1047 TTextSize sentinels = 0; 1048 TTextSize textLength = length(text) - prefixLen; 1049 for (; itSA != itSAEnd; ++itSA) 1050 { 1051 TTextSize saValue = *itSA; 1052 if (textLength > saValue) 1053 ++buckets[ordValue(*(itText + saValue))]; 1054 else 1055 if (textLength == saValue) ++sentinels; 1056 } 1057 return sentinels; 1058 } 1059 1060 template < 1061 typename TBuckets, 1062 typename TText, 1063 typename TSpec, 1064 typename TSA, 1065 typename TSize 1066 > 1067 inline typename Size<TText>::Type 1068 _wotdCountChars( 1069 TBuckets &buckets, 1070 StringSet<TText, TSpec> const &stringSet, 1071 TSA const &sa, 1072 TSize prefixLen) 1073 { 1074 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1075 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1076 typedef typename Size<TText>::Type TTextSize; 1077 1078 if (empty(stringSet)) 1079 return 0; 1080 1081 TTextIterator itText = begin(front(stringSet), Standard()); 1082 TSAIterator itSA = begin(sa, Standard()); 1083 TSAIterator itSAEnd = end(sa, Standard()); 1084 1085 TTextSize sentinels = 0; 1086 TTextSize textLength = 0; 1087 unsigned lastSeqSeen = (unsigned)-1; 1088 Pair<unsigned, TTextSize> lPos; 1089 for (; itSA != itSAEnd; ++itSA) 1090 { 1091 posLocalize(lPos, *itSA, stringSetLimits(stringSet)); 1092 if (lastSeqSeen != getSeqNo(lPos)) 1093 { 1094 lastSeqSeen = getSeqNo(lPos); 1095 1096 // shift textBegin and textLength by prefixLen 1097 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 1098 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 1099 } 1100 if (textLength > getSeqOffset(lPos)) 1101 ++buckets[ordValue(*(itText + getSeqOffset(lPos)))]; 1102 else 1103 if (textLength == getSeqOffset(lPos)) ++sentinels; 1104 } 1105 return sentinels; 1106 } 1107 1108 1109 ////////////////////////////////////////////////////////////////////////////// 1110 1111 1112 // Counting sort - Step 3: Cumulative sum 1113 template < typename TBounds, typename TBuckets, typename TSize > 1114 inline typename Size<TBuckets>::Type 1115 _wotdCummulativeSum(TBounds &bounds, TBuckets const &buckets, TSize offset) 1116 { 1117 typedef typename Iterator<TBounds, Standard>::Type TBoundIterator; 1118 typedef typename Iterator<TBuckets const, Standard>::Type TBucketsIterator; 1119 1120 TBucketsIterator it = begin(buckets, Standard()); 1121 TBucketsIterator itEnd = end(buckets, Standard()); 1122 TBoundIterator bit = begin(bounds, Standard()); 1123 1124 typename Value<TBounds>::Type sum = offset; 1125 typename Size<TBuckets>::Type requiredSize = 0; 1126 typename Value<TBuckets>::Type diff; 1127 1128 for (; it != itEnd; ++it, ++bit) 1129 if ((diff = *it) != 0) { 1130 requiredSize += (diff > 1)? 2: 1; 1131 *bit = sum; 1132 sum += diff; 1133 } 1134 1135 return requiredSize; 1136 } 1137 1138 template < typename TBounds, typename TBuckets > 1139 inline typename Size<TBuckets>::Type 1140 _wotdCummulativeSum(TBounds &bounds, TBuckets const &buckets) 1141 { 1142 return _wotdCummulativeSum(bounds, buckets, 0); 1143 } 1144 1145 ////////////////////////////////////////////////////////////////////////////// 1146 //TODO(singer): The function createWotdIndex in never defined! 1147 /*! 1148 * @fn IndexWotd#createWotdIndex 1149 * @headerfile <seqan/index.h> 1150 * @brief Builds a the WOTD index. 1151 * 1152 * @signature void createWotdIndex(sa, dir, text); 1153 * 1154 * @param[out] sa The resulting list in which all <i>q</i>-grams are sorted alphabetically. 1155 * @param[out] dir The resulting array that indicates at which position in index the corresponding <i>q</i>-grams 1156 * can be found. 1157 * @param[in] text The sequence. Types: @link ContainerConcept @endlink 1158 * 1159 * The resulting <tt>index</tt> contains the sorted list of qgrams. For each possible <i>q</i>-gram pos contains 1160 * the first position in index that corresponds to this <i>q</i>-gram. 1161 */ 1162 1163 // single sequence 1164 template < typename TIndex > 1165 typename Size<TIndex>::Type 1166 _sortFirstWotdBucket(TIndex &index) 1167 { 1168 typedef typename Fibre<TIndex, WotdText >::Type TText; 1169 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1170 typedef typename TIndex::TCounter TCounter; 1171 1172 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1173 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1174 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1175 typedef typename Size<TText>::Type TSize; 1176 1177 TText const &text = indexText(index); 1178 TCounter &occ = index.tempOcc; 1179 TCounter &bound = index.tempBound; 1180 1181 // 1. clear counters and copy SA to temporary SA 1182 arrayFill(begin(occ, Standard()), end(occ, Standard()), 0); 1183 1184 // 2. count characters 1185 _wotdCountChars(occ, text); 1186 1187 // 3. cumulative sum 1188 TSize requiredSize = _wotdCummulativeSum(bound, occ); 1189 1190 // 4. fill suffix array 1191 { 1192 TSA &sa = indexSA(index); 1193 TSAIterator saBeg = begin(sa, Standard()); 1194 TCntIterator boundBeg = begin(bound, Standard()); 1195 1196 TTextIterator itText = begin(text, Standard()); 1197 TTextIterator itTextEnd = end(text, Standard()); 1198 for(TSize i = 0; itText != itTextEnd; ++itText, ++i) 1199 *(saBeg + (*(boundBeg + ordValue(*itText)))++) = i; 1200 } 1201 index.sentinelOcc = 0; 1202 index.sentinelBound = 0; 1203 1204 return requiredSize; 1205 } 1206 1207 // multiple sequences 1208 template < typename TText, typename TSpec, typename TIndexSpec > 1209 typename Size< Index<StringSet<TText, TSpec>, TIndexSpec> >::Type 1210 _sortFirstWotdBucket(Index<StringSet<TText, TSpec>, TIndexSpec> &index) 1211 { 1212 typedef Index<StringSet<TText, TSpec>, TIndexSpec> TIndex; 1213 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1214 typedef typename TIndex::TCounter TCounter; 1215 1216 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1217 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1218 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1219 typedef typename Size<TText>::Type TSize; 1220 1221 StringSet<TText, TSpec> const &stringSet = indexText(index); 1222 TCounter &occ = index.tempOcc; 1223 TCounter &bound = index.tempBound; 1224 1225 // 1. clear counters and copy SA to temporary SA 1226 arrayFill(begin(occ, Standard()), end(occ, Standard()), 0); 1227 1228 // 2. count characters 1229 _wotdCountChars(occ, stringSet); 1230 1231 // 3. cummulative sum 1232 TSize requiredSize = _wotdCummulativeSum(bound, occ); 1233 1234 // 4. fill suffix array 1235 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1236 { 1237 TSA &sa = indexSA(index); 1238 TSAIterator saBeg = begin(sa, Standard()); 1239 TCntIterator boundBeg = begin(bound, Standard()); 1240 1241 typename Value<TSA>::Type localPos; 1242 assignValueI1(localPos, seqNo); 1243 assignValueI2(localPos, 0); 1244 1245 TText const &text = value(stringSet, seqNo); 1246 TTextIterator itText = begin(text, Standard()); 1247 TTextIterator itTextEnd = end(text, Standard()); 1248 for(; itText != itTextEnd; ++itText) 1249 { 1250 *(saBeg + (*(boundBeg + ordValue(*itText)))++) = localPos; 1251 assignValueI2(localPos, getValueI2(localPos) + 1); 1252 } 1253 } 1254 index.sentinelOcc = 0; 1255 index.sentinelBound = 0; 1256 1257 return requiredSize; 1258 } 1259 1260 1261 1262 // sort bucket using counting sort 1263 // (nearly) ORIGINAL VERSION: 1264 // - brings the bucket with the longest suffix (lpBucket) to front 1265 // - all other buckets are in lexicographical order 1266 // - adds prefixLen to all SA entries in SA[left,right) 1267 template < typename TText, typename TBeginPos, typename TEndPos, typename TSize > 1268 TSize _sortWotdBucket( 1269 Index<TText, IndexWotd<WotdOriginal> > &index, 1270 TBeginPos left, 1271 TEndPos right, 1272 TSize prefixLen) 1273 { 1274 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 1275 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1276 typedef typename TIndex::TCounter TCounter; 1277 1278 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1279 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1280 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1281 typedef typename Iterator<TCounter const, Standard>::Type TConstCntIterator; 1282 typedef typename Size<TText>::Type TTextSize; 1283 1284 TText const &text = indexText(index); 1285 TCounter const &tempSA = index.tempSA; 1286 TCounter &occ = index.tempOcc; 1287 TCounter &bound = index.tempBound; 1288 1289 // 1. clear counters and copy SA to temporary SA 1290 arrayFill(begin(occ, Standard()), end(occ, Standard()), 0); 1291 1292 // 2. count characters 1293 index.tempSA = infix(indexSA(index), left, right); 1294 index.sentinelBound = 0; 1295 index.sentinelOcc = 1296 _wotdCountCharsWotdOriginal(occ, text, index.tempSA, prefixLen); 1297 1298 // 3. cumulative sum 1299 TSize requiredSize = 0; 1300 1301 // actually, here sentinelOcc<=1 holds (this is the original wotd) 1302 if (index.interSentinelNodes) { 1303 if (index.sentinelOcc != 0) 1304 requiredSize = (index.sentinelOcc > 1)? 2: 1; // insert *one* $-edge for all $_i suffices 1305 } else 1306 requiredSize = index.sentinelOcc; // insert each $_i suffix one-by-one 1307 1308 requiredSize += _wotdCummulativeSum(bound, occ, left + index.sentinelOcc); 1309 index.sentinelBound = left; 1310 1311 // 4. fill suffix array 1312 { 1313 TSA &sa = indexSA(index); 1314 TSAIterator saBeg = begin(sa, Standard()); 1315 TCntIterator boundBeg = begin(bound, Standard()); 1316 1317 TTextIterator itText = begin(text, Standard()); 1318 TConstCntIterator itSA = begin(tempSA, Standard()); 1319 TConstCntIterator itSAEnd = end(tempSA, Standard()); 1320 TTextSize textLength = length(text); 1321 for(; itSA != itSAEnd; ++itSA) 1322 { 1323 TTextSize saValue = *itSA; 1324 if (textLength > saValue) 1325 *(saBeg + (*(boundBeg + ordValue(*(itText + saValue))))++) = saValue; 1326 else 1327 if (textLength == saValue) 1328 *(saBeg + index.sentinelBound++) = saValue; 1329 } 1330 } 1331 1332 // 5. move lpBucket to front and add saOffset to hash directory entries 1333 { 1334 TSize lpBucket = ordValue(text[tempSA[0]]); 1335 if (lpBucket != 0) { 1336 TSize lpBucketOcc = occ[lpBucket]; 1337 TSize lpBucketBound = bound[lpBucket]; 1338 1339 TCntIterator itOcc = begin(occ, Standard()) + lpBucket; 1340 TCntIterator itBound = begin(bound, Standard()) + lpBucket; 1341 TCntIterator itBeg = begin(bound, Standard()); 1342 for(; itBound != itBeg; --itBound, --itOcc) { 1343 *itOcc = *(itOcc - 1); 1344 *itBound = *(itBound - 1); 1345 } 1346 if (index.sentinelOcc != 0) { 1347 // bring first bucket before sentinel bucket 1348 *itOcc = index.sentinelOcc; 1349 *itBound = index.sentinelBound; 1350 index.sentinelOcc = lpBucketOcc; 1351 index.sentinelBound = lpBucketBound; 1352 } else { 1353 *itOcc = lpBucketOcc; 1354 *itBound = lpBucketBound; 1355 } 1356 } else 1357 if (index.sentinelOcc != 0) { 1358 // bring first bucket before sentinel bucket 1359 TSize swap = index.sentinelOcc; 1360 index.sentinelOcc = occ[0]; 1361 occ[0] = swap; 1362 swap = index.sentinelBound; 1363 index.sentinelBound = bound[0]; 1364 bound[0] = swap; 1365 } 1366 } 1367 1368 return requiredSize; 1369 } 1370 1371 1372 1373 1374 1375 // sort bucket using counting sort 1376 // MODIFIED VERSION: 1377 // - all buckets are in lexicographical order 1378 // - SA[left,right) contains real SA entries (the beginning positions of the suffices) 1379 1380 // single sequence 1381 template < typename TIndex, typename TBeginPos, typename TEndPos, typename TSize > 1382 TSize _sortWotdBucket( 1383 TIndex &index, 1384 TBeginPos left, 1385 TEndPos right, 1386 TSize prefixLen) 1387 { 1388 typedef typename Fibre<TIndex, WotdText >::Type TText; 1389 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1390 typedef typename TIndex::TCounter TCounter; 1391 1392 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1393 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1394 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1395 typedef typename Iterator<TCounter const, Standard>::Type TConstCntIterator; 1396 typedef typename Size<TText>::Type TTextSize; 1397 1398 TText const &text = indexText(index); 1399 TCounter const &tempSA = index.tempSA; 1400 TCounter &occ = index.tempOcc; 1401 TCounter &bound = index.tempBound; 1402 1403 // 1. clear counters and copy SA to temporary SA 1404 arrayFill(begin(occ, Standard()), end(occ, Standard()), 0); 1405 index.tempSA = infix(indexSA(index), left, right); 1406 1407 // 2. count characters 1408 index.sentinelBound = 0; 1409 index.sentinelOcc = _wotdCountChars(occ, text, tempSA, prefixLen); 1410 1411 // 3. cumulative sum 1412 TSize requiredSize = 0; 1413 if (index.interSentinelNodes) { 1414 if (index.sentinelOcc != 0) 1415 requiredSize = (index.sentinelOcc > 1)? 2: 1; // insert *one* $-edge for all $_i suffices 1416 } else 1417 requiredSize = index.sentinelOcc; // insert each $_i suffix one-by-one 1418 1419 requiredSize += _wotdCummulativeSum(bound, occ, left + index.sentinelOcc); 1420 index.sentinelBound = left; 1421 1422 // 4. fill suffix array 1423 { 1424 TSA &sa = indexSA(index); 1425 TSAIterator saBeg = begin(sa, Standard()); 1426 TCntIterator boundBeg = begin(bound, Standard()); 1427 1428 TTextIterator itText = begin(text, Standard()) + prefixLen; 1429 TConstCntIterator itSA = begin(tempSA, Standard()); 1430 TConstCntIterator itSAEnd = end(tempSA, Standard()); 1431 TTextSize textLength = length(text) - prefixLen; 1432 for(; itSA != itSAEnd; ++itSA) 1433 { 1434 TTextSize saValue = *itSA; 1435 if (textLength > saValue) 1436 *(saBeg + (*(boundBeg + ordValue(*(itText + saValue))))++) = saValue; 1437 else 1438 if (textLength == saValue) 1439 *(saBeg + index.sentinelBound++) = saValue; 1440 } 1441 } 1442 1443 return requiredSize; 1444 } 1445 1446 // multiple sequences 1447 template < typename TText, typename TSpec, typename TIndexSpec, typename TBeginPos, typename TEndPos, typename TSize > 1448 TSize _sortWotdBucket( 1449 Index<StringSet<TText, TSpec>, TIndexSpec> &index, 1450 TBeginPos left, 1451 TEndPos right, 1452 TSize prefixLen) 1453 { 1454 typedef Index<StringSet<TText, TSpec>, TIndexSpec> TIndex; 1455 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 1456 typedef typename TIndex::TCounter TCounter; 1457 typedef typename TIndex::TTempSA TTempSA; 1458 1459 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1460 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 1461 typedef typename Iterator<TTempSA const, Standard>::Type TTempSAIterator; 1462 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1463 typedef typename Size<TText>::Type TTextSize; 1464 1465 StringSet<TText, TSpec> const &stringSet = indexText(index); 1466 TTempSA const &tempSA = index.tempSA; 1467 TCounter &occ = index.tempOcc; 1468 TCounter &bound = index.tempBound; 1469 1470 // 1. clear counters and copy SA to temporary SA 1471 TCntIterator occBeg = begin(occ, Standard()); 1472 1473 arrayFill(occBeg, end(occ, Standard()), 0); 1474 index.tempSA = infix(indexSA(index), left, right); 1475 1476 // 2. count characters 1477 index.sentinelBound = 0; 1478 index.sentinelOcc = _wotdCountChars(occ, stringSet, tempSA, prefixLen); 1479 1480 // 3. cumulative sum 1481 TSize requiredSize = 0; 1482 if (index.interSentinelNodes) { 1483 if (index.sentinelOcc != 0) 1484 requiredSize = (index.sentinelOcc > 1)? 2: 1; // insert *one* $-edge for all $_i suffices 1485 } else 1486 requiredSize = index.sentinelOcc; // insert each $_i suffix one-by-one 1487 1488 requiredSize += _wotdCummulativeSum(bound, occ, left + index.sentinelOcc); 1489 index.sentinelBound = left; 1490 1491 // 4. fill suffix array 1492 { 1493 if (empty(stringSet)) 1494 return requiredSize; 1495 1496 TSA &sa = indexSA(index); 1497 TSAIterator saBeg = begin(sa, Standard()); 1498 TCntIterator boundBeg = begin(bound, Standard()); 1499 1500 TTextIterator itText = begin(front(stringSet), Standard()); 1501 TTempSAIterator itSA = begin(tempSA, Standard()); 1502 TTempSAIterator itSAEnd = end(tempSA, Standard()); 1503 TTextSize textLength = 0; 1504 unsigned lastSeqSeen = (unsigned)-1; 1505 Pair<unsigned, TTextSize> lPos; 1506 for(; itSA != itSAEnd; ++itSA) 1507 { 1508 posLocalize(lPos, *itSA, stringSetLimits(index)); 1509 if (lastSeqSeen != getSeqNo(lPos)) 1510 { 1511 lastSeqSeen = getSeqNo(lPos); 1512 1513 // shift textBegin and textLength by prefixLen 1514 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 1515 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 1516 } 1517 if (textLength > getSeqOffset(lPos)) 1518 *(saBeg + (*(boundBeg + ordValue(*(itText + getSeqOffset(lPos)))))++) = *itSA; 1519 else 1520 if (textLength == getSeqOffset(lPos)) 1521 *(saBeg + index.sentinelBound++) = *itSA; 1522 } 1523 } 1524 1525 return requiredSize; 1526 } 1527 1528 1529 1530 1531 1532 template < typename TSA, typename TText > 1533 typename Size<TText>::Type 1534 _bucketLcp(TSA const &sa, TText const &text) 1535 { 1536 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1537 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1538 typedef typename Value<TText>::Type TValue; 1539 typedef typename Size<TText>::Type TTextSize; 1540 1541 TTextSize prefixLen = 0; 1542 1543 if (length(sa) < 2) return prefixLen; 1544 1545 TTextIterator itText = begin(text, Standard()); 1546 TSAIterator itSAEnd = end(sa, Standard()); 1547 TTextSize textLength = length(text); 1548 1549 do { 1550 TSAIterator itSA = begin(sa, Standard()); 1551 TTextSize sa = *itSA; 1552 if (textLength == sa) return prefixLen; 1553 1554 TValue c = *(itText + sa); 1555 for(++itSA; itSA != itSAEnd; ++itSA) { 1556 sa = *itSA; 1557 if (textLength == sa || c != *(itText + sa)) 1558 return prefixLen; 1559 } 1560 ++prefixLen; --textLength; 1561 ++itText; 1562 } while (true); 1563 } 1564 1565 template < typename TSA, typename TText, typename TSize > 1566 typename Size<TText>::Type 1567 _bucketLcp(TSA const &sa, TText const &text, TSize prefixLen) 1568 { 1569 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1570 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1571 typedef typename Value<TText>::Type TValue; 1572 typedef typename Size<TText>::Type TTextSize; 1573 1574 if (length(sa) < 2) return prefixLen; 1575 1576 TTextIterator itText = begin(text, Standard()) + prefixLen; 1577 TSAIterator itSAEnd = end(sa, Standard()); 1578 TTextSize textLength = length(text) - prefixLen; 1579 1580 do { 1581 TSAIterator itSA = begin(sa, Standard()); 1582 TTextSize sa = *itSA; 1583 if (textLength == sa) return prefixLen; 1584 1585 TValue c = *(itText + sa); 1586 for(++itSA; itSA != itSAEnd; ++itSA) { 1587 sa = *itSA; 1588 if (textLength == sa || *(itText + sa) != c) 1589 return prefixLen; 1590 } 1591 ++prefixLen; --textLength; 1592 ++itText; 1593 } while (true); 1594 } 1595 1596 template < typename TSA, typename TText, typename TSpec, typename TSize > 1597 typename Size<TText>::Type 1598 _bucketLcp(TSA const &sa, StringSet<TText, TSpec> const &stringSet, TSize prefixLen) 1599 { 1600 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 1601 typedef typename Iterator<TSA const, Standard>::Type TSAIterator; 1602 typedef typename Value<TText>::Type TValue; 1603 typedef typename Size<TText>::Type TTextSize; 1604 1605 if (length(sa) < 2) return prefixLen; 1606 1607 TSAIterator itSAEnd = end(sa, Standard()); 1608 TTextIterator itText; 1609 TTextSize textLength; 1610 1611 Pair<unsigned, TTextSize> lPos; 1612 do { 1613 TSAIterator itSA = begin(sa, Standard()); 1614 posLocalize(lPos, *itSA, stringSetLimits(stringSet)); 1615 1616 unsigned lastSeqSeen = getSeqNo(*itSA); 1617 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 1618 if (textLength == getSeqOffset(lPos)) return prefixLen; 1619 1620 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 1621 TValue c = *(itText + getSeqOffset(*itSA)); 1622 for(++itSA; itSA != itSAEnd; ++itSA) 1623 { 1624 posLocalize(lPos, *itSA, stringSetLimits(stringSet)); 1625 1626 if (lastSeqSeen != getSeqNo(lPos)) 1627 { 1628 lastSeqSeen = getSeqNo(lPos); 1629 1630 // shift textBegin and textLength by prefixLen 1631 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 1632 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 1633 } 1634 1635 if (textLength == getSeqOffset(lPos) || c != *(itText + getSeqOffset(lPos))) 1636 return prefixLen; 1637 } 1638 ++prefixLen; --textLength; 1639 ++itText; 1640 } while (true); 1641 } 1642 1643 1644 template <typename TText, typename TSpec, typename TPos> 1645 inline TPos 1646 _getNodeLP( 1647 Index<TText, IndexWotd<TSpec> > const &index, 1648 TPos pos) 1649 { 1650 TPos w0 = dirAt(pos, index); 1651 if (w0 & index.LEAF) 1652 return w0 & index.BITMASK0; 1653 1654 TPos w1 = dirAt(pos + 1, index); 1655 if (w1 & index.UNEVALUATED) 1656 return saAt(w0 & index.BITMASK0, index); 1657 else 1658 return w0 & index.BITMASK0; 1659 } 1660 1661 // store buckets into directory 1662 // ORIGINAL VERSION: storing SA entries and topology links in Dir 1663 template <typename TText, typename TPos> 1664 inline void 1665 _storeWotdChildren( 1666 Index<TText, IndexWotd<WotdOriginal> > &index, 1667 TPos dirOfs) 1668 { 1669 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 1670 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 1671 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 1672 typedef typename Size<TDir>::Type TDirSize; 1673 typedef typename TIndex::TCounter TCounter; 1674 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1675 1676 typedef typename Value<TCounter>::Type TValue; 1677 1678 TDirIterator itDir = begin(indexDir(index), Standard()) + dirOfs; 1679 TDirIterator itDirEnd = end(indexDir(index), Standard()); 1680 TDirIterator itPrev = itDirEnd; 1681 1682 TCntIterator it = begin(index.tempOcc, Standard()); 1683 TCntIterator bit = begin(index.tempBound, Standard()); 1684 TCntIterator itEnd = end(index.tempOcc, Standard()); 1685 1686 TValue occ; 1687 if (index.sentinelOcc != 0) 1688 { 1689 if (index.sentinelOcc > 1 && index.interSentinelNodes) // occurs on multiseqs 1690 { 1691 itPrev = itDir; 1692 *itDir = index.sentinelBound - index.sentinelOcc; ++itDir; 1693 *itDir = index.sentinelBound | index.UNEVALUATED; ++itDir; 1694 } else 1695 for (TDirSize d = index.sentinelBound - index.sentinelOcc; d != index.sentinelBound; ++d) 1696 { 1697 itPrev = itDir; 1698 *itDir = saAt(d, index) | index.LEAF; ++itDir; 1699 } 1700 } 1701 for (; it != itEnd; ++it, ++bit) 1702 { 1703 if ((occ = *it) == 0) continue; 1704 if (occ > 1) { 1705 itPrev = itDir; 1706 *itDir = *bit - occ; ++itDir; 1707 *itDir = *bit | index.UNEVALUATED; ++itDir; 1708 } else { 1709 itPrev = itDir; 1710 *itDir = saAt(*bit - occ, index) | index.LEAF; ++itDir; 1711 } 1712 } 1713 1714 // mark the last child 1715 if (itPrev != itDirEnd) 1716 *itPrev |= index.LAST_CHILD; 1717 } 1718 1719 // store buckets into directory 1720 // MODIFIED VERSION: storing SA links and topology links in Dir 1721 template <typename TText, typename TSpec, typename TSize> 1722 inline void 1723 _storeWotdChildren( 1724 Index<TText, IndexWotd<TSpec> > &index, 1725 TSize dirOfs, 1726 TSize lcp) 1727 { 1728 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1729 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 1730 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 1731 typedef typename Size<TDir>::Type TDirSize; 1732 typedef typename TIndex::TCounter TCounter; 1733 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 1734 1735 typedef typename Value<TCounter>::Type TValue; 1736 1737 TDirIterator itDirBegin = begin(indexDir(index), Standard()) + dirOfs; 1738 TDirIterator itDirEnd = end(indexDir(index), Standard()); 1739 TDirIterator itDir = itDirBegin; 1740 TDirIterator itPrev = itDirEnd; 1741 1742 TCntIterator it = begin(index.tempOcc, Standard()); 1743 TCntIterator bit = begin(index.tempBound, Standard()); 1744 TCntIterator itEnd = end(index.tempOcc, Standard()); 1745 1746 TValue occ; 1747 if (index.sentinelOcc != 0) 1748 { 1749 if (index.sentinelOcc > 1 && index.interSentinelNodes) // occurs on multiseqs 1750 { 1751 itPrev = itDir; 1752 *itDir = index.sentinelBound - index.sentinelOcc; ++itDir; 1753 *itDir = index.sentinelBound | index.UNEVALUATED; ++itDir; 1754 } else 1755 for (TDirSize d = index.sentinelBound - index.sentinelOcc; d != index.sentinelBound; ++d) 1756 { 1757 itPrev = itDir; 1758 *itDir = d | index.LEAF; ++itDir; 1759 } 1760 } 1761 for (; it != itEnd; ++it, ++bit) 1762 { 1763 if ((occ = *it) == 0) continue; 1764 if (occ > 1) { 1765 itPrev = itDir; 1766 *itDir = *bit - occ; ++itDir; 1767 *itDir = *bit | index.UNEVALUATED; ++itDir; 1768 } else { 1769 itPrev = itDir; 1770 *itDir = (*bit - occ) | index.LEAF; ++itDir; 1771 } 1772 } 1773 1774 // first child gets the mutual lcp value of the children (== parent repLength) 1775 *itDirBegin = ((*itDirBegin) & ~index.BITMASK0) | lcp; 1776 1777 // mark the last child 1778 if (itPrev != itDirEnd) 1779 *itPrev |= index.LAST_CHILD; 1780 } 1781 1782 1783 template < typename TText, typename TSpec > 1784 inline typename Size< Index<TText, IndexWotd<WotdOriginal> > >::Type 1785 _wotdEvaluate(Iter< Index<TText, IndexWotd<WotdOriginal> >, VSTree<TSpec> > const &it) 1786 { 1787 typedef Index<TText, IndexWotd<WotdOriginal> > TIndex; 1788 typedef typename Size<TIndex>::Type TSize; 1789 1790 TIndex &index = const_cast<TIndex&>(container(it)); 1791 TSize pos = value(it).node; 1792 TSize w1 = dirAt(pos + 1, index); 1793 1794 // test for evaluation 1795 if (w1 & index.UNEVALUATED) 1796 { 1797 TSize w0 = dirAt(pos, index); 1798 TSize lp = saAt(w0 & index.BITMASK0, index); 1799 TSize dst = length(indexDir(index)); 1800 1801 TSize size = _sortWotdBucket( 1802 index, 1803 w0 & index.BITMASK0, 1804 w1 & index.BITMASK1, 1805 parentEdgeLength(it)); 1806 1807 resize(indexDir(index), dst + size, Generous()); 1808 _storeWotdChildren(index, dst); 1809 1810 // mark nodes with solely empty child edges 1811 w1 = dst; 1812 if (index.sentinelOcc > 0) 1813 { 1814 TSize sentinelSize = index.sentinelOcc; 1815 if (index.interSentinelNodes && sentinelSize > 2) 1816 sentinelSize = 2; 1817 if (size == sentinelSize) w1 |= index.SENTINELS; 1818 } 1819 1820 assert(!(index.sentinelOcc == 1 && size == 1)); 1821 1822 dirAt(pos, index) = (w0 & ~index.BITMASK0) | lp; 1823 dirAt(pos + 1, index) = w1; 1824 } 1825 1826 return w1; 1827 } 1828 1829 template < typename TText, typename TIndexSpec, typename TSpec > 1830 inline typename Size< Index<TText, IndexWotd<TIndexSpec> > >::Type 1831 _wotdEvaluate(Iter< Index<TText, IndexWotd<TIndexSpec> >, VSTree<TSpec> > const &it) 1832 { 1833 typedef Index<TText, IndexWotd<TIndexSpec> > TIndex; 1834 typedef typename Size<TIndex>::Type TSize; 1835 1836 TIndex &index = const_cast<TIndex&>(container(it)); 1837 TSize pos = value(it).node; 1838 TSize w1 = dirAt(pos + 1, index); 1839 1840 // test for evaluation 1841 if (w1 & index.UNEVALUATED) 1842 { 1843 TSize dst = length(indexDir(index)); 1844 1845 TSize size = _sortWotdBucket( 1846 index, 1847 value(it).range.i1, 1848 w1 & index.BITMASK1, 1849 repLength(it)); 1850 /* 1851 if (globalDumpFlag) 1852 { 1853 std::cerr << '"' << representative(it) << '"' << std::endl; 1854 for (int i=0;i<length(getOccurrences(it));++i) 1855 std::cerr << getOccurrences(it)[i]<<'\t'<<suffix(indexText(index),getOccurrences(it)[i])<<std::endl; 1856 // _dumpFreq(index); 1857 } 1858 */ 1859 resize(indexDir(index), dst + size, Generous()); 1860 _storeWotdChildren(index, dst, repLength(it)); 1861 1862 // mark nodes with solely empty child edges 1863 w1 = dst; 1864 if (index.sentinelOcc > 0) 1865 { 1866 TSize sentinelSize = index.sentinelOcc; 1867 if (index.interSentinelNodes && sentinelSize > 2) 1868 sentinelSize = 2; 1869 if (size == sentinelSize) w1 |= index.SENTINELS; 1870 } 1871 1872 dirAt(pos + 1, index) = w1; 1873 } 1874 1875 return w1; 1876 } 1877 1878 1879 template <typename TText, typename TSpec> 1880 inline void 1881 _dump(Index<TText, IndexWotd<TSpec> > &index) 1882 { 1883 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1884 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 1885 typedef typename Value<TDir>::Type TDirValue; 1886 1887 std::cout << " Dir (wotd)" << std::endl; 1888 for(unsigned i=0; i < length(indexDir(index)); ++i) { 1889 TDirValue d = indexDir(index)[i]; 1890 std::cout << i << ": " << (d & index.BITMASK0); 1891 if (d & index.LEAF) std::cout << " (Leaf/Uneval)"; 1892 if (d & index.LAST_CHILD) std::cout << " (LastChild/SENTINELS)"; 1893 std::cout << std::endl; 1894 } 1895 1896 std::cout << std::endl << " SA" << std::endl; 1897 for(unsigned i=0; i < length(indexSA(index)); ++i) 1898 std::cout << i << ": " << indexSA(index)[i] << " " << suffix(indexText(index), indexSA(index)[i]) << std::endl; 1899 1900 std::cout << std::endl; 1901 1902 } 1903 1904 ////////////////////////////////////////////////////////////////////////////// 1905 // _goDownChar 1906 1907 template < typename TText, class TSpec, typename TValue > 1908 inline bool _goDownChar( 1909 Iter<Index<TText, IndexWotd<WotdOriginal> >, VSTree< TopDown<TSpec> > > &it, 1910 TValue c) 1911 { 1912 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1913 typedef typename Value<TIndex>::Type TIndexValue; 1914 1915 bool sorted = false; 1916 if (!goDown(it)) return false; 1917 do { 1918 if (parentEdgeLength(it) != 0) { 1919 TIndexValue edgeChar = parentEdgeLabel(it)[0]; 1920 if (edgeChar == c) return true; // the edge is found 1921 if (sorted && edgeChar > c) break; // too far (except the first one, 1922 } // child edges are sorted) 1923 sorted = true; 1924 } while (goRight(it)); 1925 _goUp(it); 1926 return false; 1927 } 1928 1929 template < typename TText, class TIndexSpec, class TSpec, typename TValue > 1930 inline bool _goDownChar( 1931 Iter<Index<TText, IndexWotd<TIndexSpec> >, VSTree< TopDown<TSpec> > > &it, 1932 TValue c) 1933 { 1934 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1935 typedef typename Value<TIndex>::Type TIndexValue; 1936 1937 if (!goDown(it)) return false; 1938 do { 1939 if (parentEdgeLength(it) != 0) { 1940 TIndexValue edgeChar = parentEdgeLabel(it)[0]; 1941 if (edgeChar == c) return true; // the edge is found 1942 if (edgeChar > c) break; // too far (child edges are sorted) 1943 } 1944 } while (goRight(it)); 1945 _goUp(it); 1946 return false; 1947 } 1948 1949 /* 1950 template < typename TText, typename TSpec, typename TValue > 1951 inline bool 1952 _getNodeByChar( 1953 Iter< Index<TText, IndexWotd<TSpec> >, VSTree<TSpec> > const &it, 1954 TValue c, 1955 typename VertexDescriptor< Index<TText, IndexWotd<TSpec> > >::Type &childDesc) 1956 { 1957 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1958 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 1959 typedef typename Fibre<TIndex, WotdSA>::Type TSA; 1960 1961 typedef typename Value<TSA>::Type TSAValue; 1962 typedef typename Value<TDir>::Type TDirValue; 1963 1964 typename Size<TIndex>::Type len = length(index); 1965 typename VertexDescriptor<TIndex>::Type desc; 1966 1967 TSAValue pos = _firstSuffixOfBucket(index, value(it).node); 1968 while (pos == len || value < (c = textAt(index, pos + value.i2))) { 1969 value.node += (dirAt(value.node, index) & index.LEAF)? 1: 2; 1970 pos = _firstSuffixOfBucket(index, value.node); 1971 } 1972 assert(pos != len); 1973 1974 return c == value; 1975 } 1976 */ 1977 1978 ////////////////////////////////////////////////////////////////////////////// 1979 // interface for automatic index creation 1980 1981 template <typename TText, typename TSpec> 1982 inline void _wotdCreateFirstLevel(Index<TText, IndexWotd<TSpec> > &index) 1983 { 1984 typedef Index<TText, IndexWotd<TSpec> > TIndex; 1985 typedef typename Value<TIndex>::Type TValue; 1986 typedef typename Size<TIndex>::Type TSize; 1987 1988 resize(index.tempOcc, ValueSize<TValue>::VALUE + 1, Exact()); 1989 resize(index.tempBound, ValueSize<TValue>::VALUE + 1, Exact()); 1990 1991 TSize size; 1992 if (empty(indexSA(index))) 1993 { 1994 resize(indexSA(index), length(indexRawText(index)), Exact()); 1995 size = _sortFirstWotdBucket(index); 1996 } else 1997 { 1998 size = _sortWotdBucket(index, 0, length(indexSA(index)), 0); 1999 } 2000 2001 if (size > 0) 2002 { 2003 resize(indexDir(index), size + 2, Generous()); 2004 _storeWotdChildren(index, 2, 0); 2005 2006 // mark nodes with solely empty child edges 2007 TSize w1 = 2; 2008 if (index.sentinelOcc > 0) 2009 { 2010 TSize sentinelSize = index.sentinelOcc; 2011 if (index.interSentinelNodes && sentinelSize > 2) 2012 sentinelSize = 2; 2013 if (size == sentinelSize) w1 |= index.SENTINELS; 2014 } 2015 2016 2017 dirAt(0, index) = 0 | index.LAST_CHILD; 2018 dirAt(1, index) = w1; 2019 2020 } else { 2021 resize(indexDir(index), 1); 2022 dirAt(0, index) = 0 | index.LAST_CHILD | index.LEAF; 2023 } 2024 } 2025 2026 template <typename TText, typename TSpec> 2027 inline bool indexCreate(Index<TText, IndexWotd<TSpec> > &index, WotdDir const, Default const) 2028 { 2029 _wotdCreateFirstLevel(index); 2030 return true; 2031 } 2032 2033 template <typename TText, typename TSpec> 2034 inline bool indexCreate(Index<TText, IndexWotd<TSpec> > &index, WotdDir const, Trie const) 2035 { 2036 resize(indexSA(index), length(indexText(index)), Exact()); 2037 fillSuffixArray(indexSA(index), indexText(index), Trie()); 2038 _wotdCreateFirstLevel(index); 2039 return true; 2040 } 2041 2042 2043 ////////////////////////////////////////////////////////////////////////////// 2044 // clear 2045 2046 template < typename TText, typename TSpec > 2047 inline void clear(Index<TText, IndexWotd<TSpec> > &index) 2048 { 2049 clear(getFibre(index, WotdSA())); 2050 clear(getFibre(index, WotdDir())); 2051 } 2052 2053 ////////////////////////////////////////////////////////////////////////////// 2054 // open 2055 2056 template < typename TText, typename TSpec > 2057 inline bool open( 2058 Index< TText, IndexWotd<TSpec> > &index, 2059 const char *fileName, 2060 int openMode) 2061 { 2062 typedef Index<TText, IndexWotd<TSpec> > TIndex; 2063 typedef typename Value<TIndex>::Type TValue; 2064 2065 String<char> name; 2066 2067 name = fileName; append(name, ".txt"); 2068 if ((!open(getFibre(index, WotdText()), toCString(name), openMode)) && 2069 (!open(getFibre(index, WotdText()), fileName, openMode))) return false; 2070 2071 name = fileName; append(name, ".sa"); 2072 if (!open(getFibre(index, WotdSA()), toCString(name), openMode)) return false; 2073 name = fileName; append(name, ".dir"); 2074 2075 if (!open(getFibre(index, WotdDir()), toCString(name), openMode)) return false; 2076 2077 if (!empty(getFibre(index, WotdDir()))) 2078 { 2079 resize(index.tempOcc, ValueSize<TValue>::VALUE + 1); 2080 resize(index.tempBound, ValueSize<TValue>::VALUE + 1); 2081 } 2082 2083 return true; 2084 } 2085 template < typename TText, typename TSpec > 2086 inline bool open( 2087 Index< TText, IndexWotd<TSpec> > &index, 2088 const char *fileName) 2089 { 2090 return open(index, fileName, OPEN_RDONLY); 2091 } 2092 2093 2094 ////////////////////////////////////////////////////////////////////////////// 2095 // save 2096 2097 template < typename TText, typename TSpec > 2098 inline bool save( 2099 Index< TText, IndexWotd<TSpec> > &index, 2100 const char *fileName, 2101 int openMode) 2102 { 2103 String<char> name; 2104 2105 name = fileName; append(name, ".txt"); 2106 if ((!save(getFibre(index, WotdText()), toCString(name), openMode)) && 2107 (!save(getFibre(index, WotdText()), fileName, openMode))) return false; 2108 2109 name = fileName; append(name, ".sa"); 2110 if (!save(getFibre(index, WotdSA()), toCString(name), openMode)) return false; 2111 name = fileName; append(name, ".dir"); 2112 2113 if (!save(getFibre(index, WotdDir()), toCString(name), openMode)) return false; 2114 2115 return true; 2116 } 2117 template < typename TText, typename TSpec > 2118 inline bool save( 2119 Index< TText, IndexWotd<TSpec> > &index, 2120 const char *fileName) 2121 { 2122 return save(index, fileName, OPEN_WRONLY | OPEN_CREATE); 2123 } 2124 } 2125 2126 #endif //#ifndef SEQAN_HEADER_... 2127