1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: David Weese <david.weese@fu-berlin.de> 33 // ========================================================================== 34 35 #ifndef SEQAN_HEADER_INDEX_DFI_H 36 #define SEQAN_HEADER_INDEX_DFI_H 37 38 namespace seqan 39 { 40 41 42 ////////////////////////////////////////////////////////////////////////////// 43 // simple struct to store frequency vectors 44 45 struct DfiEntry_ 46 { 47 unsigned lastSeqSeen; 48 // String<unsigned, Array<2> > freq; 49 String<unsigned> freq; 50 }; 51 52 53 ////////////////////////////////////////////////////////////////////////////// 54 // constant frequency predicate 55 56 template <bool Result> 57 struct DfiPredDefault_ 58 { operatorDfiPredDefault_59 inline bool operator()(DfiEntry_ const &) const { 60 return Result; 61 } 62 }; 63 64 65 ////////////////////////////////////////////////////////////////////////////// 66 // Dfi - The Deferred Frequency Index 67 68 /*! 69 * @class IndexDfi 70 * @extends IndexWotd 71 * @headerfile <seqan/index.h> 72 * 73 * @brief The Deferred Frequency Index (see Weese and Schulz, "Efficient string mining under constraints via 74 * the deferred frequency index"). 75 * 76 * @signature template <typename TText, typename TPredHull, typename TPred> 77 * class Index<TText, IndexWotd< Dfi<TPredHull, TPred> > >; 78 * 79 * @tparam TText The @link TextConcept text @endlink. 80 * @tparam TPred An arbitrary frequency predicate. 81 * @tparam TPredHull A monotonic hull of <tt>TPred</tt> 82 * 83 * This index is based on a lazy suffix tree (see @link IndexWotd @endlink). All <tt>TPredHull</tt> sufficing 84 * nodes can be iterated using a @link TopDownIterator @endlink. To iterate the exact solution set of <tt>TPred</tt>, 85 * use a @link TopDownHistoryIterator @endlink of this index. 86 */ 87 88 /*! 89 * @defgroup DfiIndexFibres Dfi Index Fibres 90 * @brief Tag to select a specific fibre (e.g. table, object, ...) of an @link 91 * IndexDfi @endlink index. 92 * 93 * These tags can be used to get @link Fibre @endlink of an @link IndexDfi @endlink. 94 * 95 * @see Fibre 96 * @see Index#getFibre 97 * @see IndexDfi 98 * 99 * @tag DfiIndexFibres#DfiDir 100 * @brief The child table. 101 * 102 * @tag DfiIndexFibres#DfiRawSA 103 * @brief The raw suffix array. 104 * 105 * @tag DfiIndexFibres#DfiText 106 * @brief The original text the index should be based on. 107 * 108 * @tag DfiIndexFibres#DfiRawText 109 * @brief The raw text the index is really based on. 110 * 111 * @tag DfiIndexFibres#DfiSA 112 * @brief The suffix array. 113 */ 114 115 ////////////////////////////////////////////////////////////////////////////// 116 /*! 117 * @fn IndexDfi#indexSA 118 * @headerfile <seqan/index.h> 119 * @brief Shortcut for <tt>getFibre(.., DfiSA)</tt>. 120 * 121 * @signature TSa indexSA(index); 122 * 123 * @param[in] index The @link IndexDfi @endlink object holding the fibre. 124 * 125 * @return TSa A reference to the @link DfiIndexFibres#DfiSA @endlink fibre (partially sorted suffix array). 126 */ 127 128 /*! 129 * @fn IndexDfi#indexDir 130 * @headerfile <seqan/index.h> 131 * @brief Shortcut for <tt>getFibre(.., DfiDir())</tt>. 132 * @signature TFibre indexDir(index); 133 * 134 * @param[in] index The @link IndexDfi @endlink object holding the fibre. 135 * 136 * @return TFibre A reference to the @link DfiIndexFibres#DfiDir @endlink fibre (tree structure). 137 */ 138 139 /*! 140 * @fn IndexDfi#saAt 141 * @headerfile <seqan/index.h> 142 * @note Advanced functionality, not commonly used. 143 * @brief Shortcut for <tt>value(indexSA(..), ..)</tt>. 144 * 145 * @signature TValue saAt(position, index); 146 * 147 * @param[in] index The @link IndexDfi @endlink object holding the fibre. 148 * @param[in] position A position in the array on which the value should be accessed. 149 * 150 * @return TValue A reference or proxy to the value in the @link DfiIndexFibres#DfiSA @endlink fibre. 151 * To be more precise, a reference to a position containing a value of type 152 * @link SAValue @endlink is returned (or a proxy). 153 */ 154 155 /*! 156 * @fn IndexDfi#dirAt 157 * @headerfile <seqan/index.h> 158 * @brief Shortcut for <tt>value(indexDir(index), position)</tt>. 159 * 160 * @signature TFibre dirAt(position, index); 161 * 162 * @param[in] index The @link IndexDfi @endlink object holding the fibre. 163 * @param[in] position A position in the array on which the value should be accessed. 164 * 165 * @return TFibre A reference to the @link DfiIndexFibres#DfiDir @endlink fibre. 166 */ 167 168 template < 169 typename TPredHull = DfiPredDefault_<true>, 170 typename TPred = DfiPredDefault_<true> 171 > 172 struct Dfi; 173 174 template < 175 typename TObject, 176 typename TPredHull, 177 typename TPred 178 > 179 class Index<TObject, IndexWotd< Dfi<TPredHull, TPred> > >: 180 public Index<TObject, IndexWotd<> > 181 { 182 public: 183 184 typedef Index<TObject, IndexWotd<> > TBase; 185 186 // extending base class 187 typedef typename TBase::TText TText; 188 typedef typename TBase::TValue TValue; 189 typedef typename TBase::TSize TSize; 190 191 using TBase::LEAF; 192 using TBase::LAST_CHILD; 193 using TBase::UNEVALUATED; 194 using TBase::SENTINELS; 195 196 // frequency strings for Dfi 197 typedef DfiEntry_ TDFIEntry; 198 typedef String< 199 TDFIEntry, 200 Array<ValueSize<TValue>::VALUE> > TDFIEntries; 201 typedef String<unsigned> TDFIDatasets; 202 203 // 1st word flags 204 static TSize const DFI_PRED_HULL = (TSize)1 << (BitsPerValue<TSize>::VALUE - 3); // this node fulfills all monotonic frequency predicates (e.g. min_freq) 205 static TSize const DFI_PRED = (TSize)1 << (BitsPerValue<TSize>::VALUE - 4); // this node fulfills all frequency predicates (e.g. emerging, minmax_freq) 206 static TSize const DFI_PARENT_FREQ = (TSize)1 << (BitsPerValue<TSize>::VALUE - 5); // this node has the same frequency as its parent 207 208 static TSize const BITMASK0 = ~(LEAF | LAST_CHILD | DFI_PRED_HULL | DFI_PRED | DFI_PARENT_FREQ); 209 static TSize const BITMASK1 = ~(UNEVALUATED | SENTINELS); 210 211 TDFIEntry nodeEntry; // current nodes frequencies 212 TDFIEntries childEntry; // child frequencies for each first letter of outgoing edges 213 TDFIDatasets ds; 214 215 TPredHull predHull; 216 TPred pred; 217 218 /*! 219 * @fn IndexDfi::Index 220 * @brief Constructor 221 * 222 * @signature Index::Index(); 223 * @signature Index::Index(index); 224 * @signature Index::Index(text[, predHull][, pred]); 225 * 226 * @param[in] index Other Index object to copy from. 227 * @param[in] text The text to be indexed. 228 * @param[in] predHull (TPredHull) A monotonic hull of TPred. (optional) 229 * @param[in] pred (TPred) An arbitrary frequeny predicate. (optional) 230 */ 231 Index()232 Index() {} 233 Index(Index & other)234 Index(Index &other): 235 TBase((TBase &)other), 236 childEntry(other.childEntry), 237 ds(other.ds), 238 predHull(other.predHull), 239 pred(other.pred) {} 240 Index(Index const & other)241 Index(Index const &other): 242 TBase((TBase const &)other), 243 childEntry(other.childEntry), 244 ds(other.ds), 245 predHull(other.predHull), 246 pred(other.pred) {} 247 248 template <typename TText_> Index(TText_ & _text)249 Index(TText_ &_text): 250 TBase(_text) {} 251 252 template <typename TText_> Index(TText_ & _text,TPredHull const & _predHull)253 Index(TText_ &_text, TPredHull const &_predHull): 254 TBase(_text), 255 predHull(_predHull) {} 256 257 template <typename TText_> Index(TText_ & _text,TPredHull const & _predHull,TPred const & _pred)258 Index(TText_ &_text, TPredHull const &_predHull, TPred const &_pred): 259 TBase(_text), 260 predHull(_predHull), 261 pred(_pred) {} 262 }; 263 264 265 template < 266 typename TText, 267 typename TPredHull, 268 typename TPred, 269 typename TSpec 270 > nodePredicate(Iter<Index<TText,IndexWotd<Dfi<TPredHull,TPred>>>,TSpec> & it)271 inline bool nodePredicate( 272 Iter<Index<TText, IndexWotd< Dfi<TPredHull, TPred> > >, TSpec> &it) 273 { 274 typedef Index<TText, IndexWotd< Dfi<TPredHull, TPred> > > TIndex; 275 return (dirAt(value(it).node, container(it)) & TIndex::DFI_PRED) != 0; 276 } 277 278 template < 279 typename TText, 280 typename TPredHull, 281 typename TPred, 282 typename TSpec 283 > nodeHullPredicate(Iter<Index<TText,IndexWotd<Dfi<TPredHull,TPred>>>,TSpec> & it)284 inline bool nodeHullPredicate( 285 Iter<Index<TText, IndexWotd< Dfi<TPredHull, TPred> > >, TSpec> &it) 286 { 287 typedef Index<TText, IndexWotd< Dfi<TPredHull, TPred> > > TIndex; 288 return (dirAt(value(it).node, container(it)) & TIndex::DFI_PRED_HULL) != 0; 289 } 290 291 292 ////////////////////////////////////////////////////////////////////////////// 293 // we modify counting sort used in the wotd-index 294 // to count the frequencies in passing 295 296 template < typename TText, typename TSpec, typename TPredHull, typename TPred > 297 typename Size< Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > > >::Type _sortFirstWotdBucket(Index<StringSet<TText,TSpec>,IndexWotd<Dfi<TPredHull,TPred>>> & index)298 _sortFirstWotdBucket(Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > > &index) 299 { 300 typedef Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > > TIndex; 301 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 302 typedef typename TIndex::TCounter TCounter; 303 304 typedef typename TIndex::TDFIEntry TDFIEntry; 305 typedef typename TIndex::TDFIDatasets TDFIDatasets; 306 typedef typename Iterator<TDFIDatasets, Standard>::Type TDFIDatasetsIterator; 307 308 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 309 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 310 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 311 typedef typename Size<TText>::Type TSize; 312 typedef typename Value<TText>::Type TValue; 313 314 StringSet<TText, TSpec> const &stringSet = indexText(index); 315 TCounter &occ = index.tempOcc; 316 TCounter &bound = index.tempBound; 317 318 // 1. clear counters and copy SA to temporary SA 319 arrayFill(begin(occ, Standard()), end(occ, Standard()), 0); 320 321 index.nodeEntry.lastSeqSeen = -1; 322 for(unsigned j = 0; j < length(index.nodeEntry.freq); ++j) 323 index.nodeEntry.freq[j] = 0; 324 for(unsigned i = 0; i < ValueSize<TValue>::VALUE; ++i) 325 { 326 TDFIEntry &childEntry = index.childEntry[i]; 327 childEntry.lastSeqSeen = -1; 328 for(unsigned j = 0; j < length(childEntry.freq); ++j) 329 childEntry.freq[j] = 0; 330 } 331 332 // 2. count characters 333 _wotdCountChars(occ, stringSet); 334 335 // 3. cummulative sum 336 TSize requiredSize = _wotdCummulativeSum(bound, occ); 337 338 // 4. fill suffix array 339 unsigned dsNo = 0; 340 TDFIDatasetsIterator currentDS = begin(index.ds, Standard()) + 1; 341 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 342 { 343 // search for surrounding dataset 344 while (seqNo >= *currentDS) { 345 ++dsNo; 346 ++currentDS; 347 } 348 349 TSA &sa = indexSA(index); 350 TSAIterator saBeg = begin(sa, Standard()); 351 TCntIterator boundBeg = begin(bound, Standard()); 352 353 typename Value<TSA>::Type localPos; 354 assignValueI1(localPos, seqNo); 355 assignValueI2(localPos, 0); 356 357 TText const &text = value(stringSet, seqNo); 358 TTextIterator itText = begin(text, Standard()); 359 TTextIterator itTextEnd = end(text, Standard()); 360 for(; itText != itTextEnd; ++itText) 361 { 362 unsigned ord = ordValue(*itText); 363 TDFIEntry &childEntry = index.childEntry[ord]; 364 // new sequence is seen for <ord> character 365 // -> increment frequency of current dataset 366 if (childEntry.lastSeqSeen != seqNo) 367 { 368 childEntry.lastSeqSeen = seqNo; 369 ++childEntry.freq[dsNo]; 370 } 371 372 *(saBeg + (*(boundBeg + ord))++) = localPos; 373 assignValueI2(localPos, getValueI2(localPos) + 1); 374 } 375 } 376 index.sentinelOcc = 0; 377 index.sentinelBound = 0; 378 379 return requiredSize; 380 } 381 382 // sort bucket using radixsort 383 // - all buckets are in lexicographical order 384 // - SA[left,right) contains real SA entries (the beginning positions of the suffices) 385 template < typename TText, typename TSpec, typename TPredHull, typename TPred, typename TBeginPos, typename TEndPos, typename TSize > _sortWotdBucket(Index<StringSet<TText,TSpec>,IndexWotd<Dfi<TPredHull,TPred>>> & index,TBeginPos left,TEndPos right,TSize prefixLen)386 TSize _sortWotdBucket( 387 Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > > &index, 388 TBeginPos left, 389 TEndPos right, 390 TSize prefixLen) 391 { 392 typedef Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > > TIndex; 393 typedef typename Fibre<TIndex, WotdSA >::Type TSA; 394 typedef typename TIndex::TCounter TCounter; 395 typedef typename TIndex::TTempSA TTempSA; 396 typedef typename TIndex::TDFIEntry TDFIEntry; 397 typedef typename TIndex::TDFIDatasets TDFIDatasets; 398 typedef typename Iterator<TDFIDatasets, Standard>::Type TDFIDatasetsIterator; 399 400 typedef typename Iterator<TText const, Standard>::Type TTextIterator; 401 typedef typename Iterator<TSA, Standard>::Type TSAIterator; 402 typedef typename Iterator<TTempSA const, Standard>::Type TTempSAIterator; 403 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 404 typedef typename Size<TText>::Type TTextSize; 405 typedef typename Value<TText>::Type TValue; 406 407 StringSet<TText, TSpec> const &stringSet = indexText(index); 408 TTempSA const &tempSA = index.tempSA; 409 TCounter &occ = index.tempOcc; 410 TCounter &bound = index.tempBound; 411 412 // 1. clear counters and copy SA to temporary SA 413 TCntIterator occBeg = begin(occ, Standard()); 414 415 arrayFill(occBeg, end(occ, Standard()), 0); 416 index.tempSA = infix(indexSA(index), left, right); 417 418 index.nodeEntry.lastSeqSeen = -1; 419 for(unsigned j = 0; j < length(index.nodeEntry.freq); ++j) 420 index.nodeEntry.freq[j] = 0; 421 for(unsigned i = 0; i < ValueSize<TValue>::VALUE; ++i) { 422 TDFIEntry &childEntry = index.childEntry[i]; 423 childEntry.lastSeqSeen = -1; 424 for(unsigned j = 0; j < length(childEntry.freq); ++j) 425 childEntry.freq[j] = 0; 426 } 427 428 index.sentinelOcc = 0; 429 index.sentinelBound = 0; 430 431 // 2. count characters 432 { 433 TDFIDatasetsIterator currentDS = begin(index.ds, Standard()) + 1; 434 TTextIterator itText = TTextIterator(); 435 TTempSAIterator itSA = begin(tempSA, Standard()); 436 TTempSAIterator itSAEnd = end(tempSA, Standard()); 437 TTextSize textLength = 0; 438 unsigned lastSeqSeen = (unsigned)-1; 439 unsigned dsNo = 0; 440 Pair<unsigned, TTextSize> lPos; 441 for (; itSA != itSAEnd; ++itSA) 442 { 443 posLocalize(lPos, *itSA, stringSetLimits(index)); 444 if (lastSeqSeen != getSeqNo(lPos)) 445 { 446 lastSeqSeen = getSeqNo(lPos); 447 448 // search for surrounding dataset 449 while (lastSeqSeen >= *currentDS) 450 { 451 ++dsNo; 452 ++currentDS; 453 } 454 ++index.nodeEntry.freq[dsNo]; 455 456 // shift textBegin and textLength by prefixLen 457 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 458 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 459 } 460 if (textLength > getSeqOffset(lPos)) 461 { 462 unsigned ord = ordValue(*(itText + getSeqOffset(lPos))); 463 TDFIEntry &childEntry = index.childEntry[ord]; 464 // new sequence is seen for <ord> character 465 // -> increment frequency of current dataset 466 if (childEntry.lastSeqSeen != lastSeqSeen) 467 { 468 childEntry.lastSeqSeen = lastSeqSeen; 469 ++childEntry.freq[dsNo]; 470 } 471 ++*(occBeg + ord); 472 } else 473 if (textLength == getSeqOffset(lPos)) ++index.sentinelOcc; 474 } 475 } 476 477 // 3. cumulative sum 478 TSize requiredSize = 0; 479 if (index.interSentinelNodes) { 480 if (index.sentinelOcc != 0) 481 requiredSize = (index.sentinelOcc > 1)? 2: 1; // insert *one* $-edge for all $_i suffices 482 } else 483 requiredSize = index.sentinelOcc; // insert each $_i suffix one-by-one 484 485 requiredSize += _wotdCummulativeSum(bound, occ, left + index.sentinelOcc); 486 index.sentinelBound = left; 487 /* 488 std::cout << "$=" << index.sentinelOcc<<"@"<<index.sentinelBound << "\t"; 489 for(int i=0; i<length(occ);++i) 490 if (occ[i]) 491 std::cout << i << "=" << occ[i]<<"@"<<bound[i] << "\t"; 492 */ 493 // 4. fill suffix array 494 { 495 TSA &sa = indexSA(index); 496 TSAIterator saBeg = begin(sa, Standard()); 497 TCntIterator boundBeg = begin(bound, Standard()); 498 499 TTextIterator itText = TTextIterator(); 500 TTempSAIterator itSA = begin(tempSA, Standard()); 501 TTempSAIterator itSAEnd = end(tempSA, Standard()); 502 TTextSize textLength = 0; 503 unsigned lastSeqSeen = (unsigned)-1; 504 Pair<unsigned, TTextSize> lPos; 505 for(; itSA != itSAEnd; ++itSA) 506 { 507 posLocalize(lPos, *itSA, stringSetLimits(index)); 508 if (lastSeqSeen != getSeqNo(lPos)) 509 { 510 lastSeqSeen = getSeqNo(lPos); 511 512 // shift textBegin and textLength by prefixLen 513 textLength = length(stringSet[lastSeqSeen]) - prefixLen; 514 itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen; 515 } 516 if (textLength > getSeqOffset(lPos)) 517 *(saBeg + (*(boundBeg + ordValue(*(itText + getSeqOffset(lPos)))))++) = *itSA; 518 else 519 if (textLength == getSeqOffset(lPos)) 520 *(saBeg + index.sentinelBound++) = *itSA; 521 } 522 } 523 524 return requiredSize; 525 } 526 527 528 // store buckets into directory 529 // storing SA links and topology links in Dir 530 template <typename TText, typename TPredHull, typename TPred, typename TSize> 531 inline void _storeWotdChildren(Index<TText,IndexWotd<Dfi<TPredHull,TPred>>> & index,TSize dirOfs,TSize lcp)532 _storeWotdChildren( 533 Index<TText, IndexWotd<Dfi<TPredHull, TPred> > > &index, 534 TSize dirOfs, 535 TSize lcp) 536 { 537 typedef Index<TText, IndexWotd<Dfi<TPredHull, TPred> > > TIndex; 538 539 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 540 typedef typename TIndex::TCounter TCounter; 541 typedef typename TIndex::TDFIEntries TEntries; 542 543 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 544 typedef typename Size<TDir>::Type TDirSize; 545 typedef typename Iterator<TCounter, Standard>::Type TCntIterator; 546 typedef typename Iterator<TEntries, Standard>::Type TEntriesIterator; 547 548 typedef typename Value<TCounter>::Type TCntValue; 549 typedef typename Value<TDir>::Type TDirValue; 550 551 TDirIterator itDirBegin = begin(indexDir(index), Standard()) + dirOfs; 552 TDirIterator itDirEnd = end(indexDir(index), Standard()); 553 TDirIterator itDir = itDirBegin; 554 TDirIterator itPrev = itDirEnd; 555 556 TCntIterator it = begin(index.tempOcc, Standard()); 557 TCntIterator bit = begin(index.tempBound, Standard()); 558 TCntIterator itEnd = end(index.tempOcc, Standard()); 559 TEntriesIterator itEntry = begin(index.childEntry, Standard()); 560 561 TCntValue occ; 562 if (index.sentinelOcc != 0) 563 { 564 TDirValue orMask = (index.predHull(*itEntry))? index.DFI_PRED_HULL: 0; 565 if (index.pred(*itEntry)) orMask |= index.DFI_PRED; 566 567 if (index.sentinelOcc > 1 && index.interSentinelNodes) // occurs on multiseqs 568 { 569 itPrev = itDir; 570 *itDir = (index.sentinelBound - index.sentinelOcc) | orMask; ++itDir; 571 *itDir = index.sentinelBound | index.UNEVALUATED; ++itDir; 572 } else 573 { 574 orMask |= index.LEAF; 575 } 576 //NOTE(h-2): previosly the following block was indented, as though belonging to 577 // to the else statement. If something here is unexpected, please investigate. 578 for (TDirSize d = index.sentinelBound - index.sentinelOcc; d != index.sentinelBound; ++d) 579 { 580 itPrev = itDir; 581 *itDir = d | orMask; ++itDir; 582 } 583 } 584 585 for (; it != itEnd; ++it, ++bit, ++itEntry) 586 { 587 if ((occ = *it) == 0) continue; 588 589 TDirValue orMask = (index.predHull(*itEntry))? index.DFI_PRED_HULL: 0; 590 if (index.pred(*itEntry)) orMask |= index.DFI_PRED; 591 if ((*itEntry).freq == index.nodeEntry.freq) orMask |= index.DFI_PARENT_FREQ; 592 593 if (occ > 1) { 594 itPrev = itDir; 595 *itDir = (*bit - occ) | orMask; ++itDir; 596 *itDir = *bit | index.UNEVALUATED; ++itDir; 597 } else { 598 itPrev = itDir; 599 *itDir = (*bit - occ) | index.LEAF | orMask; ++itDir; 600 } 601 } 602 603 // first child gets the mutual lcp value of the children (== parent repLength) 604 *itDirBegin = ((*itDirBegin) & ~index.BITMASK0) | lcp; 605 606 // mark the last child 607 if (itPrev != itDirEnd) 608 *itPrev |= index.LAST_CHILD; 609 } 610 611 ////////////////////////////////////////////////////////////////////////////// 612 // debug output 613 614 template <typename TText, typename TPredHull, typename TPred> 615 inline void _dump(Index<TText,IndexWotd<Dfi<TPredHull,TPred>>> & index)616 _dump(Index<TText, IndexWotd< Dfi<TPredHull, TPred> > > &index) 617 { 618 typedef IndexWotd< Dfi<TPredHull, TPred> > TSpec; 619 typedef Index<TText, TSpec> TIndex; 620 typedef typename Fibre<TIndex, WotdDir>::Type TDir; 621 typedef typename Value<TDir>::Type TDirValue; 622 623 std::cout << " Dir (wotd/Dfi)" << std::endl; 624 for(unsigned i=0; i < length(indexDir(index)); ++i) { 625 TDirValue d = indexDir(index)[i]; 626 std::cout << i << ": " << (d & index.BITMASK0); 627 if (d & index.LEAF) std::cout << " (Leaf/Uneval)"; 628 if (d & index.LAST_CHILD) std::cout << " (LastChild/SENTINELS)"; 629 if (d & index.DFI_PRED_HULL) std::cout << " (PRED_HULL)"; 630 if (d & index.DFI_PRED) std::cout << " (PRED)"; 631 if (d & index.DFI_PARENT_FREQ) std::cout << " (PARENT_FREQ)"; 632 std::cout << std::endl; 633 } 634 635 std::cout << std::endl << " SA" << std::endl; 636 for(unsigned i=0; i < length(indexSA(index)); ++i) 637 std::cout << i << ": " << indexSA(index)[i] << " " << suffix(indexText(index), indexSA(index)[i]) << std::endl; 638 639 std::cout << std::endl; 640 } 641 642 template <typename TText, typename TPredHull, typename TPred> 643 inline void _dumpFreq(Index<TText,IndexWotd<Dfi<TPredHull,TPred>>> & index)644 _dumpFreq(Index<TText, IndexWotd< Dfi<TPredHull, TPred> > > &index) 645 { 646 typedef Dfi<TPredHull, TPred> TSpec; 647 typedef Index<TText, IndexWotd<TSpec> > TIndex; 648 typedef typename Value<TIndex>::Type TValue; 649 650 std::cout << " ParentF = ("; 651 for(unsigned d=0; d<length(index.nodeEntry.freq); ++d) { 652 if (d>0) std::cout << ","; 653 std::cout << index.nodeEntry.freq[d]; 654 } 655 std::cout << ")" << std::endl; 656 657 for(unsigned i=0; i<length(index.tempOcc); ++i) 658 if (index.tempOcc[i] != 0) { 659 std::cout << " Freq[" << (TValue)i << "] = ("; 660 for(unsigned d=0; d<length(index.childEntry[i].freq); ++d) { 661 if (d>0) std::cout << ","; 662 std::cout << index.childEntry[i].freq[d]; 663 } 664 std::cout << ")" << std::endl; 665 } 666 } 667 668 ////////////////////////////////////////////////////////////////////////////// 669 // interface for automatic index creation 670 671 template <typename TText, typename TPredHull, typename TPred> indexCreate(Index<TText,IndexWotd<Dfi<TPredHull,TPred>>> & index,WotdDir const,Default const)672 inline bool indexCreate(Index<TText, IndexWotd<Dfi<TPredHull, TPred> > > &index, WotdDir const, Default const) 673 { 674 typedef Index<TText, IndexWotd<Dfi<TPredHull, TPred> > > TIndex; 675 typedef typename Value<TIndex>::Type TValue; 676 677 resize(index.childEntry, (unsigned) ValueSize<TValue>::VALUE); 678 if (empty(index.ds)) { 679 resize(index.ds, 2); 680 index.ds[0] = 0; 681 index.ds[1] = countSequences(index); 682 } 683 resize(index.nodeEntry.freq, length(index.ds) - 1, Exact()); 684 for(unsigned i = 0; i < ValueSize<TValue>::VALUE; ++i) 685 resize(index.childEntry[i].freq, length(index.ds) - 1, Exact()); 686 687 _wotdCreateFirstLevel(index); 688 return true; 689 } 690 } 691 692 #endif //#ifndef SEQAN_HEADER_... 693