1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34 
35 #ifndef SEQAN_HEADER_INDEX_DFI_H
36 #define SEQAN_HEADER_INDEX_DFI_H
37 
38 namespace seqan
39 {
40 
41 
42 //////////////////////////////////////////////////////////////////////////////
43 // simple struct to store frequency vectors
44 
45     struct DfiEntry_
46     {
47         unsigned                        lastSeqSeen;
48 //        String<unsigned, Array<2> >        freq;
49         String<unsigned>                freq;
50     };
51 
52 
53 //////////////////////////////////////////////////////////////////////////////
54 // constant frequency predicate
55 
56     template <bool Result>
57     struct DfiPredDefault_
58     {
operatorDfiPredDefault_59         inline bool operator()(DfiEntry_ const &) const {
60             return Result;
61         }
62     };
63 
64 
65 //////////////////////////////////////////////////////////////////////////////
66 // Dfi - The Deferred Frequency Index
67 
68 /*!
69  * @class IndexDfi
70  * @extends IndexWotd
71  * @headerfile <seqan/index.h>
72  *
73  * @brief The Deferred Frequency Index (see Weese and Schulz, "Efficient string mining under constraints via
74  *        the deferred frequency index").
75  *
76  * @signature template <typename TText, typename TPredHull, typename TPred>
77  *            class Index<TText, IndexWotd< Dfi<TPredHull, TPred> > >;
78  *
79  * @tparam TText     The @link TextConcept text @endlink.
80  * @tparam TPred     An arbitrary frequency predicate.
81  * @tparam TPredHull A monotonic hull of <tt>TPred</tt>
82  *
83  * This index is based on a lazy suffix tree (see @link IndexWotd @endlink).  All <tt>TPredHull</tt> sufficing
84  * nodes can be iterated using a @link TopDownIterator @endlink.  To iterate the exact solution set of <tt>TPred</tt>,
85  * use a @link TopDownHistoryIterator @endlink of this index.
86  */
87 
88 /*!
89  * @defgroup DfiIndexFibres Dfi Index Fibres
90  * @brief Tag to select a specific fibre (e.g. table, object, ...) of an @link
91  *        IndexDfi @endlink index.
92  *
93  * These tags can be used to get @link Fibre @endlink of an @link IndexDfi @endlink.
94  *
95  * @see Fibre
96  * @see Index#getFibre
97  * @see IndexDfi
98  *
99  * @tag DfiIndexFibres#DfiDir
100  * @brief The child table.
101  *
102  * @tag DfiIndexFibres#DfiRawSA
103  * @brief The raw suffix array.
104  *
105  * @tag DfiIndexFibres#DfiText
106  * @brief The original text the index should be based on.
107  *
108  * @tag DfiIndexFibres#DfiRawText
109  * @brief The raw text the index is really based on.
110  *
111  * @tag DfiIndexFibres#DfiSA
112  * @brief The suffix array.
113  */
114 
115 //////////////////////////////////////////////////////////////////////////////
116 /*!
117  * @fn IndexDfi#indexSA
118  * @headerfile <seqan/index.h>
119  * @brief Shortcut for <tt>getFibre(.., DfiSA)</tt>.
120  *
121  * @signature TSa indexSA(index);
122  *
123  * @param[in] index The @link IndexDfi @endlink object holding the fibre.
124  *
125  * @return TSa A reference to the @link DfiIndexFibres#DfiSA @endlink fibre (partially sorted suffix array).
126  */
127 
128 /*!
129  * @fn IndexDfi#indexDir
130  * @headerfile <seqan/index.h>
131  * @brief Shortcut for <tt>getFibre(.., DfiDir())</tt>.
132  * @signature TFibre indexDir(index);
133  *
134  * @param[in] index The @link IndexDfi @endlink object holding the fibre.
135  *
136  * @return TFibre A reference to the @link DfiIndexFibres#DfiDir @endlink fibre (tree structure).
137  */
138 
139 /*!
140  * @fn IndexDfi#saAt
141  * @headerfile <seqan/index.h>
142  * @note Advanced functionality, not commonly used.
143  * @brief Shortcut for <tt>value(indexSA(..), ..)</tt>.
144  *
145  * @signature TValue saAt(position, index);
146  *
147  * @param[in] index The @link IndexDfi @endlink object holding the fibre.
148  * @param[in] position A position in the array on which the value should be accessed.
149  *
150  * @return TValue A reference or proxy to the value in the @link DfiIndexFibres#DfiSA @endlink fibre.
151  *                To be more precise, a reference to a position containing a value of type
152  *                @link SAValue @endlink is returned (or a proxy).
153  */
154 
155 /*!
156  * @fn IndexDfi#dirAt
157  * @headerfile <seqan/index.h>
158  * @brief Shortcut for <tt>value(indexDir(index), position)</tt>.
159  *
160  * @signature TFibre dirAt(position, index);
161  *
162  * @param[in] index    The @link IndexDfi @endlink object holding the fibre.
163  * @param[in] position A position in the array on which the value should be accessed.
164  *
165  * @return TFibre A reference to the @link DfiIndexFibres#DfiDir @endlink fibre.
166  */
167 
168         template <
169         typename TPredHull = DfiPredDefault_<true>,
170         typename TPred = DfiPredDefault_<true>
171     >
172     struct Dfi;
173 
174     template <
175         typename TObject,
176         typename TPredHull,
177         typename TPred
178     >
179     class Index<TObject, IndexWotd< Dfi<TPredHull, TPred> > >:
180         public Index<TObject, IndexWotd<> >
181     {
182     public:
183 
184         typedef Index<TObject, IndexWotd<> >    TBase;
185 
186         // extending base class
187         typedef typename TBase::TText    TText;
188         typedef typename TBase::TValue    TValue;
189         typedef typename TBase::TSize    TSize;
190 
191         using TBase::LEAF;
192         using TBase::LAST_CHILD;
193         using TBase::UNEVALUATED;
194         using TBase::SENTINELS;
195 
196         // frequency strings for Dfi
197         typedef DfiEntry_                        TDFIEntry;
198         typedef String<
199             TDFIEntry,
200             Array<ValueSize<TValue>::VALUE> >    TDFIEntries;
201         typedef String<unsigned>                TDFIDatasets;
202 
203         // 1st word flags
204         static TSize const DFI_PRED_HULL    = (TSize)1 << (BitsPerValue<TSize>::VALUE - 3); // this node fulfills all monotonic frequency predicates (e.g. min_freq)
205         static TSize const DFI_PRED            = (TSize)1 << (BitsPerValue<TSize>::VALUE - 4); // this node fulfills all frequency predicates (e.g. emerging, minmax_freq)
206         static TSize const DFI_PARENT_FREQ    = (TSize)1 << (BitsPerValue<TSize>::VALUE - 5); // this node has the same frequency as its parent
207 
208         static TSize const BITMASK0            = ~(LEAF | LAST_CHILD | DFI_PRED_HULL | DFI_PRED | DFI_PARENT_FREQ);
209         static TSize const BITMASK1            = ~(UNEVALUATED | SENTINELS);
210 
211         TDFIEntry        nodeEntry;            // current nodes frequencies
212         TDFIEntries        childEntry;            // child frequencies for each first letter of outgoing edges
213         TDFIDatasets    ds;
214 
215         TPredHull        predHull;
216         TPred            pred;
217 
218         /*!
219          * @fn IndexDfi::Index
220          * @brief Constructor
221          *
222          * @signature Index::Index();
223          * @signature Index::Index(index);
224          * @signature Index::Index(text[, predHull][, pred]);
225          *
226          * @param[in] index    Other Index object to copy from.
227          * @param[in] text     The text to be indexed.
228          * @param[in] predHull (TPredHull) A monotonic hull of TPred. (optional)
229          * @param[in] pred     (TPred) An arbitrary frequeny predicate. (optional)
230          */
231 
Index()232         Index() {}
233 
Index(Index & other)234         Index(Index &other):
235             TBase((TBase &)other),
236             childEntry(other.childEntry),
237             ds(other.ds),
238             predHull(other.predHull),
239             pred(other.pred) {}
240 
Index(Index const & other)241         Index(Index const &other):
242             TBase((TBase const &)other),
243             childEntry(other.childEntry),
244             ds(other.ds),
245             predHull(other.predHull),
246             pred(other.pred) {}
247 
248         template <typename TText_>
Index(TText_ & _text)249         Index(TText_ &_text):
250             TBase(_text) {}
251 
252         template <typename TText_>
Index(TText_ & _text,TPredHull const & _predHull)253         Index(TText_ &_text, TPredHull const &_predHull):
254             TBase(_text),
255             predHull(_predHull) {}
256 
257         template <typename TText_>
Index(TText_ & _text,TPredHull const & _predHull,TPred const & _pred)258         Index(TText_ &_text, TPredHull const &_predHull, TPred const &_pred):
259             TBase(_text),
260             predHull(_predHull),
261             pred(_pred) {}
262     };
263 
264 
265     template <
266         typename TText,
267         typename TPredHull,
268         typename TPred,
269         typename TSpec
270     >
nodePredicate(Iter<Index<TText,IndexWotd<Dfi<TPredHull,TPred>>>,TSpec> & it)271     inline bool nodePredicate(
272         Iter<Index<TText, IndexWotd< Dfi<TPredHull, TPred> > >, TSpec> &it)
273     {
274         typedef Index<TText, IndexWotd< Dfi<TPredHull, TPred> > > TIndex;
275         return (dirAt(value(it).node, container(it)) & TIndex::DFI_PRED) != 0;
276     }
277 
278     template <
279         typename TText,
280         typename TPredHull,
281         typename TPred,
282         typename TSpec
283     >
nodeHullPredicate(Iter<Index<TText,IndexWotd<Dfi<TPredHull,TPred>>>,TSpec> & it)284     inline bool nodeHullPredicate(
285         Iter<Index<TText, IndexWotd< Dfi<TPredHull, TPred> > >, TSpec> &it)
286     {
287         typedef Index<TText, IndexWotd< Dfi<TPredHull, TPred> > > TIndex;
288         return (dirAt(value(it).node, container(it)) & TIndex::DFI_PRED_HULL) != 0;
289     }
290 
291 
292 //////////////////////////////////////////////////////////////////////////////
293 // we modify counting sort used in the wotd-index
294 // to count the frequencies in passing
295 
296     template < typename TText, typename TSpec, typename TPredHull, typename TPred >
297     typename Size< Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > > >::Type
_sortFirstWotdBucket(Index<StringSet<TText,TSpec>,IndexWotd<Dfi<TPredHull,TPred>>> & index)298     _sortFirstWotdBucket(Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > > &index)
299     {
300         typedef Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > >    TIndex;
301         typedef typename Fibre<TIndex, WotdSA >::Type            TSA;
302         typedef typename TIndex::TCounter                        TCounter;
303 
304         typedef typename TIndex::TDFIEntry                        TDFIEntry;
305         typedef typename TIndex::TDFIDatasets                    TDFIDatasets;
306         typedef typename Iterator<TDFIDatasets, Standard>::Type    TDFIDatasetsIterator;
307 
308         typedef typename Iterator<TText const, Standard>::Type    TTextIterator;
309         typedef typename Iterator<TSA, Standard>::Type            TSAIterator;
310         typedef typename Iterator<TCounter, Standard>::Type        TCntIterator;
311         typedef typename Size<TText>::Type                        TSize;
312         typedef typename Value<TText>::Type                        TValue;
313 
314         StringSet<TText, TSpec> const &stringSet = indexText(index);
315         TCounter &occ = index.tempOcc;
316         TCounter &bound = index.tempBound;
317 
318         // 1. clear counters and copy SA to temporary SA
319         arrayFill(begin(occ, Standard()), end(occ, Standard()), 0);
320 
321         index.nodeEntry.lastSeqSeen = -1;
322         for(unsigned j = 0; j < length(index.nodeEntry.freq); ++j)
323             index.nodeEntry.freq[j] = 0;
324         for(unsigned i = 0; i < ValueSize<TValue>::VALUE; ++i)
325         {
326             TDFIEntry &childEntry = index.childEntry[i];
327             childEntry.lastSeqSeen = -1;
328             for(unsigned j = 0; j < length(childEntry.freq); ++j)
329                 childEntry.freq[j] = 0;
330         }
331 
332         // 2. count characters
333         _wotdCountChars(occ, stringSet);
334 
335         // 3. cummulative sum
336         TSize requiredSize = _wotdCummulativeSum(bound, occ);
337 
338         // 4. fill suffix array
339         unsigned dsNo = 0;
340         TDFIDatasetsIterator currentDS = begin(index.ds, Standard()) + 1;
341         for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo)
342         {
343             // search for surrounding dataset
344             while (seqNo >= *currentDS) {
345                 ++dsNo;
346                 ++currentDS;
347             }
348 
349             TSA &sa = indexSA(index);
350             TSAIterator saBeg = begin(sa, Standard());
351             TCntIterator boundBeg = begin(bound, Standard());
352 
353             typename Value<TSA>::Type localPos;
354             assignValueI1(localPos, seqNo);
355             assignValueI2(localPos, 0);
356 
357             TText const &text = value(stringSet, seqNo);
358             TTextIterator itText = begin(text, Standard());
359             TTextIterator itTextEnd = end(text, Standard());
360             for(; itText != itTextEnd; ++itText)
361             {
362                 unsigned ord = ordValue(*itText);
363                 TDFIEntry &childEntry = index.childEntry[ord];
364                 // new sequence is seen for <ord> character
365                 // -> increment frequency of current dataset
366                 if (childEntry.lastSeqSeen != seqNo)
367                 {
368                     childEntry.lastSeqSeen = seqNo;
369                     ++childEntry.freq[dsNo];
370                 }
371 
372                 *(saBeg + (*(boundBeg + ord))++) = localPos;
373                 assignValueI2(localPos, getValueI2(localPos) + 1);
374             }
375         }
376         index.sentinelOcc = 0;
377         index.sentinelBound = 0;
378 
379         return requiredSize;
380     }
381 
382     // sort bucket using radixsort
383     // - all buckets are in lexicographical order
384     // - SA[left,right) contains real SA entries (the beginning positions of the suffices)
385     template < typename TText, typename TSpec, typename TPredHull, typename TPred, typename TBeginPos, typename TEndPos, typename TSize >
_sortWotdBucket(Index<StringSet<TText,TSpec>,IndexWotd<Dfi<TPredHull,TPred>>> & index,TBeginPos left,TEndPos right,TSize prefixLen)386     TSize _sortWotdBucket(
387         Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > > &index,
388         TBeginPos left,
389         TEndPos right,
390         TSize prefixLen)
391     {
392         typedef Index<StringSet<TText, TSpec>, IndexWotd<Dfi<TPredHull, TPred> > >    TIndex;
393         typedef typename Fibre<TIndex, WotdSA >::Type                TSA;
394         typedef typename TIndex::TCounter                            TCounter;
395         typedef typename TIndex::TTempSA                            TTempSA;
396         typedef typename TIndex::TDFIEntry                            TDFIEntry;
397         typedef typename TIndex::TDFIDatasets                        TDFIDatasets;
398         typedef typename Iterator<TDFIDatasets, Standard>::Type        TDFIDatasetsIterator;
399 
400         typedef typename Iterator<TText const, Standard>::Type        TTextIterator;
401         typedef typename Iterator<TSA, Standard>::Type                TSAIterator;
402         typedef typename Iterator<TTempSA const, Standard>::Type    TTempSAIterator;
403         typedef typename Iterator<TCounter, Standard>::Type            TCntIterator;
404         typedef typename Size<TText>::Type                            TTextSize;
405         typedef typename Value<TText>::Type                            TValue;
406 
407         StringSet<TText, TSpec> const &stringSet = indexText(index);
408         TTempSA const &tempSA = index.tempSA;
409         TCounter &occ = index.tempOcc;
410         TCounter &bound = index.tempBound;
411 
412         // 1. clear counters and copy SA to temporary SA
413         TCntIterator occBeg = begin(occ, Standard());
414 
415         arrayFill(occBeg, end(occ, Standard()), 0);
416         index.tempSA = infix(indexSA(index), left, right);
417 
418         index.nodeEntry.lastSeqSeen = -1;
419         for(unsigned j = 0; j < length(index.nodeEntry.freq); ++j)
420             index.nodeEntry.freq[j] = 0;
421         for(unsigned i = 0; i < ValueSize<TValue>::VALUE; ++i) {
422             TDFIEntry &childEntry = index.childEntry[i];
423             childEntry.lastSeqSeen = -1;
424             for(unsigned  j = 0; j < length(childEntry.freq); ++j)
425                 childEntry.freq[j] = 0;
426         }
427 
428         index.sentinelOcc = 0;
429         index.sentinelBound = 0;
430 
431         // 2. count characters
432         {
433             TDFIDatasetsIterator currentDS = begin(index.ds, Standard()) + 1;
434             TTextIterator itText = TTextIterator();
435             TTempSAIterator itSA = begin(tempSA, Standard());
436             TTempSAIterator itSAEnd = end(tempSA, Standard());
437             TTextSize textLength = 0;
438             unsigned lastSeqSeen = (unsigned)-1;
439             unsigned dsNo = 0;
440             Pair<unsigned, TTextSize> lPos;
441             for (; itSA != itSAEnd; ++itSA)
442             {
443                 posLocalize(lPos, *itSA, stringSetLimits(index));
444                 if (lastSeqSeen != getSeqNo(lPos))
445                 {
446                     lastSeqSeen = getSeqNo(lPos);
447 
448                     // search for surrounding dataset
449                     while (lastSeqSeen >= *currentDS)
450                     {
451                         ++dsNo;
452                         ++currentDS;
453                     }
454                     ++index.nodeEntry.freq[dsNo];
455 
456                     // shift textBegin and textLength by prefixLen
457                     textLength = length(stringSet[lastSeqSeen]) - prefixLen;
458                     itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen;
459                 }
460                 if (textLength > getSeqOffset(lPos))
461                 {
462                     unsigned ord = ordValue(*(itText + getSeqOffset(lPos)));
463                     TDFIEntry &childEntry = index.childEntry[ord];
464                     // new sequence is seen for <ord> character
465                     // -> increment frequency of current dataset
466                     if (childEntry.lastSeqSeen != lastSeqSeen)
467                     {
468                         childEntry.lastSeqSeen = lastSeqSeen;
469                         ++childEntry.freq[dsNo];
470                     }
471                     ++*(occBeg + ord);
472                 } else
473                     if (textLength == getSeqOffset(lPos)) ++index.sentinelOcc;
474             }
475         }
476 
477         // 3. cumulative sum
478         TSize requiredSize = 0;
479         if (index.interSentinelNodes) {
480             if (index.sentinelOcc != 0)
481                 requiredSize = (index.sentinelOcc > 1)? 2: 1;    // insert *one* $-edge for all $_i suffices
482         } else
483             requiredSize = index.sentinelOcc;                    // insert each $_i suffix one-by-one
484 
485         requiredSize += _wotdCummulativeSum(bound, occ, left + index.sentinelOcc);
486         index.sentinelBound = left;
487 /*
488         std::cout << "$=" << index.sentinelOcc<<"@"<<index.sentinelBound << "\t";
489         for(int i=0; i<length(occ);++i)
490             if (occ[i])
491                 std::cout << i << "=" << occ[i]<<"@"<<bound[i] << "\t";
492 */
493         // 4. fill suffix array
494         {
495             TSA &sa = indexSA(index);
496             TSAIterator saBeg = begin(sa, Standard());
497             TCntIterator boundBeg = begin(bound, Standard());
498 
499             TTextIterator itText = TTextIterator();
500             TTempSAIterator itSA = begin(tempSA, Standard());
501             TTempSAIterator itSAEnd = end(tempSA, Standard());
502             TTextSize textLength = 0;
503             unsigned lastSeqSeen = (unsigned)-1;
504             Pair<unsigned, TTextSize> lPos;
505             for(; itSA != itSAEnd; ++itSA)
506             {
507                 posLocalize(lPos, *itSA, stringSetLimits(index));
508                 if (lastSeqSeen != getSeqNo(lPos))
509                 {
510                     lastSeqSeen = getSeqNo(lPos);
511 
512                     // shift textBegin and textLength by prefixLen
513                     textLength = length(stringSet[lastSeqSeen]) - prefixLen;
514                     itText = begin(stringSet[lastSeqSeen], Standard()) + prefixLen;
515                 }
516                 if (textLength > getSeqOffset(lPos))
517                     *(saBeg + (*(boundBeg + ordValue(*(itText + getSeqOffset(lPos)))))++) = *itSA;
518                 else
519                     if (textLength == getSeqOffset(lPos))
520                         *(saBeg + index.sentinelBound++) = *itSA;
521             }
522         }
523 
524         return requiredSize;
525     }
526 
527 
528     // store buckets into directory
529     // storing SA links and topology links in Dir
530     template <typename TText, typename TPredHull, typename TPred, typename TSize>
531     inline void
_storeWotdChildren(Index<TText,IndexWotd<Dfi<TPredHull,TPred>>> & index,TSize dirOfs,TSize lcp)532     _storeWotdChildren(
533         Index<TText, IndexWotd<Dfi<TPredHull, TPred> > > &index,
534         TSize dirOfs,
535         TSize lcp)
536     {
537         typedef Index<TText, IndexWotd<Dfi<TPredHull, TPred> > >    TIndex;
538 
539         typedef typename Fibre<TIndex, WotdDir>::Type        TDir;
540         typedef typename TIndex::TCounter                    TCounter;
541         typedef typename TIndex::TDFIEntries                TEntries;
542 
543         typedef typename Iterator<TDir, Standard>::Type        TDirIterator;
544         typedef typename Size<TDir>::Type                    TDirSize;
545         typedef typename Iterator<TCounter, Standard>::Type    TCntIterator;
546         typedef typename Iterator<TEntries, Standard>::Type    TEntriesIterator;
547 
548         typedef typename Value<TCounter>::Type                TCntValue;
549         typedef typename Value<TDir>::Type                    TDirValue;
550 
551         TDirIterator itDirBegin = begin(indexDir(index), Standard()) + dirOfs;
552         TDirIterator itDirEnd = end(indexDir(index), Standard());
553         TDirIterator itDir = itDirBegin;
554         TDirIterator itPrev = itDirEnd;
555 
556         TCntIterator it = begin(index.tempOcc, Standard());
557         TCntIterator bit = begin(index.tempBound, Standard());
558         TCntIterator itEnd = end(index.tempOcc, Standard());
559         TEntriesIterator itEntry = begin(index.childEntry, Standard());
560 
561         TCntValue occ;
562         if (index.sentinelOcc != 0)
563         {
564             TDirValue orMask = (index.predHull(*itEntry))? index.DFI_PRED_HULL: 0;
565             if (index.pred(*itEntry)) orMask |= index.DFI_PRED;
566 
567             if (index.sentinelOcc > 1 && index.interSentinelNodes)    // occurs on multiseqs
568             {
569                 itPrev = itDir;
570                 *itDir = (index.sentinelBound - index.sentinelOcc) | orMask;    ++itDir;
571                 *itDir = index.sentinelBound | index.UNEVALUATED;                ++itDir;
572             } else
573             {
574                 orMask |= index.LEAF;
575             }
576             //NOTE(h-2): previosly the following block was indented, as though belonging to
577             //           to the else statement. If something here is unexpected, please investigate.
578             for (TDirSize d = index.sentinelBound - index.sentinelOcc; d != index.sentinelBound; ++d)
579             {
580                 itPrev = itDir;
581                 *itDir = d | orMask;                                        ++itDir;
582             }
583         }
584 
585         for (; it != itEnd; ++it, ++bit, ++itEntry)
586         {
587             if ((occ = *it) == 0) continue;
588 
589             TDirValue orMask = (index.predHull(*itEntry))? index.DFI_PRED_HULL: 0;
590             if (index.pred(*itEntry)) orMask |= index.DFI_PRED;
591             if ((*itEntry).freq == index.nodeEntry.freq) orMask |= index.DFI_PARENT_FREQ;
592 
593             if (occ > 1) {
594                 itPrev = itDir;
595                 *itDir = (*bit - occ) | orMask;                    ++itDir;
596                 *itDir = *bit | index.UNEVALUATED;                ++itDir;
597             } else {
598                 itPrev = itDir;
599                 *itDir = (*bit - occ) | index.LEAF | orMask;    ++itDir;
600             }
601         }
602 
603         // first child gets the mutual lcp value of the children (== parent repLength)
604         *itDirBegin = ((*itDirBegin) & ~index.BITMASK0) | lcp;
605 
606         // mark the last child
607         if (itPrev != itDirEnd)
608             *itPrev |= index.LAST_CHILD;
609     }
610 
611 //////////////////////////////////////////////////////////////////////////////
612 // debug output
613 
614     template <typename TText, typename TPredHull, typename TPred>
615     inline void
_dump(Index<TText,IndexWotd<Dfi<TPredHull,TPred>>> & index)616     _dump(Index<TText, IndexWotd< Dfi<TPredHull, TPred> > > &index)
617     {
618         typedef IndexWotd< Dfi<TPredHull, TPred> >        TSpec;
619         typedef Index<TText, TSpec>                            TIndex;
620         typedef typename Fibre<TIndex, WotdDir>::Type        TDir;
621         typedef typename Value<TDir>::Type                    TDirValue;
622 
623         std::cout << "  Dir (wotd/Dfi)" << std::endl;
624         for(unsigned i=0; i < length(indexDir(index)); ++i) {
625             TDirValue d = indexDir(index)[i];
626             std::cout << i << ":  " << (d & index.BITMASK0);
627             if (d & index.LEAF)                std::cout << "  (Leaf/Uneval)";
628             if (d & index.LAST_CHILD)        std::cout << "  (LastChild/SENTINELS)";
629             if (d & index.DFI_PRED_HULL)    std::cout << "  (PRED_HULL)";
630             if (d & index.DFI_PRED)            std::cout << "  (PRED)";
631             if (d & index.DFI_PARENT_FREQ)    std::cout << "  (PARENT_FREQ)";
632             std::cout << std::endl;
633         }
634 
635         std::cout << std::endl << "  SA" << std::endl;
636         for(unsigned i=0; i < length(indexSA(index)); ++i)
637             std::cout << i << ":  " << indexSA(index)[i] << "  " << suffix(indexText(index), indexSA(index)[i]) << std::endl;
638 
639         std::cout << std::endl;
640     }
641 
642     template <typename TText, typename TPredHull, typename TPred>
643     inline void
_dumpFreq(Index<TText,IndexWotd<Dfi<TPredHull,TPred>>> & index)644     _dumpFreq(Index<TText, IndexWotd< Dfi<TPredHull, TPred> > > &index)
645     {
646         typedef Dfi<TPredHull, TPred> TSpec;
647         typedef Index<TText, IndexWotd<TSpec> >                TIndex;
648         typedef typename Value<TIndex>::Type                    TValue;
649 
650         std::cout << "  ParentF = (";
651         for(unsigned d=0; d<length(index.nodeEntry.freq); ++d) {
652             if (d>0) std::cout << ",";
653             std::cout << index.nodeEntry.freq[d];
654         }
655         std::cout << ")" << std::endl;
656 
657         for(unsigned i=0; i<length(index.tempOcc); ++i)
658             if (index.tempOcc[i] != 0) {
659                 std::cout << "  Freq[" << (TValue)i << "] = (";
660                 for(unsigned d=0; d<length(index.childEntry[i].freq); ++d) {
661                     if (d>0) std::cout << ",";
662                     std::cout << index.childEntry[i].freq[d];
663                 }
664                 std::cout << ")" << std::endl;
665             }
666     }
667 
668 //////////////////////////////////////////////////////////////////////////////
669 // interface for automatic index creation
670 
671     template <typename TText, typename TPredHull, typename TPred>
indexCreate(Index<TText,IndexWotd<Dfi<TPredHull,TPred>>> & index,WotdDir const,Default const)672     inline bool indexCreate(Index<TText, IndexWotd<Dfi<TPredHull, TPred> > > &index, WotdDir const, Default const)
673     {
674         typedef Index<TText, IndexWotd<Dfi<TPredHull, TPred> > >    TIndex;
675         typedef typename Value<TIndex>::Type                            TValue;
676 
677         resize(index.childEntry, (unsigned) ValueSize<TValue>::VALUE);
678         if (empty(index.ds)) {
679             resize(index.ds, 2);
680             index.ds[0] = 0;
681             index.ds[1] = countSequences(index);
682         }
683         resize(index.nodeEntry.freq, length(index.ds) - 1, Exact());
684         for(unsigned i = 0; i < ValueSize<TValue>::VALUE; ++i)
685             resize(index.childEntry[i].freq, length(index.ds) - 1, Exact());
686 
687         _wotdCreateFirstLevel(index);
688         return true;
689     }
690 }
691 
692 #endif //#ifndef SEQAN_HEADER_...
693