1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: David Weese <david.weese@fu-berlin.de> 33 // ========================================================================== 34 35 #ifndef SEQAN_HEADER_INDEX_SKEW7_H 36 #define SEQAN_HEADER_INDEX_SKEW7_H 37 38 namespace seqan 39 { 40 41 //namespace SEQAN_NAMESPACE_PIPELINING 42 //{ 43 44 struct Skew7 {}; 45 46 47 ////////////////////////////////////////////////////////////////////////////// 48 // external Skew7 algorithm 49 ////////////////////////////////////////////////////////////////////////////// 50 51 template <typename T> 52 struct SkewDC_<7, T> 53 { 54 static const unsigned VALUE[]; 55 }; 56 57 template <typename T> 58 const unsigned SkewDC_<7, T>::VALUE[] = { 3, 1, 2, 4 }; 59 60 61 // *** COMPARATORS & MAPS *** 62 63 template <typename TValue, typename TResult = int> 64 struct _skew7NComp : 65 public std::binary_function<TValue, TValue, TResult> 66 { 67 inline TResult operator() (const TValue &a, const TValue &b) const 68 { 69 typedef typename Value<TValue, 1>::Type TSize; 70 typedef typename Value<TValue, 2>::Type TSeptet; 71 typedef typename Value<TSeptet>::Type TSeptetValue; 72 typedef typename StoredTupleValue_<TSeptetValue>::Type TStoredValue; 73 74 const TStoredValue *sa = a.i2.i; 75 const TStoredValue *sb = b.i2.i; 76 77 TSize n = LENGTH<TSeptet>::VALUE; 78 if (a.i1 < n) n = a.i1; 79 if (b.i1 < n) n = b.i1; 80 81 // compare the overlap of septets (the first n bases) 82 for (TSize i = 0; i < n; i++, ++sa, ++sb) 83 { 84 if (*sa == *sb) continue; 85 return (*sa < *sb)? -1 : 1; 86 } 87 88 // overlap is equal, now suffix lengths decide 89 if (n < LENGTH<TSeptet>::VALUE) 90 return (a.i1 < b.i1)? -1 : 1; 91 else 92 return 0; 93 } 94 }; 95 96 // optimized for bitvectors 97 template <typename T1, typename TValue, typename TResult> 98 struct _skew7NComp< Pair<T1, Tuple<TValue, 7, BitPacked<> >, Pack >, TResult > : 99 public std::binary_function< 100 Pair<T1, Tuple<TValue, 7, BitPacked<> >, Pack >, 101 Pair<T1, Tuple<TValue, 7, BitPacked<> >, Pack >, 102 TResult> 103 { 104 inline TResult operator() ( 105 const Pair<T1, Tuple<TValue, 7, BitPacked<> >, Pack > &a, 106 const Pair<T1, Tuple<TValue, 7, BitPacked<> >, Pack > &b) const 107 { 108 if (a.i2 < b.i2) return -1; 109 if (a.i2 > b.i2) return 1; 110 if (a.i1 < 7 || b.i1 < 7) 111 return (a.i1 < b.i1)? -1 : 1; 112 return 0; 113 } 114 }; 115 116 template <typename TValue, typename TResult = typename Value<TValue, 1>::Type> 117 struct _skew7NMapLinear : 118 public std::unary_function<TValue, TResult> 119 { 120 TResult BN4, BN; 121 122 _skew7NMapLinear(TResult BN_): BN4(BN_+1), BN(BN_) {} 123 124 inline TResult operator() (const TValue& x) const 125 { 126 TResult i = x.i1; return (i%7 == 4)? BN4-(i-(i/7)*4): BN-(i-(i/7)*4); 127 } 128 }; 129 130 template <typename TValue, typename TResult = typename Value<TValue, 1>::Type> 131 struct _skew7NMapSliced : 132 public std::unary_function<TValue, TResult> 133 { 134 TResult off[5]; 135 136 _skew7NMapSliced(TResult BN_) 137 { 138 off[0] = 0; 139 off[1] = BN_ - 1; 140 off[2] = (2*BN_)/3 - 1; 141 off[3] = 0; 142 off[4] = BN_/3 - 1; 143 } 144 145 inline TResult operator() (const TValue& x) const 146 { 147 return off[x.i1 % 7] - x.i1/7; 148 } 149 }; 150 151 152 template <typename TValue, typename TResult = TValue> 153 struct _skew7UnslicerFunc : 154 public std::unary_function<TValue, TResult> 155 { 156 TResult o1, o2, o4, n4, n24; 157 158 _skew7UnslicerFunc(TResult N) : 159 o1(N - (N + 6) % 7), 160 o2(N - (N + 5) % 7), 161 o4(N - (N + 3) % 7), 162 n4((N + 3) / 7), 163 n24((N + 5) / 7 + n4) {} 164 165 inline TResult operator() (const TValue& x) const 166 { 167 return (x < n4) ? o4 - x * 7: 168 (x < n24) ? o2 - (x - n4) * 7: 169 o1 - (x - n24) * 7; 170 } 171 }; 172 173 template <typename TValue, typename TResult = typename Value<typename Value<TValue, 2>::Type>::Type> 174 struct _skew7NMapExtended : 175 public std::unary_function<TValue, TResult> 176 { 177 inline TResult operator() (const TValue& x) const 178 { 179 return x.i2[0]; 180 } 181 }; 182 183 template <typename TValue, unsigned EXT_LENGTH, typename TResult = int> 184 struct _skew7ExtendComp : 185 public std::binary_function<TValue, TValue, TResult> 186 { 187 inline TResult operator() (const TValue &a, const TValue &b) const 188 { 189 for (unsigned i = 0; i < EXT_LENGTH; i++) 190 { 191 if (a.i3[i] == b.i3[i]) continue; 192 return (a.i3[i] < b.i3[i])? -1 : 1; 193 } 194 return (a.i2[0] < b.i2[0])? -1 : 1; 195 } 196 }; 197 198 // optimized for bitvectors 199 /* 200 template <typename T1, typename T2, typename T, const int _size, const int EXT_LENGTH, typename TResult> 201 struct _skew7ExtendComp< Triple<T1,T2,Tuple<T,_size, BitPacked<> >, Pack>, EXT_LENGTH, TResult> : 202 public std::binary_function< 203 Triple<T1,T2,Tuple<T,_size, BitPacked<> >, Pack>, 204 Triple<T1,T2,Tuple<T,_size, BitPacked<> >, Pack>, 205 TResult> 206 { 207 inline TResult operator() ( 208 const Triple<T1,T2,Tuple<T,_size, BitPacked<> >, Pack> &a, 209 const Triple<T1,T2,Tuple<T,_size, BitPacked<> >, Pack> &b) const 210 { 211 if (a.i3 < b.i3) return -1; 212 if (a.i3 > b.i3) return 1; 213 return (a.i2[0] < b.i2[0])? -1 : 1; 214 } 215 }; 216 */ 217 218 template < typename TInput > 219 struct Value< Pipe< TInput, Skew7 > > : 220 public Size<TInput> {}; 221 222 template <typename T> 223 struct Skew7StringSpec_: 224 public Spec<T> {}; 225 226 template <typename T, typename TStringSpec> 227 struct Skew7StringSpec_<String<T, TStringSpec> > 228 { 229 typedef TStringSpec Type; 230 }; 231 232 template <typename T, typename TSegmentSpec> 233 struct Skew7StringSpec_<Segment<T, TSegmentSpec> >: 234 public Skew7StringSpec_<T> {}; 235 236 template <typename T> 237 struct Skew7StringSpec_<T const>: 238 public Skew7StringSpec_<T> {}; 239 240 ////////////////////////////////////////////////////////////////////////////// 241 // skew7 class 242 template < typename TInput > 243 struct Pipe< TInput, Skew7 > 244 { 245 246 // *** SPECIALIZATION *** 247 248 // use packing if lessorequal 16 different values per char 249 typedef typename IfC< 250 (BitsPerValue<TypeOf_(TInput)>::VALUE > 0) && 251 (BitsPerValue<TypeOf_(TInput)>::VALUE <= 4), 252 BitPacked<>, 253 Pack>::Type TPack; 254 255 // step 1 256 typedef Pipe< TInput, Sampler<7, TPack> > TSamplerDC7; 257 typedef _skew7NComp<TypeOf_(TSamplerDC7)> ncomp_t; 258 typedef Pool< TypeOf_(TSamplerDC7), SorterSpec< SorterConfigSize<ncomp_t, TSizeOf_(TSamplerDC7) > > > TSortTuples; 259 typedef Pipe< TSortTuples, Namer<ncomp_t> > TNamer; 260 typedef _skew7NMapSliced<TypeOf_(TNamer)> nmap_sliced_t; 261 typedef _skew7NMapLinear<TypeOf_(TNamer)> nmap_linear_t; 262 typedef Pool< TypeOf_(TNamer), MapperSpec< MapperConfigSize< nmap_sliced_t, TSizeOf_(TNamer) > > > TNames_Sliced; 263 264 // unique names - shortcut 265 typedef Pool< TypeOf_(TNames_Sliced), MapperSpec< MapperConfigSize< nmap_linear_t, TSizeOf_(TNames_Sliced) > > > TNames_Linear_Unique; 266 267 // non-unique names 268 typedef Pipe< TNames_Sliced, Filter< filterI2<TypeOf_(TNames_Sliced)> > > TFilter; 269 270 // recursion 271 typedef Pipe< TFilter, Skew3 > TRecurse; 272 typedef _skew7UnslicerFunc<TypeOf_(TRecurse)> unslicer_func_t; 273 typedef Pipe< TRecurse, Filter<unslicer_func_t> > TUnslicer; 274 typedef Pipe< TUnslicer, Counter > TRenamer; 275 276 // no recursion inMemory shortcut 277 typedef Pipe< TFilter, LarssonSadakane > TInMem; 278 typedef Pipe< TInMem, Filter<unslicer_func_t> > TUnslicerInMem; 279 typedef Pipe< TUnslicerInMem, Counter > TRenamerInMem; 280 281 typedef Pool< TypeOf_(TRenamer), MapperSpec< MapperConfigSize< nmap_linear_t, TSizeOf_(TRenamer) > > > TNames_Linear; 282 283 // step 2 284 typedef Pipe< Bundle2< TInput, TNames_Linear >, Extender7<TPack> > TExtender; 285 typedef _skew7ExtendComp<TypeOf_(typename TExtender::Out0),3> extend0_comp_t; 286 typedef _skew7ExtendComp<TypeOf_(typename TExtender::Out6),2> extend6_comp_t; 287 typedef _skew7ExtendComp<TypeOf_(typename TExtender::Out5),1> extend5_comp_t; 288 typedef _skew7ExtendComp<TypeOf_(typename TExtender::Out3),1> extend3_comp_t; 289 typedef Pool< TypeOf_(typename TExtender::Out0), SorterSpec< SorterConfigSize< extend0_comp_t, TSizeOf_(typename TExtender::Out0) > > > TSorterS0; 290 typedef Pool< TypeOf_(typename TExtender::Out6), SorterSpec< SorterConfigSize< extend6_comp_t, TSizeOf_(typename TExtender::Out6) > > > TSorterS6; 291 typedef Pool< TypeOf_(typename TExtender::Out5), SorterSpec< SorterConfigSize< extend5_comp_t, TSizeOf_(typename TExtender::Out5) > > > TSorterS5; 292 typedef Pool< TypeOf_(typename TExtender::Out3), SorterSpec< SorterConfigSize< extend3_comp_t, TSizeOf_(typename TExtender::Out3) > > > TSorterS3; 293 294 // step 3 295 typedef _skew7NMapExtended<TypeOf_(typename TExtender::Out124)> nmap_extended_t; 296 typedef Pool< TypeOf_(typename TExtender::Out124), MapperSpec< MapperConfigSize< nmap_extended_t, TSizeOf_(typename TExtender::Out124) > > > TSorterS124; 297 typedef Pipe< Bundle5< TSorterS0, TSorterS3, TSorterS5, TSorterS6, TSorterS124 >, Merger7 > TMerger; 298 299 TSorterS0 sortedS0; 300 TSorterS3 sortedS3; 301 TSorterS5 sortedS5; 302 TSorterS6 sortedS6; 303 TSorterS124 sortedS124; 304 TMerger in; 305 306 Pipe(): 307 in(bundle5(sortedS0, sortedS3, sortedS5, sortedS6, sortedS124)) {} 308 309 Pipe(TInput& _textIn): 310 in(bundle5(sortedS0, sortedS3, sortedS5, sortedS6, sortedS124)) 311 { 312 process(_textIn); 313 } 314 315 template < typename TInput_ > 316 bool process(TInput_ &textIn) { 317 318 SEQAN_PROADD(SEQAN_PRODEPTH, 1); 319 SEQAN_PROMARK("Enter recursion"); 320 #ifdef SEQAN_DEBUG_INDEX 321 std::cerr << "enter level " << SEQAN_PROVAL(SEQAN_PRODEPTH) << " bit-packing: "; 322 std::cerr << IsSameType<TPack, BitPacked<> >::VALUE << " " << BitsPerValue<TypeOf_(TInput)>::VALUE << std::endl; 323 #endif 324 { 325 326 327 // *** INSTANTIATION *** 328 329 // step 1 330 TSamplerDC7 sampler(textIn); 331 TSortTuples sorter; 332 #ifdef SEQAN_DEBUG_INDEX 333 std::cerr << " sort names (" << length(sampler)<< ")" << std::endl; 334 #endif 335 sorter << sampler; 336 SEQAN_PROMARK("Sorter (2) - sort 7-mers"); 337 338 TNamer namer(sorter); 339 nmap_sliced_t map_sliced(length(namer)); 340 nmap_linear_t map_linear(length(namer)); 341 TNames_Sliced names_sliced(map_sliced); 342 #ifdef SEQAN_DEBUG_INDEX 343 std::cerr << " slice names" << std::endl; 344 #endif 345 names_sliced << namer; 346 347 if (namer.unique() || empty(names_sliced)) { 348 // unique names 349 350 clear(sorter); 351 SEQAN_PROMARK("Mapper (4) - construct s124"); 352 TNames_Linear_Unique names_linear(map_linear); 353 354 #ifdef SEQAN_DEBUG_INDEX 355 std::cerr << " make names linear" << std::endl; 356 #endif 357 names_linear << names_sliced; 358 clear(names_sliced); 359 SEQAN_PROMARK("Mapper (10) - construct ISA124"); 360 361 // step 2 362 _skew7Extend(textIn, names_linear, sortedS0, sortedS3, sortedS5, sortedS6, sortedS124); 363 364 } else { 365 // non-unique names 366 367 clear(sorter); 368 SEQAN_PROMARK("Mapper (4) - construct s124"); 369 370 TFilter filter(names_sliced); 371 TNames_Linear names_linear(map_linear); 372 373 if (length(filter) > 128*1024*1024) 374 { 375 // recursion 376 TRecurse recurse(filter); 377 378 #ifdef SEQAN_TEST_SKEW7 379 { 380 String<typename Value<TFilter>::Type, Alloc<> > _text; 381 _text << filter; 382 SEQAN_ASSERT(isSuffixArray(recurse, _text)); 383 } 384 #endif 385 386 clear(filter); 387 unslicer_func_t func(length(textIn)); 388 TUnslicer unslicer(recurse, func); 389 TRenamer renamer(unslicer); 390 391 #ifdef SEQAN_DEBUG_INDEX 392 std::cerr << " rename names" << std::endl; 393 #endif 394 395 names_linear << renamer; 396 } 397 else 398 { 399 TInMem inMem(filter); 400 401 clear(filter); 402 unslicer_func_t func(length(textIn)); 403 TUnslicerInMem unslicer(inMem, func); 404 TRenamerInMem renamer(unslicer); 405 406 #ifdef SEQAN_DEBUG_INDEX 407 std::cerr << " rename names" << std::endl; 408 #endif 409 410 names_linear << renamer; 411 } 412 413 SEQAN_PROMARK("Mapper (10) - ISA124 konstruieren"); 414 415 // step 2 416 #ifdef SEQAN_DEBUG_INDEX 417 std::cerr << " prepare merge" << std::endl; 418 #endif 419 _skew7Extend(textIn, names_linear, sortedS0, sortedS3, sortedS5, sortedS6, sortedS124); 420 SEQAN_PROMARK("Mapper (12), Sorter (13-16) - merge SA124, SA3, SA5, SA6, SA0"); 421 } 422 423 // step 3 424 // ... is done on-demand by merger 425 } 426 #ifdef SEQAN_DEBUG_INDEX 427 std::cerr << "left level " << SEQAN_PROVAL(SEQAN_PRODEPTH) << std::endl; 428 #endif 429 SEQAN_PROMARK("Left recursion"); 430 SEQAN_PROSUB(SEQAN_PRODEPTH, 1); 431 432 return true; 433 } 434 435 inline typename Value<Pipe>::Type const operator*() { 436 return *in; 437 } 438 439 inline Pipe& operator++() { 440 ++in; 441 return *this; 442 } 443 }; 444 445 // not sure which interface is more intuitive, we support both 446 // you can call "skew << pipe" or "skew_t skew(pipe); skew.process()" 447 // for the first we would need no _in member 448 template < typename TInput, typename TObject > 449 inline bool operator<<(Pipe< TInput, Skew7 > &me, TObject &textIn) { 450 return me.process(textIn); 451 } 452 453 454 ////////////////////////////////////////////////////////////////////////////// 455 // internal Skew7 algorithm 456 ////////////////////////////////////////////////////////////////////////////// 457 458 ////////////////////////////////////////////////////////////////////////////// 459 // typedefs and helpers 460 461 // compares n characters and in case of equality the names a2 and b2 (no clipping) 462 template <typename TTextIter, typename TSize> inline 463 bool _leqSkew7(TTextIter a1, TSize a2, TTextIter b1, TSize b2, TSize n) 464 { 465 // lexic. order for n-tupels 466 for (; n != 0; --n, ++a1, ++b1) { 467 if (ordLess(*a1, *b1)) return true; 468 if (ordLess(*b1, *a1)) return false; 469 } 470 return (a2 <= b2); 471 } 472 473 // compares at most the last n characters (a) with b (clipping) 474 template <typename TTextIter, typename TSize> inline 475 bool _leqSkew7(TTextIter a, TTextIter b, TSize n) 476 { // lexic. order for n-tupels 477 for (; n != 0; --n, ++a, ++b) { 478 if (ordLess(*a, *b)) return true; 479 if (ordLess(*b, *a)) return false; 480 } 481 return true; // a is shorter than b 482 } 483 484 // compares two suffixes of residue classes a and b 485 template <typename TTextIter, typename TSize, typename TString> inline 486 bool _leqSkew7(unsigned a, unsigned b, TTextIter spos[], const TSize tpos[], const bool islast[], const TString &s124, const TSize adjust[7][7]) 487 { 488 TTextIter sa = spos[a]; 489 TTextIter sb = spos[b]; 490 TSize shft = SkewShift_<7>::VALUE[a][b]; 491 if (sa > sb) { 492 if ((a != 0) && (a < shft) && islast[a]) // do we need to clip? 493 return _leqSkew7 (sa, sb, a); 494 } else { 495 if ((b != 0) && (b < shft) && islast[b]) // do we need to clip? 496 return !_leqSkew7 (sb, sa, b); 497 } 498 return _leqSkew7 (sa, s124[tpos[a] + adjust[a][b]], sb, s124[tpos[b] + adjust[b][a]], shft); 499 } 500 501 template <typename TSAOut, typename TText, typename TSAIn, typename TEnable> 502 inline bool _fastTupleSortSkew7(TSAOut &, TText const &, TSAIn const &, TEnable) 503 { 504 return false; 505 } 506 507 template <typename TSAOut, typename TText, typename TSAIn> 508 inline bool _fastTupleSortSkew7(TSAOut &SA124, TText const &s, TSAIn const &s124, True) 509 { 510 typedef typename Value<TText>::Type TValue; 511 typedef typename Value<TSAOut>::Type TSize; 512 typedef typename Iterator<TText const, Standard>::Type TValueIter; 513 typedef typename Iterator<TSAIn const, Standard>::Type TSAIter; 514 515 // optimized tuple sort for short alphabets 516 Shape<TValue, UngappedShape<7> > shape; 517 String<TSize, Alloc<> > cnt; 518 resize(cnt, _fullDir2Length(shape), 0, Exact()); 519 520 TValueIter textBegin = begin(s, Standard()); 521 TSAIter it = begin(s124, Standard()); 522 TSAIter itEnd = end(s124, Standard()); 523 524 TSize n = length(s); 525 for(; it != itEnd; ++it) 526 ++cnt[hash2(shape, textBegin + *it, n - *it)]; 527 528 _qgramCummulativeSum(cnt, False()); 529 530 it = begin(s124, Standard()); 531 for(; it != itEnd; ++it) 532 SA124[cnt[hash2(shape, textBegin + *it, n - *it) + 1]++] = *it; 533 534 return true; 535 } 536 537 538 ////////////////////////////////////////////////////////////////////////////// 539 // Skew algorithm with difference cover of Z_7 540 // 541 // Construct the suffix array SA of s[0..n-1], where s is a string over 542 // the alphabet {0..K}. 543 // 544 // The following algorithm divides the suffixes in seven residue classes 545 // according to their lengths and uses a difference cover {1,2,4}. 546 // That approach results in an algorithm that is more space and time efficient 547 // than the original skew algorithm with a difference cover {1,2} of Z_3. 548 // 549 // * no trailing 0's required 550 // * no dummy triples in special cases 551 552 template < typename TSA, 553 typename TText > 554 void createSuffixArray( 555 TSA &SA, 556 TText &s, 557 Skew7 const &, 558 unsigned K, 559 unsigned maxdepth, 560 unsigned depth) 561 { 562 typedef typename Value<TSA>::Type TSize; 563 typedef typename Value<TText>::Type TValue; 564 typedef String<TSize, Alloc<> > TBuffer; 565 typedef typename Iterator<TSA, Standard>::Type TSAIter; 566 //typedef typename Iterator<TBuffer, Standard>::Type TBufferIter; 567 typedef typename Iterator<TText const, Standard>::Type TValueIter; 568 569 SEQAN_ASSERT(AllowsFastRandomAccess<TText>::VALUE == true); 570 SEQAN_ASSERT(AllowsFastRandomAccess<TSA>::VALUE == true); 571 572 #ifdef SEQAN_DEBUG_INDEX 573 std::cerr << "--- CREATE SUFFIX ARRAY ---" << std::endl; 574 std::cerr << "Skew7 [random access]" << std::endl; 575 #endif 576 577 TSize n = length(s); 578 if (n < 1) return; 579 580 TSize _n[7]; 581 TSize _o[7]; 582 583 _n[0] = n/7; 584 _o[0] = n%7; 585 TSize j = n + 6; 586 for(int i = 1; i < 7; ++i, --j) { 587 _n[i] = j/7; 588 _o[i] = j%7; 589 } 590 591 TSize _n24 = _n[2]+_n[4]; 592 TSize _n124 = _n[1]+_n24; 593 594 SEQAN_PROSET(SEQAN_PRODEPTH, depth); 595 SEQAN_PROSET(SEQAN_PROEXTRA1, K); 596 SEQAN_PROMARK("Rekursionsabstieg"); 597 #ifdef SEQAN_DEBUG_INDEX 598 std::cerr << "enter level " << depth << " (" << n << ")" << std::endl; 599 #endif 600 601 TBuffer s124; 602 resize(s124, _n124, Exact()); 603 // we use SA[n-n124..n-1] as a temporary buffer instead of allocating one 604 typename Suffix<TSA>::Type SA124 = suffix(SA, n - _n124); 605 606 607 // generate positions of mod 3, mod 5 and mod 6 suffixes 608 { 609 TSize j = 0; 610 if (_n[2] > _n[4]) s124[j++] = _o[2]; 611 if (_n[1] > _n[4]) s124[j++] = _o[1]; 612 613 for (TSize i=_o[4]; i < n; i+=7) { 614 s124[j++] = i; 615 s124[j++] = i + 2; 616 s124[j++] = i + 3; 617 } 618 } 619 620 621 // lsb radix sort the mod 3, mod 5 and mod 6 7-tupels 622 if (!_fastTupleSortSkew7(SA124, s, s124, typename Eval<BitsPerValue<TValue>::VALUE < 4>::Type())) 623 { 624 String<TSize, Alloc<> > cnt; 625 resize(cnt, K, Exact()); // counter array 626 627 radixPass(SA124, s124, s, cnt, K, 6); 628 radixPass(s124, SA124, s, cnt, K, 5); 629 radixPass(SA124, s124, s, cnt, K, 4); 630 radixPass(s124, SA124, s, cnt, K, 3); 631 radixPass(SA124, s124, s, cnt, K, 2); 632 radixPass(s124, SA124, s, cnt, K, 1); 633 radixPass(SA124, s124, s, cnt, K); 634 } 635 SEQAN_PROMARK("7-lets sortiert"); 636 637 // find lexicographic names of 7-tupel 638 TSize name = 0; 639 { 640 TSize ofs[7] = {0, _n24, _n[4], 0, 0, 0, 0}; 641 bool differ = true; 642 TValue c0 = TValue(0), c1 = TValue(0), c2 = TValue(0), c3 = TValue(0), c4 = TValue(0), c5 = TValue(0), c6 = TValue(0); 643 for (TSize i = 0, clip = _max(n, (TSize)6) - 6, l; i < _n124; i++) { 644 if ((l = SA124[i]) < clip) { 645 if (differ || s[l] != c0 || s[l+1] != c1 || s[l+2] != c2 || s[l+3] != c3 || 646 s[l+4] != c4 || s[l+5] != c5 || s[l+6] != c6) { 647 name++; c0 = s[l]; c1 = s[l+1]; c2 = s[l+2]; c3 = s[l+3]; 648 c4 = s[l+4]; c5 = s[l+5]; c6 = s[l+6]; 649 differ = false; 650 } 651 } else { 652 name++; 653 differ = true; // the last 6 7-tupels always differ from the rest 654 } 655 s124[(l/7) + ofs[(n-l) % 7]] = name - 1; // select a third 656 } 657 } 658 SEQAN_PROMARK("s12 konstruiert"); 659 660 // recurse if names are not yet unique 661 if (name < _n124) { 662 if (depth != maxdepth) 663 { 664 createSuffixArray(SA124, s124, Skew7(), name, maxdepth, depth + 1); 665 #ifdef SEQAN_TEST_SKEW7 666 SEQAN_ASSERT(isSuffixArray(SA124, s124)); 667 #endif 668 } 669 // store unique names in s124 using the suffix array 670 for (TSize i = 0; i < _n124; i++) s124[SA124[i]] = i; 671 } else // generate the suffix array of s124 directly 672 for (TSize i = 0; i < _n124; i++) SA124[s124[i]] = i; 673 674 675 // use SA[0...n3-1] and SA[n3...n3+n5-1] as a temporary buffers instead of allocating some 676 // and allocate SA0, SA3, SA5 and SA6 677 678 { 679 typename Infix<TSA>::Type s3 = infix(SA, 0, _n[3]), s5 = infix(SA, _n[3], _n[3] + _n[5]); 680 String<TSize, typename Skew7StringSpec_<TSA>::Type> SA0, SA3, SA5, SA6; 681 682 resize(SA0, _n[0], Exact()); 683 resize(SA3, _n[3], Exact()); 684 resize(SA5, _n[5], Exact()); 685 resize(SA6, _n[6], Exact()); 686 687 // stably sort the mod 5 and mod 3 suffixes from SA124 by their first character 688 { 689 for (TSize i=0, j3=0, j5=0, l; i < _n124; i++) { 690 l = SA124[i]; 691 if (l < _n[4]) { 692 if ((l = _o[4] + (7 * l)) > 0) 693 s5[j5++] = l - 1; 694 } else if (l < _n24) { 695 if ((l = _o[2] + (7 * (l - _n[4]))) > 0) 696 s3[j3++] = l - 1; 697 } 698 } 699 700 { 701 String<TSize, Alloc<> > cnt; 702 resize(cnt, K, Exact()); // counter array 703 704 radixPass(SA3, s3, s, cnt, K); 705 SEQAN_PROMARK("SA3 konstruiert"); 706 707 radixPass(SA5, s5, s, cnt, K); 708 SEQAN_PROMARK("SA5 konstruiert"); 709 710 // stably sort the mod 6 suffixes from SA5 by their first character 711 712 if (_n[5] == _n[6]) radixExtend (SA6, SA5, s, cnt, K); 713 else radixExtendClip(SA6, SA5, s, cnt, K); 714 SEQAN_PROMARK("SA6 konstruiert"); 715 716 // stably sort the mod 0 suffixes from SA6 by their first character 717 718 if (_n[6] == _n[0]) radixExtend (SA0, SA6, s, cnt, K); 719 else radixExtendClip(SA0, SA6, s, cnt, K); 720 SEQAN_PROMARK("SA0 konstruiert"); 721 } 722 } 723 724 // MULTIWAY MERGE all SA_ streams 725 { 726 // a helper matrix to lex-name-compare every combination of suffixes 727 TSize adjust[7][7] = 728 // 0 1 2 3 4 5 6 729 {{0 , _n124-_n[0] , _n24-_n[0] , _n124-_n[0] , _n[4]-_n[0] , _n[4]-_n[0] , _n24-_n[0] }, // 0 730 {1-_n[1] , 0 , 0 , 1-_n[1] , 0 , 1-_n[1]-_n[2] , 1-_n[1]-_n[2] }, // 1* 731 {1-_n[2] , 0 , 0 , _n[1] , 0 , _n[1] , 1-_n[2] }, // 2* 732 {1+_n[4]-_n[3] , 1+_n[4]-_n[3] , _n24-_n[3] , 0 , _n124-_n[3] , _n24-_n[3] , _n124-_n[3] }, // 3 733 {_n[1]+_n[2] , 0 , 0 ,_n[2] , 0 , _n[1]+_n[2] , _n[2] }, // 4* 734 {_n24-_n[5] , _n124-_n[5] , _n[4]-_n[5] , _n[4]-_n[5] , _n24-_n[5] , 0 , _n124-_n[5] }, // 5 735 {_n124-_n[6] , _n24-_n[6] , _n124-_n[6] , _n[4]-_n[6] , _n[4]-_n[6] , _n24-_n[6] , 0 }}; // 6 736 737 TSAIter pos[7] = {begin(SA0, Standard()) , begin(SA124, Standard()) , begin(SA124, Standard()) , 738 begin(SA3, Standard()) , begin(SA124, Standard()) , begin(SA5, Standard()) , begin(SA6, Standard()) }; 739 TSAIter max[7] = {begin(SA0, Standard())+_n[0], begin(SA124, Standard())+_n124, begin(SA124, Standard())+_n124, 740 begin(SA3, Standard())+_n[3], begin(SA124, Standard())+_n124, begin(SA5, Standard())+_n[5] , begin(SA6, Standard())+_n[6]}; 741 TValueIter spos[7]; 742 TSize tpos[7]; 743 bool islast[7]; 744 745 int a, b, rank[5]; 746 int fill = 0; 747 TSize k = 0; 748 749 #define SEQAN_GET_ISKEW7(ii) (ii < _n[4] ? (ii * 7) + _o[4] : (ii < _n24 ? ((ii - _n[4]) * 7) + _o[2] : ((ii - _n24) * 7) + _o[1])) 750 #define SEQAN_GET_ASKEW7(ii) (ii < _n[4] ? 4 : (ii < _n24 ? 2 : 1)) 751 752 // fill the stream ranking list 753 for(int i = 0; i < 7; ++i) { 754 if (!_n[i]) continue; 755 if (i == 2 || i == 4) continue; // insert only the least suffix of SA124 756 757 if (i == 1) { 758 759 TSize ii = *(pos[1]); 760 a = SEQAN_GET_ASKEW7(ii); 761 TSize j = SEQAN_GET_ISKEW7(ii); 762 763 tpos[a] = ii; 764 spos[a] = begin(s, Standard()) + j; 765 islast[a] = (j + 7 >= n); 766 767 } else { 768 769 a = i; 770 TSize j = *(pos[a]); 771 772 tpos[a] = j / 7; 773 spos[a] = begin(s, Standard()) + j; 774 islast[a] = (j + 7 >= n); 775 776 } 777 778 // get the rank of stream a's suffix 779 int j; 780 for(j = 0; j < fill; ++j) { 781 b = rank[j]; 782 if (_leqSkew7 (a, b, spos, tpos, islast, s124, adjust)) break; 783 } 784 785 // insert the suffix 786 for(int i = fill; i > j; --i) 787 rank[i] = rank[i-1]; 788 rank[j] = a; 789 fill++; 790 } 791 792 // main merge loop 793 // in order to find the least suffix in every step we use a stream ranking list 794 // so we only need to keep up the ordering and thus rank[0] is always the least suffix 795 while (fill > 1) { 796 797 // add the least suffix to SA and get the next of the corresponding stream 798 a = rank[0]; 799 SA[k++] = spos[a] - begin(s, Standard()); 800 if (a == 1 || a == 2 || a == 4) 801 pos[4] = pos[2] = ++pos[1]; 802 else 803 ++pos[a]; 804 805 if (pos[a] < max[a]) { 806 807 // set corresponding spos, tpos, islast values and adapt a if necessary 808 if (a == 1 || a == 2 || a == 4) { 809 810 TSize ii = *(pos[1]); 811 a = SEQAN_GET_ASKEW7(ii); 812 TSize j = SEQAN_GET_ISKEW7(ii); 813 814 tpos[a] = ii; 815 spos[a] = begin(s, Standard()) + j; 816 islast[a] = (j + 7 >= n); 817 818 } else { 819 820 TSize j = *(pos[a]); 821 822 tpos[a] = j / 7; 823 spos[a] = begin(s, Standard()) + j; 824 islast[a] = (j + 7 >= n); 825 826 } 827 828 // get the rank of stream a's suffix 829 830 // linear search 831 int right; 832 for(right = 1; right < fill; right++) { 833 b = rank[right]; 834 if (_leqSkew7 (a, b, spos, tpos, islast, s124, adjust)) break; 835 } 836 /* 837 // binary search 838 int left = 0; 839 int right = fill; 840 while (left + 1 != right) { 841 int middle = (left + right) >> 2; 842 if (leq<TValue, TSize> (a, rank[middle], spos, tpos, islast, s124, adjust)) 843 right = middle; 844 else 845 left = middle; 846 }*/ 847 848 // remove the least suffix ... 849 for(int i = 1; i < right; ++i) 850 rank[i-1] = rank[i]; 851 852 // ... and insert the new one 853 rank[right-1] = a; 854 855 } else { 856 // only remove the least suffix 857 fill--; 858 for(int i = 0; i < fill; ++i) 859 rank[i] = rank[i+1]; 860 } 861 } 862 863 // only one (or less) stream left to fill SA with 864 a = rank[0]; 865 if (a == 1 || a == 2 || a == 4) 866 for (; k < n; ++k) { TSize ii = *(pos[1]++); SA[k] = SEQAN_GET_ISKEW7(ii); } 867 else 868 for (; k < n; ++k) SA[k] = *(pos[a]++); 869 } 870 } 871 SEQAN_PROMARK("SA124, SA3, SA5, SA6, SA0 verschmolzen"); 872 873 #ifdef SEQAN_DEBUG_INDEX 874 std::cerr << "left level " << depth << std::endl; 875 #endif 876 SEQAN_PROMARK("Rekursionsaufstieg"); 877 SEQAN_PROSUB(SEQAN_PRODEPTH, 1); 878 } 879 880 template < typename TSA, 881 typename TText > 882 inline void createSuffixArray( 883 TSA &SA, 884 TText &s, 885 Skew7 const &alg, 886 unsigned K, 887 unsigned maxdepth) 888 { 889 createSuffixArray(SA, s, alg, K, maxdepth, 1); 890 } 891 892 // creates suffix array sorted by the first maxLCP chars of suffixes 893 template < typename TSA, 894 typename TText, 895 typename TSize > 896 inline void createSuffixArrayPart( 897 TSA &SA, 898 TText &s, 899 Skew7 const &_dummy, 900 TSize maxLCP, 901 unsigned K) 902 { 903 unsigned depth = 0; 904 for(TSize i = 1; i < maxLCP; i*=7) ++depth; 905 createSuffixArray(SA, s, _dummy, K, depth); 906 } 907 908 909 // creates suffix array sorted by the first maxLCP chars of suffixes 910 template < typename TSA, 911 typename TText, 912 typename TSize > 913 inline void createSuffixArrayPart( 914 TSA &SA, 915 TText &s, 916 Skew7 const &_dummy, 917 TSize maxLCP) 918 { 919 createSuffixArrayPart(SA, s, _dummy, maxLCP, ValueSize< typename Value<TText>::Type >::VALUE); 920 } 921 //} 922 923 } 924 925 #endif 926