1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: David Weese <david.weese@fu-berlin.de> 33 // ========================================================================== 34 35 #ifndef SEQAN_HEADER_PIPE_SAMPLER_H 36 #define SEQAN_HEADER_PIPE_SAMPLER_H 37 38 namespace seqan 39 { 40 41 //namespace SEQAN_NAMESPACE_PIPELINING 42 //{ 43 44 template <int I, typename T = void> 45 struct SkewDC_; 46 47 ////////////////////////////////////////////////////////////////////////////// 48 49 template < unsigned m, typename TPack = Pack > 50 struct Sampler; 51 52 template < typename TInput, unsigned m, typename TPack > 53 struct Value< Pipe< TInput, Sampler<m, TPack> > > 54 { 55 typedef Tuple<typename Value<TInput>::Type, m, TPack> mTuple; 56 typedef Pair<typename Size<TInput>::Type, mTuple, Pack> Type; 57 }; 58 59 ////////////////////////////////////////////////////////////////////////////// 60 61 template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString > 62 struct Value< Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > > 63 { 64 typedef Tuple<typename Value<TInput>::Type, m, TPack> mTuple; 65 typedef Pair<TPair, mTuple, Pack> Type; 66 }; 67 68 ////////////////////////////////////////////////////////////////////////////// 69 70 // TODO(holtgrew): Documentation bug with DC! 71 72 /*! 73 * @class Sampler 74 * @extends Pipe 75 * @headerfile <seqan/pipe.h> 76 * @brief Outputs m-tuples beginning at a position of difference cover DC. 77 * 78 * @signature template <typename TInput, unsigned M[, typename TPack]> 79 * class Pipe<TInput, Sampler<M, TPack> >; 80 * 81 * @tparam TInput The type of the pipeline module this module reads from. 82 * @tparam m The tuple size. 83 * @tparam TPack Specifies the packing method of the tuples (<tt>void</tt> = no packing), default is <tt>Pack</tt>. 84 * 85 * The output type is a Pair of size type and Tuple of input elements and length m (i.e. <tt>Pair<Size<TInput>::Type, 86 * Tuple<Value<TInput>::Type, m, TPack> ></tt>). 87 * 88 * The first output field contains the number of remaining pipe elements. The m-tuple in the second field contains the 89 * first m elements of them. The m-tuples are substrings of the input stream beginning at positions <tt>i</tt>, with 90 * <tt>(n-i) mod m</tt> is element of the set DC (n is the input stream length). 91 * 92 * @section Examples 93 * 94 * The set <tt>{1,2,4}</tt> is represented by <tt>int DC[] = { 3, 1, 2, 4 }</tt>. 95 * 96 * @see BitPackedTuple 97 */ 98 99 ////////////////////////////////////////////////////////////////////////////// 100 // sampler class 101 template < typename TInput, unsigned m, typename TPack > 102 struct Pipe< TInput, Sampler<m, TPack> > 103 { 104 typedef typename Value<Pipe>::Type TOutValue; 105 typedef typename Size<Pipe>::Type TSize; 106 107 TInput ∈ 108 bool filter[m]; 109 TSize idx, _size, _rest; 110 unsigned idxMod; 111 TOutValue tmp1, tmp2; 112 TOutValue *outRef, *tmpRef; 113 bool last; 114 115 Pipe(TInput& _in): 116 in(_in), 117 outRef(&tmp1), 118 tmpRef(&tmp2) {} 119 120 inline void prepare() 121 { 122 for (unsigned i = 0; i < m; ++i) 123 filter[i] = 0; 124 for(unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; i++) 125 filter[SkewDC_<m>::VALUE[i]] = true; 126 127 idx = length(in); 128 idxMod = idx % m; 129 130 while (!filter[idxMod] && !eof(in)) 131 { 132 ++in; 133 if (idxMod == 0) idxMod = m; 134 --idxMod; --idx; 135 } 136 _rest = length(*this); 137 fill(); 138 swap(); 139 } 140 141 inline void fill() 142 { 143 unsigned i; 144 for(i = 0; i < m && !eof(in); ++i, ++in) 145 tmpRef->i2.i[i] = *in; 146 last = eof(in); 147 for(; i < m; ++i) 148 tmpRef->i2.i[i] = 0; 149 tmpRef->i1 = idx; 150 } 151 152 inline void rotate(unsigned r) 153 { 154 for(unsigned i = 0; i < m; ++i, ++r) 155 { 156 if (r == m) r = 0; 157 tmpRef->i2.i[i] = outRef->i2.i[r]; 158 } 159 } 160 161 inline void swap() 162 { 163 TOutValue *newOutRef = tmpRef; 164 tmpRef = outRef; 165 outRef = newOutRef; 166 } 167 168 inline TOutValue const& operator*() 169 { 170 return *outRef; 171 } 172 173 Pipe& operator++() 174 { 175 unsigned skipped = 0; 176 if (--_rest) 177 { 178 if (!last) 179 { 180 do 181 { 182 outRef->i2.i[skipped++] = *in; 183 ++in; 184 if (idxMod == 0) idxMod = m; 185 --idxMod; --idx; 186 if (eof(in)) 187 { 188 last = true; 189 while (!filter[idxMod]) 190 { 191 outRef->i2.i[skipped++] = 0; 192 if (idxMod == 0) idxMod = m; 193 --idxMod; --idx; 194 } 195 break; 196 } 197 } while (!filter[idxMod]); 198 } 199 else 200 { 201 do 202 { 203 outRef->i2.i[skipped++] = 0; 204 if (idxMod == 0) idxMod = m; 205 --idxMod; --idx; 206 } 207 while (!filter[idxMod]); 208 } 209 } 210 rotate(skipped); 211 tmpRef->i1 = idx; 212 swap(); 213 return *this; 214 } 215 }; 216 217 218 ////////////////////////////////////////////////////////////////////////////// 219 // sampler class (uses packing) 220 template < typename TInput, unsigned m > 221 struct Pipe< TInput, Sampler<m, BitPacked<> > > 222 { 223 typedef typename Value<Pipe>::Type TOutValue; 224 typedef typename Size<Pipe>::Type TSize; 225 typedef typename Value<TOutValue, 2>::Type TTuple; 226 227 TInput ∈ 228 bool filter[m]; 229 TSize _size, _rest; 230 unsigned idxMod; 231 TOutValue tmp; 232 bool last; 233 234 Pipe(TInput & _in) : in(_in), _size(0), _rest(0), idxMod(0), last(false) 235 {} 236 237 inline void prepare() 238 { 239 for (unsigned i = 0; i < m; ++i) 240 filter[i] = 0; 241 for (unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; i++) 242 filter[SkewDC_<m>::VALUE[i]] = true; 243 244 tmp.i1 = length(in); 245 idxMod = tmp.i1 % m; 246 247 while (!filter[idxMod] && !eof(in)) 248 { 249 ++in; 250 if (idxMod == 0) idxMod = m; 251 --idxMod; --tmp.i1; 252 } 253 _rest = length(*this); 254 fill(); 255 } 256 257 inline void fill() 258 { 259 unsigned i; 260 clear(tmp.i2); 261 for(i = 0; i < m && !eof(in); ++i, ++in) 262 { 263 tmp.i2 <<= 1; 264 tmp.i2 |= *in; 265 } 266 last = eof(in); 267 tmp.i2 <<= (m - i); 268 } 269 270 inline TOutValue const& operator*() 271 { 272 return tmp; 273 } 274 275 Pipe& operator++() 276 { 277 if (--_rest) 278 { 279 if (!last) 280 { 281 do 282 { 283 tmp.i2 <<= 1; 284 tmp.i2 |= *in; 285 ++in; 286 if (idxMod == 0) idxMod = m; 287 --idxMod; --tmp.i1; 288 if (eof(in)) 289 { 290 last = true; 291 while (!filter[idxMod]) 292 { 293 tmp.i2 <<= 1; 294 if (idxMod == 0) idxMod = m; 295 --idxMod; --tmp.i1; 296 } 297 break; 298 } 299 } while (!filter[idxMod]); 300 } 301 else 302 { 303 do 304 { 305 tmp.i2 <<= 1; 306 if (idxMod == 0) idxMod = m; 307 --idxMod; --tmp.i1; 308 } 309 while (!filter[idxMod]); 310 } 311 } 312 return *this; 313 } 314 }; 315 316 317 318 ////////////////////////////////////////////////////////////////////////////// 319 // global pipe functions 320 template < typename TInput, unsigned m, typename TPack > 321 inline bool control(Pipe< TInput, Sampler<m, TPack> > &me, ControlBeginRead const &command) 322 { 323 if (!control(me.in, command)) return false; 324 me.prepare(); 325 return true; 326 } 327 328 template < typename TInput, unsigned m, typename TPack > 329 inline bool control(Pipe< TInput, Sampler<m, TPack> > &me, ControlEof const &/*command*/) 330 { 331 return me._rest == 0; 332 } 333 334 template < typename TInput, unsigned m, typename TPack > 335 inline typename Size< Pipe< TInput, Sampler<m, TPack> > >::Type 336 length(Pipe< TInput, Sampler<m, TPack> > const &me) 337 { 338 typedef typename Size< Pipe< TInput, Sampler<m> > >::Type TSize; 339 340 TSize sum = 0; 341 TSize size = length(me.in); 342 343 // sum up the number of tuples in each residue class 344 // for a string of length n there are 1+(n-x)/m suffixes 345 // whose lengths are in residue class x 346 for (unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; ++i) 347 { 348 sum += (size + m - SkewDC_<m>::VALUE[i]) / m; 349 if (SkewDC_<m>::VALUE[i] == 0) 350 --sum; // we don't consider empty suffixes 351 } 352 return sum; 353 } 354 355 356 357 ////////////////////////////////////////////////////////////////////////////// 358 359 template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString > 360 struct Size< Pipe<TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > > : 361 public Value<TLimitsString> {}; 362 363 ////////////////////////////////////////////////////////////////////////////// 364 // sampler class 365 template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString > 366 struct Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > 367 { 368 typedef typename Value<Pipe>::Type TOutValue; 369 typedef typename Size<Pipe>::Type TSize; 370 371 typedef PairDecrementer_<TPair, TLimitsString, m> Decrementer; 372 373 TInput ∈ 374 bool filter[m]; 375 Decrementer localPos; 376 TSize _size, _rest; 377 TOutValue tmp1, tmp2; 378 TOutValue *outRef, *tmpRef; 379 bool last; 380 381 TLimitsString const &limits; 382 383 template <typename TLimitsString_> 384 Pipe(TInput& _in, TLimitsString_ &_limits): // const &_limits is intentionally omitted to suppress implicit casts (if types mismatch) and taking refs of them 385 in(_in), 386 outRef(&tmp1), 387 tmpRef(&tmp2), 388 limits(_limits) {} 389 390 inline void prepare() 391 { 392 for (unsigned i = 0; i < m; ++i) 393 filter[i] = 0; 394 for(unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; i++) 395 filter[SkewDC_<m>::VALUE[i]] = true; 396 397 setHost(localPos, limits); 398 399 while (!filter[localPos.residue] && !eof(in)) 400 { 401 ++in; 402 --localPos; 403 } 404 _rest = length(*this); 405 fill(); 406 swap(); 407 } 408 409 inline void fill() 410 { 411 unsigned i; 412 for(i = 0; i < m && !eof(in); ++i, ++in) 413 tmpRef->i2.i[i] = *in; 414 last = eof(in); 415 for(; i < m; ++i) 416 tmpRef->i2.i[i] = 0; 417 tmpRef->i1 = localPos; 418 } 419 420 inline void rotate(unsigned r) 421 { 422 for(unsigned i = 0; i < m; ++i, ++r) 423 { 424 if (r == m) r = 0; 425 tmpRef->i2.i[i] = outRef->i2.i[r]; 426 } 427 } 428 429 inline void swap() 430 { 431 TOutValue *newOutRef = tmpRef; 432 tmpRef = outRef; 433 outRef = newOutRef; 434 } 435 436 inline TOutValue const& operator*() 437 { 438 return *outRef; 439 } 440 441 Pipe& operator++() 442 { 443 unsigned skipped = 0; 444 if (--_rest) 445 { 446 if (!last) 447 { 448 do 449 { 450 outRef->i2.i[skipped++] = *in; 451 ++in; 452 --localPos; 453 if (eof(in)) 454 { 455 last = true; 456 while (!filter[localPos.residue]) 457 { 458 outRef->i2.i[skipped++] = 0; 459 --localPos; 460 } 461 break; 462 } 463 } 464 while (!filter[localPos.residue]); 465 } 466 else 467 { 468 do 469 { 470 outRef->i2.i[skipped++] = 0; 471 --localPos; 472 } 473 while (!filter[localPos.residue]); 474 } 475 rotate(skipped); 476 tmpRef->i1 = localPos; 477 swap(); 478 } 479 return *this; 480 } 481 }; 482 483 484 ////////////////////////////////////////////////////////////////////////////// 485 // sampler class (uses bit-packing) 486 template < typename TInput, unsigned m, typename TPair, typename TLimitsString > 487 struct Pipe< TInput, Multi<Sampler<m, BitPacked<> >, TPair, TLimitsString> > 488 { 489 typedef typename Value<Pipe>::Type TOutValue; 490 typedef typename Size<Pipe>::Type TSize; 491 typedef typename Value<TOutValue, 2>::Type TTuple; 492 493 typedef PairDecrementer_<TPair, TLimitsString, m> Decrementer; 494 495 TInput ∈ 496 bool filter[m]; 497 TSize _size, _rest; 498 Decrementer localPos; 499 TOutValue tmp; 500 bool last; 501 502 TLimitsString const &limits; 503 504 template <typename TLimitsString_> 505 Pipe(TInput& _in, TLimitsString_ &_limits): // const &_limits is intentionally omitted to suppress implicit casts (if types mismatch) and taking refs of them 506 in(_in), 507 limits(_limits) {} 508 509 inline void prepare() 510 { 511 for (unsigned i = 0; i < m; ++i) 512 filter[i] = 0; 513 for(unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; i++) 514 filter[SkewDC_<m>::VALUE[i]] = true; 515 516 setHost(localPos, limits); 517 518 while (!filter[localPos.residue] && !eof(in)) { 519 ++in; 520 --localPos; 521 } 522 _rest = length(*this); 523 fill(); 524 } 525 526 inline void fill() 527 { 528 unsigned i; 529 clear(tmp.i2); 530 for(i = 0; i < m && !eof(in); ++i, ++in) { 531 tmp.i2 <<= 1; 532 tmp.i2 |= *in; 533 } 534 last = eof(in); 535 tmp.i2 <<= (m - i); 536 tmp.i1 = localPos; 537 } 538 539 inline TOutValue const& operator*() 540 { 541 return tmp; 542 } 543 544 Pipe& operator++() 545 { 546 if (--_rest) 547 { 548 if (!last) 549 { 550 do 551 { 552 tmp.i2 <<= 1; 553 tmp.i2 |= *in; 554 ++in; 555 --localPos; 556 if (eof(in)) 557 { 558 last = true; 559 while (!filter[localPos.residue]) 560 { 561 tmp.i2 <<= 1; 562 --localPos; 563 } 564 break; 565 } 566 } 567 while (!filter[localPos.residue]); 568 } 569 else 570 { 571 do 572 { 573 tmp.i2 <<= 1; 574 --localPos; 575 } while (!filter[localPos.residue]); 576 } 577 } 578 tmp.i1 = localPos; 579 return *this; 580 } 581 }; 582 583 584 585 ////////////////////////////////////////////////////////////////////////////// 586 // global pipe functions 587 template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString > 588 inline bool control(Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > &me, ControlBeginRead const &command) 589 { 590 if (!control(me.in, command)) 591 return false; 592 me.prepare(); 593 return true; 594 } 595 596 template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString > 597 inline bool control(Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > &me, ControlEof const &/*command*/) 598 { 599 return me._rest == 0; 600 } 601 602 template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString > 603 inline bool control(Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > &me, ControlEos const &/*command*/) 604 { 605 return control(me, ControlEof()); 606 } 607 608 template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString > 609 inline typename Size< Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > >::Type 610 length(Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > const &me) 611 { 612 typedef typename Size< Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > >::Type TSize; 613 614 if (empty(me.limits)) 615 return 0; 616 617 TSize sum = 0; 618 TLimitsString const &limits = me.limits; 619 int64_t seqCountPlusOne = length(me.limits); 620 621 SEQAN_OMP_PRAGMA(parallel for reduction(+:sum) if (seqCountPlusOne > 99)) 622 for (int64_t i = 1; i < seqCountPlusOne; ++i) 623 { 624 TSize prev = limits[i - 1]; 625 TSize cur = limits[i]; 626 627 SEQAN_ASSERT_LEQ(prev, cur); 628 TSize size = cur - prev; 629 630 // sum up the number of tuples in each residue class 631 // for a string of length n there are 1+(n-x)/m suffixes 632 // whose lengths are in residue class x 633 for (unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; ++i) 634 { 635 sum += (size + m - SkewDC_<m>::VALUE[i]) / m; 636 if (SkewDC_<m>::VALUE[i] == 0) 637 --sum; // we don't consider empty suffixes 638 } 639 } 640 641 return sum; 642 } 643 //} 644 645 } 646 647 #endif 648