1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34 
35 #ifndef SEQAN_HEADER_PIPE_SAMPLER_H
36 #define SEQAN_HEADER_PIPE_SAMPLER_H
37 
38 namespace seqan
39 {
40 
41 //namespace SEQAN_NAMESPACE_PIPELINING
42 //{
43 
44     template <int I, typename T = void>
45     struct SkewDC_;
46 
47 //////////////////////////////////////////////////////////////////////////////
48 
49     template < unsigned m, typename TPack = Pack >
50     struct Sampler;
51 
52     template < typename TInput, unsigned m, typename TPack >
53     struct Value< Pipe< TInput, Sampler<m, TPack> > >
54     {
55         typedef Tuple<typename Value<TInput>::Type, m, TPack>    mTuple;
56         typedef Pair<typename Size<TInput>::Type, mTuple, Pack> Type;
57     };
58 
59 //////////////////////////////////////////////////////////////////////////////
60 
61     template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString >
62     struct Value< Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > >
63     {
64         typedef Tuple<typename Value<TInput>::Type, m, TPack>    mTuple;
65         typedef Pair<TPair, mTuple, Pack> Type;
66     };
67 
68 //////////////////////////////////////////////////////////////////////////////
69 
70 // TODO(holtgrew): Documentation bug with DC!
71 
72 /*!
73  * @class Sampler
74  * @extends Pipe
75  * @headerfile <seqan/pipe.h>
76  * @brief Outputs m-tuples beginning at a position of difference cover DC.
77  *
78  * @signature template <typename TInput, unsigned M[, typename TPack]>
79  *            class Pipe<TInput, Sampler<M, TPack> >;
80  *
81  * @tparam TInput The type of the pipeline module this module reads from.
82  * @tparam m      The tuple size.
83  * @tparam TPack  Specifies the packing method of the tuples (<tt>void</tt> = no packing), default is <tt>Pack</tt>.
84  *
85  * The output type is a Pair of size type and Tuple of input elements and length m (i.e. <tt>Pair&lt;Size&lt;TInput&gt;::Type,
86  * Tuple&lt;Value&lt;TInput&gt;::Type, m, TPack&gt; &gt;</tt>).
87  *
88  * The first output field contains the number of remaining pipe elements. The m-tuple in the second field contains the
89  * first m elements of them. The m-tuples are substrings of the input stream beginning at positions <tt>i</tt>, with
90  * <tt>(n-i) mod m</tt> is element of the set DC (n is the input stream length).
91  *
92  * @section Examples
93  *
94  * The set <tt>{1,2,4}</tt> is represented by <tt>int DC[] = { 3, 1, 2, 4 }</tt>.
95  *
96  * @see BitPackedTuple
97  */
98 
99     //////////////////////////////////////////////////////////////////////////////
100     // sampler class
101     template < typename TInput, unsigned m, typename TPack >
102     struct Pipe< TInput, Sampler<m, TPack> >
103     {
104         typedef typename Value<Pipe>::Type  TOutValue;
105         typedef typename Size<Pipe>::Type   TSize;
106 
107         TInput        &in;
108         bool        filter[m];
109         TSize       idx, _size, _rest;
110         unsigned    idxMod;
111         TOutValue   tmp1, tmp2;
112         TOutValue   *outRef, *tmpRef;
113         bool        last;
114 
115         Pipe(TInput& _in):
116             in(_in),
117             outRef(&tmp1),
118             tmpRef(&tmp2) {}
119 
120         inline void prepare()
121         {
122             for (unsigned i = 0; i < m; ++i)
123                 filter[i] = 0;
124             for(unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; i++)
125                 filter[SkewDC_<m>::VALUE[i]] = true;
126 
127             idx = length(in);
128             idxMod = idx % m;
129 
130             while (!filter[idxMod] && !eof(in))
131             {
132                 ++in;
133                 if (idxMod == 0) idxMod = m;
134                 --idxMod; --idx;
135             }
136             _rest = length(*this);
137             fill();
138             swap();
139         }
140 
141         inline void fill()
142         {
143             unsigned i;
144             for(i = 0; i < m && !eof(in); ++i, ++in)
145                 tmpRef->i2.i[i] = *in;
146             last = eof(in);
147             for(; i < m; ++i)
148                 tmpRef->i2.i[i] = 0;
149             tmpRef->i1 = idx;
150         }
151 
152         inline void rotate(unsigned r)
153         {
154             for(unsigned i = 0; i < m; ++i, ++r)
155             {
156                 if (r == m) r = 0;
157                 tmpRef->i2.i[i] = outRef->i2.i[r];
158             }
159         }
160 
161         inline void swap()
162         {
163             TOutValue *newOutRef = tmpRef;
164             tmpRef = outRef;
165             outRef = newOutRef;
166         }
167 
168         inline TOutValue const& operator*()
169         {
170             return *outRef;
171         }
172 
173         Pipe& operator++()
174         {
175             unsigned skipped = 0;
176             if (--_rest)
177             {
178                 if (!last)
179                 {
180                     do
181                     {
182                         outRef->i2.i[skipped++] = *in;
183                         ++in;
184                         if (idxMod == 0) idxMod = m;
185                         --idxMod; --idx;
186                         if (eof(in))
187                         {
188                             last = true;
189                             while (!filter[idxMod])
190                             {
191                                 outRef->i2.i[skipped++] = 0;
192                                 if (idxMod == 0) idxMod = m;
193                                 --idxMod; --idx;
194                             }
195                             break;
196                         }
197                     } while (!filter[idxMod]);
198                 }
199                 else
200                 {
201                     do
202                     {
203                         outRef->i2.i[skipped++] = 0;
204                         if (idxMod == 0) idxMod = m;
205                         --idxMod; --idx;
206                     }
207                     while (!filter[idxMod]);
208                 }
209             }
210             rotate(skipped);
211             tmpRef->i1 = idx;
212             swap();
213             return *this;
214         }
215     };
216 
217 
218     //////////////////////////////////////////////////////////////////////////////
219     // sampler class (uses packing)
220     template < typename TInput, unsigned m >
221     struct Pipe< TInput, Sampler<m, BitPacked<> > >
222     {
223         typedef typename Value<Pipe>::Type          TOutValue;
224         typedef typename Size<Pipe>::Type           TSize;
225         typedef typename Value<TOutValue, 2>::Type  TTuple;
226 
227         TInput        &in;
228         bool        filter[m];
229         TSize       _size, _rest;
230         unsigned    idxMod;
231         TOutValue   tmp;
232         bool        last;
233 
234         Pipe(TInput & _in) : in(_in), _size(0), _rest(0), idxMod(0), last(false)
235         {}
236 
237         inline void prepare()
238         {
239             for (unsigned i = 0; i < m; ++i)
240                 filter[i] = 0;
241             for (unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; i++)
242                 filter[SkewDC_<m>::VALUE[i]] = true;
243 
244             tmp.i1 = length(in);
245             idxMod = tmp.i1 % m;
246 
247             while (!filter[idxMod] && !eof(in))
248             {
249                 ++in;
250                 if (idxMod == 0) idxMod = m;
251                 --idxMod; --tmp.i1;
252             }
253             _rest = length(*this);
254             fill();
255         }
256 
257         inline void fill()
258         {
259             unsigned i;
260             clear(tmp.i2);
261             for(i = 0; i < m && !eof(in); ++i, ++in)
262             {
263                 tmp.i2 <<= 1;
264                 tmp.i2 |= *in;
265             }
266             last = eof(in);
267             tmp.i2 <<= (m - i);
268         }
269 
270         inline TOutValue const& operator*()
271         {
272             return tmp;
273         }
274 
275         Pipe& operator++()
276         {
277             if (--_rest)
278             {
279                 if (!last)
280                 {
281                     do
282                     {
283                         tmp.i2 <<= 1;
284                         tmp.i2 |= *in;
285                         ++in;
286                         if (idxMod == 0) idxMod = m;
287                         --idxMod; --tmp.i1;
288                         if (eof(in))
289                         {
290                             last = true;
291                             while (!filter[idxMod])
292                             {
293                                 tmp.i2 <<= 1;
294                                 if (idxMod == 0) idxMod = m;
295                                 --idxMod; --tmp.i1;
296                             }
297                             break;
298                         }
299                     } while (!filter[idxMod]);
300                 }
301                 else
302                 {
303                     do
304                     {
305                         tmp.i2 <<= 1;
306                         if (idxMod == 0) idxMod = m;
307                         --idxMod; --tmp.i1;
308                     }
309                     while (!filter[idxMod]);
310                 }
311             }
312             return *this;
313         }
314     };
315 
316 
317 
318     //////////////////////////////////////////////////////////////////////////////
319     // global pipe functions
320     template < typename TInput, unsigned m, typename TPack >
321     inline bool control(Pipe< TInput, Sampler<m, TPack> > &me, ControlBeginRead const &command)
322     {
323         if (!control(me.in, command)) return false;
324         me.prepare();
325         return true;
326     }
327 
328     template < typename TInput, unsigned m, typename TPack >
329     inline bool control(Pipe< TInput, Sampler<m, TPack> > &me, ControlEof const &/*command*/)
330     {
331         return me._rest == 0;
332     }
333 
334     template < typename TInput, unsigned m, typename TPack >
335     inline typename Size< Pipe< TInput, Sampler<m, TPack> > >::Type
336     length(Pipe< TInput, Sampler<m, TPack> > const &me)
337     {
338         typedef typename Size< Pipe< TInput, Sampler<m> > >::Type TSize;
339 
340         TSize sum = 0;
341         TSize size = length(me.in);
342 
343         // sum up the number of tuples in each residue class
344         // for a string of length n there are 1+(n-x)/m suffixes
345         // whose lengths are in residue class x
346         for (unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; ++i)
347         {
348             sum += (size + m - SkewDC_<m>::VALUE[i]) / m;
349             if (SkewDC_<m>::VALUE[i] == 0)
350                 --sum;  // we don't consider empty suffixes
351         }
352         return sum;
353     }
354 
355 
356 
357 //////////////////////////////////////////////////////////////////////////////
358 
359     template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString >
360     struct Size< Pipe<TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > > :
361         public Value<TLimitsString> {};
362 
363     //////////////////////////////////////////////////////////////////////////////
364     // sampler class
365     template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString >
366     struct Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> >
367     {
368         typedef typename Value<Pipe>::Type  TOutValue;
369         typedef typename Size<Pipe>::Type   TSize;
370 
371         typedef PairDecrementer_<TPair, TLimitsString, m> Decrementer;
372 
373         TInput        &in;
374         bool        filter[m];
375         Decrementer    localPos;
376         TSize       _size, _rest;
377         TOutValue   tmp1, tmp2;
378         TOutValue   *outRef, *tmpRef;
379         bool        last;
380 
381         TLimitsString const &limits;
382 
383         template <typename TLimitsString_>
384         Pipe(TInput& _in, TLimitsString_ &_limits):  // const &_limits is intentionally omitted to suppress implicit casts (if types mismatch) and taking refs of them
385             in(_in),
386             outRef(&tmp1),
387             tmpRef(&tmp2),
388             limits(_limits) {}
389 
390         inline void prepare()
391         {
392             for (unsigned i = 0; i < m; ++i)
393                 filter[i] = 0;
394             for(unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; i++)
395                 filter[SkewDC_<m>::VALUE[i]] = true;
396 
397             setHost(localPos, limits);
398 
399             while (!filter[localPos.residue] && !eof(in))
400             {
401                 ++in;
402                 --localPos;
403             }
404             _rest = length(*this);
405             fill();
406             swap();
407         }
408 
409         inline void fill()
410         {
411             unsigned i;
412             for(i = 0; i < m && !eof(in); ++i, ++in)
413                 tmpRef->i2.i[i] = *in;
414             last = eof(in);
415             for(; i < m; ++i)
416                 tmpRef->i2.i[i] = 0;
417             tmpRef->i1 = localPos;
418         }
419 
420         inline void rotate(unsigned r)
421         {
422             for(unsigned i = 0; i < m; ++i, ++r)
423             {
424                 if (r == m) r = 0;
425                 tmpRef->i2.i[i] = outRef->i2.i[r];
426             }
427         }
428 
429         inline void swap()
430         {
431             TOutValue *newOutRef = tmpRef;
432             tmpRef = outRef;
433             outRef = newOutRef;
434         }
435 
436         inline TOutValue const& operator*()
437         {
438             return *outRef;
439         }
440 
441         Pipe& operator++()
442         {
443             unsigned skipped = 0;
444             if (--_rest)
445             {
446                 if (!last)
447                 {
448                     do
449                     {
450                         outRef->i2.i[skipped++] = *in;
451                         ++in;
452                         --localPos;
453                         if (eof(in))
454                         {
455                             last = true;
456                             while (!filter[localPos.residue])
457                             {
458                                 outRef->i2.i[skipped++] = 0;
459                                 --localPos;
460                             }
461                             break;
462                         }
463                     }
464                     while (!filter[localPos.residue]);
465                 }
466                 else
467                 {
468                     do
469                     {
470                         outRef->i2.i[skipped++] = 0;
471                         --localPos;
472                     }
473                     while (!filter[localPos.residue]);
474                 }
475                 rotate(skipped);
476                 tmpRef->i1 = localPos;
477                 swap();
478             }
479             return *this;
480         }
481     };
482 
483 
484     //////////////////////////////////////////////////////////////////////////////
485     // sampler class (uses bit-packing)
486     template < typename TInput, unsigned m, typename TPair, typename TLimitsString >
487     struct Pipe< TInput, Multi<Sampler<m, BitPacked<> >, TPair, TLimitsString> >
488     {
489         typedef typename Value<Pipe>::Type          TOutValue;
490         typedef typename Size<Pipe>::Type           TSize;
491         typedef typename Value<TOutValue, 2>::Type  TTuple;
492 
493         typedef PairDecrementer_<TPair, TLimitsString, m> Decrementer;
494 
495         TInput        &in;
496         bool        filter[m];
497         TSize       _size, _rest;
498         Decrementer localPos;
499         TOutValue   tmp;
500         bool        last;
501 
502         TLimitsString const &limits;
503 
504         template <typename TLimitsString_>
505         Pipe(TInput& _in, TLimitsString_ &_limits):  // const &_limits is intentionally omitted to suppress implicit casts (if types mismatch) and taking refs of them
506             in(_in),
507             limits(_limits) {}
508 
509         inline void prepare()
510         {
511             for (unsigned i = 0; i < m; ++i)
512                 filter[i] = 0;
513             for(unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; i++)
514                 filter[SkewDC_<m>::VALUE[i]] = true;
515 
516             setHost(localPos, limits);
517 
518             while (!filter[localPos.residue] && !eof(in)) {
519                 ++in;
520                 --localPos;
521             }
522             _rest = length(*this);
523             fill();
524         }
525 
526         inline void fill()
527         {
528             unsigned i;
529             clear(tmp.i2);
530             for(i = 0; i < m && !eof(in); ++i, ++in) {
531                 tmp.i2 <<= 1;
532                 tmp.i2 |= *in;
533             }
534             last = eof(in);
535             tmp.i2 <<= (m - i);
536             tmp.i1 = localPos;
537         }
538 
539         inline TOutValue const& operator*()
540         {
541             return tmp;
542         }
543 
544         Pipe& operator++()
545         {
546             if (--_rest)
547             {
548                 if (!last)
549                 {
550                     do
551                     {
552                         tmp.i2 <<= 1;
553                         tmp.i2 |= *in;
554                         ++in;
555                         --localPos;
556                         if (eof(in))
557                         {
558                             last = true;
559                             while (!filter[localPos.residue])
560                             {
561                                 tmp.i2 <<= 1;
562                                 --localPos;
563                             }
564                             break;
565                         }
566                     }
567                     while (!filter[localPos.residue]);
568                 }
569                 else
570                 {
571                     do
572                     {
573                         tmp.i2 <<= 1;
574                         --localPos;
575                     } while (!filter[localPos.residue]);
576                 }
577             }
578             tmp.i1 = localPos;
579             return *this;
580         }
581     };
582 
583 
584 
585     //////////////////////////////////////////////////////////////////////////////
586     // global pipe functions
587     template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString >
588     inline bool control(Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > &me, ControlBeginRead const &command)
589     {
590         if (!control(me.in, command))
591             return false;
592         me.prepare();
593         return true;
594     }
595 
596     template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString >
597     inline bool control(Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > &me, ControlEof const &/*command*/)
598     {
599         return me._rest == 0;
600     }
601 
602     template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString >
603     inline bool control(Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > &me, ControlEos const &/*command*/)
604     {
605         return control(me, ControlEof());
606     }
607 
608     template < typename TInput, unsigned m, typename TPack, typename TPair, typename TLimitsString >
609     inline typename Size< Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > >::Type
610     length(Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > const &me)
611     {
612         typedef typename Size< Pipe< TInput, Multi<Sampler<m, TPack>, TPair, TLimitsString> > >::Type TSize;
613 
614         if (empty(me.limits))
615             return 0;
616 
617         TSize sum = 0;
618         TLimitsString const &limits = me.limits;
619         int64_t seqCountPlusOne = length(me.limits);
620 
621         SEQAN_OMP_PRAGMA(parallel for reduction(+:sum) if (seqCountPlusOne > 99))
622         for (int64_t i = 1; i < seqCountPlusOne; ++i)
623         {
624             TSize prev = limits[i - 1];
625             TSize cur = limits[i];
626 
627             SEQAN_ASSERT_LEQ(prev, cur);
628             TSize size = cur - prev;
629 
630             // sum up the number of tuples in each residue class
631             // for a string of length n there are 1+(n-x)/m suffixes
632             // whose lengths are in residue class x
633             for (unsigned i = 1; i <= SkewDC_<m>::VALUE[0]; ++i)
634             {
635                 sum += (size + m - SkewDC_<m>::VALUE[i]) / m;
636                 if (SkewDC_<m>::VALUE[i] == 0)
637                     --sum;  // we don't consider empty suffixes
638             }
639         }
640 
641         return sum;
642     }
643 //}
644 
645 }
646 
647 #endif
648