1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /**
30  * \file
31  * Thread utilities for sequential prefix scan over statically-sized array types
32  */
33 
34 #pragma once
35 
36 #include "../config.cuh"
37 #include "../thread/thread_operators.cuh"
38 
39 /// Optional outer namespace(s)
40 CUB_NS_PREFIX
41 
42 /// CUB namespace
43 namespace cub {
44 
45 /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
46 namespace internal {
47 
48 
49 /**
50  * \addtogroup UtilModule
51  * @{
52  */
53 
54 /**
55  * \name Sequential prefix scan over statically-sized array types
56  * @{
57  */
58 
59 template <
60     int         LENGTH,
61     typename    T,
62     typename    ScanOp>
ThreadScanExclusive(T inclusive,T exclusive,T * input,T * output,ScanOp scan_op,Int2Type<LENGTH>)63 __device__ __forceinline__ T ThreadScanExclusive(
64     T                   inclusive,
65     T                   exclusive,
66     T                   *input,                 ///< [in] Input array
67     T                   *output,                ///< [out] Output array (may be aliased to \p input)
68     ScanOp              scan_op,                ///< [in] Binary scan operator
69     Int2Type<LENGTH>    /*length*/)
70 {
71     #pragma unroll
72     for (int i = 0; i < LENGTH; ++i)
73     {
74         inclusive = scan_op(exclusive, input[i]);
75         output[i] = exclusive;
76         exclusive = inclusive;
77     }
78 
79     return inclusive;
80 }
81 
82 
83 
84 /**
85  * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
86  *
87  * \tparam LENGTH     LengthT of \p input and \p output arrays
88  * \tparam T          <b>[inferred]</b> The data type to be scanned.
89  * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
90  */
91 template <
92     int         LENGTH,
93     typename    T,
94     typename    ScanOp>
ThreadScanExclusive(T * input,T * output,ScanOp scan_op,T prefix,bool apply_prefix=true)95 __device__ __forceinline__ T ThreadScanExclusive(
96     T           *input,                 ///< [in] Input array
97     T           *output,                ///< [out] Output array (may be aliased to \p input)
98     ScanOp      scan_op,                ///< [in] Binary scan operator
99     T           prefix,                 ///< [in] Prefix to seed scan with
100     bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  If not, the first output element is undefined.  (Handy for preventing thread-0 from applying a prefix.)
101 {
102     T inclusive = input[0];
103     if (apply_prefix)
104     {
105         inclusive = scan_op(prefix, inclusive);
106     }
107     output[0] = prefix;
108     T exclusive = inclusive;
109 
110     return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
111 }
112 
113 
114 /**
115  * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
116  *
117  * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
118  * \tparam T          <b>[inferred]</b> The data type to be scanned.
119  * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
120  */
121 template <
122     int         LENGTH,
123     typename    T,
124     typename    ScanOp>
ThreadScanExclusive(T (& input)[LENGTH],T (& output)[LENGTH],ScanOp scan_op,T prefix,bool apply_prefix=true)125 __device__ __forceinline__ T ThreadScanExclusive(
126     T           (&input)[LENGTH],       ///< [in] Input array
127     T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
128     ScanOp      scan_op,                ///< [in] Binary scan operator
129     T           prefix,                 ///< [in] Prefix to seed scan with
130     bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
131 {
132     return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
133 }
134 
135 
136 
137 
138 
139 
140 
141 
142 
143 template <
144     int         LENGTH,
145     typename    T,
146     typename    ScanOp>
ThreadScanInclusive(T inclusive,T * input,T * output,ScanOp scan_op,Int2Type<LENGTH>)147 __device__ __forceinline__ T ThreadScanInclusive(
148     T                   inclusive,
149     T                   *input,                 ///< [in] Input array
150     T                   *output,                ///< [out] Output array (may be aliased to \p input)
151     ScanOp              scan_op,                ///< [in] Binary scan operator
152     Int2Type<LENGTH>    /*length*/)
153 {
154     #pragma unroll
155     for (int i = 0; i < LENGTH; ++i)
156     {
157         inclusive = scan_op(inclusive, input[i]);
158         output[i] = inclusive;
159     }
160 
161     return inclusive;
162 }
163 
164 
165 /**
166  * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array.  The aggregate is returned.
167  *
168  * \tparam LENGTH     LengthT of \p input and \p output arrays
169  * \tparam T          <b>[inferred]</b> The data type to be scanned.
170  * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
171  */
172 template <
173     int         LENGTH,
174     typename    T,
175     typename    ScanOp>
ThreadScanInclusive(T * input,T * output,ScanOp scan_op)176 __device__ __forceinline__ T ThreadScanInclusive(
177     T           *input,                 ///< [in] Input array
178     T           *output,                ///< [out] Output array (may be aliased to \p input)
179     ScanOp      scan_op)                ///< [in] Binary scan operator
180 {
181     T inclusive = input[0];
182     output[0] = inclusive;
183 
184     // Continue scan
185     return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
186 }
187 
188 
189 /**
190  * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array.  The aggregate is returned.
191  *
192  * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
193  * \tparam T          <b>[inferred]</b> The data type to be scanned.
194  * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
195  */
196 template <
197     int         LENGTH,
198     typename    T,
199     typename    ScanOp>
ThreadScanInclusive(T (& input)[LENGTH],T (& output)[LENGTH],ScanOp scan_op)200 __device__ __forceinline__ T ThreadScanInclusive(
201     T           (&input)[LENGTH],       ///< [in] Input array
202     T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
203     ScanOp      scan_op)                ///< [in] Binary scan operator
204 {
205     return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
206 }
207 
208 
209 /**
210  * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
211  *
212  * \tparam LENGTH     LengthT of \p input and \p output arrays
213  * \tparam T          <b>[inferred]</b> The data type to be scanned.
214  * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
215  */
216 template <
217     int         LENGTH,
218     typename    T,
219     typename    ScanOp>
ThreadScanInclusive(T * input,T * output,ScanOp scan_op,T prefix,bool apply_prefix=true)220 __device__ __forceinline__ T ThreadScanInclusive(
221     T           *input,                 ///< [in] Input array
222     T           *output,                ///< [out] Output array (may be aliased to \p input)
223     ScanOp      scan_op,                ///< [in] Binary scan operator
224     T           prefix,                 ///< [in] Prefix to seed scan with
225     bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
226 {
227     T inclusive = input[0];
228     if (apply_prefix)
229     {
230         inclusive = scan_op(prefix, inclusive);
231     }
232     output[0] = inclusive;
233 
234     // Continue scan
235     return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
236 }
237 
238 
239 /**
240  * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
241  *
242  * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
243  * \tparam T          <b>[inferred]</b> The data type to be scanned.
244  * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
245  */
246 template <
247     int         LENGTH,
248     typename    T,
249     typename    ScanOp>
ThreadScanInclusive(T (& input)[LENGTH],T (& output)[LENGTH],ScanOp scan_op,T prefix,bool apply_prefix=true)250 __device__ __forceinline__ T ThreadScanInclusive(
251     T           (&input)[LENGTH],       ///< [in] Input array
252     T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
253     ScanOp      scan_op,                ///< [in] Binary scan operator
254     T           prefix,                 ///< [in] Prefix to seed scan with
255     bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
256 {
257     return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
258 }
259 
260 
261 //@}  end member group
262 
263 /** @} */       // end group UtilModule
264 
265 
266 }               // internal namespace
267 }               // CUB namespace
268 CUB_NS_POSTFIX  // Optional outer namespace(s)
269