1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /**
30 * \file
31 * Thread utilities for sequential prefix scan over statically-sized array types
32 */
33
34 #pragma once
35
36 #include "../config.cuh"
37 #include "../thread/thread_operators.cuh"
38
39 /// Optional outer namespace(s)
40 CUB_NS_PREFIX
41
42 /// CUB namespace
43 namespace cub {
44
45 /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
46 namespace internal {
47
48
49 /**
50 * \addtogroup UtilModule
51 * @{
52 */
53
54 /**
55 * \name Sequential prefix scan over statically-sized array types
56 * @{
57 */
58
59 template <
60 int LENGTH,
61 typename T,
62 typename ScanOp>
ThreadScanExclusive(T inclusive,T exclusive,T * input,T * output,ScanOp scan_op,Int2Type<LENGTH>)63 __device__ __forceinline__ T ThreadScanExclusive(
64 T inclusive,
65 T exclusive,
66 T *input, ///< [in] Input array
67 T *output, ///< [out] Output array (may be aliased to \p input)
68 ScanOp scan_op, ///< [in] Binary scan operator
69 Int2Type<LENGTH> /*length*/)
70 {
71 #pragma unroll
72 for (int i = 0; i < LENGTH; ++i)
73 {
74 inclusive = scan_op(exclusive, input[i]);
75 output[i] = exclusive;
76 exclusive = inclusive;
77 }
78
79 return inclusive;
80 }
81
82
83
84 /**
85 * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned.
86 *
87 * \tparam LENGTH LengthT of \p input and \p output arrays
88 * \tparam T <b>[inferred]</b> The data type to be scanned.
89 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
90 */
91 template <
92 int LENGTH,
93 typename T,
94 typename ScanOp>
ThreadScanExclusive(T * input,T * output,ScanOp scan_op,T prefix,bool apply_prefix=true)95 __device__ __forceinline__ T ThreadScanExclusive(
96 T *input, ///< [in] Input array
97 T *output, ///< [out] Output array (may be aliased to \p input)
98 ScanOp scan_op, ///< [in] Binary scan operator
99 T prefix, ///< [in] Prefix to seed scan with
100 bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.)
101 {
102 T inclusive = input[0];
103 if (apply_prefix)
104 {
105 inclusive = scan_op(prefix, inclusive);
106 }
107 output[0] = prefix;
108 T exclusive = inclusive;
109
110 return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
111 }
112
113
114 /**
115 * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned.
116 *
117 * \tparam LENGTH <b>[inferred]</b> LengthT of \p input and \p output arrays
118 * \tparam T <b>[inferred]</b> The data type to be scanned.
119 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
120 */
121 template <
122 int LENGTH,
123 typename T,
124 typename ScanOp>
ThreadScanExclusive(T (& input)[LENGTH],T (& output)[LENGTH],ScanOp scan_op,T prefix,bool apply_prefix=true)125 __device__ __forceinline__ T ThreadScanExclusive(
126 T (&input)[LENGTH], ///< [in] Input array
127 T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input)
128 ScanOp scan_op, ///< [in] Binary scan operator
129 T prefix, ///< [in] Prefix to seed scan with
130 bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)
131 {
132 return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
133 }
134
135
136
137
138
139
140
141
142
143 template <
144 int LENGTH,
145 typename T,
146 typename ScanOp>
ThreadScanInclusive(T inclusive,T * input,T * output,ScanOp scan_op,Int2Type<LENGTH>)147 __device__ __forceinline__ T ThreadScanInclusive(
148 T inclusive,
149 T *input, ///< [in] Input array
150 T *output, ///< [out] Output array (may be aliased to \p input)
151 ScanOp scan_op, ///< [in] Binary scan operator
152 Int2Type<LENGTH> /*length*/)
153 {
154 #pragma unroll
155 for (int i = 0; i < LENGTH; ++i)
156 {
157 inclusive = scan_op(inclusive, input[i]);
158 output[i] = inclusive;
159 }
160
161 return inclusive;
162 }
163
164
165 /**
166 * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned.
167 *
168 * \tparam LENGTH LengthT of \p input and \p output arrays
169 * \tparam T <b>[inferred]</b> The data type to be scanned.
170 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
171 */
172 template <
173 int LENGTH,
174 typename T,
175 typename ScanOp>
ThreadScanInclusive(T * input,T * output,ScanOp scan_op)176 __device__ __forceinline__ T ThreadScanInclusive(
177 T *input, ///< [in] Input array
178 T *output, ///< [out] Output array (may be aliased to \p input)
179 ScanOp scan_op) ///< [in] Binary scan operator
180 {
181 T inclusive = input[0];
182 output[0] = inclusive;
183
184 // Continue scan
185 return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
186 }
187
188
189 /**
190 * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned.
191 *
192 * \tparam LENGTH <b>[inferred]</b> LengthT of \p input and \p output arrays
193 * \tparam T <b>[inferred]</b> The data type to be scanned.
194 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
195 */
196 template <
197 int LENGTH,
198 typename T,
199 typename ScanOp>
ThreadScanInclusive(T (& input)[LENGTH],T (& output)[LENGTH],ScanOp scan_op)200 __device__ __forceinline__ T ThreadScanInclusive(
201 T (&input)[LENGTH], ///< [in] Input array
202 T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input)
203 ScanOp scan_op) ///< [in] Binary scan operator
204 {
205 return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
206 }
207
208
209 /**
210 * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned.
211 *
212 * \tparam LENGTH LengthT of \p input and \p output arrays
213 * \tparam T <b>[inferred]</b> The data type to be scanned.
214 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
215 */
216 template <
217 int LENGTH,
218 typename T,
219 typename ScanOp>
ThreadScanInclusive(T * input,T * output,ScanOp scan_op,T prefix,bool apply_prefix=true)220 __device__ __forceinline__ T ThreadScanInclusive(
221 T *input, ///< [in] Input array
222 T *output, ///< [out] Output array (may be aliased to \p input)
223 ScanOp scan_op, ///< [in] Binary scan operator
224 T prefix, ///< [in] Prefix to seed scan with
225 bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)
226 {
227 T inclusive = input[0];
228 if (apply_prefix)
229 {
230 inclusive = scan_op(prefix, inclusive);
231 }
232 output[0] = inclusive;
233
234 // Continue scan
235 return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
236 }
237
238
239 /**
240 * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned.
241 *
242 * \tparam LENGTH <b>[inferred]</b> LengthT of \p input and \p output arrays
243 * \tparam T <b>[inferred]</b> The data type to be scanned.
244 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
245 */
246 template <
247 int LENGTH,
248 typename T,
249 typename ScanOp>
ThreadScanInclusive(T (& input)[LENGTH],T (& output)[LENGTH],ScanOp scan_op,T prefix,bool apply_prefix=true)250 __device__ __forceinline__ T ThreadScanInclusive(
251 T (&input)[LENGTH], ///< [in] Input array
252 T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input)
253 ScanOp scan_op, ///< [in] Binary scan operator
254 T prefix, ///< [in] Prefix to seed scan with
255 bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)
256 {
257 return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
258 }
259
260
261 //@} end member group
262
263 /** @} */ // end group UtilModule
264
265
266 } // internal namespace
267 } // CUB namespace
268 CUB_NS_POSTFIX // Optional outer namespace(s)
269