1 2 /****************************************************************************** 3 * Copyright (c) 2011, Duane Merrill. All rights reserved. 4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * * Neither the name of the NVIDIA CORPORATION nor the 14 * names of its contributors may be used to endorse or promote products 15 * derived from this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 * 28 ******************************************************************************/ 29 30 /** 31 * \file 32 * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. 33 */ 34 35 #pragma once 36 37 #include <stdio.h> 38 #include <iterator> 39 40 #include "dispatch/dispatch_select_if.cuh" 41 #include "../util_namespace.cuh" 42 43 /// Optional outer namespace(s) 44 CUB_NS_PREFIX 45 46 /// CUB namespace 47 namespace cub { 48 49 50 /** 51 * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png) 52 * \ingroup SingleModule 53 * 54 * \par Overview 55 * These operations apply a selection criterion to selectively copy 56 * items from a specified input sequence to a compact output sequence. 57 * 58 * \par Usage Considerations 59 * \cdp_class{DeviceSelect} 60 * 61 * \par Performance 62 * \linear_performance{select-flagged, select-if, and select-unique} 63 * 64 * \par 65 * The following chart illustrates DeviceSelect::If 66 * performance across different CUDA architectures for \p int32 items, 67 * where 50% of the items are randomly selected. 68 * 69 * \image html select_if_int32_50_percent.png 70 * 71 * \par 72 * The following chart illustrates DeviceSelect::Unique 73 * performance across different CUDA architectures for \p int32 items 74 * where segments have lengths uniformly sampled from [1,1000]. 75 * 76 * \image html select_unique_int32_len_500.png 77 * 78 * \par 79 * \plots_below 80 * 81 */ 82 struct DeviceSelect 83 { 84 /** 85 * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) 86 * 87 * \par 88 * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). 89 * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. 90 * - \devicestorage 91 * 92 * \par Snippet 93 * The code snippet below illustrates the compaction of items selected from an \p int device vector. 94 * \par 95 * \code 96 * #include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh> 97 * 98 * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output 99 * int num_items; // e.g., 8 100 * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] 101 * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] 102 * int *d_out; // e.g., [ , , , , , , , ] 103 * int *d_num_selected_out; // e.g., [ ] 104 * ... 105 * 106 * // Determine temporary device storage requirements 107 * void *d_temp_storage = NULL; 108 * size_t temp_storage_bytes = 0; 109 * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); 110 * 111 * // Allocate temporary storage 112 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 113 * 114 * // Run selection 115 * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); 116 * 117 * // d_out <-- [1, 4, 6, 7] 118 * // d_num_selected_out <-- [4] 119 * 120 * \endcode 121 * 122 * \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading input items \iterator 123 * \tparam FlagIterator <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator 124 * \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator 125 * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator 126 */ 127 template < 128 typename InputIteratorT, 129 typename FlagIterator, 130 typename OutputIteratorT, 131 typename NumSelectedIteratorT> 132 CUB_RUNTIME_FUNCTION __forceinline__ Flaggedcub::DeviceSelect133 static cudaError_t Flagged( 134 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 135 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 136 InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items 137 FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags 138 OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items 139 NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) 140 int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) 141 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 142 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 143 { 144 typedef int OffsetT; // Signed integer type for global offsets 145 typedef NullType SelectOp; // Selection op (not used) 146 typedef NullType EqualityOp; // Equality operator (not used) 147 148 return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch( 149 d_temp_storage, 150 temp_storage_bytes, 151 d_in, 152 d_flags, 153 d_out, 154 d_num_selected_out, 155 SelectOp(), 156 EqualityOp(), 157 num_items, 158 stream, 159 debug_synchronous); 160 } 161 162 163 /** 164 * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png) 165 * 166 * \par 167 * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. 168 * - \devicestorage 169 * 170 * \par Performance 171 * The following charts illustrate saturated select-if performance across different 172 * CUDA architectures for \p int32 and \p int64 items, respectively. Items are 173 * selected with 50% probability. 174 * 175 * \image html select_if_int32_50_percent.png 176 * \image html select_if_int64_50_percent.png 177 * 178 * \par 179 * The following charts are similar, but 5% selection probability: 180 * 181 * \image html select_if_int32_5_percent.png 182 * \image html select_if_int64_5_percent.png 183 * 184 * \par Snippet 185 * The code snippet below illustrates the compaction of items selected from an \p int device vector. 186 * \par 187 * \code 188 * #include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh> 189 * 190 * // Functor type for selecting values less than some criteria 191 * struct LessThan 192 * { 193 * int compare; 194 * 195 * CUB_RUNTIME_FUNCTION __forceinline__ 196 * LessThan(int compare) : compare(compare) {} 197 * 198 * CUB_RUNTIME_FUNCTION __forceinline__ 199 * bool operator()(const int &a) const { 200 * return (a < compare); 201 * } 202 * }; 203 * 204 * // Declare, allocate, and initialize device-accessible pointers for input and output 205 * int num_items; // e.g., 8 206 * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] 207 * int *d_out; // e.g., [ , , , , , , , ] 208 * int *d_num_selected_out; // e.g., [ ] 209 * LessThan select_op(7); 210 * ... 211 * 212 * // Determine temporary device storage requirements 213 * void *d_temp_storage = NULL; 214 * size_t temp_storage_bytes = 0; 215 * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); 216 * 217 * // Allocate temporary storage 218 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 219 * 220 * // Run selection 221 * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); 222 * 223 * // d_out <-- [0, 2, 3, 5, 2] 224 * // d_num_selected_out <-- [5] 225 * 226 * \endcode 227 * 228 * \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading input items \iterator 229 * \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator 230 * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator 231 * \tparam SelectOp <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt> 232 */ 233 template < 234 typename InputIteratorT, 235 typename OutputIteratorT, 236 typename NumSelectedIteratorT, 237 typename SelectOp> 238 CUB_RUNTIME_FUNCTION __forceinline__ Ifcub::DeviceSelect239 static cudaError_t If( 240 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 241 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 242 InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items 243 OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items 244 NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) 245 int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) 246 SelectOp select_op, ///< [in] Unary selection operator 247 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 248 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 249 { 250 typedef int OffsetT; // Signed integer type for global offsets 251 typedef NullType* FlagIterator; // FlagT iterator type (not used) 252 typedef NullType EqualityOp; // Equality operator (not used) 253 254 return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch( 255 d_temp_storage, 256 temp_storage_bytes, 257 d_in, 258 NULL, 259 d_out, 260 d_num_selected_out, 261 select_op, 262 EqualityOp(), 263 num_items, 264 stream, 265 debug_synchronous); 266 } 267 268 269 /** 270 * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) 271 * 272 * \par 273 * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent 274 * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. 275 * - \devicestorage 276 * 277 * \par Performance 278 * The following charts illustrate saturated select-unique performance across different 279 * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have 280 * lengths uniformly sampled from [1,1000]. 281 * 282 * \image html select_unique_int32_len_500.png 283 * \image html select_unique_int64_len_500.png 284 * 285 * \par 286 * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: 287 * 288 * \image html select_unique_int32_len_5.png 289 * \image html select_unique_int64_len_5.png 290 * 291 * \par Snippet 292 * The code snippet below illustrates the compaction of items selected from an \p int device vector. 293 * \par 294 * \code 295 * #include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh> 296 * 297 * // Declare, allocate, and initialize device-accessible pointers for input and output 298 * int num_items; // e.g., 8 299 * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] 300 * int *d_out; // e.g., [ , , , , , , , ] 301 * int *d_num_selected_out; // e.g., [ ] 302 * ... 303 * 304 * // Determine temporary device storage requirements 305 * void *d_temp_storage = NULL; 306 * size_t temp_storage_bytes = 0; 307 * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); 308 * 309 * // Allocate temporary storage 310 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 311 * 312 * // Run selection 313 * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); 314 * 315 * // d_out <-- [0, 2, 9, 5, 8] 316 * // d_num_selected_out <-- [5] 317 * 318 * \endcode 319 * 320 * \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading input items \iterator 321 * \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator 322 * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator 323 */ 324 template < 325 typename InputIteratorT, 326 typename OutputIteratorT, 327 typename NumSelectedIteratorT> 328 CUB_RUNTIME_FUNCTION __forceinline__ Uniquecub::DeviceSelect329 static cudaError_t Unique( 330 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 331 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 332 InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items 333 OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items 334 NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) 335 int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) 336 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 337 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 338 { 339 typedef int OffsetT; // Signed integer type for global offsets 340 typedef NullType* FlagIterator; // FlagT iterator type (not used) 341 typedef NullType SelectOp; // Selection op (not used) 342 typedef Equality EqualityOp; // Default == operator 343 344 return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch( 345 d_temp_storage, 346 temp_storage_bytes, 347 d_in, 348 NULL, 349 d_out, 350 d_num_selected_out, 351 SelectOp(), 352 EqualityOp(), 353 num_items, 354 stream, 355 debug_synchronous); 356 } 357 358 }; 359 360 /** 361 * \example example_device_select_flagged.cu 362 * \example example_device_select_if.cu 363 * \example example_device_select_unique.cu 364 */ 365 366 } // CUB namespace 367 CUB_NS_POSTFIX // Optional outer namespace(s) 368 369 370