/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
 */

#pragma once

#include <stdio.h>
#include <iterator>

#include "dispatch/dispatch_select_if.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. ![](partition_logo.png)
 * \ingroup SingleModule
 *
 * \par Overview
 * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
 * a specified input sequence.
 *
 * \par Usage Considerations
 * \cdp_class{DevicePartition}
 *
 * \par Performance
 * \linear_performance{partition}
 *
 * \par
 * The following chart illustrates DevicePartition::If
 * performance across different CUDA architectures for \p int32 items,
 * where 50% of the items are randomly selected for the first partition.
 * \plots_below
 *
 * \image html partition_if_int32_50_percent.png
 *
 */
struct DevicePartition
{
    /**
     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png)
     *
     * \par
     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering, however copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
     * // d_num_selected_out    <-- [4]
     *
     * \endcode
     *
     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam FlagIterator             <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
     * \tparam OutputIteratorT          <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
     * \tparam NumSelectedIteratorT     <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename                    InputIteratorT,
        typename                    FlagIterator,
        typename                    OutputIteratorT,
        typename                    NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void*                       d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,           ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                          ///< [in] Pointer to the input sequence of data items
        FlagIterator                d_flags,                       ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT             d_out,                         ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT        d_num_selected_out,            ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int                         num_items,                     ///< [in] Total number of items to select from
        cudaStream_t                stream             = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType                SelectOp;        // Selection op (not used)
        typedef NullType                EqualityOp;      // Equality operator (not used)

        // KEEP_REJECTS=true: rejected items are written (reversed) to the rear of d_out
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering, however copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.
     * - \devicestorage
     *
     * \par Performance
     * The following charts illustrate saturated partition-if performance across different
     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
     * selected for the first partition with 50% probability.
     *
     * \image html partition_if_int32_50_percent.png
     * \image html partition_if_int64_50_percent.png
     *
     * \par
     * The following charts are similar, but 5% selection probability for the first partition:
     *
     * \image html partition_if_int32_5_percent.png
     * \image html partition_if_int64_5_percent.png
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
     *
     * // Functor type for selecting values less than some criteria
     * struct LessThan
     * {
     *     int compare;
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     LessThan(int compare) : compare(compare) {}
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     bool operator()(const int &a) const {
     *         return (a < compare);
     *     }
     * };
     *
     * // Declare, allocate, and initialize device-accessible pointers for input and output
     * int      num_items;              // e.g., 8
     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int      *d_num_selected_out;    // e.g., [ ]
     * LessThan select_op(7);
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
     * // d_num_selected_out    <-- [5]
     *
     * \endcode
     *
     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam OutputIteratorT          <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
     * \tparam NumSelectedIteratorT     <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     * \tparam SelectOp                 <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
     */
    template <
        typename                    InputIteratorT,
        typename                    OutputIteratorT,
        typename                    NumSelectedIteratorT,
        typename                    SelectOp>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void*                       d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,           ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                          ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                         ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT        d_num_selected_out,            ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int                         num_items,                     ///< [in] Total number of items to select from
        SelectOp                    select_op,                     ///< [in] Unary selection operator
        cudaStream_t                stream             = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType*               FlagIterator;    // FlagT iterator type (not used)
        typedef NullType                EqualityOp;      // Equality operator (not used)

        // KEEP_REJECTS=true: rejected items are written (reversed) to the rear of d_out
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }

};

/**
 * \example example_device_partition_flagged.cu
 * \example example_device_partition_if.cu
 */

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)