/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
 */

#pragma once

#include <stdio.h>
#include <iterator>

#include "dispatch/dispatch_select_if.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. ![](partition_logo.png)
 * \ingroup SingleModule
 *
 * \par Overview
 * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
 * a specified input sequence.
 *
 * \par Usage Considerations
 * \cdp_class{DevicePartition}
 *
 * \par Performance
 * \linear_performance{partition}
 *
 * \par
 * The following chart illustrates DevicePartition::If
 * performance across different CUDA architectures for \p int32 items,
 * where 50% of the items are randomly selected for the first partition.
 * \plots_below
 *
 * \image html partition_if_int32_50_percent.png
 *
 */
struct DevicePartition
{
    /**
     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png)
     *
     * \par
     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering; however, copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_partition.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
     * // d_num_selected_out    <-- [4]
     *
     * \endcode
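     *
     * \par
     * A minimal host-side follow-up (a sketch; \p num_selected and \p num_unselected below
     * are illustrative local variables, not part of the API) that recovers the partition
     * boundary from \p d_num_selected_out:
     * \code
     * // Copy the selected-item count back to the host
     * int num_selected;
     * cudaMemcpy(&num_selected, d_num_selected_out, sizeof(int), cudaMemcpyDeviceToHost);
     *
     * // d_out[0 .. num_selected - 1] holds the selected items in their original order;
     * // d_out[num_selected .. num_items - 1] holds the unselected items in reverse order.
     * int num_unselected = num_items - num_selected;
     * \endcode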
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename                    InputIteratorT,
        typename                    FlagIterator,
        typename                    OutputIteratorT,
        typename                    NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int                         num_items,                      ///< [in] Total number of items to select from
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;        // Signed integer type for global offsets
        typedef NullType                SelectOp;       // Selection op (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering; however, copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.
     * - \devicestorage
     *
     * \par Performance
     * The following charts illustrate saturated partition-if performance across different
     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
     * selected for the first partition with 50% probability.
     *
     * \image html partition_if_int32_50_percent.png
     * \image html partition_if_int64_50_percent.png
     *
     * \par
     * The following charts are similar, but with a 5% selection probability for the first partition:
     *
     * \image html partition_if_int32_5_percent.png
     * \image html partition_if_int64_5_percent.png
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
     *
     * // Functor type for selecting values less than some criterion
     * struct LessThan
     * {
     *     int compare;
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     LessThan(int compare) : compare(compare) {}
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     bool operator()(const int &a) const {
     *         return (a < compare);
     *     }
     * };
     *
     * // Declare, allocate, and initialize device-accessible pointers for input and output
     * int      num_items;              // e.g., 8
     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int      *d_num_selected_out;    // e.g., [ ]
     * LessThan select_op(7);
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
     * // d_num_selected_out    <-- [5]
     *
     * \endcode
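     *
     * \par
     * Both calls return a \p cudaError_t that production code will typically check
     * (a sketch; the error-handling style shown is illustrative, not mandated by CUB):
     * \code
     * cudaError_t error = cub::DevicePartition::If(d_temp_storage, temp_storage_bytes,
     *     d_in, d_out, d_num_selected_out, num_items, select_op);
     * if (error != cudaSuccess)
     *     printf("DevicePartition::If failed: %s\n", cudaGetErrorString(error));
     * \endcode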
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     * \tparam SelectOp             <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
     */
    template <
        typename                    InputIteratorT,
        typename                    OutputIteratorT,
        typename                    NumSelectedIteratorT,
        typename                    SelectOp>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int                         num_items,                      ///< [in] Total number of items to select from
        SelectOp                    select_op,                      ///< [in] Unary selection operator
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;        // Signed integer type for global offsets
        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }

};

/**
 * \example example_device_partition_flagged.cu
 * \example example_device_partition_if.cu
 */

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
