1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *     * Redistributions of source code must retain the above copyright
9  *       notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above copyright
11  *       notice, this list of conditions and the following disclaimer in the
12  *       documentation and/or other materials provided with the distribution.
13  *     * Neither the name of the NVIDIA CORPORATION nor the
14  *       names of its contributors may be used to endorse or promote products
15  *       derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 /**
31  * \file
32  * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.
33  */
34 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
40 #include "dispatch/dispatch_select_if.cuh"
41 #include "../util_namespace.cuh"
42 
43 /// Optional outer namespace(s)
44 CUB_NS_PREFIX
45 
46 /// CUB namespace
47 namespace cub {
48 
49 
50 /**
51  * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png)
52  * \ingroup SingleModule
53  *
54  * \par Overview
55  * These operations apply a selection criterion to selectively copy
56  * items from a specified input sequence to a compact output sequence.
57  *
58  * \par Usage Considerations
59  * \cdp_class{DeviceSelect}
60  *
61  * \par Performance
62  * \linear_performance{select-flagged, select-if, and select-unique}
63  *
64  * \par
65  * The following chart illustrates DeviceSelect::If
66  * performance across different CUDA architectures for \p int32 items,
67  * where 50% of the items are randomly selected.
68  *
69  * \image html select_if_int32_50_percent.png
70  *
71  * \par
72  * The following chart illustrates DeviceSelect::Unique
73  * performance across different CUDA architectures for \p int32 items
74  * where segments have lengths uniformly sampled from [1,1000].
75  *
76  * \image html select_unique_int32_len_500.png
77  *
78  * \par
79  * \plots_below
80  *
81  */
82 struct DeviceSelect
83 {
84     /**
85      * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png)
86      *
87      * \par
88      * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
89      * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
90      * - \devicestorage
91      *
92      * \par Snippet
93      * The code snippet below illustrates the compaction of items selected from an \p int device vector.
94      * \par
95      * \code
96      * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
97      *
98      * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
99      * int  num_items;              // e.g., 8
100      * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
101      * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
102      * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
103      * int  *d_num_selected_out;    // e.g., [ ]
104      * ...
105      *
106      * // Determine temporary device storage requirements
107      * void     *d_temp_storage = NULL;
108      * size_t   temp_storage_bytes = 0;
109      * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
110      *
111      * // Allocate temporary storage
112      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
113      *
114      * // Run selection
115      * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
116      *
117      * // d_out                 <-- [1, 4, 6, 7]
118      * // d_num_selected_out    <-- [4]
119      *
120      * \endcode
121      *
122      * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
123      * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
124      * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
125      * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
126      */
127     template <
128         typename                    InputIteratorT,
129         typename                    FlagIterator,
130         typename                    OutputIteratorT,
131         typename                    NumSelectedIteratorT>
132     CUB_RUNTIME_FUNCTION __forceinline__
Flaggedcub::DeviceSelect133     static cudaError_t Flagged(
134         void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
135         size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
136         InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
137         FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
138         OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
139         NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
140         int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
141         cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
142         bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
143     {
144         typedef int                     OffsetT;         // Signed integer type for global offsets
145         typedef NullType                SelectOp;       // Selection op (not used)
146         typedef NullType                EqualityOp;     // Equality operator (not used)
147 
148         return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
149             d_temp_storage,
150             temp_storage_bytes,
151             d_in,
152             d_flags,
153             d_out,
154             d_num_selected_out,
155             SelectOp(),
156             EqualityOp(),
157             num_items,
158             stream,
159             debug_synchronous);
160     }
161 
162 
163     /**
164      * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png)
165      *
166      * \par
167      * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
168      * - \devicestorage
169      *
170      * \par Performance
171      * The following charts illustrate saturated select-if performance across different
172      * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
173      * selected with 50% probability.
174      *
175      * \image html select_if_int32_50_percent.png
176      * \image html select_if_int64_50_percent.png
177      *
178      * \par
179      * The following charts are similar, but 5% selection probability:
180      *
181      * \image html select_if_int32_5_percent.png
182      * \image html select_if_int64_5_percent.png
183      *
184      * \par Snippet
185      * The code snippet below illustrates the compaction of items selected from an \p int device vector.
186      * \par
187      * \code
188      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
189      *
190      * // Functor type for selecting values less than some criteria
191      * struct LessThan
192      * {
193      *     int compare;
194      *
195      *     CUB_RUNTIME_FUNCTION __forceinline__
196      *     LessThan(int compare) : compare(compare) {}
197      *
198      *     CUB_RUNTIME_FUNCTION __forceinline__
199      *     bool operator()(const int &a) const {
200      *         return (a < compare);
201      *     }
202      * };
203      *
204      * // Declare, allocate, and initialize device-accessible pointers for input and output
205      * int      num_items;              // e.g., 8
206      * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
207      * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
208      * int      *d_num_selected_out;    // e.g., [ ]
209      * LessThan select_op(7);
210      * ...
211      *
212      * // Determine temporary device storage requirements
213      * void     *d_temp_storage = NULL;
214      * size_t   temp_storage_bytes = 0;
215      * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
216      *
217      * // Allocate temporary storage
218      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
219      *
220      * // Run selection
221      * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
222      *
223      * // d_out                 <-- [0, 2, 3, 5, 2]
224      * // d_num_selected_out    <-- [5]
225      *
226      * \endcode
227      *
228      * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
229      * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
230      * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
231      * \tparam SelectOp             <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt>
232      */
233     template <
234         typename                    InputIteratorT,
235         typename                    OutputIteratorT,
236         typename                    NumSelectedIteratorT,
237         typename                    SelectOp>
238     CUB_RUNTIME_FUNCTION __forceinline__
Ifcub::DeviceSelect239     static cudaError_t If(
240         void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
241         size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
242         InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
243         OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
244         NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
245         int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
246         SelectOp                    select_op,                      ///< [in] Unary selection operator
247         cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
248         bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
249     {
250         typedef int                     OffsetT;         // Signed integer type for global offsets
251         typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
252         typedef NullType                EqualityOp;     // Equality operator (not used)
253 
254         return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
255             d_temp_storage,
256             temp_storage_bytes,
257             d_in,
258             NULL,
259             d_out,
260             d_num_selected_out,
261             select_op,
262             EqualityOp(),
263             num_items,
264             stream,
265             debug_synchronous);
266     }
267 
268 
269     /**
270      * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png)
271      *
272      * \par
273      * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
274      * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
275      * - \devicestorage
276      *
277      * \par Performance
278      * The following charts illustrate saturated select-unique performance across different
279      * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
280      * lengths uniformly sampled from [1,1000].
281      *
282      * \image html select_unique_int32_len_500.png
283      * \image html select_unique_int64_len_500.png
284      *
285      * \par
286      * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
287      *
288      * \image html select_unique_int32_len_5.png
289      * \image html select_unique_int64_len_5.png
290      *
291      * \par Snippet
292      * The code snippet below illustrates the compaction of items selected from an \p int device vector.
293      * \par
294      * \code
295      * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
296      *
297      * // Declare, allocate, and initialize device-accessible pointers for input and output
298      * int  num_items;              // e.g., 8
299      * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
300      * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
301      * int  *d_num_selected_out;    // e.g., [ ]
302      * ...
303      *
304      * // Determine temporary device storage requirements
305      * void     *d_temp_storage = NULL;
306      * size_t   temp_storage_bytes = 0;
307      * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
308      *
309      * // Allocate temporary storage
310      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
311      *
312      * // Run selection
313      * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
314      *
315      * // d_out                 <-- [0, 2, 9, 5, 8]
316      * // d_num_selected_out    <-- [5]
317      *
318      * \endcode
319      *
320      * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
321      * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
322      * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
323      */
324     template <
325         typename                    InputIteratorT,
326         typename                    OutputIteratorT,
327         typename                    NumSelectedIteratorT>
328     CUB_RUNTIME_FUNCTION __forceinline__
Uniquecub::DeviceSelect329     static cudaError_t Unique(
330         void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
331         size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
332         InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
333         OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
334         NumSelectedIteratorT         d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
335         int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
336         cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
337         bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
338     {
339         typedef int                     OffsetT;         // Signed integer type for global offsets
340         typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
341         typedef NullType                SelectOp;       // Selection op (not used)
342         typedef Equality                EqualityOp;     // Default == operator
343 
344         return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
345             d_temp_storage,
346             temp_storage_bytes,
347             d_in,
348             NULL,
349             d_out,
350             d_num_selected_out,
351             SelectOp(),
352             EqualityOp(),
353             num_items,
354             stream,
355             debug_synchronous);
356     }
357 
358 };
359 
360 /**
361  * \example example_device_select_flagged.cu
362  * \example example_device_select_if.cu
363  * \example example_device_select_unique.cu
364  */
365 
366 }               // CUB namespace
367 CUB_NS_POSTFIX  // Optional outer namespace(s)
368 
369 
370