1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *     * Redistributions of source code must retain the above copyright
9  *       notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above copyright
11  *       notice, this list of conditions and the following disclaimer in the
12  *       documentation and/or other materials provided with the distribution.
13  *     * Neither the name of the NVIDIA CORPORATION nor the
14  *       names of its contributors may be used to endorse or promote products
15  *       derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 /**
31  * \file
32  * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
33  */
34 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 #include <limits>
40 
41 #include "dispatch/dispatch_histogram.cuh"
42 #include "../util_namespace.cuh"
43 
44 /// Optional outer namespace(s)
45 CUB_NS_PREFIX
46 
47 /// CUB namespace
48 namespace cub {
49 
50 
51 /**
52  * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png)
53  * \ingroup SingleModule
54  *
55  * \par Overview
56  * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
57  * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
58  *
59  * \par Usage Considerations
60  * \cdp_class{DeviceHistogram}
61  *
62  */
63 struct DeviceHistogram
64 {
65     /******************************************************************//**
66      * \name Evenly-segmented bin ranges
67      *********************************************************************/
68     //@{
69 
70     /**
71      * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
72      *
73      * \par
74      * - The number of histogram bins is (\p num_levels - 1)
75      * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
76      * - \devicestorage
77      *
78      * \par Snippet
79      * The code snippet below illustrates the computation of a six-bin histogram
80      * from a sequence of float samples
81      *
82      * \par
83      * \code
84      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
85      *
86      * // Declare, allocate, and initialize device-accessible pointers for input samples and
87      * // output histogram
88      * int      num_samples;    // e.g., 10
89      * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
90      * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
91      * int      num_levels;     // e.g., 7       (seven level boundaries for six bins)
92      * float    lower_level;    // e.g., 0.0     (lower sample value boundary of lowest bin)
93      * float    upper_level;    // e.g., 12.0    (upper sample value boundary of upper bin)
94      * ...
95      *
96      * // Determine temporary device storage requirements
97      * void*    d_temp_storage = NULL;
98      * size_t   temp_storage_bytes = 0;
99      * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
100      *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
101      *
102      * // Allocate temporary storage
103      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
104      *
105      * // Compute histograms
106      * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
107      *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
108      *
109      * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
110      *
111      * \endcode
112      *
113      * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
114      * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
115      * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
116      * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
117      */
118     template <
119         typename            SampleIteratorT,
120         typename            CounterT,
121         typename            LevelT,
122         typename            OffsetT>
123     CUB_RUNTIME_FUNCTION
HistogramEvencub::DeviceHistogram124     static cudaError_t HistogramEven(
125         void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
126         size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
127         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
128         CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
129         int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
130         LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
131         LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
132         OffsetT             num_samples,                                ///< [in] The number of input samples (i.e., the length of \p d_samples)
133         cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
134         bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
135     {
136         /// The sample value type of the input iterator
137         typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
138 
139         CounterT*           d_histogram1[1]     = {d_histogram};
140         int                 num_levels1[1]      = {num_levels};
141         LevelT              lower_level1[1]     = {lower_level};
142         LevelT              upper_level1[1]     = {upper_level};
143 
144         return MultiHistogramEven<1, 1>(
145             d_temp_storage,
146             temp_storage_bytes,
147             d_samples,
148             d_histogram1,
149             num_levels1,
150             lower_level1,
151             upper_level1,
152             num_samples,
153             1,
154             sizeof(SampleT) * num_samples,
155             stream,
156             debug_synchronous);
157     }
158 
159 
160     /**
161      * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
162      *
163      * \par
164      * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
165      *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
166      * - The row stride must be a whole multiple of the sample data type
167      *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
168      * - The number of histogram bins is (\p num_levels - 1)
169      * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
170      * - \devicestorage
171      *
172      * \par Snippet
173      * The code snippet below illustrates the computation of a six-bin histogram
174      * from a 2x5 region of interest within a flattened 2x7 array of float samples.
175      *
176      * \par
177      * \code
178      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
179      *
180      * // Declare, allocate, and initialize device-accessible pointers for input samples and
181      * // output histogram
182      * int      num_row_samples;    // e.g., 5
183      * int      num_rows;           // e.g., 2;
184      * size_t   row_stride_bytes;   // e.g., 7 * sizeof(float)
185      * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
186      *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
187      * int*     d_histogram;        // e.g., [ -, -, -, -, -, -, -, -]
188      * int      num_levels;         // e.g., 7       (seven level boundaries for six bins)
189      * float    lower_level;        // e.g., 0.0     (lower sample value boundary of lowest bin)
190      * float    upper_level;        // e.g., 12.0    (upper sample value boundary of upper bin)
191      * ...
192      *
193      * // Determine temporary device storage requirements
194      * void*    d_temp_storage  = NULL;
195      * size_t   temp_storage_bytes = 0;
196      * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
197      *     d_samples, d_histogram, num_levels, lower_level, upper_level,
198      *     num_row_samples, num_rows, row_stride_bytes);
199      *
200      * // Allocate temporary storage
201      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
202      *
203      * // Compute histograms
204      * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram,
205      *     d_samples, d_histogram, num_levels, lower_level, upper_level,
206      *     num_row_samples, num_rows, row_stride_bytes);
207      *
208      * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
209      *
210      * \endcode
211      *
212      * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
213      * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
214      * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
215      * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
216      */
217     template <
218         typename            SampleIteratorT,
219         typename            CounterT,
220         typename            LevelT,
221         typename            OffsetT>
222     CUB_RUNTIME_FUNCTION
HistogramEvencub::DeviceHistogram223     static cudaError_t HistogramEven(
224         void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
225         size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
226         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
227         CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
228         int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
229         LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
230         LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
231         OffsetT             num_row_samples,                            ///< [in] The number of data samples per row in the region of interest
232         OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
233         size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
234         cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
235         bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
236     {
237         CounterT*           d_histogram1[1]     = {d_histogram};
238         int                 num_levels1[1]      = {num_levels};
239         LevelT              lower_level1[1]     = {lower_level};
240         LevelT              upper_level1[1]     = {upper_level};
241 
242         return MultiHistogramEven<1, 1>(
243             d_temp_storage,
244             temp_storage_bytes,
245             d_samples,
246             d_histogram1,
247             num_levels1,
248             lower_level1,
249             upper_level1,
250             num_row_samples,
251             num_rows,
252             row_stride_bytes,
253             stream,
254             debug_synchronous);
255     }
256 
257     /**
258      * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
259      *
260      * \par
261      * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
262      *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
263      * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
264      *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
265      *   pixel samples).
266      * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
267      * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
268      *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
269      * - \devicestorage
270      *
271      * \par Snippet
272      * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms
273      * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
274      *
275      * \par
276      * \code
277      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
278      *
279      * // Declare, allocate, and initialize device-accessible pointers for input samples
280      * // and output histograms
281      * int              num_pixels;         // e.g., 5
282      * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
283      *                                      //        (0, 6, 7, 5), (3, 0, 2, 6)]
284      * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
285      *                                      //       each allocated with 256 integer counters
286      * int              num_levels[3];      // e.g., {257, 257, 257};
287      * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
288      * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
289      * ...
290      *
291      * // Determine temporary device storage requirements
292      * void*    d_temp_storage = NULL;
293      * size_t   temp_storage_bytes = 0;
294      * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
295      *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
296      *
297      * // Allocate temporary storage
298      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
299      *
300      * // Compute histograms
301      * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
302      *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
303      *
304      * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
305      * //                     [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
306      * //                     [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
307      *
308      * \endcode
309      *
310      * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
311      * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
312      * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
313      * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
314      * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
315      * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
316      */
317     template <
318         int                 NUM_CHANNELS,
319         int                 NUM_ACTIVE_CHANNELS,
320         typename            SampleIteratorT,
321         typename            CounterT,
322         typename            LevelT,
323         typename            OffsetT>
324     CUB_RUNTIME_FUNCTION
MultiHistogramEvencub::DeviceHistogram325     static cudaError_t MultiHistogramEven(
326         void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
327         size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
328         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
329         CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
330         int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
331         LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
332         LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
333         OffsetT             num_pixels,                                 ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
334         cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
335         bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
336     {
337         /// The sample value type of the input iterator
338         typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
339 
340         return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
341             d_temp_storage,
342             temp_storage_bytes,
343             d_samples,
344             d_histogram,
345             num_levels,
346             lower_level,
347             upper_level,
348             num_pixels,
349             1,
350             sizeof(SampleT) * NUM_CHANNELS * num_pixels,
351             stream,
352             debug_synchronous);
353     }
354 
355 
356     /**
357      * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
358      *
359      * \par
360      * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
361      *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
362      * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
363      *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
364      *   pixel samples).
365      * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
366      *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
367      * - The row stride must be a whole multiple of the sample data type
368      *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
369      * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
370      * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
371      *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
372      * - \devicestorage
373      *
374      * \par Snippet
375      * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms from a 2x3 region of
376      * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
377      *
378      * \par
379      * \code
380      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
381      *
382      * // Declare, allocate, and initialize device-accessible pointers for input samples
383      * // and output histograms
384      * int              num_row_pixels;     // e.g., 3
385      * int              num_rows;           // e.g., 2
386      * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
387      * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -),
388      *                                      //        (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)]
389      * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
390      *                                      //       each allocated with 256 integer counters
391      * int              num_levels[3];      // e.g., {257, 257, 257};
392      * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
393      * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
394      * ...
395      *
396      * // Determine temporary device storage requirements
397      * void*    d_temp_storage = NULL;
398      * size_t   temp_storage_bytes = 0;
399      * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
400      *     d_samples, d_histogram, num_levels, lower_level, upper_level,
401      *     num_row_pixels, num_rows, row_stride_bytes);
402      *
403      * // Allocate temporary storage
404      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
405      *
406      * // Compute histograms
407      * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
408      *     d_samples, d_histogram, num_levels, lower_level, upper_level,
409      *     num_row_pixels, num_rows, row_stride_bytes);
410      *
411      * // d_histogram   <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
412      * //                     [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
413      * //                     [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
414      *
415      * \endcode
416      *
417      * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
418      * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
419      * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
420      * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
421      * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
422      * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
423      */
424     template <
425         int                 NUM_CHANNELS,
426         int                 NUM_ACTIVE_CHANNELS,
427         typename            SampleIteratorT,
428         typename            CounterT,
429         typename            LevelT,
430         typename            OffsetT>
431     CUB_RUNTIME_FUNCTION
MultiHistogramEvencub::DeviceHistogram432     static cudaError_t MultiHistogramEven(
433         void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
434         size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
435         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
436         CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
437         int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
438         LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
439         LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
440         OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
441         OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
442         size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
443         cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
444         bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
445     {
446         /// The sample value type of the input iterator
447         typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
448         Int2Type<sizeof(SampleT) == 1> is_byte_sample;
449 
450         if ((sizeof(OffsetT) > sizeof(int)) &&
451             ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
452         {
453             // Down-convert OffsetT data type
454 
455 
456             return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
457                 d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
458                 (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
459                 stream, debug_synchronous, is_byte_sample);
460         }
461 
462         return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
463             d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
464             num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
465             stream, debug_synchronous, is_byte_sample);
466     }
467 
468 
469     //@}  end member group
470     /******************************************************************//**
471      * \name Custom bin ranges
472      *********************************************************************/
473     //@{
474 
475     /**
476      * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
477      *
478      * \par
479      * - The number of histogram bins is (\p num_levels - 1)
480      * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
481      * - \devicestorage
482      *
483      * \par Snippet
484      * The code snippet below illustrates the computation of an six-bin histogram
485      * from a sequence of float samples
486      *
487      * \par
488      * \code
489      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
490      *
491      * // Declare, allocate, and initialize device-accessible pointers for input samples and
492      * // output histogram
493      * int      num_samples;    // e.g., 10
494      * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
495      * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
496      * int      num_levels      // e.g., 7 (seven level boundaries for six bins)
497      * float*   d_levels;       // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
498      * ...
499      *
500      * // Determine temporary device storage requirements
501      * void*    d_temp_storage = NULL;
502      * size_t   temp_storage_bytes = 0;
503      * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
504      *     d_samples, d_histogram, num_levels, d_levels, num_samples);
505      *
506      * // Allocate temporary storage
507      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
508      *
509      * // Compute histograms
510      * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
511      *     d_samples, d_histogram, num_levels, d_levels, num_samples);
512      *
513      * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
514      *
515      * \endcode
516      *
517      * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
518      * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
519      * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
520      * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
521      */
522     template <
523         typename            SampleIteratorT,
524         typename            CounterT,
525         typename            LevelT,
526         typename            OffsetT>
527     CUB_RUNTIME_FUNCTION
HistogramRangecub::DeviceHistogram528     static cudaError_t HistogramRange(
529         void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
530         size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
531         SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
532         CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
533         int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
534         LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
535         OffsetT             num_samples,                            ///< [in] The number of data samples per row in the region of interest
536         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
537         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
538     {
539         /// The sample value type of the input iterator
540         typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
541 
542         CounterT*           d_histogram1[1] = {d_histogram};
543         int                 num_levels1[1]  = {num_levels};
544         LevelT*             d_levels1[1]    = {d_levels};
545 
546         return MultiHistogramRange<1, 1>(
547             d_temp_storage,
548             temp_storage_bytes,
549             d_samples,
550             d_histogram1,
551             num_levels1,
552             d_levels1,
553             num_samples,
554             1,
555             sizeof(SampleT) * num_samples,
556             stream,
557             debug_synchronous);
558     }
559 
560 
561     /**
562      * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
563      *
564      * \par
565      * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
566      *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
567      * - The row stride must be a whole multiple of the sample data type
568      *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
569      * - The number of histogram bins is (\p num_levels - 1)
570      * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
571      * - \devicestorage
572      *
573      * \par Snippet
574      * The code snippet below illustrates the computation of a six-bin histogram
575      * from a 2x5 region of interest within a flattened 2x7 array of float samples.
576      *
577      * \par
578      * \code
579      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
580      *
581      * // Declare, allocate, and initialize device-accessible pointers for input samples and
582      * // output histogram
583      * int      num_row_samples;    // e.g., 5
584      * int      num_rows;           // e.g., 2;
585      * int      row_stride_bytes;   // e.g., 7 * sizeof(float)
586      * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
587      *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
588      * int*     d_histogram;        // e.g., [ , , , , , , , ]
589      * int      num_levels          // e.g., 7 (seven level boundaries for six bins)
590      * float    *d_levels;          // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
591      * ...
592      *
593      * // Determine temporary device storage requirements
594      * void*    d_temp_storage = NULL;
595      * size_t   temp_storage_bytes = 0;
596      * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
597      *     d_samples, d_histogram, num_levels, d_levels,
598      *     num_row_samples, num_rows, row_stride_bytes);
599      *
600      * // Allocate temporary storage
601      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
602      *
603      * // Compute histograms
604      * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
605      *     d_samples, d_histogram, num_levels, d_levels,
606      *     num_row_samples, num_rows, row_stride_bytes);
607      *
608      * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
609      *
610      * \endcode
611      *
612      * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
613      * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
614      * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
615      * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
616      */
617     template <
618         typename            SampleIteratorT,
619         typename            CounterT,
620         typename            LevelT,
621         typename            OffsetT>
622     CUB_RUNTIME_FUNCTION
HistogramRangecub::DeviceHistogram623     static cudaError_t HistogramRange(
624         void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
625         size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
626         SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
627         CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
628         int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
629         LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
630         OffsetT             num_row_samples,                        ///< [in] The number of data samples per row in the region of interest
631         OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
632         size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
633         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
634         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
635     {
636         CounterT*           d_histogram1[1]     = {d_histogram};
637         int                 num_levels1[1]      = {num_levels};
638         LevelT*             d_levels1[1]        = {d_levels};
639 
640         return MultiHistogramRange<1, 1>(
641             d_temp_storage,
642             temp_storage_bytes,
643             d_samples,
644             d_histogram1,
645             num_levels1,
646             d_levels1,
647             num_row_samples,
648             num_rows,
649             row_stride_bytes,
650             stream,
651             debug_synchronous);
652     }
653 
654     /**
655      * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
656      *
657      * \par
658      * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
659      *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
660      * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
661      *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
662      *   pixel samples).
663      * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
664      * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
665      *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
666      * - \devicestorage
667      *
668      * \par Snippet
669      * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms
670      * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
671      *
672      * \par
673      * \code
674      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
675      *
676      * // Declare, allocate, and initialize device-accessible pointers for input samples
677      * // and output histograms
678      * int            num_pixels;       // e.g., 5
679      * unsigned char  *d_samples;       // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2),
680      *                                  //        (0, 6, 7, 5),(3, 0, 2, 6)]
681      * unsigned int   *d_histogram[3];  // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
682      * int            num_levels[3];    // e.g., {5, 5, 5};
683      * unsigned int   *d_levels[3];     // e.g., [ [0, 2, 4, 6, 8],
684      *                                  //         [0, 2, 4, 6, 8],
685      *                                  //         [0, 2, 4, 6, 8] ];
686      * ...
687      *
688      * // Determine temporary device storage requirements
689      * void*    d_temp_storage = NULL;
690      * size_t   temp_storage_bytes = 0;
691      * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
692      *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
693      *
694      * // Allocate temporary storage
695      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
696      *
697      * // Compute histograms
698      * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
699      *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
700      *
701      * // d_histogram   <-- [ [1, 3, 0, 1],
702      * //                     [3, 0, 0, 2],
703      * //                     [0, 2, 0, 3] ]
704      *
705      * \endcode
706      *
707      * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
708      * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
709      * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
710      * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
711      * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
712      * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
713      */
714     template <
715         int                 NUM_CHANNELS,
716         int                 NUM_ACTIVE_CHANNELS,
717         typename            SampleIteratorT,
718         typename            CounterT,
719         typename            LevelT,
720         typename            OffsetT>
721     CUB_RUNTIME_FUNCTION
MultiHistogramRangecub::DeviceHistogram722     static cudaError_t MultiHistogramRange(
723         void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
724         size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
725         SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
726         CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
727         int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
728         LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
729         OffsetT             num_pixels,                             ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
730         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
731         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
732     {
733         /// The sample value type of the input iterator
734         typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
735 
736         return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
737             d_temp_storage,
738             temp_storage_bytes,
739             d_samples,
740             d_histogram,
741             num_levels,
742             d_levels,
743             num_pixels,
744             1,
745             sizeof(SampleT) * NUM_CHANNELS * num_pixels,
746             stream,
747             debug_synchronous);
748     }
749 
750 
751     /**
752      * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
753      *
754      * \par
755      * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
756      *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
757      * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
758      *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
759      *   pixel samples).
760      * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
761      *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
762      * - The row stride must be a whole multiple of the sample data type
763      *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
764      * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
765      * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
766      *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
767      * - \devicestorage
768      *
769      * \par Snippet
770      * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms from a 2x3 region of
771      * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
772      *
773      * \par
774      * \code
775      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
776      *
777      * // Declare, allocate, and initialize device-accessible pointers for input samples
778      * // and output histograms
779      * int              num_row_pixels;     // e.g., 3
780      * int              num_rows;           // e.g., 2
781      * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
782      * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -),
783      *                                      //        (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)]
784      * int*             d_histogram[3];     // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
785      * int              num_levels[3];      // e.g., {5, 5, 5};
786      * unsigned int*    d_levels[3];        // e.g., [ [0, 2, 4, 6, 8],
787      *                                      //         [0, 2, 4, 6, 8],
788      *                                      //         [0, 2, 4, 6, 8] ];
789      * ...
790      *
791      * // Determine temporary device storage requirements
792      * void*    d_temp_storage = NULL;
793      * size_t   temp_storage_bytes = 0;
794      * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
795      *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
796      *
797      * // Allocate temporary storage
798      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
799      *
800      * // Compute histograms
801      * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
802      *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
803      *
804      * // d_histogram   <-- [ [2, 3, 0, 1],
805      * //                     [3, 0, 0, 2],
806      * //                     [1, 2, 0, 3] ]
807      *
808      * \endcode
809      *
810      * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
811      * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
812      * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
813      * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
814      * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
815      * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
816      */
817     template <
818         int                 NUM_CHANNELS,
819         int                 NUM_ACTIVE_CHANNELS,
820         typename            SampleIteratorT,
821         typename            CounterT,
822         typename            LevelT,
823         typename            OffsetT>
824     CUB_RUNTIME_FUNCTION
MultiHistogramRangecub::DeviceHistogram825     static cudaError_t MultiHistogramRange(
826         void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
827         size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
828         SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
829         CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
830         int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
831         LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
832         OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
833         OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
834         size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
835         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
836         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
837     {
838         /// The sample value type of the input iterator
839         typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
840         Int2Type<sizeof(SampleT) == 1> is_byte_sample;
841 
842         if ((sizeof(OffsetT) > sizeof(int)) &&
843             ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
844         {
845             // Down-convert OffsetT data type
846             return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
847                 d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
848                 (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
849                 stream, debug_synchronous, is_byte_sample);
850         }
851 
852         return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
853             d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
854             num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
855             stream, debug_synchronous, is_byte_sample);
856     }
857 
858 
859 
860     //@}  end member group
861 };
862 
863 }               // CUB namespace
864 CUB_NS_POSTFIX  // Optional outer namespace(s)
865 
866 
867