1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *     * Redistributions of source code must retain the above copyright
9  *       notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above copyright
11  *       notice, this list of conditions and the following disclaimer in the
12  *       documentation and/or other materials provided with the distribution.
13  *     * Neither the name of the NVIDIA CORPORATION nor the
14  *       names of its contributors may be used to endorse or promote products
15  *       derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 /**
31  * \file
32  * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
33  */
34 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 #include <limits>
40 
41 #include "../../agent/agent_histogram.cuh"
42 #include "../../util_debug.cuh"
43 #include "../../util_device.cuh"
44 #include "../../thread/thread_search.cuh"
45 #include "../../grid/grid_queue.cuh"
46 #include "../../config.cuh"
47 
48 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
49 
50 /// Optional outer namespace(s)
51 CUB_NS_PREFIX
52 
53 /// CUB namespace
54 namespace cub {
55 
56 
57 
58 /******************************************************************************
59  * Histogram kernel entry points
60  *****************************************************************************/
61 
62 /**
63  * Histogram initialization kernel entry point
64  */
65 template <
66     int                                             NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
67     typename                                        CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
68     typename                                        OffsetT>                        ///< Signed integer type for global offsets
DeviceHistogramInitKernel(ArrayWrapper<int,NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,ArrayWrapper<CounterT *,NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper,GridQueue<int> tile_queue)69 __global__ void DeviceHistogramInitKernel(
70     ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
71     ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Histogram counter data having logical dimensions <tt>CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]</tt>
72     GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
73 {
74     if ((threadIdx.x == 0) && (blockIdx.x == 0))
75         tile_queue.ResetDrain();
76 
77     int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;
78 
79     #pragma unroll
80     for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
81     {
82         if (output_bin < num_output_bins_wrapper.array[CHANNEL])
83             d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
84     }
85 }
86 
87 
88 /**
89  * Histogram privatized sweep kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
90  */
91 template <
92     typename                                            AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
93     int                                                 PRIVATIZED_SMEM_BINS,           ///< Maximum number of histogram bins per channel (e.g., up to 256)
94     int                                                 NUM_CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
95     int                                                 NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
96     typename                                            SampleIteratorT,                ///< The input iterator type. \iterator.
97     typename                                            CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
98     typename                                            PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
99     typename                                            OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
100     typename                                            OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__(int (AgentHistogramPolicyT::BLOCK_THREADS))101 __launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
102 __global__ void DeviceHistogramSweepKernel(
103     SampleIteratorT                                         d_samples,                          ///< Input data to reduce
104     ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_output_bins_wrapper,            ///< The number bins per final output histogram
105     ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_privatized_bins_wrapper,        ///< The number bins per privatized histogram
106     ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_output_histograms_wrapper,        ///< Reference to final output histograms
107     ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_privatized_histograms_wrapper,    ///< Reference to privatized histograms
108     ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>      output_decode_op_wrapper,           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
109     ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>  privatized_decode_op_wrapper,       ///< The transform operator for determining privatized counter indices from samples, one for each channel
110     OffsetT                                                 num_row_pixels,                     ///< The number of multi-channel pixels per row in the region of interest
111     OffsetT                                                 num_rows,                           ///< The number of rows in the region of interest
112     OffsetT                                                 row_stride_samples,                 ///< The number of samples between starts of consecutive rows in the region of interest
113     int                                                     tiles_per_row,                      ///< Number of image tiles per row
114     GridQueue<int>                                          tile_queue)                         ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
115 {
116     // Thread block type for compositing input tiles
117     typedef AgentHistogram<
118             AgentHistogramPolicyT,
119             PRIVATIZED_SMEM_BINS,
120             NUM_CHANNELS,
121             NUM_ACTIVE_CHANNELS,
122             SampleIteratorT,
123             CounterT,
124             PrivatizedDecodeOpT,
125             OutputDecodeOpT,
126             OffsetT>
127         AgentHistogramT;
128 
129     // Shared memory for AgentHistogram
130     __shared__ typename AgentHistogramT::TempStorage temp_storage;
131 
132     AgentHistogramT agent(
133         temp_storage,
134         d_samples,
135         num_output_bins_wrapper.array,
136         num_privatized_bins_wrapper.array,
137         d_output_histograms_wrapper.array,
138         d_privatized_histograms_wrapper.array,
139         output_decode_op_wrapper.array,
140         privatized_decode_op_wrapper.array);
141 
142     // Initialize counters
143     agent.InitBinCounters();
144 
145     // Consume input tiles
146     agent.ConsumeTiles(
147         num_row_pixels,
148         num_rows,
149         row_stride_samples,
150         tiles_per_row,
151         tile_queue);
152 
153     // Store output to global (if necessary)
154     agent.StoreOutput();
155 
156 }
157 
158 
159 
160 
161 
162 
163 /******************************************************************************
164  * Dispatch
165  ******************************************************************************/
166 
167 /**
168  * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
169  */
170 template <
171     int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
172     int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
173     typename    SampleIteratorT,            ///< Random-access input iterator type for reading input items \iterator
174     typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
175     typename    LevelT,                     ///< Type for specifying bin level boundaries
176     typename    OffsetT>                    ///< Signed integer type for global offsets
177 struct DipatchHistogram
178 {
179     //---------------------------------------------------------------------
180     // Types and constants
181     //---------------------------------------------------------------------
182 
183     /// The sample value type of the input iterator
184     typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
185 
186     enum
187     {
188         // Maximum number of bins per channel for which we will use a privatized smem strategy
189         MAX_PRIVATIZED_SMEM_BINS = 256
190     };
191 
192 
193     //---------------------------------------------------------------------
194     // Transform functors for converting samples to bin-ids
195     //---------------------------------------------------------------------
196 
197     // Searches for bin given a list of bin-boundary levels
198     template <typename LevelIteratorT>
199     struct SearchTransform
200     {
201         LevelIteratorT  d_levels;                   // Pointer to levels array
202         int             num_output_levels;          // Number of levels in array
203 
204         // Initializer
Initcub::DipatchHistogram::SearchTransform205         __host__ __device__ __forceinline__ void Init(
206             LevelIteratorT  d_levels,               // Pointer to levels array
207             int             num_output_levels)      // Number of levels in array
208         {
209             this->d_levels          = d_levels;
210             this->num_output_levels = num_output_levels;
211         }
212 
213         // Method for converting samples to bin-ids
214         template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
BinSelectcub::DipatchHistogram::SearchTransform215         __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
216         {
217             /// Level iterator wrapper type
218             typedef typename If<IsPointer<LevelIteratorT>::VALUE,
219                     CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
220                     LevelIteratorT>::Type                                           // Directly use the supplied input iterator type
221                 WrappedLevelIteratorT;
222 
223             WrappedLevelIteratorT wrapped_levels(d_levels);
224 
225             int num_bins = num_output_levels - 1;
226             if (valid)
227             {
228                 bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
229                 if (bin >= num_bins)
230                     bin = -1;
231             }
232         }
233     };
234 
235 
236     // Scales samples to evenly-spaced bins
237     struct ScaleTransform
238     {
239         int    num_bins;    // Number of levels in array
240         LevelT max;         // Max sample level (exclusive)
241         LevelT min;         // Min sample level (inclusive)
242         LevelT scale;       // Bin scaling factor
243 
244         // Initializer
245         template <typename _LevelT>
Initcub::DipatchHistogram::ScaleTransform246         __host__ __device__ __forceinline__ void Init(
247             int     num_output_levels,  // Number of levels in array
248             _LevelT max,                // Max sample level (exclusive)
249             _LevelT min,                // Min sample level (inclusive)
250             _LevelT scale)              // Bin scaling factor
251         {
252             this->num_bins = num_output_levels - 1;
253             this->max = max;
254             this->min = min;
255             this->scale = scale;
256         }
257 
258         // Initializer (float specialization)
Initcub::DipatchHistogram::ScaleTransform259         __host__ __device__ __forceinline__ void Init(
260             int    num_output_levels,   // Number of levels in array
261             float   max,                // Max sample level (exclusive)
262             float   min,                // Min sample level (inclusive)
263             float   scale)              // Bin scaling factor
264         {
265             this->num_bins = num_output_levels - 1;
266             this->max = max;
267             this->min = min;
268             this->scale = float(1.0) / scale;
269         }
270 
271         // Initializer (double specialization)
Initcub::DipatchHistogram::ScaleTransform272         __host__ __device__ __forceinline__ void Init(
273             int    num_output_levels,   // Number of levels in array
274             double max,                 // Max sample level (exclusive)
275             double min,                 // Min sample level (inclusive)
276             double scale)               // Bin scaling factor
277         {
278             this->num_bins = num_output_levels - 1;
279             this->max = max;
280             this->min = min;
281             this->scale = double(1.0) / scale;
282         }
283 
284         // Method for converting samples to bin-ids
285         template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
BinSelectcub::DipatchHistogram::ScaleTransform286         __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
287         {
288             LevelT level_sample = (LevelT) sample;
289 
290             if (valid && (level_sample >= min) && (level_sample < max))
291                 bin = (int) ((level_sample - min) / scale);
292         }
293 
294         // Method for converting samples to bin-ids (float specialization)
295         template <CacheLoadModifier LOAD_MODIFIER>
BinSelectcub::DipatchHistogram::ScaleTransform296         __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
297         {
298             LevelT level_sample = (LevelT) sample;
299 
300             if (valid && (level_sample >= min) && (level_sample < max))
301                 bin = (int) ((level_sample - min) * scale);
302         }
303 
304         // Method for converting samples to bin-ids (double specialization)
305         template <CacheLoadModifier LOAD_MODIFIER>
BinSelectcub::DipatchHistogram::ScaleTransform306         __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
307         {
308             LevelT level_sample = (LevelT) sample;
309 
310             if (valid && (level_sample >= min) && (level_sample < max))
311                 bin = (int) ((level_sample - min) * scale);
312         }
313     };
314 
315 
316     // Pass-through bin transform operator
317     struct PassThruTransform
318     {
319         // Method for converting samples to bin-ids
320         template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
BinSelectcub::DipatchHistogram::PassThruTransform321         __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
322         {
323             if (valid)
324                 bin = (int) sample;
325         }
326     };
327 
328 
329 
330     //---------------------------------------------------------------------
331     // Tuning policies
332     //---------------------------------------------------------------------
333 
334     template <int NOMINAL_ITEMS_PER_THREAD>
335     struct TScale
336     {
337         enum
338         {
339             V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
340             VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
341         };
342     };
343 
    /// SM35 tuning policy
    struct Policy350
    {
        // HistogramSweepPolicy: 128 threads/block, TScale<8> pixels/thread,
        // direct block loads with LDG cache modifier.
        // NOTE(review): the meanings of the trailing parameters (true, BLEND,
        // true) are defined by AgentHistogramPolicy in agent_histogram.cuh —
        // confirm against that header before changing them.
        typedef AgentHistogramPolicy<
                128,
                TScale<8>::VALUE,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                true,
                BLEND,
                true>
            HistogramSweepPolicy;
    };
358 
    /// SM50 tuning policy
    struct Policy500
    {
        // HistogramSweepPolicy: 384 threads/block, TScale<16> pixels/thread,
        // direct block loads with LDG cache modifier.
        // NOTE(review): the meanings of the trailing parameters (true, SMEM,
        // false) are defined by AgentHistogramPolicy in agent_histogram.cuh —
        // confirm against that header before changing them.
        typedef AgentHistogramPolicy<
                384,
                TScale<16>::VALUE,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                true,
                SMEM,
                false>
            HistogramSweepPolicy;
    };
373 
374 
375 
376     //---------------------------------------------------------------------
377     // Tuning policies of current PTX compiler pass
378     //---------------------------------------------------------------------
379 
380 #if (CUB_PTX_ARCH >= 500)
381     typedef Policy500 PtxPolicy;
382 
383 #else
384     typedef Policy350 PtxPolicy;
385 
386 #endif
387 
388     // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
389     struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
390 
391 
392     //---------------------------------------------------------------------
393     // Utilities
394     //---------------------------------------------------------------------
395 
396     /**
397      * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
398      */
399     template <typename KernelConfig>
400     CUB_RUNTIME_FUNCTION __forceinline__
InitConfigscub::DipatchHistogram401     static cudaError_t InitConfigs(
402         int             ptx_version,
403         KernelConfig    &histogram_sweep_config)
404     {
405         cudaError_t result = cudaErrorNotSupported;
406         if (CUB_IS_DEVICE_CODE)
407         {
408             #if CUB_INCLUDE_DEVICE_CODE
409                 // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
410                 result = histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
411             #endif
412         }
413         else
414         {
415             #if CUB_INCLUDE_HOST_CODE
416                 // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
417                 if (ptx_version >= 500)
418                 {
419                     result = histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
420                 }
421                 else
422                 {
423                     result = histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
424                 }
425             #endif
426         }
427         return result;
428     }
429 
430 
431     /**
432      * Kernel kernel dispatch configuration
433      */
434     struct KernelConfig
435     {
436         int                             block_threads;
437         int                             pixels_per_thread;
438 
439         template <typename BlockPolicy>
440         CUB_RUNTIME_FUNCTION __forceinline__
Initcub::DipatchHistogram::KernelConfig441         cudaError_t Init()
442         {
443             block_threads               = BlockPolicy::BLOCK_THREADS;
444             pixels_per_thread           = BlockPolicy::PIXELS_PER_THREAD;
445 
446             return cudaSuccess;
447         }
448     };
449 
450 
451     //---------------------------------------------------------------------
452     // Dispatch entrypoints
453     //---------------------------------------------------------------------
454 
455     /**
456      * Privatization-based dispatch routine
457      */
458     template <
459         typename                            PrivatizedDecodeOpT,                            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
460         typename                            OutputDecodeOpT,                                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
461         typename                            DeviceHistogramInitKernelT,                     ///< Function type of cub::DeviceHistogramInitKernel
462         typename                            DeviceHistogramSweepKernelT>                    ///< Function type of cub::DeviceHistogramSweepKernel
463     CUB_RUNTIME_FUNCTION __forceinline__
PrivatizedDispatchcub::DipatchHistogram464     static cudaError_t PrivatizedDispatch(
465         void*                               d_temp_storage,                                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
466         size_t&                             temp_storage_bytes,                             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
467         SampleIteratorT                     d_samples,                                      ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
468         CounterT*                           d_output_histograms[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
469         int                                 num_privatized_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
470         PrivatizedDecodeOpT                 privatized_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining bin-ids from samples, one for each channel
471         int                                 num_output_levels[NUM_ACTIVE_CHANNELS],         ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
472         OutputDecodeOpT                     output_decode_op[NUM_ACTIVE_CHANNELS],          ///< [in] Transform operators for determining bin-ids from samples, one for each channel
473         int                                 max_num_output_bins,                            ///< [in] Maximum number of output bins in any channel
474         OffsetT                             num_row_pixels,                                 ///< [in] The number of multi-channel pixels per row in the region of interest
475         OffsetT                             num_rows,                                       ///< [in] The number of rows in the region of interest
476         OffsetT                             row_stride_samples,                             ///< [in] The number of samples between starts of consecutive rows in the region of interest
477         DeviceHistogramInitKernelT          histogram_init_kernel,                          ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
478         DeviceHistogramSweepKernelT         histogram_sweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
479         KernelConfig                        histogram_sweep_config,                         ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
480         cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
481         bool                                debug_synchronous)                              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
482     {
483     #ifndef CUB_RUNTIME_ENABLED
484 
485         // Kernel launch not supported from this device
486         return CubDebug(cudaErrorNotSupported);
487 
488     #else
489 
490         cudaError error = cudaSuccess;
491         do
492         {
493             // Get device ordinal
494             int device_ordinal;
495             if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
496 
497             // Get SM count
498             int sm_count;
499             if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
500 
501             // Get SM occupancy for histogram_sweep_kernel
502             int histogram_sweep_sm_occupancy;
503             if (CubDebug(error = MaxSmOccupancy(
504                 histogram_sweep_sm_occupancy,
505                 histogram_sweep_kernel,
506                 histogram_sweep_config.block_threads))) break;
507 
508             // Get device occupancy for histogram_sweep_kernel
509             int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
510 
511             if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
512             {
513                 // Treat as a single linear array of samples
514                 num_row_pixels      *= num_rows;
515                 num_rows            = 1;
516                 row_stride_samples  = num_row_pixels * NUM_CHANNELS;
517             }
518 
519             // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
520             int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
521             int tiles_per_row       = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
522             int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
523             int blocks_per_col      = (blocks_per_row > 0) ?
524                                         int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
525                                         0;
526             int num_thread_blocks   = blocks_per_row * blocks_per_col;
527 
528             dim3 sweep_grid_dims;
529             sweep_grid_dims.x = (unsigned int) blocks_per_row;
530             sweep_grid_dims.y = (unsigned int) blocks_per_col;
531             sweep_grid_dims.z = 1;
532 
533             // Temporary storage allocation requirements
534             const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
535             void*       allocations[NUM_ALLOCATIONS] = {};
536             size_t      allocation_sizes[NUM_ALLOCATIONS];
537 
538             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
539                 allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
540 
541             allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
542 
543             // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
544             if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
545             if (d_temp_storage == NULL)
546             {
547                 // Return if the caller is simply requesting the size of the storage allocation
548                 break;
549             }
550 
551             // Construct the grid queue descriptor
552             GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
553 
554             // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
555             ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
556             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
557                 d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
558 
559             // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
560             ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
561             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
562                 d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
563 
564             // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
565             ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
566             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
567                 privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
568 
569             // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
570             ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
571             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
572                 output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
573 
574             // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
575             ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
576             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
577                 num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
578 
579             // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
580             ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
581             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
582                 num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
583 
584             int histogram_init_block_threads    = 256;
585             int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
586 
587             // Log DeviceHistogramInitKernel configuration
588             if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
589                 histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
590 
591             // Invoke histogram_init_kernel
592             thrust::cuda_cub::launcher::triple_chevron(
593                 histogram_init_grid_dims, histogram_init_block_threads, 0,
594                 stream
595             ).doit(histogram_init_kernel,
596                 num_output_bins_wrapper,
597                 d_output_histograms_wrapper,
598                 tile_queue);
599 
600             // Return if empty problem
601             if ((blocks_per_row == 0) || (blocks_per_col == 0))
602                 break;
603 
604             // Log histogram_sweep_kernel configuration
605             if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
606                 sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
607                 histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
608 
609             // Invoke histogram_sweep_kernel
610             thrust::cuda_cub::launcher::triple_chevron(
611                 sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
612             ).doit(histogram_sweep_kernel,
613                 d_samples,
614                 num_output_bins_wrapper,
615                 num_privatized_bins_wrapper,
616                 d_output_histograms_wrapper,
617                 d_privatized_histograms_wrapper,
618                 output_decode_op_wrapper,
619                 privatized_decode_op_wrapper,
620                 num_row_pixels,
621                 num_rows,
622                 row_stride_samples,
623                 tiles_per_row,
624                 tile_queue);
625 
626             // Check for failure to launch
627             if (CubDebug(error = cudaPeekAtLastError())) break;
628 
629             // Sync the stream if specified to flush runtime errors
630             if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
631 
632         }
633         while (0);
634 
635         return error;
636 
637     #endif // CUB_RUNTIME_ENABLED
638     }
639 
640 
641 
642     /**
643      * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit
644      */
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchRange(
        void*               d_temp_storage,                                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version = 0;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the search transform op for converting samples to privatized bins
            // (binary search over the caller-provided level boundaries)
            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;

            // Use the pass-thru transform op for converting privatized bins to output bins
            // (for this specialization, privatized bins coincide with output bins, so
            // aggregation is a pass-thru; note num_output_levels is passed below for
            // both the privatized and the output level counts)
            typedef PassThruTransform OutputDecodeOpT;

            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
            int                     max_levels = num_output_levels[0];      // Maximum number of levels in any channel

            // Initialize each channel's search op and track the maximum level count
            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            // Dispatch on whether the widest channel's bins fit in shared memory
            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
            {
                // Too many bins to keep in shared memory.  The zero-smem-bin kernel
                // specialization privatizes histograms in global memory instead.
                const int PRIVATIZED_SMEM_BINS = 0;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }

        } while (0);

        return error;
    }
744 
745 
746     /**
747      * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
748      */
749     CUB_RUNTIME_FUNCTION
DispatchRangecub::DipatchHistogram750     static cudaError_t DispatchRange(
751         void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
752         size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
753         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
754         CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
755         int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
756         LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
757         OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
758         OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
759         OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
760         cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
761         bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
762         Int2Type<true>      /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
763     {
764         cudaError error = cudaSuccess;
765         do
766         {
767             // Get PTX version
768             int ptx_version = 0;
769             if (CubDebug(error = PtxVersion(ptx_version))) break;
770 
771             // Get kernel dispatch configurations
772             KernelConfig histogram_sweep_config;
773             if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
774                 break;
775 
776             // Use the pass-thru transform op for converting samples to privatized bins
777             typedef PassThruTransform PrivatizedDecodeOpT;
778 
779             // Use the search transform op for converting privatized bins to output bins
780             typedef SearchTransform<LevelT*> OutputDecodeOpT;
781 
782             int                         num_privatized_levels[NUM_ACTIVE_CHANNELS];
783             PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
784             OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
785             int                         max_levels = num_output_levels[0];              // Maximum number of levels in any channel
786 
787             for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
788             {
789                 num_privatized_levels[channel] = 257;
790                 output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
791 
792                 if (num_output_levels[channel] > max_levels)
793                     max_levels = num_output_levels[channel];
794             }
795             int max_num_output_bins = max_levels - 1;
796 
797             const int PRIVATIZED_SMEM_BINS = 256;
798 
799             if (CubDebug(error = PrivatizedDispatch(
800                 d_temp_storage,
801                 temp_storage_bytes,
802                 d_samples,
803                 d_output_histograms,
804                 num_privatized_levels,
805                 privatized_decode_op,
806                 num_output_levels,
807                 output_decode_op,
808                 max_num_output_bins,
809                 num_row_pixels,
810                 num_rows,
811                 row_stride_samples,
812                 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
813                 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
814                 histogram_sweep_config,
815                 stream,
816                 debug_synchronous))) break;
817 
818         } while (0);
819 
820         return error;
821     }
822 
823 
824     /**
825      * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
826      */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t DispatchEven(
        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version = 0;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the scale transform op for converting samples to privatized bins
            // (evenly-spaced bins permit direct arithmetic binning, no search needed)
            typedef ScaleTransform PrivatizedDecodeOpT;

            // Use the pass-thru transform op for converting privatized bins to output bins
            // (privatized bins coincide with output bins in this specialization)
            typedef PassThruTransform OutputDecodeOpT;

            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
            int                         max_levels = num_output_levels[0];      // Maximum number of levels in any channel

            // Initialize each channel's scale op and track the maximum level count
            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                int     bins    = num_output_levels[channel] - 1;
                // NOTE(review): for integral LevelT this division truncates, and
                // bins == 0 (a single level) would divide by zero — presumably the
                // public API enforces num_output_levels >= 2; confirm at call sites
                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;

                privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);

                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            // Dispatch on whether the widest channel's bins fit in shared memory
            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
            {
                // Too many bins to keep in shared memory.  The zero-smem-bin kernel
                // specialization privatizes histograms in global memory instead.
                const int PRIVATIZED_SMEM_BINS = 0;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
        }
        while (0);

        return error;
    }
930 
931 
932     /**
933      * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
934      */
935     CUB_RUNTIME_FUNCTION __forceinline__
DispatchEvencub::DipatchHistogram936     static cudaError_t DispatchEven(
937         void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
938         size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
939         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
940         CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
941         int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
942         LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
943         LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
944         OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
945         OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
946         OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
947         cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
948         bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
949         Int2Type<true>      /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
950     {
951         cudaError error = cudaSuccess;
952         do
953         {
954             // Get PTX version
955             int ptx_version = 0;
956             if (CubDebug(error = PtxVersion(ptx_version))) break;
957 
958             // Get kernel dispatch configurations
959             KernelConfig histogram_sweep_config;
960             if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
961                 break;
962 
963             // Use the pass-thru transform op for converting samples to privatized bins
964             typedef PassThruTransform PrivatizedDecodeOpT;
965 
966             // Use the scale transform op for converting privatized bins to output bins
967             typedef ScaleTransform OutputDecodeOpT;
968 
969             int                     num_privatized_levels[NUM_ACTIVE_CHANNELS];
970             PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
971             OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
972             int                     max_levels = num_output_levels[0];
973 
974             for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
975             {
976                 num_privatized_levels[channel] = 257;
977 
978                 int     bins    = num_output_levels[channel] - 1;
979                 LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
980                 output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
981 
982                 if (num_output_levels[channel] > max_levels)
983                     max_levels = num_output_levels[channel];
984             }
985             int max_num_output_bins = max_levels - 1;
986 
987             const int PRIVATIZED_SMEM_BINS = 256;
988 
989             if (CubDebug(error = PrivatizedDispatch(
990                 d_temp_storage,
991                 temp_storage_bytes,
992                 d_samples,
993                 d_output_histograms,
994                 num_privatized_levels,
995                 privatized_decode_op,
996                 num_output_levels,
997                 output_decode_op,
998                 max_num_output_bins,
999                 num_row_pixels,
1000                 num_rows,
1001                 row_stride_samples,
1002                 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
1003                 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
1004                 histogram_sweep_config,
1005                 stream,
1006                 debug_synchronous))) break;
1007 
1008         }
1009         while (0);
1010 
1011         return error;
1012     }
1013 
1014 };
1015 
1016 
1017 }               // CUB namespace
1018 CUB_NS_POSTFIX  // Optional outer namespace(s)
1019 
1020 
1021