1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *     * Redistributions of source code must retain the above copyright
9  *       notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above copyright
11  *       notice, this list of conditions and the following disclaimer in the
12  *       documentation and/or other materials provided with the distribution.
13  *     * Neither the name of the NVIDIA CORPORATION nor the
14  *       names of its contributors may be used to endorse or promote products
15  *       derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 /**
31  * \file
32  * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
33  */
34 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 #include <limits>
40 
41 #include "../../agent/agent_histogram.cuh"
42 #include "../../util_debug.cuh"
43 #include "../../util_device.cuh"
44 #include "../../thread/thread_search.cuh"
45 #include "../../grid/grid_queue.cuh"
46 #include "../../util_namespace.cuh"
47 
48 /// Optional outer namespace(s)
49 CUB_NS_PREFIX
50 
51 /// CUB namespace
52 namespace cub {
53 
54 
55 
56 /******************************************************************************
57  * Histogram kernel entry points
58  *****************************************************************************/
59 
60 /**
61  * Histogram initialization kernel entry point
62  */
63 template <
64     int                                             NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
65     typename                                        CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
66     typename                                        OffsetT>                        ///< Signed integer type for global offsets
DeviceHistogramInitKernel(ArrayWrapper<int,NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,ArrayWrapper<CounterT *,NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper,GridQueue<int> tile_queue)67 __global__ void DeviceHistogramInitKernel(
68     ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
69     ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Histogram counter data having logical dimensions <tt>CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]</tt>
70     GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
71 {
72     if ((threadIdx.x == 0) && (blockIdx.x == 0))
73         tile_queue.ResetDrain();
74 
75     int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;
76 
77     #pragma unroll
78     for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
79     {
80         if (output_bin < num_output_bins_wrapper.array[CHANNEL])
81             d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
82     }
83 }
84 
85 
86 /**
87  * Histogram privatized sweep kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
88  */
89 template <
90     typename                                            AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
91     int                                                 PRIVATIZED_SMEM_BINS,           ///< Maximum number of histogram bins per channel (e.g., up to 256)
92     int                                                 NUM_CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
93     int                                                 NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
94     typename                                            SampleIteratorT,                ///< The input iterator type. \iterator.
95     typename                                            CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
96     typename                                            PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
97     typename                                            OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
98     typename                                            OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__(int (AgentHistogramPolicyT::BLOCK_THREADS))99 __launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
100 __global__ void DeviceHistogramSweepKernel(
101     SampleIteratorT                                         d_samples,                          ///< Input data to reduce
102     ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_output_bins_wrapper,            ///< The number bins per final output histogram
103     ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_privatized_bins_wrapper,        ///< The number bins per privatized histogram
104     ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_output_histograms_wrapper,        ///< Reference to final output histograms
105     ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_privatized_histograms_wrapper,    ///< Reference to privatized histograms
106     ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>      output_decode_op_wrapper,           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
107     ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>  privatized_decode_op_wrapper,       ///< The transform operator for determining privatized counter indices from samples, one for each channel
108     OffsetT                                                 num_row_pixels,                     ///< The number of multi-channel pixels per row in the region of interest
109     OffsetT                                                 num_rows,                           ///< The number of rows in the region of interest
110     OffsetT                                                 row_stride_samples,                 ///< The number of samples between starts of consecutive rows in the region of interest
111     int                                                     tiles_per_row,                      ///< Number of image tiles per row
112     GridQueue<int>                                          tile_queue)                         ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
113 {
114     // Thread block type for compositing input tiles
115     typedef AgentHistogram<
116             AgentHistogramPolicyT,
117             PRIVATIZED_SMEM_BINS,
118             NUM_CHANNELS,
119             NUM_ACTIVE_CHANNELS,
120             SampleIteratorT,
121             CounterT,
122             PrivatizedDecodeOpT,
123             OutputDecodeOpT,
124             OffsetT>
125         AgentHistogramT;
126 
127     // Shared memory for AgentHistogram
128     __shared__ typename AgentHistogramT::TempStorage temp_storage;
129 
130     AgentHistogramT agent(
131         temp_storage,
132         d_samples,
133         num_output_bins_wrapper.array,
134         num_privatized_bins_wrapper.array,
135         d_output_histograms_wrapper.array,
136         d_privatized_histograms_wrapper.array,
137         output_decode_op_wrapper.array,
138         privatized_decode_op_wrapper.array);
139 
140     // Initialize counters
141     agent.InitBinCounters();
142 
143     // Consume input tiles
144     agent.ConsumeTiles(
145         num_row_pixels,
146         num_rows,
147         row_stride_samples,
148         tiles_per_row,
149         tile_queue);
150 
151     // Store output to global (if necessary)
152     agent.StoreOutput();
153 
154 }
155 
156 
157 
158 
159 
160 
161 /******************************************************************************
162  * Dispatch
163  ******************************************************************************/
164 
165 /**
166  * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
167  */
168 template <
169     int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
170     int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
171     typename    SampleIteratorT,            ///< Random-access input iterator type for reading input items \iterator
172     typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
173     typename    LevelT,                     ///< Type for specifying bin level boundaries
174     typename    OffsetT>                    ///< Signed integer type for global offsets
175 struct DipatchHistogram
176 {
177     //---------------------------------------------------------------------
178     // Types and constants
179     //---------------------------------------------------------------------
180 
    /// The sample value type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    enum
    {
        // Maximum number of bins per channel for which we will use a privatized
        // shared-memory strategy; channels with more bins than this fall back
        // to a non-smem privatization (selected by the dispatch entrypoints)
        MAX_PRIVATIZED_SMEM_BINS = 256
    };
189 
190 
191     //---------------------------------------------------------------------
192     // Transform functors for converting samples to bin-ids
193     //---------------------------------------------------------------------
194 
195     // Searches for bin given a list of bin-boundary levels
196     template <typename LevelIteratorT>
197     struct SearchTransform
198     {
199         LevelIteratorT  d_levels;                   // Pointer to levels array
200         int             num_output_levels;          // Number of levels in array
201 
202         // Initializer
Initcub::DipatchHistogram::SearchTransform203         __host__ __device__ __forceinline__ void Init(
204             LevelIteratorT  d_levels,               // Pointer to levels array
205             int             num_output_levels)      // Number of levels in array
206         {
207             this->d_levels          = d_levels;
208             this->num_output_levels = num_output_levels;
209         }
210 
211         // Method for converting samples to bin-ids
212         template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
BinSelectcub::DipatchHistogram::SearchTransform213         __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
214         {
215             /// Level iterator wrapper type
216             typedef typename If<IsPointer<LevelIteratorT>::VALUE,
217                     CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
218                     LevelIteratorT>::Type                                           // Directly use the supplied input iterator type
219                 WrappedLevelIteratorT;
220 
221             WrappedLevelIteratorT wrapped_levels(d_levels);
222 
223             int num_bins = num_output_levels - 1;
224             if (valid)
225             {
226                 bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
227                 if (bin >= num_bins)
228                     bin = -1;
229             }
230         }
231     };
232 
233 
234     // Scales samples to evenly-spaced bins
235     struct ScaleTransform
236     {
237         int    num_bins;    // Number of levels in array
238         LevelT max;         // Max sample level (exclusive)
239         LevelT min;         // Min sample level (inclusive)
240         LevelT scale;       // Bin scaling factor
241 
242         // Initializer
243         template <typename _LevelT>
Initcub::DipatchHistogram::ScaleTransform244         __host__ __device__ __forceinline__ void Init(
245             int     num_output_levels,  // Number of levels in array
246             _LevelT max,                // Max sample level (exclusive)
247             _LevelT min,                // Min sample level (inclusive)
248             _LevelT scale)              // Bin scaling factor
249         {
250             this->num_bins = num_output_levels - 1;
251             this->max = max;
252             this->min = min;
253             this->scale = scale;
254         }
255 
256         // Initializer (float specialization)
Initcub::DipatchHistogram::ScaleTransform257         __host__ __device__ __forceinline__ void Init(
258             int    num_output_levels,   // Number of levels in array
259             float   max,                // Max sample level (exclusive)
260             float   min,                // Min sample level (inclusive)
261             float   scale)              // Bin scaling factor
262         {
263             this->num_bins = num_output_levels - 1;
264             this->max = max;
265             this->min = min;
266             this->scale = float(1.0) / scale;
267         }
268 
269         // Initializer (double specialization)
Initcub::DipatchHistogram::ScaleTransform270         __host__ __device__ __forceinline__ void Init(
271             int    num_output_levels,   // Number of levels in array
272             double max,                 // Max sample level (exclusive)
273             double min,                 // Min sample level (inclusive)
274             double scale)               // Bin scaling factor
275         {
276             this->num_bins = num_output_levels - 1;
277             this->max = max;
278             this->min = min;
279             this->scale = double(1.0) / scale;
280         }
281 
282         // Method for converting samples to bin-ids
283         template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
BinSelectcub::DipatchHistogram::ScaleTransform284         __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
285         {
286             LevelT level_sample = (LevelT) sample;
287 
288             if (valid && (level_sample >= min) && (level_sample < max))
289                 bin = (int) ((level_sample - min) / scale);
290         }
291 
292         // Method for converting samples to bin-ids (float specialization)
293         template <CacheLoadModifier LOAD_MODIFIER>
BinSelectcub::DipatchHistogram::ScaleTransform294         __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
295         {
296             LevelT level_sample = (LevelT) sample;
297 
298             if (valid && (level_sample >= min) && (level_sample < max))
299                 bin = (int) ((level_sample - min) * scale);
300         }
301 
302         // Method for converting samples to bin-ids (double specialization)
303         template <CacheLoadModifier LOAD_MODIFIER>
BinSelectcub::DipatchHistogram::ScaleTransform304         __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
305         {
306             LevelT level_sample = (LevelT) sample;
307 
308             if (valid && (level_sample >= min) && (level_sample < max))
309                 bin = (int) ((level_sample - min) * scale);
310         }
311     };
312 
313 
314     // Pass-through bin transform operator
315     struct PassThruTransform
316     {
317         // Method for converting samples to bin-ids
318         template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
BinSelectcub::DipatchHistogram::PassThruTransform319         __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
320         {
321             if (valid)
322                 bin = (int) sample;
323         }
324     };
325 
326 
327 
328     //---------------------------------------------------------------------
329     // Tuning policies
330     //---------------------------------------------------------------------
331 
    // Scales a nominal per-thread workload down by the number of active
    // channels and by the relative size of the sample type, flooring at 1
    template <int NOMINAL_ITEMS_PER_THREAD>
    struct TScale
    {
        enum
        {
            // Number of int-sized words needed to hold one sample (rounded up)
            V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
            // Resulting pixels-per-thread (at least 1)
            VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
        };
    };
341 
342 
    /// SM11
    struct Policy110
    {
        // HistogramSweepPolicy
        // (Parameter meanings: threads-per-block, pixels-per-thread,
        //  block-load algorithm, cache load modifier, then flags/preferences —
        //  NOTE(review): see AgentHistogramPolicy in agent_histogram.cuh for
        //  the exact semantics of the trailing parameters)
        typedef AgentHistogramPolicy<
                512,                            // Threads per block
                (NUM_CHANNELS == 1) ? 8 : 2,    // Pixels per thread
                BLOCK_LOAD_DIRECT,              // Block-load algorithm
                LOAD_DEFAULT,                   // Cache load modifier
                true,
                GMEM,                           // Privatized-counter memory preference
                false>
            HistogramSweepPolicy;
    };
357 
    /// SM20
    struct Policy200
    {
        // HistogramSweepPolicy
        // (NOTE(review): see AgentHistogramPolicy in agent_histogram.cuh for
        //  the exact semantics of the trailing parameters)
        typedef AgentHistogramPolicy<
                (NUM_CHANNELS == 1) ? 256 : 128,                                    // Threads per block
                (NUM_CHANNELS == 1) ? 8 : 3,                                        // Pixels per thread
                (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,// Block-load algorithm
                LOAD_DEFAULT,                                                       // Cache load modifier
                true,
                SMEM,                                                               // Privatized-counter memory preference
                false>
            HistogramSweepPolicy;
    };
372 
    /// SM30
    struct Policy300
    {
        // HistogramSweepPolicy
        // (NOTE(review): see AgentHistogramPolicy in agent_histogram.cuh for
        //  the exact semantics of the trailing parameters)
        typedef AgentHistogramPolicy<
                512,                            // Threads per block
                (NUM_CHANNELS == 1) ? 8 : 2,    // Pixels per thread
                BLOCK_LOAD_DIRECT,              // Block-load algorithm
                LOAD_DEFAULT,                   // Cache load modifier
                true,
                GMEM,                           // Privatized-counter memory preference
                false>
            HistogramSweepPolicy;
    };
387 
    /// SM35
    struct Policy350
    {
        // HistogramSweepPolicy
        // (NOTE(review): see AgentHistogramPolicy in agent_histogram.cuh for
        //  the exact semantics of the trailing parameters)
        typedef AgentHistogramPolicy<
                128,                            // Threads per block
                TScale<8>::VALUE,               // Pixels per thread, scaled from a nominal 8
                BLOCK_LOAD_DIRECT,              // Block-load algorithm
                LOAD_LDG,                       // Cache load modifier (read-only data cache)
                true,
                BLEND,                          // Privatized-counter memory preference
                true>
            HistogramSweepPolicy;
    };
402 
    /// SM50
    struct Policy500
    {
        // HistogramSweepPolicy
        // (NOTE(review): see AgentHistogramPolicy in agent_histogram.cuh for
        //  the exact semantics of the trailing parameters)
        typedef AgentHistogramPolicy<
                384,                            // Threads per block
                TScale<16>::VALUE,              // Pixels per thread, scaled from a nominal 16
                BLOCK_LOAD_DIRECT,              // Block-load algorithm
                LOAD_LDG,                       // Cache load modifier (read-only data cache)
                true,
                SMEM,                           // Privatized-counter memory preference
                false>
            HistogramSweepPolicy;
    };
417 
418 
419 
420     //---------------------------------------------------------------------
421     // Tuning policies of current PTX compiler pass
422     //---------------------------------------------------------------------
423 
424 #if (CUB_PTX_ARCH >= 500)
425     typedef Policy500 PtxPolicy;
426 
427 #elif (CUB_PTX_ARCH >= 350)
428     typedef Policy350 PtxPolicy;
429 
430 #elif (CUB_PTX_ARCH >= 300)
431     typedef Policy300 PtxPolicy;
432 
433 #elif (CUB_PTX_ARCH >= 200)
434     typedef Policy200 PtxPolicy;
435 
436 #else
437     typedef Policy110 PtxPolicy;
438 
439 #endif
440 
441     // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
442     struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
443 
444 
445     //---------------------------------------------------------------------
446     // Utilities
447     //---------------------------------------------------------------------
448 
449     /**
450      * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
451      */
452     template <typename KernelConfig>
453     CUB_RUNTIME_FUNCTION __forceinline__
InitConfigscub::DipatchHistogram454     static cudaError_t InitConfigs(
455         int             ptx_version,
456         KernelConfig    &histogram_sweep_config)
457     {
458     #if (CUB_PTX_ARCH > 0)
459 
460         // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
461         return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
462 
463     #else
464 
465         // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
466         if (ptx_version >= 500)
467         {
468             return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
469         }
470         else if (ptx_version >= 350)
471         {
472             return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
473         }
474         else if (ptx_version >= 300)
475         {
476             return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
477         }
478         else if (ptx_version >= 200)
479         {
480             return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
481         }
482         else if (ptx_version >= 110)
483         {
484             return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
485         }
486         else
487         {
488             // No global atomic support
489             return cudaErrorNotSupported;
490         }
491 
492     #endif
493     }
494 
495 
496     /**
497      * Kernel kernel dispatch configuration
498      */
499     struct KernelConfig
500     {
501         int                             block_threads;
502         int                             pixels_per_thread;
503 
504         template <typename BlockPolicy>
505         CUB_RUNTIME_FUNCTION __forceinline__
Initcub::DipatchHistogram::KernelConfig506         cudaError_t Init()
507         {
508             block_threads               = BlockPolicy::BLOCK_THREADS;
509             pixels_per_thread           = BlockPolicy::PIXELS_PER_THREAD;
510 
511             return cudaSuccess;
512         }
513     };
514 
515 
516     //---------------------------------------------------------------------
517     // Dispatch entrypoints
518     //---------------------------------------------------------------------
519 
520     /**
521      * Privatization-based dispatch routine
522      */
523     template <
524         typename                            PrivatizedDecodeOpT,                            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
525         typename                            OutputDecodeOpT,                                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
526         typename                            DeviceHistogramInitKernelT,                     ///< Function type of cub::DeviceHistogramInitKernel
527         typename                            DeviceHistogramSweepKernelT>                    ///< Function type of cub::DeviceHistogramSweepKernel
528     CUB_RUNTIME_FUNCTION __forceinline__
PrivatizedDispatchcub::DipatchHistogram529     static cudaError_t PrivatizedDispatch(
530         void*                               d_temp_storage,                                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
531         size_t&                             temp_storage_bytes,                             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
532         SampleIteratorT                     d_samples,                                      ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
533         CounterT*                           d_output_histograms[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
534         int                                 num_privatized_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
535         PrivatizedDecodeOpT                 privatized_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining bin-ids from samples, one for each channel
536         int                                 num_output_levels[NUM_ACTIVE_CHANNELS],         ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
537         OutputDecodeOpT                     output_decode_op[NUM_ACTIVE_CHANNELS],          ///< [in] Transform operators for determining bin-ids from samples, one for each channel
538         int                                 max_num_output_bins,                            ///< [in] Maximum number of output bins in any channel
539         OffsetT                             num_row_pixels,                                 ///< [in] The number of multi-channel pixels per row in the region of interest
540         OffsetT                             num_rows,                                       ///< [in] The number of rows in the region of interest
541         OffsetT                             row_stride_samples,                             ///< [in] The number of samples between starts of consecutive rows in the region of interest
542         DeviceHistogramInitKernelT          histogram_init_kernel,                          ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
543         DeviceHistogramSweepKernelT         histogram_sweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
544         KernelConfig                        histogram_sweep_config,                         ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
545         cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
546         bool                                debug_synchronous)                              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
547     {
548     #ifndef CUB_RUNTIME_ENABLED
549 
550         // Kernel launch not supported from this device
551         return CubDebug(cudaErrorNotSupported);
552 
553     #else
554 
555         cudaError error = cudaSuccess;
556         do
557         {
558             // Get device ordinal
559             int device_ordinal;
560             if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
561 
562             // Get SM count
563             int sm_count;
564             if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
565 
566             // Get SM occupancy for histogram_sweep_kernel
567             int histogram_sweep_sm_occupancy;
568             if (CubDebug(error = MaxSmOccupancy(
569                 histogram_sweep_sm_occupancy,
570                 histogram_sweep_kernel,
571                 histogram_sweep_config.block_threads))) break;
572 
573             // Get device occupancy for histogram_sweep_kernel
574             int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
575 
576             if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
577             {
578                 // Treat as a single linear array of samples
579                 num_row_pixels      *= num_rows;
580                 num_rows            = 1;
581                 row_stride_samples  = num_row_pixels * NUM_CHANNELS;
582             }
583 
584             // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
585             int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
586             int tiles_per_row       = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
587             int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
588             int blocks_per_col      = (blocks_per_row > 0) ?
589                                         int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
590                                         0;
591             int num_thread_blocks   = blocks_per_row * blocks_per_col;
592 
593             dim3 sweep_grid_dims;
594             sweep_grid_dims.x = (unsigned int) blocks_per_row;
595             sweep_grid_dims.y = (unsigned int) blocks_per_col;
596             sweep_grid_dims.z = 1;
597 
598             // Temporary storage allocation requirements
599             const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
600             void*       allocations[NUM_ALLOCATIONS];
601             size_t      allocation_sizes[NUM_ALLOCATIONS];
602 
603             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
604                 allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
605 
606             allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
607 
608             // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
609             if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
610             if (d_temp_storage == NULL)
611             {
612                 // Return if the caller is simply requesting the size of the storage allocation
613                 break;
614             }
615 
616             // Construct the grid queue descriptor
617             GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
618 
619             // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
620             ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
621             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
622                 d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
623 
624             // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
625             ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
626             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
627                 d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
628 
629             // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
630             ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
631             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
632                 privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
633 
634             // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
635             ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
636             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
637                 output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
638 
639             // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
640             ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
641             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
642                 num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
643 
644             // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
645             ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
646             for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
647                 num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
648 
649             int histogram_init_block_threads    = 256;
650             int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
651 
652             // Log DeviceHistogramInitKernel configuration
653             if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
654                 histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
655 
656             // Invoke histogram_init_kernel
657             histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
658                 num_output_bins_wrapper,
659                 d_output_histograms_wrapper,
660                 tile_queue);
661 
662             // Return if empty problem
663             if ((blocks_per_row == 0) || (blocks_per_col == 0))
664                 break;
665 
666             // Log histogram_sweep_kernel configuration
667             if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
668                 sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
669                 histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
670 
671             // Invoke histogram_sweep_kernel
672             histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
673                 d_samples,
674                 num_output_bins_wrapper,
675                 num_privatized_bins_wrapper,
676                 d_output_histograms_wrapper,
677                 d_privatized_histograms_wrapper,
678                 output_decode_op_wrapper,
679                 privatized_decode_op_wrapper,
680                 num_row_pixels,
681                 num_rows,
682                 row_stride_samples,
683                 tiles_per_row,
684                 tile_queue);
685 
686             // Check for failure to launch
687             if (CubDebug(error = cudaPeekAtLastError())) break;
688 
689             // Sync the stream if specified to flush runtime errors
690             if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
691 
692         }
693         while (0);
694 
695         return error;
696 
697     #endif // CUB_RUNTIME_ENABLED
698     }
699 
700 
701 
702     /**
703      * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit
704      */
705     CUB_RUNTIME_FUNCTION
DispatchRangecub::DipatchHistogram706     static cudaError_t DispatchRange(
707         void*               d_temp_storage,                                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
708         size_t&             temp_storage_bytes,                            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
709         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
710         CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
711         int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
712         LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
713         OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
714         OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
715         OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
716         cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
717         bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
718         Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
719     {
720         cudaError error = cudaSuccess;
721         do
722         {
723             // Get PTX version
724             int ptx_version;
725     #if (CUB_PTX_ARCH == 0)
726             if (CubDebug(error = PtxVersion(ptx_version))) break;
727     #else
728             ptx_version = CUB_PTX_ARCH;
729     #endif
730 
731             // Get kernel dispatch configurations
732             KernelConfig histogram_sweep_config;
733             if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
734                 break;
735 
736             // Use the search transform op for converting samples to privatized bins
737             typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
738 
739             // Use the pass-thru transform op for converting privatized bins to output bins
740             typedef PassThruTransform OutputDecodeOpT;
741 
742             PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
743             OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
744             int                     max_levels = num_output_levels[0];
745 
746             for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
747             {
748                 privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
749                 if (num_output_levels[channel] > max_levels)
750                     max_levels = num_output_levels[channel];
751             }
752             int max_num_output_bins = max_levels - 1;
753 
754             // Dispatch
755             if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
756             {
757                 // Too many bins to keep in shared memory.
758                 const int PRIVATIZED_SMEM_BINS = 0;
759 
760                 if (CubDebug(error = PrivatizedDispatch(
761                     d_temp_storage,
762                     temp_storage_bytes,
763                     d_samples,
764                     d_output_histograms,
765                     num_output_levels,
766                     privatized_decode_op,
767                     num_output_levels,
768                     output_decode_op,
769                     max_num_output_bins,
770                     num_row_pixels,
771                     num_rows,
772                     row_stride_samples,
773                     DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
774                     DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
775                     histogram_sweep_config,
776                     stream,
777                     debug_synchronous))) break;
778             }
779             else
780             {
781                 // Dispatch shared-privatized approach
782                 const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
783 
784                 if (CubDebug(error = PrivatizedDispatch(
785                     d_temp_storage,
786                     temp_storage_bytes,
787                     d_samples,
788                     d_output_histograms,
789                     num_output_levels,
790                     privatized_decode_op,
791                     num_output_levels,
792                     output_decode_op,
793                     max_num_output_bins,
794                     num_row_pixels,
795                     num_rows,
796                     row_stride_samples,
797                     DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
798                     DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
799                     histogram_sweep_config,
800                     stream,
801                     debug_synchronous))) break;
802             }
803 
804         } while (0);
805 
806         return error;
807     }
808 
809 
810     /**
811      * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
812      */
813     CUB_RUNTIME_FUNCTION
DispatchRangecub::DipatchHistogram814     static cudaError_t DispatchRange(
815         void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
816         size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
817         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
818         CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
819         int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
820         LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
821         OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
822         OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
823         OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
824         cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
825         bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
826         Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
827     {
828         cudaError error = cudaSuccess;
829         do
830         {
831             // Get PTX version
832             int ptx_version;
833     #if (CUB_PTX_ARCH == 0)
834             if (CubDebug(error = PtxVersion(ptx_version))) break;
835     #else
836             ptx_version = CUB_PTX_ARCH;
837     #endif
838 
839             // Get kernel dispatch configurations
840             KernelConfig histogram_sweep_config;
841             if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
842                 break;
843 
844             // Use the pass-thru transform op for converting samples to privatized bins
845             typedef PassThruTransform PrivatizedDecodeOpT;
846 
847             // Use the search transform op for converting privatized bins to output bins
848             typedef SearchTransform<LevelT*> OutputDecodeOpT;
849 
850             int                         num_privatized_levels[NUM_ACTIVE_CHANNELS];
851             PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
852             OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
853             int                         max_levels = num_output_levels[0];              // Maximum number of levels in any channel
854 
855             for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
856             {
857                 num_privatized_levels[channel] = 257;
858                 output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
859 
860                 if (num_output_levels[channel] > max_levels)
861                     max_levels = num_output_levels[channel];
862             }
863             int max_num_output_bins = max_levels - 1;
864 
865             const int PRIVATIZED_SMEM_BINS = 256;
866 
867             if (CubDebug(error = PrivatizedDispatch(
868                 d_temp_storage,
869                 temp_storage_bytes,
870                 d_samples,
871                 d_output_histograms,
872                 num_privatized_levels,
873                 privatized_decode_op,
874                 num_output_levels,
875                 output_decode_op,
876                 max_num_output_bins,
877                 num_row_pixels,
878                 num_rows,
879                 row_stride_samples,
880                 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
881                 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
882                 histogram_sweep_config,
883                 stream,
884                 debug_synchronous))) break;
885 
886         } while (0);
887 
888         return error;
889     }
890 
891 
892     /**
893      * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
894      */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t DispatchEven(
        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version (host-side call when compiling for host, compile-time constant otherwise)
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the scale transform op for converting samples to privatized bins
            typedef ScaleTransform PrivatizedDecodeOpT;

            // Use the pass-thru transform op for converting privatized bins to output bins
            // (evenly-spaced bins map directly to output bins for wide sample types)
            typedef PassThruTransform OutputDecodeOpT;

            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
            int                         max_levels = num_output_levels[0];      // Maximum number of levels in any channel

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                int     bins    = num_output_levels[channel] - 1;
                // NOTE(review): if LevelT is an integer type this division truncates —
                // presumably callers pass floating-point levels or evenly-divisible ranges; confirm
                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;

                privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);

                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
            {
                // Too many bins to keep in shared memory:
                // privatize in global memory instead (zero smem bins)
                const int PRIVATIZED_SMEM_BINS = 0;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
        }
        while (0);

        return error;
    }
1002 
1003 
1004     /**
1005      * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
1006      */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t DispatchEven(
        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version (host-side call when compiling for host, compile-time constant otherwise)
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the pass-thru transform op for converting samples to privatized bins
            // (each 8-bit sample value is its own privatized bin)
            typedef PassThruTransform PrivatizedDecodeOpT;

            // Use the scale transform op for converting privatized bins to output bins
            typedef ScaleTransform OutputDecodeOpT;

            int                     num_privatized_levels[NUM_ACTIVE_CHANNELS];
            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
            int                     max_levels = num_output_levels[0];      // Maximum number of levels in any channel

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                // 257 levels -> 256 privatized bins, one per possible 8-bit sample value
                num_privatized_levels[channel] = 257;

                int     bins    = num_output_levels[channel] - 1;
                // NOTE(review): if LevelT is an integer type this division truncates —
                // presumably callers pass floating-point levels or evenly-divisible ranges; confirm
                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
                output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);

                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            // 256 privatized bins always fit in shared memory
            const int PRIVATIZED_SMEM_BINS = 256;

            if (CubDebug(error = PrivatizedDispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_samples,
                d_output_histograms,
                num_privatized_levels,
                privatized_decode_op,
                num_output_levels,
                output_decode_op,
                max_num_output_bins,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                histogram_sweep_config,
                stream,
                debug_synchronous))) break;

        }
        while (0);

        return error;
    }
1089 
1090 };
1091 
1092 
1093 }               // CUB namespace
1094 CUB_NS_POSTFIX  // Optional outer namespace(s)
1095 
1096 
1097