1
2 /******************************************************************************
3 * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of the NVIDIA CORPORATION nor the
14 * names of its contributors may be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 ******************************************************************************/
29
30 /**
31 * \file
32 * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
33 */
34
35 #pragma once
36
37 #include <stdio.h>
38 #include <iterator>
39 #include <limits>
40
41 #include "../../agent/agent_histogram.cuh"
42 #include "../../util_debug.cuh"
43 #include "../../util_device.cuh"
44 #include "../../thread/thread_search.cuh"
45 #include "../../grid/grid_queue.cuh"
46 #include "../../config.cuh"
47
48 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
49
50 /// Optional outer namespace(s)
51 CUB_NS_PREFIX
52
53 /// CUB namespace
54 namespace cub {
55
56
57
58 /******************************************************************************
59 * Histogram kernel entry points
60 *****************************************************************************/
61
/**
 * Histogram initialization kernel entry point.
 *
 * Zeroes each channel's output histogram and resets the tile drain queue.
 * Launch with enough total threads to cover the maximum bin count of any
 * active channel (one thread per output bin).
 */
template <
    int         NUM_ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
    typename    CounterT,               ///< Integer type for counting sample occurrences per histogram bin
    typename    OffsetT>                ///< Signed integer type for global offsets
__global__ void DeviceHistogramInitKernel(
    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Histogram counter data having logical dimensions <tt>CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]</tt>
    GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
{
    // A single thread of the grid resets the drain queue for the subsequent sweep pass
    if ((blockIdx.x == 0) && (threadIdx.x == 0))
        tile_queue.ResetDrain();

    // One thread per output bin: zero that bin in every active channel
    int bin = (blockIdx.x * blockDim.x) + threadIdx.x;

    #pragma unroll
    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
    {
        if (bin < num_output_bins_wrapper.array[CHANNEL])
            d_output_histograms_wrapper.array[CHANNEL][bin] = 0;
    }
}
86
87
/**
 * Histogram privatized sweep kernel entry point (multi-block).  Computes
 * privatized histograms, one per thread block, which AgentHistogram
 * later folds into the final output histograms.
 */
template <
    typename    AgentHistogramPolicyT,  ///< Parameterized AgentHistogramPolicy tuning policy type
    int         PRIVATIZED_SMEM_BINS,   ///< Maximum number of histogram bins per channel (e.g., up to 256)
    int         NUM_CHANNELS,           ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
    int         NUM_ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,        ///< The input iterator type. \iterator.
    typename    CounterT,               ///< Integer type for counting sample occurrences per histogram bin
    typename    PrivatizedDecodeOpT,    ///< The transform operator type for determining privatized counter indices from samples, one for each channel
    typename    OutputDecodeOpT,        ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
    typename    OffsetT>                ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
__global__ void DeviceHistogramSweepKernel(
    SampleIteratorT                                             d_samples,                      ///< Input data to reduce
    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                      num_output_bins_wrapper,        ///< The number bins per final output histogram
    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                      num_privatized_bins_wrapper,    ///< The number bins per privatized histogram
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>                d_output_histograms_wrapper,    ///< Reference to final output histograms
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>                d_privatized_histograms_wrapper,///< Reference to privatized histograms
    ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>          output_decode_op_wrapper,       ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
    ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>      privatized_decode_op_wrapper,   ///< The transform operator for determining privatized counter indices from samples, one for each channel
    OffsetT                                                     num_row_pixels,                 ///< The number of multi-channel pixels per row in the region of interest
    OffsetT                                                     num_rows,                       ///< The number of rows in the region of interest
    OffsetT                                                     row_stride_samples,             ///< The number of samples between starts of consecutive rows in the region of interest
    int                                                         tiles_per_row,                  ///< Number of image tiles per row
    GridQueue<int>                                              tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
{
    // Specialize the block-cooperative histogram agent for this kernel's tuning policy
    typedef AgentHistogram<
            AgentHistogramPolicyT,
            PRIVATIZED_SMEM_BINS,
            NUM_CHANNELS,
            NUM_ACTIVE_CHANNELS,
            SampleIteratorT,
            CounterT,
            PrivatizedDecodeOpT,
            OutputDecodeOpT,
            OffsetT>
        AgentHistogramT;

    // Shared memory for AgentHistogram's block-wide state
    __shared__ typename AgentHistogramT::TempStorage temp_storage;

    AgentHistogramT block_agent(
        temp_storage,
        d_samples,
        num_output_bins_wrapper.array,
        num_privatized_bins_wrapper.array,
        d_output_histograms_wrapper.array,
        d_privatized_histograms_wrapper.array,
        output_decode_op_wrapper.array,
        privatized_decode_op_wrapper.array);

    // Zero this block's privatized bin counters
    block_agent.InitBinCounters();

    // Cooperatively consume input tiles (dynamically drained via tile_queue)
    block_agent.ConsumeTiles(
        num_row_pixels,
        num_rows,
        row_stride_samples,
        tiles_per_row,
        tile_queue);

    // Store output to global (if necessary)
    block_agent.StoreOutput();
}
157
158
159
160
161
162
163 /******************************************************************************
164 * Dispatch
165 ******************************************************************************/
166
167 /**
168 * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
169 */
170 template <
171 int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
172 int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
173 typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator
174 typename CounterT, ///< Integer type for counting sample occurrences per histogram bin
175 typename LevelT, ///< Type for specifying bin level boundaries
176 typename OffsetT> ///< Signed integer type for global offsets
177 struct DipatchHistogram
178 {
179 //---------------------------------------------------------------------
180 // Types and constants
181 //---------------------------------------------------------------------
182
    /// The sample value type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    enum
    {
        // Maximum number of bins per channel for which we will use a privatized smem strategy.
        // Channels with more output bins than this are dispatched with PRIVATIZED_SMEM_BINS = 0
        // and privatize into global temporary storage instead (see DispatchRange below).
        MAX_PRIVATIZED_SMEM_BINS = 256
    };
191
192
193 //---------------------------------------------------------------------
194 // Transform functors for converting samples to bin-ids
195 //---------------------------------------------------------------------
196
197 // Searches for bin given a list of bin-boundary levels
198 template <typename LevelIteratorT>
199 struct SearchTransform
200 {
201 LevelIteratorT d_levels; // Pointer to levels array
202 int num_output_levels; // Number of levels in array
203
204 // Initializer
Initcub::DipatchHistogram::SearchTransform205 __host__ __device__ __forceinline__ void Init(
206 LevelIteratorT d_levels, // Pointer to levels array
207 int num_output_levels) // Number of levels in array
208 {
209 this->d_levels = d_levels;
210 this->num_output_levels = num_output_levels;
211 }
212
213 // Method for converting samples to bin-ids
214 template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
BinSelectcub::DipatchHistogram::SearchTransform215 __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
216 {
217 /// Level iterator wrapper type
218 typedef typename If<IsPointer<LevelIteratorT>::VALUE,
219 CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
220 LevelIteratorT>::Type // Directly use the supplied input iterator type
221 WrappedLevelIteratorT;
222
223 WrappedLevelIteratorT wrapped_levels(d_levels);
224
225 int num_bins = num_output_levels - 1;
226 if (valid)
227 {
228 bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
229 if (bin >= num_bins)
230 bin = -1;
231 }
232 }
233 };
234
235
236 // Scales samples to evenly-spaced bins
237 struct ScaleTransform
238 {
239 int num_bins; // Number of levels in array
240 LevelT max; // Max sample level (exclusive)
241 LevelT min; // Min sample level (inclusive)
242 LevelT scale; // Bin scaling factor
243
244 // Initializer
245 template <typename _LevelT>
Initcub::DipatchHistogram::ScaleTransform246 __host__ __device__ __forceinline__ void Init(
247 int num_output_levels, // Number of levels in array
248 _LevelT max, // Max sample level (exclusive)
249 _LevelT min, // Min sample level (inclusive)
250 _LevelT scale) // Bin scaling factor
251 {
252 this->num_bins = num_output_levels - 1;
253 this->max = max;
254 this->min = min;
255 this->scale = scale;
256 }
257
258 // Initializer (float specialization)
Initcub::DipatchHistogram::ScaleTransform259 __host__ __device__ __forceinline__ void Init(
260 int num_output_levels, // Number of levels in array
261 float max, // Max sample level (exclusive)
262 float min, // Min sample level (inclusive)
263 float scale) // Bin scaling factor
264 {
265 this->num_bins = num_output_levels - 1;
266 this->max = max;
267 this->min = min;
268 this->scale = float(1.0) / scale;
269 }
270
271 // Initializer (double specialization)
Initcub::DipatchHistogram::ScaleTransform272 __host__ __device__ __forceinline__ void Init(
273 int num_output_levels, // Number of levels in array
274 double max, // Max sample level (exclusive)
275 double min, // Min sample level (inclusive)
276 double scale) // Bin scaling factor
277 {
278 this->num_bins = num_output_levels - 1;
279 this->max = max;
280 this->min = min;
281 this->scale = double(1.0) / scale;
282 }
283
284 // Method for converting samples to bin-ids
285 template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
BinSelectcub::DipatchHistogram::ScaleTransform286 __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
287 {
288 LevelT level_sample = (LevelT) sample;
289
290 if (valid && (level_sample >= min) && (level_sample < max))
291 bin = (int) ((level_sample - min) / scale);
292 }
293
294 // Method for converting samples to bin-ids (float specialization)
295 template <CacheLoadModifier LOAD_MODIFIER>
BinSelectcub::DipatchHistogram::ScaleTransform296 __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
297 {
298 LevelT level_sample = (LevelT) sample;
299
300 if (valid && (level_sample >= min) && (level_sample < max))
301 bin = (int) ((level_sample - min) * scale);
302 }
303
304 // Method for converting samples to bin-ids (double specialization)
305 template <CacheLoadModifier LOAD_MODIFIER>
BinSelectcub::DipatchHistogram::ScaleTransform306 __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
307 {
308 LevelT level_sample = (LevelT) sample;
309
310 if (valid && (level_sample >= min) && (level_sample < max))
311 bin = (int) ((level_sample - min) * scale);
312 }
313 };
314
315
316 // Pass-through bin transform operator
317 struct PassThruTransform
318 {
319 // Method for converting samples to bin-ids
320 template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
BinSelectcub::DipatchHistogram::PassThruTransform321 __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
322 {
323 if (valid)
324 bin = (int) sample;
325 }
326 };
327
328
329
330 //---------------------------------------------------------------------
331 // Tuning policies
332 //---------------------------------------------------------------------
333
    // Scales a nominal per-thread workload down by the sample width and the number
    // of active channels to arrive at a tuned ITEMS_PER_THREAD-style value
    template <int NOMINAL_ITEMS_PER_THREAD>
    struct TScale
    {
        enum
        {
            // Number of 4-byte words needed to hold one sample (rounded up)
            V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
            // Scaled per-thread item count (never less than 1)
            VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
        };
    };
343
    /// SM35 tuning policy
    struct Policy350
    {
        // HistogramSweepPolicy: 128 threads per block, TScale<8> pixels per thread,
        // direct block loads with the LDG cache modifier.  Remaining parameters
        // follow AgentHistogramPolicy's signature (see agent_histogram.cuh).
        typedef AgentHistogramPolicy<
                128,
                TScale<8>::VALUE,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                true,
                BLEND,
                true>
            HistogramSweepPolicy;
    };

    /// SM50 tuning policy
    struct Policy500
    {
        // HistogramSweepPolicy: 384 threads per block, TScale<16> pixels per thread,
        // direct block loads with the LDG cache modifier.  Remaining parameters
        // follow AgentHistogramPolicy's signature (see agent_histogram.cuh).
        typedef AgentHistogramPolicy<
                384,
                TScale<16>::VALUE,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                true,
                SMEM,
                false>
            HistogramSweepPolicy;
    };
373
374
375
    //---------------------------------------------------------------------
    // Tuning policies of current PTX compiler pass
    //---------------------------------------------------------------------

    // Select the tuning policy matching the architecture of the current
    // compiler pass (SM50+ devices use Policy500; everything else falls back
    // to Policy350)
#if (CUB_PTX_ARCH >= 500)
    typedef Policy500 PtxPolicy;

#else
    typedef Policy350 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
390
391
392 //---------------------------------------------------------------------
393 // Utilities
394 //---------------------------------------------------------------------
395
396 /**
397 * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
398 */
399 template <typename KernelConfig>
400 CUB_RUNTIME_FUNCTION __forceinline__
InitConfigscub::DipatchHistogram401 static cudaError_t InitConfigs(
402 int ptx_version,
403 KernelConfig &histogram_sweep_config)
404 {
405 cudaError_t result = cudaErrorNotSupported;
406 if (CUB_IS_DEVICE_CODE)
407 {
408 #if CUB_INCLUDE_DEVICE_CODE
409 // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
410 result = histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
411 #endif
412 }
413 else
414 {
415 #if CUB_INCLUDE_HOST_CODE
416 // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
417 if (ptx_version >= 500)
418 {
419 result = histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
420 }
421 else
422 {
423 result = histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
424 }
425 #endif
426 }
427 return result;
428 }
429
430
    /**
     * Kernel dispatch configuration: the subset of tuning-policy constants the
     * host-side dispatch logic needs for grid sizing
     */
    struct KernelConfig
    {
        int block_threads;      // Threads per block of the sweep kernel
        int pixels_per_thread;  // Pixels processed per thread per tile

        // Extract the dispatch-relevant constants from a tuning policy
        template <typename BlockPolicy>
        CUB_RUNTIME_FUNCTION __forceinline__
        cudaError_t Init()
        {
            block_threads = BlockPolicy::BLOCK_THREADS;
            pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD;

            return cudaSuccess;
        }
    };
449
450
451 //---------------------------------------------------------------------
452 // Dispatch entrypoints
453 //---------------------------------------------------------------------
454
455 /**
456 * Privatization-based dispatch routine
457 */
458 template <
459 typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
460 typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
461 typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel
462 typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel
463 CUB_RUNTIME_FUNCTION __forceinline__
PrivatizedDispatchcub::DipatchHistogram464 static cudaError_t PrivatizedDispatch(
465 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
466 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
467 SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
468 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
469 int num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
470 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel
471 int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
472 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel
473 int max_num_output_bins, ///< [in] Maximum number of output bins in any channel
474 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
475 OffsetT num_rows, ///< [in] The number of rows in the region of interest
476 OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
477 DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
478 DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
479 KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
480 cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
481 bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
482 {
483 #ifndef CUB_RUNTIME_ENABLED
484
485 // Kernel launch not supported from this device
486 return CubDebug(cudaErrorNotSupported);
487
488 #else
489
490 cudaError error = cudaSuccess;
491 do
492 {
493 // Get device ordinal
494 int device_ordinal;
495 if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
496
497 // Get SM count
498 int sm_count;
499 if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
500
501 // Get SM occupancy for histogram_sweep_kernel
502 int histogram_sweep_sm_occupancy;
503 if (CubDebug(error = MaxSmOccupancy(
504 histogram_sweep_sm_occupancy,
505 histogram_sweep_kernel,
506 histogram_sweep_config.block_threads))) break;
507
508 // Get device occupancy for histogram_sweep_kernel
509 int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
510
511 if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
512 {
513 // Treat as a single linear array of samples
514 num_row_pixels *= num_rows;
515 num_rows = 1;
516 row_stride_samples = num_row_pixels * NUM_CHANNELS;
517 }
518
519 // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
520 int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
521 int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
522 int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
523 int blocks_per_col = (blocks_per_row > 0) ?
524 int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
525 0;
526 int num_thread_blocks = blocks_per_row * blocks_per_col;
527
528 dim3 sweep_grid_dims;
529 sweep_grid_dims.x = (unsigned int) blocks_per_row;
530 sweep_grid_dims.y = (unsigned int) blocks_per_col;
531 sweep_grid_dims.z = 1;
532
533 // Temporary storage allocation requirements
534 const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
535 void* allocations[NUM_ALLOCATIONS] = {};
536 size_t allocation_sizes[NUM_ALLOCATIONS];
537
538 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
539 allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
540
541 allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
542
543 // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
544 if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
545 if (d_temp_storage == NULL)
546 {
547 // Return if the caller is simply requesting the size of the storage allocation
548 break;
549 }
550
551 // Construct the grid queue descriptor
552 GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
553
554 // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
555 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
556 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
557 d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
558
559 // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
560 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
561 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
562 d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
563
564 // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
565 ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
566 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
567 privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
568
569 // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
570 ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
571 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
572 output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
573
574 // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
575 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
576 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
577 num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
578
579 // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
580 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
581 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
582 num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
583
584 int histogram_init_block_threads = 256;
585 int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
586
587 // Log DeviceHistogramInitKernel configuration
588 if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
589 histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
590
591 // Invoke histogram_init_kernel
592 thrust::cuda_cub::launcher::triple_chevron(
593 histogram_init_grid_dims, histogram_init_block_threads, 0,
594 stream
595 ).doit(histogram_init_kernel,
596 num_output_bins_wrapper,
597 d_output_histograms_wrapper,
598 tile_queue);
599
600 // Return if empty problem
601 if ((blocks_per_row == 0) || (blocks_per_col == 0))
602 break;
603
604 // Log histogram_sweep_kernel configuration
605 if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
606 sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
607 histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
608
609 // Invoke histogram_sweep_kernel
610 thrust::cuda_cub::launcher::triple_chevron(
611 sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
612 ).doit(histogram_sweep_kernel,
613 d_samples,
614 num_output_bins_wrapper,
615 num_privatized_bins_wrapper,
616 d_output_histograms_wrapper,
617 d_privatized_histograms_wrapper,
618 output_decode_op_wrapper,
619 privatized_decode_op_wrapper,
620 num_row_pixels,
621 num_rows,
622 row_stride_samples,
623 tiles_per_row,
624 tile_queue);
625
626 // Check for failure to launch
627 if (CubDebug(error = cudaPeekAtLastError())) break;
628
629 // Sync the stream if specified to flush runtime errors
630 if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
631
632 }
633 while (0);
634
635 return error;
636
637 #endif // CUB_RUNTIME_ENABLED
638 }
639
640
641
    /**
     * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit.
     *
     * Converts samples to bins by binary-searching the caller-supplied level
     * boundaries (SearchTransform); privatized bin-ids already equal output
     * bin-ids, so the output decode is a pass-through.  Chooses between the
     * shared-memory and global-memory privatization strategies based on the
     * widest channel's bin count.
     */
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchRange(
        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version = 0;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the search transform op for converting samples to privatized bins
            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;

            // Use the pass-thru transform op for converting privatized bins to output bins
            typedef PassThruTransform OutputDecodeOpT;

            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
            // Track the widest channel's level count to size the init kernel and
            // choose the privatization strategy
            int max_levels = num_output_levels[0];

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            // Dispatch
            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
            {
                // Too many bins to keep in shared memory.  PRIVATIZED_SMEM_BINS = 0
                // selects the sweep kernel variant that privatizes in global memory.
                const int PRIVATIZED_SMEM_BINS = 0;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }

        } while (0);

        return error;
    }
744
745
746 /**
747 * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
748 */
749 CUB_RUNTIME_FUNCTION
DispatchRangecub::DipatchHistogram750 static cudaError_t DispatchRange(
751 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
752 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
753 SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
754 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
755 int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
756 LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
757 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
758 OffsetT num_rows, ///< [in] The number of rows in the region of interest
759 OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
760 cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
761 bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
762 Int2Type<true> /*is_byte_sample*/) ///< [in] Marker type indicating whether or not SampleT is a 8b type
763 {
764 cudaError error = cudaSuccess;
765 do
766 {
767 // Get PTX version
768 int ptx_version = 0;
769 if (CubDebug(error = PtxVersion(ptx_version))) break;
770
771 // Get kernel dispatch configurations
772 KernelConfig histogram_sweep_config;
773 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
774 break;
775
776 // Use the pass-thru transform op for converting samples to privatized bins
777 typedef PassThruTransform PrivatizedDecodeOpT;
778
779 // Use the search transform op for converting privatized bins to output bins
780 typedef SearchTransform<LevelT*> OutputDecodeOpT;
781
782 int num_privatized_levels[NUM_ACTIVE_CHANNELS];
783 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
784 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
785 int max_levels = num_output_levels[0]; // Maximum number of levels in any channel
786
787 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
788 {
789 num_privatized_levels[channel] = 257;
790 output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
791
792 if (num_output_levels[channel] > max_levels)
793 max_levels = num_output_levels[channel];
794 }
795 int max_num_output_bins = max_levels - 1;
796
797 const int PRIVATIZED_SMEM_BINS = 256;
798
799 if (CubDebug(error = PrivatizedDispatch(
800 d_temp_storage,
801 temp_storage_bytes,
802 d_samples,
803 d_output_histograms,
804 num_privatized_levels,
805 privatized_decode_op,
806 num_output_levels,
807 output_decode_op,
808 max_num_output_bins,
809 num_row_pixels,
810 num_rows,
811 row_stride_samples,
812 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
813 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
814 histogram_sweep_config,
815 stream,
816 debug_synchronous))) break;
817
818 } while (0);
819
820 return error;
821 }
822
823
824 /**
825 * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
826 */
827 CUB_RUNTIME_FUNCTION __forceinline__
DispatchEvencub::DipatchHistogram828 static cudaError_t DispatchEven(
829 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
830 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
831 SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
832 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
833 int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
834 LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
835 LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
836 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
837 OffsetT num_rows, ///< [in] The number of rows in the region of interest
838 OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
839 cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
840 bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
841 Int2Type<false> /*is_byte_sample*/) ///< [in] Marker type indicating whether or not SampleT is a 8b type
842 {
843 cudaError error = cudaSuccess;
844 do
845 {
846 // Get PTX version
847 int ptx_version = 0;
848 if (CubDebug(error = PtxVersion(ptx_version))) break;
849
850 // Get kernel dispatch configurations
851 KernelConfig histogram_sweep_config;
852 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
853 break;
854
855 // Use the scale transform op for converting samples to privatized bins
856 typedef ScaleTransform PrivatizedDecodeOpT;
857
858 // Use the pass-thru transform op for converting privatized bins to output bins
859 typedef PassThruTransform OutputDecodeOpT;
860
861 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
862 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
863 int max_levels = num_output_levels[0];
864
865 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
866 {
867 int bins = num_output_levels[channel] - 1;
868 LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
869
870 privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
871
872 if (num_output_levels[channel] > max_levels)
873 max_levels = num_output_levels[channel];
874 }
875 int max_num_output_bins = max_levels - 1;
876
877 if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
878 {
879 // Dispatch shared-privatized approach
880 const int PRIVATIZED_SMEM_BINS = 0;
881
882 if (CubDebug(error = PrivatizedDispatch(
883 d_temp_storage,
884 temp_storage_bytes,
885 d_samples,
886 d_output_histograms,
887 num_output_levels,
888 privatized_decode_op,
889 num_output_levels,
890 output_decode_op,
891 max_num_output_bins,
892 num_row_pixels,
893 num_rows,
894 row_stride_samples,
895 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
896 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
897 histogram_sweep_config,
898 stream,
899 debug_synchronous))) break;
900 }
901 else
902 {
903 // Dispatch shared-privatized approach
904 const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
905
906 if (CubDebug(error = PrivatizedDispatch(
907 d_temp_storage,
908 temp_storage_bytes,
909 d_samples,
910 d_output_histograms,
911 num_output_levels,
912 privatized_decode_op,
913 num_output_levels,
914 output_decode_op,
915 max_num_output_bins,
916 num_row_pixels,
917 num_rows,
918 row_stride_samples,
919 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
920 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
921 histogram_sweep_config,
922 stream,
923 debug_synchronous))) break;
924 }
925 }
926 while (0);
927
928 return error;
929 }
930
931
932 /**
933 * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
934 */
935 CUB_RUNTIME_FUNCTION __forceinline__
DispatchEvencub::DipatchHistogram936 static cudaError_t DispatchEven(
937 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
938 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
939 SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
940 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
941 int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
942 LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
943 LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
944 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
945 OffsetT num_rows, ///< [in] The number of rows in the region of interest
946 OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
947 cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
948 bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
949 Int2Type<true> /*is_byte_sample*/) ///< [in] Marker type indicating whether or not SampleT is a 8b type
950 {
951 cudaError error = cudaSuccess;
952 do
953 {
954 // Get PTX version
955 int ptx_version = 0;
956 if (CubDebug(error = PtxVersion(ptx_version))) break;
957
958 // Get kernel dispatch configurations
959 KernelConfig histogram_sweep_config;
960 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
961 break;
962
963 // Use the pass-thru transform op for converting samples to privatized bins
964 typedef PassThruTransform PrivatizedDecodeOpT;
965
966 // Use the scale transform op for converting privatized bins to output bins
967 typedef ScaleTransform OutputDecodeOpT;
968
969 int num_privatized_levels[NUM_ACTIVE_CHANNELS];
970 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
971 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
972 int max_levels = num_output_levels[0];
973
974 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
975 {
976 num_privatized_levels[channel] = 257;
977
978 int bins = num_output_levels[channel] - 1;
979 LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
980 output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
981
982 if (num_output_levels[channel] > max_levels)
983 max_levels = num_output_levels[channel];
984 }
985 int max_num_output_bins = max_levels - 1;
986
987 const int PRIVATIZED_SMEM_BINS = 256;
988
989 if (CubDebug(error = PrivatizedDispatch(
990 d_temp_storage,
991 temp_storage_bytes,
992 d_samples,
993 d_output_histograms,
994 num_privatized_levels,
995 privatized_decode_op,
996 num_output_levels,
997 output_decode_op,
998 max_num_output_bins,
999 num_row_pixels,
1000 num_rows,
1001 row_stride_samples,
1002 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
1003 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
1004 histogram_sweep_config,
1005 stream,
1006 debug_synchronous))) break;
1007
1008 }
1009 while (0);
1010
1011 return error;
1012 }
1013
1014 };
1015
1016
1017 } // CUB namespace
1018 CUB_NS_POSTFIX // Optional outer namespace(s)
1019
1020
1021