1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /******************************************************************************
30 * Test of DeviceHistogram utilities
31 ******************************************************************************/
32
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35
36 #include <stdio.h>
37 #include <limits>
38 #include <algorithm>
39 #include <typeinfo>
40
41 #if defined(QUICK_TEST) || defined(QUICKER_TEST)
42 #include <npp.h>
43 #endif
44
45 #include <cub/util_allocator.cuh>
46 #include <cub/iterator/constant_input_iterator.cuh>
47 #include <cub/device/device_histogram.cuh>
48
49 #include "test_util.h"
50
51 using namespace cub;
52
53
54 //---------------------------------------------------------------------
55 // Globals, constants and typedefs
56 //---------------------------------------------------------------------
57
58
59 // Dispatch types
60 enum Backend
61 {
62 CUB, // CUB method
63 NPP, // NPP method
64 CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
65 };
66
67
68 bool g_verbose_input = false;
69 bool g_verbose = false;
70 int g_timing_iterations = 0;
71 int g_repeat = 0;
72 CachingDeviceAllocator g_allocator(true);
73
74
75
76
77 //---------------------------------------------------------------------
78 // Dispatch to NPP histogram
79 //---------------------------------------------------------------------
80
81 #if defined(QUICK_TEST) || defined(QUICKER_TEST)
82
83 /**
84 * Dispatch to single-channel 8b NPP histo-even
85 */
86 template <typename CounterT, typename LevelT, typename OffsetT>
87 //CUB_RUNTIME_FUNCTION __forceinline__
DispatchEven(Int2Type<1> num_channels,Int2Type<1> num_active_channels,Int2Type<NPP> dispatch_to,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t & temp_storage_bytes,unsigned char * d_samples,CounterT * d_histogram[1],int num_levels[1],LevelT lower_level[1],LevelT upper_level[1],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,cudaStream_t stream,bool debug_synchronous)88 cudaError_t DispatchEven(
89 Int2Type<1> num_channels,
90 Int2Type<1> num_active_channels,
91 Int2Type<NPP> dispatch_to,
92 int timing_timing_iterations,
93 size_t *d_temp_storage_bytes,
94 cudaError_t *d_cdp_error,
95
96 void* d_temp_storage,
97 size_t& temp_storage_bytes,
98 unsigned char *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
99 CounterT *d_histogram[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
100 int num_levels[1], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
101 LevelT lower_level[1], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
102 LevelT upper_level[1], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
103 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
104 OffsetT num_rows, ///< [in] The number of rows in the region of interest
105 OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
106 cudaStream_t stream,
107 bool debug_synchronous)
108 {
109 typedef unsigned char SampleT;
110
111 cudaError_t error = cudaSuccess;
112 NppiSize oSizeROI = {
113 num_row_pixels,
114 num_rows
115 };
116
117 if (d_temp_storage_bytes == NULL)
118 {
119 int nDeviceBufferSize;
120 nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0] ,&nDeviceBufferSize);
121 temp_storage_bytes = nDeviceBufferSize;
122 }
123 else
124 {
125 for (int i = 0; i < timing_timing_iterations; ++i)
126 {
127 // compute the histogram
128 nppiHistogramEven_8u_C1R(
129 d_samples,
130 row_stride_bytes,
131 oSizeROI,
132 d_histogram[0],
133 num_levels[0],
134 lower_level[0],
135 upper_level[0],
136 (Npp8u*) d_temp_storage);
137 }
138 }
139
140 return error;
141 }
142
143
144 /**
145 * Dispatch to 3/4 8b NPP histo-even
146 */
147 template <typename CounterT, typename LevelT, typename OffsetT>
148 //CUB_RUNTIME_FUNCTION __forceinline__
DispatchEven(Int2Type<4> num_channels,Int2Type<3> num_active_channels,Int2Type<NPP> dispatch_to,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t & temp_storage_bytes,unsigned char * d_samples,CounterT * d_histogram[3],int num_levels[3],LevelT lower_level[3],LevelT upper_level[3],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,cudaStream_t stream,bool debug_synchronous)149 cudaError_t DispatchEven(
150 Int2Type<4> num_channels,
151 Int2Type<3> num_active_channels,
152 Int2Type<NPP> dispatch_to,
153 int timing_timing_iterations,
154 size_t *d_temp_storage_bytes,
155 cudaError_t *d_cdp_error,
156
157 void* d_temp_storage,
158 size_t& temp_storage_bytes,
159 unsigned char *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
160 CounterT *d_histogram[3], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
161 int num_levels[3], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
162 LevelT lower_level[3], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
163 LevelT upper_level[3], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
164 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
165 OffsetT num_rows, ///< [in] The number of rows in the region of interest
166 OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
167 cudaStream_t stream,
168 bool debug_synchronous)
169 {
170 typedef unsigned char SampleT;
171
172 cudaError_t error = cudaSuccess;
173 NppiSize oSizeROI = {
174 num_row_pixels,
175 num_rows
176 };
177
178 if (d_temp_storage_bytes == NULL)
179 {
180 int nDeviceBufferSize;
181 nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels ,&nDeviceBufferSize);
182 temp_storage_bytes = nDeviceBufferSize;
183 }
184 else
185 {
186 for (int i = 0; i < timing_timing_iterations; ++i)
187 {
188 // compute the histogram
189 nppiHistogramEven_8u_AC4R(
190 d_samples,
191 row_stride_bytes,
192 oSizeROI,
193 d_histogram,
194 num_levels,
195 lower_level,
196 upper_level,
197 (Npp8u*) d_temp_storage);
198 }
199 }
200
201 return error;
202 }
203
204
205 #endif // #if defined(QUICK_TEST) || defined(QUICKER_TEST)
206
207
208 //---------------------------------------------------------------------
209 // Dispatch to different DeviceHistogram entrypoints
210 //---------------------------------------------------------------------
211
212 template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, int BACKEND>
213 struct Dispatch;
214
215 template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS>
216 struct Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, CUB>
217 {
218 /**
219 * Dispatch to CUB multi histogram-range entrypoint
220 */
221 template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
222 //CUB_RUNTIME_FUNCTION __forceinline__
RangeDispatch223 static cudaError_t Range(
224 int timing_timing_iterations,
225 size_t */*d_temp_storage_bytes*/,
226 cudaError_t */*d_cdp_error*/,
227
228 void* d_temp_storage,
229 size_t& temp_storage_bytes,
230 SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
231 CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
232 int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
233 LevelT *(&d_levels)[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
234 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
235 OffsetT num_rows, ///< [in] The number of rows in the region of interest
236 OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
237 cudaStream_t stream,
238 bool debug_synchronous)
239 {
240 cudaError_t error = cudaSuccess;
241
242 for (int i = 0; i < timing_timing_iterations; ++i)
243 {
244 error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
245 d_temp_storage,
246 temp_storage_bytes,
247 d_samples,
248 d_histogram,
249 num_levels,
250 d_levels,
251 num_row_pixels,
252 num_rows,
253 row_stride_bytes,
254 stream,
255 debug_synchronous);
256 }
257 return error;
258 }
259
260
261 /**
262 * Dispatch to CUB multi histogram-even entrypoint
263 */
264 template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
265 //CUB_RUNTIME_FUNCTION __forceinline__
EvenDispatch266 static cudaError_t Even(
267 int timing_timing_iterations,
268 size_t */*d_temp_storage_bytes*/,
269 cudaError_t */*d_cdp_error*/,
270
271 void* d_temp_storage,
272 size_t& temp_storage_bytes,
273 SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
274 CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
275 int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
276 LevelT *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
277 LevelT *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
278 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
279 OffsetT num_rows, ///< [in] The number of rows in the region of interest
280 OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
281 cudaStream_t stream,
282 bool debug_synchronous)
283 {
284 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
285
286 cudaError_t error = cudaSuccess;
287 for (int i = 0; i < timing_timing_iterations; ++i)
288 {
289 error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
290 d_temp_storage,
291 temp_storage_bytes,
292 d_samples,
293 d_histogram,
294 num_levels,
295 lower_level,
296 upper_level,
297 num_row_pixels,
298 num_rows,
299 row_stride_bytes,
300 stream,
301 debug_synchronous);
302 }
303 return error;
304 }
305
306 };
307
308
309 template <>
310 struct Dispatch<1, 1, CUB>
311 {
312
313 /**
314 * Dispatch to CUB single histogram-range entrypoint
315 */
316 template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
317 //CUB_RUNTIME_FUNCTION __forceinline__
RangeDispatch318 static cudaError_t Range(
319 int timing_timing_iterations,
320 size_t */*d_temp_storage_bytes*/,
321 cudaError_t */*d_cdp_error*/,
322
323 void* d_temp_storage,
324 size_t& temp_storage_bytes,
325 SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
326 CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
327 int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
328 LevelT (&d_levels)[1], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
329 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
330 OffsetT num_rows, ///< [in] The number of rows in the region of interest
331 OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
332 cudaStream_t stream,
333 bool debug_synchronous)
334 {
335 cudaError_t error = cudaSuccess;
336 for (int i = 0; i < timing_timing_iterations; ++i)
337 {
338 error = DeviceHistogram::HistogramRange(
339 d_temp_storage,
340 temp_storage_bytes,
341 d_samples,
342 d_histogram[0],
343 num_levels[0],
344 d_levels[0],
345 num_row_pixels,
346 num_rows,
347 row_stride_bytes,
348 stream,
349 debug_synchronous);
350 }
351 return error;
352 }
353
354
355 /**
356 * Dispatch to CUB single histogram-even entrypoint
357 */
358 template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
359 //CUB_RUNTIME_FUNCTION __forceinline__
EvenDispatch360 static cudaError_t Even(
361 int timing_timing_iterations,
362 size_t */*d_temp_storage_bytes*/,
363 cudaError_t */*d_cdp_error*/,
364
365 void* d_temp_storage,
366 size_t& temp_storage_bytes,
367 SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
368 CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
369 int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
370 LevelT *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
371 LevelT *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
372 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
373 OffsetT num_rows, ///< [in] The number of rows in the region of interest
374 OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
375 cudaStream_t stream,
376 bool debug_synchronous)
377 {
378 cudaError_t error = cudaSuccess;
379 for (int i = 0; i < timing_timing_iterations; ++i)
380 {
381 error = DeviceHistogram::HistogramEven(
382 d_temp_storage,
383 temp_storage_bytes,
384 d_samples,
385 d_histogram[0],
386 num_levels[0],
387 lower_level[0],
388 upper_level[0],
389 num_row_pixels,
390 num_rows,
391 row_stride_bytes,
392 stream,
393 debug_synchronous);
394 }
395 return error;
396 }
397
398 };
399
400
401
402 //---------------------------------------------------------------------
403 // CUDA nested-parallelism test kernel
404 //---------------------------------------------------------------------
405
406 /**
407 * Simple wrapper kernel to invoke DeviceHistogram
408 * /
409 template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
410 __global__ void CnpDispatchKernel(
411 Int2Type<ALGORITHM> algorithm,
412 int timing_timing_iterations,
413 size_t *d_temp_storage_bytes,
414 cudaError_t *d_cdp_error,
415
416 void* d_temp_storage,
417 size_t temp_storage_bytes,
418 SampleT *d_samples,
419 SampleIteratorT d_sample_itr,
420 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
421 int num_samples,
422 bool debug_synchronous)
423 {
424 #ifndef CUB_CDP
425 *d_cdp_error = cudaErrorNotSupported;
426 #else
427 *d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_samples, 0, debug_synchronous);
428 *d_temp_storage_bytes = temp_storage_bytes;
429 #endif
430 }
431
432
433 / **
434 * Dispatch to CDP kernel
435 * /
436 template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
437 cudaError_t Dispatch(
438 Int2Type<ALGORITHM> algorithm,
439 Int2Type<true> use_cdp,
440 int timing_timing_iterations,
441 size_t *d_temp_storage_bytes,
442 cudaError_t *d_cdp_error,
443
444 void* d_temp_storage,
445 size_t& temp_storage_bytes,
446 SampleT *d_samples,
447 SampleIteratorT d_sample_itr,
448 CounterT *d_histograms[NUM_ACTIVE_CHANNELS],
449 int num_samples,
450 cudaStream_t stream,
451 bool debug_synchronous)
452 {
453 // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
454 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
455 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
456 d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
457
458 // Invoke kernel to invoke device-side dispatch
459 CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo_wrapper, num_samples, debug_synchronous);
460
461 // Copy out temp_storage_bytes
462 CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
463
464 // Copy out error
465 cudaError_t retval;
466 CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
467 return retval;
468 }
469 */
470
471
472 //---------------------------------------------------------------------
473 // Test generation
474 //---------------------------------------------------------------------
475
476 // Searches for bin given a list of bin-boundary levels
477 template <typename LevelT>
478 struct SearchTransform
479 {
480 LevelT *levels; // Pointer to levels array
481 int num_levels; // Number of levels in array
482
483 // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
484 template <typename SampleT>
operator ()SearchTransform485 int operator()(SampleT sample)
486 {
487 int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1);
488 if (bin < 0)
489 {
490 // Sample out of range
491 return num_levels;
492 }
493 return bin;
494 }
495 };
496
497
498 // Scales samples to evenly-spaced bins
499 template <typename LevelT>
500 struct ScaleTransform
501 {
502 int num_levels; // Number of levels in array
503 LevelT max; // Max sample level (exclusive)
504 LevelT min; // Min sample level (inclusive)
505 LevelT scale; // Bin scaling factor
506
InitScaleTransform507 void Init(
508 int num_levels, // Number of levels in array
509 LevelT max, // Max sample level (exclusive)
510 LevelT min, // Min sample level (inclusive)
511 LevelT scale) // Bin scaling factor
512 {
513 this->num_levels = num_levels;
514 this->max = max;
515 this->min = min;
516 this->scale = scale;
517 }
518
519 // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
520 template <typename SampleT>
operator ()ScaleTransform521 int operator()(SampleT sample)
522 {
523 if ((sample < min) || (sample >= max))
524 {
525 // Sample out of range
526 return num_levels;
527 }
528
529 return (int) ((((LevelT) sample) - min) / scale);
530 }
531 };
532
533 // Scales samples to evenly-spaced bins
534 template <>
535 struct ScaleTransform<float>
536 {
537 int num_levels; // Number of levels in array
538 float max; // Max sample level (exclusive)
539 float min; // Min sample level (inclusive)
540 float scale; // Bin scaling factor
541
InitScaleTransform542 void Init(
543 int num_levels, // Number of levels in array
544 float max, // Max sample level (exclusive)
545 float min, // Min sample level (inclusive)
546 float scale) // Bin scaling factor
547 {
548 this->num_levels = num_levels;
549 this->max = max;
550 this->min = min;
551 this->scale = 1.0f / scale;
552 }
553
554 // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
555 template <typename SampleT>
operator ()ScaleTransform556 int operator()(SampleT sample)
557 {
558 if ((sample < min) || (sample >= max))
559 {
560 // Sample out of range
561 return num_levels;
562 }
563
564 return (int) ((((float) sample) - min) * scale);
565 }
566 };
567
568
569 /**
570 * Generate sample
571 */
572 template <typename T, typename LevelT>
Sample(T & datum,LevelT max_level,int entropy_reduction)573 void Sample(T &datum, LevelT max_level, int entropy_reduction)
574 {
575 unsigned int max = (unsigned int) -1;
576 unsigned int bits;
577 RandomBits(bits, entropy_reduction);
578 float fraction = (float(bits) / max);
579
580 datum = (T) (fraction * max_level);
581 }
582
583
584 /**
585 * Initialize histogram samples
586 */
587 template <
588 int NUM_CHANNELS,
589 int NUM_ACTIVE_CHANNELS,
590 typename LevelT,
591 typename SampleT,
592 typename OffsetT>
InitializeSamples(LevelT max_level,int entropy_reduction,SampleT * h_samples,OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)593 void InitializeSamples(
594 LevelT max_level,
595 int entropy_reduction,
596 SampleT *h_samples,
597 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
598 OffsetT num_rows, ///< [in] The number of rows in the region of interest
599 OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
600 {
601 // Initialize samples
602 for (OffsetT row = 0; row < num_rows; ++row)
603 {
604 for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
605 {
606 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
607 {
608 // Sample offset
609 OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
610
611 // Init sample value
612 Sample(h_samples[offset], max_level, entropy_reduction);
613 if (g_verbose_input)
614 {
615 if (channel > 0) printf(", ");
616 std::cout << CoutCast(h_samples[offset]);
617 }
618 }
619 }
620 }
621 }
622
623
624 /**
625 * Initialize histogram solutions
626 */
627 template <
628 int NUM_CHANNELS,
629 int NUM_ACTIVE_CHANNELS,
630 typename CounterT,
631 typename SampleIteratorT,
632 typename TransformOp,
633 typename OffsetT>
InitializeBins(SampleIteratorT h_samples,int num_levels[NUM_ACTIVE_CHANNELS],TransformOp transform_op[NUM_ACTIVE_CHANNELS],CounterT * h_histogram[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)634 void InitializeBins(
635 SampleIteratorT h_samples,
636 int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
637 TransformOp transform_op[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
638 CounterT *h_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
639 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
640 OffsetT num_rows, ///< [in] The number of rows in the region of interest
641 OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
642 {
643 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
644
645 // Init bins
646 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
647 {
648 for (int bin = 0; bin < num_levels[CHANNEL] - 1; ++bin)
649 {
650 h_histogram[CHANNEL][bin] = 0;
651 }
652 }
653
654 // Initialize samples
655 if (g_verbose_input) printf("Samples: \n");
656 for (OffsetT row = 0; row < num_rows; ++row)
657 {
658 for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
659 {
660 if (g_verbose_input) printf("[");
661 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
662 {
663 // Sample offset
664 OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
665
666 // Update sample bin
667 int bin = transform_op[channel](h_samples[offset]);
668 if (g_verbose_input) printf(" (%d)", bin); fflush(stdout);
669 if ((bin >= 0) && (bin < num_levels[channel] - 1))
670 {
671 // valid bin
672 h_histogram[channel][bin]++;
673 }
674 }
675 if (g_verbose_input) printf("]");
676 }
677 if (g_verbose_input) printf("\n\n");
678 }
679 }
680
681
682
683 /**
684 * Test histogram-even
685 */
686 template <
687 Backend BACKEND,
688 int NUM_CHANNELS,
689 int NUM_ACTIVE_CHANNELS,
690 typename SampleT,
691 typename CounterT,
692 typename LevelT,
693 typename OffsetT,
694 typename SampleIteratorT>
TestEven(LevelT max_level,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT lower_level[NUM_ACTIVE_CHANNELS],LevelT upper_level[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,SampleIteratorT h_samples,SampleIteratorT d_samples)695 void TestEven(
696 LevelT max_level,
697 int entropy_reduction,
698 int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
699 LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
700 LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
701 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
702 OffsetT num_rows, ///< [in] The number of rows in the region of interest
703 OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
704 SampleIteratorT h_samples,
705 SampleIteratorT d_samples)
706 {
707 OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
708
709 printf("\n----------------------------\n");
710 printf("%s cub::DeviceHistogramEven (%s) %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
711 (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
712 (IsPointer<SampleIteratorT>::VALUE) ? "pointer" : "iterator",
713 (int) (num_row_pixels * num_rows),
714 (int) num_rows,
715 (int) num_row_pixels,
716 (int) row_stride_bytes,
717 (int) total_samples,
718 (int) sizeof(SampleT),
719 typeid(SampleT).name(),
720 entropy_reduction,
721 typeid(CounterT).name(),
722 NUM_ACTIVE_CHANNELS,
723 NUM_CHANNELS);
724 std::cout << CoutCast(max_level) << "\n";
725 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
726 std::cout << "\n\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins [" << lower_level[channel] << ", " << upper_level[channel] << ")\n";
727 fflush(stdout);
728
729 // Allocate and initialize host and device data
730
731 typedef SampleT Foo; // rename type to quelch gcc warnings (bug?)
732 CounterT* h_histogram[NUM_ACTIVE_CHANNELS];
733 ScaleTransform<LevelT> transform_op[NUM_ACTIVE_CHANNELS];
734
735 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
736 {
737 int bins = num_levels[channel] - 1;
738 h_histogram[channel] = new CounterT[bins];
739
740 transform_op[channel].Init(
741 num_levels[channel],
742 upper_level[channel],
743 lower_level[channel],
744 ((upper_level[channel] - lower_level[channel]) / bins));
745 }
746
747 InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
748 h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
749
750 // Allocate and initialize device data
751
752 CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
753 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
754 {
755 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
756 CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
757 }
758
759 // Allocate CDP device arrays
760 size_t *d_temp_storage_bytes = NULL;
761 cudaError_t *d_cdp_error = NULL;
762 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
763 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
764
765 // Allocate temporary storage
766 void *d_temp_storage = NULL;
767 size_t temp_storage_bytes = 0;
768
769 Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
770 1, d_temp_storage_bytes, d_cdp_error,
771 d_temp_storage, temp_storage_bytes,
772 d_samples, d_histogram, num_levels, lower_level, upper_level,
773 num_row_pixels, num_rows, row_stride_bytes,
774 0, true);
775
776 // Allocate temporary storage with "canary" zones
777 int canary_bytes = 256;
778 char canary_token = 8;
779 char* canary_zone = new char[canary_bytes];
780
781 memset(canary_zone, canary_token, canary_bytes);
782 CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
783 CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
784
785 // Run warmup/correctness iteration
786 Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
787 1, d_temp_storage_bytes, d_cdp_error,
788 ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
789 d_samples, d_histogram, num_levels, lower_level, upper_level,
790 num_row_pixels, num_rows, row_stride_bytes,
791 0, true);
792
793 // Check canary zones
794 int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
795 AssertEquals(0, error);
796 error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
797 AssertEquals(0, error);
798
799 // Flush any stdout/stderr
800 CubDebugExit(cudaPeekAtLastError());
801 CubDebugExit(cudaDeviceSynchronize());
802 fflush(stdout);
803 fflush(stderr);
804
805 // Check for correctness (and display results, if specified)
806 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
807 {
808 int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
809 printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
810 error |= channel_error;
811 }
812
813 // Performance
814 GpuTimer gpu_timer;
815 gpu_timer.Start();
816
817 Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
818 g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
819 ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
820 d_samples, d_histogram, num_levels, lower_level, upper_level,
821 num_row_pixels, num_rows, row_stride_bytes,
822 0, false);
823
824 gpu_timer.Stop();
825 float elapsed_millis = gpu_timer.ElapsedMillis();
826
827 // Display performance
828 if (g_timing_iterations > 0)
829 {
830 float avg_millis = elapsed_millis / g_timing_iterations;
831 float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
832 float giga_bandwidth = giga_rate * sizeof(SampleT);
833 printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
834 avg_millis,
835 giga_rate,
836 giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
837 giga_rate / NUM_CHANNELS,
838 giga_bandwidth);
839 }
840
841 printf("\n\n");
842
843 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
844 {
845 if (h_histogram[channel])
846 delete[] h_histogram[channel];
847
848 if (d_histogram[channel])
849 CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
850 }
851
852 if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
853 if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
854 if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
855
856 // Correctness asserts
857 AssertEquals(0, error);
858 }
859
860
861 /**
862 * Test histogram-even (native pointer input)
863 */
864 template <
865 Backend BACKEND,
866 int NUM_CHANNELS,
867 int NUM_ACTIVE_CHANNELS,
868 typename SampleT,
869 typename CounterT,
870 typename LevelT,
871 typename OffsetT>
TestEvenNative(LevelT max_level,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT lower_level[NUM_ACTIVE_CHANNELS],LevelT upper_level[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)872 void TestEvenNative(
873 LevelT max_level,
874 int entropy_reduction,
875 int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
876 LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
877 LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
878 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
879 OffsetT num_rows, ///< [in] The number of rows in the region of interest
880 OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
881 {
882 OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
883
884 // Allocate and initialize host sample data
885 typedef SampleT Foo; // rename type to quelch gcc warnings (bug?)
886 SampleT* h_samples = new Foo[total_samples];
887
888 InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
889 max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
890
891 // Allocate and initialize device data
892 SampleT* d_samples = NULL;
893 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
894 CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
895
896 TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
897 max_level, entropy_reduction, num_levels, lower_level, upper_level,
898 num_row_pixels, num_rows, row_stride_bytes,
899 h_samples, d_samples);
900
901 // Cleanup
902 if (h_samples) delete[] h_samples;
903 if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
904 }
905
906
907 /**
908 * Test histogram-even (native pointer input)
909 */
910 template <
911 Backend BACKEND,
912 int NUM_CHANNELS,
913 int NUM_ACTIVE_CHANNELS,
914 typename SampleT,
915 typename CounterT,
916 typename LevelT,
917 typename OffsetT>
TestEvenIterator(LevelT max_level,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT lower_level[NUM_ACTIVE_CHANNELS],LevelT upper_level[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)918 void TestEvenIterator(
919 LevelT max_level,
920 int entropy_reduction,
921 int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
922 LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
923 LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
924 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
925 OffsetT num_rows, ///< [in] The number of rows in the region of interest
926 OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
927 {
928 SampleT sample = (SampleT) lower_level[0];
929 ConstantInputIterator<SampleT> sample_itr(sample);
930
931 TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
932 max_level, entropy_reduction, num_levels, lower_level, upper_level,
933 num_row_pixels, num_rows, row_stride_bytes,
934 sample_itr, sample_itr);
935
936 }
937
938
939 /**
940 * Test histogram-range
941 */
942 template <
943 Backend BACKEND,
944 int NUM_CHANNELS,
945 int NUM_ACTIVE_CHANNELS,
946 typename SampleT,
947 typename CounterT,
948 typename LevelT,
949 typename OffsetT>
TestRange(LevelT max_level,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT * levels[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)950 void TestRange(
951 LevelT max_level,
952 int entropy_reduction,
953 int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
954 LevelT* levels[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
955 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
956 OffsetT num_rows, ///< [in] The number of rows in the region of interest
957 OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
958 {
959 OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
960
961 printf("\n----------------------------\n");
962 printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
963 (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
964 (int) (num_row_pixels * num_rows),
965 (int) num_rows,
966 (int) num_row_pixels,
967 (int) row_stride_bytes,
968 (int) total_samples,
969 (int) sizeof(SampleT),
970 typeid(SampleT).name(),
971 entropy_reduction,
972 typeid(CounterT).name(),
973 NUM_ACTIVE_CHANNELS,
974 NUM_CHANNELS);
975 std::cout << CoutCast(max_level) << "\n";
976 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
977 {
978 printf("Channel %d: %d bins [", channel, num_levels[channel] - 1);
979 std::cout << levels[channel][0];
980 for (int level = 1; level < num_levels[channel]; ++level)
981 std::cout << ", " << levels[channel][level];
982 printf("]\n");
983 }
984 fflush(stdout);
985
986 // Allocate and initialize host and device data
987 typedef SampleT Foo; // rename type to quelch gcc warnings (bug?)
988 SampleT* h_samples = new Foo[total_samples];
989 CounterT* h_histogram[NUM_ACTIVE_CHANNELS];
990 SearchTransform<LevelT> transform_op[NUM_ACTIVE_CHANNELS];
991
992 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
993 {
994 transform_op[channel].levels = levels[channel];
995 transform_op[channel].num_levels = num_levels[channel];
996
997 int bins = num_levels[channel] - 1;
998 h_histogram[channel] = new CounterT[bins];
999 }
1000
1001 InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
1002 max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
1003
1004 InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
1005 h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
1006
1007 // Allocate and initialize device data
1008 SampleT* d_samples = NULL;
1009 LevelT* d_levels[NUM_ACTIVE_CHANNELS];
1010 CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
1011
1012 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
1013 CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
1014
1015 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1016 {
1017 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
1018 CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel], sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));
1019
1020 int bins = num_levels[channel] - 1;
1021 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * bins));
1022 CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * bins));
1023 }
1024
1025 // Allocate CDP device arrays
1026 size_t *d_temp_storage_bytes = NULL;
1027 cudaError_t *d_cdp_error = NULL;
1028
1029 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
1030 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
1031
1032 // Allocate temporary storage
1033 void *d_temp_storage = NULL;
1034 size_t temp_storage_bytes = 0;
1035
1036 Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
1037 1, d_temp_storage_bytes, d_cdp_error,
1038 d_temp_storage, temp_storage_bytes,
1039 d_samples,
1040 d_histogram,
1041 num_levels, d_levels,
1042 num_row_pixels, num_rows, row_stride_bytes,
1043 0, true);
1044
1045 // Allocate temporary storage with "canary" zones
1046 int canary_bytes = 256;
1047 char canary_token = 9;
1048 char* canary_zone = new char[canary_bytes];
1049
1050 memset(canary_zone, canary_token, canary_bytes);
1051 CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
1052 CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
1053
1054 // Run warmup/correctness iteration
1055 Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
1056 1, d_temp_storage_bytes, d_cdp_error,
1057 ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
1058 d_samples,
1059 d_histogram,
1060 num_levels, d_levels,
1061 num_row_pixels, num_rows, row_stride_bytes,
1062 0, true);
1063
1064 // Check canary zones
1065 int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
1066 AssertEquals(0, error);
1067 error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
1068 AssertEquals(0, error);
1069
1070 // Flush any stdout/stderr
1071 CubDebugExit(cudaPeekAtLastError());
1072 CubDebugExit(cudaDeviceSynchronize());
1073 fflush(stdout);
1074 fflush(stderr);
1075
1076 // Check for correctness (and display results, if specified)
1077 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1078 {
1079 int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
1080 printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
1081 error |= channel_error;
1082 }
1083
1084 // Performance
1085 GpuTimer gpu_timer;
1086 gpu_timer.Start();
1087
1088 Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
1089 g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
1090 ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
1091 d_samples,
1092 d_histogram,
1093 num_levels, d_levels,
1094 num_row_pixels, num_rows, row_stride_bytes,
1095 0, false);
1096
1097 gpu_timer.Stop();
1098 float elapsed_millis = gpu_timer.ElapsedMillis();
1099
1100 // Display performance
1101 if (g_timing_iterations > 0)
1102 {
1103 float avg_millis = elapsed_millis / g_timing_iterations;
1104 float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
1105 float giga_bandwidth = giga_rate * sizeof(SampleT);
1106 printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
1107 avg_millis,
1108 giga_rate,
1109 giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
1110 giga_rate / NUM_CHANNELS,
1111 giga_bandwidth);
1112 }
1113
1114 printf("\n\n");
1115
1116 // Cleanup
1117 if (h_samples) delete[] h_samples;
1118
1119 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1120 {
1121 if (h_histogram[channel])
1122 delete[] h_histogram[channel];
1123
1124 if (d_histogram[channel])
1125 CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
1126
1127 if (d_levels[channel])
1128 CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
1129 }
1130
1131 if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
1132 if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
1133 if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
1134 if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
1135
1136 // Correctness asserts
1137 AssertEquals(0, error);
1138 }
1139
1140
1141 /**
1142 * Test histogram-even
1143 */
1144 template <
1145 Backend BACKEND,
1146 typename SampleT,
1147 int NUM_CHANNELS,
1148 int NUM_ACTIVE_CHANNELS,
1149 typename CounterT,
1150 typename LevelT,
1151 typename OffsetT>
TestEven(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT max_level,int max_num_levels)1152 void TestEven(
1153 OffsetT num_row_pixels,
1154 OffsetT num_rows,
1155 OffsetT row_stride_bytes,
1156 int entropy_reduction,
1157 int num_levels[NUM_ACTIVE_CHANNELS],
1158 LevelT max_level,
1159 int max_num_levels)
1160 {
1161 LevelT lower_level[NUM_ACTIVE_CHANNELS];
1162 LevelT upper_level[NUM_ACTIVE_CHANNELS];
1163
1164 // Find smallest level increment
1165 int max_bins = max_num_levels - 1;
1166 LevelT min_level_increment = max_level / max_bins;
1167
1168 // Set upper and lower levels for each channel
1169 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1170 {
1171 int num_bins = num_levels[channel] - 1;
1172 lower_level[channel] = (max_level - (num_bins * min_level_increment)) / 2;
1173 upper_level[channel] = (max_level + (num_bins * min_level_increment)) / 2;
1174 }
1175
1176 // Test pointer-based samples
1177 TestEvenNative<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
1178 max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
1179
1180 // Test iterator-based samples (CUB-only)
1181 TestEvenIterator<CUB, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
1182 max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
1183 }
1184
1185
1186
1187 /**
1188 * Test histogram-range
1189 */
1190 template <
1191 Backend BACKEND,
1192 typename SampleT,
1193 int NUM_CHANNELS,
1194 int NUM_ACTIVE_CHANNELS,
1195 typename CounterT,
1196 typename LevelT,
1197 typename OffsetT>
TestRange(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT max_level,int max_num_levels)1198 void TestRange(
1199 OffsetT num_row_pixels,
1200 OffsetT num_rows,
1201 OffsetT row_stride_bytes,
1202 int entropy_reduction,
1203 int num_levels[NUM_ACTIVE_CHANNELS],
1204 LevelT max_level,
1205 int max_num_levels)
1206 {
1207 // Find smallest level increment
1208 int max_bins = max_num_levels - 1;
1209 LevelT min_level_increment = max_level / max_bins;
1210
1211 LevelT* levels[NUM_ACTIVE_CHANNELS];
1212 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1213 {
1214 levels[channel] = new LevelT[num_levels[channel]];
1215
1216 int num_bins = num_levels[channel] - 1;
1217 LevelT lower_level = (max_level - (num_bins * min_level_increment)) / 2;
1218
1219 for (int level = 0; level < num_levels[channel]; ++level)
1220 levels[channel][level] = lower_level + (level * min_level_increment);
1221 }
1222
1223 TestRange<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
1224 max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes);
1225
1226 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1227 delete[] levels[channel];
1228
1229 }
1230
1231
1232
1233 /**
1234 * Test different entrypoints
1235 */
1236 template <
1237 typename SampleT,
1238 int NUM_CHANNELS,
1239 int NUM_ACTIVE_CHANNELS,
1240 typename CounterT,
1241 typename LevelT,
1242 typename OffsetT>
Test(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT max_level,int max_num_levels)1243 void Test(
1244 OffsetT num_row_pixels,
1245 OffsetT num_rows,
1246 OffsetT row_stride_bytes,
1247 int entropy_reduction,
1248 int num_levels[NUM_ACTIVE_CHANNELS],
1249 LevelT max_level,
1250 int max_num_levels)
1251 {
1252 TestEven<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1253 num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
1254
1255 TestRange<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1256 num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
1257 }
1258
1259
1260 /**
1261 * Test different number of levels
1262 */
1263 template <
1264 typename SampleT,
1265 int NUM_CHANNELS,
1266 int NUM_ACTIVE_CHANNELS,
1267 typename CounterT,
1268 typename LevelT,
1269 typename OffsetT>
Test(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,int entropy_reduction,LevelT max_level,int max_num_levels)1270 void Test(
1271 OffsetT num_row_pixels,
1272 OffsetT num_rows,
1273 OffsetT row_stride_bytes,
1274 int entropy_reduction,
1275 LevelT max_level,
1276 int max_num_levels)
1277 {
1278 int num_levels[NUM_ACTIVE_CHANNELS];
1279
1280 // Unnecessary testing
1281 // // All the same level
1282 // for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1283 // {
1284 // num_levels[channel] = max_num_levels;
1285 // }
1286 // Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1287 // num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
1288
1289 // All different levels
1290 num_levels[0] = max_num_levels;
1291 for (int channel = 1; channel < NUM_ACTIVE_CHANNELS; ++channel)
1292 {
1293 num_levels[channel] = (num_levels[channel - 1] / 2) + 1;
1294 }
1295 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1296 num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
1297 }
1298
1299
1300
1301 /**
1302 * Test different entropy-levels
1303 */
1304 template <
1305 typename SampleT,
1306 int NUM_CHANNELS,
1307 int NUM_ACTIVE_CHANNELS,
1308 typename CounterT,
1309 typename LevelT,
1310 typename OffsetT>
Test(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,LevelT max_level,int max_num_levels)1311 void Test(
1312 OffsetT num_row_pixels,
1313 OffsetT num_rows,
1314 OffsetT row_stride_bytes,
1315 LevelT max_level,
1316 int max_num_levels)
1317 {
1318 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1319 num_row_pixels, num_rows, row_stride_bytes, 0, max_level, max_num_levels);
1320
1321 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1322 num_row_pixels, num_rows, row_stride_bytes, -1, max_level, max_num_levels);
1323
1324 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1325 num_row_pixels, num_rows, row_stride_bytes, 5, max_level, max_num_levels);
1326 }
1327
1328
1329 /**
1330 * Test different row strides
1331 */
1332 template <
1333 typename SampleT,
1334 int NUM_CHANNELS,
1335 int NUM_ACTIVE_CHANNELS,
1336 typename CounterT,
1337 typename LevelT,
1338 typename OffsetT>
Test(OffsetT num_row_pixels,OffsetT num_rows,LevelT max_level,int max_num_levels)1339 void Test(
1340 OffsetT num_row_pixels,
1341 OffsetT num_rows,
1342 LevelT max_level,
1343 int max_num_levels)
1344 {
1345 OffsetT row_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT);
1346
1347 // No padding
1348 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1349 num_row_pixels, num_rows, row_stride_bytes, max_level, max_num_levels);
1350
1351 // 13 samples padding
1352 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1353 num_row_pixels, num_rows, row_stride_bytes + (13 * sizeof(SampleT)), max_level, max_num_levels);
1354 }
1355
1356
1357 /**
1358 * Test different problem sizes
1359 */
1360 template <
1361 typename SampleT,
1362 int NUM_CHANNELS,
1363 int NUM_ACTIVE_CHANNELS,
1364 typename CounterT,
1365 typename LevelT,
1366 typename OffsetT>
Test(LevelT max_level,int max_num_levels)1367 void Test(
1368 LevelT max_level,
1369 int max_num_levels)
1370 {
1371 // 0 row/col images
1372 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1373 OffsetT(1920), OffsetT(0), max_level, max_num_levels);
1374 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1375 OffsetT(0), OffsetT(0), max_level, max_num_levels);
1376
1377 // 1080 image
1378 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1379 OffsetT(1920), OffsetT(1080), max_level, max_num_levels);
1380
1381 // Sample different aspect ratios sizes
1382 for (OffsetT rows = 1; rows < 1000000; rows *= 1000)
1383 {
1384 for (OffsetT cols = 1; cols < (1000000 / rows); cols *= 1000)
1385 {
1386 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1387 cols, rows, max_level, max_num_levels);
1388 }
1389 }
1390
1391 // Randomly select linear problem size between 1:10,000,000
1392 unsigned int max_int = (unsigned int) -1;
1393 for (int i = 0; i < 4; ++i)
1394 {
1395 unsigned int num_items;
1396 RandomBits(num_items);
1397 num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
1398 num_items = CUB_MAX(1, num_items);
1399
1400 Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1401 OffsetT(num_items), 1, max_level, max_num_levels);
1402 }
1403 }
1404
1405
1406
1407 /**
1408 * Test different channel interleavings (valid specialiation)
1409 */
1410 template <
1411 typename SampleT,
1412 typename CounterT,
1413 typename LevelT,
1414 typename OffsetT>
TestChannels(LevelT max_level,int max_num_levels,Int2Type<true>)1415 void TestChannels(
1416 LevelT max_level,
1417 int max_num_levels,
1418 Int2Type<true> /*is_valid_tag*/)
1419 {
1420 Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
1421 Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
1422 Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
1423 Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
1424 }
1425
1426
1427 /**
1428 * Test different channel interleavings (invalid specialiation)
1429 */
1430 template <
1431 typename SampleT,
1432 typename CounterT,
1433 typename LevelT,
1434 typename OffsetT>
TestChannels(LevelT,int,Int2Type<false>)1435 void TestChannels(
1436 LevelT /*max_level*/,
1437 int /*max_num_levels*/,
1438 Int2Type<false> /*is_valid_tag*/)
1439 {}
1440
1441
1442
1443 //---------------------------------------------------------------------
1444 // Main
1445 //---------------------------------------------------------------------
1446
1447
1448
1449
1450 /**
1451 * Main
1452 */
main(int argc,char ** argv)1453 int main(int argc, char** argv)
1454 {
1455 int num_row_pixels = -1;
1456 int entropy_reduction = 0;
1457 int num_rows = 1;
1458
1459 // Initialize command line
1460 CommandLineArgs args(argc, argv);
1461 g_verbose = args.CheckCmdLineFlag("v");
1462 g_verbose_input = args.CheckCmdLineFlag("v2");
1463 args.GetCmdLineArgument("n", num_row_pixels);
1464
1465 int row_stride_pixels = num_row_pixels;
1466
1467 args.GetCmdLineArgument("rows", num_rows);
1468 args.GetCmdLineArgument("stride", row_stride_pixels);
1469 args.GetCmdLineArgument("i", g_timing_iterations);
1470 args.GetCmdLineArgument("repeat", g_repeat);
1471 args.GetCmdLineArgument("entropy", entropy_reduction);
1472 #if defined(QUICK_TEST) || defined(QUICKER_TEST)
1473 bool compare_npp = args.CheckCmdLineFlag("npp");
1474 #endif
1475
1476
1477 // Print usage
1478 if (args.CheckCmdLineFlag("help"))
1479 {
1480 printf("%s "
1481 "[--n=<pixels per row>] "
1482 "[--rows=<number of rows>] "
1483 "[--stride=<row stride in pixels>] "
1484 "[--i=<timing iterations>] "
1485 "[--device=<device-id>] "
1486 "[--repeat=<repetitions of entire test suite>]"
1487 "[--entropy=<entropy-reduction factor (default 0)>]"
1488 "[--v] "
1489 "[--cdp]"
1490 "[--npp]"
1491 "\n", argv[0]);
1492 exit(0);
1493 }
1494
1495 // Initialize device
1496 CubDebugExit(args.DeviceInit());
1497
1498 // Get ptx version
1499 int ptx_version = 0;
1500 CubDebugExit(PtxVersion(ptx_version));
1501
1502 if (num_row_pixels < 0)
1503 {
1504 num_row_pixels = 1920 * 1080;
1505 row_stride_pixels = num_row_pixels;
1506 }
1507
1508 #if defined(QUICKER_TEST)
1509
1510 // Compile/run quick tests
1511 {
1512 // HistogramEven: unsigned char 256 bins
1513 typedef unsigned char SampleT;
1514 typedef int LevelT;
1515
1516 LevelT max_level = 256;
1517 int num_levels[1] = {257};
1518 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
1519
1520 TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1521 // The NPP path doesn't compile as of 2020-06:
1522 // No Dispatch<int, int, NPP> specialization defined.
1523 // if (compare_npp)
1524 // TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1525 }
1526
1527 {
1528 // HistogramRange: signed char 256 bins
1529 typedef signed char SampleT;
1530 typedef int LevelT;
1531
1532 LevelT max_level = 256;
1533 int num_levels[1] = {257};
1534 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
1535
1536 TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1537 }
1538
1539
1540
1541 #elif defined(QUICK_TEST)
1542
1543 // Compile/run quick tests
1544 {
1545 // HistogramEven: unsigned char 256 bins
1546 typedef unsigned char SampleT;
1547 typedef int LevelT;
1548
1549 LevelT max_level = 256;
1550 int num_levels[1] = {257};
1551 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
1552
1553 TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1554 // The NPP path doesn't compile as of 2020-06:
1555 // No Dispatch<int, int, NPP> specialization defined.
1556 // if (compare_npp)
1557 // TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1558 }
1559
1560 {
1561 // HistogramEven: 4/4 multichannel Unsigned char 256 bins
1562 typedef unsigned char SampleT;
1563 typedef int LevelT;
1564
1565 LevelT max_level = 256;
1566 int num_levels[4] = {257, 257, 257, 257};
1567 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4;
1568
1569 TestEven<CUB, SampleT, 4, 4, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1570 }
1571
1572 {
1573 // HistogramEven: 3/4 multichannel Unsigned char 256 bins
1574 typedef unsigned char SampleT;
1575 typedef int LevelT;
1576
1577 LevelT max_level = 256;
1578 int num_levels[3] = {257, 257, 257};
1579 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4;
1580
1581 TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1582 // The NPP path doesn't compile as of 2020-06:
1583 // No Dispatch<int, int, NPP> specialization defined.
1584 // if (compare_npp)
1585 // TestEven<NPP, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1586 }
1587
1588 {
1589 // HistogramEven: short [0,1024] 256 bins
1590 typedef unsigned short SampleT;
1591 typedef unsigned short LevelT;
1592
1593 LevelT max_level = 1024;
1594 int num_levels[1] = {257};
1595 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
1596
1597 TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1598 }
1599
1600 {
1601 // HistogramEven: float [0,1.0] 256 bins
1602 typedef float SampleT;
1603 typedef float LevelT;
1604
1605 LevelT max_level = 1.0;
1606 int num_levels[1] = {257};
1607 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
1608
1609 TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1610 }
1611
1612 {
1613 // HistogramEven: 3/4 multichannel float [0,1.0] 256 bins
1614 typedef float SampleT;
1615 typedef float LevelT;
1616
1617 LevelT max_level = 1.0;
1618 int num_levels[3] = {257, 257, 257};
1619 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4;
1620
1621 TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1622 }
1623
1624 {
1625 // HistogramRange: signed char 256 bins
1626 typedef signed char SampleT;
1627 typedef int LevelT;
1628
1629 LevelT max_level = 256;
1630 int num_levels[1] = {257};
1631 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
1632
1633 TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1634 }
1635
1636 {
1637 // HistogramRange: 3/4 channel, unsigned char, varied bins (256, 128, 64)
1638 typedef unsigned char SampleT;
1639 typedef int LevelT;
1640
1641 LevelT max_level = 256;
1642 int num_levels[3] = {257, 129, 65};
1643 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4;
1644
1645 TestRange<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1646 }
1647
1648 if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
1649 {
1650 // HistogramEven: double [0,1.0] 64 bins
1651 typedef double SampleT;
1652 typedef double LevelT;
1653
1654 LevelT max_level = 1.0;
1655 int num_levels[1] = {65};
1656 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
1657
1658 TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1659 }
1660
1661 {
1662 // HistogramEven: short [0,1024] 512 bins
1663 typedef unsigned short SampleT;
1664 typedef unsigned short LevelT;
1665
1666 LevelT max_level = 1024;
1667 int num_levels[1] = {513};
1668 int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
1669
1670 TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1671 }
1672
1673 #else
1674
1675 // Compile/run thorough tests
1676 for (int i = 0; i <= g_repeat; ++i)
1677 {
1678 TestChannels <unsigned char, int, int, int>(256, 256 + 1, Int2Type<true>());
1679 TestChannels <signed char, int, int, int>(256, 256 + 1, Int2Type<true>());
1680 TestChannels <unsigned short, int, int, int>(128, 128 + 1, Int2Type<true>());
1681 TestChannels <unsigned short, int, int, int>(8192, 8192 + 1, Int2Type<true>());
1682 TestChannels <float, int, float, int>(1.0, 256 + 1, Int2Type<true>());
1683
1684 // Test down-conversion of size_t offsets to int
1685 TestChannels <unsigned char, int, int, long long>(256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>());
1686 }
1687
1688 #endif
1689
1690 return 0;
1691 }
1692
1693