1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /******************************************************************************
30  * Test of DeviceHistogram utilities
31  ******************************************************************************/
32 
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35 
36 #include <stdio.h>
37 #include <limits>
38 #include <algorithm>
39 #include <typeinfo>
40 
41 #if defined(QUICK_TEST) || defined(QUICKER_TEST)
42     #include <npp.h>
43 #endif
44 
45 #include <cub/util_allocator.cuh>
46 #include <cub/iterator/constant_input_iterator.cuh>
47 #include <cub/device/device_histogram.cuh>
48 
49 #include "test_util.h"
50 
51 using namespace cub;
52 
53 
54 //---------------------------------------------------------------------
55 // Globals, constants and typedefs
56 //---------------------------------------------------------------------
57 
58 
59 // Dispatch types
60 enum Backend
61 {
62     CUB,        // CUB method
63     NPP,        // NPP method
64     CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
65 };
66 
67 
68 bool                    g_verbose_input     = false;
69 bool                    g_verbose           = false;
70 int                     g_timing_iterations = 0;
71 int                     g_repeat            = 0;
72 CachingDeviceAllocator  g_allocator(true);
73 
74 
75 
76 
77 //---------------------------------------------------------------------
78 // Dispatch to NPP histogram
79 //---------------------------------------------------------------------
80 
81 #if defined(QUICK_TEST) || defined(QUICKER_TEST)
82 
83 /**
84  * Dispatch to single-channel 8b NPP histo-even
85  */
86 template <typename CounterT, typename LevelT, typename OffsetT>
87 //CUB_RUNTIME_FUNCTION __forceinline__
DispatchEven(Int2Type<1> num_channels,Int2Type<1> num_active_channels,Int2Type<NPP> dispatch_to,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t & temp_storage_bytes,unsigned char * d_samples,CounterT * d_histogram[1],int num_levels[1],LevelT lower_level[1],LevelT upper_level[1],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,cudaStream_t stream,bool debug_synchronous)88 cudaError_t DispatchEven(
89     Int2Type<1>             num_channels,
90     Int2Type<1>             num_active_channels,
91     Int2Type<NPP>           dispatch_to,
92     int                     timing_timing_iterations,
93     size_t                  *d_temp_storage_bytes,
94     cudaError_t             *d_cdp_error,
95 
96     void*               d_temp_storage,
97     size_t&             temp_storage_bytes,
98     unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
99     CounterT            *d_histogram[1],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
100     int                 num_levels[1],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
101     LevelT              lower_level[1],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
102     LevelT              upper_level[1],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
103     OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
104     OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
105     OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
106     cudaStream_t        stream,
107     bool                debug_synchronous)
108 {
109     typedef unsigned char SampleT;
110 
111     cudaError_t error = cudaSuccess;
112     NppiSize oSizeROI = {
113         num_row_pixels,
114         num_rows
115     };
116 
117     if (d_temp_storage_bytes == NULL)
118     {
119         int nDeviceBufferSize;
120         nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0] ,&nDeviceBufferSize);
121         temp_storage_bytes = nDeviceBufferSize;
122     }
123     else
124     {
125         for (int i = 0; i < timing_timing_iterations; ++i)
126         {
127             // compute the histogram
128             nppiHistogramEven_8u_C1R(
129                 d_samples,
130                 row_stride_bytes,
131                 oSizeROI,
132                 d_histogram[0],
133                 num_levels[0],
134                 lower_level[0],
135                 upper_level[0],
136                 (Npp8u*) d_temp_storage);
137         }
138     }
139 
140     return error;
141 }
142 
143 
144 /**
145  * Dispatch to 3/4 8b NPP histo-even
146  */
147 template <typename CounterT, typename LevelT, typename OffsetT>
148 //CUB_RUNTIME_FUNCTION __forceinline__
DispatchEven(Int2Type<4> num_channels,Int2Type<3> num_active_channels,Int2Type<NPP> dispatch_to,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t & temp_storage_bytes,unsigned char * d_samples,CounterT * d_histogram[3],int num_levels[3],LevelT lower_level[3],LevelT upper_level[3],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,cudaStream_t stream,bool debug_synchronous)149 cudaError_t DispatchEven(
150     Int2Type<4>          num_channels,
151     Int2Type<3>   num_active_channels,
152     Int2Type<NPP>           dispatch_to,
153     int                     timing_timing_iterations,
154     size_t                  *d_temp_storage_bytes,
155     cudaError_t             *d_cdp_error,
156 
157     void*               d_temp_storage,
158     size_t&             temp_storage_bytes,
159     unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
160     CounterT            *d_histogram[3],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
161     int                 num_levels[3],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
162     LevelT              lower_level[3],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
163     LevelT              upper_level[3],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
164     OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
165     OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
166     OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
167     cudaStream_t        stream,
168     bool                debug_synchronous)
169 {
170     typedef unsigned char SampleT;
171 
172     cudaError_t error = cudaSuccess;
173     NppiSize oSizeROI = {
174         num_row_pixels,
175         num_rows
176     };
177 
178     if (d_temp_storage_bytes == NULL)
179     {
180         int nDeviceBufferSize;
181         nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels ,&nDeviceBufferSize);
182         temp_storage_bytes = nDeviceBufferSize;
183     }
184     else
185     {
186         for (int i = 0; i < timing_timing_iterations; ++i)
187         {
188             // compute the histogram
189             nppiHistogramEven_8u_AC4R(
190                 d_samples,
191                 row_stride_bytes,
192                 oSizeROI,
193                 d_histogram,
194                 num_levels,
195                 lower_level,
196                 upper_level,
197                 (Npp8u*) d_temp_storage);
198         }
199     }
200 
201     return error;
202 }
203 
204 
205 #endif // #if defined(QUICK_TEST) || defined(QUICKER_TEST)
206 
207 
208 //---------------------------------------------------------------------
209 // Dispatch to different DeviceHistogram entrypoints
210 //---------------------------------------------------------------------
211 
212 template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, int BACKEND>
213 struct Dispatch;
214 
215 template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS>
216 struct Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, CUB>
217 {
218     /**
219      * Dispatch to CUB multi histogram-range entrypoint
220      */
221     template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
222     //CUB_RUNTIME_FUNCTION __forceinline__
RangeDispatch223     static cudaError_t Range(
224         int                     timing_timing_iterations,
225         size_t                  */*d_temp_storage_bytes*/,
226         cudaError_t             */*d_cdp_error*/,
227 
228         void*               d_temp_storage,
229         size_t&             temp_storage_bytes,
230         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
231         CounterT            *(&d_histogram)[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
232         int                 *num_levels,                                ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
233         LevelT              *(&d_levels)[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
234         OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
235         OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
236         OffsetT             row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
237         cudaStream_t        stream,
238         bool                debug_synchronous)
239     {
240         cudaError_t error = cudaSuccess;
241 
242         for (int i = 0; i < timing_timing_iterations; ++i)
243         {
244             error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
245                 d_temp_storage,
246                 temp_storage_bytes,
247                 d_samples,
248                 d_histogram,
249                 num_levels,
250                 d_levels,
251                 num_row_pixels,
252                 num_rows,
253                 row_stride_bytes,
254                 stream,
255                 debug_synchronous);
256         }
257         return error;
258     }
259 
260 
261     /**
262      * Dispatch to CUB multi histogram-even entrypoint
263      */
264     template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
265     //CUB_RUNTIME_FUNCTION __forceinline__
EvenDispatch266     static cudaError_t Even(
267         int                     timing_timing_iterations,
268         size_t                  */*d_temp_storage_bytes*/,
269         cudaError_t             */*d_cdp_error*/,
270 
271         void*               d_temp_storage,
272         size_t&             temp_storage_bytes,
273         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
274         CounterT            *(&d_histogram)[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
275         int                 *num_levels,            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
276         LevelT              *lower_level,           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
277         LevelT              *upper_level,           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
278         OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
279         OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
280         OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
281         cudaStream_t        stream,
282         bool                debug_synchronous)
283     {
284         typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
285 
286         cudaError_t error = cudaSuccess;
287         for (int i = 0; i < timing_timing_iterations; ++i)
288         {
289             error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
290                 d_temp_storage,
291                 temp_storage_bytes,
292                 d_samples,
293                 d_histogram,
294                 num_levels,
295                 lower_level,
296                 upper_level,
297                 num_row_pixels,
298                 num_rows,
299                 row_stride_bytes,
300                 stream,
301                 debug_synchronous);
302         }
303         return error;
304     }
305 
306 };
307 
308 
309 template <>
310 struct Dispatch<1, 1, CUB>
311 {
312 
313     /**
314      * Dispatch to CUB single histogram-range entrypoint
315      */
316     template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
317     //CUB_RUNTIME_FUNCTION __forceinline__
RangeDispatch318     static cudaError_t Range(
319         int                     timing_timing_iterations,
320         size_t                  */*d_temp_storage_bytes*/,
321         cudaError_t             */*d_cdp_error*/,
322 
323         void*               d_temp_storage,
324         size_t&             temp_storage_bytes,
325         SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
326         CounterT*           (&d_histogram)[1],                      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
327         int                 *num_levels,                            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
328         LevelT              (&d_levels)[1],                         ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
329         OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
330         OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
331         OffsetT             row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
332         cudaStream_t        stream,
333         bool                debug_synchronous)
334     {
335         cudaError_t error = cudaSuccess;
336         for (int i = 0; i < timing_timing_iterations; ++i)
337         {
338             error = DeviceHistogram::HistogramRange(
339                 d_temp_storage,
340                 temp_storage_bytes,
341                 d_samples,
342                 d_histogram[0],
343                 num_levels[0],
344                 d_levels[0],
345                 num_row_pixels,
346                 num_rows,
347                 row_stride_bytes,
348                 stream,
349                 debug_synchronous);
350         }
351         return error;
352     }
353 
354 
355     /**
356      * Dispatch to CUB single histogram-even entrypoint
357      */
358     template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
359     //CUB_RUNTIME_FUNCTION __forceinline__
EvenDispatch360     static cudaError_t Even(
361         int                     timing_timing_iterations,
362         size_t                  */*d_temp_storage_bytes*/,
363         cudaError_t             */*d_cdp_error*/,
364 
365         void*               d_temp_storage,
366         size_t&             temp_storage_bytes,
367         SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
368         CounterT*           (&d_histogram)[1],                      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
369         int                 *num_levels,                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
370         LevelT              *lower_level,                             ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
371         LevelT              *upper_level,                             ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
372         OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
373         OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
374         OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
375         cudaStream_t        stream,
376         bool                debug_synchronous)
377     {
378         cudaError_t error = cudaSuccess;
379         for (int i = 0; i < timing_timing_iterations; ++i)
380         {
381             error = DeviceHistogram::HistogramEven(
382                 d_temp_storage,
383                 temp_storage_bytes,
384                 d_samples,
385                 d_histogram[0],
386                 num_levels[0],
387                 lower_level[0],
388                 upper_level[0],
389                 num_row_pixels,
390                 num_rows,
391                 row_stride_bytes,
392                 stream,
393                 debug_synchronous);
394         }
395         return error;
396     }
397 
398 };
399 
400 
401 
402 //---------------------------------------------------------------------
403 // CUDA nested-parallelism test kernel
404 //---------------------------------------------------------------------
405 
406 /**
407  * Simple wrapper kernel to invoke DeviceHistogram
408  * /
409 template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
410 __global__ void CnpDispatchKernel(
411     Int2Type<ALGORITHM> algorithm,
412     int                 timing_timing_iterations,
413     size_t              *d_temp_storage_bytes,
414     cudaError_t         *d_cdp_error,
415 
416     void*               d_temp_storage,
417     size_t              temp_storage_bytes,
418     SampleT             *d_samples,
419     SampleIteratorT      d_sample_itr,
420     ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
421     int                 num_samples,
422     bool                debug_synchronous)
423 {
424 #ifndef CUB_CDP
425     *d_cdp_error = cudaErrorNotSupported;
426 #else
427     *d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_samples, 0, debug_synchronous);
428     *d_temp_storage_bytes = temp_storage_bytes;
429 #endif
430 }
431 
432 
433 / **
434  * Dispatch to CDP kernel
435  * /
436 template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
437 cudaError_t Dispatch(
438     Int2Type<ALGORITHM> algorithm,
439     Int2Type<true>      use_cdp,
440     int                 timing_timing_iterations,
441     size_t              *d_temp_storage_bytes,
442     cudaError_t         *d_cdp_error,
443 
444     void*               d_temp_storage,
445     size_t&             temp_storage_bytes,
446     SampleT             *d_samples,
447     SampleIteratorT      d_sample_itr,
448     CounterT        *d_histograms[NUM_ACTIVE_CHANNELS],
449     int                 num_samples,
450     cudaStream_t        stream,
451     bool                debug_synchronous)
452 {
453     // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
454     ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
455     for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
456         d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
457 
458     // Invoke kernel to invoke device-side dispatch
459     CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo_wrapper, num_samples, debug_synchronous);
460 
461     // Copy out temp_storage_bytes
462     CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
463 
464     // Copy out error
465     cudaError_t retval;
466     CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
467     return retval;
468 }
469 */
470 
471 
472 //---------------------------------------------------------------------
473 // Test generation
474 //---------------------------------------------------------------------
475 
476 // Searches for bin given a list of bin-boundary levels
477 template <typename LevelT>
478 struct SearchTransform
479 {
480     LevelT          *levels;      // Pointer to levels array
481     int             num_levels;   // Number of levels in array
482 
483     // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
484     template <typename SampleT>
operator ()SearchTransform485     int operator()(SampleT sample)
486     {
487         int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1);
488         if (bin < 0)
489         {
490             // Sample out of range
491             return num_levels;
492         }
493         return bin;
494     }
495 };
496 
497 
498 // Scales samples to evenly-spaced bins
499 template <typename LevelT>
500 struct ScaleTransform
501 {
502     int    num_levels;  // Number of levels in array
503     LevelT max;         // Max sample level (exclusive)
504     LevelT min;         // Min sample level (inclusive)
505     LevelT scale;       // Bin scaling factor
506 
InitScaleTransform507     void Init(
508         int    num_levels,  // Number of levels in array
509         LevelT max,         // Max sample level (exclusive)
510         LevelT min,         // Min sample level (inclusive)
511         LevelT scale)       // Bin scaling factor
512     {
513         this->num_levels = num_levels;
514         this->max = max;
515         this->min = min;
516         this->scale = scale;
517     }
518 
519     // Functor for converting samples to bin-ids  (num_levels is returned if sample is out of range)
520     template <typename SampleT>
operator ()ScaleTransform521     int operator()(SampleT sample)
522     {
523         if ((sample < min) || (sample >= max))
524         {
525             // Sample out of range
526             return num_levels;
527         }
528 
529         return (int) ((((LevelT) sample) - min) / scale);
530     }
531 };
532 
533 // Scales samples to evenly-spaced bins
534 template <>
535 struct ScaleTransform<float>
536 {
537     int   num_levels;  // Number of levels in array
538     float max;         // Max sample level (exclusive)
539     float min;         // Min sample level (inclusive)
540     float scale;       // Bin scaling factor
541 
InitScaleTransform542     void Init(
543         int    num_levels,  // Number of levels in array
544         float max,         // Max sample level (exclusive)
545         float min,         // Min sample level (inclusive)
546         float scale)       // Bin scaling factor
547     {
548         this->num_levels = num_levels;
549         this->max = max;
550         this->min = min;
551         this->scale = 1.0f / scale;
552     }
553 
554     // Functor for converting samples to bin-ids  (num_levels is returned if sample is out of range)
555     template <typename SampleT>
operator ()ScaleTransform556     int operator()(SampleT sample)
557     {
558         if ((sample < min) || (sample >= max))
559         {
560             // Sample out of range
561             return num_levels;
562         }
563 
564         return (int) ((((float) sample) - min) * scale);
565     }
566 };
567 
568 
569 /**
570  * Generate sample
571  */
572 template <typename T, typename LevelT>
Sample(T & datum,LevelT max_level,int entropy_reduction)573 void Sample(T &datum, LevelT max_level, int entropy_reduction)
574 {
575     unsigned int max = (unsigned int) -1;
576     unsigned int bits;
577     RandomBits(bits, entropy_reduction);
578     float fraction = (float(bits) / max);
579 
580     datum = (T) (fraction * max_level);
581 }
582 
583 
584 /**
585  * Initialize histogram samples
586  */
587 template <
588     int             NUM_CHANNELS,
589     int             NUM_ACTIVE_CHANNELS,
590     typename        LevelT,
591     typename        SampleT,
592     typename        OffsetT>
InitializeSamples(LevelT max_level,int entropy_reduction,SampleT * h_samples,OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)593 void InitializeSamples(
594     LevelT          max_level,
595     int             entropy_reduction,
596     SampleT         *h_samples,
597     OffsetT         num_row_pixels,         ///< [in] The number of multi-channel pixels per row in the region of interest
598     OffsetT         num_rows,               ///< [in] The number of rows in the region of interest
599     OffsetT         row_stride_bytes)       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
600 {
601     // Initialize samples
602     for (OffsetT row = 0; row < num_rows; ++row)
603     {
604         for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
605         {
606             for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
607             {
608                 // Sample offset
609                 OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
610 
611                 // Init sample value
612                 Sample(h_samples[offset], max_level, entropy_reduction);
613                 if (g_verbose_input)
614                 {
615                     if (channel > 0) printf(", ");
616                     std::cout << CoutCast(h_samples[offset]);
617                 }
618             }
619         }
620     }
621 }
622 
623 
624 /**
625  * Initialize histogram solutions
626  */
627 template <
628     int             NUM_CHANNELS,
629     int             NUM_ACTIVE_CHANNELS,
630     typename        CounterT,
631     typename        SampleIteratorT,
632     typename        TransformOp,
633     typename        OffsetT>
InitializeBins(SampleIteratorT h_samples,int num_levels[NUM_ACTIVE_CHANNELS],TransformOp transform_op[NUM_ACTIVE_CHANNELS],CounterT * h_histogram[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)634 void InitializeBins(
635     SampleIteratorT h_samples,
636     int             num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
637     TransformOp     transform_op[NUM_ACTIVE_CHANNELS],      ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
638     CounterT        *h_histogram[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
639     OffsetT         num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
640     OffsetT         num_rows,                               ///< [in] The number of rows in the region of interest
641     OffsetT         row_stride_bytes)                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
642 {
643     typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
644 
645     // Init bins
646     for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
647     {
648         for (int bin = 0; bin < num_levels[CHANNEL] - 1; ++bin)
649         {
650             h_histogram[CHANNEL][bin] = 0;
651         }
652     }
653 
654     // Initialize samples
655     if (g_verbose_input) printf("Samples: \n");
656     for (OffsetT row = 0; row < num_rows; ++row)
657     {
658         for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
659         {
660             if (g_verbose_input) printf("[");
661             for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
662             {
663                 // Sample offset
664                 OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
665 
666                 // Update sample bin
667                 int bin = transform_op[channel](h_samples[offset]);
668                 if (g_verbose_input) printf(" (%d)", bin); fflush(stdout);
669                 if ((bin >= 0) && (bin < num_levels[channel] - 1))
670                 {
671                     // valid bin
672                     h_histogram[channel][bin]++;
673                 }
674             }
675             if (g_verbose_input) printf("]");
676         }
677         if (g_verbose_input) printf("\n\n");
678     }
679 }
680 
681 
682 
683 /**
684  * Test histogram-even
685  */
686 template <
687     Backend         BACKEND,
688     int             NUM_CHANNELS,
689     int             NUM_ACTIVE_CHANNELS,
690     typename        SampleT,
691     typename        CounterT,
692     typename        LevelT,
693     typename        OffsetT,
694     typename        SampleIteratorT>
TestEven(LevelT max_level,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT lower_level[NUM_ACTIVE_CHANNELS],LevelT upper_level[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,SampleIteratorT h_samples,SampleIteratorT d_samples)695 void TestEven(
696     LevelT          max_level,
697     int             entropy_reduction,
698     int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
699     LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
700     LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
701     OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
702     OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
703     OffsetT         row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
704     SampleIteratorT h_samples,
705     SampleIteratorT d_samples)
706 {
707     OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
708 
709     printf("\n----------------------------\n");
710     printf("%s cub::DeviceHistogramEven (%s) %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
711         (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
712         (IsPointer<SampleIteratorT>::VALUE) ? "pointer" : "iterator",
713         (int) (num_row_pixels * num_rows),
714         (int) num_rows,
715         (int) num_row_pixels,
716         (int) row_stride_bytes,
717         (int) total_samples,
718         (int) sizeof(SampleT),
719         typeid(SampleT).name(),
720         entropy_reduction,
721         typeid(CounterT).name(),
722         NUM_ACTIVE_CHANNELS,
723         NUM_CHANNELS);
724     std::cout << CoutCast(max_level) << "\n";
725     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
726         std::cout << "\n\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins [" << lower_level[channel] << ", " << upper_level[channel] << ")\n";
727     fflush(stdout);
728 
729     // Allocate and initialize host and device data
730 
731     typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
732     CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
733     ScaleTransform<LevelT>      transform_op[NUM_ACTIVE_CHANNELS];
734 
735     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
736     {
737         int bins = num_levels[channel] - 1;
738         h_histogram[channel] = new CounterT[bins];
739 
740         transform_op[channel].Init(
741             num_levels[channel],
742             upper_level[channel],
743             lower_level[channel],
744             ((upper_level[channel] - lower_level[channel]) / bins));
745     }
746 
747     InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
748         h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
749 
750     // Allocate and initialize device data
751 
752     CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
753     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
754     {
755         CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
756         CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
757     }
758 
759     // Allocate CDP device arrays
760     size_t          *d_temp_storage_bytes = NULL;
761     cudaError_t     *d_cdp_error = NULL;
762     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
763     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
764 
765     // Allocate temporary storage
766     void            *d_temp_storage = NULL;
767     size_t          temp_storage_bytes = 0;
768 
769     Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
770         1, d_temp_storage_bytes, d_cdp_error,
771         d_temp_storage, temp_storage_bytes,
772         d_samples, d_histogram, num_levels, lower_level, upper_level,
773         num_row_pixels, num_rows, row_stride_bytes,
774         0, true);
775 
776     // Allocate temporary storage with "canary" zones
777     int     canary_bytes    = 256;
778     char    canary_token    = 8;
779     char*   canary_zone     = new char[canary_bytes];
780 
781     memset(canary_zone, canary_token, canary_bytes);
782     CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
783     CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
784 
785     // Run warmup/correctness iteration
786     Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
787         1, d_temp_storage_bytes, d_cdp_error,
788         ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
789         d_samples, d_histogram, num_levels, lower_level, upper_level,
790         num_row_pixels, num_rows, row_stride_bytes,
791         0, true);
792 
793     // Check canary zones
794     int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
795     AssertEquals(0, error);
796     error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
797     AssertEquals(0, error);
798 
799     // Flush any stdout/stderr
800     CubDebugExit(cudaPeekAtLastError());
801     CubDebugExit(cudaDeviceSynchronize());
802     fflush(stdout);
803     fflush(stderr);
804 
805     // Check for correctness (and display results, if specified)
806     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
807     {
808         int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
809         printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
810         error |= channel_error;
811     }
812 
813     // Performance
814     GpuTimer gpu_timer;
815     gpu_timer.Start();
816 
817     Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
818         g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
819         ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
820         d_samples, d_histogram, num_levels, lower_level, upper_level,
821         num_row_pixels, num_rows, row_stride_bytes,
822         0, false);
823 
824     gpu_timer.Stop();
825     float elapsed_millis = gpu_timer.ElapsedMillis();
826 
827     // Display performance
828     if (g_timing_iterations > 0)
829     {
830         float avg_millis = elapsed_millis / g_timing_iterations;
831         float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
832         float giga_bandwidth = giga_rate * sizeof(SampleT);
833         printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
834             avg_millis,
835             giga_rate,
836             giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
837             giga_rate / NUM_CHANNELS,
838             giga_bandwidth);
839     }
840 
841     printf("\n\n");
842 
843     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
844     {
845         if (h_histogram[channel])
846             delete[] h_histogram[channel];
847 
848         if (d_histogram[channel])
849             CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
850     }
851 
852     if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
853     if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
854     if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
855 
856     // Correctness asserts
857     AssertEquals(0, error);
858 }
859 
860 
861 /**
862  * Test histogram-even (native pointer input)
863  */
864 template <
865     Backend         BACKEND,
866     int             NUM_CHANNELS,
867     int             NUM_ACTIVE_CHANNELS,
868     typename        SampleT,
869     typename        CounterT,
870     typename        LevelT,
871     typename        OffsetT>
TestEvenNative(LevelT max_level,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT lower_level[NUM_ACTIVE_CHANNELS],LevelT upper_level[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)872 void TestEvenNative(
873     LevelT          max_level,
874     int             entropy_reduction,
875     int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
876     LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
877     LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
878     OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
879     OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
880     OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
881 {
882     OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
883 
884     // Allocate and initialize host sample data
885     typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
886     SampleT*                    h_samples = new Foo[total_samples];
887 
888     InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
889         max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
890 
891     // Allocate and initialize device data
892     SampleT* d_samples = NULL;
893     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
894     CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
895 
896     TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
897         max_level, entropy_reduction, num_levels, lower_level, upper_level,
898         num_row_pixels, num_rows, row_stride_bytes,
899         h_samples, d_samples);
900 
901     // Cleanup
902     if (h_samples) delete[] h_samples;
903     if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
904 }
905 
906 
907 /**
908  * Test histogram-even (native pointer input)
909  */
910 template <
911     Backend         BACKEND,
912     int             NUM_CHANNELS,
913     int             NUM_ACTIVE_CHANNELS,
914     typename        SampleT,
915     typename        CounterT,
916     typename        LevelT,
917     typename        OffsetT>
TestEvenIterator(LevelT max_level,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT lower_level[NUM_ACTIVE_CHANNELS],LevelT upper_level[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)918 void TestEvenIterator(
919     LevelT          max_level,
920     int             entropy_reduction,
921     int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
922     LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
923     LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
924     OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
925     OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
926     OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
927 {
928     SampleT sample = (SampleT) lower_level[0];
929     ConstantInputIterator<SampleT> sample_itr(sample);
930 
931     TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
932         max_level, entropy_reduction, num_levels, lower_level, upper_level,
933         num_row_pixels, num_rows, row_stride_bytes,
934         sample_itr, sample_itr);
935 
936 }
937 
938 
939 /**
940  * Test histogram-range
941  */
942 template <
943     Backend         BACKEND,
944     int             NUM_CHANNELS,
945     int             NUM_ACTIVE_CHANNELS,
946     typename        SampleT,
947     typename        CounterT,
948     typename        LevelT,
949     typename        OffsetT>
TestRange(LevelT max_level,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT * levels[NUM_ACTIVE_CHANNELS],OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes)950 void TestRange(
951     LevelT          max_level,
952     int             entropy_reduction,
953     int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
954     LevelT*         levels[NUM_ACTIVE_CHANNELS],                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
955     OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
956     OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
957     OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
958 {
959     OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
960 
961     printf("\n----------------------------\n");
962     printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
963         (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
964         (int) (num_row_pixels * num_rows),
965         (int) num_rows,
966         (int) num_row_pixels,
967         (int) row_stride_bytes,
968         (int) total_samples,
969         (int) sizeof(SampleT),
970         typeid(SampleT).name(),
971         entropy_reduction,
972         typeid(CounterT).name(),
973         NUM_ACTIVE_CHANNELS,
974         NUM_CHANNELS);
975     std::cout << CoutCast(max_level) << "\n";
976     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
977     {
978         printf("Channel %d: %d bins [", channel, num_levels[channel] - 1);
979         std::cout << levels[channel][0];
980         for (int level = 1; level < num_levels[channel]; ++level)
981             std::cout << ", " << levels[channel][level];
982         printf("]\n");
983     }
984     fflush(stdout);
985 
986     // Allocate and initialize host and device data
987     typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
988     SampleT*                    h_samples = new Foo[total_samples];
989     CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
990     SearchTransform<LevelT>     transform_op[NUM_ACTIVE_CHANNELS];
991 
992     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
993     {
994         transform_op[channel].levels = levels[channel];
995         transform_op[channel].num_levels = num_levels[channel];
996 
997         int bins = num_levels[channel] - 1;
998         h_histogram[channel] = new CounterT[bins];
999     }
1000 
1001     InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
1002         max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
1003 
1004     InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
1005         h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
1006 
1007     // Allocate and initialize device data
1008     SampleT*        d_samples = NULL;
1009     LevelT*         d_levels[NUM_ACTIVE_CHANNELS];
1010     CounterT*       d_histogram[NUM_ACTIVE_CHANNELS];
1011 
1012     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
1013     CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
1014 
1015     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1016     {
1017         CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
1018         CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel],         sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));
1019 
1020         int bins = num_levels[channel] - 1;
1021         CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel],  sizeof(CounterT) * bins));
1022         CubDebugExit(cudaMemset(d_histogram[channel], 0,                        sizeof(CounterT) * bins));
1023     }
1024 
1025     // Allocate CDP device arrays
1026     size_t          *d_temp_storage_bytes = NULL;
1027     cudaError_t     *d_cdp_error = NULL;
1028 
1029     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
1030     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
1031 
1032     // Allocate temporary storage
1033     void            *d_temp_storage = NULL;
1034     size_t          temp_storage_bytes = 0;
1035 
1036     Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
1037         1, d_temp_storage_bytes, d_cdp_error,
1038         d_temp_storage, temp_storage_bytes,
1039         d_samples,
1040         d_histogram,
1041         num_levels, d_levels,
1042         num_row_pixels, num_rows, row_stride_bytes,
1043         0, true);
1044 
1045     // Allocate temporary storage with "canary" zones
1046     int     canary_bytes    = 256;
1047     char    canary_token    = 9;
1048     char*   canary_zone     = new char[canary_bytes];
1049 
1050     memset(canary_zone, canary_token, canary_bytes);
1051     CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
1052     CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
1053 
1054     // Run warmup/correctness iteration
1055     Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
1056         1, d_temp_storage_bytes, d_cdp_error,
1057         ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
1058         d_samples,
1059         d_histogram,
1060         num_levels, d_levels,
1061         num_row_pixels, num_rows, row_stride_bytes,
1062         0, true);
1063 
1064     // Check canary zones
1065     int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
1066     AssertEquals(0, error);
1067     error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
1068     AssertEquals(0, error);
1069 
1070     // Flush any stdout/stderr
1071     CubDebugExit(cudaPeekAtLastError());
1072     CubDebugExit(cudaDeviceSynchronize());
1073     fflush(stdout);
1074     fflush(stderr);
1075 
1076     // Check for correctness (and display results, if specified)
1077     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1078     {
1079         int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
1080         printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
1081         error |= channel_error;
1082     }
1083 
1084     // Performance
1085     GpuTimer gpu_timer;
1086     gpu_timer.Start();
1087 
1088     Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
1089         g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
1090         ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
1091         d_samples,
1092         d_histogram,
1093         num_levels, d_levels,
1094         num_row_pixels, num_rows, row_stride_bytes,
1095         0, false);
1096 
1097     gpu_timer.Stop();
1098     float elapsed_millis = gpu_timer.ElapsedMillis();
1099 
1100     // Display performance
1101     if (g_timing_iterations > 0)
1102     {
1103         float avg_millis = elapsed_millis / g_timing_iterations;
1104         float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
1105         float giga_bandwidth = giga_rate * sizeof(SampleT);
1106         printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
1107             avg_millis,
1108             giga_rate,
1109             giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
1110             giga_rate / NUM_CHANNELS,
1111             giga_bandwidth);
1112     }
1113 
1114     printf("\n\n");
1115 
1116     // Cleanup
1117     if (h_samples) delete[] h_samples;
1118 
1119     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1120     {
1121         if (h_histogram[channel])
1122             delete[] h_histogram[channel];
1123 
1124         if (d_histogram[channel])
1125             CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
1126 
1127         if (d_levels[channel])
1128             CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
1129     }
1130 
1131     if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
1132     if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
1133     if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
1134     if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
1135 
1136     // Correctness asserts
1137     AssertEquals(0, error);
1138 }
1139 
1140 
1141 /**
1142  * Test histogram-even
1143  */
1144 template <
1145     Backend         BACKEND,
1146     typename        SampleT,
1147     int             NUM_CHANNELS,
1148     int             NUM_ACTIVE_CHANNELS,
1149     typename        CounterT,
1150     typename        LevelT,
1151     typename        OffsetT>
TestEven(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT max_level,int max_num_levels)1152 void TestEven(
1153     OffsetT         num_row_pixels,
1154     OffsetT         num_rows,
1155     OffsetT         row_stride_bytes,
1156     int             entropy_reduction,
1157     int             num_levels[NUM_ACTIVE_CHANNELS],
1158     LevelT          max_level,
1159     int             max_num_levels)
1160 {
1161     LevelT lower_level[NUM_ACTIVE_CHANNELS];
1162     LevelT upper_level[NUM_ACTIVE_CHANNELS];
1163 
1164     // Find smallest level increment
1165     int max_bins = max_num_levels - 1;
1166     LevelT min_level_increment = max_level / max_bins;
1167 
1168     // Set upper and lower levels for each channel
1169     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1170     {
1171         int num_bins = num_levels[channel] - 1;
1172         lower_level[channel] = (max_level - (num_bins * min_level_increment)) / 2;
1173         upper_level[channel] = (max_level + (num_bins * min_level_increment)) / 2;
1174     }
1175 
1176     // Test pointer-based samples
1177     TestEvenNative<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
1178         max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
1179 
1180     // Test iterator-based samples (CUB-only)
1181     TestEvenIterator<CUB, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
1182         max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
1183 }
1184 
1185 
1186 
1187 /**
1188  * Test histogram-range
1189  */
1190 template <
1191     Backend         BACKEND,
1192     typename        SampleT,
1193     int             NUM_CHANNELS,
1194     int             NUM_ACTIVE_CHANNELS,
1195     typename        CounterT,
1196     typename        LevelT,
1197     typename        OffsetT>
TestRange(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT max_level,int max_num_levels)1198 void TestRange(
1199     OffsetT         num_row_pixels,
1200     OffsetT         num_rows,
1201     OffsetT         row_stride_bytes,
1202     int             entropy_reduction,
1203     int             num_levels[NUM_ACTIVE_CHANNELS],
1204     LevelT          max_level,
1205     int             max_num_levels)
1206 {
1207     // Find smallest level increment
1208     int max_bins = max_num_levels - 1;
1209     LevelT min_level_increment = max_level / max_bins;
1210 
1211     LevelT* levels[NUM_ACTIVE_CHANNELS];
1212     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1213     {
1214         levels[channel] = new LevelT[num_levels[channel]];
1215 
1216         int num_bins = num_levels[channel] - 1;
1217         LevelT lower_level = (max_level - (num_bins * min_level_increment)) / 2;
1218 
1219         for (int level = 0; level < num_levels[channel]; ++level)
1220             levels[channel][level] = lower_level + (level * min_level_increment);
1221     }
1222 
1223     TestRange<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
1224         max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes);
1225 
1226     for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1227         delete[] levels[channel];
1228 
1229 }
1230 
1231 
1232 
1233 /**
1234  * Test different entrypoints
1235  */
1236 template <
1237     typename        SampleT,
1238     int             NUM_CHANNELS,
1239     int             NUM_ACTIVE_CHANNELS,
1240     typename        CounterT,
1241     typename        LevelT,
1242     typename        OffsetT>
Test(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,int entropy_reduction,int num_levels[NUM_ACTIVE_CHANNELS],LevelT max_level,int max_num_levels)1243 void Test(
1244     OffsetT         num_row_pixels,
1245     OffsetT         num_rows,
1246     OffsetT         row_stride_bytes,
1247     int             entropy_reduction,
1248     int             num_levels[NUM_ACTIVE_CHANNELS],
1249     LevelT          max_level,
1250     int             max_num_levels)
1251 {
1252     TestEven<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1253         num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
1254 
1255     TestRange<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1256         num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
1257 }
1258 
1259 
1260 /**
1261  * Test different number of levels
1262  */
1263 template <
1264     typename        SampleT,
1265     int             NUM_CHANNELS,
1266     int             NUM_ACTIVE_CHANNELS,
1267     typename        CounterT,
1268     typename        LevelT,
1269     typename        OffsetT>
Test(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,int entropy_reduction,LevelT max_level,int max_num_levels)1270 void Test(
1271     OffsetT         num_row_pixels,
1272     OffsetT         num_rows,
1273     OffsetT         row_stride_bytes,
1274     int             entropy_reduction,
1275     LevelT          max_level,
1276     int             max_num_levels)
1277 {
1278     int num_levels[NUM_ACTIVE_CHANNELS];
1279 
1280 // Unnecessary testing
1281 //    // All the same level
1282 //    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1283 //    {
1284 //        num_levels[channel] = max_num_levels;
1285 //    }
1286 //    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1287 //        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
1288 
1289     // All different levels
1290     num_levels[0] = max_num_levels;
1291     for (int channel = 1; channel < NUM_ACTIVE_CHANNELS; ++channel)
1292     {
1293         num_levels[channel] = (num_levels[channel - 1] / 2) + 1;
1294     }
1295     Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1296         num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
1297 }
1298 
1299 
1300 
1301 /**
1302  * Test different entropy-levels
1303  */
1304 template <
1305     typename        SampleT,
1306     int             NUM_CHANNELS,
1307     int             NUM_ACTIVE_CHANNELS,
1308     typename        CounterT,
1309     typename        LevelT,
1310     typename        OffsetT>
Test(OffsetT num_row_pixels,OffsetT num_rows,OffsetT row_stride_bytes,LevelT max_level,int max_num_levels)1311 void Test(
1312     OffsetT         num_row_pixels,
1313     OffsetT         num_rows,
1314     OffsetT         row_stride_bytes,
1315     LevelT          max_level,
1316     int             max_num_levels)
1317 {
1318     Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1319         num_row_pixels, num_rows, row_stride_bytes, 0,   max_level, max_num_levels);
1320 
1321     Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1322         num_row_pixels, num_rows, row_stride_bytes, -1,  max_level, max_num_levels);
1323 
1324     Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1325         num_row_pixels, num_rows, row_stride_bytes, 5,   max_level, max_num_levels);
1326 }
1327 
1328 
1329 /**
1330  * Test different row strides
1331  */
1332 template <
1333     typename        SampleT,
1334     int             NUM_CHANNELS,
1335     int             NUM_ACTIVE_CHANNELS,
1336     typename        CounterT,
1337     typename        LevelT,
1338     typename        OffsetT>
Test(OffsetT num_row_pixels,OffsetT num_rows,LevelT max_level,int max_num_levels)1339 void Test(
1340     OffsetT         num_row_pixels,
1341     OffsetT         num_rows,
1342     LevelT          max_level,
1343     int             max_num_levels)
1344 {
1345     OffsetT row_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT);
1346 
1347     // No padding
1348     Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1349         num_row_pixels, num_rows, row_stride_bytes, max_level, max_num_levels);
1350 
1351     // 13 samples padding
1352     Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1353         num_row_pixels, num_rows, row_stride_bytes + (13 * sizeof(SampleT)), max_level, max_num_levels);
1354 }
1355 
1356 
1357 /**
1358  * Test different problem sizes
1359  */
1360 template <
1361     typename        SampleT,
1362     int             NUM_CHANNELS,
1363     int             NUM_ACTIVE_CHANNELS,
1364     typename        CounterT,
1365     typename        LevelT,
1366     typename        OffsetT>
Test(LevelT max_level,int max_num_levels)1367 void Test(
1368     LevelT          max_level,
1369     int             max_num_levels)
1370 {
1371     // 0 row/col images
1372     Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1373         OffsetT(1920), OffsetT(0), max_level, max_num_levels);
1374     Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1375         OffsetT(0), OffsetT(0), max_level, max_num_levels);
1376 
1377     // 1080 image
1378     Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1379         OffsetT(1920), OffsetT(1080), max_level, max_num_levels);
1380 
1381     // Sample different aspect ratios sizes
1382     for (OffsetT rows = 1; rows < 1000000; rows *= 1000)
1383     {
1384         for (OffsetT cols = 1; cols < (1000000 / rows); cols *= 1000)
1385         {
1386             Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1387                 cols, rows, max_level, max_num_levels);
1388         }
1389     }
1390 
1391     // Randomly select linear problem size between 1:10,000,000
1392     unsigned int max_int = (unsigned int) -1;
1393     for (int i = 0; i < 4; ++i)
1394     {
1395         unsigned int num_items;
1396         RandomBits(num_items);
1397         num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
1398         num_items = CUB_MAX(1, num_items);
1399 
1400         Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
1401             OffsetT(num_items), 1, max_level, max_num_levels);
1402     }
1403 }
1404 
1405 
1406 
1407 /**
1408  * Test different channel interleavings (valid specialiation)
1409  */
1410 template <
1411     typename        SampleT,
1412     typename        CounterT,
1413     typename        LevelT,
1414     typename        OffsetT>
TestChannels(LevelT max_level,int max_num_levels,Int2Type<true>)1415 void TestChannels(
1416     LevelT          max_level,
1417     int             max_num_levels,
1418     Int2Type<true>  /*is_valid_tag*/)
1419 {
1420     Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
1421     Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
1422     Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
1423     Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
1424 }
1425 
1426 
1427 /**
1428  * Test different channel interleavings (invalid specialiation)
1429  */
1430 template <
1431     typename        SampleT,
1432     typename        CounterT,
1433     typename        LevelT,
1434     typename        OffsetT>
TestChannels(LevelT,int,Int2Type<false>)1435 void TestChannels(
1436     LevelT          /*max_level*/,
1437     int             /*max_num_levels*/,
1438     Int2Type<false> /*is_valid_tag*/)
1439 {}
1440 
1441 
1442 
1443 //---------------------------------------------------------------------
1444 // Main
1445 //---------------------------------------------------------------------
1446 
1447 
1448 
1449 
1450 /**
1451  * Main
1452  */
main(int argc,char ** argv)1453 int main(int argc, char** argv)
1454 {
1455     int num_row_pixels = -1;
1456     int entropy_reduction = 0;
1457     int num_rows = 1;
1458 
1459     // Initialize command line
1460     CommandLineArgs args(argc, argv);
1461     g_verbose = args.CheckCmdLineFlag("v");
1462     g_verbose_input = args.CheckCmdLineFlag("v2");
1463     args.GetCmdLineArgument("n", num_row_pixels);
1464 
1465     int row_stride_pixels = num_row_pixels;
1466 
1467     args.GetCmdLineArgument("rows", num_rows);
1468     args.GetCmdLineArgument("stride", row_stride_pixels);
1469     args.GetCmdLineArgument("i", g_timing_iterations);
1470     args.GetCmdLineArgument("repeat", g_repeat);
1471     args.GetCmdLineArgument("entropy", entropy_reduction);
1472 #if defined(QUICK_TEST) || defined(QUICKER_TEST)
1473     bool compare_npp = args.CheckCmdLineFlag("npp");
1474 #endif
1475 
1476 
1477     // Print usage
1478     if (args.CheckCmdLineFlag("help"))
1479     {
1480         printf("%s "
1481             "[--n=<pixels per row>] "
1482             "[--rows=<number of rows>] "
1483             "[--stride=<row stride in pixels>] "
1484             "[--i=<timing iterations>] "
1485             "[--device=<device-id>] "
1486             "[--repeat=<repetitions of entire test suite>]"
1487             "[--entropy=<entropy-reduction factor (default 0)>]"
1488             "[--v] "
1489             "[--cdp]"
1490             "[--npp]"
1491             "\n", argv[0]);
1492         exit(0);
1493     }
1494 
1495     // Initialize device
1496     CubDebugExit(args.DeviceInit());
1497 
1498     // Get ptx version
1499     int ptx_version = 0;
1500     CubDebugExit(PtxVersion(ptx_version));
1501 
1502     if (num_row_pixels < 0)
1503     {
1504         num_row_pixels      = 1920 * 1080;
1505         row_stride_pixels   = num_row_pixels;
1506     }
1507 
1508 #if defined(QUICKER_TEST)
1509 
1510     // Compile/run quick tests
1511     {
1512         // HistogramEven: unsigned char 256 bins
1513         typedef unsigned char       SampleT;
1514         typedef int                 LevelT;
1515 
1516         LevelT  max_level           = 256;
1517         int     num_levels[1]       = {257};
1518         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
1519 
1520         TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1521         // The NPP path doesn't compile as of 2020-06:
1522         // No Dispatch<int, int, NPP> specialization defined.
1523 //        if (compare_npp)
1524 //            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1525     }
1526 
1527     {
1528         // HistogramRange: signed char 256 bins
1529         typedef signed char         SampleT;
1530         typedef int                 LevelT;
1531 
1532         LevelT  max_level           = 256;
1533         int     num_levels[1]       = {257};
1534         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
1535 
1536         TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1537     }
1538 
1539 
1540 
1541 #elif defined(QUICK_TEST)
1542 
1543     // Compile/run quick tests
1544     {
1545         // HistogramEven: unsigned char 256 bins
1546         typedef unsigned char       SampleT;
1547         typedef int                 LevelT;
1548 
1549         LevelT  max_level           = 256;
1550         int     num_levels[1]       = {257};
1551         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
1552 
1553         TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1554         // The NPP path doesn't compile as of 2020-06:
1555         // No Dispatch<int, int, NPP> specialization defined.
1556 //        if (compare_npp)
1557 //            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1558     }
1559 
1560     {
1561         // HistogramEven: 4/4 multichannel Unsigned char 256 bins
1562         typedef unsigned char       SampleT;
1563         typedef int                 LevelT;
1564 
1565         LevelT  max_level           = 256;
1566         int     num_levels[4]       = {257, 257, 257, 257};
1567         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
1568 
1569         TestEven<CUB, SampleT, 4, 4, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1570     }
1571 
1572     {
1573         // HistogramEven: 3/4 multichannel Unsigned char 256 bins
1574         typedef unsigned char       SampleT;
1575         typedef int                 LevelT;
1576 
1577         LevelT  max_level           = 256;
1578         int     num_levels[3]       = {257, 257, 257};
1579         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
1580 
1581         TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1582         // The NPP path doesn't compile as of 2020-06:
1583         // No Dispatch<int, int, NPP> specialization defined.
1584 //        if (compare_npp)
1585 //            TestEven<NPP, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1586     }
1587 
1588     {
1589         // HistogramEven: short [0,1024] 256 bins
1590         typedef unsigned short      SampleT;
1591         typedef unsigned short      LevelT;
1592 
1593         LevelT  max_level           = 1024;
1594         int     num_levels[1]       = {257};
1595         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
1596 
1597         TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1598     }
1599 
1600     {
1601         // HistogramEven: float [0,1.0] 256 bins
1602         typedef float               SampleT;
1603         typedef float               LevelT;
1604 
1605         LevelT  max_level           = 1.0;
1606         int     num_levels[1]       = {257};
1607         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
1608 
1609         TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1610     }
1611 
1612     {
1613         // HistogramEven: 3/4 multichannel float [0,1.0] 256 bins
1614         typedef float               SampleT;
1615         typedef float               LevelT;
1616 
1617          LevelT  max_level           = 1.0;
1618          int     num_levels[3]       = {257, 257, 257};
1619          int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
1620 
1621          TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1622     }
1623 
1624     {
1625         // HistogramRange: signed char 256 bins
1626         typedef signed char         SampleT;
1627         typedef int                 LevelT;
1628 
1629         LevelT  max_level           = 256;
1630         int     num_levels[1]       = {257};
1631         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
1632 
1633         TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1634     }
1635 
1636     {
1637         // HistogramRange: 3/4 channel, unsigned char, varied bins (256, 128, 64)
1638         typedef unsigned char       SampleT;
1639         typedef int                 LevelT;
1640 
1641         LevelT  max_level           = 256;
1642         int     num_levels[3]       = {257, 129, 65};
1643         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
1644 
1645         TestRange<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1646     }
1647 
1648     if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
1649     {
1650         // HistogramEven: double [0,1.0] 64 bins
1651         typedef double              SampleT;
1652         typedef double              LevelT;
1653 
1654         LevelT  max_level           = 1.0;
1655         int     num_levels[1]       = {65};
1656         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
1657 
1658         TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1659     }
1660 
1661     {
1662         // HistogramEven: short [0,1024] 512 bins
1663         typedef unsigned short      SampleT;
1664         typedef unsigned short      LevelT;
1665 
1666         LevelT  max_level           = 1024;
1667         int     num_levels[1]       = {513};
1668         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
1669 
1670         TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
1671     }
1672 
1673 #else
1674 
1675     // Compile/run thorough tests
1676     for (int i = 0; i <= g_repeat; ++i)
1677     {
1678         TestChannels <unsigned char,    int, int,   int>(256,   256 + 1, Int2Type<true>());
1679         TestChannels <signed char,      int, int,   int>(256,   256 + 1, Int2Type<true>());
1680         TestChannels <unsigned short,   int, int,   int>(128,   128 + 1, Int2Type<true>());
1681         TestChannels <unsigned short,   int, int,   int>(8192,  8192 + 1, Int2Type<true>());
1682         TestChannels <float,            int, float, int>(1.0,   256 + 1, Int2Type<true>());
1683 
1684 		// Test down-conversion of size_t offsets to int
1685         TestChannels <unsigned char,    int, int,   long long>(256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>());
1686     }
1687 
1688 #endif
1689 
1690     return 0;
1691 }
1692 
1693