1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 
30 #pragma once
31 
32 #if defined(_WIN32) || defined(_WIN64)
33     #include <windows.h>
34     #undef small            // Windows is terrible for polluting macro namespace
35 #else
36     #include <sys/resource.h>
37 #endif
38 
39 #include <cuda_runtime.h>
40 
41 #include <stdio.h>
42 #include <float.h>
43 
44 #include <cmath>
45 #include <string>
46 #include <vector>
47 #include <sstream>
48 #include <iostream>
49 #include <limits>
50 
51 #include "mersenne.h"
52 #include "half.h"
53 
54 #include "cub/util_debug.cuh"
55 #include "cub/util_device.cuh"
56 #include "cub/util_type.cuh"
57 #include "cub/util_macro.cuh"
58 #include "cub/iterator/discard_output_iterator.cuh"
59 
60 /******************************************************************************
61  * Type conversion macros
62  ******************************************************************************/
63 
64 /**
65  * Return a value of type `T` with the same bitwise representation of `in`.
66  * Types `T` and `U` must be the same size.
67  */
68 template <typename T, typename U>
SafeBitCast(const U & in)69 T SafeBitCast(const U& in)
70 {
71   static_assert(sizeof(T) == sizeof(U), "Types must be same size.");
72   T out;
73   memcpy(&out, &in, sizeof(T));
74   return out;
75 }
76 
77 /******************************************************************************
78  * Assertion macros
79  ******************************************************************************/
80 
81 /**
82  * Assert equals
83  */
84 #define AssertEquals(a, b) if ((a) != (b)) { std::cerr << "\n(" << __FILE__ << ": " << __LINE__ << ")\n"; exit(1);}
85 
86 
87 /******************************************************************************
88  * Command-line parsing functionality
89  ******************************************************************************/
90 
91 /**
92  * Utility for parsing command line arguments
93  */
94 struct CommandLineArgs
95 {
96 
97     std::vector<std::string>    keys;
98     std::vector<std::string>    values;
99     std::vector<std::string>    args;
100     cudaDeviceProp              deviceProp;
101     float                       device_giga_bandwidth;
102     size_t                      device_free_physmem;
103     size_t                      device_total_physmem;
104 
105     /**
106      * Constructor
107      */
CommandLineArgsCommandLineArgs108     CommandLineArgs(int argc, char **argv) :
109         keys(10),
110         values(10)
111     {
112         using namespace std;
113 
114         // Initialize mersenne generator
115         unsigned int mersenne_init[4]=  {0x123, 0x234, 0x345, 0x456};
116         mersenne::init_by_array(mersenne_init, 4);
117 
118         for (int i = 1; i < argc; i++)
119         {
120             string arg = argv[i];
121 
122             if ((arg[0] != '-') || (arg[1] != '-'))
123             {
124                 args.push_back(arg);
125                 continue;
126             }
127 
128             string::size_type pos;
129             string key, val;
130             if ((pos = arg.find('=')) == string::npos) {
131                 key = string(arg, 2, arg.length() - 2);
132                 val = "";
133             } else {
134                 key = string(arg, 2, pos - 2);
135                 val = string(arg, pos + 1, arg.length() - 1);
136             }
137 
138             keys.push_back(key);
139             values.push_back(val);
140         }
141     }
142 
143 
144     /**
145      * Checks whether a flag "--<flag>" is present in the commandline
146      */
CheckCmdLineFlagCommandLineArgs147     bool CheckCmdLineFlag(const char* arg_name)
148     {
149         using namespace std;
150 
151         for (int i = 0; i < int(keys.size()); ++i)
152         {
153             if (keys[i] == string(arg_name))
154                 return true;
155         }
156         return false;
157     }
158 
159 
160     /**
161      * Returns number of naked (non-flag and non-key-value) commandline parameters
162      */
163     template <typename T>
NumNakedArgsCommandLineArgs164     int NumNakedArgs()
165     {
166         return args.size();
167     }
168 
169 
170     /**
171      * Returns the commandline parameter for a given index (not including flags)
172      */
173     template <typename T>
GetCmdLineArgumentCommandLineArgs174     void GetCmdLineArgument(int index, T &val)
175     {
176         using namespace std;
177         if (index < args.size()) {
178             istringstream str_stream(args[index]);
179             str_stream >> val;
180         }
181     }
182 
183     /**
184      * Returns the value specified for a given commandline parameter --<flag>=<value>
185      */
186     template <typename T>
GetCmdLineArgumentCommandLineArgs187     void GetCmdLineArgument(const char *arg_name, T &val)
188     {
189         using namespace std;
190 
191         for (int i = 0; i < int(keys.size()); ++i)
192         {
193             if (keys[i] == string(arg_name))
194             {
195                 istringstream str_stream(values[i]);
196                 str_stream >> val;
197             }
198         }
199     }
200 
201 
202     /**
203      * Returns the values specified for a given commandline parameter --<flag>=<value>,<value>*
204      */
205     template <typename T>
GetCmdLineArgumentsCommandLineArgs206     void GetCmdLineArguments(const char *arg_name, std::vector<T> &vals)
207     {
208         using namespace std;
209 
210         if (CheckCmdLineFlag(arg_name))
211         {
212             // Clear any default values
213             vals.clear();
214 
215             // Recover from multi-value string
216             for (int i = 0; i < keys.size(); ++i)
217             {
218                 if (keys[i] == string(arg_name))
219                 {
220                     string val_string(values[i]);
221                     istringstream str_stream(val_string);
222                     string::size_type old_pos = 0;
223                     string::size_type new_pos = 0;
224 
225                     // Iterate comma-separated values
226                     T val;
227                     while ((new_pos = val_string.find(',', old_pos)) != string::npos)
228                     {
229                         if (new_pos != old_pos)
230                         {
231                             str_stream.width(new_pos - old_pos);
232                             str_stream >> val;
233                             vals.push_back(val);
234                         }
235 
236                         // skip over comma
237                         str_stream.ignore(1);
238                         old_pos = new_pos + 1;
239                     }
240 
241                     // Read last value
242                     str_stream >> val;
243                     vals.push_back(val);
244                 }
245             }
246         }
247     }
248 
249 
250     /**
251      * The number of pairs parsed
252      */
ParsedArgcCommandLineArgs253     int ParsedArgc()
254     {
255         return (int) keys.size();
256     }
257 
258     /**
259      * Initialize device
260      */
261     cudaError_t DeviceInit(int dev = -1)
262     {
263         cudaError_t error = cudaSuccess;
264 
265         do
266         {
267             int deviceCount;
268             error = CubDebug(cudaGetDeviceCount(&deviceCount));
269             if (error) break;
270 
271             if (deviceCount == 0) {
272                 fprintf(stderr, "No devices supporting CUDA.\n");
273                 exit(1);
274             }
275             if (dev < 0)
276             {
277                 GetCmdLineArgument("device", dev);
278             }
279             if ((dev > deviceCount - 1) || (dev < 0))
280             {
281                 dev = 0;
282             }
283 
284             error = CubDebug(cudaSetDevice(dev));
285             if (error) break;
286 
287             CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem));
288 
289             int ptx_version = 0;
290             error = CubDebug(cub::PtxVersion(ptx_version));
291             if (error) break;
292 
293             error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev));
294             if (error) break;
295 
296             if (deviceProp.major < 1) {
297                 fprintf(stderr, "Device does not support CUDA.\n");
298                 exit(1);
299             }
300 
301             device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000;
302 
303             if (!CheckCmdLineFlag("quiet"))
304             {
305                 printf(
306                         "Using device %d: %s (PTX version %d, SM%d, %d SMs, "
307                         "%lld free / %lld total MB physmem, "
308                         "%.3f GB/s @ %d kHz mem clock, ECC %s)\n",
309                     dev,
310                     deviceProp.name,
311                     ptx_version,
312                     deviceProp.major * 100 + deviceProp.minor * 10,
313                     deviceProp.multiProcessorCount,
314                     (unsigned long long) device_free_physmem / 1024 / 1024,
315                     (unsigned long long) device_total_physmem / 1024 / 1024,
316                     device_giga_bandwidth,
317                     deviceProp.memoryClockRate,
318                     (deviceProp.ECCEnabled) ? "on" : "off");
319                 fflush(stdout);
320             }
321 
322         } while (0);
323 
324         return error;
325     }
326 };
327 
328 /******************************************************************************
329  * Random bits generator
330  ******************************************************************************/
331 
332 int g_num_rand_samples = 0;
333 
334 
335 template <typename T>
IsNaN(T)336 bool IsNaN(T /* val */) { return false; }
337 
338 template<>
339 __noinline__ bool IsNaN<float>(float val)
340 {
341   return std::isnan(val);
342 }
343 
344 template<>
345 __noinline__ bool IsNaN<float1>(float1 val)
346 {
347     return (IsNaN(val.x));
348 }
349 
350 template<>
351 __noinline__ bool IsNaN<float2>(float2 val)
352 {
353     return (IsNaN(val.y) || IsNaN(val.x));
354 }
355 
356 template<>
357 __noinline__ bool IsNaN<float3>(float3 val)
358 {
359     return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
360 }
361 
362 template<>
363 __noinline__ bool IsNaN<float4>(float4 val)
364 {
365     return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
366 }
367 
368 template<>
369 __noinline__ bool IsNaN<double>(double val)
370 {
371   return std::isnan(val);
372 }
373 
374 template<>
375 __noinline__ bool IsNaN<double1>(double1 val)
376 {
377     return (IsNaN(val.x));
378 }
379 
380 template<>
381 __noinline__ bool IsNaN<double2>(double2 val)
382 {
383     return (IsNaN(val.y) || IsNaN(val.x));
384 }
385 
386 template<>
387 __noinline__ bool IsNaN<double3>(double3 val)
388 {
389     return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
390 }
391 
392 template<>
393 __noinline__ bool IsNaN<double4>(double4 val)
394 {
395     return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
396 }
397 
398 
399 template<>
400 __noinline__ bool IsNaN<half_t>(half_t val)
401 {
402     const auto bits = SafeBitCast<unsigned short>(val);
403 
404     // commented bit is always true, leaving for documentation:
405     return (((bits >= 0x7C01) && (bits <= 0x7FFF)) ||
406         ((bits >= 0xFC01) /*&& (bits <= 0xFFFFFFFF)*/));
407 }
408 
409 
410 
411 /**
412  * Generates random keys.
413  *
414  * We always take the second-order byte from rand() because the higher-order
415  * bits returned by rand() are commonly considered more uniformly distributed
416  * than the lower-order bits.
417  *
418  * We can decrease the entropy level of keys by adopting the technique
419  * of Thearling and Smith in which keys are computed from the bitwise AND of
420  * multiple random samples:
421  *
422  * entropy_reduction    | Effectively-unique bits per key
423  * -----------------------------------------------------
424  * -1                   | 0
425  * 0                    | 32
426  * 1                    | 25.95 (81%)
427  * 2                    | 17.41 (54%)
428  * 3                    | 10.78 (34%)
429  * 4                    | 6.42 (20%)
430  * ...                  | ...
431  *
432  */
433 template <typename K>
434 void RandomBits(
435     K &key,
436     int entropy_reduction = 0,
437     int begin_bit = 0,
438     int end_bit = sizeof(K) * 8)
439 {
440     const int NUM_BYTES = sizeof(K);
441     const int WORD_BYTES = sizeof(unsigned int);
442     const int NUM_WORDS = (NUM_BYTES + WORD_BYTES - 1) / WORD_BYTES;
443 
444     unsigned int word_buff[NUM_WORDS];
445 
446     if (entropy_reduction == -1)
447     {
448         memset((void *) &key, 0, sizeof(key));
449         return;
450     }
451 
452     if (end_bit < 0)
453         end_bit = sizeof(K) * 8;
454 
455     while (true)
456     {
457         // Generate random word_buff
458         for (int j = 0; j < NUM_WORDS; j++)
459         {
460             int current_bit = j * WORD_BYTES * 8;
461 
462             unsigned int word = 0xffffffff;
463             word &= 0xffffffff << CUB_MAX(0, begin_bit - current_bit);
464             word &= 0xffffffff >> CUB_MAX(0, (current_bit + (WORD_BYTES * 8)) - end_bit);
465 
466             for (int i = 0; i <= entropy_reduction; i++)
467             {
468                 // Grab some of the higher bits from rand (better entropy, supposedly)
469                 word &= mersenne::genrand_int32();
470                 g_num_rand_samples++;
471             }
472 
473             word_buff[j] = word;
474         }
475 
476         memcpy(&key, word_buff, sizeof(K));
477 
478         K copy = key;
479         if (!IsNaN(copy))
480             break;          // avoids NaNs when generating random floating point numbers
481     }
482 }
483 
484 /// Randomly select number between [0:max)
485 template <typename T>
RandomValue(T max)486 T RandomValue(T max)
487 {
488     unsigned int bits;
489     unsigned int max_int = (unsigned int) -1;
490     do {
491         RandomBits(bits);
492     } while (bits == max_int);
493 
494     return (T) ((double(bits) / double(max_int)) * double(max));
495 }
496 
497 
498 /******************************************************************************
499  * Console printing utilities
500  ******************************************************************************/
501 
502 /**
503  * Helper for casting character types to integers for cout printing
504  */
505 template <typename T>
CoutCast(T val)506 T CoutCast(T val) { return val; }
507 
CoutCast(char val)508 int CoutCast(char val) { return val; }
509 
CoutCast(unsigned char val)510 int CoutCast(unsigned char val) { return val; }
511 
CoutCast(signed char val)512 int CoutCast(signed char val) { return val; }
513 
514 
515 
516 /******************************************************************************
517  * Test value initialization utilities
518  ******************************************************************************/
519 
520 /**
521  * Test problem generation options
522  */
523 enum GenMode
524 {
525     UNIFORM,            // Assign to '2', regardless of integer seed
526     INTEGER_SEED,       // Assign to integer seed
527     RANDOM,             // Assign to random, regardless of integer seed
528     RANDOM_BIT,         // Assign to randomly chosen 0 or 1, regardless of integer seed
529 };
530 
531 /**
532  * Initialize value
533  */
534 template <typename T>
535 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)
536 {
537     switch (gen_mode)
538     {
539 #if (CUB_PTX_ARCH == 0)
540     case RANDOM:
541         RandomBits(value);
542         break;
543     case RANDOM_BIT:
544         char c;
545         RandomBits(c, 0, 0, 1);
546         value = (c > 0) ? (T) 1 : (T) -1;
547         break;
548 #endif
549      case UNIFORM:
550         value = 2;
551         break;
552     case INTEGER_SEED:
553     default:
554          value = (T) index;
555         break;
556     }
557 }
558 
559 
560 /**
561  * Initialize value (bool)
562  */
563 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, int index = 0)
564 {
565     switch (gen_mode)
566     {
567 #if (CUB_PTX_ARCH == 0)
568     case RANDOM:
569     case RANDOM_BIT:
570         char c;
571         RandomBits(c, 0, 0, 1);
572         value = (c > 0);
573         break;
574 #endif
575      case UNIFORM:
576         value = true;
577         break;
578     case INTEGER_SEED:
579     default:
580         value = (index > 0);
581         break;
582     }
583 }
584 
585 
586 /**
587  * cub::NullType test initialization
588  */
589 __host__ __device__ __forceinline__ void InitValue(GenMode /* gen_mode */,
590 						   cub::NullType &/* value */,
591 						   int /* index */ = 0)
592 {}
593 
594 
595 /**
596  * cub::KeyValuePair<OffsetT, ValueT>test initialization
597  */
598 template <typename KeyT, typename ValueT>
599 __host__ __device__ __forceinline__ void InitValue(
600     GenMode                             gen_mode,
601     cub::KeyValuePair<KeyT, ValueT>&    value,
602     int                                 index = 0)
603 {
604     InitValue(gen_mode, value.value, index);
605 
606     // Assign corresponding flag with a likelihood of the last bit being set with entropy-reduction level 3
607     RandomBits(value.key, 3);
608     value.key = (value.key & 0x1);
609 }
610 
611 
612 
613 /******************************************************************************
614  * Comparison and ostream operators
615  ******************************************************************************/
616 
617 /**
618  * KeyValuePair ostream operator
619  */
620 template <typename Key, typename Value>
621 std::ostream& operator<<(std::ostream& os, const cub::KeyValuePair<Key, Value> &val)
622 {
623     os << '(' << CoutCast(val.key) << ',' << CoutCast(val.value) << ')';
624     return os;
625 }
626 
627 
628 /******************************************************************************
629  * Comparison and ostream operators for CUDA vector types
630  ******************************************************************************/
631 
632 /**
633  * Vector1 overloads
634  */
635 #define CUB_VEC_OVERLOAD_1(T, BaseT)                        \
636     /* Ostream output */                                    \
637     std::ostream& operator<<(                               \
638         std::ostream& os,                                   \
639         const T& val)                                       \
640     {                                                       \
641         os << '(' << CoutCast(val.x) << ')';                \
642         return os;                                          \
643     }                                                       \
644     /* Inequality */                                        \
645     __host__ __device__ __forceinline__ bool operator!=(    \
646         const T &a,                                         \
647         const T &b)                                         \
648     {                                                       \
649         return (a.x != b.x);                                \
650     }                                                       \
651     /* Equality */                                          \
652     __host__ __device__ __forceinline__ bool operator==(    \
653         const T &a,                                         \
654         const T &b)                                         \
655     {                                                       \
656         return (a.x == b.x);                                \
657     }                                                       \
658     /* Test initialization */                               \
659     __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
660     {                                                       \
661         InitValue(gen_mode, value.x, index);                \
662     }                                                       \
663     /* Max */                                               \
664     __host__ __device__ __forceinline__ bool operator>(     \
665         const T &a,                                         \
666         const T &b)                                         \
667     {                                                       \
668         return (a.x > b.x);                                 \
669     }                                                       \
670     /* Min */                                               \
671     __host__ __device__ __forceinline__ bool operator<(     \
672         const T &a,                                         \
673         const T &b)                                         \
674     {                                                       \
675         return (a.x < b.x);                                 \
676     }                                                       \
677     /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                       \
678     __host__ __device__ __forceinline__ T operator+(        \
679         T a,                                                \
680         T b)                                                \
681     {                                                       \
682         T retval = make_##T(a.x + b.x);                     \
683         return retval;                                      \
684     }                                                       \
685     namespace cub {                                         \
686     template<>                                              \
687     struct NumericTraits<T>                                 \
688     {                                                       \
689         static const Category CATEGORY = NOT_A_NUMBER;      \
690         enum {                                              \
691             PRIMITIVE       = false,                        \
692             NULL_TYPE       = false,                        \
693         };                                                  \
694         static T Max()                                      \
695         {                                                   \
696             T retval = {                                    \
697                 NumericTraits<BaseT>::Max()};               \
698             return retval;                                  \
699         }                                                   \
700         static T Lowest()                                   \
701         {                                                   \
702             T retval = {                                    \
703                 NumericTraits<BaseT>::Lowest()};            \
704             return retval;                                  \
705         }                                                   \
706     };                                                      \
707     } /* namespace std */
708 
709 
710 
711 /**
712  * Vector2 overloads
713  */
714 #define CUB_VEC_OVERLOAD_2(T, BaseT)                        \
715     /* Ostream output */                                    \
716     std::ostream& operator<<(                               \
717         std::ostream& os,                                   \
718         const T& val)                                       \
719     {                                                       \
720         os << '('                                           \
721             << CoutCast(val.x) << ','                       \
722             << CoutCast(val.y) << ')';                      \
723         return os;                                          \
724     }                                                       \
725     /* Inequality */                                        \
726     __host__ __device__ __forceinline__ bool operator!=(    \
727         const T &a,                                         \
728         const T &b)                                         \
729     {                                                       \
730         return (a.x != b.x) ||                              \
731             (a.y != b.y);                                   \
732     }                                                       \
733     /* Equality */                                          \
734     __host__ __device__ __forceinline__ bool operator==(    \
735         const T &a,                                         \
736         const T &b)                                         \
737     {                                                       \
738         return (a.x == b.x) &&                              \
739             (a.y == b.y);                                   \
740     }                                                       \
741     /* Test initialization */                               \
742     __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
743     {                                                       \
744         InitValue(gen_mode, value.x, index);                \
745         InitValue(gen_mode, value.y, index);                \
746     }                                                       \
747     /* Max */                                               \
748     __host__ __device__ __forceinline__ bool operator>(     \
749         const T &a,                                         \
750         const T &b)                                         \
751     {                                                       \
752         if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
753         return a.y > b.y;                                               \
754     }                                                       \
755     /* Min */                                               \
756     __host__ __device__ __forceinline__ bool operator<(     \
757         const T &a,                                         \
758         const T &b)                                         \
759     {                                                       \
760         if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
761         return a.y < b.y;                                               \
762     }                                                       \
763     /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                                         \
764     __host__ __device__ __forceinline__ T operator+(        \
765         T a,                                         \
766         T b)                                         \
767     {                                                       \
768         T retval = make_##T(                                        \
769             a.x + b.x,                                      \
770             a.y + b.y);                                     \
771         return retval;                                      \
772     }                                                       \
773     namespace cub {                                         \
774     template<>                                              \
775     struct NumericTraits<T>                                 \
776     {                                                       \
777         static const Category CATEGORY = NOT_A_NUMBER;      \
778         enum {                                              \
779             PRIMITIVE       = false,                        \
780             NULL_TYPE       = false,                        \
781         };                                                  \
782         static T Max()                                      \
783         {                                                   \
784             T retval = {                                    \
785                 NumericTraits<BaseT>::Max(),                \
786                 NumericTraits<BaseT>::Max()};               \
787             return retval;                                  \
788         }                                                   \
789         static T Lowest()                                   \
790         {                                                   \
791             T retval = {                                    \
792                 NumericTraits<BaseT>::Lowest(),             \
793                 NumericTraits<BaseT>::Lowest()};            \
794             return retval;                                  \
795         }                                                   \
796     };                                                      \
797     } /* namespace cub */
798 
799 
800 
801 /**
802  * Vector3 overloads
803  */
804 #define CUB_VEC_OVERLOAD_3(T, BaseT)                        \
805     /* Ostream output */                                    \
806     std::ostream& operator<<(                               \
807         std::ostream& os,                                   \
808         const T& val)                                       \
809     {                                                       \
810         os << '('                                           \
811             << CoutCast(val.x) << ','                       \
812             << CoutCast(val.y) << ','                       \
813             << CoutCast(val.z) << ')';                      \
814         return os;                                          \
815     }                                                       \
816     /* Inequality */                                        \
817     __host__ __device__ __forceinline__ bool operator!=(    \
818         const T &a,                                         \
819         const T &b)                                         \
820     {                                                       \
821         return (a.x != b.x) ||                              \
822             (a.y != b.y) ||                                 \
823             (a.z != b.z);                                   \
824     }                                                       \
825     /* Equality */                                          \
826     __host__ __device__ __forceinline__ bool operator==(    \
827         const T &a,                                         \
828         const T &b)                                         \
829     {                                                       \
830         return (a.x == b.x) &&                              \
831             (a.y == b.y) &&                                 \
832             (a.z == b.z);                                   \
833     }                                                       \
834     /* Test initialization */                               \
835     __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
836     {                                                       \
837         InitValue(gen_mode, value.x, index);                \
838         InitValue(gen_mode, value.y, index);                \
839         InitValue(gen_mode, value.z, index);                \
840     }                                                       \
841     /* Max */                                               \
842     __host__ __device__ __forceinline__ bool operator>(     \
843         const T &a,                                         \
844         const T &b)                                         \
845     {                                                       \
846         if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
847         if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
848         return a.z > b.z;                                               \
849     }                                                       \
850     /* Min */                                               \
851     __host__ __device__ __forceinline__ bool operator<(     \
852         const T &a,                                         \
853         const T &b)                                         \
854     {                                                       \
855         if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
856         if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
857         return a.z < b.z;                                               \
858     }                                                       \
859     /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                                         \
860     __host__ __device__ __forceinline__ T operator+(        \
861         T a,                                                \
862         T b)                                                \
863     {                                                       \
864         T retval = make_##T(                                        \
865             a.x + b.x,                                      \
866             a.y + b.y,                                      \
867             a.z + b.z);                                     \
868         return retval;                                      \
869     }                                                       \
870     namespace cub {                                         \
871     template<>                                              \
872     struct NumericTraits<T>                                 \
873     {                                                       \
874         static const Category CATEGORY = NOT_A_NUMBER;      \
875         enum {                                              \
876             PRIMITIVE       = false,                        \
877             NULL_TYPE       = false,                        \
878         };                                                  \
879         static T Max()                                      \
880         {                                                   \
881             T retval = {                                    \
882                 NumericTraits<BaseT>::Max(),                \
883                 NumericTraits<BaseT>::Max(),                \
884                 NumericTraits<BaseT>::Max()};               \
885             return retval;                                  \
886         }                                                   \
887         static T Lowest()                                   \
888         {                                                   \
889             T retval = {                                    \
890                 NumericTraits<BaseT>::Lowest(),             \
891                 NumericTraits<BaseT>::Lowest(),             \
892                 NumericTraits<BaseT>::Lowest()};            \
893             return retval;                                  \
894         }                                                   \
895     };                                                      \
896     } /* namespace cub */
897 
898 
899 /**
900  * Vector4 overloads
901  */
902 #define CUB_VEC_OVERLOAD_4(T, BaseT)                        \
903     /* Ostream output */                                    \
904     std::ostream& operator<<(                               \
905         std::ostream& os,                                   \
906         const T& val)                                       \
907     {                                                       \
908         os << '('                                           \
909             << CoutCast(val.x) << ','                       \
910             << CoutCast(val.y) << ','                       \
911             << CoutCast(val.z) << ','                       \
912             << CoutCast(val.w) << ')';                      \
913         return os;                                          \
914     }                                                       \
915     /* Inequality */                                        \
916     __host__ __device__ __forceinline__ bool operator!=(    \
917         const T &a,                                         \
918         const T &b)                                         \
919     {                                                       \
920         return (a.x != b.x) ||                              \
921             (a.y != b.y) ||                                 \
922             (a.z != b.z) ||                                 \
923             (a.w != b.w);                                   \
924     }                                                       \
925     /* Equality */                                          \
926     __host__ __device__ __forceinline__ bool operator==(    \
927         const T &a,                                         \
928         const T &b)                                         \
929     {                                                       \
930         return (a.x == b.x) &&                              \
931             (a.y == b.y) &&                                 \
932             (a.z == b.z) &&                                 \
933             (a.w == b.w);                                   \
934     }                                                       \
935     /* Test initialization */                               \
936     __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
937     {                                                       \
938         InitValue(gen_mode, value.x, index);                \
939         InitValue(gen_mode, value.y, index);                \
940         InitValue(gen_mode, value.z, index);                \
941         InitValue(gen_mode, value.w, index);                \
942     }                                                       \
943     /* Max */                                               \
944     __host__ __device__ __forceinline__ bool operator>(     \
945         const T &a,                                         \
946         const T &b)                                         \
947     {                                                       \
948         if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
949         if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
950         if (a.z > b.z) return true; else if (b.z > a.z) return false;   \
951         return a.w > b.w;                                               \
952     }                                                       \
953     /* Min */                                               \
954     __host__ __device__ __forceinline__ bool operator<(     \
955         const T &a,                                         \
956         const T &b)                                         \
957     {                                                       \
958         if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
959         if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
960         if (a.z < b.z) return true; else if (b.z < a.z) return false;   \
961         return a.w < b.w;                                               \
962     }                                                       \
963     /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                                         \
964     __host__ __device__ __forceinline__ T operator+(        \
965         T a,                                                \
966         T b)                                                \
967     {                                                       \
968         T retval = make_##T(                                        \
969             a.x + b.x,                                      \
970             a.y + b.y,                                      \
971             a.z + b.z,                                      \
972             a.w + b.w);                                     \
973         return retval;                                      \
974     }                                                       \
975     namespace cub {                                         \
976     template<>                                              \
977     struct NumericTraits<T>                                 \
978     {                                                       \
979         static const Category CATEGORY = NOT_A_NUMBER;      \
980         enum {                                              \
981             PRIMITIVE       = false,                        \
982             NULL_TYPE       = false,                        \
983         };                                                  \
984         static T Max()                                      \
985         {                                                   \
986             T retval = {                                    \
987                 NumericTraits<BaseT>::Max(),                \
988                 NumericTraits<BaseT>::Max(),                \
989                 NumericTraits<BaseT>::Max(),                \
990                 NumericTraits<BaseT>::Max()};               \
991             return retval;                                  \
992         }                                                   \
993         static T Lowest()                                   \
994         {                                                   \
995             T retval = {                                    \
996                 NumericTraits<BaseT>::Lowest(),             \
997                 NumericTraits<BaseT>::Lowest(),             \
998                 NumericTraits<BaseT>::Lowest(),             \
999                 NumericTraits<BaseT>::Lowest()};            \
1000             return retval;                                  \
1001         }                                                   \
1002     };                                                      \
1003     } /* namespace cub */
1004 
1005 /**
1006  * All vector overloads
1007  */
1008 #define CUB_VEC_OVERLOAD(COMPONENT_T, BaseT)                    \
1009     CUB_VEC_OVERLOAD_1(COMPONENT_T##1, BaseT)                   \
1010     CUB_VEC_OVERLOAD_2(COMPONENT_T##2, BaseT)                   \
1011     CUB_VEC_OVERLOAD_3(COMPONENT_T##3, BaseT)                   \
1012     CUB_VEC_OVERLOAD_4(COMPONENT_T##4, BaseT)
1013 
1014 /**
1015  * Define for types
1016  */
1017 CUB_VEC_OVERLOAD(char, char)
1018 CUB_VEC_OVERLOAD(short, short)
1019 CUB_VEC_OVERLOAD(int, int)
1020 CUB_VEC_OVERLOAD(long, long)
1021 CUB_VEC_OVERLOAD(longlong, long long)
1022 CUB_VEC_OVERLOAD(uchar, unsigned char)
1023 CUB_VEC_OVERLOAD(ushort, unsigned short)
1024 CUB_VEC_OVERLOAD(uint, unsigned int)
1025 CUB_VEC_OVERLOAD(ulong, unsigned long)
1026 CUB_VEC_OVERLOAD(ulonglong, unsigned long long)
1027 CUB_VEC_OVERLOAD(float, float)
1028 CUB_VEC_OVERLOAD(double, double)
1029 
1030 
1031 //---------------------------------------------------------------------
1032 // Complex data type TestFoo
1033 //---------------------------------------------------------------------
1034 
1035 /**
1036  * TestFoo complex data type
1037  */
1038 struct TestFoo
1039 {
1040     long long   x;
1041     int         y;
1042     short       z;
1043     char        w;
1044 
1045     // Factory
MakeTestFooTestFoo1046     static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w)
1047     {
1048         TestFoo retval = {x, y, z, w};
1049         return retval;
1050     }
1051 
1052     // Assignment from int operator
1053     __host__ __device__ __forceinline__ TestFoo& operator =(int b)
1054     {
1055         x = b;
1056         y = b;
1057         z = b;
1058         w = b;
1059         return *this;
1060     }
1061 
1062     // Summation operator
1063     __host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const
1064     {
1065         return MakeTestFoo(x + b.x, y + b.y, z + b.z, w + b.w);
1066     }
1067 
1068     // Inequality operator
1069     __host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const
1070     {
1071         return (x != b.x) || (y != b.y) || (z != b.z) || (w != b.w);
1072     }
1073 
1074     // Equality operator
1075     __host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const
1076     {
1077         return (x == b.x) && (y == b.y) && (z == b.z) && (w == b.w);
1078     }
1079 
1080     // Less than operator
1081     __host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const
1082     {
1083         if (x < b.x) return true; else if (b.x < x) return false;
1084         if (y < b.y) return true; else if (b.y < y) return false;
1085         if (z < b.z) return true; else if (b.z < z) return false;
1086         return w < b.w;
1087     }
1088 
1089     // Greater than operator
1090     __host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const
1091     {
1092         if (x > b.x) return true; else if (b.x > x) return false;
1093         if (y > b.y) return true; else if (b.y > y) return false;
1094         if (z > b.z) return true; else if (b.z > z) return false;
1095         return w > b.w;
1096     }
1097 
1098 };
1099 
1100 /**
1101  * TestFoo ostream operator
1102  */
1103 std::ostream& operator<<(std::ostream& os, const TestFoo& val)
1104 {
1105     os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')';
1106     return os;
1107 }
1108 
1109 /**
1110  * TestFoo test initialization
1111  */
1112 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, int index = 0)
1113 {
1114     InitValue(gen_mode, value.x, index);
1115     InitValue(gen_mode, value.y, index);
1116     InitValue(gen_mode, value.z, index);
1117     InitValue(gen_mode, value.w, index);
1118 }
1119 
1120 
1121 /// numeric_limits<TestFoo> specialization
1122 namespace cub {
1123 template<>
1124 struct NumericTraits<TestFoo>
1125 {
1126     static const Category CATEGORY = NOT_A_NUMBER;
1127     enum {
1128         PRIMITIVE       = false,
1129         NULL_TYPE       = false,
1130     };
1131     static TestFoo Max()
1132     {
1133         return TestFoo::MakeTestFoo(
1134             NumericTraits<long long>::Max(),
1135             NumericTraits<int>::Max(),
1136             NumericTraits<short>::Max(),
1137             NumericTraits<char>::Max());
1138     }
1139 
1140     static TestFoo Lowest()
1141     {
1142         return TestFoo::MakeTestFoo(
1143             NumericTraits<long long>::Lowest(),
1144             NumericTraits<int>::Lowest(),
1145             NumericTraits<short>::Lowest(),
1146             NumericTraits<char>::Lowest());
1147     }
1148 };
1149 } // namespace cub
1150 
1151 
1152 //---------------------------------------------------------------------
1153 // Complex data type TestBar (with optimizations for fence-free warp-synchrony)
1154 //---------------------------------------------------------------------
1155 
1156 /**
1157  * TestBar complex data type
1158  */
1159 struct TestBar
1160 {
1161     long long       x;
1162     int             y;
1163 
1164     // Constructor
1165     __host__ __device__ __forceinline__ TestBar() : x(0), y(0)
1166     {}
1167 
1168     // Constructor
1169     __host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b)
1170     {}
1171 
1172     // Constructor
1173     __host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y)
1174     {}
1175 
1176     // Assignment from int operator
1177     __host__ __device__ __forceinline__ TestBar& operator =(int b)
1178     {
1179         x = b;
1180         y = b;
1181         return *this;
1182     }
1183 
1184     // Summation operator
1185     __host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const
1186     {
1187         return TestBar(x + b.x, y + b.y);
1188     }
1189 
1190     // Inequality operator
1191     __host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const
1192     {
1193         return (x != b.x) || (y != b.y);
1194     }
1195 
1196     // Equality operator
1197     __host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const
1198     {
1199         return (x == b.x) && (y == b.y);
1200     }
1201 
1202     // Less than operator
1203     __host__ __device__ __forceinline__ bool operator <(const TestBar &b) const
1204     {
1205         if (x < b.x) return true; else if (b.x < x) return false;
1206         return y < b.y;
1207     }
1208 
1209     // Greater than operator
1210     __host__ __device__ __forceinline__ bool operator >(const TestBar &b) const
1211     {
1212         if (x > b.x) return true; else if (b.x > x) return false;
1213         return y > b.y;
1214     }
1215 
1216 };
1217 
1218 
1219 /**
1220  * TestBar ostream operator
1221  */
1222 std::ostream& operator<<(std::ostream& os, const TestBar& val)
1223 {
1224     os << '(' << val.x << ',' << val.y << ')';
1225     return os;
1226 }
1227 
1228 /**
1229  * TestBar test initialization
1230  */
1231 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, int index = 0)
1232 {
1233     InitValue(gen_mode, value.x, index);
1234     InitValue(gen_mode, value.y, index);
1235 }
1236 
1237 /// numeric_limits<TestBar> specialization
1238 namespace cub {
1239 template<>
1240 struct NumericTraits<TestBar>
1241 {
1242     static const Category CATEGORY = NOT_A_NUMBER;
1243     enum {
1244         PRIMITIVE       = false,
1245         NULL_TYPE       = false,
1246     };
1247     static TestBar Max()
1248     {
1249         return TestBar(
1250             NumericTraits<long long>::Max(),
1251             NumericTraits<int>::Max());
1252     }
1253 
1254     static TestBar Lowest()
1255     {
1256         return TestBar(
1257             NumericTraits<long long>::Lowest(),
1258             NumericTraits<int>::Lowest());
1259     }
1260 };
1261 } // namespace cub
1262 
1263 
1264 /******************************************************************************
1265  * Helper routines for list comparison and display
1266  ******************************************************************************/
1267 
1268 
1269 /**
1270  * Compares the equivalence of two arrays
1271  */
1272 template <typename S, typename T, typename OffsetT>
1273 int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true)
1274 {
1275     for (OffsetT i = 0; i < len; i++)
1276     {
1277         if (computed[i] != reference[i])
1278         {
1279             if (verbose) std::cout << "INCORRECT: [" << i << "]: "
1280                 << CoutCast(computed[i]) << " != "
1281                 << CoutCast(reference[i]);
1282             return 1;
1283         }
1284     }
1285     return 0;
1286 }
1287 
1288 
1289 /**
1290  * Compares the equivalence of two arrays
1291  */
1292 template <typename OffsetT>
1293 int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true)
1294 {
1295     for (OffsetT i = 0; i < len; i++)
1296     {
1297         if (computed[i] != reference[i])
1298         {
1299             float difference = std::abs(computed[i]-reference[i]);
1300             float fraction = difference / std::abs(reference[i]);
1301 
1302             if (fraction > 0.0001)
1303             {
1304                 if (verbose) std::cout << "INCORRECT: [" << i << "]: "
1305                     << "(computed) " << CoutCast(computed[i]) << " != "
1306                     << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
1307                 return 1;
1308             }
1309         }
1310     }
1311     return 0;
1312 }
1313 
1314 
1315 /**
1316  * Compares the equivalence of two arrays
1317  */
1318 template <typename OffsetT>
1319 int CompareResults(cub::NullType* computed, cub::NullType* reference, OffsetT len, bool verbose = true)
1320 {
1321     return 0;
1322 }
1323 
1324 /**
1325  * Compares the equivalence of two arrays
1326  */
1327 template <typename OffsetT>
1328 int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true)
1329 {
1330     for (OffsetT i = 0; i < len; i++)
1331     {
1332         if (computed[i] != reference[i])
1333         {
1334             double difference = std::abs(computed[i]-reference[i]);
1335             double fraction = difference / std::abs(reference[i]);
1336 
1337             if (fraction > 0.0001)
1338             {
1339                 if (verbose) std::cout << "INCORRECT: [" << i << "]: "
1340                     << CoutCast(computed[i]) << " != "
1341                     << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
1342                 return 1;
1343             }
1344         }
1345     }
1346     return 0;
1347 }
1348 
1349 
1350 /**
1351  * Verify the contents of a device array match those
1352  * of a host array
1353  */
1354 int CompareDeviceResults(
1355     cub::NullType */* h_reference */,
1356     cub::NullType */* d_data */,
1357     size_t /* num_items */,
1358     bool /* verbose */ = true,
1359     bool /* display_data */ = false)
1360 {
1361     return 0;
1362 }
1363 
1364 /**
1365  * Verify the contents of a device array match those
1366  * of a host array
1367  */
1368 template <typename S, typename OffsetT>
1369 int CompareDeviceResults(
1370     S *h_reference,
1371     cub::DiscardOutputIterator<OffsetT> d_data,
1372     size_t num_items,
1373     bool verbose = true,
1374     bool display_data = false)
1375 {
1376     return 0;
1377 }
1378 
1379 /**
1380  * Verify the contents of a device array match those
1381  * of a host array
1382  */
1383 template <typename S, typename T>
1384 int CompareDeviceResults(
1385     S *h_reference,
1386     T *d_data,
1387     size_t num_items,
1388     bool verbose = true,
1389     bool display_data = false)
1390 {
1391     // Allocate array on host
1392     T *h_data = (T*) malloc(num_items * sizeof(T));
1393 
1394     // Copy data back
1395     cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
1396 
1397     // Display data
1398     if (display_data)
1399     {
1400         printf("Reference:\n");
1401         for (int i = 0; i < int(num_items); i++)
1402         {
1403             std::cout << CoutCast(h_reference[i]) << ", ";
1404         }
1405         printf("\n\nComputed:\n");
1406         for (int i = 0; i < int(num_items); i++)
1407         {
1408             std::cout << CoutCast(h_data[i]) << ", ";
1409         }
1410         printf("\n\n");
1411     }
1412 
1413     // Check
1414     int retval = CompareResults(h_data, h_reference, num_items, verbose);
1415 
1416     // Cleanup
1417     if (h_data) free(h_data);
1418 
1419     return retval;
1420 }
1421 
1422 
1423 /**
1424  * Verify the contents of a device array match those
1425  * of a device array
1426  */
1427 template <typename T>
1428 int CompareDeviceDeviceResults(
1429     T *d_reference,
1430     T *d_data,
1431     size_t num_items,
1432     bool verbose = true,
1433     bool display_data = false)
1434 {
1435     // Allocate array on host
1436     T *h_reference = (T*) malloc(num_items * sizeof(T));
1437     T *h_data = (T*) malloc(num_items * sizeof(T));
1438 
1439     // Copy data back
1440     cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
1441     cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
1442 
1443     // Display data
1444     if (display_data) {
1445         printf("Reference:\n");
1446         for (int i = 0; i < num_items; i++)
1447         {
1448             std::cout << CoutCast(h_reference[i]) << ", ";
1449         }
1450         printf("\n\nComputed:\n");
1451         for (int i = 0; i < num_items; i++)
1452         {
1453             std::cout << CoutCast(h_data[i]) << ", ";
1454         }
1455         printf("\n\n");
1456     }
1457 
1458     // Check
1459     int retval = CompareResults(h_data, h_reference, num_items, verbose);
1460 
1461     // Cleanup
1462     if (h_reference) free(h_reference);
1463     if (h_data) free(h_data);
1464 
1465     return retval;
1466 }
1467 
1468 
1469 /**
1470  * Print the contents of a host array
1471  */
1472 void DisplayResults(
1473     cub::NullType   */* h_data */,
1474     size_t          /* num_items */)
1475 {}
1476 
1477 
1478 /**
1479  * Print the contents of a host array
1480  */
1481 template <typename InputIteratorT>
1482 void DisplayResults(
1483     InputIteratorT h_data,
1484     size_t num_items)
1485 {
1486     // Display data
1487     for (int i = 0; i < int(num_items); i++)
1488     {
1489         std::cout << CoutCast(h_data[i]) << ", ";
1490     }
1491     printf("\n");
1492 }
1493 
1494 
1495 /**
1496  * Print the contents of a device array
1497  */
1498 template <typename T>
1499 void DisplayDeviceResults(
1500     T *d_data,
1501     size_t num_items)
1502 {
1503     // Allocate array on host
1504     T *h_data = (T*) malloc(num_items * sizeof(T));
1505 
1506     // Copy data back
1507     cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
1508 
1509     DisplayResults(h_data, num_items);
1510 
1511     // Cleanup
1512     if (h_data) free(h_data);
1513 }
1514 
1515 
1516 /******************************************************************************
1517  * Segment descriptor generation
1518  ******************************************************************************/
1519 
1520 /**
1521  * Initialize segments
1522  */
1523 void InitializeSegments(
1524     int     num_items,
1525     int     num_segments,
1526     int     *h_segment_offsets,
1527     bool    verbose = false)
1528 {
1529     if (num_segments <= 0)
1530         return;
1531 
1532     unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments;
1533     int offset = 0;
1534     for (int i = 0; i < num_segments; ++i)
1535     {
1536         h_segment_offsets[i] = offset;
1537 
1538         unsigned int segment_length = RandomValue((expected_segment_length * 2) + 1);
1539         offset += segment_length;
1540         offset = CUB_MIN(offset, num_items);
1541     }
1542     h_segment_offsets[num_segments] = num_items;
1543 
1544     if (verbose)
1545     {
1546         printf("Segment offsets: ");
1547         DisplayResults(h_segment_offsets, num_segments + 1);
1548     }
1549 }
1550 
1551 
1552 /******************************************************************************
1553  * Timing
1554  ******************************************************************************/
1555 
1556 
1557 struct CpuTimer
1558 {
1559 #if defined(_WIN32) || defined(_WIN64)
1560 
1561     LARGE_INTEGER ll_freq;
1562     LARGE_INTEGER ll_start;
1563     LARGE_INTEGER ll_stop;
1564 
1565     CpuTimer()
1566     {
1567         QueryPerformanceFrequency(&ll_freq);
1568     }
1569 
1570     void Start()
1571     {
1572         QueryPerformanceCounter(&ll_start);
1573     }
1574 
1575     void Stop()
1576     {
1577         QueryPerformanceCounter(&ll_stop);
1578     }
1579 
1580     float ElapsedMillis()
1581     {
1582         double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart);
1583         double stop  = double(ll_stop.QuadPart) / double(ll_freq.QuadPart);
1584 
1585         return float((stop - start) * 1000);
1586     }
1587 
1588 #else
1589 
1590     rusage start;
1591     rusage stop;
1592 
1593     void Start()
1594     {
1595         getrusage(RUSAGE_SELF, &start);
1596     }
1597 
1598     void Stop()
1599     {
1600         getrusage(RUSAGE_SELF, &stop);
1601     }
1602 
1603     float ElapsedMillis()
1604     {
1605         float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec;
1606         float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec;
1607 
1608         return (sec * 1000) + (usec / 1000);
1609     }
1610 
1611 #endif
1612 };
1613 
1614 struct GpuTimer
1615 {
1616     cudaEvent_t start;
1617     cudaEvent_t stop;
1618 
1619     GpuTimer()
1620     {
1621         cudaEventCreate(&start);
1622         cudaEventCreate(&stop);
1623     }
1624 
1625     ~GpuTimer()
1626     {
1627         cudaEventDestroy(start);
1628         cudaEventDestroy(stop);
1629     }
1630 
1631     void Start()
1632     {
1633         cudaEventRecord(start, 0);
1634     }
1635 
1636     void Stop()
1637     {
1638         cudaEventRecord(stop, 0);
1639     }
1640 
1641     float ElapsedMillis()
1642     {
1643         float elapsed;
1644         cudaEventSynchronize(stop);
1645         cudaEventElapsedTime(&elapsed, start, stop);
1646         return elapsed;
1647     }
1648 };
1649