1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29
30 #pragma once
31
32 #if defined(_WIN32) || defined(_WIN64)
33 #include <windows.h>
34 #undef small // Windows is terrible for polluting macro namespace
35 #else
36 #include <sys/resource.h>
37 #endif
38
39 #include <cuda_runtime.h>
40
41 #include <stdio.h>
42 #include <float.h>
43
44 #include <cmath>
45 #include <string>
46 #include <vector>
47 #include <sstream>
48 #include <iostream>
49 #include <limits>
50
51 #include "mersenne.h"
52 #include "half.h"
53
54 #include "cub/util_debug.cuh"
55 #include "cub/util_device.cuh"
56 #include "cub/util_type.cuh"
57 #include "cub/util_macro.cuh"
58 #include "cub/iterator/discard_output_iterator.cuh"
59
60 /******************************************************************************
61 * Type conversion macros
62 ******************************************************************************/
63
64 /**
65 * Return a value of type `T` with the same bitwise representation of `in`.
66 * Types `T` and `U` must be the same size.
67 */
68 template <typename T, typename U>
SafeBitCast(const U & in)69 T SafeBitCast(const U& in)
70 {
71 static_assert(sizeof(T) == sizeof(U), "Types must be same size.");
72 T out;
73 memcpy(&out, &in, sizeof(T));
74 return out;
75 }
76
77 /******************************************************************************
78 * Assertion macros
79 ******************************************************************************/
80
81 /**
82 * Assert equals
83 */
84 #define AssertEquals(a, b) if ((a) != (b)) { std::cerr << "\n(" << __FILE__ << ": " << __LINE__ << ")\n"; exit(1);}
85
86
87 /******************************************************************************
88 * Command-line parsing functionality
89 ******************************************************************************/
90
91 /**
92 * Utility for parsing command line arguments
93 */
94 struct CommandLineArgs
95 {
96
97 std::vector<std::string> keys;
98 std::vector<std::string> values;
99 std::vector<std::string> args;
100 cudaDeviceProp deviceProp;
101 float device_giga_bandwidth;
102 size_t device_free_physmem;
103 size_t device_total_physmem;
104
105 /**
106 * Constructor
107 */
CommandLineArgsCommandLineArgs108 CommandLineArgs(int argc, char **argv) :
109 keys(10),
110 values(10)
111 {
112 using namespace std;
113
114 // Initialize mersenne generator
115 unsigned int mersenne_init[4]= {0x123, 0x234, 0x345, 0x456};
116 mersenne::init_by_array(mersenne_init, 4);
117
118 for (int i = 1; i < argc; i++)
119 {
120 string arg = argv[i];
121
122 if ((arg[0] != '-') || (arg[1] != '-'))
123 {
124 args.push_back(arg);
125 continue;
126 }
127
128 string::size_type pos;
129 string key, val;
130 if ((pos = arg.find('=')) == string::npos) {
131 key = string(arg, 2, arg.length() - 2);
132 val = "";
133 } else {
134 key = string(arg, 2, pos - 2);
135 val = string(arg, pos + 1, arg.length() - 1);
136 }
137
138 keys.push_back(key);
139 values.push_back(val);
140 }
141 }
142
143
144 /**
145 * Checks whether a flag "--<flag>" is present in the commandline
146 */
CheckCmdLineFlagCommandLineArgs147 bool CheckCmdLineFlag(const char* arg_name)
148 {
149 using namespace std;
150
151 for (int i = 0; i < int(keys.size()); ++i)
152 {
153 if (keys[i] == string(arg_name))
154 return true;
155 }
156 return false;
157 }
158
159
160 /**
161 * Returns number of naked (non-flag and non-key-value) commandline parameters
162 */
163 template <typename T>
NumNakedArgsCommandLineArgs164 int NumNakedArgs()
165 {
166 return args.size();
167 }
168
169
170 /**
171 * Returns the commandline parameter for a given index (not including flags)
172 */
173 template <typename T>
GetCmdLineArgumentCommandLineArgs174 void GetCmdLineArgument(int index, T &val)
175 {
176 using namespace std;
177 if (index < args.size()) {
178 istringstream str_stream(args[index]);
179 str_stream >> val;
180 }
181 }
182
183 /**
184 * Returns the value specified for a given commandline parameter --<flag>=<value>
185 */
186 template <typename T>
GetCmdLineArgumentCommandLineArgs187 void GetCmdLineArgument(const char *arg_name, T &val)
188 {
189 using namespace std;
190
191 for (int i = 0; i < int(keys.size()); ++i)
192 {
193 if (keys[i] == string(arg_name))
194 {
195 istringstream str_stream(values[i]);
196 str_stream >> val;
197 }
198 }
199 }
200
201
202 /**
203 * Returns the values specified for a given commandline parameter --<flag>=<value>,<value>*
204 */
205 template <typename T>
GetCmdLineArgumentsCommandLineArgs206 void GetCmdLineArguments(const char *arg_name, std::vector<T> &vals)
207 {
208 using namespace std;
209
210 if (CheckCmdLineFlag(arg_name))
211 {
212 // Clear any default values
213 vals.clear();
214
215 // Recover from multi-value string
216 for (int i = 0; i < keys.size(); ++i)
217 {
218 if (keys[i] == string(arg_name))
219 {
220 string val_string(values[i]);
221 istringstream str_stream(val_string);
222 string::size_type old_pos = 0;
223 string::size_type new_pos = 0;
224
225 // Iterate comma-separated values
226 T val;
227 while ((new_pos = val_string.find(',', old_pos)) != string::npos)
228 {
229 if (new_pos != old_pos)
230 {
231 str_stream.width(new_pos - old_pos);
232 str_stream >> val;
233 vals.push_back(val);
234 }
235
236 // skip over comma
237 str_stream.ignore(1);
238 old_pos = new_pos + 1;
239 }
240
241 // Read last value
242 str_stream >> val;
243 vals.push_back(val);
244 }
245 }
246 }
247 }
248
249
250 /**
251 * The number of pairs parsed
252 */
ParsedArgcCommandLineArgs253 int ParsedArgc()
254 {
255 return (int) keys.size();
256 }
257
258 /**
259 * Initialize device
260 */
261 cudaError_t DeviceInit(int dev = -1)
262 {
263 cudaError_t error = cudaSuccess;
264
265 do
266 {
267 int deviceCount;
268 error = CubDebug(cudaGetDeviceCount(&deviceCount));
269 if (error) break;
270
271 if (deviceCount == 0) {
272 fprintf(stderr, "No devices supporting CUDA.\n");
273 exit(1);
274 }
275 if (dev < 0)
276 {
277 GetCmdLineArgument("device", dev);
278 }
279 if ((dev > deviceCount - 1) || (dev < 0))
280 {
281 dev = 0;
282 }
283
284 error = CubDebug(cudaSetDevice(dev));
285 if (error) break;
286
287 CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem));
288
289 int ptx_version = 0;
290 error = CubDebug(cub::PtxVersion(ptx_version));
291 if (error) break;
292
293 error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev));
294 if (error) break;
295
296 if (deviceProp.major < 1) {
297 fprintf(stderr, "Device does not support CUDA.\n");
298 exit(1);
299 }
300
301 device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000;
302
303 if (!CheckCmdLineFlag("quiet"))
304 {
305 printf(
306 "Using device %d: %s (PTX version %d, SM%d, %d SMs, "
307 "%lld free / %lld total MB physmem, "
308 "%.3f GB/s @ %d kHz mem clock, ECC %s)\n",
309 dev,
310 deviceProp.name,
311 ptx_version,
312 deviceProp.major * 100 + deviceProp.minor * 10,
313 deviceProp.multiProcessorCount,
314 (unsigned long long) device_free_physmem / 1024 / 1024,
315 (unsigned long long) device_total_physmem / 1024 / 1024,
316 device_giga_bandwidth,
317 deviceProp.memoryClockRate,
318 (deviceProp.ECCEnabled) ? "on" : "off");
319 fflush(stdout);
320 }
321
322 } while (0);
323
324 return error;
325 }
326 };
327
328 /******************************************************************************
329 * Random bits generator
330 ******************************************************************************/
331
332 int g_num_rand_samples = 0;
333
334
335 template <typename T>
IsNaN(T)336 bool IsNaN(T /* val */) { return false; }
337
338 template<>
339 __noinline__ bool IsNaN<float>(float val)
340 {
341 return std::isnan(val);
342 }
343
344 template<>
345 __noinline__ bool IsNaN<float1>(float1 val)
346 {
347 return (IsNaN(val.x));
348 }
349
350 template<>
351 __noinline__ bool IsNaN<float2>(float2 val)
352 {
353 return (IsNaN(val.y) || IsNaN(val.x));
354 }
355
356 template<>
357 __noinline__ bool IsNaN<float3>(float3 val)
358 {
359 return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
360 }
361
362 template<>
363 __noinline__ bool IsNaN<float4>(float4 val)
364 {
365 return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
366 }
367
368 template<>
369 __noinline__ bool IsNaN<double>(double val)
370 {
371 return std::isnan(val);
372 }
373
374 template<>
375 __noinline__ bool IsNaN<double1>(double1 val)
376 {
377 return (IsNaN(val.x));
378 }
379
380 template<>
381 __noinline__ bool IsNaN<double2>(double2 val)
382 {
383 return (IsNaN(val.y) || IsNaN(val.x));
384 }
385
386 template<>
387 __noinline__ bool IsNaN<double3>(double3 val)
388 {
389 return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
390 }
391
392 template<>
393 __noinline__ bool IsNaN<double4>(double4 val)
394 {
395 return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
396 }
397
398
399 template<>
400 __noinline__ bool IsNaN<half_t>(half_t val)
401 {
402 const auto bits = SafeBitCast<unsigned short>(val);
403
404 // commented bit is always true, leaving for documentation:
405 return (((bits >= 0x7C01) && (bits <= 0x7FFF)) ||
406 ((bits >= 0xFC01) /*&& (bits <= 0xFFFFFFFF)*/));
407 }
408
409
410
411 /**
412 * Generates random keys.
413 *
414 * We always take the second-order byte from rand() because the higher-order
415 * bits returned by rand() are commonly considered more uniformly distributed
416 * than the lower-order bits.
417 *
418 * We can decrease the entropy level of keys by adopting the technique
419 * of Thearling and Smith in which keys are computed from the bitwise AND of
420 * multiple random samples:
421 *
422 * entropy_reduction | Effectively-unique bits per key
423 * -----------------------------------------------------
424 * -1 | 0
425 * 0 | 32
426 * 1 | 25.95 (81%)
427 * 2 | 17.41 (54%)
428 * 3 | 10.78 (34%)
429 * 4 | 6.42 (20%)
430 * ... | ...
431 *
432 */
433 template <typename K>
434 void RandomBits(
435 K &key,
436 int entropy_reduction = 0,
437 int begin_bit = 0,
438 int end_bit = sizeof(K) * 8)
439 {
440 const int NUM_BYTES = sizeof(K);
441 const int WORD_BYTES = sizeof(unsigned int);
442 const int NUM_WORDS = (NUM_BYTES + WORD_BYTES - 1) / WORD_BYTES;
443
444 unsigned int word_buff[NUM_WORDS];
445
446 if (entropy_reduction == -1)
447 {
448 memset((void *) &key, 0, sizeof(key));
449 return;
450 }
451
452 if (end_bit < 0)
453 end_bit = sizeof(K) * 8;
454
455 while (true)
456 {
457 // Generate random word_buff
458 for (int j = 0; j < NUM_WORDS; j++)
459 {
460 int current_bit = j * WORD_BYTES * 8;
461
462 unsigned int word = 0xffffffff;
463 word &= 0xffffffff << CUB_MAX(0, begin_bit - current_bit);
464 word &= 0xffffffff >> CUB_MAX(0, (current_bit + (WORD_BYTES * 8)) - end_bit);
465
466 for (int i = 0; i <= entropy_reduction; i++)
467 {
468 // Grab some of the higher bits from rand (better entropy, supposedly)
469 word &= mersenne::genrand_int32();
470 g_num_rand_samples++;
471 }
472
473 word_buff[j] = word;
474 }
475
476 memcpy(&key, word_buff, sizeof(K));
477
478 K copy = key;
479 if (!IsNaN(copy))
480 break; // avoids NaNs when generating random floating point numbers
481 }
482 }
483
484 /// Randomly select number between [0:max)
485 template <typename T>
RandomValue(T max)486 T RandomValue(T max)
487 {
488 unsigned int bits;
489 unsigned int max_int = (unsigned int) -1;
490 do {
491 RandomBits(bits);
492 } while (bits == max_int);
493
494 return (T) ((double(bits) / double(max_int)) * double(max));
495 }
496
497
498 /******************************************************************************
499 * Console printing utilities
500 ******************************************************************************/
501
502 /**
503 * Helper for casting character types to integers for cout printing
504 */
505 template <typename T>
CoutCast(T val)506 T CoutCast(T val) { return val; }
507
CoutCast(char val)508 int CoutCast(char val) { return val; }
509
CoutCast(unsigned char val)510 int CoutCast(unsigned char val) { return val; }
511
CoutCast(signed char val)512 int CoutCast(signed char val) { return val; }
513
514
515
516 /******************************************************************************
517 * Test value initialization utilities
518 ******************************************************************************/
519
520 /**
521 * Test problem generation options
522 */
523 enum GenMode
524 {
525 UNIFORM, // Assign to '2', regardless of integer seed
526 INTEGER_SEED, // Assign to integer seed
527 RANDOM, // Assign to random, regardless of integer seed
528 RANDOM_BIT, // Assign to randomly chosen 0 or 1, regardless of integer seed
529 };
530
531 /**
532 * Initialize value
533 */
534 template <typename T>
535 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)
536 {
537 switch (gen_mode)
538 {
539 #if (CUB_PTX_ARCH == 0)
540 case RANDOM:
541 RandomBits(value);
542 break;
543 case RANDOM_BIT:
544 char c;
545 RandomBits(c, 0, 0, 1);
546 value = (c > 0) ? (T) 1 : (T) -1;
547 break;
548 #endif
549 case UNIFORM:
550 value = 2;
551 break;
552 case INTEGER_SEED:
553 default:
554 value = (T) index;
555 break;
556 }
557 }
558
559
560 /**
561 * Initialize value (bool)
562 */
563 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, int index = 0)
564 {
565 switch (gen_mode)
566 {
567 #if (CUB_PTX_ARCH == 0)
568 case RANDOM:
569 case RANDOM_BIT:
570 char c;
571 RandomBits(c, 0, 0, 1);
572 value = (c > 0);
573 break;
574 #endif
575 case UNIFORM:
576 value = true;
577 break;
578 case INTEGER_SEED:
579 default:
580 value = (index > 0);
581 break;
582 }
583 }
584
585
586 /**
587 * cub::NullType test initialization
588 */
589 __host__ __device__ __forceinline__ void InitValue(GenMode /* gen_mode */,
590 cub::NullType &/* value */,
591 int /* index */ = 0)
592 {}
593
594
595 /**
596 * cub::KeyValuePair<OffsetT, ValueT>test initialization
597 */
598 template <typename KeyT, typename ValueT>
599 __host__ __device__ __forceinline__ void InitValue(
600 GenMode gen_mode,
601 cub::KeyValuePair<KeyT, ValueT>& value,
602 int index = 0)
603 {
604 InitValue(gen_mode, value.value, index);
605
606 // Assign corresponding flag with a likelihood of the last bit being set with entropy-reduction level 3
607 RandomBits(value.key, 3);
608 value.key = (value.key & 0x1);
609 }
610
611
612
613 /******************************************************************************
614 * Comparison and ostream operators
615 ******************************************************************************/
616
617 /**
618 * KeyValuePair ostream operator
619 */
620 template <typename Key, typename Value>
621 std::ostream& operator<<(std::ostream& os, const cub::KeyValuePair<Key, Value> &val)
622 {
623 os << '(' << CoutCast(val.key) << ',' << CoutCast(val.value) << ')';
624 return os;
625 }
626
627
628 /******************************************************************************
629 * Comparison and ostream operators for CUDA vector types
630 ******************************************************************************/
631
632 /**
633 * Vector1 overloads
634 */
635 #define CUB_VEC_OVERLOAD_1(T, BaseT) \
636 /* Ostream output */ \
637 std::ostream& operator<<( \
638 std::ostream& os, \
639 const T& val) \
640 { \
641 os << '(' << CoutCast(val.x) << ')'; \
642 return os; \
643 } \
644 /* Inequality */ \
645 __host__ __device__ __forceinline__ bool operator!=( \
646 const T &a, \
647 const T &b) \
648 { \
649 return (a.x != b.x); \
650 } \
651 /* Equality */ \
652 __host__ __device__ __forceinline__ bool operator==( \
653 const T &a, \
654 const T &b) \
655 { \
656 return (a.x == b.x); \
657 } \
658 /* Test initialization */ \
659 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \
660 { \
661 InitValue(gen_mode, value.x, index); \
662 } \
663 /* Max */ \
664 __host__ __device__ __forceinline__ bool operator>( \
665 const T &a, \
666 const T &b) \
667 { \
668 return (a.x > b.x); \
669 } \
670 /* Min */ \
671 __host__ __device__ __forceinline__ bool operator<( \
672 const T &a, \
673 const T &b) \
674 { \
675 return (a.x < b.x); \
676 } \
677 /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \
678 __host__ __device__ __forceinline__ T operator+( \
679 T a, \
680 T b) \
681 { \
682 T retval = make_##T(a.x + b.x); \
683 return retval; \
684 } \
685 namespace cub { \
686 template<> \
687 struct NumericTraits<T> \
688 { \
689 static const Category CATEGORY = NOT_A_NUMBER; \
690 enum { \
691 PRIMITIVE = false, \
692 NULL_TYPE = false, \
693 }; \
694 static T Max() \
695 { \
696 T retval = { \
697 NumericTraits<BaseT>::Max()}; \
698 return retval; \
699 } \
700 static T Lowest() \
701 { \
702 T retval = { \
703 NumericTraits<BaseT>::Lowest()}; \
704 return retval; \
705 } \
706 }; \
707 } /* namespace std */
708
709
710
711 /**
712 * Vector2 overloads
713 */
714 #define CUB_VEC_OVERLOAD_2(T, BaseT) \
715 /* Ostream output */ \
716 std::ostream& operator<<( \
717 std::ostream& os, \
718 const T& val) \
719 { \
720 os << '(' \
721 << CoutCast(val.x) << ',' \
722 << CoutCast(val.y) << ')'; \
723 return os; \
724 } \
725 /* Inequality */ \
726 __host__ __device__ __forceinline__ bool operator!=( \
727 const T &a, \
728 const T &b) \
729 { \
730 return (a.x != b.x) || \
731 (a.y != b.y); \
732 } \
733 /* Equality */ \
734 __host__ __device__ __forceinline__ bool operator==( \
735 const T &a, \
736 const T &b) \
737 { \
738 return (a.x == b.x) && \
739 (a.y == b.y); \
740 } \
741 /* Test initialization */ \
742 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \
743 { \
744 InitValue(gen_mode, value.x, index); \
745 InitValue(gen_mode, value.y, index); \
746 } \
747 /* Max */ \
748 __host__ __device__ __forceinline__ bool operator>( \
749 const T &a, \
750 const T &b) \
751 { \
752 if (a.x > b.x) return true; else if (b.x > a.x) return false; \
753 return a.y > b.y; \
754 } \
755 /* Min */ \
756 __host__ __device__ __forceinline__ bool operator<( \
757 const T &a, \
758 const T &b) \
759 { \
760 if (a.x < b.x) return true; else if (b.x < a.x) return false; \
761 return a.y < b.y; \
762 } \
763 /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \
764 __host__ __device__ __forceinline__ T operator+( \
765 T a, \
766 T b) \
767 { \
768 T retval = make_##T( \
769 a.x + b.x, \
770 a.y + b.y); \
771 return retval; \
772 } \
773 namespace cub { \
774 template<> \
775 struct NumericTraits<T> \
776 { \
777 static const Category CATEGORY = NOT_A_NUMBER; \
778 enum { \
779 PRIMITIVE = false, \
780 NULL_TYPE = false, \
781 }; \
782 static T Max() \
783 { \
784 T retval = { \
785 NumericTraits<BaseT>::Max(), \
786 NumericTraits<BaseT>::Max()}; \
787 return retval; \
788 } \
789 static T Lowest() \
790 { \
791 T retval = { \
792 NumericTraits<BaseT>::Lowest(), \
793 NumericTraits<BaseT>::Lowest()}; \
794 return retval; \
795 } \
796 }; \
797 } /* namespace cub */
798
799
800
801 /**
802 * Vector3 overloads
803 */
804 #define CUB_VEC_OVERLOAD_3(T, BaseT) \
805 /* Ostream output */ \
806 std::ostream& operator<<( \
807 std::ostream& os, \
808 const T& val) \
809 { \
810 os << '(' \
811 << CoutCast(val.x) << ',' \
812 << CoutCast(val.y) << ',' \
813 << CoutCast(val.z) << ')'; \
814 return os; \
815 } \
816 /* Inequality */ \
817 __host__ __device__ __forceinline__ bool operator!=( \
818 const T &a, \
819 const T &b) \
820 { \
821 return (a.x != b.x) || \
822 (a.y != b.y) || \
823 (a.z != b.z); \
824 } \
825 /* Equality */ \
826 __host__ __device__ __forceinline__ bool operator==( \
827 const T &a, \
828 const T &b) \
829 { \
830 return (a.x == b.x) && \
831 (a.y == b.y) && \
832 (a.z == b.z); \
833 } \
834 /* Test initialization */ \
835 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \
836 { \
837 InitValue(gen_mode, value.x, index); \
838 InitValue(gen_mode, value.y, index); \
839 InitValue(gen_mode, value.z, index); \
840 } \
841 /* Max */ \
842 __host__ __device__ __forceinline__ bool operator>( \
843 const T &a, \
844 const T &b) \
845 { \
846 if (a.x > b.x) return true; else if (b.x > a.x) return false; \
847 if (a.y > b.y) return true; else if (b.y > a.y) return false; \
848 return a.z > b.z; \
849 } \
850 /* Min */ \
851 __host__ __device__ __forceinline__ bool operator<( \
852 const T &a, \
853 const T &b) \
854 { \
855 if (a.x < b.x) return true; else if (b.x < a.x) return false; \
856 if (a.y < b.y) return true; else if (b.y < a.y) return false; \
857 return a.z < b.z; \
858 } \
859 /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \
860 __host__ __device__ __forceinline__ T operator+( \
861 T a, \
862 T b) \
863 { \
864 T retval = make_##T( \
865 a.x + b.x, \
866 a.y + b.y, \
867 a.z + b.z); \
868 return retval; \
869 } \
870 namespace cub { \
871 template<> \
872 struct NumericTraits<T> \
873 { \
874 static const Category CATEGORY = NOT_A_NUMBER; \
875 enum { \
876 PRIMITIVE = false, \
877 NULL_TYPE = false, \
878 }; \
879 static T Max() \
880 { \
881 T retval = { \
882 NumericTraits<BaseT>::Max(), \
883 NumericTraits<BaseT>::Max(), \
884 NumericTraits<BaseT>::Max()}; \
885 return retval; \
886 } \
887 static T Lowest() \
888 { \
889 T retval = { \
890 NumericTraits<BaseT>::Lowest(), \
891 NumericTraits<BaseT>::Lowest(), \
892 NumericTraits<BaseT>::Lowest()}; \
893 return retval; \
894 } \
895 }; \
896 } /* namespace cub */
897
898
899 /**
900 * Vector4 overloads
901 */
902 #define CUB_VEC_OVERLOAD_4(T, BaseT) \
903 /* Ostream output */ \
904 std::ostream& operator<<( \
905 std::ostream& os, \
906 const T& val) \
907 { \
908 os << '(' \
909 << CoutCast(val.x) << ',' \
910 << CoutCast(val.y) << ',' \
911 << CoutCast(val.z) << ',' \
912 << CoutCast(val.w) << ')'; \
913 return os; \
914 } \
915 /* Inequality */ \
916 __host__ __device__ __forceinline__ bool operator!=( \
917 const T &a, \
918 const T &b) \
919 { \
920 return (a.x != b.x) || \
921 (a.y != b.y) || \
922 (a.z != b.z) || \
923 (a.w != b.w); \
924 } \
925 /* Equality */ \
926 __host__ __device__ __forceinline__ bool operator==( \
927 const T &a, \
928 const T &b) \
929 { \
930 return (a.x == b.x) && \
931 (a.y == b.y) && \
932 (a.z == b.z) && \
933 (a.w == b.w); \
934 } \
935 /* Test initialization */ \
936 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \
937 { \
938 InitValue(gen_mode, value.x, index); \
939 InitValue(gen_mode, value.y, index); \
940 InitValue(gen_mode, value.z, index); \
941 InitValue(gen_mode, value.w, index); \
942 } \
943 /* Max */ \
944 __host__ __device__ __forceinline__ bool operator>( \
945 const T &a, \
946 const T &b) \
947 { \
948 if (a.x > b.x) return true; else if (b.x > a.x) return false; \
949 if (a.y > b.y) return true; else if (b.y > a.y) return false; \
950 if (a.z > b.z) return true; else if (b.z > a.z) return false; \
951 return a.w > b.w; \
952 } \
953 /* Min */ \
954 __host__ __device__ __forceinline__ bool operator<( \
955 const T &a, \
956 const T &b) \
957 { \
958 if (a.x < b.x) return true; else if (b.x < a.x) return false; \
959 if (a.y < b.y) return true; else if (b.y < a.y) return false; \
960 if (a.z < b.z) return true; else if (b.z < a.z) return false; \
961 return a.w < b.w; \
962 } \
963 /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \
964 __host__ __device__ __forceinline__ T operator+( \
965 T a, \
966 T b) \
967 { \
968 T retval = make_##T( \
969 a.x + b.x, \
970 a.y + b.y, \
971 a.z + b.z, \
972 a.w + b.w); \
973 return retval; \
974 } \
975 namespace cub { \
976 template<> \
977 struct NumericTraits<T> \
978 { \
979 static const Category CATEGORY = NOT_A_NUMBER; \
980 enum { \
981 PRIMITIVE = false, \
982 NULL_TYPE = false, \
983 }; \
984 static T Max() \
985 { \
986 T retval = { \
987 NumericTraits<BaseT>::Max(), \
988 NumericTraits<BaseT>::Max(), \
989 NumericTraits<BaseT>::Max(), \
990 NumericTraits<BaseT>::Max()}; \
991 return retval; \
992 } \
993 static T Lowest() \
994 { \
995 T retval = { \
996 NumericTraits<BaseT>::Lowest(), \
997 NumericTraits<BaseT>::Lowest(), \
998 NumericTraits<BaseT>::Lowest(), \
999 NumericTraits<BaseT>::Lowest()}; \
1000 return retval; \
1001 } \
1002 }; \
1003 } /* namespace cub */
1004
1005 /**
1006 * All vector overloads
1007 */
1008 #define CUB_VEC_OVERLOAD(COMPONENT_T, BaseT) \
1009 CUB_VEC_OVERLOAD_1(COMPONENT_T##1, BaseT) \
1010 CUB_VEC_OVERLOAD_2(COMPONENT_T##2, BaseT) \
1011 CUB_VEC_OVERLOAD_3(COMPONENT_T##3, BaseT) \
1012 CUB_VEC_OVERLOAD_4(COMPONENT_T##4, BaseT)
1013
1014 /**
1015 * Define for types
1016 */
1017 CUB_VEC_OVERLOAD(char, char)
1018 CUB_VEC_OVERLOAD(short, short)
1019 CUB_VEC_OVERLOAD(int, int)
1020 CUB_VEC_OVERLOAD(long, long)
1021 CUB_VEC_OVERLOAD(longlong, long long)
1022 CUB_VEC_OVERLOAD(uchar, unsigned char)
1023 CUB_VEC_OVERLOAD(ushort, unsigned short)
1024 CUB_VEC_OVERLOAD(uint, unsigned int)
1025 CUB_VEC_OVERLOAD(ulong, unsigned long)
1026 CUB_VEC_OVERLOAD(ulonglong, unsigned long long)
1027 CUB_VEC_OVERLOAD(float, float)
1028 CUB_VEC_OVERLOAD(double, double)
1029
1030
1031 //---------------------------------------------------------------------
1032 // Complex data type TestFoo
1033 //---------------------------------------------------------------------
1034
1035 /**
1036 * TestFoo complex data type
1037 */
1038 struct TestFoo
1039 {
1040 long long x;
1041 int y;
1042 short z;
1043 char w;
1044
1045 // Factory
MakeTestFooTestFoo1046 static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w)
1047 {
1048 TestFoo retval = {x, y, z, w};
1049 return retval;
1050 }
1051
1052 // Assignment from int operator
1053 __host__ __device__ __forceinline__ TestFoo& operator =(int b)
1054 {
1055 x = b;
1056 y = b;
1057 z = b;
1058 w = b;
1059 return *this;
1060 }
1061
1062 // Summation operator
1063 __host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const
1064 {
1065 return MakeTestFoo(x + b.x, y + b.y, z + b.z, w + b.w);
1066 }
1067
1068 // Inequality operator
1069 __host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const
1070 {
1071 return (x != b.x) || (y != b.y) || (z != b.z) || (w != b.w);
1072 }
1073
1074 // Equality operator
1075 __host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const
1076 {
1077 return (x == b.x) && (y == b.y) && (z == b.z) && (w == b.w);
1078 }
1079
1080 // Less than operator
1081 __host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const
1082 {
1083 if (x < b.x) return true; else if (b.x < x) return false;
1084 if (y < b.y) return true; else if (b.y < y) return false;
1085 if (z < b.z) return true; else if (b.z < z) return false;
1086 return w < b.w;
1087 }
1088
1089 // Greater than operator
1090 __host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const
1091 {
1092 if (x > b.x) return true; else if (b.x > x) return false;
1093 if (y > b.y) return true; else if (b.y > y) return false;
1094 if (z > b.z) return true; else if (b.z > z) return false;
1095 return w > b.w;
1096 }
1097
1098 };
1099
1100 /**
1101 * TestFoo ostream operator
1102 */
1103 std::ostream& operator<<(std::ostream& os, const TestFoo& val)
1104 {
1105 os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')';
1106 return os;
1107 }
1108
1109 /**
1110 * TestFoo test initialization
1111 */
1112 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, int index = 0)
1113 {
1114 InitValue(gen_mode, value.x, index);
1115 InitValue(gen_mode, value.y, index);
1116 InitValue(gen_mode, value.z, index);
1117 InitValue(gen_mode, value.w, index);
1118 }
1119
1120
1121 /// numeric_limits<TestFoo> specialization
1122 namespace cub {
1123 template<>
1124 struct NumericTraits<TestFoo>
1125 {
1126 static const Category CATEGORY = NOT_A_NUMBER;
1127 enum {
1128 PRIMITIVE = false,
1129 NULL_TYPE = false,
1130 };
1131 static TestFoo Max()
1132 {
1133 return TestFoo::MakeTestFoo(
1134 NumericTraits<long long>::Max(),
1135 NumericTraits<int>::Max(),
1136 NumericTraits<short>::Max(),
1137 NumericTraits<char>::Max());
1138 }
1139
1140 static TestFoo Lowest()
1141 {
1142 return TestFoo::MakeTestFoo(
1143 NumericTraits<long long>::Lowest(),
1144 NumericTraits<int>::Lowest(),
1145 NumericTraits<short>::Lowest(),
1146 NumericTraits<char>::Lowest());
1147 }
1148 };
1149 } // namespace cub
1150
1151
1152 //---------------------------------------------------------------------
1153 // Complex data type TestBar (with optimizations for fence-free warp-synchrony)
1154 //---------------------------------------------------------------------
1155
1156 /**
1157 * TestBar complex data type
1158 */
1159 struct TestBar
1160 {
1161 long long x;
1162 int y;
1163
1164 // Constructor
1165 __host__ __device__ __forceinline__ TestBar() : x(0), y(0)
1166 {}
1167
1168 // Constructor
1169 __host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b)
1170 {}
1171
1172 // Constructor
1173 __host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y)
1174 {}
1175
1176 // Assignment from int operator
1177 __host__ __device__ __forceinline__ TestBar& operator =(int b)
1178 {
1179 x = b;
1180 y = b;
1181 return *this;
1182 }
1183
1184 // Summation operator
1185 __host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const
1186 {
1187 return TestBar(x + b.x, y + b.y);
1188 }
1189
1190 // Inequality operator
1191 __host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const
1192 {
1193 return (x != b.x) || (y != b.y);
1194 }
1195
1196 // Equality operator
1197 __host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const
1198 {
1199 return (x == b.x) && (y == b.y);
1200 }
1201
1202 // Less than operator
1203 __host__ __device__ __forceinline__ bool operator <(const TestBar &b) const
1204 {
1205 if (x < b.x) return true; else if (b.x < x) return false;
1206 return y < b.y;
1207 }
1208
1209 // Greater than operator
1210 __host__ __device__ __forceinline__ bool operator >(const TestBar &b) const
1211 {
1212 if (x > b.x) return true; else if (b.x > x) return false;
1213 return y > b.y;
1214 }
1215
1216 };
1217
1218
1219 /**
1220 * TestBar ostream operator
1221 */
1222 std::ostream& operator<<(std::ostream& os, const TestBar& val)
1223 {
1224 os << '(' << val.x << ',' << val.y << ')';
1225 return os;
1226 }
1227
1228 /**
1229 * TestBar test initialization
1230 */
1231 __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, int index = 0)
1232 {
1233 InitValue(gen_mode, value.x, index);
1234 InitValue(gen_mode, value.y, index);
1235 }
1236
1237 /// numeric_limits<TestBar> specialization
1238 namespace cub {
1239 template<>
1240 struct NumericTraits<TestBar>
1241 {
1242 static const Category CATEGORY = NOT_A_NUMBER;
1243 enum {
1244 PRIMITIVE = false,
1245 NULL_TYPE = false,
1246 };
1247 static TestBar Max()
1248 {
1249 return TestBar(
1250 NumericTraits<long long>::Max(),
1251 NumericTraits<int>::Max());
1252 }
1253
1254 static TestBar Lowest()
1255 {
1256 return TestBar(
1257 NumericTraits<long long>::Lowest(),
1258 NumericTraits<int>::Lowest());
1259 }
1260 };
1261 } // namespace cub
1262
1263
1264 /******************************************************************************
1265 * Helper routines for list comparison and display
1266 ******************************************************************************/
1267
1268
1269 /**
1270 * Compares the equivalence of two arrays
1271 */
1272 template <typename S, typename T, typename OffsetT>
1273 int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true)
1274 {
1275 for (OffsetT i = 0; i < len; i++)
1276 {
1277 if (computed[i] != reference[i])
1278 {
1279 if (verbose) std::cout << "INCORRECT: [" << i << "]: "
1280 << CoutCast(computed[i]) << " != "
1281 << CoutCast(reference[i]);
1282 return 1;
1283 }
1284 }
1285 return 0;
1286 }
1287
1288
1289 /**
1290 * Compares the equivalence of two arrays
1291 */
1292 template <typename OffsetT>
1293 int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true)
1294 {
1295 for (OffsetT i = 0; i < len; i++)
1296 {
1297 if (computed[i] != reference[i])
1298 {
1299 float difference = std::abs(computed[i]-reference[i]);
1300 float fraction = difference / std::abs(reference[i]);
1301
1302 if (fraction > 0.0001)
1303 {
1304 if (verbose) std::cout << "INCORRECT: [" << i << "]: "
1305 << "(computed) " << CoutCast(computed[i]) << " != "
1306 << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
1307 return 1;
1308 }
1309 }
1310 }
1311 return 0;
1312 }
1313
1314
1315 /**
1316 * Compares the equivalence of two arrays
1317 */
1318 template <typename OffsetT>
1319 int CompareResults(cub::NullType* computed, cub::NullType* reference, OffsetT len, bool verbose = true)
1320 {
1321 return 0;
1322 }
1323
1324 /**
1325 * Compares the equivalence of two arrays
1326 */
1327 template <typename OffsetT>
1328 int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true)
1329 {
1330 for (OffsetT i = 0; i < len; i++)
1331 {
1332 if (computed[i] != reference[i])
1333 {
1334 double difference = std::abs(computed[i]-reference[i]);
1335 double fraction = difference / std::abs(reference[i]);
1336
1337 if (fraction > 0.0001)
1338 {
1339 if (verbose) std::cout << "INCORRECT: [" << i << "]: "
1340 << CoutCast(computed[i]) << " != "
1341 << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
1342 return 1;
1343 }
1344 }
1345 }
1346 return 0;
1347 }
1348
1349
1350 /**
1351 * Verify the contents of a device array match those
1352 * of a host array
1353 */
1354 int CompareDeviceResults(
1355 cub::NullType */* h_reference */,
1356 cub::NullType */* d_data */,
1357 size_t /* num_items */,
1358 bool /* verbose */ = true,
1359 bool /* display_data */ = false)
1360 {
1361 return 0;
1362 }
1363
1364 /**
1365 * Verify the contents of a device array match those
1366 * of a host array
1367 */
1368 template <typename S, typename OffsetT>
1369 int CompareDeviceResults(
1370 S *h_reference,
1371 cub::DiscardOutputIterator<OffsetT> d_data,
1372 size_t num_items,
1373 bool verbose = true,
1374 bool display_data = false)
1375 {
1376 return 0;
1377 }
1378
1379 /**
1380 * Verify the contents of a device array match those
1381 * of a host array
1382 */
1383 template <typename S, typename T>
1384 int CompareDeviceResults(
1385 S *h_reference,
1386 T *d_data,
1387 size_t num_items,
1388 bool verbose = true,
1389 bool display_data = false)
1390 {
1391 // Allocate array on host
1392 T *h_data = (T*) malloc(num_items * sizeof(T));
1393
1394 // Copy data back
1395 cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
1396
1397 // Display data
1398 if (display_data)
1399 {
1400 printf("Reference:\n");
1401 for (int i = 0; i < int(num_items); i++)
1402 {
1403 std::cout << CoutCast(h_reference[i]) << ", ";
1404 }
1405 printf("\n\nComputed:\n");
1406 for (int i = 0; i < int(num_items); i++)
1407 {
1408 std::cout << CoutCast(h_data[i]) << ", ";
1409 }
1410 printf("\n\n");
1411 }
1412
1413 // Check
1414 int retval = CompareResults(h_data, h_reference, num_items, verbose);
1415
1416 // Cleanup
1417 if (h_data) free(h_data);
1418
1419 return retval;
1420 }
1421
1422
1423 /**
1424 * Verify the contents of a device array match those
1425 * of a device array
1426 */
1427 template <typename T>
1428 int CompareDeviceDeviceResults(
1429 T *d_reference,
1430 T *d_data,
1431 size_t num_items,
1432 bool verbose = true,
1433 bool display_data = false)
1434 {
1435 // Allocate array on host
1436 T *h_reference = (T*) malloc(num_items * sizeof(T));
1437 T *h_data = (T*) malloc(num_items * sizeof(T));
1438
1439 // Copy data back
1440 cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
1441 cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
1442
1443 // Display data
1444 if (display_data) {
1445 printf("Reference:\n");
1446 for (int i = 0; i < num_items; i++)
1447 {
1448 std::cout << CoutCast(h_reference[i]) << ", ";
1449 }
1450 printf("\n\nComputed:\n");
1451 for (int i = 0; i < num_items; i++)
1452 {
1453 std::cout << CoutCast(h_data[i]) << ", ";
1454 }
1455 printf("\n\n");
1456 }
1457
1458 // Check
1459 int retval = CompareResults(h_data, h_reference, num_items, verbose);
1460
1461 // Cleanup
1462 if (h_reference) free(h_reference);
1463 if (h_data) free(h_data);
1464
1465 return retval;
1466 }
1467
1468
1469 /**
1470 * Print the contents of a host array
1471 */
1472 void DisplayResults(
1473 cub::NullType */* h_data */,
1474 size_t /* num_items */)
1475 {}
1476
1477
1478 /**
1479 * Print the contents of a host array
1480 */
1481 template <typename InputIteratorT>
1482 void DisplayResults(
1483 InputIteratorT h_data,
1484 size_t num_items)
1485 {
1486 // Display data
1487 for (int i = 0; i < int(num_items); i++)
1488 {
1489 std::cout << CoutCast(h_data[i]) << ", ";
1490 }
1491 printf("\n");
1492 }
1493
1494
1495 /**
1496 * Print the contents of a device array
1497 */
1498 template <typename T>
1499 void DisplayDeviceResults(
1500 T *d_data,
1501 size_t num_items)
1502 {
1503 // Allocate array on host
1504 T *h_data = (T*) malloc(num_items * sizeof(T));
1505
1506 // Copy data back
1507 cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
1508
1509 DisplayResults(h_data, num_items);
1510
1511 // Cleanup
1512 if (h_data) free(h_data);
1513 }
1514
1515
1516 /******************************************************************************
1517 * Segment descriptor generation
1518 ******************************************************************************/
1519
1520 /**
1521 * Initialize segments
1522 */
1523 void InitializeSegments(
1524 int num_items,
1525 int num_segments,
1526 int *h_segment_offsets,
1527 bool verbose = false)
1528 {
1529 if (num_segments <= 0)
1530 return;
1531
1532 unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments;
1533 int offset = 0;
1534 for (int i = 0; i < num_segments; ++i)
1535 {
1536 h_segment_offsets[i] = offset;
1537
1538 unsigned int segment_length = RandomValue((expected_segment_length * 2) + 1);
1539 offset += segment_length;
1540 offset = CUB_MIN(offset, num_items);
1541 }
1542 h_segment_offsets[num_segments] = num_items;
1543
1544 if (verbose)
1545 {
1546 printf("Segment offsets: ");
1547 DisplayResults(h_segment_offsets, num_segments + 1);
1548 }
1549 }
1550
1551
1552 /******************************************************************************
1553 * Timing
1554 ******************************************************************************/
1555
1556
1557 struct CpuTimer
1558 {
1559 #if defined(_WIN32) || defined(_WIN64)
1560
1561 LARGE_INTEGER ll_freq;
1562 LARGE_INTEGER ll_start;
1563 LARGE_INTEGER ll_stop;
1564
1565 CpuTimer()
1566 {
1567 QueryPerformanceFrequency(&ll_freq);
1568 }
1569
1570 void Start()
1571 {
1572 QueryPerformanceCounter(&ll_start);
1573 }
1574
1575 void Stop()
1576 {
1577 QueryPerformanceCounter(&ll_stop);
1578 }
1579
1580 float ElapsedMillis()
1581 {
1582 double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart);
1583 double stop = double(ll_stop.QuadPart) / double(ll_freq.QuadPart);
1584
1585 return float((stop - start) * 1000);
1586 }
1587
1588 #else
1589
1590 rusage start;
1591 rusage stop;
1592
1593 void Start()
1594 {
1595 getrusage(RUSAGE_SELF, &start);
1596 }
1597
1598 void Stop()
1599 {
1600 getrusage(RUSAGE_SELF, &stop);
1601 }
1602
1603 float ElapsedMillis()
1604 {
1605 float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec;
1606 float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec;
1607
1608 return (sec * 1000) + (usec / 1000);
1609 }
1610
1611 #endif
1612 };
1613
1614 struct GpuTimer
1615 {
1616 cudaEvent_t start;
1617 cudaEvent_t stop;
1618
1619 GpuTimer()
1620 {
1621 cudaEventCreate(&start);
1622 cudaEventCreate(&stop);
1623 }
1624
1625 ~GpuTimer()
1626 {
1627 cudaEventDestroy(start);
1628 cudaEventDestroy(stop);
1629 }
1630
1631 void Start()
1632 {
1633 cudaEventRecord(start, 0);
1634 }
1635
1636 void Stop()
1637 {
1638 cudaEventRecord(stop, 0);
1639 }
1640
1641 float ElapsedMillis()
1642 {
1643 float elapsed;
1644 cudaEventSynchronize(stop);
1645 cudaEventElapsedTime(&elapsed, start, stop);
1646 return elapsed;
1647 }
1648 };
1649