1 /******************************************************************************
2 * Copyright 2010 Duane Merrill
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may ob3ain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 *
17 *
18 *
19 * AUTHORS' REQUEST:
20 *
21 * If you use|reference|benchmark this code, please cite our Technical
22 * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
23 *
24 * @TechReport{ Merrill:Sorting:2010,
25 * author = "Duane Merrill and Andrew Grimshaw",
26 * title = "Revisiting Sorting for GPGPU Stream Architectures",
27 * year = "2010",
28 * institution = "University of Virginia, Department of Computer Science",
29 * address = "Charlottesville, VA, USA",
30 * number = "CS2010-03"
31 * }
32 *
33 * For more information, see our Google Code project site:
34 * http://code.google.com/p/back40computing/
35 *
36 * Thanks!
37 ******************************************************************************/
38
39 /******************************************************************************
40 * Simple test driver program for *large-problem* radix sorting.
41 *
42 * Useful for demonstrating how to integrate radix sorting into
43 * your application
44 ******************************************************************************/
45
46 /******************************************************************************
47 * Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
48 ******************************************************************************/
49 #ifdef _WIN32
50 #pragma warning(disable : 4996)
51 #endif
52 #include <stdlib.h>
53 #include <stdio.h>
54 #include <string.h>
55 #include <math.h>
56 #include <float.h>
57 #include <algorithm>
58 #include <string>
59
60 //#include <iostream>
61 #include <sstream>
62 /**********************
63 *
64 */
65
66 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
67 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
68 #include "../btgui/Bullet3AppSupport/b3Clock.h"
69
70 cl_context g_cxMainContext;
71 cl_device_id g_device;
72 cl_command_queue g_cqCommandQueue;
73
74 /***********************
75 *
76 */
77
78 bool g_verbose;
79 ///Preferred OpenCL device/platform. When < 0 then no preference is used.
80 ///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
81 ///Preferred device/platform take priority over this platform-vendor match
82 int gPreferredDeviceId = -1;
83 int gPreferredPlatformId = -1;
84
85 /******************************************************************************
86 * Routines
87 ******************************************************************************/
88
89 /**
90 * Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
91 * number of iterations, displaying runtime information.
92 *
93 * @param[in] num_elements
94 * Size in elements of the vector to sort
95 * @param[in] h_keys
96 * Vector of keys to sort
97 * @param[in] iterations
98 * Number of times to invoke the GPU sorting primitive
99 * @param[in] cfg
100 * Config
101 */
102 template <typename K>
TimedSort(unsigned int num_elements,K * h_keys,unsigned int iterations)103 void TimedSort(
104 unsigned int num_elements,
105 K *h_keys,
106 unsigned int iterations)
107 {
108 printf("Keys only, %d iterations, %d elements\n", iterations, num_elements);
109
110 int max_elements = num_elements;
111 b3AlignedObjectArray<unsigned int> hostData;
112 hostData.resize(num_elements);
113 for (int i = 0; i < num_elements; i++)
114 {
115 hostData[i] = h_keys[i];
116 }
117
118 b3RadixSort32CL sorter(g_cxMainContext, g_device, g_cqCommandQueue);
119
120 b3OpenCLArray<unsigned int> gpuData(g_cxMainContext, g_cqCommandQueue);
121 gpuData.copyFromHost(hostData);
122 //sorter.executeHost(gpuData);
123 sorter.execute(gpuData);
124
125 b3AlignedObjectArray<unsigned int> hostDataSorted;
126 gpuData.copyToHost(hostDataSorted);
127
128 clFinish(g_cqCommandQueue);
129
130 {
131 //printf("Key-values, %d iterations, %d elements", iterations, num_elements);
132
133 // Create sorting enactor
134
135 // Perform the timed number of sorting iterations
136 double elapsed = 0;
137 float duration = 0;
138 b3Clock watch;
139
140 //warm-start
141 gpuData.copyFromHost(hostData);
142 clFinish(g_cqCommandQueue);
143 sorter.execute(gpuData);
144
145 watch.reset();
146
147 for (int i = 0; i < iterations; i++)
148 {
149 // Move a fresh copy of the problem into device storage
150 gpuData.copyFromHost(hostData);
151 clFinish(g_cqCommandQueue);
152
153 // Start GPU timing record
154 double startMs = watch.getTimeMicroseconds() / 1e3;
155
156 // Call the sorting API routine
157 sorter.execute(gpuData);
158
159 clFinish(g_cqCommandQueue);
160
161 double stopMs = watch.getTimeMicroseconds() / 1e3;
162
163 duration = stopMs - startMs;
164
165 // End GPU timing record
166 elapsed += (double)duration;
167 printf("duration = %f\n", duration);
168 }
169
170 // Display timing information
171 double avg_runtime = elapsed / iterations;
172 // double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
173 // printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
174 double throughput = ((double)num_elements) / avg_runtime / 1000.0;
175 printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
176
177 gpuData.copyToHost(hostData);
178 for (int i = 0; i < num_elements; i++)
179 {
180 h_keys[i] = hostData[i];
181 }
182 }
183 }
184
185 /**
186 * Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
187 * number of iterations, displaying runtime information.
188 *
189 * @param[in] num_elements
190 * Size in elements of the vector to sort
191 * @param[in] h_keys
192 * Vector of keys to sort
193 * @param[in,out] h_values
194 * Vector of values to sort
195 * @param[in] iterations
196 * Number of times to invoke the GPU sorting primitive
197 * @param[in] cfg
198 * Config
199 */
200 template <typename K, typename V>
TimedSort(unsigned int num_elements,K * h_keys,V * h_values,unsigned int iterations)201 void TimedSort(
202 unsigned int num_elements,
203 K *h_keys,
204 V *h_values,
205 unsigned int iterations)
206 {
207 printf("Key-values, %d iterations, %d elements\n", iterations, num_elements);
208
209 int max_elements = num_elements;
210 b3AlignedObjectArray<b3SortData> hostData;
211 hostData.resize(num_elements);
212 for (int i = 0; i < num_elements; i++)
213 {
214 hostData[i].m_key = h_keys[i];
215 hostData[i].m_value = h_values[i];
216 }
217
218 b3RadixSort32CL sorter(g_cxMainContext, g_device, g_cqCommandQueue);
219
220 b3OpenCLArray<b3SortData> gpuData(g_cxMainContext, g_cqCommandQueue);
221 gpuData.copyFromHost(hostData);
222 //sorter.executeHost(gpuData);
223 sorter.execute(gpuData);
224
225 b3AlignedObjectArray<b3SortData> hostDataSorted;
226 gpuData.copyToHost(hostDataSorted);
227 #if 0
228 for (int i=0;i<num_elements;i++)
229 {
230 printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
231 printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
232 }
233 #endif
234
235 clFinish(g_cqCommandQueue);
236
237 {
238 //printf("Key-values, %d iterations, %d elements", iterations, num_elements);
239
240 // Create sorting enactor
241
242 // Perform the timed number of sorting iterations
243 double elapsed = 0;
244 float duration = 0;
245 b3Clock watch;
246
247 //warm-start
248 gpuData.copyFromHost(hostData);
249 sorter.execute(gpuData);
250 clFinish(g_cqCommandQueue);
251
252 watch.reset();
253
254 for (int i = 0; i < iterations; i++)
255 {
256 // Move a fresh copy of the problem into device storage
257 gpuData.copyFromHost(hostData);
258 clFinish(g_cqCommandQueue);
259
260 // Start GPU timing record
261 double startMs = watch.getTimeMicroseconds() / 1e3;
262
263 // Call the sorting API routine
264 sorter.execute(gpuData);
265 clFinish(g_cqCommandQueue);
266
267 double stopMs = watch.getTimeMicroseconds() / 1e3;
268
269 duration = stopMs - startMs;
270
271 // End GPU timing record
272 elapsed += (double)duration;
273 printf("duration = %f\n", duration);
274 }
275
276 // Display timing information
277 double avg_runtime = elapsed / iterations;
278 // double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
279 // printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
280 double throughput = ((double)num_elements) / avg_runtime / 1000.0;
281 printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
282
283 gpuData.copyToHost(hostData);
284 for (int i = 0; i < num_elements; i++)
285 {
286 h_keys[i] = hostData[i].m_key;
287 h_values[i] = hostData[i].m_value;
288 }
289 }
290 }
291
292 /**
293 * Generates random 32-bit keys.
294 *
295 * We always take the second-order byte from rand() because the higher-order
296 * bits returned by rand() are commonly considered more uniformly distributed
297 * than the lower-order bits.
298 *
299 * We can decrease the entropy level of keys by adopting the technique
300 * of Thearling and Smith in which keys are computed from the bitwise AND of
301 * multiple random samples:
302 *
303 * entropy_reduction | Effectively-unique bits per key
304 * -----------------------------------------------------
305 * -1 | 0
306 * 0 | 32
307 * 1 | 25.95
308 * 2 | 17.41
309 * 3 | 10.78
310 * 4 | 6.42
311 * ... | ...
312 *
313 */
314 template <typename K>
RandomBits(K & key,int entropy_reduction=0,int lower_key_bits=sizeof (K)* 8)315 void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
316 {
317 const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
318 unsigned char key_bits[NUM_UCHARS];
319
320 do
321 {
322 for (int j = 0; j < NUM_UCHARS; j++)
323 {
324 unsigned char quarterword = 0xff;
325 for (int i = 0; i <= entropy_reduction; i++)
326 {
327 quarterword &= (rand() >> 7);
328 }
329 key_bits[j] = quarterword;
330 }
331
332 if (lower_key_bits < sizeof(K) * 8)
333 {
334 unsigned long long base = 0;
335 memcpy(&base, key_bits, sizeof(K));
336 base &= (1 << lower_key_bits) - 1;
337 memcpy(key_bits, &base, sizeof(K));
338 }
339
340 memcpy(&key, key_bits, sizeof(K));
341
342 } while (key != key); // avoids NaNs when generating random floating point numbers
343 }
344
345 /******************************************************************************
346 * Templated routines for printing keys/values to the console
347 ******************************************************************************/
348
349 template <typename T>
PrintValue(T val)350 void PrintValue(T val)
351 {
352 printf("%d", val);
353 }
354
355 template <>
PrintValue(float val)356 void PrintValue<float>(float val)
357 {
358 printf("%f", val);
359 }
360
361 template <>
PrintValue(double val)362 void PrintValue<double>(double val)
363 {
364 printf("%f", val);
365 }
366
367 template <>
PrintValue(unsigned char val)368 void PrintValue<unsigned char>(unsigned char val)
369 {
370 printf("%u", val);
371 }
372
373 template <>
PrintValue(unsigned short val)374 void PrintValue<unsigned short>(unsigned short val)
375 {
376 printf("%u", val);
377 }
378
379 template <>
PrintValue(unsigned int val)380 void PrintValue<unsigned int>(unsigned int val)
381 {
382 printf("%u", val);
383 }
384
385 template <>
PrintValue(long val)386 void PrintValue<long>(long val)
387 {
388 printf("%ld", val);
389 }
390
391 template <>
PrintValue(unsigned long val)392 void PrintValue<unsigned long>(unsigned long val)
393 {
394 printf("%lu", val);
395 }
396
397 template <>
PrintValue(long long val)398 void PrintValue<long long>(long long val)
399 {
400 printf("%lld", val);
401 }
402
403 template <>
PrintValue(unsigned long long val)404 void PrintValue<unsigned long long>(unsigned long long val)
405 {
406 printf("%llu", val);
407 }
408
409 /**
410 * Compares the equivalence of two arrays
411 */
412 template <typename T, typename SizeT>
CompareResults(T * computed,T * reference,SizeT len,bool verbose=true)413 int CompareResults(T *computed, T *reference, SizeT len, bool verbose = true)
414 {
415 printf("\n");
416 for (SizeT i = 0; i < len; i++)
417 {
418 if (computed[i] != reference[i])
419 {
420 printf("INCORRECT: [%lu]: ", (unsigned long)i);
421 PrintValue<T>(computed[i]);
422 printf(" != ");
423 PrintValue<T>(reference[i]);
424
425 if (verbose)
426 {
427 printf("\nresult[...");
428 for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++)
429 {
430 PrintValue<T>(computed[j]);
431 printf(", ");
432 }
433 printf("...]");
434 printf("\nreference[...");
435 for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++)
436 {
437 PrintValue<T>(reference[j]);
438 printf(", ");
439 }
440 printf("...]");
441 }
442
443 return 1;
444 }
445 }
446
447 printf("CORRECT\n");
448 return 0;
449 }
450
451 /**
452 * Creates an example sorting problem whose keys is a vector of the specified
453 * number of K elements, values of V elements, and then dispatches the problem
454 * to the GPU for the given number of iterations, displaying runtime information.
455 *
456 * @param[in] iterations
457 * Number of times to invoke the GPU sorting primitive
458 * @param[in] num_elements
459 * Size in elements of the vector to sort
460 * @param[in] cfg
461 * Config
462 */
463 template <typename K, typename V>
TestSort(unsigned int iterations,int num_elements,bool keys_only)464 void TestSort(
465 unsigned int iterations,
466 int num_elements,
467 bool keys_only)
468 {
469 // Allocate the sorting problem on the host and fill the keys with random bytes
470
471 K *h_keys = NULL;
472 K *h_reference_keys = NULL;
473 V *h_values = NULL;
474 h_keys = (K *)malloc(num_elements * sizeof(K));
475 h_reference_keys = (K *)malloc(num_elements * sizeof(K));
476 if (!keys_only) h_values = (V *)malloc(num_elements * sizeof(V));
477
478 // Use random bits
479 for (unsigned int i = 0; i < num_elements; ++i)
480 {
481 RandomBits<K>(h_keys[i], 0);
482 //h_keys[i] = num_elements-i;
483 //h_keys[i] = 0xffffffffu-i;
484 if (!keys_only)
485 h_values[i] = h_keys[i]; //0xffffffffu-i;
486
487 h_reference_keys[i] = h_keys[i];
488 }
489
490 // Run the timing test
491 if (keys_only)
492 {
493 TimedSort<K>(num_elements, h_keys, iterations);
494 }
495 else
496 {
497 TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
498 }
499
500 // cudaThreadSynchronize();
501
502 // Display sorted key data
503 if (g_verbose)
504 {
505 printf("\n\nKeys:\n");
506 for (int i = 0; i < num_elements; i++)
507 {
508 PrintValue<K>(h_keys[i]);
509 printf(", ");
510 }
511 printf("\n\n");
512 }
513
514 // Verify solution
515 std::sort(h_reference_keys, h_reference_keys + num_elements);
516 CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
517 printf("\n");
518 fflush(stdout);
519
520 // Free our allocated host memory
521 if (h_keys != NULL) free(h_keys);
522 if (h_values != NULL) free(h_values);
523 }
524
525 /**
526 * Displays the commandline usage for this tool
527 */
Usage()528 void Usage()
529 {
530 printf("\ntest_large_problem_sorting [--device=<device index>] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n");
531 printf("\n");
532 printf("\t--v\tDisplays sorted results to the console.\n");
533 printf("\n");
534 printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
535 printf("\t\t\ton the device. Re-copies original input each time. Default = 1\n");
536 printf("\n");
537 printf("\t--n\tThe number of elements to comprise the sample problem\n");
538 printf("\t\t\tDefault = 512\n");
539 printf("\n");
540 printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n");
541 printf("\n");
542 }
543
544 /******************************************************************************
545 * Command-line parsing
546 ******************************************************************************/
547 #include <map>
548 #include <algorithm>
549 #include <string>
550
551 class b3CommandLineArgs
552 {
553 protected:
554 std::map<std::string, std::string> pairs;
555
556 public:
557 // Constructor
b3CommandLineArgs(int argc,char ** argv)558 b3CommandLineArgs(int argc, char **argv)
559 {
560 using namespace std;
561
562 for (int i = 1; i < argc; i++)
563 {
564 string arg = argv[i];
565
566 if ((arg[0] != '-') || (arg[1] != '-'))
567 {
568 continue;
569 }
570
571 string::size_type pos;
572 string key, val;
573 if ((pos = arg.find('=')) == string::npos)
574 {
575 key = string(arg, 2, arg.length() - 2);
576 val = "";
577 }
578 else
579 {
580 key = string(arg, 2, pos - 2);
581 val = string(arg, pos + 1, arg.length() - 1);
582 }
583 pairs[key] = val;
584 }
585 }
586
CheckCmdLineFlag(const char * arg_name)587 bool CheckCmdLineFlag(const char *arg_name)
588 {
589 using namespace std;
590 map<string, string>::iterator itr;
591 if ((itr = pairs.find(arg_name)) != pairs.end())
592 {
593 return true;
594 }
595 return false;
596 }
597
598 template <typename T>
599 void GetCmdLineArgument(const char *arg_name, T &val);
600
ParsedArgc()601 int ParsedArgc()
602 {
603 return pairs.size();
604 }
605 };
606
607 template <typename T>
GetCmdLineArgument(const char * arg_name,T & val)608 void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
609 {
610 using namespace std;
611 map<string, string>::iterator itr;
612 if ((itr = pairs.find(arg_name)) != pairs.end())
613 {
614 istringstream strstream(itr->second);
615 strstream >> val;
616 }
617 }
618
619 template <>
GetCmdLineArgument(const char * arg_name,char * & val)620 void b3CommandLineArgs::GetCmdLineArgument<char *>(const char *arg_name, char *&val)
621 {
622 using namespace std;
623 map<string, string>::iterator itr;
624 if ((itr = pairs.find(arg_name)) != pairs.end())
625 {
626 string s = itr->second;
627 val = (char *)malloc(sizeof(char) * (s.length() + 1));
628 strcpy(val, s.c_str());
629 }
630 else
631 {
632 val = NULL;
633 }
634 }
635
636 /******************************************************************************
637 * Main
638 ******************************************************************************/
639
640 extern bool gDebugSkipLoadingBinary;
641
myprintf(const char * msg)642 void myprintf(const char *msg)
643 {
644 (void *)msg;
645 }
main(int argc,char ** argv)646 int main(int argc, char **argv)
647 {
648 //gDebugSkipLoadingBinary = true;
649
650 // b3SetCustomPrintfFunc(myprintf);
651
652 cl_int ciErrNum;
653 b3CommandLineArgs args(argc, argv);
654
655 args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
656 args.GetCmdLineArgument("platformId", gPreferredPlatformId);
657
658 b3Printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
659 cl_platform_id platformId;
660 // g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
661 g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0, gPreferredDeviceId, gPreferredPlatformId, &platformId);
662 //g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_CPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
663
664 oclCHECKERROR(ciErrNum, CL_SUCCESS);
665
666 int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
667
668 if (!numDev)
669 {
670 b3Error("error: no OpenCL devices\n");
671 exit(0);
672 }
673 int devId = 0;
674 g_device = b3OpenCLUtils_getDevice(g_cxMainContext, devId);
675 b3OpenCLUtils_printDeviceInfo(g_device);
676 // create a command-queue
677 g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
678 oclCHECKERROR(ciErrNum, CL_SUCCESS);
679
680 //srand(time(NULL));
681 srand(0); // presently deterministic
682
683 unsigned int num_elements = 8 * 1024 * 1024; //4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
684 unsigned int iterations = 10;
685 bool keys_only = true;
686
687 //
688 // Check command line arguments
689 //
690
691 if (args.CheckCmdLineFlag("help"))
692 {
693 Usage();
694 return 0;
695 }
696
697 args.GetCmdLineArgument("i", iterations);
698 args.GetCmdLineArgument("n", num_elements);
699
700 keys_only = !args.CheckCmdLineFlag("key-values");
701 g_verbose = args.CheckCmdLineFlag("v");
702
703 TestSort<unsigned int, unsigned int>(
704 iterations,
705 num_elements,
706 keys_only);
707 }
708